gcs: Fix download of "Content-Encoding: gzip" compressed objects

Before this change, if an object compressed with "Content-Encoding:
gzip" was downloaded, a length and hash mismatch would occur, since
the Go runtime automatically decompressed the object on download.

This change erases the length and hash on compressed objects so they
can be downloaded successfully, at the cost of not being able to check
the length or the hash of the downloaded object.

This also adds the --gcs-download-compressed flag to allow the
compressed files to be downloaded as-is, providing compressed objects
with intact size and hash information.

Fixes #2658
This commit is contained in:
Nick Craig-Wood 2022-03-31 15:41:08 +01:00
parent 3d55f69338
commit 2781f8e2f1

View File

@ -24,6 +24,7 @@ import (
"path" "path"
"strconv" "strconv"
"strings" "strings"
"sync"
"time" "time"
"github.com/rclone/rclone/fs" "github.com/rclone/rclone/fs"
@ -304,6 +305,21 @@ rclone does if you know the bucket exists already.
`, `,
Default: false, Default: false,
Advanced: true, Advanced: true,
}, {
Name: "download_compressed",
Help: `If set this will download compressed objects as-is.
It is possible to upload objects to GCS with "Content-Encoding: gzip"
set. Normally rclone will transparently decompress these files on
download. This means that rclone can't check the hash or the size of
the file as both of these refer to the compressed object.
If this flag is set then rclone will download files with
"Content-Encoding: gzip" as they are received. This means that rclone
can check the size and hash but the file contents will be compressed.
`,
Advanced: true,
Default: false,
}, { }, {
Name: config.ConfigEncoding, Name: config.ConfigEncoding,
Help: config.ConfigEncodingHelp, Help: config.ConfigEncodingHelp,
@ -327,21 +343,23 @@ type Options struct {
Location string `config:"location"` Location string `config:"location"`
StorageClass string `config:"storage_class"` StorageClass string `config:"storage_class"`
NoCheckBucket bool `config:"no_check_bucket"` NoCheckBucket bool `config:"no_check_bucket"`
DownloadCompressed bool `config:"download_compressed"`
Enc encoder.MultiEncoder `config:"encoding"` Enc encoder.MultiEncoder `config:"encoding"`
} }
// Fs represents a remote storage server // Fs represents a remote storage server
type Fs struct { type Fs struct {
name string // name of this remote name string // name of this remote
root string // the path we are working on if any root string // the path we are working on if any
opt Options // parsed options opt Options // parsed options
features *fs.Features // optional features features *fs.Features // optional features
svc *storage.Service // the connection to the storage server svc *storage.Service // the connection to the storage server
client *http.Client // authorized client client *http.Client // authorized client
rootBucket string // bucket part of root (if any) rootBucket string // bucket part of root (if any)
rootDirectory string // directory part of root (if any) rootDirectory string // directory part of root (if any)
cache *bucket.Cache // cache of bucket status cache *bucket.Cache // cache of bucket status
pacer *fs.Pacer // To pace the API calls pacer *fs.Pacer // To pace the API calls
warnCompressed sync.Once // warn once about compressed files
} }
// Object describes a storage object // Object describes a storage object
@ -355,6 +373,7 @@ type Object struct {
bytes int64 // Bytes in the object bytes int64 // Bytes in the object
modTime time.Time // Modified time of the object modTime time.Time // Modified time of the object
mimeType string mimeType string
gzipped bool // set if object has Content-Encoding: gzip
} }
// ------------------------------------------------------------ // ------------------------------------------------------------
@ -975,6 +994,7 @@ func (o *Object) setMetaData(info *storage.Object) {
o.url = info.MediaLink o.url = info.MediaLink
o.bytes = int64(info.Size) o.bytes = int64(info.Size)
o.mimeType = info.ContentType o.mimeType = info.ContentType
o.gzipped = info.ContentEncoding == "gzip"
// Read md5sum // Read md5sum
md5sumData, err := base64.StdEncoding.DecodeString(info.Md5Hash) md5sumData, err := base64.StdEncoding.DecodeString(info.Md5Hash)
@ -1013,6 +1033,15 @@ func (o *Object) setMetaData(info *storage.Object) {
} else { } else {
o.modTime = modTime o.modTime = modTime
} }
// If gunzipping then size and md5sum are unknown
if o.gzipped && !o.fs.opt.DownloadCompressed {
o.bytes = -1
o.md5sum = ""
o.fs.warnCompressed.Do(func() {
fs.Logf(o.fs, "Decompressing 'Content-Encoding: gzip' compressed file. Use --gcs-download-compressed to override")
})
}
} }
// readObjectInfo reads the definition for an object // readObjectInfo reads the definition for an object
@ -1113,6 +1142,15 @@ func (o *Object) Open(ctx context.Context, options ...fs.OpenOption) (in io.Read
return nil, err return nil, err
} }
fs.FixRangeOption(options, o.bytes) fs.FixRangeOption(options, o.bytes)
if o.gzipped && o.fs.opt.DownloadCompressed {
// Allow files which are stored on the cloud storage system
// compressed to be downloaded without being decompressed. Note
// that setting this here overrides the automatic decompression
// in the Transport.
//
// See: https://cloud.google.com/storage/docs/transcoding
req.Header.Set("Accept-Encoding", "gzip")
}
fs.OpenOptionAddHTTPHeaders(req.Header, options) fs.OpenOptionAddHTTPHeaders(req.Header, options)
var res *http.Response var res *http.Response
err = o.fs.pacer.Call(func() (bool, error) { err = o.fs.pacer.Call(func() (bool, error) {