From 910c80bd027848e1c90d78b6455e56912533ebbf Mon Sep 17 00:00:00 2001 From: Ivan Andreev Date: Wed, 9 Oct 2019 13:24:03 +0300 Subject: [PATCH] chunker: option to hash all files --- backend/chunker/chunker.go | 189 ++++++++++++++--------- backend/chunker/chunker_internal_test.go | 26 ++++ docs/content/chunker.md | 46 +++--- 3 files changed, 166 insertions(+), 95 deletions(-) diff --git a/backend/chunker/chunker.go b/backend/chunker/chunker.go index f83dc1a5b..53b4413bc 100644 --- a/backend/chunker/chunker.go +++ b/backend/chunker/chunker.go @@ -164,24 +164,28 @@ It has the following fields: ver, size, nchunks, md5, sha1.`, Name: "hash_type", Advanced: false, Default: "md5", - Help: `Choose how chunker handles hash sums.`, + Help: `Choose how chunker handles hash sums. All modes but "none" require metadata.`, Examples: []fs.OptionExample{{ Value: "none", - Help: `Chunker can pass any hash supported by wrapped remote -for non-chunked files but returns nothing otherwise.`, + Help: `Pass any hash supported by wrapped remote for non-chunked files, return nothing otherwise`, }, { Value: "md5", - Help: `MD5 for composite files. Requires "simplejson".`, + Help: `MD5 for composite files`, }, { Value: "sha1", - Help: `SHA1 for composite files. Requires "simplejson".`, + Help: `SHA1 for composite files`, + }, { + Value: "md5all", + Help: `MD5 for all files`, + }, { + Value: "sha1all", + Help: `SHA1 for all files`, }, { Value: "md5quick", - Help: `Copying a file to chunker will request MD5 from the source -falling back to SHA1 if unsupported. Requires "simplejson".`, + Help: `Copying a file to chunker will request MD5 from the source falling back to SHA1 if unsupported`, }, { Value: "sha1quick", - Help: `Similar to "md5quick" but prefers SHA1 over MD5. 
Requires "simplejson".`, + Help: `Similar to "md5quick" but prefers SHA1 over MD5`, }}, }, { Name: "fail_hard", @@ -240,38 +244,8 @@ func NewFs(name, rpath string, m configmap.Mapper) (fs.Fs, error) { } f.dirSort = true // processEntries requires that meta Objects prerun data chunks atm. - switch opt.MetaFormat { - case "none": - f.useMeta = false - case "simplejson": - f.useMeta = true - default: - return nil, fmt.Errorf("unsupported meta format '%s'", opt.MetaFormat) - } - - requireMetaHash := true - switch opt.HashType { - case "none": - requireMetaHash = false - case "md5": - f.useMD5 = true - case "sha1": - f.useSHA1 = true - case "md5quick": - f.useMD5 = true - f.quickHash = true - case "sha1quick": - f.useSHA1 = true - f.quickHash = true - default: - return nil, fmt.Errorf("unsupported hash type '%s'", opt.HashType) - } - if requireMetaHash && opt.MetaFormat != "simplejson" { - return nil, fmt.Errorf("hash type '%s' requires meta format 'simplejson'", opt.HashType) - } - - if err := f.setChunkNameFormat(opt.NameFormat); err != nil { - return nil, errors.Wrapf(err, "invalid name format '%s'", opt.NameFormat) + if err := f.configure(opt.NameFormat, opt.MetaFormat, opt.HashType); err != nil { + return nil, err } // Handle the tricky case detected by FsMkdir/FsPutFiles/FsIsFile @@ -317,20 +291,87 @@ type Options struct { // Fs represents a wrapped fs.Fs type Fs struct { - name string - root string - base fs.Fs // remote wrapped by chunker overlay - wrapper fs.Fs // wrapper is used by SetWrapper - useMeta bool // false if metadata format is 'none' - useMD5 bool // mutually exclusive with useSHA1 - useSHA1 bool // mutually exclusive with useMD5 - quickHash bool // allows fallback from MD5 to SHA1 and vice versa - dataNameFmt string // name format of data chunks - ctrlNameFmt string // name format of control chunks - nameRegexp *regexp.Regexp // regular expression to match chunk names - opt Options // copy of Options - features *fs.Features // optional features - 
dirSort bool // reserved for future, ignored + name string + root string + base fs.Fs // remote wrapped by chunker overlay + wrapper fs.Fs // wrapper is used by SetWrapper + useMeta bool // false if metadata format is 'none' + useMD5 bool // mutually exclusive with useSHA1 + useSHA1 bool // mutually exclusive with useMD5 + hashFallback bool // allows fallback from MD5 to SHA1 and vice versa + hashAll bool // hash all files, mutually exclusive with hashFallback + dataNameFmt string // name format of data chunks + ctrlNameFmt string // name format of control chunks + nameRegexp *regexp.Regexp // regular expression to match chunk names + opt Options // copy of Options + features *fs.Features // optional features + dirSort bool // reserved for future, ignored +} + +// configure must be called only from NewFs or by unit tests +func (f *Fs) configure(nameFormat, metaFormat, hashType string) error { + if err := f.setChunkNameFormat(nameFormat); err != nil { + return errors.Wrapf(err, "invalid name format '%s'", nameFormat) + } + if err := f.setMetaFormat(metaFormat); err != nil { + return err + } + if err := f.setHashType(hashType); err != nil { + return err + } + return nil +} + +func (f *Fs) setMetaFormat(metaFormat string) error { + switch metaFormat { + case "none": + f.useMeta = false + case "simplejson": + f.useMeta = true + default: + return fmt.Errorf("unsupported meta format '%s'", metaFormat) + } + return nil +} + +// setHashType +// must be called *after* setMetaFormat. +// +// In the "All" mode chunker will force metadata on all files +// if the wrapped remote can't provide given hashsum. 
+func (f *Fs) setHashType(hashType string) error { + f.useMD5 = false + f.useSHA1 = false + f.hashFallback = false + f.hashAll = false + requireMetaHash := true + + switch hashType { + case "none": + requireMetaHash = false + case "md5": + f.useMD5 = true + case "sha1": + f.useSHA1 = true + case "md5quick": + f.useMD5 = true + f.hashFallback = true + case "sha1quick": + f.useSHA1 = true + f.hashFallback = true + case "md5all": + f.useMD5 = true + f.hashAll = !f.base.Hashes().Contains(hash.MD5) + case "sha1all": + f.useSHA1 = true + f.hashAll = !f.base.Hashes().Contains(hash.SHA1) + default: + return fmt.Errorf("unsupported hash type '%s'", hashType) + } + if requireMetaHash && !f.useMeta { + return fmt.Errorf("hash type '%s' requires compatible meta format", hashType) + } + return nil } // setChunkNameFormat converts pattern based chunk name format @@ -877,8 +918,8 @@ func (f *Fs) put(ctx context.Context, in io.Reader, src fs.ObjectInfo, remote st // Finalize small object as non-chunked. // This can be bypassed, and single chunk with metadata will be - // created due to unsafe input. - if !needMeta && f.useMeta { + // created if forced by consistent hashing or due to unsafe input. 
+ if !needMeta && !f.hashAll && f.useMeta { // If previous object was chunked, remove its chunks f.removeOldChunks(ctx, baseRemote) @@ -989,7 +1030,7 @@ func (c *chunkingReader) wrapStream(ctx context.Context, in io.Reader, src fs.Ob switch { case c.fs.useMD5: if c.md5, _ = src.Hash(ctx, hash.MD5); c.md5 == "" { - if c.fs.quickHash { + if c.fs.hashFallback { c.sha1, _ = src.Hash(ctx, hash.SHA1) } else { c.hasher = md5.New() @@ -997,7 +1038,7 @@ func (c *chunkingReader) wrapStream(ctx context.Context, in io.Reader, src fs.Ob } case c.fs.useSHA1: if c.sha1, _ = src.Hash(ctx, hash.SHA1); c.sha1 == "" { - if c.fs.quickHash { + if c.fs.hashFallback { c.md5, _ = src.Hash(ctx, hash.MD5) } else { c.hasher = sha1.New() @@ -1157,11 +1198,11 @@ func (f *Fs) Precision() time.Duration { // Chunker advertises a hash type if and only if it can be calculated // for files of any size, non-chunked or composite. func (f *Fs) Hashes() hash.Set { - // composites && all of them && small files supported by wrapped remote - if f.useMD5 && !f.quickHash && f.base.Hashes().Contains(hash.MD5) { + // composites AND no fallback AND (chunker OR wrapped Fs will hash all non-chunked's) + if f.useMD5 && !f.hashFallback && (f.hashAll || f.base.Hashes().Contains(hash.MD5)) { return hash.NewHashSet(hash.MD5) } - if f.useSHA1 && !f.quickHash && f.base.Hashes().Contains(hash.SHA1) { + if f.useSHA1 && !f.hashFallback && (f.hashAll || f.base.Hashes().Contains(hash.SHA1)) { return hash.NewHashSet(hash.SHA1) } return hash.NewHashSet() // can't provide strong guarantees @@ -1383,14 +1424,14 @@ func (f *Fs) okForServerSide(ctx context.Context, src fs.Object, opName string) case f.useMD5: md5, _ = obj.Hash(ctx, hash.MD5) ok = md5 != "" - if !ok && f.quickHash { + if !ok && f.hashFallback { sha1, _ = obj.Hash(ctx, hash.SHA1) ok = sha1 != "" } case f.useSHA1: sha1, _ = obj.Hash(ctx, hash.SHA1) ok = sha1 != "" - if !ok && f.quickHash { + if !ok && f.hashFallback { md5, _ = obj.Hash(ctx, hash.MD5) ok = md5 != "" } 
@@ -1678,17 +1719,14 @@ func (o *Object) SetModTime(ctx context.Context, mtime time.Time) error { // Hash returns the selected checksum of the file. // If no checksum is available it returns "". +// If a particular hashsum type is not supported, chunker won't fail +// with `unsupported` error but return the empty hash string. // // Currently metadata (if not configured as 'none') is kept only for // composite files, but for non-chunked small files chunker obtains // hashsums from wrapped remote. -// If a particular hashsum type is not supported, chunker won't fail -// with `unsupported` error but return the empty hash string. -// -// In future metadata logic can be extended: if a normal (non-quick) -// hash type is configured, chunker will check whether wrapped remote -// supports it (see Fs.Hashes as an example). If not, it will add metadata -// to small files as well, thus providing hashsums for all files. +// In the "All" mode chunker will force metadata on all files if +// particular hashsum type is unsupported by wrapped remote. // func (o *Object) Hash(ctx context.Context, hashType hash.Type) (string, error) { if !o.isComposite() { @@ -1976,9 +2014,10 @@ type metaSimpleJSON struct { // marshalSimpleJSON // -// Current implementation creates metadata in two cases: +// Current implementation creates metadata in three cases: // - for files larger than chunk size // - if file contents can be mistaken as meta object +// - if consistent hashing is on but wrapped remote can't provide given hash // func marshalSimpleJSON(ctx context.Context, size int64, nChunks int, md5, sha1 string) ([]byte, error) { version := metadataVersion @@ -2000,13 +2039,9 @@ func marshalSimpleJSON(ctx context.Context, size int64, nChunks int, md5, sha1 s } // unmarshalSimpleJSON -// Note: only metadata format version 1 is supported atm. // -// Current implementation creates metadata only for files larger than -// configured chunk size. 
This approach has drawback: availability of -// configured hashsum type for small files depends on the wrapped remote. -// Future versions of chunker may change approach as described in comment -// to the Hash method. They can transparently migrate older metadata. +// Only metadata format version 1 is supported atm. +// Future releases will transparently migrate older metadata objects. // New format will have a higher version number and cannot be correctly // handled by current implementation. // The version check below will then explicitly ask user to upgrade rclone. diff --git a/backend/chunker/chunker_internal_test.go b/backend/chunker/chunker_internal_test.go index 75a78795a..24e8e2165 100644 --- a/backend/chunker/chunker_internal_test.go +++ b/backend/chunker/chunker_internal_test.go @@ -12,6 +12,7 @@ import ( "testing" "github.com/rclone/rclone/fs" + "github.com/rclone/rclone/fs/hash" "github.com/rclone/rclone/fs/operations" "github.com/rclone/rclone/fstest" "github.com/rclone/rclone/fstest/fstests" @@ -276,6 +277,11 @@ func testSmallFileInternals(t *testing.T, f *Fs) { assert.Nil(t, o.main) assert.True(t, o.isComposite()) // sorry, sometimes a name is misleading assert.Equal(t, 1, len(o.chunks)) + case f.hashAll: + // Consistent hashing forces meta object on small files too + assert.NotNil(t, o.main) + assert.True(t, o.isComposite()) + assert.Equal(t, 1, len(o.chunks)) default: // normally non-chunked file is kept in the Object's main field assert.NotNil(t, o.main) @@ -300,6 +306,24 @@ func testSmallFileInternals(t *testing.T, f *Fs) { _ = r.Close() } + checkHashsum := func(obj fs.Object) { + var ht hash.Type + switch { + case !f.hashAll: + return + case f.useMD5: + ht = hash.MD5 + case f.useSHA1: + ht = hash.SHA1 + default: + return + } + // even empty files must have hashsum in consistent mode + sum, err := obj.Hash(ctx, ht) + assert.NoError(t, err) + assert.NotEqual(t, sum, "") + } + checkSmallFile := func(name, contents string) { filename := 
path.Join(dir, name) item := fstest.Item{Path: filename, ModTime: modTime} @@ -307,6 +331,7 @@ func testSmallFileInternals(t *testing.T, f *Fs) { assert.NotNil(t, put) checkSmallFileInternals(put) checkContents(put, contents) + checkHashsum(put) // objects returned by Put and NewObject must have similar structure obj, err := f.NewObject(ctx, filename) @@ -314,6 +339,7 @@ func testSmallFileInternals(t *testing.T, f *Fs) { assert.NotNil(t, obj) checkSmallFileInternals(obj) checkContents(obj, contents) + checkHashsum(obj) _ = obj.Remove(ctx) _ = put.Remove(ctx) // for good diff --git a/docs/content/chunker.md b/docs/content/chunker.md index 8921c595a..ae0eb73d5 100644 --- a/docs/content/chunker.md +++ b/docs/content/chunker.md @@ -46,20 +46,22 @@ remote> remote:path Files larger than chunk size will be split in chunks. Enter a size with suffix k,M,G,T. Press Enter for the default ("2G"). chunk_size> 100M -Choose how chunker handles hash sums. +Choose how chunker handles hash sums. All modes but "none" require metadata. Enter a string value. Press Enter for the default ("md5"). Choose a number from below, or type in your own value - / Chunker can pass any hash supported by wrapped remote - 1 | for non-chunked files but returns nothing otherwise. + 1 / Pass any hash supported by wrapped remote for non-chunked files, return nothing otherwise \ "none" - 2 / MD5 for composite files. Requires "simplejson". + 2 / MD5 for composite files \ "md5" - 3 / SHA1 for composite files. Requires "simplejson". + 3 / SHA1 for composite files \ "sha1" - / Copying a file to chunker will request MD5 from the source - 4 | falling back to SHA1 if unsupported. Requires "simplejson". + 4 / MD5 for all files + \ "md5all" + 5 / SHA1 for all files + \ "sha1all" + 6 / Copying a file to chunker will request MD5 from the source falling back to SHA1 if unsupported \ "md5quick" - 5 / Similar to "md5quick" but prefers SHA1 over MD5. Requires "simplejson". 
+ 7 / Similar to "md5quick" but prefers SHA1 over MD5 \ "sha1quick" hash_type> md5 Edit advanced config? (y/n) @@ -190,8 +192,8 @@ Chunker supports hashsums only when a compatible metadata is present. Hence, if you choose metadata format of `none`, chunker will report hashsum as `UNSUPPORTED`. -Please note that metadata is stored only for composite files. If a file -is small (smaller than configured chunk size), chunker will transparently +Please note that by default metadata is stored only for composite files. +If a file is smaller than configured chunk size, chunker will transparently redirect hash requests to wrapped remote, so support depends on that. You will see the empty string as a hashsum of requested type for small files if the wrapped remote doesn't support it. @@ -204,6 +206,12 @@ wrapped remote hash for non-chunked ones, we advise you to choose the same hash type as supported by wrapped remote so that your file listings look coherent. +If your storage backend does not support MD5 or SHA1 but you need consistent +file hashing, configure chunker with `md5all` or `sha1all`. These two modes +guarantee the given hash for all files. If the wrapped remote doesn't support it, +chunker will then add metadata to all files, even small ones. However, this can +double the number of small files in storage and incur additional service charges. + Normally, when a file is copied to chunker controlled remote, chunker will ask the file source for compatible file hash and revert to on-the-fly calculation if none is found. This involves some CPU overhead but provides @@ -309,7 +317,7 @@ Files larger than chunk size will be split in chunks. #### --chunker-hash-type -Choose how chunker handles hash sums. +Choose how chunker handles hash sums. All modes but "none" require metadata. - Config: hash_type - Env Var: RCLONE_CHUNKER_HASH_TYPE @@ -317,17 +325,19 @@ Choose how chunker handles hash sums.
- Default: "md5" - Examples: - "none" - - Chunker can pass any hash supported by wrapped remote - - for non-chunked files but returns nothing otherwise. + - Pass any hash supported by wrapped remote for non-chunked files, return nothing otherwise - "md5" - - MD5 for composite files. Requires "simplejson". + - MD5 for composite files - "sha1" - - SHA1 for composite files. Requires "simplejson". + - SHA1 for composite files + - "md5all" + - MD5 for all files + - "sha1all" + - SHA1 for all files - "md5quick" - - Copying a file to chunker will request MD5 from the source - - falling back to SHA1 if unsupported. Requires "simplejson". + - Copying a file to chunker will request MD5 from the source falling back to SHA1 if unsupported - "sha1quick" - - Similar to "md5quick" but prefers SHA1 over MD5. Requires "simplejson". + - Similar to "md5quick" but prefers SHA1 over MD5 ### Advanced Options