diff --git a/docs/content/docs.md b/docs/content/docs.md index 17c909fea..4b15b37b5 100644 --- a/docs/content/docs.md +++ b/docs/content/docs.md @@ -2808,6 +2808,7 @@ For the filtering options * `--max-size` * `--min-age` * `--max-age` + * `--hash-filter` * `--dump filters` * `--metadata-include` * `--metadata-include-from` diff --git a/docs/content/filtering.md b/docs/content/filtering.md index 8c7ce232f..dc881e8bf 100644 --- a/docs/content/filtering.md +++ b/docs/content/filtering.md @@ -718,6 +718,98 @@ old or more. See [the time option docs](/docs/#time-option) for valid formats. +### `--hash-filter` - Deterministically select a subset of files {#hash-filter} + +The `--hash-filter` flag enables selecting a deterministic subset of files, useful for: + +1. Running large sync operations across multiple machines. +2. Checking a subset of files for bitrot. +3. Any other operations where a sample of files is required. + +#### Syntax + +The flag takes two parameters expressed as a fraction: + +``` +--hash-filter K/N +``` + +- `N`: The total number of partitions (must be a positive integer). +- `K`: The specific partition to select (an integer from `0` to `N`; `K` is reduced modulo `N`, so `N` selects the same partition as `0`). + +For example: +- `--hash-filter 1/3`: Selects the first of three partitions (roughly a third of the files). +- `--hash-filter 2/3` and `--hash-filter 3/3`: Select the second and third partitions, respectively. + +The partitions are non-overlapping, ensuring all files are covered without duplication. + +#### Random Partition Selection + +Use `@` as `K` to randomly select a partition: + +``` +--hash-filter @/N +``` + +For example, `--hash-filter @/3` will randomly select a partition number between 0 and 2 (inclusive). This will stay constant across retries. + +#### How It Works + +- Rclone takes each file's full path, applies Unicode normalization, and converts it to lowercase. +- It then hashes the normalized path into a 64-bit number. +- The hash result is reduced modulo `N` to assign the file to a partition. 
+- If the calculated partition does not match `K`, the file is excluded. +- If the file is not excluded, any other filters are then applied to it. + +**Important:** Rclone will traverse all directories to apply the filter. + +#### Usage Notes + +- Safe to use with `rclone sync`; source and destination selections will match. +- **Do not** use with `--delete-excluded`, as this could delete unselected files. +- Ignored if `--files-from` is used. + +#### Examples + +##### Dividing files into 4 partitions + +Assuming the current directory contains `file1.jpg` through `file9.jpg`: + +``` +$ rclone lsf --hash-filter 0/4 . +file1.jpg +file5.jpg + +$ rclone lsf --hash-filter 1/4 . +file3.jpg +file6.jpg +file9.jpg + +$ rclone lsf --hash-filter 2/4 . +file2.jpg +file4.jpg + +$ rclone lsf --hash-filter 3/4 . +file7.jpg +file8.jpg + +$ rclone lsf --hash-filter 4/4 . # the same as --hash-filter 0/4 +file1.jpg +file5.jpg +``` + +##### Syncing the first quarter of files + +``` +rclone sync --hash-filter 1/4 source:path destination:path +``` + +##### Checking a random 1% of files for integrity + +``` +rclone check --download --hash-filter @/100 source:path destination:path +``` + ## Other flags ### `--delete-excluded` - Delete files on dest excluded from sync diff --git a/fs/filter/filter.go b/fs/filter/filter.go index e978074d6..0bbf3e69d 100644 --- a/fs/filter/filter.go +++ b/fs/filter/filter.go @@ -3,15 +3,20 @@ package filter import ( "context" + "crypto/md5" + "encoding/binary" "errors" "fmt" + "math/rand/v2" "path" "slices" + "strconv" "strings" "time" "github.com/rclone/rclone/fs" "golang.org/x/sync/errgroup" + "golang.org/x/text/unicode/norm" ) // This is the globally active filter @@ -65,6 +70,11 @@ var OptionsInfo = fs.Options{{ Default: false, Help: "Ignore case in filters (case insensitive)", Groups: "Filter", +}, { + Name: "hash_filter", + Default: "", + Help: "Partition filenames by hash k/n or randomly @/n", + Groups: "Filter", }, { Name: "filter", Default: []string{}, @@ -141,6 +151,7 
@@ type Options struct { MinSize fs.SizeSuffix `config:"min_size"` MaxSize fs.SizeSuffix `config:"max_size"` IgnoreCase bool `config:"ignore_case"` + HashFilter string `config:"hash_filter"` } func init() { @@ -168,6 +179,8 @@ type Filter struct { metaRules rules files FilesMap // files if filesFrom dirs FilesMap // dirs from filesFrom + hashFilterN uint64 // if non 0 do hash filtering + hashFilterK uint64 // select partition K/N } // NewFilter parses the command line options and creates a Filter @@ -190,10 +203,17 @@ func NewFilter(opt *Options) (f *Filter, err error) { if f.Opt.MaxAge.IsSet() { f.ModTimeFrom = time.Now().Add(-time.Duration(f.Opt.MaxAge)) if !f.ModTimeTo.IsZero() && f.ModTimeTo.Before(f.ModTimeFrom) { - fs.Fatalf(nil, "filter: --min-age %q can't be larger than --max-age %q", opt.MinAge, opt.MaxAge) + return nil, fmt.Errorf("filter: --min-age %q can't be larger than --max-age %q", opt.MinAge, opt.MaxAge) } fs.Debugf(nil, "--max-age %v to %v", f.Opt.MaxAge, f.ModTimeFrom) } + if f.Opt.HashFilter != "" { + f.hashFilterK, f.hashFilterN, err = parseHashFilter(f.Opt.HashFilter) + if err != nil { + return nil, err + } + fs.Debugf(nil, "Using --hash-filter %d/%d", f.hashFilterK, f.hashFilterN) + } err = parseRules(&f.Opt.RulesOpt, f.Add, f.Clear) if err != nil { @@ -243,6 +263,32 @@ func NewFilter(opt *Options) (f *Filter, err error) { return f, nil } +// Parse the --hash-filter arguments into k/n +func parseHashFilter(hashFilter string) (k, n uint64, err error) { + slash := strings.IndexRune(hashFilter, '/') + if slash < 0 { + return 0, 0, fmt.Errorf("filter: --hash-filter: no / found") + } + kStr, nStr := hashFilter[:slash], hashFilter[slash+1:] + n, err = strconv.ParseUint(nStr, 10, 64) + if err != nil { + return 0, 0, fmt.Errorf("filter: --hash-filter: can't parse N=%q: %v", nStr, err) + } + if n == 0 { + return 0, 0, fmt.Errorf("filter: --hash-filter: N must be greater than 0") + } + if kStr == "@" { + k = rand.Uint64N(n) + } else { + k, err = 
strconv.ParseUint(kStr, 10, 64) + if err != nil { + return 0, 0, fmt.Errorf("filter: --hash-filter: can't parse K=%q: %v", kStr, err) + } + k %= n + } + return k, n, nil +} + func mustNewFilter(opt *Options) *Filter { f, err := NewFilter(opt) if err != nil { @@ -367,7 +413,8 @@ func (f *Filter) InActive() bool { f.fileRules.len() == 0 && f.dirRules.len() == 0 && f.metaRules.len() == 0 && - len(f.Opt.ExcludeFile) == 0) + len(f.Opt.ExcludeFile) == 0 && + f.hashFilterN == 0) } // IncludeRemote returns whether this remote passes the filter rules. @@ -377,6 +424,21 @@ func (f *Filter) IncludeRemote(remote string) bool { _, include := f.files[remote] return include } + if f.hashFilterN != 0 { + // Normalise the remote first in case we are using a + // case insensitive remote or a remote which needs + // unicode normalisation. This means all the remotes + // which could be normalised together will be in the + // same partition. + normalized := norm.NFC.String(remote) + normalized = strings.ToLower(normalized) + hashBytes := md5.Sum([]byte(normalized)) + hash := binary.LittleEndian.Uint64(hashBytes[:]) + partition := hash % f.hashFilterN + if partition != f.hashFilterK { + return false + } + } return f.fileRules.include(remote) } diff --git a/fs/filter/filter_test.go b/fs/filter/filter_test.go index ef7a1440b..a5e82d3ab 100644 --- a/fs/filter/filter_test.go +++ b/fs/filter/filter_test.go @@ -28,6 +28,38 @@ func TestNewFilterDefault(t *testing.T) { assert.True(t, f.InActive()) } +func TestParseHashFilter(t *testing.T) { + for _, test := range []struct { + hashFilter string + n uint64 + k uint64 + err string + }{ + {hashFilter: "", err: "no / found"}, + {hashFilter: "17", err: "no / found"}, + {hashFilter: "-1/2", err: "can't parse K="}, + {hashFilter: "1/-2", err: "can't parse N="}, + {hashFilter: "0/0", err: "N must be greater than 0"}, + {hashFilter: "0/18446744073709551615", k: 0, n: 18446744073709551615}, + {hashFilter: "0/18446744073709551616", err: "can't parse N="}, 
+ {hashFilter: "18446744073709551615/1", k: 0, n: 1}, + {hashFilter: "18446744073709551616/1", err: "can't parse K="}, + {hashFilter: "1/2", k: 1, n: 2}, + {hashFilter: "17/3", k: 2, n: 3}, + {hashFilter: "@/1", k: 0, n: 1}, + } { + gotK, gotN, gotErr := parseHashFilter(test.hashFilter) + if test.err != "" { + assert.Error(t, gotErr) + assert.ErrorContains(t, gotErr, test.err, test.hashFilter) + } else { + assert.Equal(t, test.k, gotK, test.hashFilter) + assert.Equal(t, test.n, gotN, test.hashFilter) + assert.NoError(t, gotErr, test.hashFilter) + } + } +} + // testFile creates a temp file with the contents func testFile(t *testing.T, contents string) string { out, err := os.CreateTemp("", "filter_test") @@ -209,6 +241,7 @@ type includeTest struct { } func testInclude(t *testing.T, f *Filter, tests []includeTest) { + t.Helper() for _, test := range tests { got := f.Include(test.in, test.size, time.Unix(test.modTime, 0), nil) assert.Equal(t, test.want, got, fmt.Sprintf("in=%q, size=%v, modTime=%v", test.in, test.size, time.Unix(test.modTime, 0))) @@ -539,6 +572,42 @@ func TestNewFilterMatchesRegexp(t *testing.T) { assert.False(t, f.InActive()) } +func TestNewFilterHashFilter(t *testing.T) { + const e1 = "filé1.jpg" // one of the unicode E characters + const e2 = "filé1.jpg" // a different unicode E character + assert.NotEqual(t, e1, e2) + for i := 0; i <= 4; i++ { + opt := Opt + opt.HashFilter = fmt.Sprintf("%d/4", i) + opt.ExcludeRule = []string{"*.bin"} + f, err := NewFilter(&opt) + require.NoError(t, err) + t.Run(opt.HashFilter, func(t *testing.T) { + testInclude(t, f, []includeTest{ + {"file1.jpg", 0, 0, i == 0 || i == 4}, + {"FILE1.jpg", 0, 0, i == 0 || i == 4}, + {"file2.jpg", 1, 0, i == 2}, + {"File2.jpg", 1, 0, i == 2}, + {"file3.jpg", 2, 0, i == 1}, + {"file4.jpg", 3, 0, i == 2}, + {"file5.jpg", 4, 0, i == 0 || i == 4}, + {"file6.jpg", 5, 0, i == 1}, + {"file7.jpg", 6, 0, i == 3}, + {"file8.jpg", 7, 0, i == 3}, + {"file9.jpg", 7, 0, i == 1}, + {e1, 0, 0, i 
== 3}, + {e2, 0, 0, i == 3}, + {"hello" + e1, 0, 0, i == 2}, + {"HELLO" + e2, 0, 0, i == 2}, + {"hello1" + e1, 0, 0, i == 1}, + {"Hello1" + e2, 0, 0, i == 1}, + {"exclude.bin", 8, 0, false}, + }) + }) + assert.False(t, f.InActive()) + } +} + type includeTestMetadata struct { in string metadata fs.Metadata