mirror of
https://github.com/rclone/rclone.git
synced 2025-06-21 04:08:02 +02:00
filter: Add --hash-filter
to deterministically select a subset of files
Fixes #8400
This commit is contained in:
parent
4e2b78f65d
commit
3fb4164d87
@ -2808,6 +2808,7 @@ For the filtering options
|
|||||||
* `--max-size`
|
* `--max-size`
|
||||||
* `--min-age`
|
* `--min-age`
|
||||||
* `--max-age`
|
* `--max-age`
|
||||||
|
* `--hash-filter`
|
||||||
* `--dump filters`
|
* `--dump filters`
|
||||||
* `--metadata-include`
|
* `--metadata-include`
|
||||||
* `--metadata-include-from`
|
* `--metadata-include-from`
|
||||||
|
@ -718,6 +718,98 @@ old or more.
|
|||||||
|
|
||||||
See [the time option docs](/docs/#time-option) for valid formats.
|
See [the time option docs](/docs/#time-option) for valid formats.
|
||||||
|
|
||||||
|
### `--hash-filter` - Deterministically select a subset of files {#hash-filter}
|
||||||
|
|
||||||
|
The `--hash-filter` flag enables selecting a deterministic subset of files, useful for:
|
||||||
|
|
||||||
|
1. Running large sync operations across multiple machines.
|
||||||
|
2. Checking a subset of files for bitrot.
|
||||||
|
3. Any other operations where a sample of files is required.
|
||||||
|
|
||||||
|
#### Syntax
|
||||||
|
|
||||||
|
The flag takes two parameters expressed as a fraction:
|
||||||
|
|
||||||
|
```
|
||||||
|
--hash-filter K/N
|
||||||
|
```
|
||||||
|
|
||||||
|
- `N`: The total number of partitions (must be a positive integer).
|
||||||
|
- `K`: The specific partition to select (an integer from `0` to `N`).
|
||||||
|
|
||||||
|
For example:
|
||||||
|
- `--hash-filter 1/3`: Selects the first third of the files.
|
||||||
|
- `--hash-filter 2/3` and `--hash-filter 3/3`: Select the second and third partitions, respectively.
|
||||||
|
|
||||||
|
Each partition is non-overlapping, ensuring all files are covered without duplication.
|
||||||
|
|
||||||
|
#### Random Partition Selection
|
||||||
|
|
||||||
|
Use `@` as `K` to randomly select a partition:
|
||||||
|
|
||||||
|
```
|
||||||
|
--hash-filter @/N
|
||||||
|
```
|
||||||
|
|
||||||
|
For example, `--hash-filter @/3` will randomly select a number between 0 and 2. This will stay constant across retries.
|
||||||
|
|
||||||
|
#### How It Works
|
||||||
|
|
||||||
|
- Rclone takes each file's full path, normalizes it to lowercase, and applies Unicode normalization.
|
||||||
|
- It then hashes the normalized path into a 64 bit number.
|
||||||
|
- The hash result is reduced modulo `N` to assign the file to a partition.
|
||||||
|
- If the calculated partition does not match `K` the file is excluded.
|
||||||
|
- Other filters may apply if the file is not excluded.
|
||||||
|
|
||||||
|
**Important:** Rclone will traverse all directories to apply the filter.
|
||||||
|
|
||||||
|
#### Usage Notes
|
||||||
|
|
||||||
|
- Safe to use with `rclone sync`; source and destination selections will match.
|
||||||
|
- **Do not** use with `--delete-excluded`, as this could delete unselected files.
|
||||||
|
- Ignored if `--files-from` is used.
|
||||||
|
|
||||||
|
#### Examples
|
||||||
|
|
||||||
|
##### Dividing files into 4 partitions
|
||||||
|
|
||||||
|
Assuming the current directory contains `file1.jpg` through `file9.jpg`:
|
||||||
|
|
||||||
|
```
|
||||||
|
$ rclone lsf --hash-filter 0/4 .
|
||||||
|
file1.jpg
|
||||||
|
file5.jpg
|
||||||
|
|
||||||
|
$ rclone lsf --hash-filter 1/4 .
|
||||||
|
file3.jpg
|
||||||
|
file6.jpg
|
||||||
|
file9.jpg
|
||||||
|
|
||||||
|
$ rclone lsf --hash-filter 2/4 .
|
||||||
|
file2.jpg
|
||||||
|
file4.jpg
|
||||||
|
|
||||||
|
$ rclone lsf --hash-filter 3/4 .
|
||||||
|
file7.jpg
|
||||||
|
file8.jpg
|
||||||
|
|
||||||
|
$ rclone lsf --hash-filter 4/4 . # the same as --hash-filter 0/4
|
||||||
|
file1.jpg
|
||||||
|
file5.jpg
|
||||||
|
```
|
||||||
|
|
||||||
|
##### Syncing the first quarter of files
|
||||||
|
|
||||||
|
```
|
||||||
|
rclone sync --hash-filter 1/4 source:path destination:path
|
||||||
|
```
|
||||||
|
|
||||||
|
##### Checking a random 1% of files for integrity
|
||||||
|
|
||||||
|
```
|
||||||
|
rclone check --download --hash-filter @/100 source:path destination:path
|
||||||
|
```
|
||||||
|
|
||||||
## Other flags
|
## Other flags
|
||||||
|
|
||||||
### `--delete-excluded` - Delete files on dest excluded from sync
|
### `--delete-excluded` - Delete files on dest excluded from sync
|
||||||
|
@ -3,15 +3,20 @@ package filter
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"context"
|
"context"
|
||||||
|
"crypto/md5"
|
||||||
|
"encoding/binary"
|
||||||
"errors"
|
"errors"
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"math/rand/v2"
|
||||||
"path"
|
"path"
|
||||||
"slices"
|
"slices"
|
||||||
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
"github.com/rclone/rclone/fs"
|
"github.com/rclone/rclone/fs"
|
||||||
"golang.org/x/sync/errgroup"
|
"golang.org/x/sync/errgroup"
|
||||||
|
"golang.org/x/text/unicode/norm"
|
||||||
)
|
)
|
||||||
|
|
||||||
// This is the globally active filter
|
// This is the globally active filter
|
||||||
@ -65,6 +70,11 @@ var OptionsInfo = fs.Options{{
|
|||||||
Default: false,
|
Default: false,
|
||||||
Help: "Ignore case in filters (case insensitive)",
|
Help: "Ignore case in filters (case insensitive)",
|
||||||
Groups: "Filter",
|
Groups: "Filter",
|
||||||
|
}, {
|
||||||
|
Name: "hash_filter",
|
||||||
|
Default: "",
|
||||||
|
Help: "Partition filenames by hash k/n or randomly @/n",
|
||||||
|
Groups: "Filter",
|
||||||
}, {
|
}, {
|
||||||
Name: "filter",
|
Name: "filter",
|
||||||
Default: []string{},
|
Default: []string{},
|
||||||
@ -141,6 +151,7 @@ type Options struct {
|
|||||||
MinSize fs.SizeSuffix `config:"min_size"`
|
MinSize fs.SizeSuffix `config:"min_size"`
|
||||||
MaxSize fs.SizeSuffix `config:"max_size"`
|
MaxSize fs.SizeSuffix `config:"max_size"`
|
||||||
IgnoreCase bool `config:"ignore_case"`
|
IgnoreCase bool `config:"ignore_case"`
|
||||||
|
HashFilter string `config:"hash_filter"`
|
||||||
}
|
}
|
||||||
|
|
||||||
func init() {
|
func init() {
|
||||||
@ -168,6 +179,8 @@ type Filter struct {
|
|||||||
metaRules rules
|
metaRules rules
|
||||||
files FilesMap // files if filesFrom
|
files FilesMap // files if filesFrom
|
||||||
dirs FilesMap // dirs from filesFrom
|
dirs FilesMap // dirs from filesFrom
|
||||||
|
hashFilterN uint64 // if non 0 do hash filtering
|
||||||
|
hashFilterK uint64 // select partition K/N
|
||||||
}
|
}
|
||||||
|
|
||||||
// NewFilter parses the command line options and creates a Filter
|
// NewFilter parses the command line options and creates a Filter
|
||||||
@ -190,10 +203,17 @@ func NewFilter(opt *Options) (f *Filter, err error) {
|
|||||||
if f.Opt.MaxAge.IsSet() {
|
if f.Opt.MaxAge.IsSet() {
|
||||||
f.ModTimeFrom = time.Now().Add(-time.Duration(f.Opt.MaxAge))
|
f.ModTimeFrom = time.Now().Add(-time.Duration(f.Opt.MaxAge))
|
||||||
if !f.ModTimeTo.IsZero() && f.ModTimeTo.Before(f.ModTimeFrom) {
|
if !f.ModTimeTo.IsZero() && f.ModTimeTo.Before(f.ModTimeFrom) {
|
||||||
fs.Fatalf(nil, "filter: --min-age %q can't be larger than --max-age %q", opt.MinAge, opt.MaxAge)
|
return nil, fmt.Errorf("filter: --min-age %q can't be larger than --max-age %q", opt.MinAge, opt.MaxAge)
|
||||||
}
|
}
|
||||||
fs.Debugf(nil, "--max-age %v to %v", f.Opt.MaxAge, f.ModTimeFrom)
|
fs.Debugf(nil, "--max-age %v to %v", f.Opt.MaxAge, f.ModTimeFrom)
|
||||||
}
|
}
|
||||||
|
if f.Opt.HashFilter != "" {
|
||||||
|
f.hashFilterK, f.hashFilterN, err = parseHashFilter(f.Opt.HashFilter)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
fs.Debugf(nil, "Using --hash-filter %d/%d", f.hashFilterK, f.hashFilterN)
|
||||||
|
}
|
||||||
|
|
||||||
err = parseRules(&f.Opt.RulesOpt, f.Add, f.Clear)
|
err = parseRules(&f.Opt.RulesOpt, f.Add, f.Clear)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@ -243,6 +263,32 @@ func NewFilter(opt *Options) (f *Filter, err error) {
|
|||||||
return f, nil
|
return f, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Parse the --hash-filter arguments into k/n
|
||||||
|
func parseHashFilter(hashFilter string) (k, n uint64, err error) {
|
||||||
|
slash := strings.IndexRune(hashFilter, '/')
|
||||||
|
if slash < 0 {
|
||||||
|
return 0, 0, fmt.Errorf("filter: --hash-filter: no / found")
|
||||||
|
}
|
||||||
|
kStr, nStr := hashFilter[:slash], hashFilter[slash+1:]
|
||||||
|
n, err = strconv.ParseUint(nStr, 10, 64)
|
||||||
|
if err != nil {
|
||||||
|
return 0, 0, fmt.Errorf("filter: --hash-filter: can't parse N=%q: %v", nStr, err)
|
||||||
|
}
|
||||||
|
if n == 0 {
|
||||||
|
return 0, 0, fmt.Errorf("filter: --hash-filter: N must be greater than 0")
|
||||||
|
}
|
||||||
|
if kStr == "@" {
|
||||||
|
k = rand.Uint64N(n)
|
||||||
|
} else {
|
||||||
|
k, err = strconv.ParseUint(kStr, 10, 64)
|
||||||
|
if err != nil {
|
||||||
|
return 0, 0, fmt.Errorf("filter: --hash-filter: can't parse K=%q: %v", kStr, err)
|
||||||
|
}
|
||||||
|
k %= n
|
||||||
|
}
|
||||||
|
return k, n, nil
|
||||||
|
}
|
||||||
|
|
||||||
func mustNewFilter(opt *Options) *Filter {
|
func mustNewFilter(opt *Options) *Filter {
|
||||||
f, err := NewFilter(opt)
|
f, err := NewFilter(opt)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@ -367,7 +413,8 @@ func (f *Filter) InActive() bool {
|
|||||||
f.fileRules.len() == 0 &&
|
f.fileRules.len() == 0 &&
|
||||||
f.dirRules.len() == 0 &&
|
f.dirRules.len() == 0 &&
|
||||||
f.metaRules.len() == 0 &&
|
f.metaRules.len() == 0 &&
|
||||||
len(f.Opt.ExcludeFile) == 0)
|
len(f.Opt.ExcludeFile) == 0 &&
|
||||||
|
f.hashFilterN == 0)
|
||||||
}
|
}
|
||||||
|
|
||||||
// IncludeRemote returns whether this remote passes the filter rules.
|
// IncludeRemote returns whether this remote passes the filter rules.
|
||||||
@ -377,6 +424,21 @@ func (f *Filter) IncludeRemote(remote string) bool {
|
|||||||
_, include := f.files[remote]
|
_, include := f.files[remote]
|
||||||
return include
|
return include
|
||||||
}
|
}
|
||||||
|
if f.hashFilterN != 0 {
|
||||||
|
// Normalise the remote first in case we are using a
|
||||||
|
// case insensitive remote or a remote which needs
|
||||||
|
// unicode normalisation. This means all the remotes
|
||||||
|
// which could be normalised together will be in the
|
||||||
|
// same partition.
|
||||||
|
normalized := norm.NFC.String(remote)
|
||||||
|
normalized = strings.ToLower(normalized)
|
||||||
|
hashBytes := md5.Sum([]byte(normalized))
|
||||||
|
hash := binary.LittleEndian.Uint64(hashBytes[:])
|
||||||
|
partition := hash % f.hashFilterN
|
||||||
|
if partition != f.hashFilterK {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
}
|
||||||
return f.fileRules.include(remote)
|
return f.fileRules.include(remote)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -28,6 +28,38 @@ func TestNewFilterDefault(t *testing.T) {
|
|||||||
assert.True(t, f.InActive())
|
assert.True(t, f.InActive())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestParseHashFilter(t *testing.T) {
|
||||||
|
for _, test := range []struct {
|
||||||
|
hashFilter string
|
||||||
|
n uint64
|
||||||
|
k uint64
|
||||||
|
err string
|
||||||
|
}{
|
||||||
|
{hashFilter: "", err: "no / found"},
|
||||||
|
{hashFilter: "17", err: "no / found"},
|
||||||
|
{hashFilter: "-1/2", err: "can't parse K="},
|
||||||
|
{hashFilter: "1/-2", err: "can't parse N="},
|
||||||
|
{hashFilter: "0/0", err: "N must be greater than 0"},
|
||||||
|
{hashFilter: "0/18446744073709551615", k: 0, n: 18446744073709551615},
|
||||||
|
{hashFilter: "0/18446744073709551616", err: "can't parse N="},
|
||||||
|
{hashFilter: "18446744073709551615/1", k: 0, n: 1},
|
||||||
|
{hashFilter: "18446744073709551616/1", err: "can't parse K="},
|
||||||
|
{hashFilter: "1/2", k: 1, n: 2},
|
||||||
|
{hashFilter: "17/3", k: 2, n: 3},
|
||||||
|
{hashFilter: "@/1", k: 0, n: 1},
|
||||||
|
} {
|
||||||
|
gotK, gotN, gotErr := parseHashFilter(test.hashFilter)
|
||||||
|
if test.err != "" {
|
||||||
|
assert.Error(t, gotErr)
|
||||||
|
assert.ErrorContains(t, gotErr, test.err, test.hashFilter)
|
||||||
|
} else {
|
||||||
|
assert.Equal(t, test.k, gotK, test.hashFilter)
|
||||||
|
assert.Equal(t, test.n, gotN, test.hashFilter)
|
||||||
|
assert.NoError(t, gotErr, test.hashFilter)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// testFile creates a temp file with the contents
|
// testFile creates a temp file with the contents
|
||||||
func testFile(t *testing.T, contents string) string {
|
func testFile(t *testing.T, contents string) string {
|
||||||
out, err := os.CreateTemp("", "filter_test")
|
out, err := os.CreateTemp("", "filter_test")
|
||||||
@ -209,6 +241,7 @@ type includeTest struct {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func testInclude(t *testing.T, f *Filter, tests []includeTest) {
|
func testInclude(t *testing.T, f *Filter, tests []includeTest) {
|
||||||
|
t.Helper()
|
||||||
for _, test := range tests {
|
for _, test := range tests {
|
||||||
got := f.Include(test.in, test.size, time.Unix(test.modTime, 0), nil)
|
got := f.Include(test.in, test.size, time.Unix(test.modTime, 0), nil)
|
||||||
assert.Equal(t, test.want, got, fmt.Sprintf("in=%q, size=%v, modTime=%v", test.in, test.size, time.Unix(test.modTime, 0)))
|
assert.Equal(t, test.want, got, fmt.Sprintf("in=%q, size=%v, modTime=%v", test.in, test.size, time.Unix(test.modTime, 0)))
|
||||||
@ -539,6 +572,42 @@ func TestNewFilterMatchesRegexp(t *testing.T) {
|
|||||||
assert.False(t, f.InActive())
|
assert.False(t, f.InActive())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestNewFilterHashFilter(t *testing.T) {
|
||||||
|
const e1 = "filé1.jpg" // one of the unicode E characters
|
||||||
|
const e2 = "filé1.jpg" // a different unicode E character
|
||||||
|
assert.NotEqual(t, e1, e2)
|
||||||
|
for i := 0; i <= 4; i++ {
|
||||||
|
opt := Opt
|
||||||
|
opt.HashFilter = fmt.Sprintf("%d/4", i)
|
||||||
|
opt.ExcludeRule = []string{"*.bin"}
|
||||||
|
f, err := NewFilter(&opt)
|
||||||
|
require.NoError(t, err)
|
||||||
|
t.Run(opt.HashFilter, func(t *testing.T) {
|
||||||
|
testInclude(t, f, []includeTest{
|
||||||
|
{"file1.jpg", 0, 0, i == 0 || i == 4},
|
||||||
|
{"FILE1.jpg", 0, 0, i == 0 || i == 4},
|
||||||
|
{"file2.jpg", 1, 0, i == 2},
|
||||||
|
{"File2.jpg", 1, 0, i == 2},
|
||||||
|
{"file3.jpg", 2, 0, i == 1},
|
||||||
|
{"file4.jpg", 3, 0, i == 2},
|
||||||
|
{"file5.jpg", 4, 0, i == 0 || i == 4},
|
||||||
|
{"file6.jpg", 5, 0, i == 1},
|
||||||
|
{"file7.jpg", 6, 0, i == 3},
|
||||||
|
{"file8.jpg", 7, 0, i == 3},
|
||||||
|
{"file9.jpg", 7, 0, i == 1},
|
||||||
|
{e1, 0, 0, i == 3},
|
||||||
|
{e2, 0, 0, i == 3},
|
||||||
|
{"hello" + e1, 0, 0, i == 2},
|
||||||
|
{"HELLO" + e2, 0, 0, i == 2},
|
||||||
|
{"hello1" + e1, 0, 0, i == 1},
|
||||||
|
{"Hello1" + e2, 0, 0, i == 1},
|
||||||
|
{"exclude.bin", 8, 0, false},
|
||||||
|
})
|
||||||
|
})
|
||||||
|
assert.False(t, f.InActive())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
type includeTestMetadata struct {
|
type includeTestMetadata struct {
|
||||||
in string
|
in string
|
||||||
metadata fs.Metadata
|
metadata fs.Metadata
|
||||||
|
Loading…
x
Reference in New Issue
Block a user