mirror of
https://github.com/rclone/rclone.git
synced 2024-11-22 16:34:30 +01:00
dedupe: add --by-hash to dedupe on hash not file name - fixes #1674
This commit is contained in:
parent
e073720a8f
commit
507f861c67
@ -12,12 +12,14 @@ import (
|
||||
|
||||
var (
|
||||
dedupeMode = operations.DeduplicateInteractive
|
||||
byHash = false
|
||||
)
|
||||
|
||||
func init() {
|
||||
cmd.Root.AddCommand(commandDefinition)
|
||||
cmdFlag := commandDefinition.Flags()
|
||||
flags.FVarP(cmdFlag, &dedupeMode, "dedupe-mode", "", "Dedupe mode interactive|skip|first|newest|oldest|largest|smallest|rename.")
|
||||
flags.BoolVarP(cmdFlag, &byHash, "by-hash", "", false, "Find indentical hashes rather than names")
|
||||
}
|
||||
|
||||
var commandDefinition = &cobra.Command{
|
||||
@ -27,20 +29,26 @@ var commandDefinition = &cobra.Command{
|
||||
|
||||
By default ` + "`dedupe`" + ` interactively finds files with duplicate
|
||||
names and offers to delete all but one or rename them to be
|
||||
different.
|
||||
different. This is known as deduping by name.
|
||||
|
||||
This is only useful with backends like Google Drive which can have
|
||||
duplicate file names. It can be run on wrapping backends (e.g. crypt) if
|
||||
they wrap a backend which supports duplicate file names.
|
||||
Deduping by name is only useful with backends like Google Drive which
|
||||
can have duplicate file names. It can be run on wrapping backends
|
||||
(e.g. crypt) if they wrap a backend which supports duplicate file
|
||||
names.
|
||||
|
||||
In the first pass it will merge directories with the same name. It
|
||||
will do this iteratively until all the identically named directories
|
||||
have been merged.
|
||||
However if --by-hash is passed in then dedupe will find files with
|
||||
duplicate hashes instead which will work on any backend which supports
|
||||
at least one hash. This can be used to find files with duplicate
|
||||
content. This is known as deduping by hash.
|
||||
|
||||
In the second pass, for every group of duplicate file names, it will
|
||||
delete all but one identical files it finds without confirmation.
|
||||
This means that for most duplicated files the ` + "`dedupe`" + `
|
||||
command will not be interactive.
|
||||
If deduping by name, first rclone will merge directories with the same
|
||||
name. It will do this iteratively until all the identically named
|
||||
directories have been merged.
|
||||
|
||||
Next, if deduping by name, for every group of duplicate file names /
|
||||
hashes, it will delete all but one identical files it finds without
|
||||
confirmation. This means that for most duplicated files the ` +
|
||||
"`dedupe`" + ` command will not be interactive.
|
||||
|
||||
` + "`dedupe`" + ` considers files to be identical if they have the
|
||||
same file path and the same hash. If the backend does not support hashes (e.g. crypt wrapping
|
||||
@ -49,6 +57,10 @@ use the ` + "`--size-only`" + ` flag then files will be considered
|
||||
identical if they have the same size (any hash will be ignored). This
|
||||
can be useful on crypt backends which do not support hashes.
|
||||
|
||||
Next rclone will resolve the remaining duplicates. Exactly which
|
||||
action is taken depends on the dedupe mode. By default rclone will
|
||||
interactively query the user for each one.
|
||||
|
||||
**Important**: Since this can cause data loss, test first with the
|
||||
` + "`--dry-run` or the `--interactive`/`-i`" + ` flag.
|
||||
|
||||
@ -131,7 +143,7 @@ Or
|
||||
}
|
||||
fdst := cmd.NewFsSrc(args)
|
||||
cmd.Run(false, false, command, func() error {
|
||||
return operations.Deduplicate(context.Background(), fdst, dedupeMode)
|
||||
return operations.Deduplicate(context.Background(), fdst, dedupeMode, byHash)
|
||||
})
|
||||
},
|
||||
}
|
||||
|
@ -139,7 +139,7 @@ func dedupeDeleteIdentical(ctx context.Context, ht hash.Type, remote string, obj
|
||||
}
|
||||
|
||||
// dedupeInteractive interactively dedupes the slice of objects
|
||||
func dedupeInteractive(ctx context.Context, f fs.Fs, ht hash.Type, remote string, objs []fs.Object) {
|
||||
func dedupeInteractive(ctx context.Context, f fs.Fs, ht hash.Type, remote string, objs []fs.Object, byHash bool) {
|
||||
fmt.Printf("%s: %d duplicates remain\n", remote, len(objs))
|
||||
for i, o := range objs {
|
||||
hashValue := ""
|
||||
@ -150,9 +150,17 @@ func dedupeInteractive(ctx context.Context, f fs.Fs, ht hash.Type, remote string
|
||||
hashValue = err.Error()
|
||||
}
|
||||
}
|
||||
fmt.Printf(" %d: %12d bytes, %s, %v %32s\n", i+1, o.Size(), o.ModTime(ctx).Local().Format("2006-01-02 15:04:05.000000000"), ht, hashValue)
|
||||
if byHash {
|
||||
fmt.Printf(" %d: %12d bytes, %s, %s\n", i+1, o.Size(), o.ModTime(ctx).Local().Format("2006-01-02 15:04:05.000000000"), o.Remote())
|
||||
} else {
|
||||
fmt.Printf(" %d: %12d bytes, %s, %v %32s\n", i+1, o.Size(), o.ModTime(ctx).Local().Format("2006-01-02 15:04:05.000000000"), ht, hashValue)
|
||||
}
|
||||
}
|
||||
switch config.Command([]string{"sSkip and do nothing", "kKeep just one (choose which in next step)", "rRename all to be different (by changing file.jpg to file-1.jpg)"}) {
|
||||
commands := []string{"sSkip and do nothing", "kKeep just one (choose which in next step)"}
|
||||
if !byHash {
|
||||
commands = append(commands, "rRename all to be different (by changing file.jpg to file-1.jpg)")
|
||||
}
|
||||
switch config.Command(commands) {
|
||||
case 's':
|
||||
case 'k':
|
||||
keep := config.ChooseNumber("Enter the number of the file to keep", 1, len(objs))
|
||||
@ -298,31 +306,51 @@ func sortSmallestFirst(objs []fs.Object) {
|
||||
// Deduplicate interactively finds duplicate files and offers to
|
||||
// delete all but one or rename them to be different. Only useful with
|
||||
// Google Drive which can have duplicate file names.
|
||||
func Deduplicate(ctx context.Context, f fs.Fs, mode DeduplicateMode) error {
|
||||
fs.Infof(f, "Looking for duplicates using %v mode.", mode)
|
||||
func Deduplicate(ctx context.Context, f fs.Fs, mode DeduplicateMode, byHash bool) error {
|
||||
ci := fs.GetConfig(ctx)
|
||||
// find a hash to use
|
||||
ht := f.Hashes().GetOne()
|
||||
what := "names"
|
||||
if byHash {
|
||||
if ht == hash.None {
|
||||
return errors.Errorf("%v has no hashes", f)
|
||||
}
|
||||
what = ht.String() + " hashes"
|
||||
}
|
||||
fs.Infof(f, "Looking for duplicate %s using %v mode.", what, mode)
|
||||
|
||||
// Find duplicate directories first and fix them
|
||||
duplicateDirs, err := dedupeFindDuplicateDirs(ctx, f)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if len(duplicateDirs) != 0 {
|
||||
err = dedupeMergeDuplicateDirs(ctx, f, duplicateDirs)
|
||||
if !byHash {
|
||||
duplicateDirs, err := dedupeFindDuplicateDirs(ctx, f)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if len(duplicateDirs) != 0 {
|
||||
err = dedupeMergeDuplicateDirs(ctx, f, duplicateDirs)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// find a hash to use
|
||||
ht := f.Hashes().GetOne()
|
||||
|
||||
// Now find duplicate files
|
||||
files := map[string][]fs.Object{}
|
||||
err = walk.ListR(ctx, f, "", true, ci.MaxDepth, walk.ListObjects, func(entries fs.DirEntries) error {
|
||||
err := walk.ListR(ctx, f, "", true, ci.MaxDepth, walk.ListObjects, func(entries fs.DirEntries) error {
|
||||
entries.ForObject(func(o fs.Object) {
|
||||
remote := o.Remote()
|
||||
files[remote] = append(files[remote], o)
|
||||
var remote string
|
||||
var err error
|
||||
if byHash {
|
||||
remote, err = o.Hash(ctx, ht)
|
||||
if err != nil {
|
||||
fs.Errorf(o, "Failed to hash: %v", err)
|
||||
remote = ""
|
||||
}
|
||||
} else {
|
||||
remote = o.Remote()
|
||||
}
|
||||
if remote != "" {
|
||||
files[remote] = append(files[remote], o)
|
||||
}
|
||||
})
|
||||
return nil
|
||||
})
|
||||
@ -332,15 +360,17 @@ func Deduplicate(ctx context.Context, f fs.Fs, mode DeduplicateMode) error {
|
||||
|
||||
for remote, objs := range files {
|
||||
if len(objs) > 1 {
|
||||
fs.Logf(remote, "Found %d files with duplicate names", len(objs))
|
||||
objs = dedupeDeleteIdentical(ctx, ht, remote, objs)
|
||||
if len(objs) <= 1 {
|
||||
fs.Logf(remote, "All duplicates removed")
|
||||
continue
|
||||
fs.Logf(remote, "Found %d files with duplicate %s", len(objs), what)
|
||||
if !byHash {
|
||||
objs = dedupeDeleteIdentical(ctx, ht, remote, objs)
|
||||
if len(objs) <= 1 {
|
||||
fs.Logf(remote, "All duplicates removed")
|
||||
continue
|
||||
}
|
||||
}
|
||||
switch mode {
|
||||
case DeduplicateInteractive:
|
||||
dedupeInteractive(ctx, f, ht, remote, objs)
|
||||
dedupeInteractive(ctx, f, ht, remote, objs, byHash)
|
||||
case DeduplicateFirst:
|
||||
dedupeDeleteAllButOne(ctx, 0, remote, objs)
|
||||
case DeduplicateNewest:
|
||||
@ -358,7 +388,7 @@ func Deduplicate(ctx context.Context, f fs.Fs, mode DeduplicateMode) error {
|
||||
sortSmallestFirst(objs)
|
||||
dedupeDeleteAllButOne(ctx, 0, remote, objs)
|
||||
case DeduplicateSkip:
|
||||
fs.Logf(remote, "Skipping %d files with duplicate names", len(objs))
|
||||
fs.Logf(remote, "Skipping %d files with duplicate names %s", len(objs), what)
|
||||
default:
|
||||
//skip
|
||||
}
|
||||
|
@ -10,6 +10,7 @@ import (
|
||||
"github.com/rclone/rclone/fs/operations"
|
||||
"github.com/rclone/rclone/fs/walk"
|
||||
"github.com/rclone/rclone/fstest"
|
||||
"github.com/rclone/rclone/lib/random"
|
||||
"github.com/spf13/pflag"
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
@ -36,6 +37,12 @@ func skipIfNoHash(t *testing.T, f fs.Fs) {
|
||||
}
|
||||
}
|
||||
|
||||
func skipIfNoModTime(t *testing.T, f fs.Fs) {
|
||||
if f.Precision() >= fs.ModTimeNotSupported {
|
||||
t.Skip("Can't run this test without modtimes")
|
||||
}
|
||||
}
|
||||
|
||||
func TestDeduplicateInteractive(t *testing.T) {
|
||||
r := fstest.NewRun(t)
|
||||
defer r.Finalise()
|
||||
@ -47,7 +54,7 @@ func TestDeduplicateInteractive(t *testing.T) {
|
||||
file3 := r.WriteUncheckedObject(context.Background(), "one", "This is one", t1)
|
||||
r.CheckWithDuplicates(t, file1, file2, file3)
|
||||
|
||||
err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateInteractive)
|
||||
err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateInteractive, false)
|
||||
require.NoError(t, err)
|
||||
|
||||
fstest.CheckItems(t, r.Fremote, file1)
|
||||
@ -69,7 +76,7 @@ func TestDeduplicateSkip(t *testing.T) {
|
||||
files = append(files, file3)
|
||||
r.CheckWithDuplicates(t, files...)
|
||||
|
||||
err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateSkip)
|
||||
err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateSkip, false)
|
||||
require.NoError(t, err)
|
||||
|
||||
r.CheckWithDuplicates(t, file1, file3)
|
||||
@ -92,7 +99,7 @@ func TestDeduplicateSizeOnly(t *testing.T) {
|
||||
ci.SizeOnly = false
|
||||
}()
|
||||
|
||||
err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateSkip)
|
||||
err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateSkip, false)
|
||||
require.NoError(t, err)
|
||||
|
||||
r.CheckWithDuplicates(t, file1, file3)
|
||||
@ -108,7 +115,7 @@ func TestDeduplicateFirst(t *testing.T) {
|
||||
file3 := r.WriteUncheckedObject(context.Background(), "one", "This is one BB", t1)
|
||||
r.CheckWithDuplicates(t, file1, file2, file3)
|
||||
|
||||
err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateFirst)
|
||||
err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateFirst, false)
|
||||
require.NoError(t, err)
|
||||
|
||||
// list until we get one object
|
||||
@ -131,18 +138,38 @@ func TestDeduplicateNewest(t *testing.T) {
|
||||
r := fstest.NewRun(t)
|
||||
defer r.Finalise()
|
||||
skipIfCantDedupe(t, r.Fremote)
|
||||
skipIfNoModTime(t, r.Fremote)
|
||||
|
||||
file1 := r.WriteUncheckedObject(context.Background(), "one", "This is one", t1)
|
||||
file2 := r.WriteUncheckedObject(context.Background(), "one", "This is one too", t2)
|
||||
file3 := r.WriteUncheckedObject(context.Background(), "one", "This is another one", t3)
|
||||
r.CheckWithDuplicates(t, file1, file2, file3)
|
||||
|
||||
err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateNewest)
|
||||
err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateNewest, false)
|
||||
require.NoError(t, err)
|
||||
|
||||
fstest.CheckItems(t, r.Fremote, file3)
|
||||
}
|
||||
|
||||
func TestDeduplicateNewestByHash(t *testing.T) {
|
||||
r := fstest.NewRun(t)
|
||||
defer r.Finalise()
|
||||
skipIfNoHash(t, r.Fremote)
|
||||
skipIfNoModTime(t, r.Fremote)
|
||||
contents := random.String(100)
|
||||
|
||||
file1 := r.WriteObject(context.Background(), "one", contents, t1)
|
||||
file2 := r.WriteObject(context.Background(), "also/one", contents, t2)
|
||||
file3 := r.WriteObject(context.Background(), "another", contents, t3)
|
||||
file4 := r.WriteObject(context.Background(), "not-one", "stuff", t3)
|
||||
fstest.CheckItems(t, r.Fremote, file1, file2, file3, file4)
|
||||
|
||||
err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateNewest, true)
|
||||
require.NoError(t, err)
|
||||
|
||||
fstest.CheckItems(t, r.Fremote, file3, file4)
|
||||
}
|
||||
|
||||
func TestDeduplicateOldest(t *testing.T) {
|
||||
r := fstest.NewRun(t)
|
||||
defer r.Finalise()
|
||||
@ -153,7 +180,7 @@ func TestDeduplicateOldest(t *testing.T) {
|
||||
file3 := r.WriteUncheckedObject(context.Background(), "one", "This is another one", t3)
|
||||
r.CheckWithDuplicates(t, file1, file2, file3)
|
||||
|
||||
err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateOldest)
|
||||
err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateOldest, false)
|
||||
require.NoError(t, err)
|
||||
|
||||
fstest.CheckItems(t, r.Fremote, file1)
|
||||
@ -169,7 +196,7 @@ func TestDeduplicateLargest(t *testing.T) {
|
||||
file3 := r.WriteUncheckedObject(context.Background(), "one", "This is another one", t3)
|
||||
r.CheckWithDuplicates(t, file1, file2, file3)
|
||||
|
||||
err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateLargest)
|
||||
err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateLargest, false)
|
||||
require.NoError(t, err)
|
||||
|
||||
fstest.CheckItems(t, r.Fremote, file3)
|
||||
@ -185,7 +212,7 @@ func TestDeduplicateSmallest(t *testing.T) {
|
||||
file3 := r.WriteUncheckedObject(context.Background(), "one", "This is another one", t3)
|
||||
r.CheckWithDuplicates(t, file1, file2, file3)
|
||||
|
||||
err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateSmallest)
|
||||
err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateSmallest, false)
|
||||
require.NoError(t, err)
|
||||
|
||||
fstest.CheckItems(t, r.Fremote, file1)
|
||||
@ -202,7 +229,7 @@ func TestDeduplicateRename(t *testing.T) {
|
||||
file4 := r.WriteUncheckedObject(context.Background(), "one-1.txt", "This is not a duplicate", t1)
|
||||
r.CheckWithDuplicates(t, file1, file2, file3, file4)
|
||||
|
||||
err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateRename)
|
||||
err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateRename, false)
|
||||
require.NoError(t, err)
|
||||
|
||||
require.NoError(t, walk.ListR(context.Background(), r.Fremote, "", true, -1, walk.ListObjects, func(entries fs.DirEntries) error {
|
||||
|
Loading…
Reference in New Issue
Block a user