From d97fe3b82474bd1f3c612b02d01d46514e0ece0b Mon Sep 17 00:00:00 2001
From: Nick Craig-Wood
Date: Thu, 12 Apr 2018 17:17:11 +0100
Subject: [PATCH] fs/operations: make dedupe work with mega

* factor into its own files
* remove assumptions about having a given hash type
* make tests work if the remote has no hash
---
 fs/operations/dedupe.go          | 292 +++++++++++++++++++++++++++++++
 fs/operations/dedupe_test.go     | 199 +++++++++++++++++++++
 fs/operations/operations.go      | 273 -----------------------------
 fs/operations/operations_test.go | 165 -----------------
 4 files changed, 491 insertions(+), 438 deletions(-)
 create mode 100644 fs/operations/dedupe.go
 create mode 100644 fs/operations/dedupe_test.go

diff --git a/fs/operations/dedupe.go b/fs/operations/dedupe.go
new file mode 100644
index 000000000..1f49fbaa6
--- /dev/null
+++ b/fs/operations/dedupe.go
@@ -0,0 +1,292 @@
+// dedupe - gets rid of identical files on remotes which can have duplicate file names (drive, mega)
+
+package operations
+
+import (
+	"fmt"
+	"log"
+	"path"
+	"sort"
+	"strings"
+
+	"github.com/ncw/rclone/fs"
+	"github.com/ncw/rclone/fs/config"
+	"github.com/ncw/rclone/fs/hash"
+	"github.com/ncw/rclone/fs/walk"
+	"github.com/pkg/errors"
+	"github.com/spf13/pflag"
+)
+
+// dedupeRename renames the objs slice to different names
+func dedupeRename(remote string, objs []fs.Object) {
+	f := objs[0].Fs()
+	doMove := f.Features().Move
+	if doMove == nil {
+		log.Fatalf("Fs %v doesn't support Move", f)
+	}
+	ext := path.Ext(remote)
+	base := remote[:len(remote)-len(ext)]
+	for i, o := range objs {
+		newName := fmt.Sprintf("%s-%d%s", base, i+1, ext)
+		if !fs.Config.DryRun {
+			newObj, err := doMove(o, newName)
+			if err != nil {
+				fs.CountError(err)
+				fs.Errorf(o, "Failed to rename: %v", err)
+				continue
+			}
+			fs.Infof(newObj, "renamed from: %v", o)
+		} else {
+			fs.Logf(remote, "Not renaming to %q as --dry-run", newName)
+		}
+	}
+}
+
+// dedupeDeleteAllButOne deletes all the duplicates except the one at index keep
+func dedupeDeleteAllButOne(keep int, remote string, objs []fs.Object) {
+	for i, o := range objs {
+		if i == keep {
+			continue
+		}
+		_ = DeleteFile(o)
+	}
+	fs.Logf(remote, "Deleted %d extra copies", len(objs)-1)
+}
+
+// dedupeDeleteIdentical deletes all but one of identical (by hash) copies
+func dedupeDeleteIdentical(ht hash.Type, remote string, objs []fs.Object) (remainingObjs []fs.Object) {
+	// See how many of these duplicates are identical
+	byHash := make(map[string][]fs.Object, len(objs))
+	for _, o := range objs {
+		hashValue, err := o.Hash(ht)
+		if err != nil || hashValue == "" {
+			remainingObjs = append(remainingObjs, o)
+		} else {
+			byHash[hashValue] = append(byHash[hashValue], o)
+		}
+	}
+
+	// Delete identical duplicates, filling remainingObjs with the ones remaining
+	for hashValue, hashObjs := range byHash {
+		if len(hashObjs) > 1 {
+			fs.Logf(remote, "Deleting %d/%d identical duplicates (%v %q)", len(hashObjs)-1, len(hashObjs), ht, hashValue)
+			for _, o := range hashObjs[1:] {
+				_ = DeleteFile(o)
+			}
+		}
+		remainingObjs = append(remainingObjs, hashObjs[0])
+	}
+
+	return remainingObjs
+}
+
+// dedupeInteractive interactively dedupes the slice of objects
+func dedupeInteractive(ht hash.Type, remote string, objs []fs.Object) {
+	fmt.Printf("%s: %d duplicates remain\n", remote, len(objs))
+	for i, o := range objs {
+		hashValue, err := o.Hash(ht)
+		if err != nil {
+			hashValue = err.Error()
+		}
+		fmt.Printf("  %d: %12d bytes, %s, %v %32s\n", i+1, o.Size(), o.ModTime().Local().Format("2006-01-02 15:04:05.000000000"), ht, hashValue)
+	}
+	switch 
config.Command([]string{"sSkip and do nothing", "kKeep just one (choose which in next step)", "rRename all to be different (by changing file.jpg to file-1.jpg)"}) { + case 's': + case 'k': + keep := config.ChooseNumber("Enter the number of the file to keep", 1, len(objs)) + dedupeDeleteAllButOne(keep-1, remote, objs) + case 'r': + dedupeRename(remote, objs) + } +} + +type objectsSortedByModTime []fs.Object + +func (objs objectsSortedByModTime) Len() int { return len(objs) } +func (objs objectsSortedByModTime) Swap(i, j int) { objs[i], objs[j] = objs[j], objs[i] } +func (objs objectsSortedByModTime) Less(i, j int) bool { + return objs[i].ModTime().Before(objs[j].ModTime()) +} + +// DeduplicateMode is how the dedupe command chooses what to do +type DeduplicateMode int + +// Deduplicate modes +const ( + DeduplicateInteractive DeduplicateMode = iota // interactively ask the user + DeduplicateSkip // skip all conflicts + DeduplicateFirst // choose the first object + DeduplicateNewest // choose the newest object + DeduplicateOldest // choose the oldest object + DeduplicateRename // rename the objects +) + +func (x DeduplicateMode) String() string { + switch x { + case DeduplicateInteractive: + return "interactive" + case DeduplicateSkip: + return "skip" + case DeduplicateFirst: + return "first" + case DeduplicateNewest: + return "newest" + case DeduplicateOldest: + return "oldest" + case DeduplicateRename: + return "rename" + } + return "unknown" +} + +// Set a DeduplicateMode from a string +func (x *DeduplicateMode) Set(s string) error { + switch strings.ToLower(s) { + case "interactive": + *x = DeduplicateInteractive + case "skip": + *x = DeduplicateSkip + case "first": + *x = DeduplicateFirst + case "newest": + *x = DeduplicateNewest + case "oldest": + *x = DeduplicateOldest + case "rename": + *x = DeduplicateRename + default: + return errors.Errorf("Unknown mode for dedupe %q.", s) + } + return nil +} + +// Type of the value +func (x *DeduplicateMode) Type() string { + return "string" +} + +// Check it satisfies the interface +var _ pflag.Value = (*DeduplicateMode)(nil) + +// dedupeFindDuplicateDirs scans f for duplicate directories +func dedupeFindDuplicateDirs(f fs.Fs) ([][]fs.Directory, error) { + duplicateDirs := [][]fs.Directory{} + err := walk.Walk(f, "", true, fs.Config.MaxDepth, func(dirPath string, entries fs.DirEntries, err error) error { + if err != nil { + return err + } + dirs := map[string][]fs.Directory{} + entries.ForDir(func(d fs.Directory) { + dirs[d.Remote()] = append(dirs[d.Remote()], d) + }) + for _, ds := range dirs { + if len(ds) > 1 { + duplicateDirs = append(duplicateDirs, ds) + } + } + return nil + }) + if err != nil { + return nil, errors.Wrap(err, "find duplicate dirs") + } + return duplicateDirs, nil +} + +// dedupeMergeDuplicateDirs merges all the duplicate directories found +func dedupeMergeDuplicateDirs(f fs.Fs, duplicateDirs [][]fs.Directory) error { + mergeDirs := f.Features().MergeDirs + if mergeDirs == nil { + return errors.Errorf("%v: can't merge directories", f) + } + dirCacheFlush := f.Features().DirCacheFlush + if dirCacheFlush == nil { + return errors.Errorf("%v: can't flush dir cache", f) + } + for _, dirs := range duplicateDirs { + if !fs.Config.DryRun { + fs.Infof(dirs[0], "Merging contents of duplicate directories") + err := mergeDirs(dirs) + if err != nil { + return errors.Wrap(err, "merge duplicate dirs") + } + } else { + fs.Infof(dirs[0], "NOT Merging contents of duplicate directories as --dry-run") + } + } + dirCacheFlush() + return nil +} + 
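+// The DeduplicateMode methods above satisfy the pflag.Value interface
+// (Set, Type, String), so a mode can be bound straight to a command
+// line flag. A minimal sketch - the flag name here is illustrative and
+// the real wiring lives in the cmd package, outside this patch:
+//
+//	var mode = DeduplicateInteractive
+//	pflag.VarP(&mode, "dedupe-mode", "", "interactive|skip|first|newest|oldest|rename")
+//	pflag.Parse()
+//	// after parsing, --dedupe-mode=newest leaves mode set to DeduplicateNewest
+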
+// Deduplicate interactively finds duplicate files and offers to
+// delete all but one or rename them to be different. Only useful
+// with remotes which can have duplicate file names (e.g. Google
+// Drive, Mega).
+func Deduplicate(f fs.Fs, mode DeduplicateMode) error {
+	fs.Infof(f, "Looking for duplicates using %v mode.", mode)
+
+	// Find duplicate directories first and fix them - repeat
+	// until all fixed
+	for {
+		duplicateDirs, err := dedupeFindDuplicateDirs(f)
+		if err != nil {
+			return err
+		}
+		if len(duplicateDirs) == 0 {
+			break
+		}
+		err = dedupeMergeDuplicateDirs(f, duplicateDirs)
+		if err != nil {
+			return err
+		}
+		if fs.Config.DryRun {
+			break
+		}
+	}
+
+	// Find a hash to use - will be hash.None if the remote has no hashes
+	ht := f.Hashes().GetOne()
+
+	// Now find duplicate files
+	files := map[string][]fs.Object{}
+	err := walk.Walk(f, "", true, fs.Config.MaxDepth, func(dirPath string, entries fs.DirEntries, err error) error {
+		if err != nil {
+			return err
+		}
+		entries.ForObject(func(o fs.Object) {
+			remote := o.Remote()
+			files[remote] = append(files[remote], o)
+		})
+		return nil
+	})
+	if err != nil {
+		return err
+	}
+	for remote, objs := range files {
+		if len(objs) > 1 {
+			fs.Logf(remote, "Found %d duplicates - deleting identical copies", len(objs))
+			objs = dedupeDeleteIdentical(ht, remote, objs)
+			if len(objs) <= 1 {
+				fs.Logf(remote, "All duplicates removed")
+				continue
+			}
+			switch mode {
+			case DeduplicateInteractive:
+				dedupeInteractive(ht, remote, objs)
+			case DeduplicateFirst:
+				dedupeDeleteAllButOne(0, remote, objs)
+			case DeduplicateNewest:
+				sort.Sort(objectsSortedByModTime(objs)) // sort oldest first
+				dedupeDeleteAllButOne(len(objs)-1, remote, objs)
+			case DeduplicateOldest:
+				sort.Sort(objectsSortedByModTime(objs)) // sort oldest first
+				dedupeDeleteAllButOne(0, remote, objs)
+			case DeduplicateRename:
+				dedupeRename(remote, objs)
+			case DeduplicateSkip:
+				// skip
+			default:
+				// skip
+			}
+		}
+	}
+	return nil
+}
diff --git a/fs/operations/dedupe_test.go b/fs/operations/dedupe_test.go
new file mode 100644
index 000000000..a9b5c7ba6
--- /dev/null
+++ b/fs/operations/dedupe_test.go
@@ -0,0 +1,199 @@
+package operations_test
+
+import (
+	"testing"
+	"time"
+
+	"github.com/ncw/rclone/fs"
+	"github.com/ncw/rclone/fs/hash"
+	"github.com/ncw/rclone/fs/operations"
+	"github.com/ncw/rclone/fs/walk"
+	"github.com/ncw/rclone/fstest"
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+)
+
+func skipIfCantDedupe(t *testing.T, f fs.Fs) {
+	if !f.Features().DuplicateFiles {
+		t.Skip("Can't test deduplicate - no duplicate files possible")
+	}
+	if f.Features().PutUnchecked == nil {
+		t.Skip("Can't test deduplicate - no PutUnchecked")
+	}
+	if f.Features().MergeDirs == nil {
+		t.Skip("Can't test deduplicate - no MergeDirs")
+	}
+}
+
+func skipIfNoHash(t *testing.T, f fs.Fs) {
+	if f.Hashes().GetOne() == hash.None {
+		t.Skip("Can't run this test without a hash")
+	}
+}
+
+func TestDeduplicateInteractive(t *testing.T) {
+	r := fstest.NewRun(t)
+	defer r.Finalise()
+	skipIfCantDedupe(t, r.Fremote)
+	skipIfNoHash(t, r.Fremote)
+
+	file1 := r.WriteUncheckedObject("one", "This is one", t1)
+	file2 := r.WriteUncheckedObject("one", "This is one", t1)
+	file3 := r.WriteUncheckedObject("one", "This is one", t1)
+	r.CheckWithDuplicates(t, file1, file2, file3)
+
+	err := operations.Deduplicate(r.Fremote, operations.DeduplicateInteractive)
+	require.NoError(t, err)
+
+	fstest.CheckItems(t, r.Fremote, file1)
+}
+
+func TestDeduplicateSkip(t *testing.T) {
+	r := fstest.NewRun(t)
+	defer r.Finalise()
+	skipIfCantDedupe(t, 
r.Fremote) + haveHash := r.Fremote.Hashes().GetOne() != hash.None + + file1 := r.WriteUncheckedObject("one", "This is one", t1) + files := []fstest.Item{file1} + if haveHash { + file2 := r.WriteUncheckedObject("one", "This is one", t1) + files = append(files, file2) + } + file3 := r.WriteUncheckedObject("one", "This is another one", t1) + files = append(files, file3) + r.CheckWithDuplicates(t, files...) + + err := operations.Deduplicate(r.Fremote, operations.DeduplicateSkip) + require.NoError(t, err) + + r.CheckWithDuplicates(t, file1, file3) +} + +func TestDeduplicateFirst(t *testing.T) { + r := fstest.NewRun(t) + defer r.Finalise() + skipIfCantDedupe(t, r.Fremote) + + file1 := r.WriteUncheckedObject("one", "This is one", t1) + file2 := r.WriteUncheckedObject("one", "This is one A", t1) + file3 := r.WriteUncheckedObject("one", "This is one BB", t1) + r.CheckWithDuplicates(t, file1, file2, file3) + + err := operations.Deduplicate(r.Fremote, operations.DeduplicateFirst) + require.NoError(t, err) + + // list until we get one object + var objects, size int64 + for try := 1; try <= *fstest.ListRetries; try++ { + objects, size, err = operations.Count(r.Fremote) + require.NoError(t, err) + if objects == 1 { + break + } + time.Sleep(time.Second) + } + assert.Equal(t, int64(1), objects) + if size != file1.Size && size != file2.Size && size != file3.Size { + t.Errorf("Size not one of the object sizes %d", size) + } +} + +func TestDeduplicateNewest(t *testing.T) { + r := fstest.NewRun(t) + defer r.Finalise() + skipIfCantDedupe(t, r.Fremote) + + file1 := r.WriteUncheckedObject("one", "This is one", t1) + file2 := r.WriteUncheckedObject("one", "This is one too", t2) + file3 := r.WriteUncheckedObject("one", "This is another one", t3) + r.CheckWithDuplicates(t, file1, file2, file3) + + err := operations.Deduplicate(r.Fremote, operations.DeduplicateNewest) + require.NoError(t, err) + + fstest.CheckItems(t, r.Fremote, file3) +} + +func TestDeduplicateOldest(t *testing.T) { + r := fstest.NewRun(t) + defer r.Finalise() + skipIfCantDedupe(t, r.Fremote) + + file1 := r.WriteUncheckedObject("one", "This is one", t1) + file2 := r.WriteUncheckedObject("one", "This is one too", t2) + file3 := r.WriteUncheckedObject("one", "This is another one", t3) + r.CheckWithDuplicates(t, file1, file2, file3) + + err := operations.Deduplicate(r.Fremote, operations.DeduplicateOldest) + require.NoError(t, err) + + fstest.CheckItems(t, r.Fremote, file1) +} + +func TestDeduplicateRename(t *testing.T) { + r := fstest.NewRun(t) + defer r.Finalise() + skipIfCantDedupe(t, r.Fremote) + + file1 := r.WriteUncheckedObject("one.txt", "This is one", t1) + file2 := r.WriteUncheckedObject("one.txt", "This is one too", t2) + file3 := r.WriteUncheckedObject("one.txt", "This is another one", t3) + r.CheckWithDuplicates(t, file1, file2, file3) + + err := operations.Deduplicate(r.Fremote, operations.DeduplicateRename) + require.NoError(t, err) + + require.NoError(t, walk.Walk(r.Fremote, "", true, -1, func(dirPath string, entries fs.DirEntries, err error) error { + if err != nil { + return err + } + entries.ForObject(func(o fs.Object) { + remote := o.Remote() + if remote != "one-1.txt" && + remote != "one-2.txt" && + remote != "one-3.txt" { + t.Errorf("Bad file name after rename %q", remote) + } + size := o.Size() + if size != file1.Size && size != file2.Size && size != file3.Size { + t.Errorf("Size not one of the object sizes %d", size) + } + }) + return nil + })) +} + +// This should really be a unit test, but the test framework there +// doesn't 
have enough tools to make it easy +func TestMergeDirs(t *testing.T) { + r := fstest.NewRun(t) + defer r.Finalise() + + mergeDirs := r.Fremote.Features().MergeDirs + if mergeDirs == nil { + t.Skip("Can't merge directories") + } + + file1 := r.WriteObject("dupe1/one.txt", "This is one", t1) + file2 := r.WriteObject("dupe2/two.txt", "This is one too", t2) + file3 := r.WriteObject("dupe3/three.txt", "This is another one", t3) + + objs, dirs, err := walk.GetAll(r.Fremote, "", true, 1) + require.NoError(t, err) + assert.Equal(t, 3, len(dirs)) + assert.Equal(t, 0, len(objs)) + + err = mergeDirs(dirs) + require.NoError(t, err) + + file2.Path = "dupe1/two.txt" + file3.Path = "dupe1/three.txt" + fstest.CheckItems(t, r.Fremote, file1, file2, file3) + + objs, dirs, err = walk.GetAll(r.Fremote, "", true, 1) + require.NoError(t, err) + assert.Equal(t, 1, len(dirs)) + assert.Equal(t, 0, len(objs)) + assert.Equal(t, "dupe1", dirs[0].Remote()) +} diff --git a/fs/operations/operations.go b/fs/operations/operations.go index 06a6c55da..6882e06c9 100644 --- a/fs/operations/operations.go +++ b/fs/operations/operations.go @@ -7,7 +7,6 @@ import ( "fmt" "io" "io/ioutil" - "log" "path" "sort" "strconv" @@ -18,7 +17,6 @@ import ( "github.com/ncw/rclone/fs" "github.com/ncw/rclone/fs/accounting" - "github.com/ncw/rclone/fs/config" "github.com/ncw/rclone/fs/fserrors" "github.com/ncw/rclone/fs/hash" "github.com/ncw/rclone/fs/march" @@ -26,7 +24,6 @@ import ( "github.com/ncw/rclone/fs/walk" "github.com/ncw/rclone/lib/readers" "github.com/pkg/errors" - "github.com/spf13/pflag" ) // CheckHashes checks the two files to see if they have common @@ -1010,276 +1007,6 @@ func Delete(f fs.Fs) error { return err } -// dedupeRename renames the objs slice to different names -func dedupeRename(remote string, objs []fs.Object) { - f := objs[0].Fs() - doMove := f.Features().Move - if doMove == nil { - log.Fatalf("Fs %v doesn't support Move", f) - } - ext := path.Ext(remote) - base := remote[:len(remote)-len(ext)] - for i, o := range objs { - newName := fmt.Sprintf("%s-%d%s", base, i+1, ext) - if !fs.Config.DryRun { - newObj, err := doMove(o, newName) - if err != nil { - fs.CountError(err) - fs.Errorf(o, "Failed to rename: %v", err) - continue - } - fs.Infof(newObj, "renamed from: %v", o) - } else { - fs.Logf(remote, "Not renaming to %q as --dry-run", newName) - } - } -} - -// dedupeDeleteAllButOne deletes all but the one in keep -func dedupeDeleteAllButOne(keep int, remote string, objs []fs.Object) { - for i, o := range objs { - if i == keep { - continue - } - _ = DeleteFile(o) - } - fs.Logf(remote, "Deleted %d extra copies", len(objs)-1) -} - -// dedupeDeleteIdentical deletes all but one of identical (by hash) copies -func dedupeDeleteIdentical(remote string, objs []fs.Object) []fs.Object { - // See how many of these duplicates are identical - byHash := make(map[string][]fs.Object, len(objs)) - for _, o := range objs { - md5sum, err := o.Hash(hash.MD5) - if err == nil { - byHash[md5sum] = append(byHash[md5sum], o) - } - } - - // Delete identical duplicates, refilling obj with the ones remaining - objs = nil - for md5sum, hashObjs := range byHash { - if len(hashObjs) > 1 { - fs.Logf(remote, "Deleting %d/%d identical duplicates (md5sum %q)", len(hashObjs)-1, len(hashObjs), md5sum) - for _, o := range hashObjs[1:] { - _ = DeleteFile(o) - } - } - objs = append(objs, hashObjs[0]) - } - - return objs -} - -// dedupeInteractive interactively dedupes the slice of objects -func dedupeInteractive(remote string, objs []fs.Object) { - 
fmt.Printf("%s: %d duplicates remain\n", remote, len(objs)) - for i, o := range objs { - md5sum, err := o.Hash(hash.MD5) - if err != nil { - md5sum = err.Error() - } - fmt.Printf(" %d: %12d bytes, %s, md5sum %32s\n", i+1, o.Size(), o.ModTime().Local().Format("2006-01-02 15:04:05.000000000"), md5sum) - } - switch config.Command([]string{"sSkip and do nothing", "kKeep just one (choose which in next step)", "rRename all to be different (by changing file.jpg to file-1.jpg)"}) { - case 's': - case 'k': - keep := config.ChooseNumber("Enter the number of the file to keep", 1, len(objs)) - dedupeDeleteAllButOne(keep-1, remote, objs) - case 'r': - dedupeRename(remote, objs) - } -} - -type objectsSortedByModTime []fs.Object - -func (objs objectsSortedByModTime) Len() int { return len(objs) } -func (objs objectsSortedByModTime) Swap(i, j int) { objs[i], objs[j] = objs[j], objs[i] } -func (objs objectsSortedByModTime) Less(i, j int) bool { - return objs[i].ModTime().Before(objs[j].ModTime()) -} - -// DeduplicateMode is how the dedupe command chooses what to do -type DeduplicateMode int - -// Deduplicate modes -const ( - DeduplicateInteractive DeduplicateMode = iota // interactively ask the user - DeduplicateSkip // skip all conflicts - DeduplicateFirst // choose the first object - DeduplicateNewest // choose the newest object - DeduplicateOldest // choose the oldest object - DeduplicateRename // rename the objects -) - -func (x DeduplicateMode) String() string { - switch x { - case DeduplicateInteractive: - return "interactive" - case DeduplicateSkip: - return "skip" - case DeduplicateFirst: - return "first" - case DeduplicateNewest: - return "newest" - case DeduplicateOldest: - return "oldest" - case DeduplicateRename: - return "rename" - } - return "unknown" -} - -// Set a DeduplicateMode from a string -func (x *DeduplicateMode) Set(s string) error { - switch strings.ToLower(s) { - case "interactive": - *x = DeduplicateInteractive - case "skip": - *x = DeduplicateSkip - case "first": - *x = DeduplicateFirst - case "newest": - *x = DeduplicateNewest - case "oldest": - *x = DeduplicateOldest - case "rename": - *x = DeduplicateRename - default: - return errors.Errorf("Unknown mode for dedupe %q.", s) - } - return nil -} - -// Type of the value -func (x *DeduplicateMode) Type() string { - return "string" -} - -// Check it satisfies the interface -var _ pflag.Value = (*DeduplicateMode)(nil) - -// dedupeFindDuplicateDirs scans f for duplicate directories -func dedupeFindDuplicateDirs(f fs.Fs) ([][]fs.Directory, error) { - duplicateDirs := [][]fs.Directory{} - err := walk.Walk(f, "", true, fs.Config.MaxDepth, func(dirPath string, entries fs.DirEntries, err error) error { - if err != nil { - return err - } - dirs := map[string][]fs.Directory{} - entries.ForDir(func(d fs.Directory) { - dirs[d.Remote()] = append(dirs[d.Remote()], d) - }) - for _, ds := range dirs { - if len(ds) > 1 { - duplicateDirs = append(duplicateDirs, ds) - } - } - return nil - }) - if err != nil { - return nil, errors.Wrap(err, "find duplicate dirs") - } - return duplicateDirs, nil -} - -// dedupeMergeDuplicateDirs merges all the duplicate directories found -func dedupeMergeDuplicateDirs(f fs.Fs, duplicateDirs [][]fs.Directory) error { - mergeDirs := f.Features().MergeDirs - if mergeDirs == nil { - return errors.Errorf("%v: can't merge directories", f) - } - dirCacheFlush := f.Features().DirCacheFlush - if dirCacheFlush == nil { - return errors.Errorf("%v: can't flush dir cache", f) - } - for _, dirs := range duplicateDirs { - if 
!fs.Config.DryRun { - fs.Infof(dirs[0], "Merging contents of duplicate directories") - err := mergeDirs(dirs) - if err != nil { - return errors.Wrap(err, "merge duplicate dirs") - } - } else { - fs.Infof(dirs[0], "NOT Merging contents of duplicate directories as --dry-run") - } - } - dirCacheFlush() - return nil -} - -// Deduplicate interactively finds duplicate files and offers to -// delete all but one or rename them to be different. Only useful with -// Google Drive which can have duplicate file names. -func Deduplicate(f fs.Fs, mode DeduplicateMode) error { - fs.Infof(f, "Looking for duplicates using %v mode.", mode) - - // Find duplicate directories first and fix them - repeat - // until all fixed - for { - duplicateDirs, err := dedupeFindDuplicateDirs(f) - if err != nil { - return err - } - if len(duplicateDirs) == 0 { - break - } - err = dedupeMergeDuplicateDirs(f, duplicateDirs) - if err != nil { - return err - } - if fs.Config.DryRun { - break - } - } - - // Now find duplicate files - files := map[string][]fs.Object{} - err := walk.Walk(f, "", true, fs.Config.MaxDepth, func(dirPath string, entries fs.DirEntries, err error) error { - if err != nil { - return err - } - entries.ForObject(func(o fs.Object) { - remote := o.Remote() - files[remote] = append(files[remote], o) - }) - return nil - }) - if err != nil { - return err - } - for remote, objs := range files { - if len(objs) > 1 { - fs.Logf(remote, "Found %d duplicates - deleting identical copies", len(objs)) - objs = dedupeDeleteIdentical(remote, objs) - if len(objs) <= 1 { - fs.Logf(remote, "All duplicates removed") - continue - } - switch mode { - case DeduplicateInteractive: - dedupeInteractive(remote, objs) - case DeduplicateFirst: - dedupeDeleteAllButOne(0, remote, objs) - case DeduplicateNewest: - sort.Sort(objectsSortedByModTime(objs)) // sort oldest first - dedupeDeleteAllButOne(len(objs)-1, remote, objs) - case DeduplicateOldest: - sort.Sort(objectsSortedByModTime(objs)) // sort oldest first - dedupeDeleteAllButOne(0, remote, objs) - case DeduplicateRename: - dedupeRename(remote, objs) - case DeduplicateSkip: - // skip - default: - //skip - } - } - } - return nil -} - // listToChan will transfer all objects in the listing to the output // // If an error occurs, the error will be logged, and it will close the diff --git a/fs/operations/operations_test.go b/fs/operations/operations_test.go index 6ec8e8c6f..aa294663d 100644 --- a/fs/operations/operations_test.go +++ b/fs/operations/operations_test.go @@ -37,7 +37,6 @@ import ( "github.com/ncw/rclone/fs/hash" "github.com/ncw/rclone/fs/list" "github.com/ncw/rclone/fs/operations" - "github.com/ncw/rclone/fs/walk" "github.com/ncw/rclone/fstest" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -301,170 +300,6 @@ func TestCheckSizeOnly(t *testing.T) { TestCheck(t) } -func skipIfCantDedupe(t *testing.T, f fs.Fs) { - if f.Features().PutUnchecked == nil { - t.Skip("Can't test deduplicate - no PutUnchecked") - } - if !f.Features().DuplicateFiles { - t.Skip("Can't test deduplicate - no duplicate files possible") - } - if !f.Hashes().Contains(hash.MD5) { - t.Skip("Can't test deduplicate - MD5 not supported") - } -} - -func TestDeduplicateInteractive(t *testing.T) { - r := fstest.NewRun(t) - defer r.Finalise() - skipIfCantDedupe(t, r.Fremote) - - file1 := r.WriteUncheckedObject("one", "This is one", t1) - file2 := r.WriteUncheckedObject("one", "This is one", t1) - file3 := r.WriteUncheckedObject("one", "This is one", t1) - r.CheckWithDuplicates(t, file1, 
file2, file3) - - err := operations.Deduplicate(r.Fremote, operations.DeduplicateInteractive) - require.NoError(t, err) - - fstest.CheckItems(t, r.Fremote, file1) -} - -func TestDeduplicateSkip(t *testing.T) { - r := fstest.NewRun(t) - defer r.Finalise() - skipIfCantDedupe(t, r.Fremote) - - file1 := r.WriteUncheckedObject("one", "This is one", t1) - file2 := r.WriteUncheckedObject("one", "This is one", t1) - file3 := r.WriteUncheckedObject("one", "This is another one", t1) - r.CheckWithDuplicates(t, file1, file2, file3) - - err := operations.Deduplicate(r.Fremote, operations.DeduplicateSkip) - require.NoError(t, err) - - r.CheckWithDuplicates(t, file1, file3) -} - -func TestDeduplicateFirst(t *testing.T) { - r := fstest.NewRun(t) - defer r.Finalise() - skipIfCantDedupe(t, r.Fremote) - - file1 := r.WriteUncheckedObject("one", "This is one", t1) - file2 := r.WriteUncheckedObject("one", "This is one A", t1) - file3 := r.WriteUncheckedObject("one", "This is one BB", t1) - r.CheckWithDuplicates(t, file1, file2, file3) - - err := operations.Deduplicate(r.Fremote, operations.DeduplicateFirst) - require.NoError(t, err) - - objects, size, err := operations.Count(r.Fremote) - require.NoError(t, err) - assert.Equal(t, int64(1), objects) - if size != file1.Size && size != file2.Size && size != file3.Size { - t.Errorf("Size not one of the object sizes %d", size) - } -} - -func TestDeduplicateNewest(t *testing.T) { - r := fstest.NewRun(t) - defer r.Finalise() - skipIfCantDedupe(t, r.Fremote) - - file1 := r.WriteUncheckedObject("one", "This is one", t1) - file2 := r.WriteUncheckedObject("one", "This is one too", t2) - file3 := r.WriteUncheckedObject("one", "This is another one", t3) - r.CheckWithDuplicates(t, file1, file2, file3) - - err := operations.Deduplicate(r.Fremote, operations.DeduplicateNewest) - require.NoError(t, err) - - fstest.CheckItems(t, r.Fremote, file3) -} - -func TestDeduplicateOldest(t *testing.T) { - r := fstest.NewRun(t) - defer r.Finalise() - skipIfCantDedupe(t, r.Fremote) - - file1 := r.WriteUncheckedObject("one", "This is one", t1) - file2 := r.WriteUncheckedObject("one", "This is one too", t2) - file3 := r.WriteUncheckedObject("one", "This is another one", t3) - r.CheckWithDuplicates(t, file1, file2, file3) - - err := operations.Deduplicate(r.Fremote, operations.DeduplicateOldest) - require.NoError(t, err) - - fstest.CheckItems(t, r.Fremote, file1) -} - -func TestDeduplicateRename(t *testing.T) { - r := fstest.NewRun(t) - defer r.Finalise() - skipIfCantDedupe(t, r.Fremote) - - file1 := r.WriteUncheckedObject("one.txt", "This is one", t1) - file2 := r.WriteUncheckedObject("one.txt", "This is one too", t2) - file3 := r.WriteUncheckedObject("one.txt", "This is another one", t3) - r.CheckWithDuplicates(t, file1, file2, file3) - - err := operations.Deduplicate(r.Fremote, operations.DeduplicateRename) - require.NoError(t, err) - - require.NoError(t, walk.Walk(r.Fremote, "", true, -1, func(dirPath string, entries fs.DirEntries, err error) error { - if err != nil { - return err - } - entries.ForObject(func(o fs.Object) { - remote := o.Remote() - if remote != "one-1.txt" && - remote != "one-2.txt" && - remote != "one-3.txt" { - t.Errorf("Bad file name after rename %q", remote) - } - size := o.Size() - if size != file1.Size && size != file2.Size && size != file3.Size { - t.Errorf("Size not one of the object sizes %d", size) - } - }) - return nil - })) -} - -// This should really be a unit test, but the test framework there -// doesn't have enough tools to make it easy -func 
TestMergeDirs(t *testing.T) { - r := fstest.NewRun(t) - defer r.Finalise() - - mergeDirs := r.Fremote.Features().MergeDirs - if mergeDirs == nil { - t.Skip("Can't merge directories") - } - - file1 := r.WriteObject("dupe1/one.txt", "This is one", t1) - file2 := r.WriteObject("dupe2/two.txt", "This is one too", t2) - file3 := r.WriteObject("dupe3/three.txt", "This is another one", t3) - - objs, dirs, err := walk.GetAll(r.Fremote, "", true, 1) - require.NoError(t, err) - assert.Equal(t, 3, len(dirs)) - assert.Equal(t, 0, len(objs)) - - err = mergeDirs(dirs) - require.NoError(t, err) - - file2.Path = "dupe1/two.txt" - file3.Path = "dupe1/three.txt" - fstest.CheckItems(t, r.Fremote, file1, file2, file3) - - objs, dirs, err = walk.GetAll(r.Fremote, "", true, 1) - require.NoError(t, err) - assert.Equal(t, 1, len(dirs)) - assert.Equal(t, 0, len(objs)) - assert.Equal(t, "dupe1", dirs[0].Remote()) -} - func TestCat(t *testing.T) { r := fstest.NewRun(t) defer r.Finalise()
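
For reference, a minimal sketch of driving the refactored entry point directly
from Go. The "remote:path" argument and the choice of DeduplicateNewest are
illustrative; the real command line wiring lives in cmd/dedupe, which is not
part of this diff:

	package main

	import (
		"log"

		"github.com/ncw/rclone/fs"
		"github.com/ncw/rclone/fs/operations"
	)

	func main() {
		// Open the remote - any remote which can hold duplicate file
		// names (e.g. drive:, mega:) is a sensible target
		f, err := fs.NewFs("remote:path")
		if err != nil {
			log.Fatal(err)
		}
		// Keep the newest of each set of duplicates
		if err := operations.Deduplicate(f, operations.DeduplicateNewest); err != nil {
			log.Fatal(err)
		}
	}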