dedupe: Make it obey the --size-only flag for duplicate detection #4321

Nick Craig-Wood 2020-06-16 12:39:26 +01:00
parent d5f4c74697
commit 62f0bbb598
3 changed files with 77 additions and 31 deletions

cmd/dedupe/dedupe.go

@@ -22,19 +22,32 @@ func init() {
 var commandDefinition = &cobra.Command{
 	Use:   "dedupe [mode] remote:path",
-	Short: `Interactively find duplicate files and delete/rename them.`,
+	Short: `Interactively find duplicate filenames and delete/rename them.`,
 	Long: `
-By default ` + "`" + `dedupe` + "`" + ` interactively finds duplicate files and offers to
-delete all but one or rename them to be different. Only useful with
-Google Drive which can have duplicate file names.
+By default ` + "`dedupe`" + ` interactively finds files with duplicate
+names and offers to delete all but one or rename them to be
+different.
+
+This is only useful with backends like Google Drive which can have
+duplicate file names. It can be run on wrapping backends (eg crypt) if
+they wrap a backend which supports duplicate file names.
 
 In the first pass it will merge directories with the same name. It
-will do this iteratively until all the identical directories have been
-merged.
+will do this iteratively until all the identically named directories
+have been merged.
 
-The ` + "`" + `dedupe` + "`" + ` command will delete all but one of any identical (same
-md5sum) files it finds without confirmation. This means that for most
-duplicated files the ` + "`" + `dedupe` + "`" + ` command will not be interactive.
+In the second pass, for every group of duplicate file names, it will
+delete all but one of any identical files it finds without confirmation.
+This means that for most duplicated files the ` + "`dedupe`" + `
+command will not be interactive.
+
+` + "`dedupe`" + ` considers files to be identical if they have the
+same hash. If the backend does not support hashes (eg crypt wrapping
+Google Drive) then they will never be found to be identical. If you
+use the ` + "`--size-only`" + ` flag then files will be considered
+identical if they have the same size (any hash will be ignored). This
+can be useful on crypt backends which do not support hashes.
 
 **Important**: Since this can cause data loss, test first with the
 ` + "`--dry-run` or the `--interactive`/`-i`" + ` flag.
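The identity rule this new help text describes is small enough to sketch standalone. The snippet below is illustrative only and not part of the commit (the real logic is in fs/operations/dedupe.go further down); `object` and `identityKey` are hypothetical stand-ins for rclone's `fs.Object` and the ID computation in `dedupeDeleteIdentical`.

```go
package main

import "fmt"

// object is a hypothetical stand-in for rclone's fs.Object.
type object struct {
	size int64  // -1 when the backend does not know the size
	hash string // "" when the backend has no hash (eg crypt)
}

// identityKey mirrors the rule in the help text: with --size-only the
// key is the size alone; otherwise it is the hash; files with neither
// get an empty key and are never treated as identical.
func identityKey(o object, sizeOnly bool) string {
	if sizeOnly && o.size >= 0 {
		return fmt.Sprintf("size %d", o.size)
	}
	if o.hash != "" {
		return fmt.Sprintf("MD5 %s", o.hash)
	}
	return ""
}

func main() {
	onCrypt := object{size: 11, hash: ""} // eg a file on a crypt remote
	fmt.Printf("default:     %q\n", identityKey(onCrypt, false)) // "" - never auto-deleted
	fmt.Printf("--size-only: %q\n", identityKey(onCrypt, true))  // "size 11" - comparable by size
}
```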
@@ -52,26 +65,26 @@ Before - with duplicates
      1744073 2016-03-05 16:22:38.104000000 two.txt
       564374 2016-03-05 16:22:52.118000000 two.txt
 
-Now the ` + "`" + `dedupe` + "`" + ` session
+Now the ` + "`dedupe`" + ` session
 
     $ rclone dedupe drive:dupes
     2016/03/05 16:24:37 Google drive root 'dupes': Looking for duplicates using interactive mode.
-    one.txt: Found 4 duplicates - deleting identical copies
-    one.txt: Deleting 2/3 identical duplicates (md5sum "1eedaa9fe86fd4b8632e2ac549403b36")
+    one.txt: Found 4 files with duplicate names
+    one.txt: Deleting 2/3 identical duplicates (MD5 "1eedaa9fe86fd4b8632e2ac549403b36")
     one.txt: 2 duplicates remain
-      1: 6048320 bytes, 2016-03-05 16:23:16.798000000, md5sum 1eedaa9fe86fd4b8632e2ac549403b36
-      2: 564374 bytes, 2016-03-05 16:23:06.731000000, md5sum 7594e7dc9fc28f727c42ee3e0749de81
+      1: 6048320 bytes, 2016-03-05 16:23:16.798000000, MD5 1eedaa9fe86fd4b8632e2ac549403b36
+      2: 564374 bytes, 2016-03-05 16:23:06.731000000, MD5 7594e7dc9fc28f727c42ee3e0749de81
     s) Skip and do nothing
     k) Keep just one (choose which in next step)
     r) Rename all to be different (by changing file.jpg to file-1.jpg)
     s/k/r> k
     Enter the number of the file to keep> 1
     one.txt: Deleted 1 extra copies
-    two.txt: Found 3 duplicates - deleting identical copies
+    two.txt: Found 3 files with duplicate names
     two.txt: 3 duplicates remain
-      1: 564374 bytes, 2016-03-05 16:22:52.118000000, md5sum 7594e7dc9fc28f727c42ee3e0749de81
-      2: 6048320 bytes, 2016-03-05 16:22:46.185000000, md5sum 1eedaa9fe86fd4b8632e2ac549403b36
-      3: 1744073 bytes, 2016-03-05 16:22:38.104000000, md5sum 851957f7fb6f0bc4ce76be966d336802
+      1: 564374 bytes, 2016-03-05 16:22:52.118000000, MD5 7594e7dc9fc28f727c42ee3e0749de81
+      2: 6048320 bytes, 2016-03-05 16:22:46.185000000, MD5 1eedaa9fe86fd4b8632e2ac549403b36
+      3: 1744073 bytes, 2016-03-05 16:22:38.104000000, MD5 851957f7fb6f0bc4ce76be966d336802
     s) Skip and do nothing
     k) Keep just one (choose which in next step)
     r) Rename all to be different (by changing file.jpg to file-1.jpg)

fs/operations/dedupe.go

@@ -101,22 +101,30 @@ func dedupeDeleteIdentical(ctx context.Context, ht hash.Type, remote string, obj
 	objs = newObjs
 
 	// See how many of these duplicates are identical
-	byHash := make(map[string][]fs.Object, len(objs))
+	dupesByID := make(map[string][]fs.Object, len(objs))
 	for _, o := range objs {
-		md5sum, err := o.Hash(ctx, ht)
-		if err != nil || md5sum == "" {
+		ID := ""
+		if fs.Config.SizeOnly && o.Size() >= 0 {
+			ID = fmt.Sprintf("size %d", o.Size())
+		} else if ht != hash.None {
+			hashValue, err := o.Hash(ctx, ht)
+			if err == nil && hashValue != "" {
+				ID = fmt.Sprintf("%v %s", ht, hashValue)
+			}
+		}
+		if ID == "" {
 			remainingObjs = append(remainingObjs, o)
 		} else {
-			byHash[md5sum] = append(byHash[md5sum], o)
+			dupesByID[ID] = append(dupesByID[ID], o)
 		}
 	}
 
 	// Delete identical duplicates, filling remainingObjs with the ones remaining
-	for md5sum, hashObjs := range byHash {
-		remainingObjs = append(remainingObjs, hashObjs[0])
-		if len(hashObjs) > 1 {
-			fs.Logf(remote, "Deleting %d/%d identical duplicates (%v %q)", len(hashObjs)-1, len(hashObjs), ht, md5sum)
-			for _, o := range hashObjs[1:] {
+	for ID, dupes := range dupesByID {
+		remainingObjs = append(remainingObjs, dupes[0])
+		if len(dupes) > 1 {
+			fs.Logf(remote, "Deleting %d/%d identical duplicates (%s)", len(dupes)-1, len(dupes), ID)
+			for _, o := range dupes[1:] {
 				err := DeleteFile(ctx, o)
 				if err != nil {
 					remainingObjs = append(remainingObjs, o)
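The hunk above generalises the old hash-keyed map into an ID-keyed one: objects that yield no ID go straight to `remainingObjs` (so they are never auto-deleted), and each group keeps its first member. Below is a minimal self-contained sketch of that keep-one-per-group pattern, using plain strings rather than `fs.Object`; all names here are illustrative, not rclone API.

```go
package main

import "fmt"

func main() {
	// Precomputed identity per file: "" means the file could not be
	// identified (no usable hash and unknown size), like ID == "" above.
	files := []struct{ name, id string }{
		{"one (1)", "size 11"},
		{"one (2)", "size 11"},
		{"one (3)", "size 19"},
		{"one (4)", ""},
	}

	groups := make(map[string][]string)
	var remaining []string
	for _, f := range files {
		if f.id == "" {
			remaining = append(remaining, f.name) // never auto-deleted
		} else {
			groups[f.id] = append(groups[f.id], f.name)
		}
	}

	for id, dupes := range groups {
		remaining = append(remaining, dupes[0]) // keep one per group
		if len(dupes) > 1 {
			fmt.Printf("would delete %d/%d identical duplicates (%s)\n",
				len(dupes)-1, len(dupes), id)
		}
	}
	fmt.Println("remaining:", remaining)
}
```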
@@ -132,11 +140,15 @@ func dedupeDeleteIdentical(ctx context.Context, ht hash.Type, remote string, obj
 func dedupeInteractive(ctx context.Context, f fs.Fs, ht hash.Type, remote string, objs []fs.Object) {
 	fmt.Printf("%s: %d duplicates remain\n", remote, len(objs))
 	for i, o := range objs {
-		md5sum, err := o.Hash(ctx, ht)
-		if err != nil {
-			md5sum = err.Error()
+		hashValue := ""
+		if ht != hash.None {
+			var err error
+			hashValue, err = o.Hash(ctx, ht)
+			if err != nil {
+				hashValue = err.Error()
+			}
 		}
-		fmt.Printf("  %d: %12d bytes, %s, %v %32s\n", i+1, o.Size(), o.ModTime(ctx).Local().Format("2006-01-02 15:04:05.000000000"), ht, md5sum)
+		fmt.Printf("  %d: %12d bytes, %s, %v %32s\n", i+1, o.Size(), o.ModTime(ctx).Local().Format("2006-01-02 15:04:05.000000000"), ht, hashValue)
 	}
 	switch config.Command([]string{"sSkip and do nothing", "kKeep just one (choose which in next step)", "rRename all to be different (by changing file.jpg to file-1.jpg)"}) {
 	case 's':
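A side note on the `fmt.Printf` verbs in `dedupeInteractive`: `%12d` right-aligns the size and `%32s` right-aligns the hash (an MD5 digest is exactly 32 hex characters), which is what keeps the interactive listing in columns. A quick standalone illustration:

```go
package main

import (
	"fmt"
	"time"
)

func main() {
	modTime := time.Date(2016, time.March, 5, 16, 23, 16, 798000000, time.UTC)
	// Same verbs as dedupeInteractive: %12d pads the size and %32s pads
	// the hash, so rows with different sizes still line up.
	fmt.Printf("  %d: %12d bytes, %s, %v %32s\n",
		1, int64(6048320),
		modTime.Format("2006-01-02 15:04:05.000000000"),
		"MD5", "1eedaa9fe86fd4b8632e2ac549403b36")
}
```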

fs/operations/dedupe_test.go

@@ -75,6 +75,27 @@ func TestDeduplicateSkip(t *testing.T) {
 	r.CheckWithDuplicates(t, file1, file3)
 }
 
+func TestDeduplicateSizeOnly(t *testing.T) {
+	r := fstest.NewRun(t)
+	defer r.Finalise()
+	skipIfCantDedupe(t, r.Fremote)
+
+	file1 := r.WriteUncheckedObject(context.Background(), "one", "This is one", t1)
+	file2 := r.WriteUncheckedObject(context.Background(), "one", "THIS IS ONE", t1)
+	file3 := r.WriteUncheckedObject(context.Background(), "one", "This is another one", t1)
+	r.CheckWithDuplicates(t, file1, file2, file3)
+
+	fs.Config.SizeOnly = true
+	defer func() {
+		fs.Config.SizeOnly = false
+	}()
+
+	err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateSkip)
+	require.NoError(t, err)
+
+	r.CheckWithDuplicates(t, file1, file3)
+}
+
 func TestDeduplicateFirst(t *testing.T) {
 	r := fstest.NewRun(t)
 	defer r.Finalise()
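The new test flips the global `fs.Config.SizeOnly` flag and restores it with a deferred closure, since `Deduplicate` reads the flag from package-level config. A minimal sketch of that save/restore idiom (the `config` global and `withSizeOnly` helper are hypothetical, not rclone API):

```go
package main

import "fmt"

// config is a hypothetical stand-in for package-level settings such as
// rclone's fs.Config.
var config struct{ SizeOnly bool }

// withSizeOnly runs fn with SizeOnly set, restoring the old value
// afterwards - the same defer pattern the test above uses.
func withSizeOnly(fn func()) {
	old := config.SizeOnly
	config.SizeOnly = true
	defer func() { config.SizeOnly = old }()
	fn()
}

func main() {
	withSizeOnly(func() { fmt.Println("during:", config.SizeOnly) }) // during: true
	fmt.Println("after:", config.SizeOnly)                           // after: false
}
```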