mirror of
https://github.com/rclone/rclone.git
synced 2025-01-25 15:49:33 +01:00
dedupe: Make it obey the --size-only flag for duplicate detection #4321
This commit is contained in:
parent
d5f4c74697
commit
62f0bbb598
@ -22,19 +22,32 @@ func init() {
|
||||
|
||||
var commandDefinition = &cobra.Command{
|
||||
Use: "dedupe [mode] remote:path",
|
||||
Short: `Interactively find duplicate files and delete/rename them.`,
|
||||
Short: `Interactively find duplicate filenames and delete/rename them.`,
|
||||
Long: `
|
||||
By default ` + "`" + `dedupe` + "`" + ` interactively finds duplicate files and offers to
|
||||
delete all but one or rename them to be different. Only useful with
|
||||
Google Drive which can have duplicate file names.
|
||||
|
||||
By default ` + "`dedupe`" + ` interactively finds files with duplicate
|
||||
names and offers to delete all but one or rename them to be
|
||||
different.
|
||||
|
||||
This is only useful with backends like Google Drive which can have
|
||||
duplicate file names. It can be run on wrapping backends (eg crypt) if
|
||||
they wrap a backend which supports duplicate file names.
|
||||
|
||||
In the first pass it will merge directories with the same name. It
|
||||
will do this iteratively until all the identical directories have been
|
||||
merged.
|
||||
will do this iteratively until all the identically named directories
|
||||
have been merged.
|
||||
|
||||
The ` + "`" + `dedupe` + "`" + ` command will delete all but one of any identical (same
|
||||
md5sum) files it finds without confirmation. This means that for most
|
||||
duplicated files the ` + "`" + `dedupe` + "`" + ` command will not be interactive.
|
||||
In the second pass, for every group of duplicate file names, it will
|
||||
delete all but one identical file it finds without confirmation.
|
||||
This means that for most duplicated files the ` + "`dedupe`" + `
|
||||
command will not be interactive.
|
||||
|
||||
` + "`dedupe`" + ` considers files to be identical if they have the
|
||||
same hash. If the backend does not support hashes (eg crypt wrapping
|
||||
Google Drive) then they will never be found to be identical. If you
|
||||
use the ` + "`--size-only`" + ` flag then files will be considered
|
||||
identical if they have the same size (any hash will be ignored). This
|
||||
can be useful on crypt backends which do not support hashes.
|
||||
|
||||
**Important**: Since this can cause data loss, test first with the
|
||||
` + "`--dry-run` or the `--interactive`/`-i`" + ` flag.
|
||||
@ -52,26 +65,26 @@ Before - with duplicates
|
||||
1744073 2016-03-05 16:22:38.104000000 two.txt
|
||||
564374 2016-03-05 16:22:52.118000000 two.txt
|
||||
|
||||
Now the ` + "`" + `dedupe` + "`" + ` session
|
||||
Now the ` + "`dedupe`" + ` session
|
||||
|
||||
$ rclone dedupe drive:dupes
|
||||
2016/03/05 16:24:37 Google drive root 'dupes': Looking for duplicates using interactive mode.
|
||||
one.txt: Found 4 duplicates - deleting identical copies
|
||||
one.txt: Deleting 2/3 identical duplicates (md5sum "1eedaa9fe86fd4b8632e2ac549403b36")
|
||||
one.txt: Found 4 files with duplicate names
|
||||
one.txt: Deleting 2/3 identical duplicates (MD5 "1eedaa9fe86fd4b8632e2ac549403b36")
|
||||
one.txt: 2 duplicates remain
|
||||
1: 6048320 bytes, 2016-03-05 16:23:16.798000000, md5sum 1eedaa9fe86fd4b8632e2ac549403b36
|
||||
2: 564374 bytes, 2016-03-05 16:23:06.731000000, md5sum 7594e7dc9fc28f727c42ee3e0749de81
|
||||
1: 6048320 bytes, 2016-03-05 16:23:16.798000000, MD5 1eedaa9fe86fd4b8632e2ac549403b36
|
||||
2: 564374 bytes, 2016-03-05 16:23:06.731000000, MD5 7594e7dc9fc28f727c42ee3e0749de81
|
||||
s) Skip and do nothing
|
||||
k) Keep just one (choose which in next step)
|
||||
r) Rename all to be different (by changing file.jpg to file-1.jpg)
|
||||
s/k/r> k
|
||||
Enter the number of the file to keep> 1
|
||||
one.txt: Deleted 1 extra copies
|
||||
two.txt: Found 3 duplicates - deleting identical copies
|
||||
two.txt: Found 3 files with duplicate names
|
||||
two.txt: 3 duplicates remain
|
||||
1: 564374 bytes, 2016-03-05 16:22:52.118000000, md5sum 7594e7dc9fc28f727c42ee3e0749de81
|
||||
2: 6048320 bytes, 2016-03-05 16:22:46.185000000, md5sum 1eedaa9fe86fd4b8632e2ac549403b36
|
||||
3: 1744073 bytes, 2016-03-05 16:22:38.104000000, md5sum 851957f7fb6f0bc4ce76be966d336802
|
||||
1: 564374 bytes, 2016-03-05 16:22:52.118000000, MD5 7594e7dc9fc28f727c42ee3e0749de81
|
||||
2: 6048320 bytes, 2016-03-05 16:22:46.185000000, MD5 1eedaa9fe86fd4b8632e2ac549403b36
|
||||
3: 1744073 bytes, 2016-03-05 16:22:38.104000000, MD5 851957f7fb6f0bc4ce76be966d336802
|
||||
s) Skip and do nothing
|
||||
k) Keep just one (choose which in next step)
|
||||
r) Rename all to be different (by changing file.jpg to file-1.jpg)
|
||||
|
@ -101,22 +101,30 @@ func dedupeDeleteIdentical(ctx context.Context, ht hash.Type, remote string, obj
|
||||
objs = newObjs
|
||||
|
||||
// See how many of these duplicates are identical
|
||||
byHash := make(map[string][]fs.Object, len(objs))
|
||||
dupesByID := make(map[string][]fs.Object, len(objs))
|
||||
for _, o := range objs {
|
||||
md5sum, err := o.Hash(ctx, ht)
|
||||
if err != nil || md5sum == "" {
|
||||
ID := ""
|
||||
if fs.Config.SizeOnly && o.Size() >= 0 {
|
||||
ID = fmt.Sprintf("size %d", o.Size())
|
||||
} else if ht != hash.None {
|
||||
hashValue, err := o.Hash(ctx, ht)
|
||||
if err == nil && hashValue != "" {
|
||||
ID = fmt.Sprintf("%v %s", ht, hashValue)
|
||||
}
|
||||
}
|
||||
if ID == "" {
|
||||
remainingObjs = append(remainingObjs, o)
|
||||
} else {
|
||||
byHash[md5sum] = append(byHash[md5sum], o)
|
||||
dupesByID[ID] = append(dupesByID[ID], o)
|
||||
}
|
||||
}
|
||||
|
||||
// Delete identical duplicates, filling remainingObjs with the ones remaining
|
||||
for md5sum, hashObjs := range byHash {
|
||||
remainingObjs = append(remainingObjs, hashObjs[0])
|
||||
if len(hashObjs) > 1 {
|
||||
fs.Logf(remote, "Deleting %d/%d identical duplicates (%v %q)", len(hashObjs)-1, len(hashObjs), ht, md5sum)
|
||||
for _, o := range hashObjs[1:] {
|
||||
for ID, dupes := range dupesByID {
|
||||
remainingObjs = append(remainingObjs, dupes[0])
|
||||
if len(dupes) > 1 {
|
||||
fs.Logf(remote, "Deleting %d/%d identical duplicates (%s)", len(dupes)-1, len(dupes), ID)
|
||||
for _, o := range dupes[1:] {
|
||||
err := DeleteFile(ctx, o)
|
||||
if err != nil {
|
||||
remainingObjs = append(remainingObjs, o)
|
||||
@ -132,11 +140,15 @@ func dedupeDeleteIdentical(ctx context.Context, ht hash.Type, remote string, obj
|
||||
func dedupeInteractive(ctx context.Context, f fs.Fs, ht hash.Type, remote string, objs []fs.Object) {
|
||||
fmt.Printf("%s: %d duplicates remain\n", remote, len(objs))
|
||||
for i, o := range objs {
|
||||
md5sum, err := o.Hash(ctx, ht)
|
||||
hashValue := ""
|
||||
if ht != hash.None {
|
||||
var err error
|
||||
hashValue, err = o.Hash(ctx, ht)
|
||||
if err != nil {
|
||||
md5sum = err.Error()
|
||||
hashValue = err.Error()
|
||||
}
|
||||
fmt.Printf(" %d: %12d bytes, %s, %v %32s\n", i+1, o.Size(), o.ModTime(ctx).Local().Format("2006-01-02 15:04:05.000000000"), ht, md5sum)
|
||||
}
|
||||
fmt.Printf(" %d: %12d bytes, %s, %v %32s\n", i+1, o.Size(), o.ModTime(ctx).Local().Format("2006-01-02 15:04:05.000000000"), ht, hashValue)
|
||||
}
|
||||
switch config.Command([]string{"sSkip and do nothing", "kKeep just one (choose which in next step)", "rRename all to be different (by changing file.jpg to file-1.jpg)"}) {
|
||||
case 's':
|
||||
|
@ -75,6 +75,27 @@ func TestDeduplicateSkip(t *testing.T) {
|
||||
r.CheckWithDuplicates(t, file1, file3)
|
||||
}
|
||||
|
||||
// TestDeduplicateSizeOnly runs Deduplicate in skip mode with
// fs.Config.SizeOnly enabled. Three objects named "one" are written:
// file1 and file2 have different contents of the same length, file3 has
// a different length. After deduplication the remote is expected to hold
// only file1 and file3.
func TestDeduplicateSizeOnly(t *testing.T) {
|
||||
r := fstest.NewRun(t)
|
||||
defer r.Finalise()
|
||||
skipIfCantDedupe(t, r.Fremote)
|
||||
|
||||
file1 := r.WriteUncheckedObject(context.Background(), "one", "This is one", t1)
|
||||
file2 := r.WriteUncheckedObject(context.Background(), "one", "THIS IS ONE", t1)
|
||||
file3 := r.WriteUncheckedObject(context.Background(), "one", "This is another one", t1)
|
||||
r.CheckWithDuplicates(t, file1, file2, file3)
|
||||
|
||||
// Enable size-only comparison for this test; the deferred func below sets the flag back to false on return.
fs.Config.SizeOnly = true
|
||||
defer func() {
|
||||
fs.Config.SizeOnly = false
|
||||
}()
|
||||
|
||||
err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateSkip)
|
||||
require.NoError(t, err)
|
||||
|
||||
// The check below lists file1 and file3, i.e. one of the two same-length objects is expected to be gone.
r.CheckWithDuplicates(t, file1, file3)
|
||||
}
|
||||
|
||||
func TestDeduplicateFirst(t *testing.T) {
|
||||
r := fstest.NewRun(t)
|
||||
defer r.Finalise()
|
||||
|
Loading…
Reference in New Issue
Block a user