dedupe: add --by-hash to dedupe on hash not file name - fixes #1674

Nick Craig-Wood 2020-10-13 16:22:02 +01:00
parent e073720a8f
commit 507f861c67
3 changed files with 114 additions and 45 deletions

cmd/dedupe/dedupe.go

@@ -12,12 +12,14 @@ import (
 var (
 	dedupeMode = operations.DeduplicateInteractive
+	byHash     = false
 )
 
 func init() {
 	cmd.Root.AddCommand(commandDefinition)
 	cmdFlag := commandDefinition.Flags()
 	flags.FVarP(cmdFlag, &dedupeMode, "dedupe-mode", "", "Dedupe mode interactive|skip|first|newest|oldest|largest|smallest|rename.")
+	flags.BoolVarP(cmdFlag, &byHash, "by-hash", "", false, "Find identical hashes rather than names")
 }
 
 var commandDefinition = &cobra.Command{
@@ -27,20 +29,26 @@ var commandDefinition = &cobra.Command{
 By default ` + "`dedupe`" + ` interactively finds files with duplicate
 names and offers to delete all but one or rename them to be
-different.
+different. This is known as deduping by name.
 
-This is only useful with backends like Google Drive which can have
-duplicate file names. It can be run on wrapping backends (e.g. crypt) if
-they wrap a backend which supports duplicate file names.
+Deduping by name is only useful with backends like Google Drive which
+can have duplicate file names. It can be run on wrapping backends
+(e.g. crypt) if they wrap a backend which supports duplicate file
+names.
 
-In the first pass it will merge directories with the same name. It
-will do this iteratively until all the identically named directories
-have been merged.
+However, if --by-hash is passed in, then dedupe will find files with
+duplicate hashes instead, which will work on any backend which supports
+at least one hash. This can be used to find files with duplicate
+content. This is known as deduping by hash.
 
-In the second pass, for every group of duplicate file names, it will
-delete all but one identical files it finds without confirmation.
-This means that for most duplicated files the ` + "`dedupe`" + `
-command will not be interactive.
+If deduping by name, first rclone will merge directories with the same
+name. It will do this iteratively until all the identically named
+directories have been merged.
+
+Next, if deduping by name, for every group of duplicate file names, it
+will delete all but one of the identical files it finds without
+confirmation. This means that for most duplicated files the ` +
+"`dedupe`" + ` command will not be interactive.
 
 ` + "`dedupe`" + ` considers files to be identical if they have the
 same file path and the same hash. If the backend does not support
 hashes (e.g. crypt wrapping
@@ -49,6 +57,10 @@ use the ` + "`--size-only`" + ` flag then files will be considered
 identical if they have the same size (any hash will be ignored). This
 can be useful on crypt backends which do not support hashes.
 
+Next rclone will resolve the remaining duplicates. Exactly which
+action is taken depends on the dedupe mode. By default rclone will
+interactively query the user for each one.
+
 **Important**: Since this can cause data loss, test first with the
 ` + "`--dry-run` or the `--interactive`/`-i`" + ` flag.
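Putting the new flag together with the existing ones, an invocation like `rclone dedupe --by-hash --dedupe-mode newest remote:path` keeps only the newest copy of each set of files with identical contents, and prefixing it with `--dry-run` shows what would be deleted without touching anything (the remote path here is just a placeholder).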
@@ -131,7 +143,7 @@ Or
 		}
 		fdst := cmd.NewFsSrc(args)
 		cmd.Run(false, false, command, func() error {
-			return operations.Deduplicate(context.Background(), fdst, dedupeMode)
+			return operations.Deduplicate(context.Background(), fdst, dedupeMode, byHash)
 		})
 	},
 }
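Before moving to the operations layer: the `flags.BoolVarP` call above is rclone's wrapper around spf13/pflag (the `cmdFlag` from `Flags()` is a `*pflag.FlagSet`). A minimal standalone sketch of the same registration using pflag directly; the wrapper additionally wires the flag into rclone's config and environment handling, so treat this as an approximation:

```go
package main

import (
	"fmt"

	"github.com/spf13/pflag"
)

var byHash = false

func main() {
	// Register --by-hash with no shorthand letter, defaulting to
	// false, mirroring the registration in the diff above.
	pflag.BoolVarP(&byHash, "by-hash", "", false, "Find identical hashes rather than names")
	pflag.Parse()
	fmt.Println("by-hash:", byHash)
}
```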

fs/operations/dedupe.go

@@ -139,7 +139,7 @@ func dedupeDeleteIdentical(ctx context.Context, ht hash.Type, remote string, obj
 }
 
 // dedupeInteractive interactively dedupes the slice of objects
-func dedupeInteractive(ctx context.Context, f fs.Fs, ht hash.Type, remote string, objs []fs.Object) {
+func dedupeInteractive(ctx context.Context, f fs.Fs, ht hash.Type, remote string, objs []fs.Object, byHash bool) {
 	fmt.Printf("%s: %d duplicates remain\n", remote, len(objs))
 	for i, o := range objs {
 		hashValue := ""
@@ -150,9 +150,17 @@ func dedupeInteractive(ctx context.Context, f fs.Fs, ht hash.Type, remote string
 				hashValue = err.Error()
 			}
 		}
-		fmt.Printf(" %d: %12d bytes, %s, %v %32s\n", i+1, o.Size(), o.ModTime(ctx).Local().Format("2006-01-02 15:04:05.000000000"), ht, hashValue)
+		if byHash {
+			fmt.Printf(" %d: %12d bytes, %s, %s\n", i+1, o.Size(), o.ModTime(ctx).Local().Format("2006-01-02 15:04:05.000000000"), o.Remote())
+		} else {
+			fmt.Printf(" %d: %12d bytes, %s, %v %32s\n", i+1, o.Size(), o.ModTime(ctx).Local().Format("2006-01-02 15:04:05.000000000"), ht, hashValue)
+		}
 	}
-	switch config.Command([]string{"sSkip and do nothing", "kKeep just one (choose which in next step)", "rRename all to be different (by changing file.jpg to file-1.jpg)"}) {
+	commands := []string{"sSkip and do nothing", "kKeep just one (choose which in next step)"}
+	if !byHash {
+		commands = append(commands, "rRename all to be different (by changing file.jpg to file-1.jpg)")
+	}
+	switch config.Command(commands) {
 	case 's':
 	case 'k':
 		keep := config.ChooseNumber("Enter the number of the file to keep", 1, len(objs))
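`config.Command` presents a menu built from these strings, where the first character of each entry is the keystroke that selects it, which is why dropping the rename entry in by-hash mode removes the `r` choice (renaming only makes sense when duplicates share a name). A self-contained sketch of that first-letter menu pattern, not rclone's actual helper, which also handles retries and defaults:

```go
package main

import (
	"bufio"
	"fmt"
	"os"
	"strings"
)

// command prints each option (the first letter is the shortcut key)
// and loops until the user types one of the shortcuts.
func command(opts []string) byte {
	in := bufio.NewReader(os.Stdin)
	for {
		for _, opt := range opts {
			fmt.Printf("%c) %s\n", opt[0], opt[1:])
		}
		fmt.Print("> ")
		line, err := in.ReadString('\n')
		if err != nil {
			return 0
		}
		line = strings.TrimSpace(line)
		for _, opt := range opts {
			if len(line) == 1 && line[0] == opt[0] {
				return opt[0]
			}
		}
		fmt.Println("unrecognised answer, try again")
	}
}

func main() {
	byHash := true
	commands := []string{"sSkip and do nothing", "kKeep just one (choose which in next step)"}
	if !byHash {
		// the rename option is only offered when deduping by name
		commands = append(commands, "rRename all to be different (by changing file.jpg to file-1.jpg)")
	}
	switch command(commands) {
	case 's':
		fmt.Println("skipped")
	case 'k':
		fmt.Println("would now ask which file to keep")
	}
}
```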
@@ -298,31 +306,51 @@ func sortSmallestFirst(objs []fs.Object) {
 // Deduplicate interactively finds duplicate files and offers to
 // delete all but one or rename them to be different. Only useful with
 // Google Drive which can have duplicate file names.
-func Deduplicate(ctx context.Context, f fs.Fs, mode DeduplicateMode) error {
-	fs.Infof(f, "Looking for duplicates using %v mode.", mode)
+func Deduplicate(ctx context.Context, f fs.Fs, mode DeduplicateMode, byHash bool) error {
 	ci := fs.GetConfig(ctx)
+	// find a hash to use
+	ht := f.Hashes().GetOne()
+	what := "names"
+	if byHash {
+		if ht == hash.None {
+			return errors.Errorf("%v has no hashes", f)
+		}
+		what = ht.String() + " hashes"
+	}
+	fs.Infof(f, "Looking for duplicate %s using %v mode.", what, mode)
 
 	// Find duplicate directories first and fix them
-	duplicateDirs, err := dedupeFindDuplicateDirs(ctx, f)
-	if err != nil {
-		return err
-	}
-	if len(duplicateDirs) != 0 {
-		err = dedupeMergeDuplicateDirs(ctx, f, duplicateDirs)
+	if !byHash {
+		duplicateDirs, err := dedupeFindDuplicateDirs(ctx, f)
 		if err != nil {
 			return err
 		}
+		if len(duplicateDirs) != 0 {
+			err = dedupeMergeDuplicateDirs(ctx, f, duplicateDirs)
+			if err != nil {
+				return err
+			}
+		}
 	}
 
-	// find a hash to use
-	ht := f.Hashes().GetOne()
-
 	// Now find duplicate files
 	files := map[string][]fs.Object{}
-	err = walk.ListR(ctx, f, "", true, ci.MaxDepth, walk.ListObjects, func(entries fs.DirEntries) error {
+	err := walk.ListR(ctx, f, "", true, ci.MaxDepth, walk.ListObjects, func(entries fs.DirEntries) error {
 		entries.ForObject(func(o fs.Object) {
-			remote := o.Remote()
-			files[remote] = append(files[remote], o)
+			var remote string
+			var err error
+			if byHash {
+				remote, err = o.Hash(ctx, ht)
+				if err != nil {
+					fs.Errorf(o, "Failed to hash: %v", err)
+					remote = ""
+				}
+			} else {
+				remote = o.Remote()
+			}
+			if remote != "" {
+				files[remote] = append(files[remote], o)
+			}
 		})
 		return nil
 	})
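This hunk is the heart of the change: the map key that groups potential duplicates is now either the file's name or its hash. A stripped-down version of the grouping step with stand-in types, whereas the real code walks `fs.Object` entries and logs hashing failures as shown above:

```go
package main

import "fmt"

// object stands in for fs.Object; only the fields the grouping
// needs are modelled here.
type object struct {
	remote string // path within the remote
	hash   string // e.g. an MD5, "" if hashing failed
}

// groupDuplicates mirrors the files map built in Deduplicate: the
// key is the remote name, or the hash when byHash is set, and
// objects whose hash could not be computed are skipped.
func groupDuplicates(objs []object, byHash bool) map[string][]object {
	files := map[string][]object{}
	for _, o := range objs {
		key := o.remote
		if byHash {
			key = o.hash
		}
		if key != "" {
			files[key] = append(files[key], o)
		}
	}
	return files
}

func main() {
	objs := []object{
		{remote: "one", hash: "abc"},
		{remote: "also/one", hash: "abc"},
		{remote: "not-one", hash: "def"},
	}
	// only groups with more than one member need deduping
	for key, group := range groupDuplicates(objs, true) {
		if len(group) > 1 {
			fmt.Printf("%s: %d duplicates\n", key, len(group))
		}
	}
}
```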
@@ -332,15 +360,17 @@ func Deduplicate(ctx context.Context, f fs.Fs, mode DeduplicateMode) error {
 	for remote, objs := range files {
 		if len(objs) > 1 {
-			fs.Logf(remote, "Found %d files with duplicate names", len(objs))
-			objs = dedupeDeleteIdentical(ctx, ht, remote, objs)
-			if len(objs) <= 1 {
-				fs.Logf(remote, "All duplicates removed")
-				continue
+			fs.Logf(remote, "Found %d files with duplicate %s", len(objs), what)
+			if !byHash {
+				objs = dedupeDeleteIdentical(ctx, ht, remote, objs)
+				if len(objs) <= 1 {
+					fs.Logf(remote, "All duplicates removed")
+					continue
+				}
 			}
 			switch mode {
 			case DeduplicateInteractive:
-				dedupeInteractive(ctx, f, ht, remote, objs)
+				dedupeInteractive(ctx, f, ht, remote, objs, byHash)
 			case DeduplicateFirst:
 				dedupeDeleteAllButOne(ctx, 0, remote, objs)
 			case DeduplicateNewest:
@@ -358,7 +388,7 @@ func Deduplicate(ctx context.Context, f fs.Fs, mode DeduplicateMode) error {
 				sortSmallestFirst(objs)
 				dedupeDeleteAllButOne(ctx, 0, remote, objs)
 			case DeduplicateSkip:
-				fs.Logf(remote, "Skipping %d files with duplicate names", len(objs))
+				fs.Logf(remote, "Skipping %d files with duplicate %s", len(objs), what)
 			default:
 				//skip
 			}

fs/operations/dedupe_test.go

@@ -10,6 +10,7 @@ import (
 	"github.com/rclone/rclone/fs/operations"
 	"github.com/rclone/rclone/fs/walk"
 	"github.com/rclone/rclone/fstest"
+	"github.com/rclone/rclone/lib/random"
 	"github.com/spf13/pflag"
 	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/require"
@@ -36,6 +37,12 @@ func skipIfNoHash(t *testing.T, f fs.Fs) {
 	}
 }
 
+func skipIfNoModTime(t *testing.T, f fs.Fs) {
+	if f.Precision() >= fs.ModTimeNotSupported {
+		t.Skip("Can't run this test without modtimes")
+	}
+}
+
 func TestDeduplicateInteractive(t *testing.T) {
 	r := fstest.NewRun(t)
 	defer r.Finalise()
@@ -47,7 +54,7 @@ func TestDeduplicateInteractive(t *testing.T) {
 	file3 := r.WriteUncheckedObject(context.Background(), "one", "This is one", t1)
 	r.CheckWithDuplicates(t, file1, file2, file3)
 
-	err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateInteractive)
+	err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateInteractive, false)
 	require.NoError(t, err)
 
 	fstest.CheckItems(t, r.Fremote, file1)
@@ -69,7 +76,7 @@ func TestDeduplicateSkip(t *testing.T) {
 	files = append(files, file3)
 	r.CheckWithDuplicates(t, files...)
 
-	err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateSkip)
+	err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateSkip, false)
 	require.NoError(t, err)
 
 	r.CheckWithDuplicates(t, file1, file3)
@@ -92,7 +99,7 @@ func TestDeduplicateSizeOnly(t *testing.T) {
 		ci.SizeOnly = false
 	}()
 
-	err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateSkip)
+	err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateSkip, false)
 	require.NoError(t, err)
 
 	r.CheckWithDuplicates(t, file1, file3)
@@ -108,7 +115,7 @@ func TestDeduplicateFirst(t *testing.T) {
 	file3 := r.WriteUncheckedObject(context.Background(), "one", "This is one BB", t1)
 	r.CheckWithDuplicates(t, file1, file2, file3)
 
-	err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateFirst)
+	err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateFirst, false)
 	require.NoError(t, err)
 
 	// list until we get one object
@@ -131,18 +138,38 @@ func TestDeduplicateNewest(t *testing.T) {
 	r := fstest.NewRun(t)
 	defer r.Finalise()
 	skipIfCantDedupe(t, r.Fremote)
+	skipIfNoModTime(t, r.Fremote)
 
 	file1 := r.WriteUncheckedObject(context.Background(), "one", "This is one", t1)
 	file2 := r.WriteUncheckedObject(context.Background(), "one", "This is one too", t2)
 	file3 := r.WriteUncheckedObject(context.Background(), "one", "This is another one", t3)
 	r.CheckWithDuplicates(t, file1, file2, file3)
 
-	err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateNewest)
+	err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateNewest, false)
 	require.NoError(t, err)
 
 	fstest.CheckItems(t, r.Fremote, file3)
 }
 
+func TestDeduplicateNewestByHash(t *testing.T) {
+	r := fstest.NewRun(t)
+	defer r.Finalise()
+	skipIfNoHash(t, r.Fremote)
+	skipIfNoModTime(t, r.Fremote)
+	contents := random.String(100)
+
+	file1 := r.WriteObject(context.Background(), "one", contents, t1)
+	file2 := r.WriteObject(context.Background(), "also/one", contents, t2)
+	file3 := r.WriteObject(context.Background(), "another", contents, t3)
+	file4 := r.WriteObject(context.Background(), "not-one", "stuff", t3)
+	fstest.CheckItems(t, r.Fremote, file1, file2, file3, file4)
+
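+	// Only file3 should survive: file1, file2 and file3 share the same
+	// contents and therefore the same hash, and file3 is the newest,
+	// while file4 has different contents so it forms a group of its own.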
+	err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateNewest, true)
+	require.NoError(t, err)
+
+	fstest.CheckItems(t, r.Fremote, file3, file4)
+}
+
 func TestDeduplicateOldest(t *testing.T) {
 	r := fstest.NewRun(t)
 	defer r.Finalise()
@@ -153,7 +180,7 @@ func TestDeduplicateOldest(t *testing.T) {
 	file3 := r.WriteUncheckedObject(context.Background(), "one", "This is another one", t3)
 	r.CheckWithDuplicates(t, file1, file2, file3)
 
-	err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateOldest)
+	err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateOldest, false)
 	require.NoError(t, err)
 
 	fstest.CheckItems(t, r.Fremote, file1)
@@ -169,7 +196,7 @@ func TestDeduplicateLargest(t *testing.T) {
 	file3 := r.WriteUncheckedObject(context.Background(), "one", "This is another one", t3)
 	r.CheckWithDuplicates(t, file1, file2, file3)
 
-	err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateLargest)
+	err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateLargest, false)
 	require.NoError(t, err)
 
 	fstest.CheckItems(t, r.Fremote, file3)
@@ -185,7 +212,7 @@ func TestDeduplicateSmallest(t *testing.T) {
 	file3 := r.WriteUncheckedObject(context.Background(), "one", "This is another one", t3)
 	r.CheckWithDuplicates(t, file1, file2, file3)
 
-	err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateSmallest)
+	err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateSmallest, false)
 	require.NoError(t, err)
 
 	fstest.CheckItems(t, r.Fremote, file1)
@@ -202,7 +229,7 @@ func TestDeduplicateRename(t *testing.T) {
 	file4 := r.WriteUncheckedObject(context.Background(), "one-1.txt", "This is not a duplicate", t1)
 	r.CheckWithDuplicates(t, file1, file2, file3, file4)
 
-	err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateRename)
+	err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateRename, false)
 	require.NoError(t, err)
 
 	require.NoError(t, walk.ListR(context.Background(), r.Fremote, "", true, -1, walk.ListObjects, func(entries fs.DirEntries) error {
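For reference, these tests follow rclone's usual integration pattern: fstest adds a `-remote` flag, so something along the lines of `go test -run 'TestDeduplicate.*' -remote TestDrive: ./fs/operations` would run them against a real backend. The remote name is only an example, and backends without hashes or modification times will cause the new by-hash test to skip itself via `skipIfNoHash` and `skipIfNoModTime`.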