mirror of
https://github.com/rclone/rclone.git
synced 2024-11-23 08:54:10 +01:00
237daa8aaf
Fixes #5881
501 lines
13 KiB
Go
501 lines
13 KiB
Go
// dedupe - gets rid of identical files remotes which can have duplicate file names (drive, mega)
|
|
|
|
package operations
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"log"
|
|
"path"
|
|
"sort"
|
|
"strings"
|
|
|
|
"github.com/rclone/rclone/fs"
|
|
"github.com/rclone/rclone/fs/config"
|
|
"github.com/rclone/rclone/fs/hash"
|
|
"github.com/rclone/rclone/fs/walk"
|
|
)
|
|
|
|
// dedupeRename renames the objs slice to different names
|
|
func dedupeRename(ctx context.Context, f fs.Fs, remote string, objs []fs.Object) {
|
|
doMove := f.Features().Move
|
|
if doMove == nil {
|
|
log.Fatalf("Fs %v doesn't support Move", f)
|
|
}
|
|
ext := path.Ext(remote)
|
|
base := remote[:len(remote)-len(ext)]
|
|
|
|
outer:
|
|
for i, o := range objs {
|
|
suffix := 1
|
|
newName := fmt.Sprintf("%s-%d%s", base, i+suffix, ext)
|
|
_, err := f.NewObject(ctx, newName)
|
|
for ; err != fs.ErrorObjectNotFound; suffix++ {
|
|
if err != nil {
|
|
err = fs.CountError(err)
|
|
fs.Errorf(o, "Failed to check for existing object: %v", err)
|
|
continue outer
|
|
}
|
|
if suffix > 100 {
|
|
fs.Errorf(o, "Could not find an available new name")
|
|
continue outer
|
|
}
|
|
newName = fmt.Sprintf("%s-%d%s", base, i+suffix, ext)
|
|
_, err = f.NewObject(ctx, newName)
|
|
}
|
|
if !SkipDestructive(ctx, o, "rename") {
|
|
newObj, err := doMove(ctx, o, newName)
|
|
if err != nil {
|
|
err = fs.CountError(err)
|
|
fs.Errorf(o, "Failed to rename: %v", err)
|
|
continue
|
|
}
|
|
fs.Infof(newObj, "renamed from: %v", o)
|
|
}
|
|
}
|
|
}
|
|
|
|
// dedupeDeleteAllButOne deletes all but the one in keep
|
|
func dedupeDeleteAllButOne(ctx context.Context, keep int, remote string, objs []fs.Object) {
|
|
count := 0
|
|
for i, o := range objs {
|
|
if i == keep {
|
|
continue
|
|
}
|
|
err := DeleteFile(ctx, o)
|
|
if err == nil {
|
|
count++
|
|
}
|
|
}
|
|
if count > 0 {
|
|
fs.Logf(remote, "Deleted %d extra copies", count)
|
|
}
|
|
}
|
|
|
|
// dedupeDeleteIdentical deletes all but one of identical (by hash) copies
|
|
func dedupeDeleteIdentical(ctx context.Context, ht hash.Type, remote string, objs []fs.Object) (remainingObjs []fs.Object) {
|
|
ci := fs.GetConfig(ctx)
|
|
|
|
// Make map of IDs
|
|
IDs := make(map[string]int, len(objs))
|
|
for _, o := range objs {
|
|
if do, ok := o.(fs.IDer); ok {
|
|
if ID := do.ID(); ID != "" {
|
|
IDs[ID]++
|
|
}
|
|
}
|
|
}
|
|
|
|
// Remove duplicate IDs
|
|
newObjs := objs[:0]
|
|
for _, o := range objs {
|
|
if do, ok := o.(fs.IDer); ok {
|
|
if ID := do.ID(); ID != "" {
|
|
if IDs[ID] <= 1 {
|
|
newObjs = append(newObjs, o)
|
|
} else {
|
|
fs.Logf(o, "Ignoring as it appears %d times in the listing and deleting would lead to data loss", IDs[ID])
|
|
}
|
|
}
|
|
}
|
|
}
|
|
objs = newObjs
|
|
|
|
// See how many of these duplicates are identical
|
|
dupesByID := make(map[string][]fs.Object, len(objs))
|
|
for _, o := range objs {
|
|
ID := ""
|
|
if ci.SizeOnly && o.Size() >= 0 {
|
|
ID = fmt.Sprintf("size %d", o.Size())
|
|
} else if ht != hash.None {
|
|
hashValue, err := o.Hash(ctx, ht)
|
|
if err == nil && hashValue != "" {
|
|
ID = fmt.Sprintf("%v %s", ht, hashValue)
|
|
}
|
|
}
|
|
if ID == "" {
|
|
remainingObjs = append(remainingObjs, o)
|
|
} else {
|
|
dupesByID[ID] = append(dupesByID[ID], o)
|
|
}
|
|
}
|
|
|
|
// Delete identical duplicates, filling remainingObjs with the ones remaining
|
|
for ID, dupes := range dupesByID {
|
|
remainingObjs = append(remainingObjs, dupes[0])
|
|
if len(dupes) > 1 {
|
|
fs.Logf(remote, "Deleting %d/%d identical duplicates (%s)", len(dupes)-1, len(dupes), ID)
|
|
for _, o := range dupes[1:] {
|
|
err := DeleteFile(ctx, o)
|
|
if err != nil {
|
|
remainingObjs = append(remainingObjs, o)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
return remainingObjs
|
|
}
|
|
|
|
// dedupeList lists the duplicates and does nothing
|
|
func dedupeList(ctx context.Context, f fs.Fs, ht hash.Type, remote string, objs []fs.Object, byHash bool) {
|
|
fmt.Printf("%s: %d duplicates\n", remote, len(objs))
|
|
for i, o := range objs {
|
|
hashValue := ""
|
|
if ht != hash.None {
|
|
var err error
|
|
hashValue, err = o.Hash(ctx, ht)
|
|
if err != nil {
|
|
hashValue = err.Error()
|
|
}
|
|
}
|
|
if byHash {
|
|
fmt.Printf(" %d: %12d bytes, %s, %s\n", i+1, o.Size(), o.ModTime(ctx).Local().Format("2006-01-02 15:04:05.000000000"), o.Remote())
|
|
} else {
|
|
fmt.Printf(" %d: %12d bytes, %s, %v %32s\n", i+1, o.Size(), o.ModTime(ctx).Local().Format("2006-01-02 15:04:05.000000000"), ht, hashValue)
|
|
}
|
|
}
|
|
}
|
|
|
|
// dedupeInteractive interactively dedupes the slice of objects
|
|
func dedupeInteractive(ctx context.Context, f fs.Fs, ht hash.Type, remote string, objs []fs.Object, byHash bool) bool {
|
|
dedupeList(ctx, f, ht, remote, objs, byHash)
|
|
commands := []string{"sSkip and do nothing", "kKeep just one (choose which in next step)"}
|
|
if !byHash {
|
|
commands = append(commands, "rRename all to be different (by changing file.jpg to file-1.jpg)")
|
|
}
|
|
commands = append(commands, "qQuit")
|
|
switch config.Command(commands) {
|
|
case 's':
|
|
case 'k':
|
|
keep := config.ChooseNumber("Enter the number of the file to keep", 1, len(objs))
|
|
dedupeDeleteAllButOne(ctx, keep-1, remote, objs)
|
|
case 'r':
|
|
dedupeRename(ctx, f, remote, objs)
|
|
case 'q':
|
|
return false
|
|
}
|
|
return true
|
|
}
|
|
|
|
// DeduplicateMode is how the dedupe command chooses what to do
|
|
type DeduplicateMode int
|
|
|
|
// Deduplicate modes
|
|
const (
|
|
DeduplicateInteractive DeduplicateMode = iota // interactively ask the user
|
|
DeduplicateSkip // skip all conflicts
|
|
DeduplicateFirst // choose the first object
|
|
DeduplicateNewest // choose the newest object
|
|
DeduplicateOldest // choose the oldest object
|
|
DeduplicateRename // rename the objects
|
|
DeduplicateLargest // choose the largest object
|
|
DeduplicateSmallest // choose the smallest object
|
|
DeduplicateList // list duplicates only
|
|
)
|
|
|
|
func (x DeduplicateMode) String() string {
|
|
switch x {
|
|
case DeduplicateInteractive:
|
|
return "interactive"
|
|
case DeduplicateSkip:
|
|
return "skip"
|
|
case DeduplicateFirst:
|
|
return "first"
|
|
case DeduplicateNewest:
|
|
return "newest"
|
|
case DeduplicateOldest:
|
|
return "oldest"
|
|
case DeduplicateRename:
|
|
return "rename"
|
|
case DeduplicateLargest:
|
|
return "largest"
|
|
case DeduplicateSmallest:
|
|
return "smallest"
|
|
case DeduplicateList:
|
|
return "list"
|
|
}
|
|
return "unknown"
|
|
}
|
|
|
|
// Set a DeduplicateMode from a string
|
|
func (x *DeduplicateMode) Set(s string) error {
|
|
switch strings.ToLower(s) {
|
|
case "interactive":
|
|
*x = DeduplicateInteractive
|
|
case "skip":
|
|
*x = DeduplicateSkip
|
|
case "first":
|
|
*x = DeduplicateFirst
|
|
case "newest":
|
|
*x = DeduplicateNewest
|
|
case "oldest":
|
|
*x = DeduplicateOldest
|
|
case "rename":
|
|
*x = DeduplicateRename
|
|
case "largest":
|
|
*x = DeduplicateLargest
|
|
case "smallest":
|
|
*x = DeduplicateSmallest
|
|
case "list":
|
|
*x = DeduplicateList
|
|
default:
|
|
return fmt.Errorf("unknown mode for dedupe %q", s)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// Type of the value
|
|
func (x *DeduplicateMode) Type() string {
|
|
return "string"
|
|
}
|
|
|
|
// Directory with entry count and links to parents
|
|
type dedupeDir struct {
|
|
dir fs.Directory
|
|
parent string
|
|
count int
|
|
}
|
|
|
|
// Map of directories by ID with recursive counts
|
|
type dedupeDirsMap map[string]*dedupeDir
|
|
|
|
func (dm dedupeDirsMap) get(id string) *dedupeDir {
|
|
d := dm[id]
|
|
if d == nil {
|
|
d = &dedupeDir{}
|
|
dm[id] = d
|
|
}
|
|
return d
|
|
}
|
|
|
|
func (dm dedupeDirsMap) increment(parent string) {
|
|
if parent != "" {
|
|
d := dm.get(parent)
|
|
d.count++
|
|
dm.increment(d.parent)
|
|
}
|
|
}
|
|
|
|
// dedupeFindDuplicateDirs scans f for duplicate directories
|
|
func dedupeFindDuplicateDirs(ctx context.Context, f fs.Fs) (duplicateDirs [][]*dedupeDir, err error) {
|
|
dirsByID := dedupeDirsMap{}
|
|
dirs := map[string][]*dedupeDir{}
|
|
|
|
ci := fs.GetConfig(ctx)
|
|
err = walk.ListR(ctx, f, "", true, ci.MaxDepth, walk.ListAll, func(entries fs.DirEntries) error {
|
|
for _, entry := range entries {
|
|
remote := entry.Remote()
|
|
parentRemote := path.Dir(remote)
|
|
if parentRemote == "." {
|
|
parentRemote = ""
|
|
}
|
|
|
|
// Obtain ID of the object parent, if known.
|
|
// (This usually means that backend allows duplicate paths)
|
|
// Fall back to remote parent path, if unavailable.
|
|
var parent string
|
|
if entryParentIDer, ok := entry.(fs.ParentIDer); ok {
|
|
parent = entryParentIDer.ParentID()
|
|
}
|
|
if parent == "" {
|
|
parent = parentRemote
|
|
}
|
|
|
|
var ID string
|
|
if entryIDer, ok := entry.(fs.IDer); ok {
|
|
ID = entryIDer.ID()
|
|
}
|
|
if ID == "" {
|
|
ID = remote
|
|
}
|
|
|
|
if fsDir, ok := entry.(fs.Directory); ok {
|
|
d := dirsByID.get(ID)
|
|
d.dir = fsDir
|
|
d.parent = parent
|
|
dirs[remote] = append(dirs[remote], d)
|
|
}
|
|
|
|
dirsByID.increment(parent)
|
|
}
|
|
return nil
|
|
})
|
|
if err != nil {
|
|
return nil, fmt.Errorf("find duplicate dirs: %w", err)
|
|
}
|
|
|
|
// Make sure parents are before children
|
|
duplicateNames := []string{}
|
|
for name, ds := range dirs {
|
|
if len(ds) > 1 {
|
|
duplicateNames = append(duplicateNames, name)
|
|
}
|
|
}
|
|
sort.Strings(duplicateNames)
|
|
for _, name := range duplicateNames {
|
|
duplicateDirs = append(duplicateDirs, dirs[name])
|
|
}
|
|
|
|
return
|
|
}
|
|
|
|
// dedupeMergeDuplicateDirs merges all the duplicate directories found
|
|
func dedupeMergeDuplicateDirs(ctx context.Context, f fs.Fs, duplicateDirs [][]*dedupeDir) error {
|
|
mergeDirs := f.Features().MergeDirs
|
|
if mergeDirs == nil {
|
|
return fmt.Errorf("%v: can't merge directories", f)
|
|
}
|
|
dirCacheFlush := f.Features().DirCacheFlush
|
|
if dirCacheFlush == nil {
|
|
return fmt.Errorf("%v: can't flush dir cache", f)
|
|
}
|
|
for _, dedupeDirs := range duplicateDirs {
|
|
if SkipDestructive(ctx, dedupeDirs[0].dir, "merge duplicate directories") {
|
|
continue
|
|
}
|
|
|
|
// Put largest directory in front to minimize movements
|
|
fsDirs := []fs.Directory{}
|
|
largestCount := -1
|
|
largestIdx := 0
|
|
for i, d := range dedupeDirs {
|
|
fsDirs = append(fsDirs, d.dir)
|
|
if d.count > largestCount {
|
|
largestIdx = i
|
|
largestCount = d.count
|
|
}
|
|
}
|
|
fsDirs[largestIdx], fsDirs[0] = fsDirs[0], fsDirs[largestIdx]
|
|
|
|
fs.Infof(fsDirs[0], "Merging contents of duplicate directories")
|
|
err := mergeDirs(ctx, fsDirs)
|
|
if err != nil {
|
|
err = fs.CountError(err)
|
|
fs.Errorf(nil, "merge duplicate dirs: %v", err)
|
|
}
|
|
}
|
|
dirCacheFlush()
|
|
return nil
|
|
}
|
|
|
|
// sort oldest first
|
|
func sortOldestFirst(objs []fs.Object) {
|
|
sort.Slice(objs, func(i, j int) bool {
|
|
return objs[i].ModTime(context.TODO()).Before(objs[j].ModTime(context.TODO()))
|
|
})
|
|
}
|
|
|
|
// sort smallest first
|
|
func sortSmallestFirst(objs []fs.Object) {
|
|
sort.Slice(objs, func(i, j int) bool {
|
|
return objs[i].Size() < objs[j].Size()
|
|
})
|
|
}
|
|
|
|
// Deduplicate interactively finds duplicate files and offers to
|
|
// delete all but one or rename them to be different. Only useful with
|
|
// Google Drive which can have duplicate file names.
|
|
func Deduplicate(ctx context.Context, f fs.Fs, mode DeduplicateMode, byHash bool) error {
|
|
ci := fs.GetConfig(ctx)
|
|
// find a hash to use
|
|
ht := f.Hashes().GetOne()
|
|
what := "names"
|
|
if byHash {
|
|
if ht == hash.None {
|
|
return fmt.Errorf("%v has no hashes", f)
|
|
}
|
|
what = ht.String() + " hashes"
|
|
}
|
|
fs.Infof(f, "Looking for duplicate %s using %v mode.", what, mode)
|
|
|
|
// Find duplicate directories first and fix them
|
|
if !byHash {
|
|
duplicateDirs, err := dedupeFindDuplicateDirs(ctx, f)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if len(duplicateDirs) > 0 {
|
|
if mode != DeduplicateList {
|
|
err = dedupeMergeDuplicateDirs(ctx, f, duplicateDirs)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
} else {
|
|
for _, dedupeDirs := range duplicateDirs {
|
|
remote := dedupeDirs[0].dir.Remote()
|
|
fmt.Printf("%s: %d duplicates of this directory\n", remote, len(dedupeDirs))
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Now find duplicate files
|
|
files := map[string][]fs.Object{}
|
|
err := walk.ListR(ctx, f, "", true, ci.MaxDepth, walk.ListObjects, func(entries fs.DirEntries) error {
|
|
entries.ForObject(func(o fs.Object) {
|
|
var remote string
|
|
var err error
|
|
if byHash {
|
|
remote, err = o.Hash(ctx, ht)
|
|
if err != nil {
|
|
fs.Errorf(o, "Failed to hash: %v", err)
|
|
remote = ""
|
|
}
|
|
} else {
|
|
remote = o.Remote()
|
|
}
|
|
if remote != "" {
|
|
files[remote] = append(files[remote], o)
|
|
}
|
|
})
|
|
return nil
|
|
})
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
for remote, objs := range files {
|
|
if len(objs) <= 1 {
|
|
continue
|
|
}
|
|
fs.Logf(remote, "Found %d files with duplicate %s", len(objs), what)
|
|
if !byHash && mode != DeduplicateList {
|
|
objs = dedupeDeleteIdentical(ctx, ht, remote, objs)
|
|
if len(objs) <= 1 {
|
|
fs.Logf(remote, "All duplicates removed")
|
|
continue
|
|
}
|
|
}
|
|
switch mode {
|
|
case DeduplicateInteractive:
|
|
if !dedupeInteractive(ctx, f, ht, remote, objs, byHash) {
|
|
return nil
|
|
}
|
|
case DeduplicateFirst:
|
|
dedupeDeleteAllButOne(ctx, 0, remote, objs)
|
|
case DeduplicateNewest:
|
|
sortOldestFirst(objs)
|
|
dedupeDeleteAllButOne(ctx, len(objs)-1, remote, objs)
|
|
case DeduplicateOldest:
|
|
sortOldestFirst(objs)
|
|
dedupeDeleteAllButOne(ctx, 0, remote, objs)
|
|
case DeduplicateRename:
|
|
dedupeRename(ctx, f, remote, objs)
|
|
case DeduplicateLargest:
|
|
sortSmallestFirst(objs)
|
|
dedupeDeleteAllButOne(ctx, len(objs)-1, remote, objs)
|
|
case DeduplicateSmallest:
|
|
sortSmallestFirst(objs)
|
|
dedupeDeleteAllButOne(ctx, 0, remote, objs)
|
|
case DeduplicateSkip:
|
|
fs.Logf(remote, "Skipping %d files with duplicate %s", len(objs), what)
|
|
case DeduplicateList:
|
|
dedupeList(ctx, f, ht, remote, objs, byHash)
|
|
default:
|
|
//skip
|
|
}
|
|
}
|
|
return nil
|
|
}
|