rclone/cmd/bisync/deltas.go
nielash 98f539de8f bisync: refactor normalization code, fix deltas - fixes #7270
Refactored the case / unicode normalization logic to be much more efficient,
and fixed the last outstanding issue from #7270. Before this change, we were
doing lots of for loops and re-normalizing strings we had already normalized
earlier. Now we leave the normalizing entirely to March and avoid
re-transforming later, which makes a large difference in performance.
2024-01-20 14:50:08 -05:00
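The heart of the change is a single transform applied to each name: NFC-normalize, then case-fold when either remote is case-insensitive, with the transformed string used as a map key from then on (see updateAliases below). A minimal, self-contained sketch of that idea (editor's illustration; the real code reads these switches from the rclone config rather than taking a boolean parameter):

    package main

    import (
        "fmt"
        "strings"

        "golang.org/x/text/unicode/norm"
    )

    // transform mirrors the idea in updateAliases: NFC-normalize first, then
    // lowercase when either side of the sync is case-insensitive.
    func transform(s string, caseInsensitive bool) string {
        s = norm.NFC.String(s)
        if caseInsensitive {
            s = strings.ToLower(s)
        }
        return s
    }

    func main() {
        // "é" typed as NFD (e + combining acute) and as NFC (single rune)
        // collapse to the same key, so the two spellings alias each other.
        nfd, nfc := "Cafe\u0301", "Caf\u00e9"
        fmt.Println(transform(nfd, true) == transform(nfc, true)) // true
    }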

// Package bisync implements bisync
// Copyright (c) 2017-2020 Chris Nelson
package bisync

import (
    "bytes"
    "context"
    "fmt"
    "path/filepath"
    "sort"
    "strings"

    "github.com/rclone/rclone/cmd/bisync/bilib"
    "github.com/rclone/rclone/cmd/check"
    "github.com/rclone/rclone/fs"
    "github.com/rclone/rclone/fs/accounting"
    "github.com/rclone/rclone/fs/filter"
    "github.com/rclone/rclone/fs/operations"
    "golang.org/x/text/unicode/norm"
)

// delta is a bitmask of the change types detected for a single file
// between the prior and current listings.
type delta uint8

const (
    deltaZero delta = 0
    deltaNew  delta = 1 << iota
    deltaNewer
    deltaOlder
    deltaSize
    deltaHash
    deltaDeleted
)

const (
    deltaModified delta = deltaNewer | deltaOlder | deltaSize | deltaHash | deltaDeleted
    deltaOther    delta = deltaNew | deltaNewer | deltaOlder
)

// is reports whether any of the bits in cond are set in d.
func (d delta) is(cond delta) bool {
    return d&cond != 0
}
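
// Illustration (editor's note, not part of the original source): because
// delta is a bitmask, one file can carry several flags at once, and the
// composite masks above classify it. A hypothetical file that grew and has a
// newer modtime:
//
//    d := deltaNewer | deltaSize
//    d.is(deltaModified) // true:  it changed
//    d.is(deltaNew)      // false: it already existed in the prior listing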
// deltaSet holds the deltas for one side of the sync, along with the
// bookkeeping needed for the safety checks.
type deltaSet struct {
    deltas     map[string]delta
    opt        *Options
    fs         fs.Fs  // base filesystem
    msg        string // filesystem name for logging
    oldCount   int    // original number of files (for "excess deletes" check)
    deleted    int    // number of deleted files (for "excess deletes" check)
    foundSame  bool   // true if found at least one unchanged file
    checkFiles bilib.Names
}

func (ds *deltaSet) empty() bool {
    return len(ds.deltas) == 0
}

func (ds *deltaSet) sort() (sorted []string) {
    if ds.empty() {
        return
    }
    sorted = make([]string, 0, len(ds.deltas))
    for file := range ds.deltas {
        sorted = append(sorted, file)
    }
    sort.Strings(sorted)
    return
}
func (ds *deltaSet) printStats() {
    if ds.empty() {
        return
    }
    nAll := len(ds.deltas)
    nNew := 0
    nNewer := 0
    nOlder := 0
    nDeleted := 0
    for _, d := range ds.deltas {
        if d.is(deltaNew) {
            nNew++
        }
        if d.is(deltaNewer) {
            nNewer++
        }
        if d.is(deltaOlder) {
            nOlder++
        }
        if d.is(deltaDeleted) {
            nDeleted++
        }
    }
    fs.Infof(nil, "%s: %4d changes: %4d new, %4d newer, %4d older, %4d deleted",
        ds.msg, nAll, nNew, nNewer, nOlder, nDeleted)
}
// checkconflicts checks potential conflicts (to avoid renaming if already identical)
func (b *bisyncRun) checkconflicts(ctxCheck context.Context, filterCheck *filter.Filter, fs1, fs2 fs.Fs) (bilib.Names, error) {
    matches := bilib.Names{}
    if filterCheck.HaveFilesFrom() {
        fs.Debugf(nil, "There are potential conflicts to check.")
        opt, close, checkopterr := check.GetCheckOpt(b.fs1, b.fs2)
        if checkopterr != nil {
            b.critical = true
            b.retryable = true
            fs.Debugf(nil, "GetCheckOpt error: %v", checkopterr)
            return matches, checkopterr
        }
        defer close()
        opt.Match = new(bytes.Buffer)
        // TODO: consider using custom CheckFn to act like cryptcheck, if either fs is a crypt remote and -c has been passed
        // note that cryptCheck() is not currently exported
        fs.Infof(nil, "Checking potential conflicts...")
        check := operations.Check(ctxCheck, opt)
        fs.Infof(nil, "Finished checking the potential conflicts. %s", check)
        // reset error count, because we don't want to count check errors as bisync errors
        accounting.Stats(ctxCheck).ResetErrors()
        // return the list of identical files to check against later
        if len(fmt.Sprint(opt.Match)) > 0 {
            matches = bilib.ToNames(strings.Split(fmt.Sprint(opt.Match), "\n"))
        }
        if matches.NotEmpty() {
            fs.Debugf(nil, "The following potential conflicts were determined to be identical. %v", matches)
        } else {
            fs.Debugf(nil, "None of the conflicts were determined to be identical.")
        }
    }
    return matches, nil
}
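
// Editor's sketch of the mechanism above (condensed from the function body):
// the --match output stream of the check is redirected into an in-memory
// buffer, so the identical pairs can be read back afterwards, one per line:
//
//    opt.Match = new(bytes.Buffer)       // collect --match output in memory
//    _ = operations.Check(ctxCheck, opt) // writes one line per identical pair
//    names := bilib.ToNames(strings.Split(fmt.Sprint(opt.Match), "\n"))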
// findDeltas compares the prior listing against the current one and builds
// the set of deltas for one path.
func (b *bisyncRun) findDeltas(fctx context.Context, f fs.Fs, oldListing string, now *fileList, msg string) (ds *deltaSet, err error) {
    var old *fileList
    newListing := oldListing + "-new"

    old, err = b.loadListing(oldListing)
    if err != nil {
        fs.Errorf(nil, "Failed loading prior %s listing: %s", msg, oldListing)
        b.abort = true
        return
    }
    if err = b.checkListing(old, oldListing, "prior "+msg); err != nil {
        return
    }
    if err = b.checkListing(now, newListing, "current "+msg); err != nil {
        return
    }

    ds = &deltaSet{
        deltas:     map[string]delta{},
        fs:         f,
        msg:        msg,
        oldCount:   len(old.list),
        opt:        b.opt,
        checkFiles: bilib.Names{},
    }

    for _, file := range old.list {
        d := deltaZero
        if !now.has(file) {
            b.indent(msg, file, "File was deleted")
            ds.deleted++
            d |= deltaDeleted
        } else {
            // skip dirs here, as we only care if they are new/deleted, not newer/older
            if !now.isDir(file) {
                if old.getTime(file) != now.getTime(file) {
                    if old.beforeOther(now, file) {
                        fs.Debugf(file, "(old: %v current: %v)", old.getTime(file), now.getTime(file))
                        b.indent(msg, file, "File is newer")
                        d |= deltaNewer
                    } else { // Current version is older than prior sync.
                        fs.Debugf(file, "(old: %v current: %v)", old.getTime(file), now.getTime(file))
                        b.indent(msg, file, "File is OLDER")
                        d |= deltaOlder
                    }
                }
                // TODO Compare sizes and hashes
            }
        }
        if d.is(deltaModified) {
            ds.deltas[file] = d
        } else {
            // Once we've found at least one unchanged file,
            // we know that not everything has changed,
            // as with a DST time change
            ds.foundSame = true
        }
    }

    for _, file := range now.list {
        if !old.has(file) {
            b.indent(msg, file, "File is new")
            ds.deltas[file] = deltaNew
        }
    }

    if b.opt.CheckAccess {
        // checkFiles is a small structure compared with the full `now` listing,
        // so we return it alone and let the full delta map be garbage collected.
        for _, file := range now.list {
            if filepath.Base(file) == b.opt.CheckFilename {
                ds.checkFiles.Add(file)
            }
        }
    }

    return
}
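
// Worked example (editor's illustration, hypothetical listings): given a
// prior listing {a, b, c} and a current listing {b, c, d} where b's modtime
// moved forward, findDeltas classifies:
//
//    a -> deltaDeleted (in old, missing from now)
//    b -> deltaNewer   (in both, current modtime is later)
//    c -> unchanged    (sets ds.foundSame; not added to ds.deltas)
//    d -> deltaNew     (in now, missing from old)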
// applyDeltas reconciles the two delta sets: it queues copies and deletes,
// renames genuine conflicts, and returns the resulting queues.
func (b *bisyncRun) applyDeltas(ctx context.Context, ds1, ds2 *deltaSet) (changes1, changes2 bool, results2to1, results1to2 []Results, queues queues, err error) {
    path1 := bilib.FsPath(b.fs1)
    path2 := bilib.FsPath(b.fs2)

    copy1to2 := bilib.Names{}
    copy2to1 := bilib.Names{}
    delete1 := bilib.Names{}
    delete2 := bilib.Names{}
    handled := bilib.Names{}
    renamed1 := bilib.Names{}
    renamed2 := bilib.Names{}
    renameSkipped := bilib.Names{}
    deletedonboth := bilib.Names{}
    skippedDirs1 := newFileList()
    skippedDirs2 := newFileList()

    ctxMove := b.opt.setDryRun(ctx)

    // update AliasMap for deleted files, as march does not know about them
    b.updateAliases(ctx, ds1, ds2)

    // efficient isDir check
    // we load the listing just once and store only the dirs
    dirs1, dirs1Err := b.listDirsOnly(1)
    if dirs1Err != nil {
        b.critical = true
        b.retryable = true
        fs.Debugf(nil, "Error generating dirsonly list for path1: %v", dirs1Err)
        return
    }
    dirs2, dirs2Err := b.listDirsOnly(2)
    if dirs2Err != nil {
        b.critical = true
        b.retryable = true
        fs.Debugf(nil, "Error generating dirsonly list for path2: %v", dirs2Err)
        return
    }

    // build a list of only the "deltaOther"s so we don't have to check more files than necessary
    // this is essentially the same as running rclone check with a --files-from filter, then exempting the --match results from being renamed
    // we therefore avoid having to list the same directory more than once.
    // we are intentionally overriding DryRun here because we need to perform the check, even during a dry run, or the results would be inaccurate.
    // check is a read-only operation by its nature, so it's already "dry" in that sense.
    ctxNew, ciCheck := fs.AddConfig(ctx)
    ciCheck.DryRun = false
    ctxCheck, filterCheck := filter.AddConfig(ctxNew)
    for _, file := range ds1.sort() {
        alias := b.aliases.Alias(file)
        d1 := ds1.deltas[file]
        if d1.is(deltaOther) {
            d2, in2 := ds2.deltas[file]
            if !in2 && file != alias {
                d2 = ds2.deltas[alias]
            }
            if d2.is(deltaOther) {
                checkit := func(filename string) {
                    if err := filterCheck.AddFile(filename); err != nil {
                        fs.Debugf(nil, "Non-critical error adding file to list of potential conflicts to check: %s", err)
                    } else {
                        fs.Debugf(nil, "Added file to list of potential conflicts to check: %s", filename)
                    }
                }
                checkit(file)
                if file != alias {
                    checkit(alias)
                }
            }
        }
    }

    // if there are potential conflicts to check, check them all here (outside the loop) in one fell swoop
    matches, err := b.checkconflicts(ctxCheck, filterCheck, b.fs1, b.fs2)
    for _, file := range ds1.sort() {
        alias := b.aliases.Alias(file)
        p1 := path1 + file
        p2 := path2 + alias
        d1 := ds1.deltas[file]

        if d1.is(deltaOther) {
            d2, in2 := ds2.deltas[file]
            // try looking under alternate name
            if !in2 && file != alias {
                d2, in2 = ds2.deltas[alias]
            }
            if !in2 {
                b.indent("Path1", p2, "Queue copy to Path2")
                copy1to2.Add(file)
            } else if d2.is(deltaDeleted) {
                b.indent("Path1", p2, "Queue copy to Path2")
                copy1to2.Add(file)
                handled.Add(file)
            } else if d2.is(deltaOther) {
                b.indent("!WARNING", file, "New or changed in both paths")

                // if files are identical, leave them alone instead of renaming
                if (dirs1.has(file) || dirs1.has(alias)) && (dirs2.has(file) || dirs2.has(alias)) {
                    fs.Debugf(nil, "This is a directory, not a file. Skipping equality check and will not rename: %s", file)
                    ls1.getPut(file, skippedDirs1)
                    ls2.getPut(file, skippedDirs2)
                } else {
                    equal := matches.Has(file)
                    if !equal {
                        equal = matches.Has(alias)
                    }
                    if equal {
                        if ciCheck.FixCase && file != alias {
                            // the content is equal but filename still needs to be FixCase'd, so copy1to2
                            // the Path1 version is deemed "correct" in this scenario
                            fs.Infof(alias, "Files are equal but will copy anyway to fix case to %s", file)
                            copy1to2.Add(file)
                        } else {
                            fs.Infof(nil, "Files are equal! Skipping: %s", file)
                            renameSkipped.Add(file)
                            renameSkipped.Add(alias)
                        }
                    } else {
                        fs.Debugf(nil, "Files are NOT equal: %s", file)
                        b.indent("!Path1", p1+"..path1", "Renaming Path1 copy")
                        if err = operations.MoveFile(ctxMove, b.fs1, b.fs1, file+"..path1", file); err != nil {
                            err = fmt.Errorf("path1 rename failed for %s: %w", p1, err)
                            b.critical = true
                            return
                        }
                        if b.opt.DryRun {
                            renameSkipped.Add(file)
                        } else {
                            renamed1.Add(file)
                        }
                        b.indent("!Path1", p2+"..path1", "Queue copy to Path2")
                        copy1to2.Add(file + "..path1")

                        b.indent("!Path2", p2+"..path2", "Renaming Path2 copy")
                        if err = operations.MoveFile(ctxMove, b.fs2, b.fs2, alias+"..path2", alias); err != nil {
                            err = fmt.Errorf("path2 rename failed for %s: %w", alias, err)
                            return
                        }
                        if b.opt.DryRun {
                            renameSkipped.Add(alias)
                        } else {
                            renamed2.Add(alias)
                        }
                        b.indent("!Path2", p1+"..path2", "Queue copy to Path1")
                        copy2to1.Add(alias + "..path2")
                    }
                }
                handled.Add(file)
            }
        } else {
            // Path1 deleted
            d2, in2 := ds2.deltas[file]
            // try looking under alternate name
            fs.Debugf(file, "alias: %s, in2: %v", alias, in2)
            if !in2 && file != alias {
                fs.Debugf(file, "looking for alias: %s", alias)
                d2, in2 = ds2.deltas[alias]
                if in2 {
                    fs.Debugf(file, "detected alias: %s", alias)
                }
            }
            if !in2 {
                b.indent("Path2", p2, "Queue delete")
                delete2.Add(file)
                copy1to2.Add(file)
            } else if d2.is(deltaOther) {
                b.indent("Path2", p1, "Queue copy to Path1")
                copy2to1.Add(file)
                handled.Add(file)
            } else if d2.is(deltaDeleted) {
                handled.Add(file)
                deletedonboth.Add(file)
                deletedonboth.Add(alias)
            }
        }
    }
    for _, file := range ds2.sort() {
        alias := b.aliases.Alias(file)
        p1 := path1 + alias
        d2 := ds2.deltas[file]

        if handled.Has(file) || handled.Has(alias) {
            continue
        }
        if d2.is(deltaOther) {
            b.indent("Path2", p1, "Queue copy to Path1")
            copy2to1.Add(file)
        } else {
            // Deleted
            b.indent("Path1", p1, "Queue delete")
            delete1.Add(file)
            copy2to1.Add(file)
        }
    }
    // Do the batch operation
    if copy2to1.NotEmpty() {
        changes1 = true
        b.indent("Path2", "Path1", "Do queued copies to")
        results2to1, err = b.fastCopy(ctx, b.fs2, b.fs1, copy2to1, "copy2to1")
        if err != nil {
            return
        }
        // copy empty dirs from path2 to path1 (if --create-empty-src-dirs)
        b.syncEmptyDirs(ctx, b.fs1, copy2to1, dirs2, &results2to1, "make")
    }
    if copy1to2.NotEmpty() {
        changes2 = true
        b.indent("Path1", "Path2", "Do queued copies to")
        results1to2, err = b.fastCopy(ctx, b.fs1, b.fs2, copy1to2, "copy1to2")
        if err != nil {
            return
        }
        // copy empty dirs from path1 to path2 (if --create-empty-src-dirs)
        b.syncEmptyDirs(ctx, b.fs2, copy1to2, dirs1, &results1to2, "make")
    }
    if delete1.NotEmpty() {
        if err = b.saveQueue(delete1, "delete1"); err != nil {
            return
        }
        // propagate deletions of empty dirs from path2 to path1 (if --create-empty-src-dirs)
        b.syncEmptyDirs(ctx, b.fs1, delete1, dirs1, &results2to1, "remove")
    }
    if delete2.NotEmpty() {
        if err = b.saveQueue(delete2, "delete2"); err != nil {
            return
        }
        // propagate deletions of empty dirs from path1 to path2 (if --create-empty-src-dirs)
        b.syncEmptyDirs(ctx, b.fs2, delete2, dirs2, &results1to2, "remove")
    }

    queues.copy1to2 = copy1to2
    queues.copy2to1 = copy2to1
    queues.renamed1 = renamed1
    queues.renamed2 = renamed2
    queues.renameSkipped = renameSkipped
    queues.deletedonboth = deletedonboth
    queues.skippedDirs1 = skippedDirs1
    queues.skippedDirs2 = skippedDirs2

    return
}
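
// Editor's sketch of how the two halves fit together (condensed; variable
// names are approximate, modeled on the calling code in operations.go):
// deltas are computed once per side, then reconciled in a single pass:
//
//    ds1, err := b.findDeltas(fctx, b.fs1, listing1, now1, "Path1")
//    ds2, err := b.findDeltas(fctx, b.fs2, listing2, now2, "Path2")
//    changes1, changes2, results2to1, results1to2, queues, err := b.applyDeltas(ctx, ds1, ds2)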
// excessDeletes checks whether the number of deletes is within the allowed range
func (ds *deltaSet) excessDeletes() bool {
    maxDelete := ds.opt.MaxDelete
    maxRatio := float64(maxDelete) / 100.0
    curRatio := 0.0
    if ds.deleted > 0 && ds.oldCount > 0 {
        curRatio = float64(ds.deleted) / float64(ds.oldCount)
    }
    if curRatio <= maxRatio {
        return false
    }
    fs.Errorf("Safety abort",
        "too many deletes (>%d%%, %d of %d) on %s %s. Run with --force if desired.",
        maxDelete, ds.deleted, ds.oldCount, ds.msg, quotePath(bilib.FsPath(ds.fs)))
    return true
}
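
// Worked example (editor's note): with --max-delete=50, a prior listing of
// 200 files, and 120 of them deleted, curRatio = 120/200 = 0.6 and
// maxRatio = 50/100 = 0.5; since 0.6 > 0.5 the safety abort triggers, and
// the run must be retried with --force to proceed.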
// normally we build the AliasMap from march results,
// however, march does not know about deleted files, so we need to check them for aliases manually
func (b *bisyncRun) updateAliases(ctx context.Context, ds1, ds2 *deltaSet) {
    ci := fs.GetConfig(ctx)

    // skip if not needed
    if ci.NoUnicodeNormalization && !ci.IgnoreCaseSync && !b.fs1.Features().CaseInsensitive && !b.fs2.Features().CaseInsensitive {
        return
    }
    if ds1.deleted < 1 && ds2.deleted < 1 {
        return
    }

    fs.Debugf(nil, "Updating AliasMap")

    transform := func(s string) string {
        if !ci.NoUnicodeNormalization {
            s = norm.NFC.String(s)
        }
        // note: march only checks the dest, but we check both here
        if ci.IgnoreCaseSync || b.fs1.Features().CaseInsensitive || b.fs2.Features().CaseInsensitive {
            s = strings.ToLower(s)
        }
        return s
    }

    delMap1 := map[string]string{}  // [transformedname]originalname
    delMap2 := map[string]string{}  // [transformedname]originalname
    fullMap1 := map[string]string{} // [transformedname]originalname
    fullMap2 := map[string]string{} // [transformedname]originalname

    for _, name := range ls1.list {
        fullMap1[transform(name)] = name
    }
    for _, name := range ls2.list {
        fullMap2[transform(name)] = name
    }

    addDeletes := func(ds *deltaSet, delMap, fullMap map[string]string) {
        for _, file := range ds.sort() {
            d := ds.deltas[file]
            if d.is(deltaDeleted) {
                delMap[transform(file)] = file
                fullMap[transform(file)] = file
            }
        }
    }
    addDeletes(ds1, delMap1, fullMap1)
    addDeletes(ds2, delMap2, fullMap2)

    addAliases := func(delMap, fullMap map[string]string) {
        for transformedname, name := range delMap {
            matchedName, found := fullMap[transformedname]
            if found && name != matchedName {
                fs.Debugf(name, "adding alias %s", matchedName)
                b.aliases.Add(name, matchedName)
            }
        }
    }
    addAliases(delMap1, fullMap2)
    addAliases(delMap2, fullMap1)
}
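
// Editor's illustration of the alias logic above (assuming unicode
// normalization is enabled and one remote is case-insensitive): suppose
// Path1 deleted "Café" stored in NFD form while Path2 still lists it in NFC
// form. Both spellings transform to the key "café", so the lookup of
// delMap1's entry in fullMap2 succeeds, name != matchedName, and the pair is
// recorded as an alias:
//
//    delMap1["café"]  = "Cafe\u0301" // deleted on Path1 (NFD)
//    fullMap2["café"] = "Caf\u00e9"  // still present on Path2 (NFC)
//    b.aliases.Add("Cafe\u0301", "Caf\u00e9")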