From e9cd3e59862637b784399aa4c95384d16afe2279 Mon Sep 17 00:00:00 2001 From: nielash Date: Sun, 3 Dec 2023 03:19:13 -0500 Subject: [PATCH] bisync: allow lock file expiration/renewal with --max-lock - #7470 Background: Bisync uses lock files as a safety feature to prevent interference from other bisync runs while it is running. Bisync normally removes these lock files at the end of a run, but if bisync is abruptly interrupted, these files will be left behind. By default, they will lock out all future runs, until the user has a chance to manually check things out and remove the lock. Before this change, lock files blocked future runs indefinitely, so a single interrupted run would lock out all future runs forever (absent user intervention), and there was no way to change this behavior. After this change, a new --max-lock flag can be used to make lock files automatically expire after a certain period of time, so that future runs are not locked out forever, and auto-recovery is possible. --max-lock can be any duration 2m or greater (or 0 to disable). If set, lock files older than this will be considered "expired", and future runs will be allowed to disregard them and proceed. (Note that the --max-lock duration must be set by the process that left the lock file -- not the later one interpreting it.) If set, bisync will also "renew" these lock files every --max-lock_minus_one_minute throughout a run, for extra safety. (For example, with --max-lock 5m, bisync would renew the lock file (for another 5 minutes) every 4 minutes until the run has completed.) In other words, it should not be possible for a lock file to pass its expiration time while the process that created it is still running -- and you can therefore be reasonably sure that any _expired_ lock file you may find was left there by an interrupted run, not one that is still running and just taking awhile. If --max-lock is 0 or not set, the default is that lock files will never expire, and will block future runs (of these same two bisync paths) indefinitely. For maximum resilience from disruptions, consider setting a relatively short duration like --max-lock 2m along with --resilient and --recover, and a relatively frequent cron schedule. The result will be a very robust "set-it-and-forget-it" bisync run that can automatically bounce back from almost any interruption it might encounter, without requiring the user to get involved and run a --resync. --- cmd/bisync/cmd.go | 3 + cmd/bisync/lockfile.go | 154 +++++++++++++++++++++++++++++++++++++++ cmd/bisync/operations.go | 41 ++--------- docs/content/bisync.md | 83 ++++++++++++++++----- 4 files changed, 228 insertions(+), 53 deletions(-) create mode 100644 cmd/bisync/lockfile.go diff --git a/cmd/bisync/cmd.go b/cmd/bisync/cmd.go index 3ed6a4c58..6e5b95912 100644 --- a/cmd/bisync/cmd.go +++ b/cmd/bisync/cmd.go @@ -54,6 +54,7 @@ type Options struct { Compare CompareOpt CompareFlag string DebugName string + MaxLock time.Duration } // Default values @@ -112,6 +113,7 @@ var Opt Options func init() { Opt.Retries = 3 + Opt.MaxLock = 0 cmd.Root.AddCommand(commandDefinition) cmdFlags := commandDefinition.Flags() // when adding new flags, remember to also update the rc params: @@ -138,6 +140,7 @@ func init() { flags.BoolVarP(cmdFlags, &Opt.Compare.NoSlowHash, "no-slow-hash", "", Opt.Compare.NoSlowHash, "Ignore listing checksums only on backends where they are slow", "") flags.BoolVarP(cmdFlags, &Opt.Compare.SlowHashSyncOnly, "slow-hash-sync-only", "", Opt.Compare.SlowHashSyncOnly, "Ignore slow checksums for listings and deltas, but still consider them during sync calls.", "") flags.BoolVarP(cmdFlags, &Opt.Compare.DownloadHash, "download-hash", "", Opt.Compare.DownloadHash, "Compute hash by downloading when otherwise unavailable. (warning: may be slow and use lots of data!)", "") + flags.DurationVarP(cmdFlags, &Opt.MaxLock, "max-lock", "", Opt.MaxLock, "Consider lock files older than this to be expired (default: 0 (never expire)) (minimum: 2m)", "") } // bisync command definition diff --git a/cmd/bisync/lockfile.go b/cmd/bisync/lockfile.go new file mode 100644 index 000000000..282fbdd1a --- /dev/null +++ b/cmd/bisync/lockfile.go @@ -0,0 +1,154 @@ +package bisync + +import ( + "encoding/json" + "fmt" + "io" + "os" + "strconv" + "sync" + "time" + + "github.com/rclone/rclone/cmd/bisync/bilib" + "github.com/rclone/rclone/fs" + "github.com/rclone/rclone/lib/terminal" +) + +const basicallyforever = 200 * 365 * 24 * time.Hour + +var stopRenewal func() + +var data = struct { + Session string + PID string + TimeRenewed time.Time + TimeExpires time.Time +}{} + +func (b *bisyncRun) setLockFile() error { + b.lockFile = "" + b.setLockFileExpiration() + if !b.opt.DryRun { + b.lockFile = b.basePath + ".lck" + if bilib.FileExists(b.lockFile) { + if !b.lockFileIsExpired() { + errTip := Color(terminal.MagentaFg, "Tip: this indicates that another bisync run (of these same paths) either is still running or was interrupted before completion. \n") + errTip += Color(terminal.MagentaFg, "If you're SURE you want to override this safety feature, you can delete the lock file with the following command, then run bisync again: \n") + errTip += fmt.Sprintf(Color(terminal.HiRedFg, "rclone deletefile \"%s\""), b.lockFile) + return fmt.Errorf(Color(terminal.RedFg, "prior lock file found: %s \n")+errTip, Color(terminal.HiYellowFg, b.lockFile)) + } + } + + pidStr := []byte(strconv.Itoa(os.Getpid())) + if err = os.WriteFile(b.lockFile, pidStr, bilib.PermSecure); err != nil { + return fmt.Errorf(Color(terminal.RedFg, "cannot create lock file: %s: %w"), b.lockFile, err) + } + fs.Debugf(nil, "Lock file created: %s", b.lockFile) + b.renewLockFile() + stopRenewal = b.startLockRenewal() + } + return nil +} + +func (b *bisyncRun) removeLockFile() { + if b.lockFile != "" { + stopRenewal() + errUnlock := os.Remove(b.lockFile) + if errUnlock == nil { + fs.Debugf(nil, "Lock file removed: %s", b.lockFile) + } else if err == nil { + err = errUnlock + } else { + fs.Errorf(nil, "cannot remove lockfile %s: %v", b.lockFile, errUnlock) + } + b.lockFile = "" // block removing it again + } +} + +func (b *bisyncRun) setLockFileExpiration() { + if b.opt.MaxLock > 0 && b.opt.MaxLock < 2*time.Minute { + fs.Logf(nil, Color(terminal.YellowFg, "--max-lock cannot be shorter than 2 minutes (unless 0.) Changing --max-lock from %v to %v"), b.opt.MaxLock, 2*time.Minute) + b.opt.MaxLock = 2 * time.Minute + } else if b.opt.MaxLock <= 0 { + b.opt.MaxLock = basicallyforever + } +} + +func (b *bisyncRun) renewLockFile() { + if b.lockFile != "" && bilib.FileExists(b.lockFile) { + + data.Session = b.basePath + data.PID = strconv.Itoa(os.Getpid()) + data.TimeRenewed = time.Now() + data.TimeExpires = time.Now().Add(b.opt.MaxLock) + + // save data file + df, err := os.Create(b.lockFile) + b.handleErr(b.lockFile, "error renewing lock file", err, true, true) + b.handleErr(b.lockFile, "error encoding JSON to lock file", json.NewEncoder(df).Encode(data), true, true) + b.handleErr(b.lockFile, "error closing lock file", df.Close(), true, true) + if b.opt.MaxLock < basicallyforever { + fs.Infof(nil, Color(terminal.HiBlueFg, "lock file renewed for %v. New expiration: %v"), b.opt.MaxLock, data.TimeExpires) + } + } +} + +func (b *bisyncRun) lockFileIsExpired() bool { + if b.lockFile != "" && bilib.FileExists(b.lockFile) { + rdf, err := os.Open(b.lockFile) + b.handleErr(b.lockFile, "error reading lock file", err, true, true) + dec := json.NewDecoder(rdf) + for { + if err := dec.Decode(&data); err == io.EOF { + break + } + } + b.handleErr(b.lockFile, "error closing file", rdf.Close(), true, true) + if !data.TimeExpires.IsZero() && data.TimeExpires.Before(time.Now()) { + fs.Infof(b.lockFile, Color(terminal.GreenFg, "Lock file found, but it expired at %v. Will delete it and proceed."), data.TimeExpires) + markFailed(b.listing1) // listing is untrusted so force revert to prior (if --recover) or create new ones (if --resync) + markFailed(b.listing2) + return true + } + fs.Infof(b.lockFile, Color(terminal.RedFg, "Valid lock file found. Expires at %v. (%v from now)"), data.TimeExpires, time.Since(data.TimeExpires).Abs().Round(time.Second)) + prettyprint(data, "Lockfile info", fs.LogLevelInfo) + } + return false +} + +// StartLockRenewal renews the lockfile every --max-lock minus one minute. +// +// It returns a func which should be called to stop the renewal. +func (b *bisyncRun) startLockRenewal() func() { + if b.opt.MaxLock <= 0 || b.opt.MaxLock >= basicallyforever || b.lockFile == "" { + return func() {} + } + stopLockRenewal := make(chan struct{}) + var wg sync.WaitGroup + wg.Add(1) + go func() { + defer wg.Done() + ticker := time.NewTicker(b.opt.MaxLock - time.Minute) + for { + select { + case <-ticker.C: + b.renewLockFile() + case <-stopLockRenewal: + ticker.Stop() + return + } + } + }() + return func() { + close(stopLockRenewal) + wg.Wait() + } +} + +func markFailed(file string) { + failFile := file + "-err" + if bilib.FileExists(file) { + _ = os.Remove(failFile) + _ = os.Rename(file, failFile) + } +} diff --git a/cmd/bisync/operations.go b/cmd/bisync/operations.go index 714a90f03..7b0fc0033 100644 --- a/cmd/bisync/operations.go +++ b/cmd/bisync/operations.go @@ -9,7 +9,6 @@ import ( "fmt" "os" "path/filepath" - "strconv" "strings" gosync "sync" "time" @@ -48,6 +47,7 @@ type bisyncRun struct { SyncCI *fs.ConfigInfo CancelSync context.CancelFunc DebugName string + lockFile string } type queues struct { @@ -102,32 +102,14 @@ func Bisync(ctx context.Context, fs1, fs2 fs.Fs, optArg *Options) (err error) { b.aliases = bilib.AliasMap{} // Handle lock file - lockFile := "" - if !opt.DryRun { - lockFile = b.basePath + ".lck" - if bilib.FileExists(lockFile) { - errTip := Color(terminal.MagentaFg, "Tip: this indicates that another bisync run (of these same paths) either is still running or was interrupted before completion. \n") - errTip += Color(terminal.MagentaFg, "If you're SURE you want to override this safety feature, you can delete the lock file with the following command, then run bisync again: \n") - errTip += fmt.Sprintf(Color(terminal.HiRedFg, "rclone deletefile \"%s\""), lockFile) - return fmt.Errorf(Color(terminal.RedFg, "prior lock file found: %s \n")+errTip, Color(terminal.HiYellowFg, lockFile)) - } - - pidStr := []byte(strconv.Itoa(os.Getpid())) - if err = os.WriteFile(lockFile, pidStr, bilib.PermSecure); err != nil { - return fmt.Errorf("cannot create lock file: %s: %w", lockFile, err) - } - fs.Debugf(nil, "Lock file created: %s", lockFile) + err = b.setLockFile() + if err != nil { + return err } // Handle SIGINT var finaliseOnce gosync.Once - markFailed := func(file string) { - failFile := file + "-err" - if bilib.FileExists(file) { - _ = os.Remove(failFile) - _ = os.Rename(file, failFile) - } - } + // waitFor runs fn() until it returns true or the timeout expires waitFor := func(msg string, totalWait time.Duration, fn func() bool) (ok bool) { const individualWait = 1 * time.Second @@ -175,7 +157,7 @@ func Bisync(ctx context.Context, fs1, fs2 fs.Fs, optArg *Options) (err error) { markFailed(b.listing1) markFailed(b.listing2) } - _ = os.Remove(lockFile) + b.removeLockFile() } }) } @@ -185,16 +167,7 @@ func Bisync(ctx context.Context, fs1, fs2 fs.Fs, optArg *Options) (err error) { // run bisync err = b.runLocked(ctx) - if lockFile != "" { - errUnlock := os.Remove(lockFile) - if errUnlock == nil { - fs.Debugf(nil, "Lock file removed: %s", lockFile) - } else if err == nil { - err = errUnlock - } else { - fs.Errorf(nil, "cannot remove lockfile %s: %v", lockFile, errUnlock) - } - } + b.removeLockFile() b.CleanupCompleted = true if b.InGracefulShutdown { diff --git a/docs/content/bisync.md b/docs/content/bisync.md index 5778a1e5f..84b4a8735 100644 --- a/docs/content/bisync.md +++ b/docs/content/bisync.md @@ -487,25 +487,29 @@ See also: [Concurrent modifications](#concurrent-modifications), [`--resilient`] ***Caution: this is an experimental feature. Use at your own risk!*** -By default, most errors or interruptions will cause bisync to abort and -require [`--resync`](#resync) to recover. This is a safety feature, -to prevent bisync from running again until a user checks things out. -However, in some cases, bisync can go too far and enforce a lockout when one isn't actually necessary, -like for certain less-serious errors that might resolve themselves on the next run. -When `--resilient` is specified, bisync tries its best to recover and self-correct, -and only requires `--resync` as a last resort when a human's involvement is absolutely necessary. -The intended use case is for running bisync as a background process (such as via scheduled [cron](#cron)). +By default, most errors or interruptions will cause bisync to abort and +require [`--resync`](#resync) to recover. This is a safety feature, to prevent +bisync from running again until a user checks things out. However, in some +cases, bisync can go too far and enforce a lockout when one isn't actually +necessary, like for certain less-serious errors that might resolve themselves +on the next run. When `--resilient` is specified, bisync tries its best to +recover and self-correct, and only requires `--resync` as a last resort when a +human's involvement is absolutely necessary. The intended use case is for +running bisync as a background process (such as via scheduled [cron](#cron)). -When using `--resilient` mode, bisync will still report the error and abort, -however it will not lock out future runs -- allowing the possibility of retrying at the next normally scheduled time, -without requiring a `--resync` first. Examples of such retryable errors include -access test failures, missing listing files, and filter change detections. -These safety features will still prevent the *current* run from proceeding -- -the difference is that if conditions have improved by the time of the *next* run, -that next run will be allowed to proceed. -Certain more serious errors will still enforce a `--resync` lockout, even in `--resilient` mode, to prevent data loss. +When using `--resilient` mode, bisync will still report the error and abort, +however it will not lock out future runs -- allowing the possibility of +retrying at the next normally scheduled time, without requiring a `--resync` +first. Examples of such retryable errors include access test failures, missing +listing files, and filter change detections. These safety features will still +prevent the *current* run from proceeding -- the difference is that if +conditions have improved by the time of the *next* run, that next run will be +allowed to proceed. Certain more serious errors will still enforce a +`--resync` lockout, even in `--resilient` mode, to prevent data loss. -Behavior of `--resilient` may change in a future version. +Behavior of `--resilient` may change in a future version. (See also: +[`--recover`](#recover), [`--max-lock`](#max-lock), [Graceful +Shutdown](#graceful-shutdown)) ### --recover @@ -540,6 +544,42 @@ when bisync has chosen to abort itself due to safety features such as failing external interruptions such as a user shutting down their computer in the middle of a sync -- that is what `--recover` is for. +### --max-lock + +Bisync uses [lock files](#lock-file) as a safety feature to prevent +interference from other bisync runs while it is running. Bisync normally +removes these lock files at the end of a run, but if bisync is abruptly +interrupted, these files will be left behind. By default, they will lock out +all future runs, until the user has a chance to manually check things out and +remove the lock. As an alternative, `--max-lock` can be used to make them +automatically expire after a certain period of time, so that future runs are +not locked out forever, and auto-recovery is possible. `--max-lock` can be any +duration `2m` or greater (or `0` to disable). If set, lock files older than +this will be considered "expired", and future runs will be allowed to disregard +them and proceed. (Note that the `--max-lock` duration must be set by the +process that left the lock file -- not the later one interpreting it.) + +If set, bisync will also "renew" these lock files every `--max-lock minus one +minute` throughout a run, for extra safety. (For example, with `--max-lock 5m`, +bisync would renew the lock file (for another 5 minutes) every 4 minutes until +the run has completed.) In other words, it should not be possible for a lock +file to pass its expiration time while the process that created it is still +running -- and you can therefore be reasonably sure that any _expired_ lock +file you may find was left there by an interrupted run, not one that is still +running and just taking awhile. + +If `--max-lock` is `0` or not set, the default is that lock files will never +expire, and will block future runs (of these same two bisync paths) +indefinitely. + +For maximum resilience from disruptions, consider setting a relatively short +duration like `--max-lock 2m` along with [`--resilient`](#resilient) and +[`--recover`](#recover), and a relatively frequent [cron schedule](#cron). The +result will be a very robust "set-it-and-forget-it" bisync run that can +automatically bounce back from almost any interruption it might encounter, +without requiring the user to get involved and run a `--resync`. (See also: +[Graceful Shutdown](#graceful-shutdown) mode) + ### --backup-dir1 and --backup-dir2 @@ -679,7 +719,8 @@ typically at `${HOME}/.cache/rclone/bisync/` on Linux. Some errors are considered temporary and re-running the bisync is not blocked. The _critical return_ blocks further bisync runs. -See also: [`--resilient`](#resilient) +See also: [`--resilient`](#resilient), [`--recover`](#recover), +[`--max-lock`](#max-lock), [Graceful Shutdown](#graceful-shutdown) ### Lock file @@ -691,6 +732,8 @@ Delete the lock file as part of debugging the situation. The lock file effectively blocks follow-on (e.g., scheduled by _cron_) runs when the prior invocation is taking a long time. The lock file contains _PID_ of the blocking process, which may help in debug. +Lock files can be set to automatically expire after a certain amount of time, +using the [`--max-lock`](#max-lock) flag. **Note** that while concurrent bisync runs are allowed, _be very cautious_ @@ -727,7 +770,8 @@ NOT use [`--inplace`](/docs/#inplace), otherwise you risk leaving partially-written files on one side, which may be confused for real files on the next run. Note also that in the event of an abrupt interruption, a [lock file](#lock-file) will be left behind to block concurrent runs. You will need -to delete it before you can proceed with the next run. +to delete it before you can proceed with the next run (or wait for it to +expire on its own, if using `--max-lock`.) ## Limitations @@ -1559,6 +1603,7 @@ instead of of `--size-only`, when `check` is not available. * Bisync now fully supports comparing based on any combination of size, modtime, and checksum, lifting the prior restriction on backends without modtime support. * Bisync now supports a "Graceful Shutdown" mode to cleanly cancel a run early without requiring `--resync`. * New `--recover` flag allows robust recovery in the event of interruptions, without requiring `--resync`. +* A new `--max-lock` setting allows lock files to automatically renew and expire, for better automatic recovery when a run is interrupted. ### `v1.64` * Fixed an [issue](https://forum.rclone.org/t/bisync-bugs-and-feature-requests/37636#:~:text=1.%20Dry%20runs%20are%20not%20completely%20dry)