bisync: Add experimental --resilient mode to allow recovery from self-correctable errors

https://forum.rclone.org/t/bisync-bugs-and-feature-requests/37636#:~:text=2.%20Bisync%20should%20be%20more%20resilient%20to%20self%2Dcorrectable%20errors
This commit is contained in:
nielash 2023-07-11 06:57:49 -04:00 committed by Nick Craig-Wood
parent f01a50eb47
commit e5bde42303
5 changed files with 54 additions and 7 deletions

View File

@ -40,6 +40,7 @@ type Options struct {
NoCleanup bool
SaveQueues bool // save extra debugging files (test only flag)
IgnoreListingChecksum bool
Resilient bool
}
// Default values
@ -110,6 +111,7 @@ func init() {
flags.BoolVarP(cmdFlags, &tzLocal, "localtime", "", tzLocal, "Use local time in listings (default: UTC)", "")
flags.BoolVarP(cmdFlags, &Opt.NoCleanup, "no-cleanup", "", Opt.NoCleanup, "Retain working files (useful for troubleshooting and testing).", "")
flags.BoolVarP(cmdFlags, &Opt.IgnoreListingChecksum, "ignore-listing-checksum", "", Opt.IgnoreListingChecksum, "Do not use checksums for listings (add --ignore-checksum to additionally skip post-copy checksum checks)", "")
flags.BoolVarP(cmdFlags, &Opt.Resilient, "resilient", "", Opt.Resilient, "Allow future runs to retry after certain less-serious errors, instead of requiring --resync. Use at your own risk!", "")
}
// bisync command definition

View File

@ -104,6 +104,7 @@ func (b *bisyncRun) checkconflicts(ctxCheck context.Context, filterCheck *filter
opt, close, checkopterr := check.GetCheckOpt(b.fs1, b.fs2)
if checkopterr != nil {
b.critical = true
b.retryable = true
fs.Debugf(nil, "GetCheckOpt error: %v", checkopterr)
return matches, checkopterr
}

View File

@ -301,5 +301,6 @@ func (b *bisyncRun) checkListing(ls *fileList, listing, msg string) error {
}
fs.Errorf(nil, "Empty %s listing. Cannot sync to an empty directory: %s", msg, listing)
b.critical = true
b.retryable = true
return fmt.Errorf("empty %s listing: %s", msg, listing)
}

View File

@ -25,13 +25,14 @@ var ErrBisyncAborted = errors.New("bisync aborted")
// bisyncRun keeps bisync runtime state
type bisyncRun struct {
fs1 fs.Fs
fs2 fs.Fs
abort bool
critical bool
basePath string
workDir string
opt *Options
fs1 fs.Fs
fs2 fs.Fs
abort bool
critical bool
retryable bool
basePath string
workDir string
opt *Options
}
// Bisync handles lock file, performs bisync run and checks exit status
@ -123,6 +124,10 @@ func Bisync(ctx context.Context, fs1, fs2 fs.Fs, optArg *Options) (err error) {
}
if b.critical {
if b.retryable && b.opt.Resilient {
fs.Errorf(nil, "Bisync critical error: %v", err)
fs.Errorf(nil, "Bisync aborted. Error is retryable without --resync due to --resilient mode.")
} else {
if bilib.FileExists(listing1) {
_ = os.Rename(listing1, listing1+"-err")
}
@ -131,6 +136,7 @@ func Bisync(ctx context.Context, fs1, fs2 fs.Fs, optArg *Options) (err error) {
}
fs.Errorf(nil, "Bisync critical error: %v", err)
fs.Errorf(nil, "Bisync aborted. Must run --resync to recover.")
}
return ErrBisyncAborted
}
if b.abort {
@ -152,6 +158,7 @@ func (b *bisyncRun) runLocked(octx context.Context, listing1, listing2 string) (
fs.Infof(nil, "Validating listings for Path1 %s vs Path2 %s", quotePath(path1), quotePath(path2))
if err = b.checkSync(listing1, listing2); err != nil {
b.critical = true
b.retryable = true
}
return err
}
@ -176,6 +183,7 @@ func (b *bisyncRun) runLocked(octx context.Context, listing1, listing2 string) (
var fctx context.Context
if fctx, err = b.opt.applyFilters(octx); err != nil {
b.critical = true
b.retryable = true
return
}
@ -188,6 +196,7 @@ func (b *bisyncRun) runLocked(octx context.Context, listing1, listing2 string) (
if !bilib.FileExists(listing1) || !bilib.FileExists(listing2) {
// On prior critical error abort, the prior listings are renamed to .lst-err to lock out further runs
b.critical = true
b.retryable = true
return errors.New("cannot find prior Path1 or Path2 listings, likely due to critical error on prior run")
}
@ -215,6 +224,7 @@ func (b *bisyncRun) runLocked(octx context.Context, listing1, listing2 string) (
err = b.checkAccess(ds1.checkFiles, ds2.checkFiles)
if err != nil {
b.critical = true
b.retryable = true
return
}
}
@ -255,6 +265,7 @@ func (b *bisyncRun) runLocked(octx context.Context, listing1, listing2 string) (
changes1, changes2, err = b.applyDeltas(octx, ds1, ds2)
if err != nil {
b.critical = true
// b.retryable = true // not sure about this one
return err
}
}
@ -283,6 +294,7 @@ func (b *bisyncRun) runLocked(octx context.Context, listing1, listing2 string) (
}
if err != nil {
b.critical = true
b.retryable = true
return err
}
@ -310,6 +322,7 @@ func (b *bisyncRun) runLocked(octx context.Context, listing1, listing2 string) (
}
if err != nil {
b.critical = true
b.retryable = true
return err
}
}
@ -369,6 +382,7 @@ func (b *bisyncRun) resync(octx, fctx context.Context, listing1, listing2 string
err = b.checkAccess(ds1.checkFiles, ds2.checkFiles)
if err != nil {
b.critical = true
b.retryable = true
return err
}
}

View File

@ -97,6 +97,8 @@ Optional Flags:
Consider using `--verbose` or `--dry-run` first.
--ignore-listing-checksum Do not use checksums for listings
(add --ignore-checksum to additionally skip post-copy checksum checks)
--resilient Allow future runs to retry after certain less-serious errors,
instead of requiring --resync. Use at your own risk!
--localtime Use local time in listings (default: UTC)
--no-cleanup Retain working files (useful for troubleshooting and testing).
--workdir PATH Use custom working directory (useful for testing).
@ -285,6 +287,30 @@ were generated on a run using `--ignore-listing-checksum`. For a more robust int
consider using [`check`](commands/rclone_check/)
(or [`cryptcheck`](/commands/rclone_cryptcheck/), if at least one path is a `crypt` remote.)
#### --resilient
***Caution: this is an experimental feature. Use at your own risk!***
By default, most errors or interruptions will cause bisync to abort and
require [`--resync`](#resync) to recover. This is a safety feature,
to prevent bisync from running again until a user checks things out.
However, in some cases, bisync can go too far and enforce a lockout when one isn't actually necessary,
like for certain less-serious errors that might resolve themselves on the next run.
When `--resilient` is specified, bisync tries its best to recover and self-correct,
and only requires `--resync` as a last resort when a human's involvement is absolutely necessary.
The intended use case is for running bisync as a background process (such as via scheduled [cron](#cron)).
When using `--resilient` mode, bisync will still report the error and abort,
however it will not lock out future runs -- allowing the possibility of retrying at the next normally scheduled time,
without requiring a `--resync` first. Examples of such retryable errors include
access test failures, missing listing files, and filter change detections.
These safety features will still prevent the *current* run from proceeding --
the difference is that if conditions have improved by the time of the *next* run,
that next run will be allowed to proceed.
Certain more serious errors will still enforce a `--resync` lockout, even in `--resilient` mode, to prevent data loss.
Behavior of `--resilient` may change in a future version.
## Operation
### Runtime flow details
@ -394,6 +420,8 @@ typically at `${HOME}/.cache/rclone/bisync/` on Linux.
Some errors are considered temporary and re-running the bisync is not blocked.
The _critical return_ blocks further bisync runs.
See also: [`--resilient`](#resilient)
### Lock file
When bisync is running, a lock file is created in the bisync working directory,
@ -1164,5 +1192,6 @@ causing dry runs to inadvertently commit filter changes
causing bisync to consider more files than necessary due to overbroad filters during delete operations
* [Improved detection of false positive change conflicts](https://forum.rclone.org/t/bisync-bugs-and-feature-requests/37636#:~:text=1.%20Identical%20files%20should%20be%20left%20alone%2C%20even%20if%20new/newer/changed%20on%20both%20sides)
(identical files are now left alone instead of renamed)
* Added experimental `--resilient` mode to allow [recovery from self-correctable errors](https://forum.rclone.org/t/bisync-bugs-and-feature-requests/37636#:~:text=2.%20Bisync%20should%20be%20more%20resilient%20to%20self%2Dcorrectable%20errors)
* Added [new `--ignore-listing-checksum` flag](https://forum.rclone.org/t/bisync-bugs-and-feature-requests/37636#:~:text=6.%20%2D%2Dignore%2Dchecksum%20should%20be%20split%20into%20two%20flags%20for%20separate%20purposes)
to distinguish from `--ignore-checksum`