From 507f861c67d6e69408190017f57c209eba3201f2 Mon Sep 17 00:00:00 2001 From: Nick Craig-Wood Date: Tue, 13 Oct 2020 16:22:02 +0100 Subject: [PATCH] dedupe: add --by-hash to dedupe on hash not file name - fixes #1674 --- cmd/dedupe/dedupe.go | 36 +++++++++++------ fs/operations/dedupe.go | 78 +++++++++++++++++++++++++----------- fs/operations/dedupe_test.go | 45 ++++++++++++++++----- 3 files changed, 114 insertions(+), 45 deletions(-) diff --git a/cmd/dedupe/dedupe.go b/cmd/dedupe/dedupe.go index 637981964..4a1d89405 100644 --- a/cmd/dedupe/dedupe.go +++ b/cmd/dedupe/dedupe.go @@ -12,12 +12,14 @@ import ( var ( dedupeMode = operations.DeduplicateInteractive + byHash = false ) func init() { cmd.Root.AddCommand(commandDefinition) cmdFlag := commandDefinition.Flags() flags.FVarP(cmdFlag, &dedupeMode, "dedupe-mode", "", "Dedupe mode interactive|skip|first|newest|oldest|largest|smallest|rename.") + flags.BoolVarP(cmdFlag, &byHash, "by-hash", "", false, "Find identical hashes rather than names") } var commandDefinition = &cobra.Command{ @@ -27,20 +29,26 @@ var commandDefinition = &cobra.Command{ By default ` + "`dedupe`" + ` interactively finds files with duplicate names and offers to delete all but one or rename them to be -different. +different. This is known as deduping by name. -This is only useful with backends like Google Drive which can have -duplicate file names. It can be run on wrapping backends (e.g. crypt) if -they wrap a backend which supports duplicate file names. +Deduping by name is only useful with backends like Google Drive which +can have duplicate file names. It can be run on wrapping backends +(e.g. crypt) if they wrap a backend which supports duplicate file +names. -In the first pass it will merge directories with the same name. It -will do this iteratively until all the identically named directories -have been merged. 
+However if --by-hash is passed in then dedupe will find files with +duplicate hashes instead which will work on any backend which supports +at least one hash. This can be used to find files with duplicate +content. This is known as deduping by hash. -In the second pass, for every group of duplicate file names, it will -delete all but one identical files it finds without confirmation. -This means that for most duplicated files the ` + "`dedupe`" + ` -command will not be interactive. +If deduping by name, first rclone will merge directories with the same +name. It will do this iteratively until all the identically named +directories have been merged. + +Next, if deduping by name, for every group of duplicate file names, +it will delete all but one identical file it finds without +confirmation. This means that for most duplicated files the ` + + "`dedupe`" + ` command will not be interactive. ` + "`dedupe`" + ` considers files to be identical if they have the same file path and the same hash. If the backend does not support hashes (e.g. crypt wrapping @@ -49,6 +57,10 @@ use the ` + "`--size-only`" + ` flag then files will be considered identical if they have the same size (any hash will be ignored). This can be useful on crypt backends which do not support hashes. +Next rclone will resolve the remaining duplicates. Exactly which +action is taken depends on the dedupe mode. By default rclone will +interactively query the user for each one. + **Important**: Since this can cause data loss, test first with the ` + "`--dry-run` or the `--interactive`/`-i`" + ` flag. 
@@ -131,7 +143,7 @@ Or } fdst := cmd.NewFsSrc(args) cmd.Run(false, false, command, func() error { - return operations.Deduplicate(context.Background(), fdst, dedupeMode) + return operations.Deduplicate(context.Background(), fdst, dedupeMode, byHash) }) }, } diff --git a/fs/operations/dedupe.go b/fs/operations/dedupe.go index 56de6dd3d..3cd2d0a71 100644 --- a/fs/operations/dedupe.go +++ b/fs/operations/dedupe.go @@ -139,7 +139,7 @@ func dedupeDeleteIdentical(ctx context.Context, ht hash.Type, remote string, obj } // dedupeInteractive interactively dedupes the slice of objects -func dedupeInteractive(ctx context.Context, f fs.Fs, ht hash.Type, remote string, objs []fs.Object) { +func dedupeInteractive(ctx context.Context, f fs.Fs, ht hash.Type, remote string, objs []fs.Object, byHash bool) { fmt.Printf("%s: %d duplicates remain\n", remote, len(objs)) for i, o := range objs { hashValue := "" @@ -150,9 +150,17 @@ func dedupeInteractive(ctx context.Context, f fs.Fs, ht hash.Type, remote string hashValue = err.Error() } } - fmt.Printf(" %d: %12d bytes, %s, %v %32s\n", i+1, o.Size(), o.ModTime(ctx).Local().Format("2006-01-02 15:04:05.000000000"), ht, hashValue) + if byHash { + fmt.Printf(" %d: %12d bytes, %s, %s\n", i+1, o.Size(), o.ModTime(ctx).Local().Format("2006-01-02 15:04:05.000000000"), o.Remote()) + } else { + fmt.Printf(" %d: %12d bytes, %s, %v %32s\n", i+1, o.Size(), o.ModTime(ctx).Local().Format("2006-01-02 15:04:05.000000000"), ht, hashValue) + } } - switch config.Command([]string{"sSkip and do nothing", "kKeep just one (choose which in next step)", "rRename all to be different (by changing file.jpg to file-1.jpg)"}) { + commands := []string{"sSkip and do nothing", "kKeep just one (choose which in next step)"} + if !byHash { + commands = append(commands, "rRename all to be different (by changing file.jpg to file-1.jpg)") + } + switch config.Command(commands) { case 's': case 'k': keep := config.ChooseNumber("Enter the number of the file to keep", 1, len(objs)) 
@@ -298,31 +306,51 @@ func sortSmallestFirst(objs []fs.Object) { // Deduplicate interactively finds duplicate files and offers to // delete all but one or rename them to be different. Only useful with // Google Drive which can have duplicate file names. -func Deduplicate(ctx context.Context, f fs.Fs, mode DeduplicateMode) error { - fs.Infof(f, "Looking for duplicates using %v mode.", mode) +func Deduplicate(ctx context.Context, f fs.Fs, mode DeduplicateMode, byHash bool) error { ci := fs.GetConfig(ctx) + // find a hash to use + ht := f.Hashes().GetOne() + what := "names" + if byHash { + if ht == hash.None { + return errors.Errorf("%v has no hashes", f) + } + what = ht.String() + " hashes" + } + fs.Infof(f, "Looking for duplicate %s using %v mode.", what, mode) // Find duplicate directories first and fix them - duplicateDirs, err := dedupeFindDuplicateDirs(ctx, f) - if err != nil { - return err - } - if len(duplicateDirs) != 0 { - err = dedupeMergeDuplicateDirs(ctx, f, duplicateDirs) + if !byHash { + duplicateDirs, err := dedupeFindDuplicateDirs(ctx, f) if err != nil { return err } + if len(duplicateDirs) != 0 { + err = dedupeMergeDuplicateDirs(ctx, f, duplicateDirs) + if err != nil { + return err + } + } } - // find a hash to use - ht := f.Hashes().GetOne() - // Now find duplicate files files := map[string][]fs.Object{} - err = walk.ListR(ctx, f, "", true, ci.MaxDepth, walk.ListObjects, func(entries fs.DirEntries) error { + err := walk.ListR(ctx, f, "", true, ci.MaxDepth, walk.ListObjects, func(entries fs.DirEntries) error { entries.ForObject(func(o fs.Object) { - remote := o.Remote() - files[remote] = append(files[remote], o) + var remote string + var err error + if byHash { + remote, err = o.Hash(ctx, ht) + if err != nil { + fs.Errorf(o, "Failed to hash: %v", err) + remote = "" + } + } else { + remote = o.Remote() + } + if remote != "" { + files[remote] = append(files[remote], o) + } }) return nil }) @@ -332,15 +360,17 @@ func Deduplicate(ctx context.Context, f 
fs.Fs, mode DeduplicateMode) error { for remote, objs := range files { if len(objs) > 1 { - fs.Logf(remote, "Found %d files with duplicate names", len(objs)) - objs = dedupeDeleteIdentical(ctx, ht, remote, objs) - if len(objs) <= 1 { - fs.Logf(remote, "All duplicates removed") - continue + fs.Logf(remote, "Found %d files with duplicate %s", len(objs), what) + if !byHash { + objs = dedupeDeleteIdentical(ctx, ht, remote, objs) + if len(objs) <= 1 { + fs.Logf(remote, "All duplicates removed") + continue + } } switch mode { case DeduplicateInteractive: - dedupeInteractive(ctx, f, ht, remote, objs) + dedupeInteractive(ctx, f, ht, remote, objs, byHash) case DeduplicateFirst: dedupeDeleteAllButOne(ctx, 0, remote, objs) case DeduplicateNewest: @@ -358,7 +388,7 @@ func Deduplicate(ctx context.Context, f fs.Fs, mode DeduplicateMode) error { sortSmallestFirst(objs) dedupeDeleteAllButOne(ctx, 0, remote, objs) case DeduplicateSkip: - fs.Logf(remote, "Skipping %d files with duplicate names", len(objs)) + fs.Logf(remote, "Skipping %d files with duplicate %s", len(objs), what) default: //skip } diff --git a/fs/operations/dedupe_test.go b/fs/operations/dedupe_test.go index 985642399..80b03dc7b 100644 --- a/fs/operations/dedupe_test.go +++ b/fs/operations/dedupe_test.go @@ -10,6 +10,7 @@ import ( "github.com/rclone/rclone/fs/operations" "github.com/rclone/rclone/fs/walk" "github.com/rclone/rclone/fstest" + "github.com/rclone/rclone/lib/random" "github.com/spf13/pflag" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -36,6 +37,12 @@ func skipIfNoHash(t *testing.T, f fs.Fs) { } } +func skipIfNoModTime(t *testing.T, f fs.Fs) { + if f.Precision() >= fs.ModTimeNotSupported { + t.Skip("Can't run this test without modtimes") + } +} + func TestDeduplicateInteractive(t *testing.T) { r := fstest.NewRun(t) defer r.Finalise() @@ -47,7 +54,7 @@ func TestDeduplicateInteractive(t *testing.T) { file3 := r.WriteUncheckedObject(context.Background(), "one", "This is 
one", t1) r.CheckWithDuplicates(t, file1, file2, file3) - err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateInteractive) + err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateInteractive, false) require.NoError(t, err) fstest.CheckItems(t, r.Fremote, file1) @@ -69,7 +76,7 @@ func TestDeduplicateSkip(t *testing.T) { files = append(files, file3) r.CheckWithDuplicates(t, files...) - err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateSkip) + err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateSkip, false) require.NoError(t, err) r.CheckWithDuplicates(t, file1, file3) @@ -92,7 +99,7 @@ func TestDeduplicateSizeOnly(t *testing.T) { ci.SizeOnly = false }() - err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateSkip) + err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateSkip, false) require.NoError(t, err) r.CheckWithDuplicates(t, file1, file3) @@ -108,7 +115,7 @@ func TestDeduplicateFirst(t *testing.T) { file3 := r.WriteUncheckedObject(context.Background(), "one", "This is one BB", t1) r.CheckWithDuplicates(t, file1, file2, file3) - err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateFirst) + err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateFirst, false) require.NoError(t, err) // list until we get one object @@ -131,18 +138,38 @@ func TestDeduplicateNewest(t *testing.T) { r := fstest.NewRun(t) defer r.Finalise() skipIfCantDedupe(t, r.Fremote) + skipIfNoModTime(t, r.Fremote) file1 := r.WriteUncheckedObject(context.Background(), "one", "This is one", t1) file2 := r.WriteUncheckedObject(context.Background(), "one", "This is one too", t2) file3 := r.WriteUncheckedObject(context.Background(), "one", "This is another one", t3) r.CheckWithDuplicates(t, file1, file2, file3) - err := 
operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateNewest) + err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateNewest, false) require.NoError(t, err) fstest.CheckItems(t, r.Fremote, file3) } +func TestDeduplicateNewestByHash(t *testing.T) { + r := fstest.NewRun(t) + defer r.Finalise() + skipIfNoHash(t, r.Fremote) + skipIfNoModTime(t, r.Fremote) + contents := random.String(100) + + file1 := r.WriteObject(context.Background(), "one", contents, t1) + file2 := r.WriteObject(context.Background(), "also/one", contents, t2) + file3 := r.WriteObject(context.Background(), "another", contents, t3) + file4 := r.WriteObject(context.Background(), "not-one", "stuff", t3) + fstest.CheckItems(t, r.Fremote, file1, file2, file3, file4) + + err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateNewest, true) + require.NoError(t, err) + + fstest.CheckItems(t, r.Fremote, file3, file4) +} + func TestDeduplicateOldest(t *testing.T) { r := fstest.NewRun(t) defer r.Finalise() @@ -153,7 +180,7 @@ func TestDeduplicateOldest(t *testing.T) { file3 := r.WriteUncheckedObject(context.Background(), "one", "This is another one", t3) r.CheckWithDuplicates(t, file1, file2, file3) - err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateOldest) + err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateOldest, false) require.NoError(t, err) fstest.CheckItems(t, r.Fremote, file1) @@ -169,7 +196,7 @@ func TestDeduplicateLargest(t *testing.T) { file3 := r.WriteUncheckedObject(context.Background(), "one", "This is another one", t3) r.CheckWithDuplicates(t, file1, file2, file3) - err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateLargest) + err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateLargest, false) require.NoError(t, err) fstest.CheckItems(t, r.Fremote, file3) @@ -185,7 +212,7 @@ func 
TestDeduplicateSmallest(t *testing.T) { file3 := r.WriteUncheckedObject(context.Background(), "one", "This is another one", t3) r.CheckWithDuplicates(t, file1, file2, file3) - err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateSmallest) + err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateSmallest, false) require.NoError(t, err) fstest.CheckItems(t, r.Fremote, file1) @@ -202,7 +229,7 @@ func TestDeduplicateRename(t *testing.T) { file4 := r.WriteUncheckedObject(context.Background(), "one-1.txt", "This is not a duplicate", t1) r.CheckWithDuplicates(t, file1, file2, file3, file4) - err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateRename) + err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateRename, false) require.NoError(t, err) require.NoError(t, walk.ListR(context.Background(), r.Fremote, "", true, -1, walk.ListObjects, func(entries fs.DirEntries) error {