dedupe: add --by-hash to dedupe on hash not file name - fixes #1674

This commit is contained in:
Nick Craig-Wood 2020-10-13 16:22:02 +01:00
parent e073720a8f
commit 507f861c67
3 changed files with 114 additions and 45 deletions

View File

@ -12,12 +12,14 @@ import (
var ( var (
dedupeMode = operations.DeduplicateInteractive dedupeMode = operations.DeduplicateInteractive
byHash = false
) )
func init() { func init() {
cmd.Root.AddCommand(commandDefinition) cmd.Root.AddCommand(commandDefinition)
cmdFlag := commandDefinition.Flags() cmdFlag := commandDefinition.Flags()
flags.FVarP(cmdFlag, &dedupeMode, "dedupe-mode", "", "Dedupe mode interactive|skip|first|newest|oldest|largest|smallest|rename.") flags.FVarP(cmdFlag, &dedupeMode, "dedupe-mode", "", "Dedupe mode interactive|skip|first|newest|oldest|largest|smallest|rename.")
flags.BoolVarP(cmdFlag, &byHash, "by-hash", "", false, "Find identical hashes rather than names")
} }
var commandDefinition = &cobra.Command{ var commandDefinition = &cobra.Command{
@ -27,20 +29,26 @@ var commandDefinition = &cobra.Command{
By default ` + "`dedupe`" + ` interactively finds files with duplicate By default ` + "`dedupe`" + ` interactively finds files with duplicate
names and offers to delete all but one or rename them to be names and offers to delete all but one or rename them to be
different. different. This is known as deduping by name.
This is only useful with backends like Google Drive which can have Deduping by name is only useful with backends like Google Drive which
duplicate file names. It can be run on wrapping backends (e.g. crypt) if can have duplicate file names. It can be run on wrapping backends
they wrap a backend which supports duplicate file names. (e.g. crypt) if they wrap a backend which supports duplicate file
names.
In the first pass it will merge directories with the same name. It However if --by-hash is passed in then dedupe will find files with
will do this iteratively until all the identically named directories duplicate hashes instead which will work on any backend which supports
have been merged. at least one hash. This can be used to find files with duplicate
content. This is known as deduping by hash.
In the second pass, for every group of duplicate file names, it will If deduping by name, first rclone will merge directories with the same
delete all but one identical files it finds without confirmation. name. It will do this iteratively until all the identically named
This means that for most duplicated files the ` + "`dedupe`" + ` directories have been merged.
command will not be interactive.
Next, if deduping by name, for every group of duplicate file names it
will delete all but one of the identical files it finds without
confirmation. This means that for most duplicated files the ` +
"`dedupe`" + ` command will not be interactive.
` + "`dedupe`" + ` considers files to be identical if they have the ` + "`dedupe`" + ` considers files to be identical if they have the
same file path and the same hash. If the backend does not support hashes (e.g. crypt wrapping same file path and the same hash. If the backend does not support hashes (e.g. crypt wrapping
@ -49,6 +57,10 @@ use the ` + "`--size-only`" + ` flag then files will be considered
identical if they have the same size (any hash will be ignored). This identical if they have the same size (any hash will be ignored). This
can be useful on crypt backends which do not support hashes. can be useful on crypt backends which do not support hashes.
Next rclone will resolve the remaining duplicates. Exactly which
action is taken depends on the dedupe mode. By default rclone will
interactively query the user for each one.
**Important**: Since this can cause data loss, test first with the **Important**: Since this can cause data loss, test first with the
` + "`--dry-run` or the `--interactive`/`-i`" + ` flag. ` + "`--dry-run` or the `--interactive`/`-i`" + ` flag.
@ -131,7 +143,7 @@ Or
} }
fdst := cmd.NewFsSrc(args) fdst := cmd.NewFsSrc(args)
cmd.Run(false, false, command, func() error { cmd.Run(false, false, command, func() error {
return operations.Deduplicate(context.Background(), fdst, dedupeMode) return operations.Deduplicate(context.Background(), fdst, dedupeMode, byHash)
}) })
}, },
} }

View File

@ -139,7 +139,7 @@ func dedupeDeleteIdentical(ctx context.Context, ht hash.Type, remote string, obj
} }
// dedupeInteractive interactively dedupes the slice of objects // dedupeInteractive interactively dedupes the slice of objects
func dedupeInteractive(ctx context.Context, f fs.Fs, ht hash.Type, remote string, objs []fs.Object) { func dedupeInteractive(ctx context.Context, f fs.Fs, ht hash.Type, remote string, objs []fs.Object, byHash bool) {
fmt.Printf("%s: %d duplicates remain\n", remote, len(objs)) fmt.Printf("%s: %d duplicates remain\n", remote, len(objs))
for i, o := range objs { for i, o := range objs {
hashValue := "" hashValue := ""
@ -150,9 +150,17 @@ func dedupeInteractive(ctx context.Context, f fs.Fs, ht hash.Type, remote string
hashValue = err.Error() hashValue = err.Error()
} }
} }
fmt.Printf(" %d: %12d bytes, %s, %v %32s\n", i+1, o.Size(), o.ModTime(ctx).Local().Format("2006-01-02 15:04:05.000000000"), ht, hashValue) if byHash {
fmt.Printf(" %d: %12d bytes, %s, %s\n", i+1, o.Size(), o.ModTime(ctx).Local().Format("2006-01-02 15:04:05.000000000"), o.Remote())
} else {
fmt.Printf(" %d: %12d bytes, %s, %v %32s\n", i+1, o.Size(), o.ModTime(ctx).Local().Format("2006-01-02 15:04:05.000000000"), ht, hashValue)
}
} }
switch config.Command([]string{"sSkip and do nothing", "kKeep just one (choose which in next step)", "rRename all to be different (by changing file.jpg to file-1.jpg)"}) { commands := []string{"sSkip and do nothing", "kKeep just one (choose which in next step)"}
if !byHash {
commands = append(commands, "rRename all to be different (by changing file.jpg to file-1.jpg)")
}
switch config.Command(commands) {
case 's': case 's':
case 'k': case 'k':
keep := config.ChooseNumber("Enter the number of the file to keep", 1, len(objs)) keep := config.ChooseNumber("Enter the number of the file to keep", 1, len(objs))
@ -298,31 +306,51 @@ func sortSmallestFirst(objs []fs.Object) {
// Deduplicate interactively finds duplicate files and offers to // Deduplicate interactively finds duplicate files and offers to
// delete all but one or rename them to be different. Only useful with // delete all but one or rename them to be different. Only useful with
// Google Drive which can have duplicate file names. // Google Drive which can have duplicate file names.
func Deduplicate(ctx context.Context, f fs.Fs, mode DeduplicateMode) error { func Deduplicate(ctx context.Context, f fs.Fs, mode DeduplicateMode, byHash bool) error {
fs.Infof(f, "Looking for duplicates using %v mode.", mode)
ci := fs.GetConfig(ctx) ci := fs.GetConfig(ctx)
// find a hash to use
ht := f.Hashes().GetOne()
what := "names"
if byHash {
if ht == hash.None {
return errors.Errorf("%v has no hashes", f)
}
what = ht.String() + " hashes"
}
fs.Infof(f, "Looking for duplicate %s using %v mode.", what, mode)
// Find duplicate directories first and fix them // Find duplicate directories first and fix them
duplicateDirs, err := dedupeFindDuplicateDirs(ctx, f) if !byHash {
if err != nil { duplicateDirs, err := dedupeFindDuplicateDirs(ctx, f)
return err
}
if len(duplicateDirs) != 0 {
err = dedupeMergeDuplicateDirs(ctx, f, duplicateDirs)
if err != nil { if err != nil {
return err return err
} }
if len(duplicateDirs) != 0 {
err = dedupeMergeDuplicateDirs(ctx, f, duplicateDirs)
if err != nil {
return err
}
}
} }
// find a hash to use
ht := f.Hashes().GetOne()
// Now find duplicate files // Now find duplicate files
files := map[string][]fs.Object{} files := map[string][]fs.Object{}
err = walk.ListR(ctx, f, "", true, ci.MaxDepth, walk.ListObjects, func(entries fs.DirEntries) error { err := walk.ListR(ctx, f, "", true, ci.MaxDepth, walk.ListObjects, func(entries fs.DirEntries) error {
entries.ForObject(func(o fs.Object) { entries.ForObject(func(o fs.Object) {
remote := o.Remote() var remote string
files[remote] = append(files[remote], o) var err error
if byHash {
remote, err = o.Hash(ctx, ht)
if err != nil {
fs.Errorf(o, "Failed to hash: %v", err)
remote = ""
}
} else {
remote = o.Remote()
}
if remote != "" {
files[remote] = append(files[remote], o)
}
}) })
return nil return nil
}) })
@ -332,15 +360,17 @@ func Deduplicate(ctx context.Context, f fs.Fs, mode DeduplicateMode) error {
for remote, objs := range files { for remote, objs := range files {
if len(objs) > 1 { if len(objs) > 1 {
fs.Logf(remote, "Found %d files with duplicate names", len(objs)) fs.Logf(remote, "Found %d files with duplicate %s", len(objs), what)
objs = dedupeDeleteIdentical(ctx, ht, remote, objs) if !byHash {
if len(objs) <= 1 { objs = dedupeDeleteIdentical(ctx, ht, remote, objs)
fs.Logf(remote, "All duplicates removed") if len(objs) <= 1 {
continue fs.Logf(remote, "All duplicates removed")
continue
}
} }
switch mode { switch mode {
case DeduplicateInteractive: case DeduplicateInteractive:
dedupeInteractive(ctx, f, ht, remote, objs) dedupeInteractive(ctx, f, ht, remote, objs, byHash)
case DeduplicateFirst: case DeduplicateFirst:
dedupeDeleteAllButOne(ctx, 0, remote, objs) dedupeDeleteAllButOne(ctx, 0, remote, objs)
case DeduplicateNewest: case DeduplicateNewest:
@ -358,7 +388,7 @@ func Deduplicate(ctx context.Context, f fs.Fs, mode DeduplicateMode) error {
sortSmallestFirst(objs) sortSmallestFirst(objs)
dedupeDeleteAllButOne(ctx, 0, remote, objs) dedupeDeleteAllButOne(ctx, 0, remote, objs)
case DeduplicateSkip: case DeduplicateSkip:
fs.Logf(remote, "Skipping %d files with duplicate names", len(objs)) fs.Logf(remote, "Skipping %d files with duplicate names %s", len(objs), what)
default: default:
//skip //skip
} }

View File

@ -10,6 +10,7 @@ import (
"github.com/rclone/rclone/fs/operations" "github.com/rclone/rclone/fs/operations"
"github.com/rclone/rclone/fs/walk" "github.com/rclone/rclone/fs/walk"
"github.com/rclone/rclone/fstest" "github.com/rclone/rclone/fstest"
"github.com/rclone/rclone/lib/random"
"github.com/spf13/pflag" "github.com/spf13/pflag"
"github.com/stretchr/testify/assert" "github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require" "github.com/stretchr/testify/require"
@ -36,6 +37,12 @@ func skipIfNoHash(t *testing.T, f fs.Fs) {
} }
} }
// skipIfNoModTime skips the calling test when the precision reported by
// f is at or above fs.ModTimeNotSupported.
func skipIfNoModTime(t *testing.T, f fs.Fs) {
	if f.Precision() < fs.ModTimeNotSupported {
		// The remote reports a usable modtime precision, so the test can run.
		return
	}
	t.Skip("Can't run this test without modtimes")
}
func TestDeduplicateInteractive(t *testing.T) { func TestDeduplicateInteractive(t *testing.T) {
r := fstest.NewRun(t) r := fstest.NewRun(t)
defer r.Finalise() defer r.Finalise()
@ -47,7 +54,7 @@ func TestDeduplicateInteractive(t *testing.T) {
file3 := r.WriteUncheckedObject(context.Background(), "one", "This is one", t1) file3 := r.WriteUncheckedObject(context.Background(), "one", "This is one", t1)
r.CheckWithDuplicates(t, file1, file2, file3) r.CheckWithDuplicates(t, file1, file2, file3)
err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateInteractive) err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateInteractive, false)
require.NoError(t, err) require.NoError(t, err)
fstest.CheckItems(t, r.Fremote, file1) fstest.CheckItems(t, r.Fremote, file1)
@ -69,7 +76,7 @@ func TestDeduplicateSkip(t *testing.T) {
files = append(files, file3) files = append(files, file3)
r.CheckWithDuplicates(t, files...) r.CheckWithDuplicates(t, files...)
err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateSkip) err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateSkip, false)
require.NoError(t, err) require.NoError(t, err)
r.CheckWithDuplicates(t, file1, file3) r.CheckWithDuplicates(t, file1, file3)
@ -92,7 +99,7 @@ func TestDeduplicateSizeOnly(t *testing.T) {
ci.SizeOnly = false ci.SizeOnly = false
}() }()
err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateSkip) err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateSkip, false)
require.NoError(t, err) require.NoError(t, err)
r.CheckWithDuplicates(t, file1, file3) r.CheckWithDuplicates(t, file1, file3)
@ -108,7 +115,7 @@ func TestDeduplicateFirst(t *testing.T) {
file3 := r.WriteUncheckedObject(context.Background(), "one", "This is one BB", t1) file3 := r.WriteUncheckedObject(context.Background(), "one", "This is one BB", t1)
r.CheckWithDuplicates(t, file1, file2, file3) r.CheckWithDuplicates(t, file1, file2, file3)
err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateFirst) err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateFirst, false)
require.NoError(t, err) require.NoError(t, err)
// list until we get one object // list until we get one object
@ -131,18 +138,38 @@ func TestDeduplicateNewest(t *testing.T) {
r := fstest.NewRun(t) r := fstest.NewRun(t)
defer r.Finalise() defer r.Finalise()
skipIfCantDedupe(t, r.Fremote) skipIfCantDedupe(t, r.Fremote)
skipIfNoModTime(t, r.Fremote)
file1 := r.WriteUncheckedObject(context.Background(), "one", "This is one", t1) file1 := r.WriteUncheckedObject(context.Background(), "one", "This is one", t1)
file2 := r.WriteUncheckedObject(context.Background(), "one", "This is one too", t2) file2 := r.WriteUncheckedObject(context.Background(), "one", "This is one too", t2)
file3 := r.WriteUncheckedObject(context.Background(), "one", "This is another one", t3) file3 := r.WriteUncheckedObject(context.Background(), "one", "This is another one", t3)
r.CheckWithDuplicates(t, file1, file2, file3) r.CheckWithDuplicates(t, file1, file2, file3)
err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateNewest) err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateNewest, false)
require.NoError(t, err) require.NoError(t, err)
fstest.CheckItems(t, r.Fremote, file3) fstest.CheckItems(t, r.Fremote, file3)
} }
// TestDeduplicateNewestByHash writes three objects with identical random
// contents under different names plus one object with different contents,
// runs Deduplicate in DeduplicateNewest mode with byHash enabled, and
// expects only the object written with t3 and the unrelated object to
// remain on the remote.
func TestDeduplicateNewestByHash(t *testing.T) {
	ctx := context.Background()
	r := fstest.NewRun(t)
	defer r.Finalise()
	skipIfNoHash(t, r.Fremote)
	skipIfNoModTime(t, r.Fremote)

	// Same payload for every duplicate so that their hashes collide.
	payload := random.String(100)
	dupA := r.WriteObject(ctx, "one", payload, t1)
	dupB := r.WriteObject(ctx, "also/one", payload, t2)
	dupC := r.WriteObject(ctx, "another", payload, t3)
	// Different contents: must not be grouped with the duplicates.
	unrelated := r.WriteObject(ctx, "not-one", "stuff", t3)
	fstest.CheckItems(t, r.Fremote, dupA, dupB, dupC, unrelated)

	err := operations.Deduplicate(ctx, r.Fremote, operations.DeduplicateNewest, true)
	require.NoError(t, err)

	fstest.CheckItems(t, r.Fremote, dupC, unrelated)
}
func TestDeduplicateOldest(t *testing.T) { func TestDeduplicateOldest(t *testing.T) {
r := fstest.NewRun(t) r := fstest.NewRun(t)
defer r.Finalise() defer r.Finalise()
@ -153,7 +180,7 @@ func TestDeduplicateOldest(t *testing.T) {
file3 := r.WriteUncheckedObject(context.Background(), "one", "This is another one", t3) file3 := r.WriteUncheckedObject(context.Background(), "one", "This is another one", t3)
r.CheckWithDuplicates(t, file1, file2, file3) r.CheckWithDuplicates(t, file1, file2, file3)
err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateOldest) err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateOldest, false)
require.NoError(t, err) require.NoError(t, err)
fstest.CheckItems(t, r.Fremote, file1) fstest.CheckItems(t, r.Fremote, file1)
@ -169,7 +196,7 @@ func TestDeduplicateLargest(t *testing.T) {
file3 := r.WriteUncheckedObject(context.Background(), "one", "This is another one", t3) file3 := r.WriteUncheckedObject(context.Background(), "one", "This is another one", t3)
r.CheckWithDuplicates(t, file1, file2, file3) r.CheckWithDuplicates(t, file1, file2, file3)
err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateLargest) err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateLargest, false)
require.NoError(t, err) require.NoError(t, err)
fstest.CheckItems(t, r.Fremote, file3) fstest.CheckItems(t, r.Fremote, file3)
@ -185,7 +212,7 @@ func TestDeduplicateSmallest(t *testing.T) {
file3 := r.WriteUncheckedObject(context.Background(), "one", "This is another one", t3) file3 := r.WriteUncheckedObject(context.Background(), "one", "This is another one", t3)
r.CheckWithDuplicates(t, file1, file2, file3) r.CheckWithDuplicates(t, file1, file2, file3)
err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateSmallest) err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateSmallest, false)
require.NoError(t, err) require.NoError(t, err)
fstest.CheckItems(t, r.Fremote, file1) fstest.CheckItems(t, r.Fremote, file1)
@ -202,7 +229,7 @@ func TestDeduplicateRename(t *testing.T) {
file4 := r.WriteUncheckedObject(context.Background(), "one-1.txt", "This is not a duplicate", t1) file4 := r.WriteUncheckedObject(context.Background(), "one-1.txt", "This is not a duplicate", t1)
r.CheckWithDuplicates(t, file1, file2, file3, file4) r.CheckWithDuplicates(t, file1, file2, file3, file4)
err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateRename) err := operations.Deduplicate(context.Background(), r.Fremote, operations.DeduplicateRename, false)
require.NoError(t, err) require.NoError(t, err)
require.NoError(t, walk.ListR(context.Background(), r.Fremote, "", true, -1, walk.ListObjects, func(entries fs.DirEntries) error { require.NoError(t, walk.ListR(context.Background(), r.Fremote, "", true, -1, walk.ListObjects, func(entries fs.DirEntries) error {