chunker: reservations for future extensions

This commit is contained in:
Ivan Andreev 2019-10-04 04:05:45 +03:00 committed by Nick Craig-Wood
parent 41ed33b08e
commit 7aa2b4191c
3 changed files with 690 additions and 394 deletions

File diff suppressed because it is too large Load Diff

View File

@ -18,7 +18,7 @@ var (
) )
// test that chunking does not break large uploads // test that chunking does not break large uploads
func (f *Fs) InternalTestPutLarge(t *testing.T, kilobytes int) { func testPutLarge(t *testing.T, f *Fs, kilobytes int) {
t.Run(fmt.Sprintf("PutLarge%dk", kilobytes), func(t *testing.T) { t.Run(fmt.Sprintf("PutLarge%dk", kilobytes), func(t *testing.T) {
fstests.TestPutLarge(context.Background(), t, f, &fstest.Item{ fstests.TestPutLarge(context.Background(), t, f, &fstest.Item{
ModTime: fstest.Time("2001-02-03T04:05:06.499999999Z"), ModTime: fstest.Time("2001-02-03T04:05:06.499999999Z"),
@ -28,119 +28,228 @@ func (f *Fs) InternalTestPutLarge(t *testing.T, kilobytes int) {
}) })
} }
func (f *Fs) InternalTestChunkNameFormat(t *testing.T) { // test chunk name parser
savedNameFormat := f.opt.NameFormat func testChunkNameFormat(t *testing.T, f *Fs) {
savedStartFrom := f.opt.StartFrom saveOpt := f.opt
defer func() { defer func() {
// restore original settings // restore original settings (f is pointer, f.opt is struct)
_ = f.parseNameFormat(savedNameFormat) f.opt = saveOpt
f.opt.StartFrom = savedStartFrom _ = f.setChunkNameFormat(f.opt.NameFormat)
}() }()
var err error
err = f.parseNameFormat("*.rclone_chunk.###") assertFormat := func(pattern, wantDataFormat, wantCtrlFormat, wantNameRegexp string) {
assert.NoError(t, err) err := f.setChunkNameFormat(pattern)
assert.Equal(t, `%s.rclone_chunk.%03d`, f.nameFormat) assert.NoError(t, err)
assert.Equal(t, `^(.+)\.rclone_chunk\.([0-9]{3,})$`, f.nameRegexp.String()) assert.Equal(t, wantDataFormat, f.dataNameFmt)
assert.Equal(t, wantCtrlFormat, f.ctrlNameFmt)
assert.Equal(t, wantNameRegexp, f.nameRegexp.String())
}
err = f.parseNameFormat("*.rclone_chunk.#") assertFormatValid := func(pattern string) {
assert.NoError(t, err) err := f.setChunkNameFormat(pattern)
assert.Equal(t, `%s.rclone_chunk.%d`, f.nameFormat) assert.NoError(t, err)
assert.Equal(t, `^(.+)\.rclone_chunk\.([0-9]+)$`, f.nameRegexp.String()) }
err = f.parseNameFormat("*_chunk_#####") assertFormatInvalid := func(pattern string) {
assert.NoError(t, err) err := f.setChunkNameFormat(pattern)
assert.Equal(t, `%s_chunk_%05d`, f.nameFormat) assert.Error(t, err)
assert.Equal(t, `^(.+)_chunk_([0-9]{5,})$`, f.nameRegexp.String()) }
err = f.parseNameFormat("*-chunk-#") assertMakeName := func(wantChunkName, mainName string, chunkNo int, ctrlType string, xactNo int64) {
assert.NoError(t, err) gotChunkName := f.makeChunkName(mainName, chunkNo, ctrlType, xactNo)
assert.Equal(t, `%s-chunk-%d`, f.nameFormat) assert.Equal(t, wantChunkName, gotChunkName)
assert.Equal(t, `^(.+)-chunk-([0-9]+)$`, f.nameRegexp.String()) }
err = f.parseNameFormat("_*-chunk-##,") assertMakeNamePanics := func(mainName string, chunkNo int, ctrlType string, xactNo int64) {
assert.NoError(t, err) assert.Panics(t, func() {
assert.Equal(t, `_%s-chunk-%02d,`, f.nameFormat) _ = f.makeChunkName(mainName, chunkNo, ctrlType, xactNo)
assert.Equal(t, `^_(.+)-chunk-([0-9]{2,}),$`, f.nameRegexp.String()) }, "makeChunkName(%q,%d,%q,%d) should panic", mainName, chunkNo, ctrlType, xactNo)
}
err = f.parseNameFormat(`*-chunk-#-%^$()[]{}.+-!?:\/`) assertParseName := func(fileName, wantMainName string, wantChunkNo int, wantCtrlType string, wantXactNo int64) {
assert.NoError(t, err) gotMainName, gotChunkNo, gotCtrlType, gotXactNo := f.parseChunkName(fileName)
assert.Equal(t, `%s-chunk-%d-%%^$()[]{}.+-!?:\/`, f.nameFormat) assert.Equal(t, wantMainName, gotMainName)
assert.Equal(t, `^(.+)-chunk-([0-9]+)-%\^\$\(\)\[\]\{\}\.\+-!\?:\\/$`, f.nameRegexp.String()) assert.Equal(t, wantChunkNo, gotChunkNo)
assert.Equal(t, wantCtrlType, gotCtrlType)
assert.Equal(t, wantXactNo, gotXactNo)
}
err = f.parseNameFormat("chunk-#") const newFormatSupported = false // support for patterns not starting with base name (*)
assert.Error(t, err)
err = f.parseNameFormat("*-chunk") // valid formats
assert.Error(t, err) assertFormat(`*.rclone_chunk.###`, `%s.rclone_chunk.%03d`, `%s.rclone_chunk._%s`, `^(.+?)\.rclone_chunk\.(?:([0-9]{3,})|_([a-z]{3,9}))(?:\.\.tmp_([0-9]{10,19}))?$`)
assertFormat(`*.rclone_chunk.#`, `%s.rclone_chunk.%d`, `%s.rclone_chunk._%s`, `^(.+?)\.rclone_chunk\.(?:([0-9]+)|_([a-z]{3,9}))(?:\.\.tmp_([0-9]{10,19}))?$`)
assertFormat(`*_chunk_#####`, `%s_chunk_%05d`, `%s_chunk__%s`, `^(.+?)_chunk_(?:([0-9]{5,})|_([a-z]{3,9}))(?:\.\.tmp_([0-9]{10,19}))?$`)
assertFormat(`*-chunk-#`, `%s-chunk-%d`, `%s-chunk-_%s`, `^(.+?)-chunk-(?:([0-9]+)|_([a-z]{3,9}))(?:\.\.tmp_([0-9]{10,19}))?$`)
assertFormat(`*-chunk-#-%^$()[]{}.+-!?:\`, `%s-chunk-%d-%%^$()[]{}.+-!?:\`, `%s-chunk-_%s-%%^$()[]{}.+-!?:\`, `^(.+?)-chunk-(?:([0-9]+)|_([a-z]{3,9}))-%\^\$\(\)\[\]\{\}\.\+-!\?:\\(?:\.\.tmp_([0-9]{10,19}))?$`)
if newFormatSupported {
assertFormat(`_*-chunk-##,`, `_%s-chunk-%02d,`, `_%s-chunk-_%s,`, `^_(.+?)-chunk-(?:([0-9]{2,})|_([a-z]{3,9})),(?:\.\.tmp_([0-9]{10,19}))?$`)
}
err = f.parseNameFormat("*-*-chunk-#") // invalid formats
assert.Error(t, err) assertFormatInvalid(`chunk-#`)
assertFormatInvalid(`*-chunk`)
assertFormatInvalid(`*-*-chunk-#`)
assertFormatInvalid(`*-chunk-#-#`)
assertFormatInvalid(`#-chunk-*`)
assertFormatInvalid(`*/#`)
err = f.parseNameFormat("*-chunk-#-#") assertFormatValid(`*#`)
assert.Error(t, err) assertFormatInvalid(`**#`)
assertFormatInvalid(`#*`)
assertFormatInvalid(``)
assertFormatInvalid(`-`)
err = f.parseNameFormat("#-chunk-*") // quick tests
assert.Error(t, err) if newFormatSupported {
assertFormat(`part_*_#`, `part_%s_%d`, `part_%s__%s`, `^part_(.+?)_(?:([0-9]+)|_([a-z]{3,9}))(?:\.\.tmp_([0-9]{10,19}))?$`)
f.opt.StartFrom = 1
err = f.parseNameFormat("*#") assertMakeName(`part_fish_1`, "fish", 0, "", -1)
assert.NoError(t, err) assertParseName(`part_fish_43`, "fish", 42, "", -1)
assertMakeName(`part_fish_3..tmp_0000000004`, "fish", 2, "", 4)
err = f.parseNameFormat("**#") assertParseName(`part_fish_4..tmp_0000000005`, "fish", 3, "", 5)
assert.Error(t, err) assertMakeName(`part_fish__locks`, "fish", -2, "locks", -3)
err = f.parseNameFormat("#*") assertParseName(`part_fish__locks`, "fish", -1, "locks", -1)
assert.Error(t, err) assertMakeName(`part_fish__blockinfo..tmp_1234567890123456789`, "fish", -3, "blockinfo", 1234567890123456789)
err = f.parseNameFormat("") assertParseName(`part_fish__blockinfo..tmp_1234567890123456789`, "fish", -1, "blockinfo", 1234567890123456789)
assert.Error(t, err) }
err = f.parseNameFormat("-")
assert.Error(t, err)
// prepare format for long tests
assertFormat(`*.chunk.###`, `%s.chunk.%03d`, `%s.chunk._%s`, `^(.+?)\.chunk\.(?:([0-9]{3,})|_([a-z]{3,9}))(?:\.\.tmp_([0-9]{10,19}))?$`)
f.opt.StartFrom = 2 f.opt.StartFrom = 2
err = f.parseNameFormat("*.chunk.###")
assert.NoError(t, err)
assert.Equal(t, `%s.chunk.%03d`, f.nameFormat)
assert.Equal(t, `^(.+)\.chunk\.([0-9]{3,})$`, f.nameRegexp.String())
assert.Equal(t, "fish.chunk.003", f.makeChunkName("fish", 1, -1)) // valid data chunks
assert.Equal(t, "fish.chunk.011..tmp_0000054321", f.makeChunkName("fish", 9, 54321)) assertMakeName(`fish.chunk.003`, "fish", 1, "", -1)
assert.Equal(t, "fish.chunk.011..tmp_1234567890", f.makeChunkName("fish", 9, 1234567890)) assertMakeName(`fish.chunk.011..tmp_0000054321`, "fish", 9, "", 54321)
assert.Equal(t, "fish.chunk.1916..tmp_123456789012345", f.makeChunkName("fish", 1914, 123456789012345)) assertMakeName(`fish.chunk.011..tmp_1234567890`, "fish", 9, "", 1234567890)
assertMakeName(`fish.chunk.1916..tmp_123456789012345`, "fish", 1914, "", 123456789012345)
name, chunkNo, tempNo := f.parseChunkName("fish.chunk.003") assertParseName(`fish.chunk.003`, "fish", 1, "", -1)
assert.True(t, name == "fish" && chunkNo == 1 && tempNo == -1) assertParseName(`fish.chunk.004..tmp_0000000021`, "fish", 2, "", 21)
name, chunkNo, tempNo = f.parseChunkName("fish.chunk.004..tmp_0000000021") assertParseName(`fish.chunk.021`, "fish", 19, "", -1)
assert.True(t, name == "fish" && chunkNo == 2 && tempNo == 21) assertParseName(`fish.chunk.323..tmp_1234567890123456789`, "fish", 321, "", 1234567890123456789)
name, chunkNo, tempNo = f.parseChunkName("fish.chunk.021")
assert.True(t, name == "fish" && chunkNo == 19 && tempNo == -1) // parsing invalid data chunk names
name, chunkNo, tempNo = f.parseChunkName("fish.chunk.323..tmp_1234567890123456789") assertParseName(`fish.chunk.3`, "", -1, "", -1)
assert.True(t, name == "fish" && chunkNo == 321 && tempNo == 1234567890123456789) assertParseName(`fish.chunk.001`, "", -1, "", -1)
name, chunkNo, tempNo = f.parseChunkName("fish.chunk.3") assertParseName(`fish.chunk.21`, "", -1, "", -1)
assert.True(t, name == "" && chunkNo == -1 && tempNo == -1) assertParseName(`fish.chunk.-21`, "", -1, "", -1)
name, chunkNo, tempNo = f.parseChunkName("fish.chunk.001")
assert.True(t, name == "" && chunkNo == -1 && tempNo == -1) assertParseName(`fish.chunk.004.tmp_0000000021`, "", -1, "", -1)
name, chunkNo, tempNo = f.parseChunkName("fish.chunk.21") assertParseName(`fish.chunk.003..tmp_123456789`, "", -1, "", -1)
assert.True(t, name == "" && chunkNo == -1 && tempNo == -1) assertParseName(`fish.chunk.003..tmp_012345678901234567890123456789`, "", -1, "", -1)
name, chunkNo, tempNo = f.parseChunkName("fish.chunk.-21") assertParseName(`fish.chunk.003..tmp_-1`, "", -1, "", -1)
assert.True(t, name == "" && chunkNo == -1 && tempNo == -1)
name, chunkNo, tempNo = f.parseChunkName("fish.chunk.004.tmp_0000000021") // valid control chunks
assert.True(t, name == "" && chunkNo == -1 && tempNo == -1) assertMakeName(`fish.chunk._info`, "fish", -1, "info", -1)
name, chunkNo, tempNo = f.parseChunkName("fish.chunk.003..tmp_123456789") assertMakeName(`fish.chunk._locks`, "fish", -2, "locks", -1)
assert.True(t, name == "" && chunkNo == -1 && tempNo == -1) assertMakeName(`fish.chunk._blockinfo`, "fish", -3, "blockinfo", -1)
name, chunkNo, tempNo = f.parseChunkName("fish.chunk.003..tmp_012345678901234567890123456789")
assert.True(t, name == "" && chunkNo == -1 && tempNo == -1) assertParseName(`fish.chunk._info`, "fish", -1, "info", -1)
name, chunkNo, tempNo = f.parseChunkName("fish.chunk.003..tmp_-1") assertParseName(`fish.chunk._locks`, "fish", -1, "locks", -1)
assert.True(t, name == "" && chunkNo == -1 && tempNo == -1) assertParseName(`fish.chunk._blockinfo`, "fish", -1, "blockinfo", -1)
// valid temporary control chunks
assertMakeName(`fish.chunk._info..tmp_0000000021`, "fish", -1, "info", 21)
assertMakeName(`fish.chunk._locks..tmp_0000054321`, "fish", -2, "locks", 54321)
assertMakeName(`fish.chunk._uploads..tmp_0000000000`, "fish", -3, "uploads", 0)
assertMakeName(`fish.chunk._blockinfo..tmp_1234567890123456789`, "fish", -4, "blockinfo", 1234567890123456789)
assertParseName(`fish.chunk._info..tmp_0000000021`, "fish", -1, "info", 21)
assertParseName(`fish.chunk._locks..tmp_0000054321`, "fish", -1, "locks", 54321)
assertParseName(`fish.chunk._uploads..tmp_0000000000`, "fish", -1, "uploads", 0)
assertParseName(`fish.chunk._blockinfo..tmp_1234567890123456789`, "fish", -1, "blockinfo", 1234567890123456789)
// parsing invalid control chunk names
assertParseName(`fish.chunk.info`, "", -1, "", -1)
assertParseName(`fish.chunk.locks`, "", -1, "", -1)
assertParseName(`fish.chunk.uploads`, "", -1, "", -1)
assertParseName(`fish.chunk.blockinfo`, "", -1, "", -1)
assertParseName(`fish.chunk._os`, "", -1, "", -1)
assertParseName(`fish.chunk._futuredata`, "", -1, "", -1)
assertParseName(`fish.chunk._me_ta`, "", -1, "", -1)
assertParseName(`fish.chunk._in-fo`, "", -1, "", -1)
assertParseName(`fish.chunk._.bin`, "", -1, "", -1)
assertParseName(`fish.chunk._locks..tmp_123456789`, "", -1, "", -1)
assertParseName(`fish.chunk._meta..tmp_-1`, "", -1, "", -1)
assertParseName(`fish.chunk._blockinfo..tmp_012345678901234567890123456789`, "", -1, "", -1)
// short control chunk names: 3 letters ok, 1-2 letters not allowed
assertMakeName(`fish.chunk._ext`, "fish", -1, "ext", -1)
assertMakeName(`fish.chunk._ext..tmp_0000000021`, "fish", -1, "ext", 21)
assertParseName(`fish.chunk._int`, "fish", -1, "int", -1)
assertParseName(`fish.chunk._int..tmp_0000000021`, "fish", -1, "int", 21)
assertMakeNamePanics("fish", -1, "in", -1)
assertMakeNamePanics("fish", -1, "up", 4)
assertMakeNamePanics("fish", -1, "x", -1)
assertMakeNamePanics("fish", -1, "c", 4)
// base file name can sometimes look like a valid chunk name
assertParseName(`fish.chunk.003.chunk.004`, "fish.chunk.003", 2, "", -1)
assertParseName(`fish.chunk.003.chunk.005..tmp_0000000021`, "fish.chunk.003", 3, "", 21)
assertParseName(`fish.chunk.003.chunk._info`, "fish.chunk.003", -1, "info", -1)
assertParseName(`fish.chunk.003.chunk._blockinfo..tmp_1234567890123456789`, "fish.chunk.003", -1, "blockinfo", 1234567890123456789)
assertParseName(`fish.chunk.003.chunk._Meta`, "", -1, "", -1)
assertParseName(`fish.chunk.003.chunk._x..tmp_0000054321`, "", -1, "", -1)
assertParseName(`fish.chunk.004..tmp_0000000021.chunk.004`, "fish.chunk.004..tmp_0000000021", 2, "", -1)
assertParseName(`fish.chunk.004..tmp_0000000021.chunk.005..tmp_0000000021`, "fish.chunk.004..tmp_0000000021", 3, "", 21)
assertParseName(`fish.chunk.004..tmp_0000000021.chunk._info`, "fish.chunk.004..tmp_0000000021", -1, "info", -1)
assertParseName(`fish.chunk.004..tmp_0000000021.chunk._blockinfo..tmp_1234567890123456789`, "fish.chunk.004..tmp_0000000021", -1, "blockinfo", 1234567890123456789)
assertParseName(`fish.chunk.004..tmp_0000000021.chunk._Meta`, "", -1, "", -1)
assertParseName(`fish.chunk.004..tmp_0000000021.chunk._x..tmp_0000054321`, "", -1, "", -1)
assertParseName(`fish.chunk._info.chunk.004`, "fish.chunk._info", 2, "", -1)
assertParseName(`fish.chunk._info.chunk.005..tmp_0000000021`, "fish.chunk._info", 3, "", 21)
assertParseName(`fish.chunk._info.chunk._info`, "fish.chunk._info", -1, "info", -1)
assertParseName(`fish.chunk._info.chunk._blockinfo..tmp_1234567890123456789`, "fish.chunk._info", -1, "blockinfo", 1234567890123456789)
assertParseName(`fish.chunk._info.chunk._info.chunk._Meta`, "", -1, "", -1)
assertParseName(`fish.chunk._info.chunk._info.chunk._x..tmp_0000054321`, "", -1, "", -1)
assertParseName(`fish.chunk._blockinfo..tmp_1234567890123456789.chunk.004`, "fish.chunk._blockinfo..tmp_1234567890123456789", 2, "", -1)
assertParseName(`fish.chunk._blockinfo..tmp_1234567890123456789.chunk.005..tmp_0000000021`, "fish.chunk._blockinfo..tmp_1234567890123456789", 3, "", 21)
assertParseName(`fish.chunk._blockinfo..tmp_1234567890123456789.chunk._info`, "fish.chunk._blockinfo..tmp_1234567890123456789", -1, "info", -1)
assertParseName(`fish.chunk._blockinfo..tmp_1234567890123456789.chunk._blockinfo..tmp_1234567890123456789`, "fish.chunk._blockinfo..tmp_1234567890123456789", -1, "blockinfo", 1234567890123456789)
assertParseName(`fish.chunk._blockinfo..tmp_1234567890123456789.chunk._info.chunk._Meta`, "", -1, "", -1)
assertParseName(`fish.chunk._blockinfo..tmp_1234567890123456789.chunk._info.chunk._x..tmp_0000054321`, "", -1, "", -1)
// attempts to make invalid chunk names
assertMakeNamePanics("fish", -1, "", -1) // neither data nor control
assertMakeNamePanics("fish", 0, "info", -1) // both data and control
assertMakeNamePanics("fish", -1, "futuredata", -1) // control type too long
assertMakeNamePanics("fish", -1, "123", -1) // digits not allowed
assertMakeNamePanics("fish", -1, "Meta", -1) // only lower case letters allowed
assertMakeNamePanics("fish", -1, "in-fo", -1) // punctuation not allowed
assertMakeNamePanics("fish", -1, "_info", -1)
assertMakeNamePanics("fish", -1, "info_", -1)
assertMakeNamePanics("fish", -2, ".bind", -3)
assertMakeNamePanics("fish", -2, "bind.", -3)
assertMakeNamePanics("fish", -1, "", 1) // neither data nor control
assertMakeNamePanics("fish", 0, "info", 12) // both data and control
assertMakeNamePanics("fish", -1, "futuredata", 45) // control type too long
assertMakeNamePanics("fish", -1, "123", 123) // digits not allowed
assertMakeNamePanics("fish", -1, "Meta", 456) // only lower case letters allowed
assertMakeNamePanics("fish", -1, "in-fo", 321) // punctuation not allowed
assertMakeNamePanics("fish", -1, "_info", 15678)
assertMakeNamePanics("fish", -1, "info_", 999)
assertMakeNamePanics("fish", -2, ".bind", 0)
assertMakeNamePanics("fish", -2, "bind.", 0)
} }
// InternalTest dispatches all internal tests
func (f *Fs) InternalTest(t *testing.T) { func (f *Fs) InternalTest(t *testing.T) {
t.Run("PutLarge", func(t *testing.T) { t.Run("PutLarge", func(t *testing.T) {
if *UploadKilobytes <= 0 { if *UploadKilobytes <= 0 {
t.Skip("-upload-kilobytes is not set") t.Skip("-upload-kilobytes is not set")
} }
f.InternalTestPutLarge(t, *UploadKilobytes) testPutLarge(t, f, *UploadKilobytes)
}) })
t.Run("ChunkNameFormat", func(t *testing.T) { t.Run("ChunkNameFormat", func(t *testing.T) {
f.InternalTestChunkNameFormat(t) testChunkNameFormat(t, f)
}) })
} }

View File

@ -22,7 +22,7 @@ won't. This means that if you are using a bucket based remote (eg S3, B2, swift)
then you should probably put the bucket in the remote `s3:bucket`. then you should probably put the bucket in the remote `s3:bucket`.
Now configure `chunker` using `rclone config`. We will call this one `overlay` Now configure `chunker` using `rclone config`. We will call this one `overlay`
to separate it from the `remote`. to separate it from the `remote` itself.
``` ```
No remotes found - make a new one No remotes found - make a new one
@ -50,11 +50,11 @@ Choose how chunker handles hash sums.
Enter a string value. Press Enter for the default ("md5"). Enter a string value. Press Enter for the default ("md5").
Choose a number from below, or type in your own value Choose a number from below, or type in your own value
/ Chunker can pass any hash supported by wrapped remote / Chunker can pass any hash supported by wrapped remote
1 | for a single-chunk file but returns nothing otherwise. 1 | for non-chunked files but returns nothing otherwise.
\ "none" \ "none"
2 / MD5 for multi-chunk files. Requires "simplejson". 2 / MD5 for composite files. Requires "simplejson".
\ "md5" \ "md5"
3 / SHA1 for multi-chunk files. Requires "simplejson". 3 / SHA1 for composite files. Requires "simplejson".
\ "sha1" \ "sha1"
/ Copying a file to chunker will request MD5 from the source / Copying a file to chunker will request MD5 from the source
4 | falling back to SHA1 if unsupported. Requires "simplejson". 4 | falling back to SHA1 if unsupported. Requires "simplejson".
@ -95,28 +95,44 @@ When rclone starts a file upload, chunker checks the file size. If it
doesn't exceed the configured chunk size, chunker will just pass the file doesn't exceed the configured chunk size, chunker will just pass the file
to the wrapped remote. If a file is large, chunker will transparently cut to the wrapped remote. If a file is large, chunker will transparently cut
data in pieces with temporary names and stream them one by one, on the fly. data in pieces with temporary names and stream them one by one, on the fly.
Each chunk will contain the specified number of data byts, except for the Each data chunk will contain the specified number of bytes, except for the
last one which may have less data. If file size is unknown in advance last one which may have less data. If file size is unknown in advance
(this is called a streaming upload), chunker will internally create (this is called a streaming upload), chunker will internally create
a temporary copy, record its size and repeat the above process. a temporary copy, record its size and repeat the above process.
When upload completes, temporary chunk files are finally renamed. When upload completes, temporary chunk files are finally renamed.
This scheme guarantees that operations look from outside as atomic. This scheme guarantees that operations can be run in parallel and look
from outside as atomic.
A similar method with hidden temporary chunks is used for other operations A similar method with hidden temporary chunks is used for other operations
(copy/move/rename etc). If an operation fails, hidden chunks are normally (copy/move/rename etc). If an operation fails, hidden chunks are normally
destroyed, and the destination composite file stays intact. destroyed, and the target composite file stays intact.
When a composite file download is requested, chunker transparently
assembles it by concatenating data chunks in order. As the split is trivial
one could even manually concatenate data chunks together to obtain the
original content.
When the `list` rclone command scans a directory on wrapped remote,
the potential chunk files are accounted for, grouped and assembled into
composite directory entries. Any temporary chunks are hidden.
`list` can sometimes come across composite files with missing or invalid
chunks, eg if wrapped file system has been tampered with or damaged.
If chunker detects a missing chunk it will by default silently ignore
the whole group. You can use the `--chunker-fail-on-bad-chunks`
rclone option to make `list` fail with a loud error message.
#### Chunk names #### Chunk names
By default chunk names are `BIG_FILE_NAME.rclone-chunk.001`, The default chunk name format is `*.rclone-chunk.###`, hence by default
`BIG_FILE_NAME.rclone-chunk.002` etc, because the default chunk name chunk names are `BIG_FILE_NAME.rclone-chunk.001`,
format is `*.rclone-chunk.###`. You can configure another name format `BIG_FILE_NAME.rclone-chunk.002` etc. You can configure a different name
using the `--chunker-name-format` option. The format uses asterisk format using the `--chunker-name-format` option. The format uses asterisk
`*` as a placeholder for the base file name and one or more consecutive `*` as a placeholder for the base file name and one or more consecutive
hash characters `#` as a placeholder for sequential chunk number. hash characters `#` as a placeholder for sequential chunk number.
There must be one and only one asterisk. The number of consecutive hash There must be one and only one asterisk. The number of consecutive hash
characters defines the minimum length of a string representing a chunk number. characters defines the minimum length of a string representing a chunk number.
If decimal chunk number has less digits than the number of hashes, it is If decimal chunk number has less digits than the number of hashes, it is
left-padded by zeros. If the number stringis longer, it is left intact. left-padded by zeros. If the decimal string is longer, it is left intact.
By default numbering starts from 1 but there is another option that allows By default numbering starts from 1 but there is another option that allows
user to start from 0, eg. for compatibility with legacy software. user to start from 0, eg. for compatibility with legacy software.
@ -125,24 +141,18 @@ For example, if name format is `big_*-##.part` and original file name is
`big_data.txt-00.part`, the 99th chunk will be `big_data.txt-98.part` `big_data.txt-00.part`, the 99th chunk will be `big_data.txt-98.part`
and the 302nd chunk will become `big_data.txt-301.part`. and the 302nd chunk will become `big_data.txt-301.part`.
When the `list` rclone command scans a directory on wrapped remote, the Note that `list` assembles composite directory entries only when chunk names
potential chunk files are accounted for and merged into composite directory match the configured format and treats non-conforming file names as normal
entries only if their names match the configured format. All other files non-chunked files.
are ignored, including temporary chunks.
The list command might encounter composite files with missing or invalid
chunks. If chunker detects a missing chunk it will by default silently
ignore the whole group. You can use the `--chunker-fail-on-bad-chunks`
command line flag to make `list` fail with an error message.
### Metadata ### Metadata
By default when a file is large enough, chunker will create a metadata Besides data chunks chunker will by default create metadata object for
object besides data chunks. The object is named after the original file. a composite file. The object is named after the original file.
Chunker allows user to disable metadata completely (the `none` format). Chunker allows user to disable metadata completely (the `none` format).
Please note that currently metadata is not created for files smaller Note that metadata is normally not created for files smaller than the
than configured chunk size. This may change in future as new formats configured chunk size. This may change in future rclone releases.
are developed.
#### Simple JSON metadata format #### Simple JSON metadata format
@ -151,13 +161,13 @@ for composite files. Meta objects carry the following fields:
- `ver` - version of format, currently `1` - `ver` - version of format, currently `1`
- `size` - total size of composite file - `size` - total size of composite file
- `nchunks` - number of chunks in the file - `nchunks` - number of data chunks in file
- `md5` - MD5 hashsum of composite file (if present) - `md5` - MD5 hashsum of composite file (if present)
- `sha1` - SHA1 hashsum (if present) - `sha1` - SHA1 hashsum (if present)
There is no field for composite file name as it's simply equal to the name There is no field for composite file name as it's simply equal to the name
of meta object on the wrapped remote. Please refer to respective sections of meta object on the wrapped remote. Please refer to respective sections
for detils on hashsums and handling of modified time. for details on hashsums and modified time handling.
#### No metadata #### No metadata
@ -165,16 +175,15 @@ You can disable meta objects by setting the meta format option to `none`.
In this mode chunker will scan directory for all files that follow In this mode chunker will scan directory for all files that follow
configured chunk name format, group them by detecting chunks with the same configured chunk name format, group them by detecting chunks with the same
base name and show group names as virtual composite files. base name and show group names as virtual composite files.
When a download is requested, chunker will transparently assemble compound This method is more prone to missing chunk errors (especially missing
files by merging chunks in order. This method is more prone to missing chunk last chunk) than format with metadata enabled.
errors (especially missing last chunk) than metadata-enabled formats.
### Hashsums ### Hashsums
Chunker supports hashsums only when a compatible metadata is present. Chunker supports hashsums only when a compatible metadata is present.
Thus, if you choose metadata format of `none`, chunker will return Hence, if you choose metadata format of `none`, chunker will report hashsum
`UNSUPPORTED` as hashsum. as `UNSUPPORTED`.
Please note that metadata is stored only for composite files. If a file Please note that metadata is stored only for composite files. If a file
is small (smaller than configured chunk size), chunker will transparently is small (smaller than configured chunk size), chunker will transparently
@ -183,18 +192,19 @@ You will see the empty string as a hashsum of requested type for small
files if the wrapped remote doesn't support it. files if the wrapped remote doesn't support it.
Many storage backends support MD5 and SHA1 hash types, so does chunker. Many storage backends support MD5 and SHA1 hash types, so does chunker.
Currently you can choose one or another but not both. With chunker you can choose one or another but not both.
MD5 is set by default as the most supported type. MD5 is set by default as the most supported type.
Since chunker keeps hashes for composite files and falls back to the Since chunker keeps hashes for composite files and falls back to the
wrapped remote hash for small ones, we advise you to choose the same wrapped remote hash for non-chunked ones, we advise you to choose the same
hash type as wrapped remote so that your file listings look coherent. hash type as supported by wrapped remote so that your file listings
look coherent.
Normally, when a file is copied to a chunker controlled remote, chunker Normally, when a file is copied to chunker controlled remote, chunker
will ask the file source for compatible file hash and revert to on-the-fly will ask the file source for compatible file hash and revert to on-the-fly
calculation if none is found. This involves some CPU overhead but provides calculation if none is found. This involves some CPU overhead but provides
a guarantee that given hashsum is available. Also, chunker will reject a guarantee that given hashsum is available. Also, chunker will reject
a server-side copy or move operation if source and destination hashsum a server-side copy or move operation if source and destination hashsum
types are different, resulting in the extra network bandwidth, too. types are different resulting in the extra network bandwidth, too.
In some rare cases this may be undesired, so chunker provides two optional In some rare cases this may be undesired, so chunker provides two optional
choices: `sha1quick` and `md5quick`. If the source does not support primary choices: `sha1quick` and `md5quick`. If the source does not support primary
hash type and the quick mode is enabled, chunker will try to fall back to hash type and the quick mode is enabled, chunker will try to fall back to
@ -209,10 +219,10 @@ between source and target are not found.
Chunker stores modification times using the wrapped remote so support Chunker stores modification times using the wrapped remote so support
depends on that. For a small non-chunked file the chunker overlay simply depends on that. For a small non-chunked file the chunker overlay simply
manipulates modification time of the wrapped remote file. manipulates modification time of the wrapped remote file.
If file is large and metadata is present, then chunker will get and set For a composite file with metadata chunker will get and set
modification time of the metadata object on the wrapped remote. modification time of the metadata object on the wrapped remote.
If file is chunked but metadata format is `none` then chunker will If file is chunked but metadata format is `none` then chunker will
use modification time of the first chunk. use modification time of the first data chunk.
### Migrations ### Migrations
@ -222,11 +232,11 @@ chunk naming scheme is to:
- Collect all your chunked files under a directory and have your - Collect all your chunked files under a directory and have your
chunker remote point to it. chunker remote point to it.
- Create another directory (possibly on the same cloud storage) - Create another directory (most probably on the same cloud storage)
and configure a new remote with desired metadata format, and configure a new remote with desired metadata format,
hash type, chunk naming etc. hash type, chunk naming etc.
- Now run `rclone sync oldchunks: newchunks:` and all your data - Now run `rclone sync oldchunks: newchunks:` and all your data
will be transparently converted at transfer. will be transparently converted in transfer.
This may take some time, yet chunker will try server-side This may take some time, yet chunker will try server-side
copy if possible. copy if possible.
- After checking data integrity you may remove configuration section - After checking data integrity you may remove configuration section
@ -235,11 +245,11 @@ chunk naming scheme is to:
If rclone gets killed during a long operation on a big composite file, If rclone gets killed during a long operation on a big composite file,
hidden temporary chunks may stay in the directory. They will not be hidden temporary chunks may stay in the directory. They will not be
shown by the `list` command but will eat up your account quota. shown by the `list` command but will eat up your account quota.
Please note that the `deletefile` rclone command deletes only active Please note that the `deletefile` command deletes only active
chunks of a file. As a workaround, you can use remote of the wrapped chunks of a file. As a workaround, you can use remote of the wrapped
file system to see them. file system to see them.
An easy way to get rid of hidden garbage is to copy littered directory An easy way to get rid of hidden garbage is to copy littered directory
somewhere using the chunker remote and purge original directory. somewhere using the chunker remote and purge the original directory.
The `copy` command will copy only active chunks while the `purge` will The `copy` command will copy only active chunks while the `purge` will
remove everything including garbage. remove everything including garbage.
@ -260,7 +270,7 @@ Beware that in result of this some files which have been treated as chunks
before the change can pop up in directory listings as normal files before the change can pop up in directory listings as normal files
and vice versa. The same warning holds for the chunk size. and vice versa. The same warning holds for the chunk size.
If you desperately need to change critical chunking settings, you should If you desperately need to change critical chunking settings, you should
run data migration as described in a dedicated section. run data migration as described above.
If wrapped remote is case insensitive, the chunker overlay will inherit If wrapped remote is case insensitive, the chunker overlay will inherit
that property (so you can't have a file called "Hello.doc" and "hello.doc" that property (so you can't have a file called "Hello.doc" and "hello.doc"
@ -303,11 +313,11 @@ Choose how chunker handles hash sums.
- Examples: - Examples:
- "none" - "none"
- Chunker can pass any hash supported by wrapped remote - Chunker can pass any hash supported by wrapped remote
- for a single-chunk file but returns nothing otherwise. - for non-chunked files but returns nothing otherwise.
- "md5" - "md5"
- MD5 for multi-chunk files. Requires "simplejson". - MD5 for composite files. Requires "simplejson".
- "sha1" - "sha1"
- SHA1 for multi-chunk files. Requires "simplejson". - SHA1 for composite files. Requires "simplejson".
- "md5quick" - "md5quick"
- Copying a file to chunker will request MD5 from the source - Copying a file to chunker will request MD5 from the source
- falling back to SHA1 if unsupported. Requires "simplejson". - falling back to SHA1 if unsupported. Requires "simplejson".
@ -316,7 +326,7 @@ Choose how chunker handles hash sums.
### Advanced Options ### Advanced Options
Here are the advanced options specific to chunker (Transparently chunk/split large files). Here are the advanced options specific to chunker.
#### --chunker-name-format #### --chunker-name-format
@ -356,7 +366,7 @@ Metadata is a small JSON file named after the composite file.
- Do not use metadata files at all. Requires hash type "none". - Do not use metadata files at all. Requires hash type "none".
- "simplejson" - "simplejson"
- Simple JSON supports hash sums and chunk validation. - Simple JSON supports hash sums and chunk validation.
- It has the following fields: size, nchunks, md5, sha1. - It has the following fields: ver, size, nchunks, md5, sha1.
#### --chunker-fail-on-bad-chunks #### --chunker-fail-on-bad-chunks