From 424aaac2e1b3f3af1c55385198cc3644a4306d34 Mon Sep 17 00:00:00 2001 From: Klaus Post Date: Mon, 4 Jan 2021 17:09:09 +0100 Subject: [PATCH] encoder/filename: Add SCSU as tables Instead of only adding SCSU, add it as an existing table. Allow direct SCSU and add a, perhaps, reasonable table as well. Add byte interfaces that don't base64 encode the URL as well with `EncodeBytes` and `DecodeBytes`. Fuzz tested and decode tests added. --- go.mod | 1 + go.sum | 3 + lib/encoder/filename/decode.go | 13 +++- lib/encoder/filename/decode_test.go | 92 +++++++++++++++++++++++++++++ lib/encoder/filename/encode.go | 35 +++++++++-- lib/encoder/filename/fuzz.go | 4 +- lib/encoder/filename/gentable.go | 64 +++++++++++++++----- lib/encoder/filename/init.go | 13 +++- 8 files changed, 201 insertions(+), 24 deletions(-) create mode 100644 lib/encoder/filename/decode_test.go diff --git a/go.mod b/go.mod index 4f3e6cd64..13c94287a 100644 --- a/go.mod +++ b/go.mod @@ -21,6 +21,7 @@ require ( github.com/calebcase/tmpfile v1.0.2 // indirect github.com/colinmarc/hdfs/v2 v2.2.0 github.com/coreos/go-semver v0.3.0 + github.com/dop251/scsu v0.0.0-20200422003335-8fadfb689669 github.com/dropbox/dropbox-sdk-go-unofficial v5.6.0+incompatible github.com/gabriel-vasile/mimetype v1.1.2 github.com/gogo/protobuf v1.3.2 // indirect diff --git a/go.sum b/go.sum index d23f53196..135d8930f 100644 --- a/go.sum +++ b/go.sum @@ -171,11 +171,14 @@ github.com/dgrijalva/jwt-go v3.2.0+incompatible h1:7qlOGliEKZXTDg6OTjfoBKDXWrumC github.com/dgrijalva/jwt-go v3.2.0+incompatible/go.mod h1:E3ru+11k8xSBh+hMPgOLZmtrrCbhqsmaPHjLKYnJCaQ= github.com/dgryski/go-sip13 v0.0.0-20181026042036-e10d5fee7954/go.mod h1:vAd38F8PWV+bWy6jNmig1y/TA+kYO4g3RSRF0IAv0no= github.com/docopt/docopt-go v0.0.0-20180111231733-ee0de3bc6815/go.mod h1:WwZ+bS3ebgob9U8Nd0kOddGdZWjyMGR8Wziv+TBNwSE= +github.com/dop251/scsu v0.0.0-20200422003335-8fadfb689669 h1:e28M2/odOZjMc1J2ZZwgex6NM9+aqr1nMlTqPLayxbk= +github.com/dop251/scsu 
v0.0.0-20200422003335-8fadfb689669/go.mod h1:Gth7Xev0h28tuTayG4HlTZy90IXhiDgV2+MLtJzjpP0= github.com/dropbox/dropbox-sdk-go-unofficial v5.6.0+incompatible h1:DtumzkLk2zZ2SeElEr+VNz+zV7l+BTe509cV4sKPXbM= github.com/dropbox/dropbox-sdk-go-unofficial v5.6.0+incompatible/go.mod h1:lr+LhMM3F6Y3lW1T9j2U5l7QeuWm87N9+PPXo3yH4qY= github.com/dustin/go-humanize v0.0.0-20171111073723-bb3d318650d4/go.mod h1:HtrtbFcZ19U5GC7JDqmcUSB87Iq5E25KnS6fMYU6eOk= github.com/dustin/go-humanize v0.0.0-20180421182945-02af3965c54e/go.mod h1:HtrtbFcZ19U5GC7JDqmcUSB87Iq5E25KnS6fMYU6eOk= github.com/dustin/go-humanize v1.0.0/go.mod h1:HtrtbFcZ19U5GC7JDqmcUSB87Iq5E25KnS6fMYU6eOk= +github.com/dvyukov/go-fuzz v0.0.0-20200318091601-be3528f3a813 h1:NgO45/5mBLRVfiXerEFzH6ikcZ7DNRPS639xFg3ENzU= github.com/dvyukov/go-fuzz v0.0.0-20200318091601-be3528f3a813/go.mod h1:11Gm+ccJnvAhCNLlf5+cS9KjtbaD5I5zaZpFMsTHWTw= github.com/eapache/go-resiliency v1.1.0/go.mod h1:kFI+JgMyC7bLPUVY133qvEBtVayf5mFgVsvEsIPBvNs= github.com/eapache/go-xerial-snappy v0.0.0-20180814174437-776d5712da21/go.mod h1:+020luEh2TKB4/GOp8oxxtq0Daoen/Cii55CzbTV6DU= diff --git a/lib/encoder/filename/decode.go b/lib/encoder/filename/decode.go index 75f87ccd5..6e859fd31 100644 --- a/lib/encoder/filename/decode.go +++ b/lib/encoder/filename/decode.go @@ -7,6 +7,7 @@ import ( "errors" "sync" + "github.com/dop251/scsu" "github.com/klauspost/compress/huff0" ) @@ -22,6 +23,7 @@ var customDecMu sync.Mutex // Decode an encoded string. func Decode(s string) (string, error) { + initCoders() if len(s) < 1 { return "", ErrCorrupted } @@ -31,19 +33,25 @@ func Decode(s string) (string, error) { } table-- s = s[1:] - data := make([]byte, base64.URLEncoding.DecodedLen(len(s))) n, err := base64.URLEncoding.Decode(data, ([]byte)(s)) if err != nil || n < 0 { return "", ErrCorrupted } data = data[:n] + return DecodeBytes(table, data) +} +// DecodeBytes will decode raw id and data values. 
+func DecodeBytes(table byte, data []byte) (string, error) { + initCoders() switch table { case tableUncompressed: return string(data), nil case tableReserved: return "", ErrUnsupported + case tableSCSUPlain: + return scsu.Decode(data) case tableRLE: if len(data) < 2 { return "", ErrCorrupted @@ -79,6 +87,9 @@ func Decode(s string) (string, error) { if err != nil { return "", ErrCorrupted } + if table == tableSCSU { + return scsu.Decode(name) + } return string(name), nil } } diff --git a/lib/encoder/filename/decode_test.go b/lib/encoder/filename/decode_test.go new file mode 100644 index 000000000..6d377ac09 --- /dev/null +++ b/lib/encoder/filename/decode_test.go @@ -0,0 +1,92 @@ +package filename + +import "testing" + +func TestDecode(t *testing.T) { + tests := []struct { + name string + encoded string + want string + wantErr bool + }{ + { + name: "unicode-1", + encoded: "8D5V3MESVd-WEF7WuqaOvpKUWtYGEyw5UDQ==", + want: "長い長いUNICODEファイル名", + wantErr: false, + }, + { + name: "unicode-2", + encoded: "8GyHV1N7u2OEg4ufQ3eHQ3Ngg6N3X0CDg4-HX0NXU2tg=", + want: "ვეპხის ტყაოსანი შოთა რუსთაველი", + wantErr: false, + }, + { + name: "unicode-3", + encoded: "7LpehMXOrWe7mcT_lpf2MN1Nmgu55jpXHLavZcXJb2UTJ-UmGU15iznkD", + want: "Sønderjysk: Æ ka æe glass uhen at det go mæ naue.,", + wantErr: false, + }, + { + name: "unicode-4", + encoded: "7TCSRm0liJDR0ulpBq4Lla_XB2mWdLFMEs8wEQKHAGa8FRr333ntJ6Ww6_f__N5VKeYM=", + want: "Hello------world 時危兵甲滿天涯,載道流離起怨咨.bin", + }, + { + name: "plain-1", + encoded: "BzGQYxqHBA6ljTsir80gUM5Y=", + want: "-Duplican99E8ZI4___9_", + wantErr: false, + }, + { + name: "hex-1", + encoded: "D_--tHZROQpqqJ9PafqNa6STF", + want: "13646871dfabbs43323564654bbefff", + wantErr: false, + }, + { + name: "base64-1", + encoded: "FMpABB9Ef0KP8OrVxjnE3LzUePuLZi8pPg7eW8bgyW2d3Ucckf4rlE0mkAvlILVpOmF3L-rFbmNrpUO2HQFlF4SCMPVPeCEX6LeOg5JVpUVCXV1WSazD9vSpr", + want: 
"UxAYiB0FNTTkXRw9P8hwq-WmN7tYwbe-sFw8C3snDRG1d-yjrdOUVZQyLdtkJ8tuvhBSnuBiLjVieCAroWEZDIO4Hb_rKgdzPjMqFE7inwHJ2isF==", + wantErr: false, + }, + { + name: "custom-1", + encoded: "-BeADJCoG_________________xc=", + want: "Uaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + wantErr: false, + }, + { + name: "rle-1", + encoded: "9a2E=", + want: "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + wantErr: false, + }, + { + name: "regular-1", + encoded: "BeSSrnzj0j3OXyR9K81M=", + want: "regular-filename.txt", + wantErr: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got, err := Decode(tt.encoded) + if (err != nil) != tt.wantErr { + if tt.encoded == "" && tt.want != "" { + proposed := Encode(tt.want) + table := decodeMap[proposed[0]] - 1 + t.Errorf("No encoded value, try '%s', table is %d", proposed, table) + return + } + t.Errorf("Decode() error = %v, wantErr %v", err, tt.wantErr) + return + } + + if got != tt.want { + t.Errorf("Decode() got = %v, want %v", got, tt.want) + } + }) + } +} diff --git a/lib/encoder/filename/encode.go b/lib/encoder/filename/encode.go index 555bf6b91..031dbd07a 100644 --- a/lib/encoder/filename/encode.go +++ b/lib/encoder/filename/encode.go @@ -4,6 +4,7 @@ import ( "encoding/base64" "encoding/binary" + "github.com/dop251/scsu" "github.com/klauspost/compress/huff0" ) @@ -11,21 +12,45 @@ import ( // Calling Decode with the returned string should always succeed. // It is not a requirement that the input string is valid utf-8. func Encode(s string) string { + table, payload := EncodeBytes(s) + return string(encodeURL[table]) + base64.URLEncoding.EncodeToString(payload) +} + +// EncodeBytes will compress the given string and return a table identifier and a payload. 
+func EncodeBytes(s string) (table byte, payload []byte) { initCoders() bestSize := len(s) - bestTable := tableUncompressed + bestTable := byte(tableUncompressed) org := []byte(s) bestOut := []byte(s) - // Try all tables and choose the best for i, enc := range encTables[:] { + org := org if len(org) <= 1 || len(org) > maxLength { // Use the uncompressed break } + if enc == nil { continue } + + if i == tableSCSU { + var err error + olen := len(org) + org, err = scsu.EncodeStrict(s, make([]byte, 0, len(org))) + if err != nil || olen <= len(org) { + continue + } + if len(org) < bestSize { + // This is already better, store so we can use if the table cannot. + bestOut = bestOut[:len(org)] + bestTable = tableSCSUPlain + bestSize = len(org) + copy(bestOut, org) + } + } + // Try to encode using table. err := func() error { encTableLocks[i].Lock() @@ -36,14 +61,14 @@ func Encode(s string) string { } if len(out) < bestSize { bestOut = bestOut[:len(out)] - bestTable = i + bestTable = byte(i) bestSize = len(out) copy(bestOut, out) } return nil }() // If input is a single byte repeated store as RLE or save uncompressed. - if err == huff0.ErrUseRLE { + if err == huff0.ErrUseRLE && i != tableSCSU { if len(org) > 2 { // Encode as one byte repeated since it will be smaller than uncompressed. 
n := binary.PutUvarint(bestOut, uint64(len(org))) @@ -56,5 +81,5 @@ func Encode(s string) string { } } - return string(encodeURL[bestTable]) + base64.URLEncoding.EncodeToString(bestOut) + return bestTable, bestOut } diff --git a/lib/encoder/filename/fuzz.go b/lib/encoder/filename/fuzz.go index 73468ee71..5983a407f 100644 --- a/lib/encoder/filename/fuzz.go +++ b/lib/encoder/filename/fuzz.go @@ -25,7 +25,9 @@ func Fuzz(data []byte) int { panic(fmt.Sprintf("error decoding %q, input %q: %v", enc, string(data), err)) } if !bytes.Equal(data, []byte(decoded)) { - panic(fmt.Sprintf("decode mismatch, encoded: %q, org: %q, got: %q", enc, string(data), decoded)) + table := decodeMap[enc[0]] + table-- + panic(fmt.Sprintf("decode mismatch, encoded: %q, org: %q, got: %q, table %d", enc, string(data), decoded, int(table))) } // Everything is good. diff --git a/lib/encoder/filename/gentable.go b/lib/encoder/filename/gentable.go index 4acb61707..5475c60ab 100644 --- a/lib/encoder/filename/gentable.go +++ b/lib/encoder/filename/gentable.go @@ -3,21 +3,33 @@ package main import ( + "bufio" + "bytes" "encoding/base64" + "flag" "fmt" + "io/ioutil" "math" + "strings" + "unicode/utf8" + "github.com/dop251/scsu" "github.com/klauspost/compress" "github.com/klauspost/compress/huff0" ) -// Replace/add histogram data and execute go run gentable.go +// execute go run gentable.go +var indexFile = flag.String("index", "", "Index this file for table") + +// Allow non-represented characters. +var addUnused = flag.Bool("all", true, "Make all bytes possible") +var scsuEncode = flag.Bool("scsu", false, "SCSU encode on each line before table") func main() { - // Allow non-represented characters. 
- const omitUnused = false + flag.Parse() histogram := [256]uint64{ + // Replace/add histogram data and execute go run gentable.go // ncw home directory //0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 19442, 760, 0, 349, 570, 1520, 199, 76, 685, 654, 0, 40377, 1605, 395132, 935270, 0, 1156377, 887730, 811737, 712241, 693240, 689139, 675964, 656417, 666577, 657413, 532, 24, 0, 145, 0, 3, 946, 44932, 37362, 46126, 36752, 76346, 19338, 47457, 14288, 38163, 4350, 7867, 36541, 65011, 30255, 26792, 22097, 1803, 39191, 61965, 76585, 11887, 12896, 5931, 1935, 1731, 1385, 1279, 9, 1278, 1, 420185, 0, 1146359, 746359, 968896, 868703, 1393640, 745019, 354147, 159462, 483979, 169092, 75937, 385858, 322166, 466635, 571268, 447132, 13792, 446484, 736844, 732675, 170232, 112983, 63184, 142357, 173945, 21521, 250, 0, 250, 4140, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 39, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 4, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 15, 0, 0, 0, 10, 0, 5, 0, 0, 0, 0, 0, 0, 283, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //Images: @@ -26,16 +38,40 @@ func main() { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 459, 0, 0, 7, 0, 0, 0, 7, 1, 1, 0, 2, 1, 506, 706, 0, 3903, 3552, 3694, 3338, 3262, 3257, 3222, 3249, 3325, 3261, 5, 0, 0, 1, 0, 0, 0, 48, 31, 61, 53, 46, 17, 17, 34, 32, 9, 22, 17, 31, 27, 19, 52, 5, 46, 84, 38, 14, 5, 19, 2, 2, 0, 8, 0, 8, 0, 180, 0, 5847, 3282, 3729, 3695, 3842, 3356, 316, 139, 487, 117, 95, 476, 289, 428, 609, 467, 5, 446, 592, 955, 130, 112, 57, 390, 168, 14, 0, 2, 0, 44, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, } - // Override with equally distributed characters - if false { - histogram = [256]uint64{} - var chars string - // base c64 - chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_" - // hex - //chars = "0123456789abcdef" - for _, v := range []byte(chars) { - histogram[v] = 1 + if *indexFile != "" { + for i := range histogram[:] { + histogram[i] = 0 + } + b, err := ioutil.ReadFile(*indexFile) + if err != nil { + panic(err) + } + if *scsuEncode { + br := bufio.NewReader(bytes.NewBuffer(b)) + var encoded []byte + for { + line, err := br.ReadString('\n') + if err != nil { + break + } + line = strings.TrimSpace(line) + if len(line) < 3 || !utf8.ValidString(line) { + continue + } + e, err := scsu.Encode(line, nil) + if err != nil { + panic(err) + } + if len(e) >= len([]byte(line)) { + continue + } + encoded = append(encoded, e...) + } + fmt.Println("scsu", len(b), "->", len(encoded), "(excluding bigger)") + b = encoded + } + for _, v := range b { + histogram[v]++ } } @@ -49,7 +85,7 @@ func main() { const scale = 100 << 10 var tmp []byte for i, v := range histogram[:] { - if v == 0 && omitUnused { + if v == 0 && !*addUnused { continue } nf := float64(v) / float64(total) * scale diff --git a/lib/encoder/filename/init.go b/lib/encoder/filename/init.go index 1a7ae41ac..6f6bad147 100644 --- a/lib/encoder/filename/init.go +++ b/lib/encoder/filename/init.go @@ -26,9 +26,12 @@ var ( const ( tableUncompressed = 0 - tableRLE = 61 - tableCustom = 62 - tableReserved = 63 + + tableSCSU = 59 + tableSCSUPlain = 60 + tableRLE = 61 + tableCustom = 62 + tableReserved = 63 ) // predefined tables as base64 URL encoded string. 
@@ -47,6 +50,10 @@ var tablesData = [64]string{ 5: "JRDIcQf_______8PgIiIiIgINkggARHlkQwSSCCBxHFYINHdfXI=", // Special tables: + // SCSU and a fairly generic table: + tableSCSU: "UxAgZmEB-RYPU8hrnAk6uMgpTNQMB5MGRBx0D3T0JjyUyY-yOi5CoGgktbAktSh7d36HtPTFu7SXJ7FYw_AYmA74ZH2vWgc8O6Z5jLnWnsFqU_4B", + // SCSU with no table... + tableSCSUPlain: "", // Compressed data has its own table. tableCustom: "", // Reserved for extension.