From 22abd785eba35ca29cfcd6c162ca46e933344a1a Mon Sep 17 00:00:00 2001 From: Nick Craig-Wood Date: Tue, 24 May 2022 12:32:39 +0100 Subject: [PATCH] s3: implement reading and writing of metadata #111 --- backend/s3/s3.go | 273 +++++++++++++++++++++++++++------ backend/s3/s3_internal_test.go | 61 ++++++++ docs/content/s3.md | 20 +++ 3 files changed, 306 insertions(+), 48 deletions(-) create mode 100644 backend/s3/s3_internal_test.go diff --git a/backend/s3/s3.go b/backend/s3/s3.go index 7a2898264..1693446b0 100644 --- a/backend/s3/s3.go +++ b/backend/s3/s3.go @@ -71,6 +71,10 @@ func init() { } return nil, fmt.Errorf("unknown state %q", config.State) }, + MetadataInfo: &fs.MetadataInfo{ + System: systemMetadataInfo, + Help: `User metadata is stored as x-amz-meta- keys. S3 metadata keys are case insensitive and are always returned in lower case.`, + }, Options: []fs.Option{{ Name: fs.ConfigProvider, Help: "Choose your S3 provider.", @@ -1984,8 +1988,8 @@ circumstances or for testing. // Constants const ( - metaMtime = "Mtime" // the meta key to store mtime in - e.g. X-Amz-Meta-Mtime - metaMD5Hash = "Md5chksum" // the meta key to store md5hash in + metaMtime = "mtime" // the meta key to store mtime in - e.g. X-Amz-Meta-Mtime + metaMD5Hash = "md5chksum" // the meta key to store md5hash in // The maximum size of object we can COPY - this should be 5 GiB but is < 5 GB for b2 compatibility // See https://forum.rclone.org/t/copying-files-within-a-b2-bucket/16680/76 maxSizeForCopy = 4768 * 1024 * 1024 @@ -2000,6 +2004,57 @@ const ( maxExpireDuration = fs.Duration(7 * 24 * time.Hour) // max expiry is 1 week ) +// system metadata keys which this backend owns +var systemMetadataInfo = map[string]fs.MetadataHelp{ + "cache-control": { + Help: "Cache-Control header", + Type: "string", + Example: "no-cache", + }, + "content-disposition": { + Help: "Content-Disposition header", + Type: "string", + Example: "inline", + }, + "content-encoding": { + Help: "Content-Encoding header", + Type: "string", + Example: "gzip", + }, + "content-language": { + Help: "Content-Language header", + Type: "string", + Example: "en-US", + }, + "content-type": { + Help: "Content-Type header", + Type: "string", + Example: "text/plain", + }, + // "tagging": { + // Help: "x-amz-tagging header", + // Type: "string", + // Example: "tag1=value1&tag2=value2", + // }, + "tier": { + Help: "Tier of the object", + Type: "string", + Example: "GLACIER", + ReadOnly: true, + }, + "mtime": { + Help: "Time of last modification, read from rclone metadata", + Type: "RFC 3339", + Example: "2006-01-02T15:04:05.999999999Z07:00", + }, + "btime": { + Help: "Time of file birth (creation) read from Last-Modified header", + Type: "RFC 3339", + Example: "2006-01-02T15:04:05.999999999Z07:00", + ReadOnly: true, + }, +} + // Options defines the configuration for this backend type Options struct { Provider string `config:"provider"` @@ -2079,7 +2134,13 @@ type Object struct { lastModified time.Time // Last modified meta map[string]string // The object metadata if known - may be nil - with lower case keys mimeType string // MimeType of object - may be "" - storageClass string // e.g. GLACIER + + // Metadata as pointers to strings as they often won't be present + storageClass *string // e.g. GLACIER + cacheControl *string // Cache-Control: header + contentDisposition *string // Content-Disposition: header + contentEncoding *string // Content-Encoding: header + contentLanguage *string // Content-Language: header } // ------------------------------------------------------------ @@ -2573,6 +2634,9 @@ func NewFs(ctx context.Context, name, root string, m configmap.Mapper) (fs.Fs, e f.features = (&fs.Features{ ReadMimeType: true, WriteMimeType: true, + ReadMetadata: true, + WriteMetadata: true, + UserMetadata: true, BucketBased: true, BucketBasedRootOK: true, SetTier: true, @@ -2623,7 +2687,7 @@ func (f *Fs) newObjectWithInfo(ctx context.Context, remote string, info *s3.Obje } o.setMD5FromEtag(aws.StringValue(info.ETag)) o.bytes = aws.Int64Value(info.Size) - o.storageClass = aws.StringValue(info.StorageClass) + o.storageClass = info.StorageClass } else if !o.fs.opt.NoHeadObject { err := o.readMetaData(ctx) // reads info and meta, returning an error if err != nil { @@ -3491,7 +3555,7 @@ func (f *Fs) Command(ctx context.Context, name string, arg []string, opt map[str st.Status = "Not an S3 object" return } - if o.storageClass != "GLACIER" && o.storageClass != "DEEP_ARCHIVE" { + if o.storageClass == nil || (*o.storageClass != "GLACIER" && *o.storageClass != "DEEP_ARCHIVE") { st.Status = "Not GLACIER or DEEP_ARCHIVE storage class" return } @@ -3749,7 +3813,8 @@ func (o *Object) readMetaData(ctx context.Context) (err error) { if err != nil { return err } - o.setMetaData(resp.ETag, resp.ContentLength, resp.LastModified, resp.Metadata, resp.ContentType, resp.StorageClass) + o.setMetaData(resp) + // resp.ETag, resp.ContentLength, resp.LastModified, resp.Metadata, resp.ContentType, resp.StorageClass) return nil } @@ -3774,14 +3839,14 @@ func mapToS3Metadata(meta map[string]string) map[string]*string { return s3Meta } -func (o *Object) setMetaData(etag *string, contentLength *int64, lastModified *time.Time, meta map[string]*string, mimeType *string, storageClass *string) { +func (o *Object) setMetaData(resp *s3.HeadObjectOutput) { // Ignore missing Content-Length assuming it is 0 // Some versions of ceph do this due their apache proxies - if contentLength != nil { - o.bytes = *contentLength + if resp.ContentLength != nil { + o.bytes = *resp.ContentLength } - o.setMD5FromEtag(aws.StringValue(etag)) - o.meta = s3MetadataToMap(meta) + o.setMD5FromEtag(aws.StringValue(resp.ETag)) + o.meta = s3MetadataToMap(resp.Metadata) // Read MD5 from metadata if present if md5sumBase64, ok := o.meta[metaMD5Hash]; ok { md5sumBytes, err := base64.StdEncoding.DecodeString(md5sumBase64) @@ -3793,14 +3858,20 @@ func (o *Object) setMetaData(etag *string, contentLength *int64, lastModified *t o.md5 = hex.EncodeToString(md5sumBytes) } } - o.storageClass = aws.StringValue(storageClass) - if lastModified == nil { + if resp.LastModified == nil { o.lastModified = time.Now() fs.Logf(o, "Failed to read last modified") } else { - o.lastModified = *lastModified + o.lastModified = *resp.LastModified } - o.mimeType = aws.StringValue(mimeType) + o.mimeType = aws.StringValue(resp.ContentType) + + // Set system metadata + o.storageClass = resp.StorageClass + o.cacheControl = resp.CacheControl + o.contentDisposition = resp.ContentDisposition + o.contentEncoding = resp.ContentEncoding + o.contentLanguage = resp.ContentLanguage } // ModTime returns the modification time of the object @@ -3839,7 +3910,7 @@ func (o *Object) SetModTime(ctx context.Context, modTime time.Time) error { o.meta[metaMtime] = swift.TimeToFloatString(modTime) // Can't update metadata here, so return this error to force a recopy - if o.storageClass == "GLACIER" || o.storageClass == "DEEP_ARCHIVE" { + if o.storageClass != nil && (*o.storageClass == "GLACIER" || *o.storageClass == "DEEP_ARCHIVE") { return fs.ErrorCantSetModTime } @@ -3900,17 +3971,34 @@ func (o *Object) downloadFromURL(ctx context.Context, bucketPath string, options metaData := make(map[string]*string) for key, value := range resp.Header { - if strings.HasPrefix(key, "x-amz-meta") { + key = strings.ToLower(key) + if strings.HasPrefix(key, "x-amz-meta-") { metaKey := strings.TrimPrefix(key, "x-amz-meta-") - metaData[strings.Title(metaKey)] = &value[0] + metaData[metaKey] = &value[0] } } - storageClass := resp.Header.Get("X-Amz-Storage-Class") - contentType := resp.Header.Get("Content-Type") - etag := resp.Header.Get("Etag") + header := func(k string) *string { + v := resp.Header.Get(k) + if v == "" { + return nil + } + return &v + } - o.setMetaData(&etag, contentLength, &lastModified, metaData, &contentType, &storageClass) + var head = s3.HeadObjectOutput{ + ETag: header("Etag"), + ContentLength: contentLength, + LastModified: &lastModified, + Metadata: metaData, + CacheControl: header("Cache-Control"), + ContentDisposition: header("Content-Disposition"), + ContentEncoding: header("Content-Encoding"), + ContentLanguage: header("Content-Language"), + ContentType: header("Content-Type"), + StorageClass: header("X-Amz-Storage-Class"), + } + o.setMetaData(&head) return resp.Body, err } @@ -3985,7 +4073,10 @@ func (o *Object) Open(ctx context.Context, options ...fs.OpenOption) (in io.Read fs.Debugf(o, "Failed to find length in %q", contentRange) } } - o.setMetaData(resp.ETag, size, resp.LastModified, resp.Metadata, resp.ContentType, resp.StorageClass) + var head s3.HeadObjectOutput + structs.SetFrom(&head, resp) + head.ContentLength = size + o.setMetaData(&head) return resp.Body, nil } @@ -4322,11 +4413,56 @@ func (o *Object) Update(ctx context.Context, in io.Reader, src fs.ObjectInfo, op multipart := size < 0 || size >= int64(o.fs.opt.UploadCutoff) - // Set the mtime in the meta data - metadata := map[string]*string{ - metaMtime: aws.String(swift.TimeToFloatString(modTime)), + req := s3.PutObjectInput{ + Bucket: &bucket, + ACL: &o.fs.opt.ACL, + Key: &bucketPath, } + // Fetch metadata if --metadata is in use + meta, err := fs.GetMetadataOptions(ctx, src, options) + if err != nil { + return fmt.Errorf("failed to read metadata from source object: %w", err) + } + req.Metadata = make(map[string]*string, len(meta)+2) + // merge metadata into request and user metadata + for k, v := range meta { + pv := aws.String(v) + k = strings.ToLower(k) + switch k { + case "cache-control": + req.CacheControl = pv + case "content-disposition": + req.ContentDisposition = pv + case "content-encoding": + req.ContentEncoding = pv + case "content-language": + req.ContentLanguage = pv + case "content-type": + req.ContentType = pv + case "x-amz-tagging": + req.Tagging = pv + case "tier": + // ignore + case "mtime": + // mtime in meta overrides source ModTime + metaModTime, err := time.Parse(time.RFC3339Nano, v) + if err != nil { + fs.Debugf(o, "failed to parse metadata %s: %q: %v", k, v, err) + } else { + modTime = metaModTime + } + case "btime": + // write as metadata since we can't set it + req.Metadata[k] = pv + default: + req.Metadata[k] = pv + } + } + + // Set the mtime in the meta data + req.Metadata[metaMtime] = aws.String(swift.TimeToFloatString(modTime)) + // read the md5sum if available // - for non multipart // - so we can add a ContentMD5 @@ -4346,20 +4482,15 @@ func (o *Object) Update(ctx context.Context, in io.Reader, src fs.ObjectInfo, op // - a multipart upload // - the Etag is not an MD5, eg when using SSE/SSE-C // provided checksums aren't disabled - metadata[metaMD5Hash] = &md5sumBase64 + req.Metadata[metaMD5Hash] = &md5sumBase64 } } } } - // Guess the content type - mimeType := fs.MimeType(ctx, src) - req := s3.PutObjectInput{ - Bucket: &bucket, - ACL: &o.fs.opt.ACL, - Key: &bucketPath, - ContentType: &mimeType, - Metadata: metadata, + // Set the content type it it isn't set already + if req.ContentType == nil { + req.ContentType = aws.String(fs.MimeType(ctx, src)) } if size >= 0 { req.ContentLength = &size @@ -4438,19 +4569,19 @@ func (o *Object) Update(ctx context.Context, in io.Reader, src fs.ObjectInfo, op // so make up the object as best we can assuming it got // uploaded properly. If size < 0 then we need to do the HEAD. if o.fs.opt.NoHead && size >= 0 { - o.md5 = md5sumHex - o.bytes = size - o.lastModified = time.Now() - o.meta = s3MetadataToMap(req.Metadata) - o.mimeType = aws.StringValue(req.ContentType) - o.storageClass = aws.StringValue(req.StorageClass) + var head s3.HeadObjectOutput + structs.SetFrom(&head, req) + head.ETag = &md5sumHex // doesn't matter quotes are misssing + head.ContentLength = &size // If we have done a single part PUT request then we can read these if gotEtag != "" { - o.setMD5FromEtag(gotEtag) + head.ETag = &gotEtag } - if !o.lastModified.IsZero() { - o.lastModified = lastModified + if lastModified.IsZero() { + lastModified = time.Now() } + head.LastModified = &lastModified + o.setMetaData(&head) return nil } @@ -4460,7 +4591,7 @@ func (o *Object) Update(ctx context.Context, in io.Reader, src fs.ObjectInfo, op if err != nil { return err } - o.setMetaData(head.ETag, head.ContentLength, head.LastModified, head.Metadata, head.ContentType, head.StorageClass) + o.setMetaData(head) if o.fs.opt.UseMultipartEtag.Value && !o.fs.etagIsNotMD5 && wantETag != "" && head.ETag != nil && *head.ETag != "" { gotETag := strings.Trim(strings.ToLower(*head.ETag), `"`) if wantETag != gotETag { @@ -4511,16 +4642,61 @@ func (o *Object) SetTier(tier string) (err error) { if err != nil { return err } - o.storageClass = tier + o.storageClass = &tier return err } // GetTier returns storage class as string func (o *Object) GetTier() string { - if o.storageClass == "" { + if o.storageClass == nil || *o.storageClass == "" { return "STANDARD" } - return o.storageClass + return *o.storageClass +} + +// Metadata returns metadata for an object +// +// It should return nil if there is no Metadata +func (o *Object) Metadata(ctx context.Context) (metadata fs.Metadata, err error) { + err = o.readMetaData(ctx) + if err != nil { + return nil, err + } + metadata = make(fs.Metadata, len(o.meta)+7) + for k, v := range o.meta { + switch k { + case metaMtime: + if modTime, err := swift.FloatStringToTime(v); err == nil { + metadata["mtime"] = modTime.Format(time.RFC3339Nano) + } + case metaMD5Hash: + // don't write hash metadata + default: + metadata[k] = v + } + } + if o.mimeType != "" { + metadata["content-type"] = o.mimeType + } + // metadata["x-amz-tagging"] = "" + if !o.lastModified.IsZero() { + metadata["btime"] = o.lastModified.Format(time.RFC3339Nano) + } + + // Set system metadata + setMetadata := func(k string, v *string) { + if v == nil || *v == "" { + return + } + metadata[k] = *v + } + setMetadata("cache-control", o.cacheControl) + setMetadata("content-disposition", o.contentDisposition) + setMetadata("content-encoding", o.contentEncoding) + setMetadata("content-language", o.contentLanguage) + setMetadata("tier", o.storageClass) + + return metadata, nil } // Check the interfaces are satisfied @@ -4535,4 +4711,5 @@ var ( _ fs.MimeTyper = &Object{} _ fs.GetTierer = &Object{} _ fs.SetTierer = &Object{} + _ fs.Metadataer = &Object{} ) diff --git a/backend/s3/s3_internal_test.go b/backend/s3/s3_internal_test.go new file mode 100644 index 000000000..ecd39f41f --- /dev/null +++ b/backend/s3/s3_internal_test.go @@ -0,0 +1,61 @@ +package s3 + +import ( + "context" + "fmt" + "testing" + "time" + + "github.com/rclone/rclone/fs" + "github.com/rclone/rclone/fstest" + "github.com/rclone/rclone/fstest/fstests" + "github.com/rclone/rclone/lib/random" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func (f *Fs) InternalTestMetadata(t *testing.T) { + ctx := context.Background() + contents := random.String(100) + item := fstest.NewItem("test-metadata", contents, fstest.Time("2001-05-06T04:05:06.499999999Z")) + btime := time.Now() + metadata := fs.Metadata{ + "cache-control": "no-cache", + "content-disposition": "inline", + "content-encoding": "gzip", + "content-language": "en-US", + "content-type": "text/plain", + "mtime": "2009-05-06T04:05:06.499999999Z", + // "tier" - read only + // "btime" - read only + } + obj := fstests.PutTestContentsMetadata(ctx, t, f, &item, contents, true, "text/html", metadata) + defer func() { + assert.NoError(t, obj.Remove(ctx)) + }() + o := obj.(*Object) + gotMetadata, err := o.Metadata(ctx) + require.NoError(t, err) + for k, v := range metadata { + got := gotMetadata[k] + switch k { + case "mtime": + assert.True(t, fstest.Time(v).Equal(fstest.Time(got))) + case "btime": + gotBtime := fstest.Time(got) + dt := gotBtime.Sub(btime) + assert.True(t, dt < time.Minute && dt > -time.Minute, fmt.Sprintf("btime more than 1 minute out want %v got %v delta %v", btime, gotBtime, dt)) + assert.True(t, fstest.Time(v).Equal(fstest.Time(got))) + case "tier": + assert.NotEqual(t, "", got) + default: + assert.Equal(t, v, got, k) + } + } +} + +func (f *Fs) InternalTest(t *testing.T) { + t.Run("Metadata", f.InternalTestMetadata) +} + +var _ fstests.InternalTester = (*Fs)(nil) diff --git a/docs/content/s3.md b/docs/content/s3.md index c2adbfe95..ddf9fd23d 100644 --- a/docs/content/s3.md +++ b/docs/content/s3.md @@ -2369,6 +2369,26 @@ Properties: - Type: Tristate - Default: unset +### Metadata + +User metadata is stored as x-amz-meta- keys. S3 metadata keys are case insensitive and are always returned in lower case. + +Here are the possible system metadata items for the s3 backend. + +| Name | Help | Type | Example | Read Only | +|------|------|------|---------|-----------| +| btime | Time of file birth (creation) read from Last-Modified header | RFC 3339 | 2006-01-02T15:04:05.999999999Z07:00 | **Y** | +| cache-control | Cache-Control header | string | no-cache | N | +| content-disposition | Content-Disposition header | string | inline | N | +| content-encoding | Content-Encoding header | string | gzip | N | +| content-language | Content-Language header | string | en-US | N | +| content-type | Content-Type header | string | text/plain | N | +| mtime | Time of last modification, read from rclone metadata | RFC 3339 | 2006-01-02T15:04:05.999999999Z07:00 | N | +| tier | Tier of the object | string | GLACIER | **Y** | + + +See the [metadata](/docs/#metadata) docs for more info. + ## Backend commands Here are the commands specific to the s3 backend.