From 190dd8853c1cd43332f552be95a8135f1291ec96 Mon Sep 17 00:00:00 2001 From: Ben McClelland Date: Tue, 21 May 2024 16:02:04 -0700 Subject: [PATCH] fix: expose posix tmpfile fd to enable copy_file_range The complete multipart upload can be optimized in some cases to not need to copy the full data from parts to the final object file. If the filesystem supports it, there can be optimizations to just clone exent references and not have to actually copy the data. For io.Copy() to make use of file_copy_range, we have to pass it the *os.File file handles from the filesystem for the source and destination. We were previously adding a layer of indirection that was causing io.Copy() to fallback to full data copy. This fixes the copy by exposing the underlying fd. This also skips the falloc for the final object in complete mutlipart upload, because some filesystems will be able to use file_copy_range to optimize the copy and may not even need new data allocations in the final object file. Note that this only affects posix mode as scoutfs has special handling for this case that is similar to but not compatible with copy_file_range using a special ioctl. --- backend/posix/posix.go | 13 ++++++++----- backend/posix/with_otmpfile.go | 10 +++++++--- backend/posix/without_otmpfile.go | 6 +++++- 3 files changed, 20 insertions(+), 9 deletions(-) diff --git a/backend/posix/posix.go b/backend/posix/posix.go index ca14dde8..304c0716 100644 --- a/backend/posix/posix.go +++ b/backend/posix/posix.go @@ -79,6 +79,9 @@ const ( bucketLockKey = "bucket-lock" objectRetentionKey = "object-retention" objectLegalHoldKey = "object-legal-hold" + + doFalloc = true + skipFalloc = false ) type PosixOpts struct { @@ -461,7 +464,7 @@ func (p *Posix) CompleteMultipartUpload(ctx context.Context, input *s3.CompleteM } f, err := p.openTmpFile(filepath.Join(bucket, metaTmpDir), bucket, object, - totalsize, acct) + totalsize, acct, skipFalloc) if err != nil { if errors.Is(err, syscall.EDQUOT) { return nil, s3err.GetAPIError(s3err.ErrQuotaExceeded) @@ -477,7 +480,7 @@ func (p *Posix) CompleteMultipartUpload(ctx context.Context, input *s3.CompleteM if err != nil { return nil, fmt.Errorf("open part %v: %v", *part.PartNumber, err) } - _, err = io.Copy(f, pf) + _, err = io.Copy(f.File(), pf) pf.Close() if err != nil { if errors.Is(err, syscall.EDQUOT) { @@ -970,7 +973,7 @@ func (p *Posix) UploadPart(ctx context.Context, input *s3.UploadPartInput) (stri partPath := filepath.Join(objdir, uploadID, fmt.Sprintf("%v", *part)) f, err := p.openTmpFile(filepath.Join(bucket, objdir), - bucket, partPath, length, acct) + bucket, partPath, length, acct, doFalloc) if err != nil { if errors.Is(err, syscall.EDQUOT) { return "", s3err.GetAPIError(s3err.ErrQuotaExceeded) @@ -1078,7 +1081,7 @@ func (p *Posix) UploadPartCopy(ctx context.Context, upi *s3.UploadPartCopyInput) } f, err := p.openTmpFile(filepath.Join(*upi.Bucket, objdir), - *upi.Bucket, partPath, length, acct) + *upi.Bucket, partPath, length, acct, doFalloc) if err != nil { if errors.Is(err, syscall.EDQUOT) { return s3response.CopyObjectResult{}, s3err.GetAPIError(s3err.ErrQuotaExceeded) @@ -1217,7 +1220,7 @@ func (p *Posix) PutObject(ctx context.Context, po *s3.PutObjectInput) (string, e } f, err := p.openTmpFile(filepath.Join(*po.Bucket, metaTmpDir), - *po.Bucket, *po.Key, contentLength, acct) + *po.Bucket, *po.Key, contentLength, acct, doFalloc) if err != nil { if errors.Is(err, syscall.EDQUOT) { return "", s3err.GetAPIError(s3err.ErrQuotaExceeded) diff --git a/backend/posix/with_otmpfile.go b/backend/posix/with_otmpfile.go index a772800a..fc49827f 100644 --- a/backend/posix/with_otmpfile.go +++ b/backend/posix/with_otmpfile.go @@ -50,7 +50,7 @@ var ( defaultFilePerm uint32 = 0644 ) -func (p *Posix) openTmpFile(dir, bucket, obj string, size int64, acct auth.Account) (*tmpfile, error) { +func (p *Posix) openTmpFile(dir, bucket, obj string, size int64, acct auth.Account, dofalloc bool) (*tmpfile, error) { uid, gid, doChown := p.getChownIDs(acct) // O_TMPFILE allows for a file handle to an unnamed file in the filesystem. @@ -81,7 +81,7 @@ func (p *Posix) openTmpFile(dir, bucket, obj string, size int64, acct auth.Accou gid: gid, } // falloc is best effort, its fine if this fails - if size > 0 { + if size > 0 && dofalloc { tmp.falloc() } @@ -111,7 +111,7 @@ func (p *Posix) openTmpFile(dir, bucket, obj string, size int64, acct auth.Accou } // falloc is best effort, its fine if this fails - if size > 0 { + if size > 0 && dofalloc { tmp.falloc() } @@ -221,3 +221,7 @@ func (tmp *tmpfile) Write(b []byte) (int, error) { func (tmp *tmpfile) cleanup() { tmp.f.Close() } + +func (tmp *tmpfile) File() *os.File { + return tmp.f +} diff --git a/backend/posix/without_otmpfile.go b/backend/posix/without_otmpfile.go index d6d41944..717f3c6c 100644 --- a/backend/posix/without_otmpfile.go +++ b/backend/posix/without_otmpfile.go @@ -36,7 +36,7 @@ type tmpfile struct { size int64 } -func (p *Posix) openTmpFile(dir, bucket, obj string, size int64, acct auth.Account) (*tmpfile, error) { +func (p *Posix) openTmpFile(dir, bucket, obj string, size int64, acct auth.Account, _ bool) (*tmpfile, error) { uid, gid, doChown := p.getChownIDs(acct) // Create a temp file for upload while in progress (see link comments below). @@ -112,3 +112,7 @@ func (tmp *tmpfile) Write(b []byte) (int, error) { func (tmp *tmpfile) cleanup() { tmp.f.Close() } + +func (tmp *tmpfile) File() *os.File { + return tmp.f +}