feat: add option to disable use of O_TMPFILE

O_TMPFILE can fail if the location we need to link the final
file is not within the same filesystem. This can happen if
there are different filesystem mounts within a bucket or if
using zfs nested datasets within a bucket.

Fixes #1194
Fixes #1035
This commit is contained in:
Ben McClelland
2025-04-08 21:08:09 -07:00
parent adadba8fa8
commit f0a1184459
7 changed files with 218 additions and 85 deletions

View File

@@ -17,11 +17,14 @@ package backend
import (
"crypto/md5"
"encoding/hex"
"errors"
"fmt"
"io"
"io/fs"
"os"
"strconv"
"strings"
"syscall"
"time"
"github.com/aws/aws-sdk-go-v2/service/s3/types"
@@ -269,3 +272,54 @@ func (f *FileSectionReadCloser) Read(p []byte) (int, error) {
func (f *FileSectionReadCloser) Close() error {
return f.F.Close()
}
// MoveFile moves a file from source to destination.
func MoveFile(source, destination string, perm os.FileMode) error {
// We use Rename as the atomic operation for object puts. The upload is
// written to a temp file to not conflict with any other simultaneous
// uploads. The final operation is to move the temp file into place for
// the object. This ensures the object semantics of last upload completed
// wins and is not some combination of writes from simultaneous uploads.
err := os.Rename(source, destination)
if err == nil || !errors.Is(err, syscall.EXDEV) {
return err
}
// Rename can fail if the source and destination are not on the same
// filesystem. The fallback is to copy the file and then remove the source.
// We need to be careful that the desination does not exist before copying
// to prevent any other simultaneous writes to the file.
sourceFile, err := os.Open(source)
if err != nil {
return fmt.Errorf("open source: %w", err)
}
defer sourceFile.Close()
var destFile *os.File
for {
destFile, err = os.OpenFile(destination, os.O_CREATE|os.O_EXCL|os.O_WRONLY, perm)
if err != nil {
if errors.Is(err, fs.ErrExist) {
if removeErr := os.Remove(destination); removeErr != nil {
return fmt.Errorf("remove existing destination: %w", removeErr)
}
continue
}
return fmt.Errorf("create destination: %w", err)
}
break
}
defer destFile.Close()
_, err = io.Copy(destFile, sourceFile)
if err != nil {
return fmt.Errorf("copy data: %w", err)
}
err = os.Remove(source)
if err != nil {
return fmt.Errorf("remove source: %w", err)
}
return nil
}

View File

@@ -73,6 +73,11 @@ type Posix struct {
// newDirPerm is the permission to set on newly created directories
newDirPerm fs.FileMode
// forceNoTmpFile is a flag to disable the use of O_TMPFILE even
// if the filesystem supports it. This is needed for cases where
// there are different filesystems mounted below the bucket level.
forceNoTmpFile bool
}
var _ backend.Backend = &Posix{}
@@ -108,13 +113,23 @@ const (
skipFalloc = false
)
// PosixOpts are the options for the Posix backend
type PosixOpts struct {
ChownUID bool
ChownGID bool
BucketLinks bool
// ChownUID sets the UID of the object to the UID of the user on PUT
ChownUID bool
// ChownGID sets the GID of the object to the GID of the user on PUT
ChownGID bool
// BucketLinks enables symlinks to directories to be treated as buckets
BucketLinks bool
//VersioningDir sets the version directory to enable object versioning
VersioningDir string
NewDirPerm fs.FileMode
SideCarDir string
// NewDirPerm specifies the permission to set on newly created directories
NewDirPerm fs.FileMode
// SideCarDir sets the directory to store sidecar metadata
SideCarDir string
// ForceNoTmpFile disables the use of O_TMPFILE even if the filesystem
// supports it
ForceNoTmpFile bool
}
func New(rootdir string, meta meta.MetadataStorer, opts PosixOpts) (*Posix, error) {
@@ -164,16 +179,17 @@ func New(rootdir string, meta meta.MetadataStorer, opts PosixOpts) (*Posix, erro
}
return &Posix{
meta: meta,
rootfd: f,
rootdir: rootdir,
euid: os.Geteuid(),
egid: os.Getegid(),
chownuid: opts.ChownUID,
chowngid: opts.ChownGID,
bucketlinks: opts.BucketLinks,
versioningDir: verioningdirAbs,
newDirPerm: opts.NewDirPerm,
meta: meta,
rootfd: f,
rootdir: rootdir,
euid: os.Geteuid(),
egid: os.Getegid(),
chownuid: opts.ChownUID,
chowngid: opts.ChownGID,
bucketlinks: opts.BucketLinks,
versioningDir: verioningdirAbs,
newDirPerm: opts.NewDirPerm,
forceNoTmpFile: opts.ForceNoTmpFile,
}, nil
}
@@ -691,7 +707,8 @@ func (p *Posix) createObjVersion(bucket, key string, size int64, acc auth.Accoun
versionBucketPath := filepath.Join(p.versioningDir, bucket)
versioningKey := filepath.Join(genObjVersionKey(key), versionId)
versionTmpPath := filepath.Join(versionBucketPath, metaTmpDir)
f, err := p.openTmpFile(versionTmpPath, versionBucketPath, versioningKey, size, acc, doFalloc)
f, err := p.openTmpFile(versionTmpPath, versionBucketPath, versioningKey,
size, acc, doFalloc, p.forceNoTmpFile)
if err != nil {
return versionPath, err
}
@@ -1488,7 +1505,7 @@ func (p *Posix) CompleteMultipartUpload(ctx context.Context, input *s3.CompleteM
}
f, err := p.openTmpFile(filepath.Join(bucket, metaTmpDir), bucket, object,
totalsize, acct, skipFalloc)
totalsize, acct, skipFalloc, p.forceNoTmpFile)
if err != nil {
if errors.Is(err, syscall.EDQUOT) {
return nil, s3err.GetAPIError(s3err.ErrQuotaExceeded)
@@ -2316,7 +2333,7 @@ func (p *Posix) UploadPart(ctx context.Context, input *s3.UploadPartInput) (*s3.
partPath := filepath.Join(mpPath, fmt.Sprintf("%v", *part))
f, err := p.openTmpFile(filepath.Join(bucket, objdir),
bucket, partPath, length, acct, doFalloc)
bucket, partPath, length, acct, doFalloc, p.forceNoTmpFile)
if err != nil {
if errors.Is(err, syscall.EDQUOT) {
return nil, s3err.GetAPIError(s3err.ErrQuotaExceeded)
@@ -2536,7 +2553,7 @@ func (p *Posix) UploadPartCopy(ctx context.Context, upi *s3.UploadPartCopyInput)
}
f, err := p.openTmpFile(filepath.Join(*upi.Bucket, objdir),
*upi.Bucket, partPath, length, acct, doFalloc)
*upi.Bucket, partPath, length, acct, doFalloc, p.forceNoTmpFile)
if err != nil {
if errors.Is(err, syscall.EDQUOT) {
return s3response.CopyPartResult{}, s3err.GetAPIError(s3err.ErrQuotaExceeded)
@@ -2778,7 +2795,7 @@ func (p *Posix) PutObject(ctx context.Context, po s3response.PutObjectInput) (s3
}
f, err := p.openTmpFile(filepath.Join(*po.Bucket, metaTmpDir),
*po.Bucket, *po.Key, contentLength, acct, doFalloc)
*po.Bucket, *po.Key, contentLength, acct, doFalloc, p.forceNoTmpFile)
if err != nil {
if errors.Is(err, syscall.EDQUOT) {
return s3response.PutObjectOutput{}, s3err.GetAPIError(s3err.ErrQuotaExceeded)
@@ -3131,7 +3148,9 @@ func (p *Posix) DeleteObject(ctx context.Context, input *s3.DeleteObjectInput) (
acct = auth.Account{}
}
f, err := p.openTmpFile(filepath.Join(bucket, metaTmpDir), bucket, object, srcObjVersion.Size(), acct, doFalloc)
f, err := p.openTmpFile(filepath.Join(bucket, metaTmpDir),
bucket, object, srcObjVersion.Size(), acct, doFalloc,
p.forceNoTmpFile)
if err != nil {
return nil, fmt.Errorf("open tmp file: %w", err)
}

View File

@@ -52,9 +52,13 @@ var (
defaultFilePerm uint32 = 0644
)
func (p *Posix) openTmpFile(dir, bucket, obj string, size int64, acct auth.Account, dofalloc bool) (*tmpfile, error) {
func (p *Posix) openTmpFile(dir, bucket, obj string, size int64, acct auth.Account, dofalloc bool, forceNoTmpFile bool) (*tmpfile, error) {
uid, gid, doChown := p.getChownIDs(acct)
if forceNoTmpFile {
return p.openMkTemp(dir, bucket, obj, size, dofalloc, uid, gid, doChown)
}
// O_TMPFILE allows for a file handle to an unnamed file in the filesystem.
// This can help reduce contention within the namespace (parent directories),
// etc. And will auto cleanup the inode on close if we never link this
@@ -68,43 +72,7 @@ func (p *Posix) openTmpFile(dir, bucket, obj string, size int64, acct auth.Accou
}
// O_TMPFILE not supported, try fallback
err = backend.MkdirAll(dir, uid, gid, doChown, p.newDirPerm)
if err != nil {
if errors.Is(err, syscall.EROFS) {
return nil, s3err.GetAPIError(s3err.ErrMethodNotAllowed)
}
return nil, fmt.Errorf("make temp dir: %w", err)
}
f, err := os.CreateTemp(dir,
fmt.Sprintf("%x.", sha256.Sum256([]byte(obj))))
if err != nil {
if errors.Is(err, syscall.EROFS) {
return nil, s3err.GetAPIError(s3err.ErrMethodNotAllowed)
}
return nil, err
}
tmp := &tmpfile{
f: f,
bucket: bucket,
objname: obj,
size: size,
needsChown: doChown,
uid: uid,
gid: gid,
}
// falloc is best effort, its fine if this fails
if size > 0 && dofalloc {
tmp.falloc()
}
if doChown {
err := f.Chown(uid, gid)
if err != nil {
return nil, fmt.Errorf("set temp file ownership: %w", err)
}
}
return tmp, nil
return p.openMkTemp(dir, bucket, obj, size, dofalloc, uid, gid, doChown)
}
// for O_TMPFILE, filename is /proc/self/fd/<fd> to be used
@@ -138,6 +106,46 @@ func (p *Posix) openTmpFile(dir, bucket, obj string, size int64, acct auth.Accou
return tmp, nil
}
func (p *Posix) openMkTemp(dir, bucket, obj string, size int64, dofalloc bool, uid, gid int, doChown bool) (*tmpfile, error) {
err := backend.MkdirAll(dir, uid, gid, doChown, p.newDirPerm)
if err != nil {
if errors.Is(err, syscall.EROFS) {
return nil, s3err.GetAPIError(s3err.ErrMethodNotAllowed)
}
return nil, fmt.Errorf("make temp dir: %w", err)
}
f, err := os.CreateTemp(dir,
fmt.Sprintf("%x.", sha256.Sum256([]byte(obj))))
if err != nil {
if errors.Is(err, syscall.EROFS) {
return nil, s3err.GetAPIError(s3err.ErrMethodNotAllowed)
}
return nil, err
}
tmp := &tmpfile{
f: f,
bucket: bucket,
objname: obj,
size: size,
needsChown: doChown,
uid: uid,
gid: gid,
}
// falloc is best effort, its fine if this fails
if size > 0 && dofalloc {
tmp.falloc()
}
if doChown {
err := f.Chown(uid, gid)
if err != nil {
return nil, fmt.Errorf("set temp file ownership: %w", err)
}
}
return tmp, nil
}
func (tmp *tmpfile) falloc() error {
err := syscall.Fallocate(int(tmp.f.Fd()), 0, 0, tmp.size)
if err != nil {
@@ -228,7 +236,9 @@ func (tmp *tmpfile) fallbackLink() error {
objPath := filepath.Join(tmp.bucket, tmp.objname)
err = os.Rename(tempname, objPath)
if err != nil {
return fmt.Errorf("rename tmpfile: %w", err)
// rename only works for files within the same filesystem
// if this fails fallback to copy
return backend.MoveFile(tempname, objPath, fs.FileMode(defaultFilePerm))
}
return nil

View File

@@ -38,7 +38,7 @@ type tmpfile struct {
size int64
}
func (p *Posix) openTmpFile(dir, bucket, obj string, size int64, acct auth.Account, _ bool) (*tmpfile, error) {
func (p *Posix) openTmpFile(dir, bucket, obj string, size int64, acct auth.Account, _ bool, _ bool) (*tmpfile, error) {
uid, gid, doChown := p.getChownIDs(acct)
// Create a temp file for upload while in progress (see link comments below).
@@ -80,31 +80,17 @@ func (tmp *tmpfile) link() error {
// this will no longer exist
defer os.Remove(tempname)
// We use Rename as the atomic operation for object puts. The upload is
// written to a temp file to not conflict with any other simultaneous
// uploads. The final operation is to move the temp file into place for
// the object. This ensures the object semantics of last upload completed
// wins and is not some combination of writes from simultaneous uploads.
objPath := filepath.Join(tmp.bucket, tmp.objname)
err := os.Remove(objPath)
if err != nil && !errors.Is(err, fs.ErrNotExist) {
return fmt.Errorf("remove stale path: %w", err)
}
// reset default file mode because CreateTemp uses 0600
tmp.f.Chmod(defaultFilePerm)
err = tmp.f.Close()
err := tmp.f.Close()
if err != nil {
return fmt.Errorf("close tmpfile: %w", err)
}
err = os.Rename(tempname, objPath)
if err != nil {
return fmt.Errorf("rename tmpfile: %w", err)
}
return nil
return backend.MoveFile(tempname, objPath, defaultFilePerm)
}
func (tmp *tmpfile) Write(b []byte) (int, error) {