feat: add option to disable use of O_TMPFILE

O_TMPFILE can fail if the location we need to link the final
file is not within the same filesystem. This can happen if
there are different filesystem mounts within a bucket or if
using zfs nested datasets within a bucket.

Fixes #1194
Fixes #1035
This commit is contained in:
Ben McClelland
2025-04-08 21:08:09 -07:00
parent adadba8fa8
commit f0a1184459
7 changed files with 218 additions and 85 deletions

View File

@@ -17,11 +17,14 @@ package backend
import (
"crypto/md5"
"encoding/hex"
"errors"
"fmt"
"io"
"io/fs"
"os"
"strconv"
"strings"
"syscall"
"time"
"github.com/aws/aws-sdk-go-v2/service/s3/types"
@@ -269,3 +272,54 @@ func (f *FileSectionReadCloser) Read(p []byte) (int, error) {
func (f *FileSectionReadCloser) Close() error {
return f.F.Close()
}
// MoveFile moves a file from source to destination.
func MoveFile(source, destination string, perm os.FileMode) error {
// We use Rename as the atomic operation for object puts. The upload is
// written to a temp file to not conflict with any other simultaneous
// uploads. The final operation is to move the temp file into place for
// the object. This ensures the object semantics of last upload completed
// wins and is not some combination of writes from simultaneous uploads.
err := os.Rename(source, destination)
if err == nil || !errors.Is(err, syscall.EXDEV) {
return err
}
// Rename can fail if the source and destination are not on the same
// filesystem. The fallback is to copy the file and then remove the source.
// We need to be careful that the desination does not exist before copying
// to prevent any other simultaneous writes to the file.
sourceFile, err := os.Open(source)
if err != nil {
return fmt.Errorf("open source: %w", err)
}
defer sourceFile.Close()
var destFile *os.File
for {
destFile, err = os.OpenFile(destination, os.O_CREATE|os.O_EXCL|os.O_WRONLY, perm)
if err != nil {
if errors.Is(err, fs.ErrExist) {
if removeErr := os.Remove(destination); removeErr != nil {
return fmt.Errorf("remove existing destination: %w", removeErr)
}
continue
}
return fmt.Errorf("create destination: %w", err)
}
break
}
defer destFile.Close()
_, err = io.Copy(destFile, sourceFile)
if err != nil {
return fmt.Errorf("copy data: %w", err)
}
err = os.Remove(source)
if err != nil {
return fmt.Errorf("remove source: %w", err)
}
return nil
}

View File

@@ -73,6 +73,11 @@ type Posix struct {
// newDirPerm is the permission to set on newly created directories
newDirPerm fs.FileMode
// forceNoTmpFile is a flag to disable the use of O_TMPFILE even
// if the filesystem supports it. This is needed for cases where
// there are different filesystems mounted below the bucket level.
forceNoTmpFile bool
}
var _ backend.Backend = &Posix{}
@@ -108,13 +113,23 @@ const (
skipFalloc = false
)
// PosixOpts are the options for the Posix backend
type PosixOpts struct {
ChownUID bool
ChownGID bool
BucketLinks bool
// ChownUID sets the UID of the object to the UID of the user on PUT
ChownUID bool
// ChownGID sets the GID of the object to the GID of the user on PUT
ChownGID bool
// BucketLinks enables symlinks to directories to be treated as buckets
BucketLinks bool
//VersioningDir sets the version directory to enable object versioning
VersioningDir string
NewDirPerm fs.FileMode
SideCarDir string
// NewDirPerm specifies the permission to set on newly created directories
NewDirPerm fs.FileMode
// SideCarDir sets the directory to store sidecar metadata
SideCarDir string
// ForceNoTmpFile disables the use of O_TMPFILE even if the filesystem
// supports it
ForceNoTmpFile bool
}
func New(rootdir string, meta meta.MetadataStorer, opts PosixOpts) (*Posix, error) {
@@ -164,16 +179,17 @@ func New(rootdir string, meta meta.MetadataStorer, opts PosixOpts) (*Posix, erro
}
return &Posix{
meta: meta,
rootfd: f,
rootdir: rootdir,
euid: os.Geteuid(),
egid: os.Getegid(),
chownuid: opts.ChownUID,
chowngid: opts.ChownGID,
bucketlinks: opts.BucketLinks,
versioningDir: verioningdirAbs,
newDirPerm: opts.NewDirPerm,
meta: meta,
rootfd: f,
rootdir: rootdir,
euid: os.Geteuid(),
egid: os.Getegid(),
chownuid: opts.ChownUID,
chowngid: opts.ChownGID,
bucketlinks: opts.BucketLinks,
versioningDir: verioningdirAbs,
newDirPerm: opts.NewDirPerm,
forceNoTmpFile: opts.ForceNoTmpFile,
}, nil
}
@@ -691,7 +707,8 @@ func (p *Posix) createObjVersion(bucket, key string, size int64, acc auth.Accoun
versionBucketPath := filepath.Join(p.versioningDir, bucket)
versioningKey := filepath.Join(genObjVersionKey(key), versionId)
versionTmpPath := filepath.Join(versionBucketPath, metaTmpDir)
f, err := p.openTmpFile(versionTmpPath, versionBucketPath, versioningKey, size, acc, doFalloc)
f, err := p.openTmpFile(versionTmpPath, versionBucketPath, versioningKey,
size, acc, doFalloc, p.forceNoTmpFile)
if err != nil {
return versionPath, err
}
@@ -1488,7 +1505,7 @@ func (p *Posix) CompleteMultipartUpload(ctx context.Context, input *s3.CompleteM
}
f, err := p.openTmpFile(filepath.Join(bucket, metaTmpDir), bucket, object,
totalsize, acct, skipFalloc)
totalsize, acct, skipFalloc, p.forceNoTmpFile)
if err != nil {
if errors.Is(err, syscall.EDQUOT) {
return nil, s3err.GetAPIError(s3err.ErrQuotaExceeded)
@@ -2316,7 +2333,7 @@ func (p *Posix) UploadPart(ctx context.Context, input *s3.UploadPartInput) (*s3.
partPath := filepath.Join(mpPath, fmt.Sprintf("%v", *part))
f, err := p.openTmpFile(filepath.Join(bucket, objdir),
bucket, partPath, length, acct, doFalloc)
bucket, partPath, length, acct, doFalloc, p.forceNoTmpFile)
if err != nil {
if errors.Is(err, syscall.EDQUOT) {
return nil, s3err.GetAPIError(s3err.ErrQuotaExceeded)
@@ -2536,7 +2553,7 @@ func (p *Posix) UploadPartCopy(ctx context.Context, upi *s3.UploadPartCopyInput)
}
f, err := p.openTmpFile(filepath.Join(*upi.Bucket, objdir),
*upi.Bucket, partPath, length, acct, doFalloc)
*upi.Bucket, partPath, length, acct, doFalloc, p.forceNoTmpFile)
if err != nil {
if errors.Is(err, syscall.EDQUOT) {
return s3response.CopyPartResult{}, s3err.GetAPIError(s3err.ErrQuotaExceeded)
@@ -2778,7 +2795,7 @@ func (p *Posix) PutObject(ctx context.Context, po s3response.PutObjectInput) (s3
}
f, err := p.openTmpFile(filepath.Join(*po.Bucket, metaTmpDir),
*po.Bucket, *po.Key, contentLength, acct, doFalloc)
*po.Bucket, *po.Key, contentLength, acct, doFalloc, p.forceNoTmpFile)
if err != nil {
if errors.Is(err, syscall.EDQUOT) {
return s3response.PutObjectOutput{}, s3err.GetAPIError(s3err.ErrQuotaExceeded)
@@ -3131,7 +3148,9 @@ func (p *Posix) DeleteObject(ctx context.Context, input *s3.DeleteObjectInput) (
acct = auth.Account{}
}
f, err := p.openTmpFile(filepath.Join(bucket, metaTmpDir), bucket, object, srcObjVersion.Size(), acct, doFalloc)
f, err := p.openTmpFile(filepath.Join(bucket, metaTmpDir),
bucket, object, srcObjVersion.Size(), acct, doFalloc,
p.forceNoTmpFile)
if err != nil {
return nil, fmt.Errorf("open tmp file: %w", err)
}

View File

@@ -52,9 +52,13 @@ var (
defaultFilePerm uint32 = 0644
)
func (p *Posix) openTmpFile(dir, bucket, obj string, size int64, acct auth.Account, dofalloc bool) (*tmpfile, error) {
func (p *Posix) openTmpFile(dir, bucket, obj string, size int64, acct auth.Account, dofalloc bool, forceNoTmpFile bool) (*tmpfile, error) {
uid, gid, doChown := p.getChownIDs(acct)
if forceNoTmpFile {
return p.openMkTemp(dir, bucket, obj, size, dofalloc, uid, gid, doChown)
}
// O_TMPFILE allows for a file handle to an unnamed file in the filesystem.
// This can help reduce contention within the namespace (parent directories),
// etc. And will auto cleanup the inode on close if we never link this
@@ -68,43 +72,7 @@ func (p *Posix) openTmpFile(dir, bucket, obj string, size int64, acct auth.Accou
}
// O_TMPFILE not supported, try fallback
err = backend.MkdirAll(dir, uid, gid, doChown, p.newDirPerm)
if err != nil {
if errors.Is(err, syscall.EROFS) {
return nil, s3err.GetAPIError(s3err.ErrMethodNotAllowed)
}
return nil, fmt.Errorf("make temp dir: %w", err)
}
f, err := os.CreateTemp(dir,
fmt.Sprintf("%x.", sha256.Sum256([]byte(obj))))
if err != nil {
if errors.Is(err, syscall.EROFS) {
return nil, s3err.GetAPIError(s3err.ErrMethodNotAllowed)
}
return nil, err
}
tmp := &tmpfile{
f: f,
bucket: bucket,
objname: obj,
size: size,
needsChown: doChown,
uid: uid,
gid: gid,
}
// falloc is best effort, its fine if this fails
if size > 0 && dofalloc {
tmp.falloc()
}
if doChown {
err := f.Chown(uid, gid)
if err != nil {
return nil, fmt.Errorf("set temp file ownership: %w", err)
}
}
return tmp, nil
return p.openMkTemp(dir, bucket, obj, size, dofalloc, uid, gid, doChown)
}
// for O_TMPFILE, filename is /proc/self/fd/<fd> to be used
@@ -138,6 +106,46 @@ func (p *Posix) openTmpFile(dir, bucket, obj string, size int64, acct auth.Accou
return tmp, nil
}
func (p *Posix) openMkTemp(dir, bucket, obj string, size int64, dofalloc bool, uid, gid int, doChown bool) (*tmpfile, error) {
err := backend.MkdirAll(dir, uid, gid, doChown, p.newDirPerm)
if err != nil {
if errors.Is(err, syscall.EROFS) {
return nil, s3err.GetAPIError(s3err.ErrMethodNotAllowed)
}
return nil, fmt.Errorf("make temp dir: %w", err)
}
f, err := os.CreateTemp(dir,
fmt.Sprintf("%x.", sha256.Sum256([]byte(obj))))
if err != nil {
if errors.Is(err, syscall.EROFS) {
return nil, s3err.GetAPIError(s3err.ErrMethodNotAllowed)
}
return nil, err
}
tmp := &tmpfile{
f: f,
bucket: bucket,
objname: obj,
size: size,
needsChown: doChown,
uid: uid,
gid: gid,
}
// falloc is best effort, its fine if this fails
if size > 0 && dofalloc {
tmp.falloc()
}
if doChown {
err := f.Chown(uid, gid)
if err != nil {
return nil, fmt.Errorf("set temp file ownership: %w", err)
}
}
return tmp, nil
}
func (tmp *tmpfile) falloc() error {
err := syscall.Fallocate(int(tmp.f.Fd()), 0, 0, tmp.size)
if err != nil {
@@ -228,7 +236,9 @@ func (tmp *tmpfile) fallbackLink() error {
objPath := filepath.Join(tmp.bucket, tmp.objname)
err = os.Rename(tempname, objPath)
if err != nil {
return fmt.Errorf("rename tmpfile: %w", err)
// rename only works for files within the same filesystem
// if this fails fallback to copy
return backend.MoveFile(tempname, objPath, fs.FileMode(defaultFilePerm))
}
return nil

View File

@@ -38,7 +38,7 @@ type tmpfile struct {
size int64
}
func (p *Posix) openTmpFile(dir, bucket, obj string, size int64, acct auth.Account, _ bool) (*tmpfile, error) {
func (p *Posix) openTmpFile(dir, bucket, obj string, size int64, acct auth.Account, _ bool, _ bool) (*tmpfile, error) {
uid, gid, doChown := p.getChownIDs(acct)
// Create a temp file for upload while in progress (see link comments below).
@@ -80,31 +80,17 @@ func (tmp *tmpfile) link() error {
// this will no longer exist
defer os.Remove(tempname)
// We use Rename as the atomic operation for object puts. The upload is
// written to a temp file to not conflict with any other simultaneous
// uploads. The final operation is to move the temp file into place for
// the object. This ensures the object semantics of last upload completed
// wins and is not some combination of writes from simultaneous uploads.
objPath := filepath.Join(tmp.bucket, tmp.objname)
err := os.Remove(objPath)
if err != nil && !errors.Is(err, fs.ErrNotExist) {
return fmt.Errorf("remove stale path: %w", err)
}
// reset default file mode because CreateTemp uses 0600
tmp.f.Chmod(defaultFilePerm)
err = tmp.f.Close()
err := tmp.f.Close()
if err != nil {
return fmt.Errorf("close tmpfile: %w", err)
}
err = os.Rename(tempname, objPath)
if err != nil {
return fmt.Errorf("rename tmpfile: %w", err)
}
return nil
return backend.MoveFile(tempname, objPath, defaultFilePerm)
}
func (tmp *tmpfile) Write(b []byte) (int, error) {

View File

@@ -524,19 +524,19 @@ func initFlags() []cli.Flag {
},
&cli.StringFlag{
Name: "ipa-user",
Usage: "Username used to connect to FreeIPA. Needs permissions to read user vault contents",
Usage: "Username used to connect to FreeIPA (requires permissions to read user vault contents)",
EnvVars: []string{"VGW_IPA_USER"},
Destination: &ipaUser,
},
&cli.StringFlag{
Name: "ipa-password",
Usage: "Password of the user used to connect to FreeIPA.",
Usage: "Password of the user used to connect to FreeIPA",
EnvVars: []string{"VGW_IPA_PASSWORD"},
Destination: &ipaPassword,
},
&cli.BoolFlag{
Name: "ipa-insecure",
Usage: "Verify TLS certificate of FreeIPA server. Default is 'true'.",
Usage: "Disable verify TLS certificate of FreeIPA server",
EnvVars: []string{"VGW_IPA_INSECURE"},
Destination: &ipaInsecure,
},

View File

@@ -31,6 +31,7 @@ var (
dirPerms uint
sidecar string
nometa bool
forceNoTmpFile bool
)
func posixCommand() *cli.Command {
@@ -93,6 +94,12 @@ will be translated into the file /mnt/fs/gwroot/mybucket/a/b/c/myobject`,
EnvVars: []string{"VGW_META_NONE"},
Destination: &nometa,
},
&cli.BoolFlag{
Name: "disableotmp",
Usage: "disable O_TMPFILE support for new objects",
EnvVars: []string{"VGW_DISABLE_OTMP"},
Destination: &forceNoTmpFile,
},
},
}
}
@@ -113,11 +120,12 @@ func runPosix(ctx *cli.Context) error {
}
opts := posix.PosixOpts{
ChownUID: chownuid,
ChownGID: chowngid,
BucketLinks: bucketlinks,
VersioningDir: versioningDir,
NewDirPerm: fs.FileMode(dirPerms),
ChownUID: chownuid,
ChownGID: chowngid,
BucketLinks: bucketlinks,
VersioningDir: versioningDir,
NewDirPerm: fs.FileMode(dirPerms),
ForceNoTmpFile: forceNoTmpFile,
}
var ms meta.MetadataStorer

View File

@@ -99,6 +99,10 @@ ROOT_SECRET_ACCESS_KEY=
# endpoint is unauthenticated, and returns a 200 status for GET.
#VGW_HEALTH=
# Enable VGW_READ_ONLY to only allow read operations to the S3 server. No write
# operations will be allowed.
#VGW_READ_ONLY=false
###############
# Access Logs #
###############
@@ -240,6 +244,24 @@ ROOT_SECRET_ACCESS_KEY=
#VGW_IAM_LDAP_USER_ID_ATR=
#VGW_IAM_LDAP_GROUP_ID_ATR=
# The FreeIPA options will enable the FreeIPA IAM service with accounts stored
# in an external FreeIPA service. Currently the FreeIPA IAM service only
# supports account retrieval. Creating and modifying accounts must be done
# outside of the versitygw service.
# FreeIPA server url e.g. https://ipa.example.test
#VGW_IPA_HOST=
# A name of the user vault containing their secret
#VGW_IPA_VAULT_NAME=
# Username used to connect to FreeIPA (requires permissions to read user vault
# contents)
#VGW_IPA_USER=
# Password of the user used to connect to FreeIPA
#VGW_IPA_PASSWORD=
# Disable verify TLS certificate of FreeIPA server
#VGW_IPA_INSECURE=false
# FreeIPA IAM debug output
#VGW_IPA_DEBUG=false
###############
# IAM caching #
###############
@@ -317,6 +339,40 @@ ROOT_SECRET_ACCESS_KEY=
# as any parent directories automatically created with object uploads.
#VGW_DIR_PERMS=0755
# To enable object versions, the VGW_VERSIONING_DIR option must be set to the
# directory that will be used to store the object versions. The version
# directory must NOT be a subdirectory of the VGW_BACKEND_ARG directory.
#VGW_VERSIONING_DIR=
# The gateway uses xattrs to store metadata for objects by default. For systems
# that do not support xattrs, the VGW_META_SIDECAR option can be set to a
# directory that will be used to store the metadata for objects. This is
# currently experimental, and may have issues for some edge cases.
#VGW_META_SIDECAR=
# The VGW_META_NONE option will disable the metadata functionality for the
# gateway. This will cause the gateway to not store any metadata for objects
# or buckets. This include bucket ACLs and Policy. This may be useful for
# read only access to pre-existing data where the gateway should not modify
# the data. It is recommened to enable VGW_READ_ONLY (Global Options) along
# with this.
#VGW_META_NONE=false
# The gateway will use O_TMPFILE for writing objects while uploading and
# link the file to the final object name when the upload is complete if the
# filesystem supports O_TMPFILE. This creates an atomic object creation
# that is not visible to other clients or racing uploads until the upload
# is complete. This will not work if there is a different filesystem mounted
# below the bucket level than where the bucket resides. The VGW_DISABLE_OTMP
# option can be set to true to disable this functionality and force the fallback
# mode when O_TMPFILE is not available. This fallback will create a temporary
# file in the bucket directory and rename it to the final object name when
# the upload is complete if the final location is in the same filesystem, or
# copy the file to the final location if the final location is in a different
# filesystem. This fallback mode is still atomic, but may be less efficient
# than O_TMPFILE when the data needs to be copied into the final location.
#VGW_DISABLE_OTMP=false
###########
# scoutfs #
###########