// MoveFile moves the file at source to destination, replacing any file that
// already exists at destination.
//
// The fast path is os.Rename, which is the atomic operation used for object
// puts: the upload is written to a temp file so it cannot conflict with other
// simultaneous uploads, and the final step moves the temp file into place.
// This gives "last upload completed wins" semantics rather than some
// combination of writes from simultaneous uploads.
//
// When the rename fails with EXDEV (source and destination are on different
// filesystems) the contents are copied into a freshly created destination
// file and the source is removed only after the copy has been completely
// written and closed. Any other rename error is returned unchanged.
//
// perm is the mode of the destination file on the copy path. If the copy
// fails, the partially written destination is removed and the source is left
// in place.
//
// NOTE(review): the copy path creates the destination as the calling process
// and does not carry over ownership of the source file; confirm whether
// callers that chown the temp file need that preserved.
func MoveFile(source, destination string, perm os.FileMode) error {
	err := os.Rename(source, destination)
	if err == nil || !errors.Is(err, syscall.EXDEV) {
		return err
	}

	// Rename cannot cross filesystems, fall back to copy + remove.
	if err := moveFileCopy(source, destination, perm); err != nil {
		return err
	}

	// Only drop the source once the destination is known to be complete.
	if err := os.Remove(source); err != nil {
		return fmt.Errorf("remove source: %w", err)
	}

	return nil
}

// moveFileCopy copies source into a newly created destination file. On any
// failure the partially written destination is removed again.
func moveFileCopy(source, destination string, perm os.FileMode) (err error) {
	src, err := os.Open(source)
	if err != nil {
		return fmt.Errorf("open source: %w", err)
	}
	defer src.Close()

	dst, err := moveFileCreateExclusive(destination, perm)
	if err != nil {
		return err
	}

	// Remember the identity of the file we created so that cleanup never
	// removes a file that a concurrent writer has put in its place.
	created, statErr := dst.Stat()
	defer func() {
		if err == nil {
			return
		}
		dst.Close()
		if statErr != nil {
			return
		}
		cur, lerr := os.Lstat(destination)
		if lerr == nil && os.SameFile(created, cur) {
			os.Remove(destination)
		}
	}()

	// OpenFile applies the process umask to perm. Reset the mode so the copy
	// path ends up with the requested mode. This is best effort since some
	// filesystems do not support chmod.
	_ = dst.Chmod(perm)

	if _, err = io.Copy(dst, src); err != nil {
		return fmt.Errorf("copy data: %w", err)
	}

	// Write errors can be deferred until close, so the close result must be
	// checked before the source is allowed to be removed.
	if err = dst.Close(); err != nil {
		return fmt.Errorf("close destination: %w", err)
	}

	return nil
}

// moveFileCreateExclusive creates path with O_EXCL, removing any existing
// entry first, so that the returned handle refers to a file no other
// simultaneous writer has open.
func moveFileCreateExclusive(path string, perm os.FileMode) (*os.File, error) {
	for {
		f, err := os.OpenFile(path, os.O_CREATE|os.O_EXCL|os.O_WRONLY, perm)
		if err == nil {
			return f, nil
		}
		if !errors.Is(err, fs.ErrExist) {
			return nil, fmt.Errorf("create destination: %w", err)
		}
		// The destination exists: remove it and retry. A concurrent writer
		// may have already removed it, which is not an error for us.
		err = os.Remove(path)
		if err != nil && !errors.Is(err, fs.ErrNotExist) {
			return nil, fmt.Errorf("remove existing destination: %w", err)
		}
	}
}
+ forceNoTmpFile bool } var _ backend.Backend = &Posix{} @@ -108,13 +113,23 @@ const ( skipFalloc = false ) +// PosixOpts are the options for the Posix backend type PosixOpts struct { - ChownUID bool - ChownGID bool - BucketLinks bool + // ChownUID sets the UID of the object to the UID of the user on PUT + ChownUID bool + // ChownGID sets the GID of the object to the GID of the user on PUT + ChownGID bool + // BucketLinks enables symlinks to directories to be treated as buckets + BucketLinks bool + //VersioningDir sets the version directory to enable object versioning VersioningDir string - NewDirPerm fs.FileMode - SideCarDir string + // NewDirPerm specifies the permission to set on newly created directories + NewDirPerm fs.FileMode + // SideCarDir sets the directory to store sidecar metadata + SideCarDir string + // ForceNoTmpFile disables the use of O_TMPFILE even if the filesystem + // supports it + ForceNoTmpFile bool } func New(rootdir string, meta meta.MetadataStorer, opts PosixOpts) (*Posix, error) { @@ -164,16 +179,17 @@ func New(rootdir string, meta meta.MetadataStorer, opts PosixOpts) (*Posix, erro } return &Posix{ - meta: meta, - rootfd: f, - rootdir: rootdir, - euid: os.Geteuid(), - egid: os.Getegid(), - chownuid: opts.ChownUID, - chowngid: opts.ChownGID, - bucketlinks: opts.BucketLinks, - versioningDir: verioningdirAbs, - newDirPerm: opts.NewDirPerm, + meta: meta, + rootfd: f, + rootdir: rootdir, + euid: os.Geteuid(), + egid: os.Getegid(), + chownuid: opts.ChownUID, + chowngid: opts.ChownGID, + bucketlinks: opts.BucketLinks, + versioningDir: verioningdirAbs, + newDirPerm: opts.NewDirPerm, + forceNoTmpFile: opts.ForceNoTmpFile, }, nil } @@ -691,7 +707,8 @@ func (p *Posix) createObjVersion(bucket, key string, size int64, acc auth.Accoun versionBucketPath := filepath.Join(p.versioningDir, bucket) versioningKey := filepath.Join(genObjVersionKey(key), versionId) versionTmpPath := filepath.Join(versionBucketPath, metaTmpDir) - f, err := 
p.openTmpFile(versionTmpPath, versionBucketPath, versioningKey, size, acc, doFalloc) + f, err := p.openTmpFile(versionTmpPath, versionBucketPath, versioningKey, + size, acc, doFalloc, p.forceNoTmpFile) if err != nil { return versionPath, err } @@ -1488,7 +1505,7 @@ func (p *Posix) CompleteMultipartUpload(ctx context.Context, input *s3.CompleteM } f, err := p.openTmpFile(filepath.Join(bucket, metaTmpDir), bucket, object, - totalsize, acct, skipFalloc) + totalsize, acct, skipFalloc, p.forceNoTmpFile) if err != nil { if errors.Is(err, syscall.EDQUOT) { return nil, s3err.GetAPIError(s3err.ErrQuotaExceeded) @@ -2316,7 +2333,7 @@ func (p *Posix) UploadPart(ctx context.Context, input *s3.UploadPartInput) (*s3. partPath := filepath.Join(mpPath, fmt.Sprintf("%v", *part)) f, err := p.openTmpFile(filepath.Join(bucket, objdir), - bucket, partPath, length, acct, doFalloc) + bucket, partPath, length, acct, doFalloc, p.forceNoTmpFile) if err != nil { if errors.Is(err, syscall.EDQUOT) { return nil, s3err.GetAPIError(s3err.ErrQuotaExceeded) @@ -2536,7 +2553,7 @@ func (p *Posix) UploadPartCopy(ctx context.Context, upi *s3.UploadPartCopyInput) } f, err := p.openTmpFile(filepath.Join(*upi.Bucket, objdir), - *upi.Bucket, partPath, length, acct, doFalloc) + *upi.Bucket, partPath, length, acct, doFalloc, p.forceNoTmpFile) if err != nil { if errors.Is(err, syscall.EDQUOT) { return s3response.CopyPartResult{}, s3err.GetAPIError(s3err.ErrQuotaExceeded) @@ -2778,7 +2795,7 @@ func (p *Posix) PutObject(ctx context.Context, po s3response.PutObjectInput) (s3 } f, err := p.openTmpFile(filepath.Join(*po.Bucket, metaTmpDir), - *po.Bucket, *po.Key, contentLength, acct, doFalloc) + *po.Bucket, *po.Key, contentLength, acct, doFalloc, p.forceNoTmpFile) if err != nil { if errors.Is(err, syscall.EDQUOT) { return s3response.PutObjectOutput{}, s3err.GetAPIError(s3err.ErrQuotaExceeded) @@ -3131,7 +3148,9 @@ func (p *Posix) DeleteObject(ctx context.Context, input *s3.DeleteObjectInput) ( acct = 
auth.Account{} } - f, err := p.openTmpFile(filepath.Join(bucket, metaTmpDir), bucket, object, srcObjVersion.Size(), acct, doFalloc) + f, err := p.openTmpFile(filepath.Join(bucket, metaTmpDir), + bucket, object, srcObjVersion.Size(), acct, doFalloc, + p.forceNoTmpFile) if err != nil { return nil, fmt.Errorf("open tmp file: %w", err) } diff --git a/backend/posix/with_otmpfile.go b/backend/posix/with_otmpfile.go index 751c61b..77f4f86 100644 --- a/backend/posix/with_otmpfile.go +++ b/backend/posix/with_otmpfile.go @@ -52,9 +52,13 @@ var ( defaultFilePerm uint32 = 0644 ) -func (p *Posix) openTmpFile(dir, bucket, obj string, size int64, acct auth.Account, dofalloc bool) (*tmpfile, error) { +func (p *Posix) openTmpFile(dir, bucket, obj string, size int64, acct auth.Account, dofalloc bool, forceNoTmpFile bool) (*tmpfile, error) { uid, gid, doChown := p.getChownIDs(acct) + if forceNoTmpFile { + return p.openMkTemp(dir, bucket, obj, size, dofalloc, uid, gid, doChown) + } + // O_TMPFILE allows for a file handle to an unnamed file in the filesystem. // This can help reduce contention within the namespace (parent directories), // etc. 
And will auto cleanup the inode on close if we never link this @@ -68,43 +72,7 @@ func (p *Posix) openTmpFile(dir, bucket, obj string, size int64, acct auth.Accou } // O_TMPFILE not supported, try fallback - err = backend.MkdirAll(dir, uid, gid, doChown, p.newDirPerm) - if err != nil { - if errors.Is(err, syscall.EROFS) { - return nil, s3err.GetAPIError(s3err.ErrMethodNotAllowed) - } - return nil, fmt.Errorf("make temp dir: %w", err) - } - f, err := os.CreateTemp(dir, - fmt.Sprintf("%x.", sha256.Sum256([]byte(obj)))) - if err != nil { - if errors.Is(err, syscall.EROFS) { - return nil, s3err.GetAPIError(s3err.ErrMethodNotAllowed) - } - return nil, err - } - tmp := &tmpfile{ - f: f, - bucket: bucket, - objname: obj, - size: size, - needsChown: doChown, - uid: uid, - gid: gid, - } - // falloc is best effort, its fine if this fails - if size > 0 && dofalloc { - tmp.falloc() - } - - if doChown { - err := f.Chown(uid, gid) - if err != nil { - return nil, fmt.Errorf("set temp file ownership: %w", err) - } - } - - return tmp, nil + return p.openMkTemp(dir, bucket, obj, size, dofalloc, uid, gid, doChown) } // for O_TMPFILE, filename is /proc/self/fd/ to be used @@ -138,6 +106,46 @@ func (p *Posix) openTmpFile(dir, bucket, obj string, size int64, acct auth.Accou return tmp, nil } +func (p *Posix) openMkTemp(dir, bucket, obj string, size int64, dofalloc bool, uid, gid int, doChown bool) (*tmpfile, error) { + err := backend.MkdirAll(dir, uid, gid, doChown, p.newDirPerm) + if err != nil { + if errors.Is(err, syscall.EROFS) { + return nil, s3err.GetAPIError(s3err.ErrMethodNotAllowed) + } + return nil, fmt.Errorf("make temp dir: %w", err) + } + f, err := os.CreateTemp(dir, + fmt.Sprintf("%x.", sha256.Sum256([]byte(obj)))) + if err != nil { + if errors.Is(err, syscall.EROFS) { + return nil, s3err.GetAPIError(s3err.ErrMethodNotAllowed) + } + return nil, err + } + tmp := &tmpfile{ + f: f, + bucket: bucket, + objname: obj, + size: size, + needsChown: doChown, + uid: uid, + gid: gid, 
+ } + // falloc is best effort, its fine if this fails + if size > 0 && dofalloc { + tmp.falloc() + } + + if doChown { + err := f.Chown(uid, gid) + if err != nil { + return nil, fmt.Errorf("set temp file ownership: %w", err) + } + } + + return tmp, nil +} + func (tmp *tmpfile) falloc() error { err := syscall.Fallocate(int(tmp.f.Fd()), 0, 0, tmp.size) if err != nil { @@ -228,7 +236,9 @@ func (tmp *tmpfile) fallbackLink() error { objPath := filepath.Join(tmp.bucket, tmp.objname) err = os.Rename(tempname, objPath) if err != nil { - return fmt.Errorf("rename tmpfile: %w", err) + // rename only works for files within the same filesystem + // if this fails fallback to copy + return backend.MoveFile(tempname, objPath, fs.FileMode(defaultFilePerm)) } return nil diff --git a/backend/posix/without_otmpfile.go b/backend/posix/without_otmpfile.go index 2310394..d19b549 100644 --- a/backend/posix/without_otmpfile.go +++ b/backend/posix/without_otmpfile.go @@ -38,7 +38,7 @@ type tmpfile struct { size int64 } -func (p *Posix) openTmpFile(dir, bucket, obj string, size int64, acct auth.Account, _ bool) (*tmpfile, error) { +func (p *Posix) openTmpFile(dir, bucket, obj string, size int64, acct auth.Account, _ bool, _ bool) (*tmpfile, error) { uid, gid, doChown := p.getChownIDs(acct) // Create a temp file for upload while in progress (see link comments below). @@ -80,31 +80,17 @@ func (tmp *tmpfile) link() error { // this will no longer exist defer os.Remove(tempname) - // We use Rename as the atomic operation for object puts. The upload is - // written to a temp file to not conflict with any other simultaneous - // uploads. The final operation is to move the temp file into place for - // the object. This ensures the object semantics of last upload completed - // wins and is not some combination of writes from simultaneous uploads. 
objPath := filepath.Join(tmp.bucket, tmp.objname) - err := os.Remove(objPath) - if err != nil && !errors.Is(err, fs.ErrNotExist) { - return fmt.Errorf("remove stale path: %w", err) - } // reset default file mode because CreateTemp uses 0600 tmp.f.Chmod(defaultFilePerm) - err = tmp.f.Close() + err := tmp.f.Close() if err != nil { return fmt.Errorf("close tmpfile: %w", err) } - err = os.Rename(tempname, objPath) - if err != nil { - return fmt.Errorf("rename tmpfile: %w", err) - } - - return nil + return backend.MoveFile(tempname, objPath, defaultFilePerm) } func (tmp *tmpfile) Write(b []byte) (int, error) { diff --git a/cmd/versitygw/main.go b/cmd/versitygw/main.go index b501c8c..e0bc39d 100644 --- a/cmd/versitygw/main.go +++ b/cmd/versitygw/main.go @@ -524,19 +524,19 @@ func initFlags() []cli.Flag { }, &cli.StringFlag{ Name: "ipa-user", - Usage: "Username used to connect to FreeIPA. Needs permissions to read user vault contents", + Usage: "Username used to connect to FreeIPA (requires permissions to read user vault contents)", EnvVars: []string{"VGW_IPA_USER"}, Destination: &ipaUser, }, &cli.StringFlag{ Name: "ipa-password", - Usage: "Password of the user used to connect to FreeIPA.", + Usage: "Password of the user used to connect to FreeIPA", EnvVars: []string{"VGW_IPA_PASSWORD"}, Destination: &ipaPassword, }, &cli.BoolFlag{ Name: "ipa-insecure", - Usage: "Verify TLS certificate of FreeIPA server. 
Default is 'true'.", + Usage: "Disable verify TLS certificate of FreeIPA server", EnvVars: []string{"VGW_IPA_INSECURE"}, Destination: &ipaInsecure, }, diff --git a/cmd/versitygw/posix.go b/cmd/versitygw/posix.go index 79d8fc0..b557ea3 100644 --- a/cmd/versitygw/posix.go +++ b/cmd/versitygw/posix.go @@ -31,6 +31,7 @@ var ( dirPerms uint sidecar string nometa bool + forceNoTmpFile bool ) func posixCommand() *cli.Command { @@ -93,6 +94,12 @@ will be translated into the file /mnt/fs/gwroot/mybucket/a/b/c/myobject`, EnvVars: []string{"VGW_META_NONE"}, Destination: &nometa, }, + &cli.BoolFlag{ + Name: "disableotmp", + Usage: "disable O_TMPFILE support for new objects", + EnvVars: []string{"VGW_DISABLE_OTMP"}, + Destination: &forceNoTmpFile, + }, }, } } @@ -113,11 +120,12 @@ func runPosix(ctx *cli.Context) error { } opts := posix.PosixOpts{ - ChownUID: chownuid, - ChownGID: chowngid, - BucketLinks: bucketlinks, - VersioningDir: versioningDir, - NewDirPerm: fs.FileMode(dirPerms), + ChownUID: chownuid, + ChownGID: chowngid, + BucketLinks: bucketlinks, + VersioningDir: versioningDir, + NewDirPerm: fs.FileMode(dirPerms), + ForceNoTmpFile: forceNoTmpFile, } var ms meta.MetadataStorer diff --git a/extra/example.conf b/extra/example.conf index 2ab7b91..c7ac7ea 100644 --- a/extra/example.conf +++ b/extra/example.conf @@ -99,6 +99,10 @@ ROOT_SECRET_ACCESS_KEY= # endpoint is unauthenticated, and returns a 200 status for GET. #VGW_HEALTH= +# Enable VGW_READ_ONLY to only allow read operations to the S3 server. No write +# operations will be allowed. +#VGW_READ_ONLY=false + ############### # Access Logs # ############### @@ -240,6 +244,24 @@ ROOT_SECRET_ACCESS_KEY= #VGW_IAM_LDAP_USER_ID_ATR= #VGW_IAM_LDAP_GROUP_ID_ATR= +# The FreeIPA options will enable the FreeIPA IAM service with accounts stored +# in an external FreeIPA service. Currently the FreeIPA IAM service only +# supports account retrieval. Creating and modifying accounts must be done +# outside of the versitygw service. 
+# FreeIPA server URL e.g. https://ipa.example.test +#VGW_IPA_HOST= +# Name of the user vault containing their secret +#VGW_IPA_VAULT_NAME= +# Username used to connect to FreeIPA (requires permissions to read user vault +# contents) +#VGW_IPA_USER= +# Password of the user used to connect to FreeIPA +#VGW_IPA_PASSWORD= +# Disable TLS certificate verification for the FreeIPA server +#VGW_IPA_INSECURE=false +# FreeIPA IAM debug output +#VGW_IPA_DEBUG=false + ############### # IAM caching # ############### @@ -317,6 +339,40 @@ ROOT_SECRET_ACCESS_KEY= # as any parent directories automatically created with object uploads. #VGW_DIR_PERMS=0755 +# To enable object versions, the VGW_VERSIONING_DIR option must be set to the +# directory that will be used to store the object versions. The version +# directory must NOT be a subdirectory of the VGW_BACKEND_ARG directory. +#VGW_VERSIONING_DIR= + +# The gateway uses xattrs to store metadata for objects by default. For systems +# that do not support xattrs, the VGW_META_SIDECAR option can be set to a +# directory that will be used to store the metadata for objects. This is +# currently experimental, and may have issues for some edge cases. +#VGW_META_SIDECAR= + +# The VGW_META_NONE option will disable the metadata functionality for the +# gateway. This will cause the gateway to not store any metadata for objects +# or buckets. This includes bucket ACLs and Policy. This may be useful for +# read only access to pre-existing data where the gateway should not modify +# the data. It is recommended to enable VGW_READ_ONLY (Global Options) along +# with this. +#VGW_META_NONE=false + +# The gateway will use O_TMPFILE for writing objects while uploading and +# link the file to the final object name when the upload is complete if the +# filesystem supports O_TMPFILE. This creates an atomic object creation +# that is not visible to other clients or racing uploads until the upload +# is complete. 
This will not work if there is a different filesystem mounted +# below the bucket level than where the bucket resides. The VGW_DISABLE_OTMP +# option can be set to true to disable this functionality and force the fallback +# mode that is otherwise used only when O_TMPFILE is not available. This fallback will create a temporary +# file in the bucket directory and rename it to the final object name when +# the upload is complete if the final location is in the same filesystem, or +# copy the file to the final location if the final location is in a different +# filesystem. This fallback mode is still atomic, but may be less efficient +# than O_TMPFILE when the data needs to be copied into the final location. +#VGW_DISABLE_OTMP=false + ########### # scoutfs # ###########