Files
versitygw/backend/posix/with_otmpfile.go
Ben McClelland 190dd8853c fix: expose posix tmpfile fd to enable copy_file_range
The complete multipart upload can be optimized in some cases
to not need to copy the full data from parts to the final
object file. If the filesystem supports it, there can be
optimizations to just clone exent references and not have to
actually copy the data.

For io.Copy() to make use of file_copy_range, we have to pass it
the *os.File file handles from the filesystem for the source and
destination. We were previously adding a layer of indirection
that was causing io.Copy() to fallback to full data copy. This
fixes the copy by exposing the underlying fd.

This also skips the falloc for the final object in complete
mutlipart upload, because some filesystems will be able to use
file_copy_range to optimize the copy and may not even need
new data allocations in the final object file.

Note that this only affects posix mode as scoutfs has special
handling for this case that is similar to but not compatible
with copy_file_range using a special ioctl.
2024-05-21 16:16:00 -07:00

228 lines
5.5 KiB
Go

// Copyright 2023 Versity Software
// This file is licensed under the Apache License, Version 2.0
// (the "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
//go:build linux
// +build linux
package posix
import (
"crypto/sha256"
"errors"
"fmt"
"io/fs"
"os"
"path/filepath"
"strconv"
"syscall"
"github.com/versity/versitygw/auth"
"github.com/versity/versitygw/backend"
"golang.org/x/sys/unix"
)
const procfddir = "/proc/self/fd"
type tmpfile struct {
f *os.File
bucket string
objname string
isOTmp bool
size int64
needsChown bool
uid int
gid int
}
var (
// TODO: make this configurable
defaultFilePerm uint32 = 0644
)
func (p *Posix) openTmpFile(dir, bucket, obj string, size int64, acct auth.Account, dofalloc bool) (*tmpfile, error) {
uid, gid, doChown := p.getChownIDs(acct)
// O_TMPFILE allows for a file handle to an unnamed file in the filesystem.
// This can help reduce contention within the namespace (parent directories),
// etc. And will auto cleanup the inode on close if we never link this
// file descriptor into the namespace.
// Not all filesystems support this, so fallback to CreateTemp for when
// this is not supported.
fd, err := unix.Open(dir, unix.O_RDWR|unix.O_TMPFILE|unix.O_CLOEXEC, defaultFilePerm)
if err != nil {
// O_TMPFILE not supported, try fallback
err = backend.MkdirAll(dir, uid, gid, doChown)
if err != nil {
return nil, fmt.Errorf("make temp dir: %w", err)
}
f, err := os.CreateTemp(dir,
fmt.Sprintf("%x.", sha256.Sum256([]byte(obj))))
if err != nil {
return nil, err
}
tmp := &tmpfile{
f: f,
bucket: bucket,
objname: obj,
size: size,
needsChown: doChown,
uid: uid,
gid: gid,
}
// falloc is best effort, its fine if this fails
if size > 0 && dofalloc {
tmp.falloc()
}
if doChown {
err := f.Chown(uid, gid)
if err != nil {
return nil, fmt.Errorf("set temp file ownership: %w", err)
}
}
return tmp, nil
}
// for O_TMPFILE, filename is /proc/self/fd/<fd> to be used
// later to link file into namespace
f := os.NewFile(uintptr(fd), filepath.Join(procfddir, strconv.Itoa(fd)))
tmp := &tmpfile{
f: f,
bucket: bucket,
objname: obj,
isOTmp: true,
size: size,
needsChown: doChown,
uid: uid,
gid: gid,
}
// falloc is best effort, its fine if this fails
if size > 0 && dofalloc {
tmp.falloc()
}
if doChown {
err := f.Chown(uid, gid)
if err != nil {
return nil, fmt.Errorf("set temp file ownership: %w", err)
}
}
return tmp, nil
}
func (tmp *tmpfile) falloc() error {
err := syscall.Fallocate(int(tmp.f.Fd()), 0, 0, tmp.size)
if err != nil {
return fmt.Errorf("fallocate: %v", err)
}
return nil
}
func (tmp *tmpfile) link() error {
// We use Linkat/Rename as the atomic operation for object puts. The
// upload is written to a temp (or unnamed/O_TMPFILE) file to not conflict
// with any other simultaneous uploads. The final operation is to move the
// temp file into place for the object. This ensures the object semantics
// of last upload completed wins and is not some combination of writes
// from simultaneous uploads.
objPath := filepath.Join(tmp.bucket, tmp.objname)
err := os.Remove(objPath)
if err != nil && !errors.Is(err, fs.ErrNotExist) {
return fmt.Errorf("remove stale path: %w", err)
}
dir := filepath.Dir(objPath)
err = backend.MkdirAll(dir, tmp.uid, tmp.gid, tmp.needsChown)
if err != nil {
return fmt.Errorf("make parent dir: %w", err)
}
if !tmp.isOTmp {
// O_TMPFILE not suported, use fallback
return tmp.fallbackLink()
}
procdir, err := os.Open(procfddir)
if err != nil {
return fmt.Errorf("open proc dir: %w", err)
}
defer procdir.Close()
dirf, err := os.Open(dir)
if err != nil {
return fmt.Errorf("open parent dir: %w", err)
}
defer dirf.Close()
err = unix.Linkat(int(procdir.Fd()), filepath.Base(tmp.f.Name()),
int(dirf.Fd()), filepath.Base(objPath), unix.AT_SYMLINK_FOLLOW)
if err != nil {
return fmt.Errorf("link tmpfile (%q in %q): %w",
filepath.Dir(objPath), filepath.Base(tmp.f.Name()), err)
}
err = tmp.f.Close()
if err != nil {
return fmt.Errorf("close tmpfile: %w", err)
}
return nil
}
func (tmp *tmpfile) fallbackLink() error {
tempname := tmp.f.Name()
// cleanup in case anything goes wrong, if rename succeeds then
// this will no longer exist
defer os.Remove(tempname)
// reset default file mode because CreateTemp uses 0600
tmp.f.Chmod(fs.FileMode(defaultFilePerm))
err := tmp.f.Close()
if err != nil {
return fmt.Errorf("close tmpfile: %w", err)
}
objPath := filepath.Join(tmp.bucket, tmp.objname)
err = os.Rename(tempname, objPath)
if err != nil {
return fmt.Errorf("rename tmpfile: %w", err)
}
return nil
}
func (tmp *tmpfile) Write(b []byte) (int, error) {
if int64(len(b)) > tmp.size {
return 0, fmt.Errorf("write exceeds content length %v", tmp.size)
}
n, err := tmp.f.Write(b)
tmp.size -= int64(n)
return n, err
}
func (tmp *tmpfile) cleanup() {
tmp.f.Close()
}
func (tmp *tmpfile) File() *os.File {
return tmp.f
}