mirror of
https://github.com/versity/versitygw.git
synced 2026-01-03 18:44:04 +00:00
The complete multipart upload can be optimized in some cases to not need to copy the full data from parts to the final object file. If the filesystem supports it, there can be optimizations to just clone extent references and not have to actually copy the data. For io.Copy() to make use of copy_file_range, we have to pass it the *os.File file handles from the filesystem for the source and destination. We were previously adding a layer of indirection that was causing io.Copy() to fall back to a full data copy. This fixes the copy by exposing the underlying fd. This also skips the falloc for the final object in complete multipart upload, because some filesystems will be able to use copy_file_range to optimize the copy and may not even need new data allocations in the final object file. Note that this only affects posix mode, as scoutfs has special handling for this case that is similar to, but not compatible with, copy_file_range, using a special ioctl.
228 lines
5.5 KiB
Go
228 lines
5.5 KiB
Go
// Copyright 2023 Versity Software
|
|
// This file is licensed under the Apache License, Version 2.0
|
|
// (the "License"); you may not use this file except in compliance
|
|
// with the License. You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing,
|
|
// software distributed under the License is distributed on an
|
|
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
// KIND, either express or implied. See the License for the
|
|
// specific language governing permissions and limitations
|
|
// under the License.
|
|
|
|
//go:build linux
|
|
// +build linux
|
|
|
|
package posix
|
|
|
|
import (
|
|
"crypto/sha256"
|
|
"errors"
|
|
"fmt"
|
|
"io/fs"
|
|
"os"
|
|
"path/filepath"
|
|
"strconv"
|
|
"syscall"
|
|
|
|
"github.com/versity/versitygw/auth"
|
|
"github.com/versity/versitygw/backend"
|
|
"golang.org/x/sys/unix"
|
|
)
|
|
|
|
// procfddir is the directory holding the per-descriptor entries of the
// current process. An O_TMPFILE handle is named by its entry in this
// directory so that it can later be linked into the namespace.
const procfddir = "/proc/self/fd"
|
|
|
|
// tmpfile is an in-progress upload: an open temporary file together with
// the information needed to move it into place as the final object.
type tmpfile struct {
	// f is the open handle of the temporary file.
	f *os.File
	// bucket and objname are joined to form the final object path.
	bucket, objname string
	// isOTmp is true when f was opened with O_TMPFILE (an unnamed file)
	// rather than created by the os.CreateTemp fallback.
	isOTmp bool
	// size is the number of bytes that may still be written through Write;
	// it is also the length passed to fallocate by falloc.
	size int64
	// needsChown, uid and gid are handed to backend.MkdirAll when the
	// parent directories of the object are created.
	needsChown bool
	uid, gid   int
}
|
|
|
|
// defaultFilePerm is the permission mode applied to newly created object
// files.
// TODO: make this configurable
var defaultFilePerm uint32 = 0644
|
|
|
|
func (p *Posix) openTmpFile(dir, bucket, obj string, size int64, acct auth.Account, dofalloc bool) (*tmpfile, error) {
|
|
uid, gid, doChown := p.getChownIDs(acct)
|
|
|
|
// O_TMPFILE allows for a file handle to an unnamed file in the filesystem.
|
|
// This can help reduce contention within the namespace (parent directories),
|
|
// etc. And will auto cleanup the inode on close if we never link this
|
|
// file descriptor into the namespace.
|
|
// Not all filesystems support this, so fallback to CreateTemp for when
|
|
// this is not supported.
|
|
fd, err := unix.Open(dir, unix.O_RDWR|unix.O_TMPFILE|unix.O_CLOEXEC, defaultFilePerm)
|
|
if err != nil {
|
|
// O_TMPFILE not supported, try fallback
|
|
err = backend.MkdirAll(dir, uid, gid, doChown)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("make temp dir: %w", err)
|
|
}
|
|
f, err := os.CreateTemp(dir,
|
|
fmt.Sprintf("%x.", sha256.Sum256([]byte(obj))))
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
tmp := &tmpfile{
|
|
f: f,
|
|
bucket: bucket,
|
|
objname: obj,
|
|
size: size,
|
|
needsChown: doChown,
|
|
uid: uid,
|
|
gid: gid,
|
|
}
|
|
// falloc is best effort, its fine if this fails
|
|
if size > 0 && dofalloc {
|
|
tmp.falloc()
|
|
}
|
|
|
|
if doChown {
|
|
err := f.Chown(uid, gid)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("set temp file ownership: %w", err)
|
|
}
|
|
}
|
|
|
|
return tmp, nil
|
|
}
|
|
|
|
// for O_TMPFILE, filename is /proc/self/fd/<fd> to be used
|
|
// later to link file into namespace
|
|
f := os.NewFile(uintptr(fd), filepath.Join(procfddir, strconv.Itoa(fd)))
|
|
|
|
tmp := &tmpfile{
|
|
f: f,
|
|
bucket: bucket,
|
|
objname: obj,
|
|
isOTmp: true,
|
|
size: size,
|
|
needsChown: doChown,
|
|
uid: uid,
|
|
gid: gid,
|
|
}
|
|
|
|
// falloc is best effort, its fine if this fails
|
|
if size > 0 && dofalloc {
|
|
tmp.falloc()
|
|
}
|
|
|
|
if doChown {
|
|
err := f.Chown(uid, gid)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("set temp file ownership: %w", err)
|
|
}
|
|
}
|
|
|
|
return tmp, nil
|
|
}
|
|
|
|
func (tmp *tmpfile) falloc() error {
|
|
err := syscall.Fallocate(int(tmp.f.Fd()), 0, 0, tmp.size)
|
|
if err != nil {
|
|
return fmt.Errorf("fallocate: %v", err)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func (tmp *tmpfile) link() error {
|
|
// We use Linkat/Rename as the atomic operation for object puts. The
|
|
// upload is written to a temp (or unnamed/O_TMPFILE) file to not conflict
|
|
// with any other simultaneous uploads. The final operation is to move the
|
|
// temp file into place for the object. This ensures the object semantics
|
|
// of last upload completed wins and is not some combination of writes
|
|
// from simultaneous uploads.
|
|
objPath := filepath.Join(tmp.bucket, tmp.objname)
|
|
err := os.Remove(objPath)
|
|
if err != nil && !errors.Is(err, fs.ErrNotExist) {
|
|
return fmt.Errorf("remove stale path: %w", err)
|
|
}
|
|
|
|
dir := filepath.Dir(objPath)
|
|
|
|
err = backend.MkdirAll(dir, tmp.uid, tmp.gid, tmp.needsChown)
|
|
if err != nil {
|
|
return fmt.Errorf("make parent dir: %w", err)
|
|
}
|
|
|
|
if !tmp.isOTmp {
|
|
// O_TMPFILE not suported, use fallback
|
|
return tmp.fallbackLink()
|
|
}
|
|
|
|
procdir, err := os.Open(procfddir)
|
|
if err != nil {
|
|
return fmt.Errorf("open proc dir: %w", err)
|
|
}
|
|
defer procdir.Close()
|
|
|
|
dirf, err := os.Open(dir)
|
|
if err != nil {
|
|
return fmt.Errorf("open parent dir: %w", err)
|
|
}
|
|
defer dirf.Close()
|
|
|
|
err = unix.Linkat(int(procdir.Fd()), filepath.Base(tmp.f.Name()),
|
|
int(dirf.Fd()), filepath.Base(objPath), unix.AT_SYMLINK_FOLLOW)
|
|
if err != nil {
|
|
return fmt.Errorf("link tmpfile (%q in %q): %w",
|
|
filepath.Dir(objPath), filepath.Base(tmp.f.Name()), err)
|
|
}
|
|
|
|
err = tmp.f.Close()
|
|
if err != nil {
|
|
return fmt.Errorf("close tmpfile: %w", err)
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func (tmp *tmpfile) fallbackLink() error {
|
|
tempname := tmp.f.Name()
|
|
// cleanup in case anything goes wrong, if rename succeeds then
|
|
// this will no longer exist
|
|
defer os.Remove(tempname)
|
|
|
|
// reset default file mode because CreateTemp uses 0600
|
|
tmp.f.Chmod(fs.FileMode(defaultFilePerm))
|
|
|
|
err := tmp.f.Close()
|
|
if err != nil {
|
|
return fmt.Errorf("close tmpfile: %w", err)
|
|
}
|
|
|
|
objPath := filepath.Join(tmp.bucket, tmp.objname)
|
|
err = os.Rename(tempname, objPath)
|
|
if err != nil {
|
|
return fmt.Errorf("rename tmpfile: %w", err)
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func (tmp *tmpfile) Write(b []byte) (int, error) {
|
|
if int64(len(b)) > tmp.size {
|
|
return 0, fmt.Errorf("write exceeds content length %v", tmp.size)
|
|
}
|
|
|
|
n, err := tmp.f.Write(b)
|
|
tmp.size -= int64(n)
|
|
return n, err
|
|
}
|
|
|
|
func (tmp *tmpfile) cleanup() {
|
|
tmp.f.Close()
|
|
}
|
|
|
|
func (tmp *tmpfile) File() *os.File {
|
|
return tmp.f
|
|
}
|