feat: scoutfs backend with move blocks multipart optimized

This commit is contained in:
Ben McClelland
2023-06-15 10:45:31 -07:00
parent 3ba5f21f51
commit 09e8889e75
10 changed files with 636 additions and 28 deletions

View File

@@ -15,7 +15,10 @@
package backend
import (
"crypto/md5"
"encoding/hex"
"errors"
"fmt"
"io/fs"
"strconv"
"strings"
@@ -84,3 +87,25 @@ func ParseRange(file fs.FileInfo, acceptRange string) (int64, int64, error) {
return int64(startOffset), int64(endOffset - startOffset + 1), nil
}
func GetMultipartMD5(parts []types.Part) string {
var partsEtagBytes []byte
for _, part := range parts {
partsEtagBytes = append(partsEtagBytes, getEtagBytes(*part.ETag)...)
}
s3MD5 := fmt.Sprintf("%s-%d", md5String(partsEtagBytes), len(parts))
return s3MD5
}
func getEtagBytes(etag string) []byte {
decode, err := hex.DecodeString(strings.ReplaceAll(etag, string('"'), ""))
if err != nil {
return []byte(etag)
}
return decode
}
func md5String(data []byte) string {
sum := md5.Sum(data)
return hex.EncodeToString(sum[:])
}

View File

@@ -74,7 +74,7 @@ func (p *Posix) Shutdown() {
p.rootfd.Close()
}
func (p *Posix) Sring() string {
func (p *Posix) String() string {
return "Posix Gateway"
}
@@ -317,7 +317,7 @@ func (p *Posix) CompleteMultipartUpload(bucket, object, uploadID string, parts [
}
// Calculate s3 compatible md5sum for complete multipart.
s3MD5 := getMultipartMD5(parts)
s3MD5 := backend.GetMultipartMD5(parts)
err = xattr.Set(objname, "user.etag", []byte(s3MD5))
if err != nil {
@@ -404,8 +404,8 @@ func isValidMeta(val string) bool {
return false
}
// mkdirAll is similar to os.MkdirAll but it will also set uid/gid when
// making new directories
// mkdirAll is similar to os.MkdirAll but it will return ErrObjectParentIsFile
// when appropriate
func mkdirAll(path string, perm os.FileMode, bucket, object string) error {
// Fast path: if we can tell whether path is a directory or file, stop with success or error.
dir, err := os.Stat(path)
@@ -413,7 +413,7 @@ func mkdirAll(path string, perm os.FileMode, bucket, object string) error {
if dir.IsDir() {
return nil
}
return &os.PathError{Op: "mkdir", Path: path, Err: syscall.ENOTDIR}
return s3err.GetAPIError(s3err.ErrObjectParentIsFile)
}
// Slow path: make sure parent exists and then call Mkdir for path.
@@ -449,28 +449,6 @@ func mkdirAll(path string, perm os.FileMode, bucket, object string) error {
return nil
}
func getMultipartMD5(parts []types.Part) string {
var partsEtagBytes []byte
for _, part := range parts {
partsEtagBytes = append(partsEtagBytes, getEtagBytes(*part.ETag)...)
}
s3MD5 := fmt.Sprintf("%s-%d", md5String(partsEtagBytes), len(parts))
return s3MD5
}
func getEtagBytes(etag string) []byte {
decode, err := hex.DecodeString(strings.ReplaceAll(etag, string('"'), ""))
if err != nil {
return []byte(etag)
}
return decode
}
func md5String(data []byte) string {
sum := md5.Sum(data)
return hex.EncodeToString(sum[:])
}
func (p *Posix) AbortMultipartUpload(mpu *s3.AbortMultipartUploadInput) error {
bucket := *mpu.Bucket
object := *mpu.Key

View File

@@ -15,12 +15,282 @@
package scoutfs
import (
"crypto/sha256"
"errors"
"fmt"
"io/fs"
"os"
"path/filepath"
"strings"
"syscall"
"github.com/aws/aws-sdk-go-v2/service/s3"
"github.com/aws/aws-sdk-go-v2/service/s3/types"
"github.com/pkg/xattr"
"github.com/versity/scoutfs-go"
"github.com/versity/versitygw/backend"
"github.com/versity/versitygw/backend/posix"
"github.com/versity/versitygw/s3err"
)
type ScoutFS struct {
*posix.Posix
rootfd *os.File
rootdir string
}
var _ backend.Backend = ScoutFS{}
var _ backend.Backend = &ScoutFS{}
const (
metaTmpDir = ".sgwtmp"
metaTmpMultipartDir = metaTmpDir + "/multipart"
onameAttr = "user.objname"
tagHdr = "X-Amz-Tagging"
emptyMD5 = "d41d8cd98f00b204e9800998ecf8427e"
)
func (s *ScoutFS) Shutdown() {
s.Posix.Shutdown()
s.rootfd.Close()
_ = s.rootdir
}
func (*ScoutFS) String() string {
return "ScoutFS Gateway"
}
// CompleteMultipartUpload scoutfs complete upload uses scoutfs move blocks
// ioctl to not have to read and copy the part data to the final object. This
// saves a read and write cycle for all mutlipart uploads.
func (p *ScoutFS) CompleteMultipartUpload(bucket, object, uploadID string, parts []types.Part) (*s3.CompleteMultipartUploadOutput, error) {
_, err := os.Stat(bucket)
if errors.Is(err, fs.ErrNotExist) {
return nil, s3err.GetAPIError(s3err.ErrNoSuchBucket)
}
if err != nil {
return nil, fmt.Errorf("stat bucket: %w", err)
}
sum, err := p.checkUploadIDExists(bucket, object, uploadID)
if err != nil {
return nil, err
}
objdir := filepath.Join(bucket, metaTmpMultipartDir, fmt.Sprintf("%x", sum))
// check all parts ok
last := len(parts) - 1
partsize := int64(0)
var totalsize int64
for i, p := range parts {
partPath := filepath.Join(objdir, uploadID, fmt.Sprintf("%v", p.PartNumber))
fi, err := os.Lstat(partPath)
if err != nil {
return nil, s3err.GetAPIError(s3err.ErrInvalidPart)
}
if i == 0 {
partsize = fi.Size()
}
totalsize += fi.Size()
// all parts except the last need to be the same size
if i < last && partsize != fi.Size() {
return nil, s3err.GetAPIError(s3err.ErrInvalidPart)
}
// non-last part sizes need to be multiples of 4k for move blocks
// TODO: fallback to no move blocks if not 4k aligned?
if i == 0 && i < last && fi.Size()%4096 != 0 {
return nil, s3err.GetAPIError(s3err.ErrInvalidPart)
}
b, err := xattr.Get(partPath, "user.etag")
etag := string(b)
if err != nil {
etag = ""
}
parts[i].ETag = &etag
}
// use totalsize=0 because we wont be writing to the file, only moving
// extents around. so we dont want to fallocate this.
f, err := openTmpFile(filepath.Join(bucket, metaTmpDir), bucket, object, 0)
if err != nil {
return nil, fmt.Errorf("open temp file: %w", err)
}
defer f.cleanup()
for _, p := range parts {
pf, err := os.Open(filepath.Join(objdir, uploadID, fmt.Sprintf("%v", p.PartNumber)))
if err != nil {
return nil, fmt.Errorf("open part %v: %v", p.PartNumber, err)
}
// scoutfs move data is a metadata only operation that moves the data
// extent references from the source, appeding to the destination.
// this needs to be 4k aligned.
err = scoutfs.MoveData(pf, f.f)
pf.Close()
if err != nil {
return nil, fmt.Errorf("move blocks part %v: %v", p.PartNumber, err)
}
}
userMetaData := make(map[string]string)
upiddir := filepath.Join(objdir, uploadID)
loadUserMetaData(upiddir, userMetaData)
objname := filepath.Join(bucket, object)
dir := filepath.Dir(objname)
if dir != "" {
if err = mkdirAll(dir, os.FileMode(0755), bucket, object); err != nil {
if err != nil {
return nil, s3err.GetAPIError(s3err.ErrExistingObjectIsDirectory)
}
}
}
err = f.link()
if err != nil {
return nil, fmt.Errorf("link object in namespace: %w", err)
}
for k, v := range userMetaData {
err = xattr.Set(objname, "user."+k, []byte(v))
if err != nil {
// cleanup object if returning error
os.Remove(objname)
return nil, fmt.Errorf("set user attr %q: %w", k, err)
}
}
// Calculate s3 compatible md5sum for complete multipart.
s3MD5 := backend.GetMultipartMD5(parts)
err = xattr.Set(objname, "user.etag", []byte(s3MD5))
if err != nil {
// cleanup object if returning error
os.Remove(objname)
return nil, fmt.Errorf("set etag attr: %w", err)
}
// cleanup tmp dirs
os.RemoveAll(upiddir)
// use Remove for objdir in case there are still other uploads
// for same object name outstanding
os.Remove(objdir)
return &s3.CompleteMultipartUploadOutput{
Bucket: &bucket,
ETag: &s3MD5,
Key: &object,
}, nil
}
func (p *ScoutFS) checkUploadIDExists(bucket, object, uploadID string) ([32]byte, error) {
sum := sha256.Sum256([]byte(object))
objdir := filepath.Join(bucket, metaTmpMultipartDir, fmt.Sprintf("%x", sum))
_, err := os.Stat(filepath.Join(objdir, uploadID))
if errors.Is(err, fs.ErrNotExist) {
return [32]byte{}, s3err.GetAPIError(s3err.ErrNoSuchUpload)
}
if err != nil {
return [32]byte{}, fmt.Errorf("stat upload: %w", err)
}
return sum, nil
}
func loadUserMetaData(path string, m map[string]string) (contentType, contentEncoding string) {
ents, err := xattr.List(path)
if err != nil || len(ents) == 0 {
return
}
for _, e := range ents {
if !isValidMeta(e) {
continue
}
b, err := xattr.Get(path, e)
if err == syscall.ENODATA {
m[strings.TrimPrefix(e, "user.")] = ""
continue
}
if err != nil {
continue
}
m[strings.TrimPrefix(e, "user.")] = string(b)
}
b, err := xattr.Get(path, "user.content-type")
contentType = string(b)
if err != nil {
contentType = ""
}
if contentType != "" {
m["content-type"] = contentType
}
b, err = xattr.Get(path, "user.content-encoding")
contentEncoding = string(b)
if err != nil {
contentEncoding = ""
}
if contentEncoding != "" {
m["content-encoding"] = contentEncoding
}
return
}
func isValidMeta(val string) bool {
if strings.HasPrefix(val, "user.X-Amz-Meta") {
return true
}
if strings.EqualFold(val, "user.Expires") {
return true
}
return false
}
// mkdirAll is similar to os.MkdirAll but it will return ErrObjectParentIsFile
// when appropriate
func mkdirAll(path string, perm os.FileMode, bucket, object string) error {
// Fast path: if we can tell whether path is a directory or file, stop with success or error.
dir, err := os.Stat(path)
if err == nil {
if dir.IsDir() {
return nil
}
return s3err.GetAPIError(s3err.ErrObjectParentIsFile)
}
// Slow path: make sure parent exists and then call Mkdir for path.
i := len(path)
for i > 0 && os.IsPathSeparator(path[i-1]) { // Skip trailing path separator.
i--
}
j := i
for j > 0 && !os.IsPathSeparator(path[j-1]) { // Scan backward over element.
j--
}
if j > 1 {
// Create parent.
err = mkdirAll(path[:j-1], perm, bucket, object)
if err != nil {
return err
}
}
// Parent now exists; invoke Mkdir and use its result.
err = os.Mkdir(path, perm)
if err != nil {
// Handle arguments like "foo/." by
// double-checking that directory doesn't exist.
dir, err1 := os.Lstat(path)
if err1 == nil && dir.IsDir() {
return nil
}
return s3err.GetAPIError(s3err.ErrObjectParentIsFile)
}
return nil
}

View File

@@ -0,0 +1,48 @@
// Copyright 2023 Versity Software
// This file is licensed under the Apache License, Version 2.0
// (the "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
package scoutfs
import (
"errors"
"fmt"
"os"
)
func New(rootdir string) (*ScoutFS, error) {
return nil, fmt.Errorf("scoutfs only available on linux")
}
type tmpfile struct {
f *os.File
}
var (
errNotSupported = errors.New("not supported")
)
func openTmpFile(dir, bucket, obj string, size int64) (*tmpfile, error) {
return nil, errNotSupported
}
func (tmp *tmpfile) link() error {
return errNotSupported
}
func (tmp *tmpfile) Write(b []byte) (int, error) {
return 0, errNotSupported
}
func (tmp *tmpfile) cleanup() {
}

View File

@@ -0,0 +1,179 @@
// Copyright 2023 Versity Software
// This file is licensed under the Apache License, Version 2.0
// (the "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
package scoutfs
import (
"crypto/sha256"
"errors"
"fmt"
"io/fs"
"os"
"path/filepath"
"strconv"
"syscall"
"golang.org/x/sys/unix"
"github.com/versity/versitygw/backend/posix"
)
func New(rootdir string) (*ScoutFS, error) {
p, err := posix.New(rootdir)
if err != nil {
return nil, err
}
f, err := os.Open(rootdir)
if err != nil {
return nil, fmt.Errorf("open %v: %w", rootdir, err)
}
return &ScoutFS{Posix: p, rootfd: f, rootdir: rootdir}, nil
}
const procfddir = "/proc/self/fd"
type tmpfile struct {
f *os.File
bucket string
objname string
isOTmp bool
size int64
}
func openTmpFile(dir, bucket, obj string, size int64) (*tmpfile, error) {
// O_TMPFILE allows for a file handle to an unnamed file in the filesystem.
// This can help reduce contention within the namespace (parent directories),
// etc. And will auto cleanup the inode on close if we never link this
// file descriptor into the namespace.
// Not all filesystems support this, so fallback to CreateTemp for when
// this is not supported.
fd, err := unix.Open(dir, unix.O_RDWR|unix.O_TMPFILE|unix.O_CLOEXEC, 0666)
if err != nil {
// O_TMPFILE not supported, try fallback
err := os.MkdirAll(dir, 0700)
if err != nil {
return nil, fmt.Errorf("make temp dir: %w", err)
}
f, err := os.CreateTemp(dir,
fmt.Sprintf("%x.", sha256.Sum256([]byte(obj))))
if err != nil {
return nil, err
}
tmp := &tmpfile{f: f, bucket: bucket, objname: obj, size: size}
// falloc is best effort, its fine if this fails
if size > 0 {
tmp.falloc()
}
return tmp, nil
}
// for O_TMPFILE, filename is /proc/self/fd/<fd> to be used
// later to link file into namespace
f := os.NewFile(uintptr(fd), filepath.Join(procfddir, strconv.Itoa(fd)))
tmp := &tmpfile{f: f, bucket: bucket, objname: obj, isOTmp: true, size: size}
// falloc is best effort, its fine if this fails
if size > 0 {
tmp.falloc()
}
return tmp, nil
}
func (tmp *tmpfile) falloc() error {
err := syscall.Fallocate(int(tmp.f.Fd()), 0, 0, tmp.size)
if err != nil {
return fmt.Errorf("fallocate: %v", err)
}
return nil
}
func (tmp *tmpfile) link() error {
// We use Linkat/Rename as the atomic operation for object puts. The
// upload is written to a temp (or unnamed/O_TMPFILE) file to not conflict
// with any other simultaneous uploads. The final operation is to move the
// temp file into place for the object. This ensures the object semantics
// of last upload completed wins and is not some combination of writes
// from simultaneous uploads.
objPath := filepath.Join(tmp.bucket, tmp.objname)
err := os.Remove(objPath)
if err != nil && !errors.Is(err, fs.ErrNotExist) {
return fmt.Errorf("remove stale path: %w", err)
}
if !tmp.isOTmp {
// O_TMPFILE not suported, use fallback
return tmp.fallbackLink()
}
procdir, err := os.Open(procfddir)
if err != nil {
return fmt.Errorf("open proc dir: %w", err)
}
defer procdir.Close()
dir, err := os.Open(filepath.Dir(objPath))
if err != nil {
return fmt.Errorf("open parent dir: %w", err)
}
defer dir.Close()
err = unix.Linkat(int(procdir.Fd()), filepath.Base(tmp.f.Name()),
int(dir.Fd()), filepath.Base(objPath), unix.AT_SYMLINK_FOLLOW)
if err != nil {
return fmt.Errorf("link tmpfile: %w", err)
}
err = tmp.f.Close()
if err != nil {
return fmt.Errorf("close tmpfile: %w", err)
}
return nil
}
func (tmp *tmpfile) fallbackLink() error {
tempname := tmp.f.Name()
// cleanup in case anything goes wrong, if rename succeeds then
// this will no longer exist
defer os.Remove(tempname)
err := tmp.f.Close()
if err != nil {
return fmt.Errorf("close tmpfile: %w", err)
}
objPath := filepath.Join(tmp.bucket, tmp.objname)
err = os.Rename(tempname, objPath)
if err != nil {
return fmt.Errorf("rename tmpfile: %w", err)
}
return nil
}
func (tmp *tmpfile) Write(b []byte) (int, error) {
if int64(len(b)) > tmp.size {
return 0, fmt.Errorf("write exceeds content length %v", tmp.size)
}
n, err := tmp.f.Write(b)
tmp.size -= int64(n)
return n, err
}
func (tmp *tmpfile) cleanup() {
tmp.f.Close()
}

View File

@@ -0,0 +1,48 @@
// Copyright 2023 Versity Software
// This file is licensed under the Apache License, Version 2.0
// (the "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
package scoutfs
import (
"errors"
"fmt"
"os"
)
func New(rootdir string) (*ScoutFS, error) {
return nil, fmt.Errorf("scoutfs only available on linux")
}
type tmpfile struct {
f *os.File
}
var (
errNotSupported = errors.New("not supported")
)
func openTmpFile(dir, bucket, obj string, size int64) (*tmpfile, error) {
return nil, errNotSupported
}
func (tmp *tmpfile) link() error {
return errNotSupported
}
func (tmp *tmpfile) Write(b []byte) (int, error) {
return 0, errNotSupported
}
func (tmp *tmpfile) cleanup() {
}

View File

@@ -51,6 +51,7 @@ func main() {
app.Commands = []*cli.Command{
posixCommand(),
scoutfsCommand(),
adminCommand(),
}

56
cmd/versitygw/scoutfs.go Normal file
View File

@@ -0,0 +1,56 @@
// Copyright 2023 Versity Software
// This file is licensed under the Apache License, Version 2.0
// (the "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
package main
import (
"fmt"
"github.com/urfave/cli/v2"
"github.com/versity/versitygw/backend/scoutfs"
)
func scoutfsCommand() *cli.Command {
return &cli.Command{
Name: "scoutfs",
Usage: "scoutfs filesystem storage backend",
Description: `Support for ScoutFS.
The top level directory for the gateway must be provided. All sub directories
of the top level directory are treated as buckets, and all files/directories
below the "bucket directory" are treated as the objects. The object name is
split on "/" separator to translate to posix storage.
For example:
top level: /mnt/fs/gwroot
bucket: mybucket
object: a/b/c/myobject
will be translated into the file /mnt/fs/gwroot/mybucket/a/b/c/myobject
ScoutFS contains optimizations for multipart uploads using extent
move interfaces as well as support for tiered filesystems.`,
Action: runScoutfs,
}
}
func runScoutfs(ctx *cli.Context) error {
if ctx.NArg() == 0 {
return fmt.Errorf("no directory provided for operation")
}
be, err := scoutfs.New(ctx.Args().Get(0))
if err != nil {
return fmt.Errorf("init scoutfs: %v", err)
}
return runGateway(be)
}

1
go.mod
View File

@@ -37,5 +37,6 @@ require (
github.com/tinylib/msgp v1.1.8 // indirect
github.com/valyala/bytebufferpool v1.0.0 // indirect
github.com/valyala/tcplisten v1.0.0 // indirect
github.com/versity/scoutfs-go v0.0.0-20230606232754-0474b14343b9 // indirect
github.com/xrash/smetrics v0.0.0-20201216005158-039620a65673 // indirect
)

2
go.sum
View File

@@ -70,6 +70,8 @@ github.com/valyala/fasthttp v1.47.0 h1:y7moDoxYzMooFpT5aHgNgVOQDrS3qlkfiP9mDtGGK
github.com/valyala/fasthttp v1.47.0/go.mod h1:k2zXd82h/7UZc3VOdJ2WaUqt1uZ/XpXAfE9i+HBC3lA=
github.com/valyala/tcplisten v1.0.0 h1:rBHj/Xf+E1tRGZyWIWwJDiRY0zc1Js+CV5DqwacVSA8=
github.com/valyala/tcplisten v1.0.0/go.mod h1:T0xQ8SeCZGxckz9qRXTfG43PvQ/mcWh7FwZEA7Ioqkc=
github.com/versity/scoutfs-go v0.0.0-20230606232754-0474b14343b9 h1:ZfmQR01Kk6/kQh6+zlqfBYszVY02fzf9xYrchOY4NFM=
github.com/versity/scoutfs-go v0.0.0-20230606232754-0474b14343b9/go.mod h1:gJsq73k+4685y+rbDIpPY8i/5GbsiwP6JFoFyUDB1fQ=
github.com/xrash/smetrics v0.0.0-20201216005158-039620a65673 h1:bAn7/zixMGCfxrRTfdpNzjtPYqr8smhKouy9mxVdGPU=
github.com/xrash/smetrics v0.0.0-20201216005158-039620a65673/go.mod h1:N3UwUGtsrSj3ccvlPHLoLsHnpR27oXr4ZE984MbSER8=
github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=