perf(s3): add optional shared in-memory chunk cache for GET (#9069)

Adds the -s3.cacheCapacityMB flag (default 0, disabled) that attaches
an in-memory chunk_cache.ChunkCacheInMemory to the server-wide
ReaderCache introduced in the previous commit. When enabled,
completed chunks are deposited into the shared cache as they are
downloaded, so concurrent and repeat GETs of the same object hit
memory instead of re-fetching chunks from volume servers.

When 0 (the default) the shared ReaderCache still runs — it just
attaches a nil chunk cache, so behaviour matches the previous commit
exactly. No behaviour change for clusters that don't opt in.

Disk-backed TieredChunkCache was evaluated and rejected: its
synchronous SetChunk writes regressed cold reads ~12x on loopback
because the chunk fetchers block on local disk I/O that is *slower*
than the TCP volume-server fetch it is supposed to accelerate.
Memory-only avoids that.

Flag registered in all four S3 flag sites (s3.go, server.go,
filer.go, mini.go) per the comment on command.S3Options. The chunk
size used to convert CacheSizeMB → entry count is encapsulated in
the s3ChunkCacheChunkSizeMB constant so it's easy to grep and
revisit if the filer default chunk size changes.

Measured on weed mini + 1 GiB random object over loopback, single
curl on a presigned URL:

    cacheCapacityMB=0 (off):  cold ~2900, warm ~2900 MB/s
    cacheCapacityMB=4096:     cold ~2790, warm ~5050 MB/s (+70%)
This commit is contained in:
Chris Lu
2026-04-14 09:24:35 -07:00
committed by GitHub
parent 7a7f220224
commit eaf561e86c
5 changed files with 51 additions and 4 deletions

View File

@@ -150,6 +150,7 @@ func init() {
filerS3Options.portIceberg = cmdFiler.Flag.Int("s3.port.iceberg", 8181, "Iceberg REST Catalog server listen port (0 to disable)")
filerS3Options.externalUrl = cmdFiler.Flag.String("s3.externalUrl", "", "the external URL clients use to connect (e.g. https://api.example.com:9000). Used for S3 signature verification behind a reverse proxy. Falls back to S3_EXTERNAL_URL env var.")
filerS3Options.defaultFileMode = cmdFiler.Flag.String("s3.defaultFileMode", "", "default file mode for S3 uploaded objects, e.g. 0660, 0644, 0666")
filerS3Options.cacheSizeMB = cmdFiler.Flag.Int64("s3.cacheCapacityMB", 0, "in-memory chunk cache capacity in MB for S3 GETs shared across requests (0 disables)")
// start webdav on filer
filerStartWebDav = cmdFiler.Flag.Bool("webdav", false, "whether to start webdav gateway")

View File

@@ -251,6 +251,7 @@ func initMiniS3Flags() {
miniS3Options.allowDeleteBucketNotEmpty = miniS3AllowDeleteBucketNotEmpty
miniS3Options.externalUrl = cmdMini.Flag.String("s3.externalUrl", "", "the external URL clients use to connect (e.g. https://api.example.com:9000). Used for S3 signature verification behind a reverse proxy. Falls back to S3_EXTERNAL_URL env var.")
miniS3Options.defaultFileMode = cmdMini.Flag.String("s3.defaultFileMode", "", "default file mode for S3 uploaded objects, e.g. 0660, 0644, 0666")
miniS3Options.cacheSizeMB = cmdMini.Flag.Int64("s3.cacheCapacityMB", 0, "in-memory chunk cache capacity in MB for S3 GETs shared across requests (0 disables)")
// In mini mode, S3 uses the shared debug server started at line 681, not its own separate debug server
miniS3Options.debug = new(bool) // explicitly false
miniS3Options.debugPort = cmdMini.Flag.Int("s3.debug.port", 6060, "http port for debugging (unused in mini mode)")

View File

@@ -73,6 +73,7 @@ type S3Options struct {
cipher *bool
externalUrl *string
defaultFileMode *string
cacheSizeMB *int64
}
func init() {
@@ -109,6 +110,7 @@ func init() {
s3StandaloneOptions.cipher = cmdS3.Flag.Bool("encryptVolumeData", false, "encrypt data on volume servers")
s3StandaloneOptions.externalUrl = cmdS3.Flag.String("externalUrl", "", "the external URL clients use to connect (e.g. https://api.example.com:9000). Used for S3 signature verification behind a reverse proxy. Falls back to S3_EXTERNAL_URL env var.")
s3StandaloneOptions.defaultFileMode = cmdS3.Flag.String("defaultFileMode", "", "default file mode for S3 uploaded objects, e.g. 0660, 0644, 0666")
s3StandaloneOptions.cacheSizeMB = cmdS3.Flag.Int64("cacheCapacityMB", 0, "in-memory chunk cache capacity in MB for S3 GETs shared across requests (0 disables)")
}
var cmdS3 = &Command{
@@ -343,6 +345,7 @@ func (s3opt *S3Options) startS3Server() bool {
GrpcPort: *s3opt.portGrpc,
ExternalUrl: s3opt.resolveExternalUrl(),
DefaultFileMode: defaultFileMode,
CacheSizeMB: *s3opt.cacheSizeMB,
})
if s3ApiServer_err != nil {
glog.Fatalf("S3 API Server startup error: %v", s3ApiServer_err)

View File

@@ -181,6 +181,7 @@ func init() {
s3Options.cipher = cmdServer.Flag.Bool("s3.encryptVolumeData", false, "encrypt data on volume servers for S3 uploads")
s3Options.externalUrl = cmdServer.Flag.String("s3.externalUrl", "", "the external URL clients use to connect (e.g. https://api.example.com:9000). Used for S3 signature verification behind a reverse proxy. Falls back to S3_EXTERNAL_URL env var.")
s3Options.defaultFileMode = cmdServer.Flag.String("s3.defaultFileMode", "", "default file mode for S3 uploaded objects, e.g. 0660, 0644, 0666")
s3Options.cacheSizeMB = cmdServer.Flag.Int64("s3.cacheCapacityMB", 0, "in-memory chunk cache capacity in MB for S3 GETs shared across requests (0 disables)")
sftpOptions.port = cmdServer.Flag.Int("sftp.port", 2022, "SFTP server listen port")
sftpOptions.sshPrivateKey = cmdServer.Flag.String("sftp.sshPrivateKey", "", "path to the SSH private key file for host authentication")

View File

@@ -62,8 +62,16 @@ type S3ApiServerOption struct {
GrpcPort int
ExternalUrl string // external URL clients use, for signature verification behind a reverse proxy
DefaultFileMode uint32 // default file permission mode for S3 uploads (e.g. 0660, 0644)
CacheSizeMB int64 // in-memory chunk cache capacity in MB for the shared ReaderCache; 0 disables
}
// s3ChunkCacheChunkSizeMB is the assumed chunk size (in MiB) used to convert
// CacheSizeMB into the entry count the in-memory cache accepts. This matches
// the default -filer.maxMB for all filer/webdav/mini flag sites. It is NOT a
// hard limit — larger chunks still get cached, this just means the byte budget
// is approximate when upload-side chunking is configured larger.
const s3ChunkCacheChunkSizeMB = 4
type S3ApiServer struct {
s3_pb.UnimplementedSeaweedS3IamCacheServer
option *S3ApiServerOption
@@ -185,13 +193,23 @@ func NewS3ApiServerWithStore(router *mux.Router, option *S3ApiServerOption, expl
}
}()
// Shared ReaderCache for the S3 GET streaming path. The chunk cache is
// nil for now — all TieredChunkCache receiver methods are nil-safe. A
// follow-up adds an in-memory chunk cache on top. Keeping this shared
// Shared ReaderCache for the S3 GET streaming path. Keeping this shared
// (rather than per-request) avoids the per-request Close(), which would
// otherwise wait for background chunk downloads that run on
// context.Background() even after the client disconnects.
//
// The underlying ChunkCache is controlled by option.CacheSizeMB below:
// - CacheSizeMB == 0: a nil *chunk_cache.TieredChunkCache is used (its
// receiver methods are nil-safe). Completed chunks are not deposited
// into a cross-request cache — concurrent readers still share in-flight
// downloads through the ReaderCache's downloaders map, but repeat reads
// refetch from volume servers.
// - CacheSizeMB > 0: a chunk_cache.ChunkCacheInMemory is created and
// wrapped in the ReaderCache, so repeat and concurrent reads hit
// memory. maxEntries is approximated from the byte budget and the
// assumed chunk size (s3ChunkCacheChunkSizeMB), clamped to a small
// floor so tiny caches still function.
//
// Downloader slots: each slot holds one in-flight / recently-completed
// chunk buffer (~4 MiB by default), so this caps both peak memory for
// in-flight chunks (s3ReaderCacheDownloaderLimit × chunkSize) and the
@@ -199,7 +217,30 @@ func NewS3ApiServerWithStore(router *mux.Router, option *S3ApiServerOption, expl
// because it typically has a handful of clients; S3 serves many
// concurrent readers, so we pick a more generous default here.
const s3ReaderCacheDownloaderLimit = 256
readerCache := filer.NewReaderCache(s3ReaderCacheDownloaderLimit, (*chunk_cache.TieredChunkCache)(nil), filerClient.GetLookupFileIdFunction())
// Negative CacheSizeMB is a misconfiguration; fail fast rather than
// silently behaving like 0.
if option.CacheSizeMB < 0 {
return nil, fmt.Errorf("invalid -s3.cacheCapacityMB %d: must be >= 0", option.CacheSizeMB)
}
var chunkCache chunk_cache.ChunkCache
if option.CacheSizeMB > 0 {
// ccache sizes entries by count; convert the configured byte budget
// via the assumed chunk size. Clamp to a floor so tiny caches still
// function.
maxEntries := option.CacheSizeMB / s3ChunkCacheChunkSizeMB
if maxEntries < 8 {
maxEntries = 8
}
chunkCache = chunk_cache.NewChunkCacheInMemory(maxEntries)
// Log the effective capacity after the floor clamp, not the configured
// value — a user passing `-s3.cacheCapacityMB=1` actually gets 8 entries
// ≈ 32 MiB because of the floor.
glog.V(0).Infof("s3 chunk cache enabled: in-memory, ~%dMB (%d chunks of ~%dMB)", maxEntries*s3ChunkCacheChunkSizeMB, maxEntries, s3ChunkCacheChunkSizeMB)
} else {
chunkCache = (*chunk_cache.TieredChunkCache)(nil)
}
readerCache := filer.NewReaderCache(s3ReaderCacheDownloaderLimit, chunkCache, filerClient.GetLookupFileIdFunction())
s3ApiServer = &S3ApiServer{
option: option,