Files
seaweedfs/weed/server/filer_server.go
Chris Lu 85ca3cb757 filer: warm-up + fail-closed cooling for POSIX locks on owner (re)start (#9673)
After a (re)start the owner defers would-be grants for posixLockWarmup
while mounts re-assert, trusting only locally-visible conflicts, so it
does not double-grant from empty state; a deferred grant is a retry for
SetLkw and EAGAIN for non-blocking SetLk, never a spurious grant. Cooling
now fail-closes: if the previous owner is unreachable during a ring
change, defer rather than risk a double-grant. readyAt is atomic so the
handler reads it without locking.
2026-05-25 13:14:05 -07:00

345 lines
13 KiB
Go

package weed_server
import (
"context"
"fmt"
"net/http"
"os"
"strings"
"sync"
"sync/atomic"
"time"
"github.com/seaweedfs/seaweedfs/weed/credential"
"github.com/seaweedfs/seaweedfs/weed/stats"
"golang.org/x/sync/singleflight"
"google.golang.org/grpc"
"github.com/seaweedfs/seaweedfs/weed/util/grace"
"github.com/seaweedfs/seaweedfs/weed/operation"
"github.com/seaweedfs/seaweedfs/weed/pb"
"github.com/seaweedfs/seaweedfs/weed/pb/filer_pb"
"github.com/seaweedfs/seaweedfs/weed/pb/master_pb"
"github.com/seaweedfs/seaweedfs/weed/util"
"github.com/seaweedfs/seaweedfs/weed/filer"
"github.com/seaweedfs/seaweedfs/weed/filer/posixlock"
_ "github.com/seaweedfs/seaweedfs/weed/filer/arangodb"
_ "github.com/seaweedfs/seaweedfs/weed/filer/cassandra"
_ "github.com/seaweedfs/seaweedfs/weed/filer/cassandra2"
_ "github.com/seaweedfs/seaweedfs/weed/filer/elastic/v7"
_ "github.com/seaweedfs/seaweedfs/weed/filer/etcd"
_ "github.com/seaweedfs/seaweedfs/weed/filer/foundationdb"
_ "github.com/seaweedfs/seaweedfs/weed/filer/hbase"
_ "github.com/seaweedfs/seaweedfs/weed/filer/leveldb"
_ "github.com/seaweedfs/seaweedfs/weed/filer/leveldb2"
_ "github.com/seaweedfs/seaweedfs/weed/filer/leveldb3"
_ "github.com/seaweedfs/seaweedfs/weed/filer/mongodb"
_ "github.com/seaweedfs/seaweedfs/weed/filer/mysql"
_ "github.com/seaweedfs/seaweedfs/weed/filer/mysql2"
_ "github.com/seaweedfs/seaweedfs/weed/filer/postgres"
_ "github.com/seaweedfs/seaweedfs/weed/filer/postgres2"
_ "github.com/seaweedfs/seaweedfs/weed/filer/redis"
_ "github.com/seaweedfs/seaweedfs/weed/filer/redis2"
_ "github.com/seaweedfs/seaweedfs/weed/filer/redis3"
_ "github.com/seaweedfs/seaweedfs/weed/filer/sqlite"
_ "github.com/seaweedfs/seaweedfs/weed/filer/tarantool"
_ "github.com/seaweedfs/seaweedfs/weed/filer/ydb"
"github.com/seaweedfs/seaweedfs/weed/glog"
"github.com/seaweedfs/seaweedfs/weed/notification"
_ "github.com/seaweedfs/seaweedfs/weed/notification/aws_sqs"
_ "github.com/seaweedfs/seaweedfs/weed/notification/gocdk_pub_sub"
_ "github.com/seaweedfs/seaweedfs/weed/notification/google_pub_sub"
_ "github.com/seaweedfs/seaweedfs/weed/notification/kafka"
_ "github.com/seaweedfs/seaweedfs/weed/notification/log"
_ "github.com/seaweedfs/seaweedfs/weed/notification/webhook"
"github.com/seaweedfs/seaweedfs/weed/security"
)
type FilerOption struct {
Masters *pb.ServerDiscovery
FilerGroup string
Collection string
DefaultReplication string
DisableDirListing bool
MaxMB int
DirListingLimit int
DataCenter string
Rack string
DataNode string
DefaultLevelDbDir string
DisableHttp bool
Host pb.ServerAddress
recursiveDelete bool
Cipher bool
SaveToFilerLimit int64
ConcurrentUploadLimit int64
ConcurrentFileUploadLimit int64
ShowUIDirectoryDelete bool
DownloadMaxBytesPs int64
DiskType string
AllowedOrigins []string
ExposeDirectoryData bool
TusBasePath string
S3ConfigFile string // optional path to static S3 identity config file
CredentialManager *credential.CredentialManager
}
type FilerServer struct {
inFlightDataSize int64
inFlightUploads int64
listenersWaits int64
// notifying clients
listenersLock sync.Mutex
listenersCond *sync.Cond
inFlightDataLimitCond *sync.Cond
filer_pb.UnimplementedSeaweedFilerServer
option *FilerOption
filer *filer.Filer
filerGuard *security.Guard
volumeGuard *security.Guard
grpcDialOption grpc.DialOption
// metrics read from the master
metricsAddress string
metricsIntervalSec int
// track known metadata listeners
knownListenersLock sync.Mutex
knownListeners map[int32]int32
// deduplicates concurrent remote object caching operations
remoteCacheGroup singleflight.Group
recentCopyRequestsMu sync.Mutex
recentCopyRequests map[string]recentCopyRequest
// credential manager for IAM operations
CredentialManager *credential.CredentialManager
// mountPeerRegistry backs the MountRegister / MountList RPCs for peer
// chunk sharing (tier 1). Always populated.
mountPeerRegistry *filer.MountPeerRegistry
// entryLockTable serializes mutations to the same entry path on this filer.
// CreateEntry takes it today; UpdateEntry and DeleteEntry are intended to take
// it too as their callers route a key's writes to this node, making it the
// local serialization point for read-modify-write operations that replaces
// the distributed lock for that key. Idle keys are evicted automatically, so
// the table stays bounded.
entryLockTable *util.LockTable[util.FullPath]
// posixLocks is the in-memory authority for cross-mount POSIX advisory locks
// on inodes this filer owns (per the route-by-key ring). Lock state is kept
// here rather than in replicated metadata: it is transient coordination, so
// keeping it off the meta-log avoids churn.
posixLocks *posixlock.Manager
// posixLockSweeperStop stops the lease-reaping sweeper goroutine on Shutdown.
posixLockSweeperStop chan struct{}
// posixLockReadyAt is the unix-nanos when this filer began serving POSIX
// locks. For posixLockWarmup after it, the owner defers would-be grants while
// mounts re-assert, so a (re)started owner does not double-grant from empty
// state. Atomic so the handler reads it without locking; 0 means "not warming
// up" (e.g. in tests).
posixLockReadyAt atomic.Int64
}
func NewFilerServer(defaultMux, readonlyMux *http.ServeMux, option *FilerOption) (fs *FilerServer, err error) {
v := util.GetViper()
signingKey := v.GetString("jwt.filer_signing.key")
v.SetDefault("jwt.filer_signing.expires_after_seconds", 10)
expiresAfterSec := v.GetInt("jwt.filer_signing.expires_after_seconds")
readSigningKey := v.GetString("jwt.filer_signing.read.key")
v.SetDefault("jwt.filer_signing.read.expires_after_seconds", 60)
readExpiresAfterSec := v.GetInt("jwt.filer_signing.read.expires_after_seconds")
volumeSigningKey := v.GetString("jwt.signing.key")
v.SetDefault("jwt.signing.expires_after_seconds", 10)
volumeExpiresAfterSec := v.GetInt("jwt.signing.expires_after_seconds")
volumeReadSigningKey := v.GetString("jwt.signing.read.key")
v.SetDefault("jwt.signing.read.expires_after_seconds", 60)
volumeReadExpiresAfterSec := v.GetInt("jwt.signing.read.expires_after_seconds")
v.SetDefault("cors.allowed_origins.values", "*")
allowedOrigins := v.GetString("cors.allowed_origins.values")
domains := strings.Split(allowedOrigins, ",")
option.AllowedOrigins = domains
v.SetDefault("filer.expose_directory_metadata.enabled", true)
returnDirMetadata := v.GetBool("filer.expose_directory_metadata.enabled")
option.ExposeDirectoryData = returnDirMetadata
fs = &FilerServer{
option: option,
grpcDialOption: security.LoadClientTLS(util.GetViper(), "grpc.filer"),
knownListeners: make(map[int32]int32),
inFlightDataLimitCond: sync.NewCond(new(sync.Mutex)),
recentCopyRequests: make(map[string]recentCopyRequest),
CredentialManager: option.CredentialManager,
entryLockTable: util.NewLockTable[util.FullPath](),
posixLocks: posixlock.NewManager(),
}
fs.startPosixLockSweeper()
fs.mountPeerRegistry = filer.NewMountPeerRegistry()
go fs.runMountPeerRegistrySweeper()
fs.listenersCond = sync.NewCond(&fs.listenersLock)
option.Masters.RefreshBySrvIfAvailable()
if len(option.Masters.GetInstances()) == 0 {
glog.Fatal("master list is required!")
}
if !util.LoadConfiguration("filer", false) {
v.SetDefault("leveldb2.enabled", true)
v.SetDefault("leveldb2.dir", option.DefaultLevelDbDir)
_, err := os.Stat(option.DefaultLevelDbDir)
if os.IsNotExist(err) {
os.MkdirAll(option.DefaultLevelDbDir, 0755)
}
glog.V(0).Infof("default to create filer store dir in %s", option.DefaultLevelDbDir)
} else {
glog.Warningf("skipping default store dir in %s", option.DefaultLevelDbDir)
}
util.LoadConfiguration("notification", false)
v.SetDefault("filer.options.max_file_name_length", 255)
maxFilenameLength := v.GetUint32("filer.options.max_file_name_length")
glog.V(0).Infof("max_file_name_length %d", maxFilenameLength)
fs.filer = filer.NewFiler(*option.Masters, fs.grpcDialOption, option.Host, option.FilerGroup, option.Collection, option.DefaultReplication, option.DataCenter, maxFilenameLength, func() {
if atomic.LoadInt64(&fs.listenersWaits) > 0 {
fs.listenersCond.Broadcast()
}
})
fs.filer.Cipher = option.Cipher
// we do not support IP whitelist right now https://github.com/seaweedfs/seaweedfs/issues/7094
if v.GetString("guard.white_list") != "" {
glog.Warningf("filer: guard.white_list is configured but the IP whitelist feature is currently disabled. See https://github.com/seaweedfs/seaweedfs/issues/7094")
}
fs.filerGuard = security.NewGuard([]string{}, signingKey, expiresAfterSec, readSigningKey, readExpiresAfterSec)
fs.volumeGuard = security.NewGuard([]string{}, volumeSigningKey, volumeExpiresAfterSec, volumeReadSigningKey, volumeReadExpiresAfterSec)
fs.checkWithMaster()
go stats.LoopPushingMetric("filer", string(fs.option.Host), fs.metricsAddress, fs.metricsIntervalSec)
go fs.filer.MasterClient.KeepConnectedToMaster(context.Background())
fs.option.recursiveDelete = v.GetBool("filer.options.recursive_delete")
v.SetDefault("filer.options.buckets_folder", "/buckets")
fs.filer.DirBucketsPath = v.GetString("filer.options.buckets_folder")
// TODO deprecated, will be removed after 2020-12-31
// replaced by https://github.com/seaweedfs/seaweedfs/wiki/Path-Specific-Configuration
// fs.filer.FsyncBuckets = v.GetStringSlice("filer.options.buckets_fsync")
isFresh := fs.filer.LoadConfiguration(v)
notification.LoadConfiguration(v, "notification.")
handleStaticResources(defaultMux)
if !option.DisableHttp {
defaultMux.HandleFunc("/healthz", requestIDMiddleware(fs.filerHealthzHandler))
// TUS resumable upload protocol handler
if option.TusBasePath != "" {
// Normalize TusPath to always have a leading slash and no trailing slash
if !strings.HasPrefix(option.TusBasePath, "/") {
option.TusBasePath = "/" + option.TusBasePath
}
option.TusBasePath = strings.TrimRight(option.TusBasePath, "/")
// Disallow using "/" as TUS base to avoid hijacking all filer routes
if option.TusBasePath == "" {
glog.Warningf("Invalid TUS base path; TUS disabled (must not be root '/')")
} else {
handlePath := option.TusBasePath + "/"
defaultMux.HandleFunc(handlePath, fs.filerGuard.WhiteList(requestIDMiddleware(fs.tusHandler)))
// Start background cleanup of expired TUS sessions (every hour)
fs.StartTusSessionCleanup(1 * time.Hour)
}
}
defaultMux.HandleFunc("/", fs.filerGuard.WhiteList(requestIDMiddleware(fs.filerHandler)))
}
if defaultMux != readonlyMux {
handleStaticResources(readonlyMux)
readonlyMux.HandleFunc("/healthz", requestIDMiddleware(fs.filerHealthzHandler))
readonlyMux.HandleFunc("/", fs.filerGuard.WhiteList(requestIDMiddleware(fs.readonlyFilerHandler)))
}
existingNodes := fs.filer.ListExistingPeerUpdates(context.Background())
startFromTime := time.Now().Add(-filer.LogFlushInterval)
if isFresh {
glog.V(0).Infof("%s bootstrap from peers %+v", option.Host, existingNodes)
if err := fs.filer.MaybeBootstrapFromOnePeer(option.Host, existingNodes, startFromTime); err != nil {
glog.Fatalf("%s bootstrap from %+v: %v", option.Host, existingNodes, err)
}
}
v.SetDefault("filer.options.s3.empty_folder_cleanup_delay", "2m")
if d, err := time.ParseDuration(v.GetString("filer.options.s3.empty_folder_cleanup_delay")); err == nil {
fs.filer.EmptyFolderCleanupDelay = d
}
fs.filer.AggregateFromPeers(option.Host, existingNodes, startFromTime)
fs.filer.LoadFilerConf()
fs.filer.LoadRemoteStorageConfAndMapping()
grace.OnReload(fs.Reload)
fs.SetupDlmReplication()
fs.filer.Dlm.LockRing.SetTakeSnapshotCallback(fs.OnDlmChangeSnapshot)
if fs.CredentialManager != nil {
fs.CredentialManager.SetFilerAddressFunc(func() pb.ServerAddress {
return fs.option.Host
}, fs.grpcDialOption)
fs.CredentialManager.SetMasterClient(fs.filer.MasterClient, fs.grpcDialOption)
}
return fs, nil
}
func (fs *FilerServer) checkWithMaster() {
isConnected := false
for !isConnected {
fs.option.Masters.RefreshBySrvIfAvailable()
for _, master := range fs.option.Masters.GetInstances() {
readErr := operation.WithMasterServerClient(false, master, fs.grpcDialOption, func(masterClient master_pb.SeaweedClient) error {
resp, err := masterClient.GetMasterConfiguration(context.Background(), &master_pb.GetMasterConfigurationRequest{})
if err != nil {
return fmt.Errorf("get master %s configuration: %v", master, err)
}
fs.metricsAddress, fs.metricsIntervalSec = resp.MetricsAddress, int(resp.MetricsIntervalSeconds)
return nil
})
if readErr == nil {
isConnected = true
} else {
time.Sleep(7 * time.Second)
}
}
}
}
// Shutdown gracefully shuts down the filer server by waiting for in-flight uploads to complete.
// This prevents data corruption when the process receives SIGTERM during active uploads.
func (fs *FilerServer) Shutdown() {
glog.V(0).Infof("Shutting down filer")
if fs.posixLockSweeperStop != nil {
close(fs.posixLockSweeperStop)
}
fs.filer.Shutdown()
}
func (fs *FilerServer) Reload() {
glog.V(0).Infoln("Reload filer server...")
util.LoadConfiguration("security", false)
}