mirror of
https://github.com/seaweedfs/seaweedfs.git
synced 2026-05-14 05:41:29 +00:00
* fix(mount): retry saveEntry on transient filer errors, stop mismapping Canceled to EIO

When the mount's gRPC connection to the filer flaps (e.g. a transient restart or network blip), every in-flight setattr/utimes/chmod/xattr/rename-driven saveEntry returns "code = Canceled desc = grpc: the client connection is closing" at the same instant. Two bugs in saveEntry then turned each of those into a hard EIO for the user:

1. The error was wrapped with fmt.Errorf(... %v ...) before being passed to grpcErrorToFuseStatus. %v stringifies the status, so status.FromError could no longer unwrap the gRPC code and the Canceled→ETIMEDOUT branch in the classifier never fired; every Canceled error fell through to the default EIO.

2. saveEntry issued a single streamUpdateEntry call with no retry, unlike doFlush which already wraps its CreateEntry in retryMetadataFlush. One stream flap therefore propagated straight to the FUSE caller instead of being ridden out across the 4-attempt / ~7s backoff window.

Wrap the UpdateEntry call in retryMetadataFlush (matching doFlush and completeAsyncFlush) and switch the wrap verb to %w so the classifier can still see the gRPC code. This recovers transient closes silently and, if retries are exhausted, returns ETIMEDOUT instead of EIO.

Reported by rclone users in #9139 where a large concurrent copy (hundreds of .partial uploads per filer flap) surfaced as walls of EIOs because each .partial rename's post-setattr hit saveEntry at the worst possible moment.

* mount: skip saveEntry retries on permanent filer errors

Address gemini-code-assist review on #9141: blindly retrying every UpdateEntry failure with exponential backoff means interactive FUSE ops like chmod/utimes/xattr can hang for ~7s before surfacing clearly permanent errors (NotFound, PermissionDenied, InvalidArgument, etc.).
Introduce retryMetadataFlushIf, a variant of retryMetadataFlush that accepts a shouldRetry predicate, and an isRetryableFilerError classifier that short-circuits on a conservative whitelist of terminal gRPC codes. Transient errors (Canceled / Unavailable / DeadlineExceeded / ResourceExhausted / Internal) and non-gRPC errors still retry, so the original fix for #9139 (rclone EIO burst during filer connection flaps) is preserved.
75 lines
2.0 KiB
Go
75 lines
2.0 KiB
Go
package mount
|
|
|
|
import (
|
|
"strings"
|
|
"syscall"
|
|
|
|
"github.com/seaweedfs/go-fuse/v2/fuse"
|
|
"google.golang.org/grpc/codes"
|
|
"google.golang.org/grpc/status"
|
|
)
|
|
|
|
func grpcErrorToFuseStatus(err error) fuse.Status {
|
|
if err == nil {
|
|
return fuse.OK
|
|
}
|
|
|
|
// Unpack error for inspection
|
|
if s, ok := status.FromError(err); ok {
|
|
switch s.Code() {
|
|
case codes.OK:
|
|
return fuse.OK
|
|
case codes.Canceled, codes.DeadlineExceeded:
|
|
return fuse.Status(syscall.ETIMEDOUT)
|
|
case codes.Unavailable:
|
|
return fuse.Status(syscall.EAGAIN)
|
|
case codes.ResourceExhausted:
|
|
return fuse.Status(syscall.EAGAIN) // Or syscall.ENOSPC
|
|
case codes.PermissionDenied:
|
|
return fuse.Status(syscall.EACCES)
|
|
case codes.Unauthenticated:
|
|
return fuse.Status(syscall.EPERM)
|
|
case codes.NotFound:
|
|
return fuse.ENOENT
|
|
case codes.AlreadyExists:
|
|
return fuse.Status(syscall.EEXIST)
|
|
case codes.InvalidArgument:
|
|
return fuse.EINVAL
|
|
}
|
|
}
|
|
|
|
// String matching for errors that don't have proper gRPC codes but are known
|
|
errStr := err.Error()
|
|
if strings.Contains(errStr, "transport") {
|
|
return fuse.Status(syscall.EAGAIN)
|
|
}
|
|
// Add other string matches if necessary
|
|
|
|
return fuse.EIO
|
|
}
|
|
|
|
// isRetryableFilerError reports whether a filer RPC error looks transient
|
|
// enough to retry. It takes a conservative whitelist approach: only errors
|
|
// that clearly describe a permanent application-level failure
|
|
// (NotFound/AlreadyExists/InvalidArgument/PermissionDenied/Unauthenticated/
|
|
// FailedPrecondition) short-circuit the retry loop. Everything else —
|
|
// transport errors, Canceled/Unavailable/ResourceExhausted, or errors with no
|
|
// gRPC status — is treated as potentially transient and retried.
|
|
func isRetryableFilerError(err error) bool {
|
|
if err == nil {
|
|
return false
|
|
}
|
|
if s, ok := status.FromError(err); ok {
|
|
switch s.Code() {
|
|
case codes.NotFound,
|
|
codes.AlreadyExists,
|
|
codes.InvalidArgument,
|
|
codes.PermissionDenied,
|
|
codes.Unauthenticated,
|
|
codes.FailedPrecondition:
|
|
return false
|
|
}
|
|
}
|
|
return true
|
|
}
|