diff --git a/weed/shell/command_ec_common.go b/weed/shell/command_ec_common.go index c0c709365..cd97f814d 100644 --- a/weed/shell/command_ec_common.go +++ b/weed/shell/command_ec_common.go @@ -2,7 +2,6 @@ package shell import ( "context" - "errors" "fmt" "regexp" "slices" @@ -537,13 +536,6 @@ func sourceServerDeleteEcShards(grpcDialOption grpc.DialOption, collection strin } -// errFullTeardownNotAcked marks a VolumeEcShardsDelete whose server did not -// echo full_teardown_done -- a pre-upgrade volume server silently ignoring the -// flag. The pre-encode sweep treats it as fatal only for a reported (mounted) -// leftover; for an unreported node it is best-effort, since that node likely had -// nothing to wipe and an old server cannot acknowledge either way. -var errFullTeardownNotAcked = errors.New("volume server did not perform full teardown") - // unmountAndDeleteEcShardsQuiet unmounts then deletes shards on one server in a // single connection, without the per-call logging the interactive helpers emit. // Used by the orphan sweep, which fans out to every node x volume and would @@ -567,7 +559,7 @@ func unmountAndDeleteEcShardsQuiet(grpcDialOption grpc.DialOption, collection st return fmt.Errorf("delete: %w", err) } if !resp.GetFullTeardownDone() { - return fmt.Errorf("delete on %s: %w", location, errFullTeardownNotAcked) + return fmt.Errorf("delete on %s did not perform full teardown (pre-upgrade volume server?); a stale EC generation may remain", location) } return nil }) diff --git a/weed/shell/command_ec_encode.go b/weed/shell/command_ec_encode.go index f5918adee..b106b2868 100644 --- a/weed/shell/command_ec_encode.go +++ b/weed/shell/command_ec_encode.go @@ -2,7 +2,6 @@ package shell import ( "context" - "errors" "flag" "fmt" "io" @@ -406,11 +405,12 @@ func clearPreexistingEcShards(commandEnv *CommandEnv, topologyInfo *master_pb.To ewg.Add(func() error { if err := unmountAndDeleteEcShardsQuiet(commandEnv.option.GrpcDialOption, collection, vid, addr, allShardIds); err != nil { // Surface a reachable node whose delete genuinely failed (its orphan would - // be re-stamped by a later copy installing the new .vif). Stay best-effort - // for an unreachable node (it cannot receive this new generation) and for - // a pre-upgrade node that did not ack full_teardown on an UNREPORTED pair - // (it likely had nothing to wipe); a reported leftover stays fatal. - if fatal || (!isNodeUnreachable(err) && !errors.Is(err, errFullTeardownNotAcked)) { + // be re-stamped by a later copy installing the new .vif). A missing + // full_teardown ack from a reachable pre-upgrade node is fatal too: it may + // still hold an orphan a later copy would re-stamp into the new generation. + // Stay best-effort only for an unreachable node, which cannot receive this + // new generation at all. + if fatal || !isNodeUnreachable(err) { return fmt.Errorf("clear stale ec shards for volume %d on %s: %w", vid, addr, err) } glog.V(1).Infof("orphan sweep: volume %d on %s skipped: %v", vid, addr, err)