mirror of
https://github.com/seaweedfs/seaweedfs.git
synced 2026-06-09 18:32:43 +00:00
fix(ec): fail the pre-encode sweep for any reachable node that can't ack teardown
A reachable pre-upgrade volume server ignores the full_teardown flag and returns success without wiping its orphaned (stale) EC shards, which a later shard copy then folds into the new generation. Treat a missing full_teardown_done ack as fatal for every reachable node, not just for topology-reported node/volume pairs; stay best-effort only for a node that is unreachable over gRPC.
This commit is contained in:
@@ -2,7 +2,6 @@ package shell
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"regexp"
|
||||
"slices"
|
||||
@@ -537,13 +536,6 @@ func sourceServerDeleteEcShards(grpcDialOption grpc.DialOption, collection strin
|
||||
|
||||
}
|
||||
|
||||
// errFullTeardownNotAcked marks a VolumeEcShardsDelete whose server did not
|
||||
// echo full_teardown_done -- a pre-upgrade volume server silently ignoring the
|
||||
// flag. The pre-encode sweep treats it as fatal only for a reported (mounted)
|
||||
// leftover; for an unreported node it is best-effort, since that node likely had
|
||||
// nothing to wipe and an old server cannot acknowledge either way.
|
||||
var errFullTeardownNotAcked = errors.New("volume server did not perform full teardown")
|
||||
|
||||
// unmountAndDeleteEcShardsQuiet unmounts then deletes shards on one server in a
|
||||
// single connection, without the per-call logging the interactive helpers emit.
|
||||
// Used by the orphan sweep, which fans out to every node x volume and would
|
||||
@@ -567,7 +559,7 @@ func unmountAndDeleteEcShardsQuiet(grpcDialOption grpc.DialOption, collection st
|
||||
return fmt.Errorf("delete: %w", err)
|
||||
}
|
||||
if !resp.GetFullTeardownDone() {
|
||||
return fmt.Errorf("delete on %s: %w", location, errFullTeardownNotAcked)
|
||||
return fmt.Errorf("delete on %s did not perform full teardown (pre-upgrade volume server?); a stale EC generation may remain", location)
|
||||
}
|
||||
return nil
|
||||
})
|
||||
|
||||
@@ -2,7 +2,6 @@ package shell
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"flag"
|
||||
"fmt"
|
||||
"io"
|
||||
@@ -406,11 +405,12 @@ func clearPreexistingEcShards(commandEnv *CommandEnv, topologyInfo *master_pb.To
|
||||
ewg.Add(func() error {
|
||||
if err := unmountAndDeleteEcShardsQuiet(commandEnv.option.GrpcDialOption, collection, vid, addr, allShardIds); err != nil {
|
||||
// Surface a reachable node whose delete genuinely failed (its orphan would
|
||||
// be re-stamped by a later copy installing the new .vif). Stay best-effort
|
||||
// for an unreachable node (it cannot receive this new generation) and for
|
||||
// a pre-upgrade node that did not ack full_teardown on an UNREPORTED pair
|
||||
// (it likely had nothing to wipe); a reported leftover stays fatal.
|
||||
if fatal || (!isNodeUnreachable(err) && !errors.Is(err, errFullTeardownNotAcked)) {
|
||||
// be re-stamped by a later copy installing the new .vif). A missing
|
||||
// full_teardown ack from a reachable pre-upgrade node is fatal too: it may
|
||||
// still hold an orphan a later copy would re-stamp into the new generation.
|
||||
// Stay best-effort only for an unreachable node, which cannot receive this
|
||||
// new generation at all.
|
||||
if fatal || !isNodeUnreachable(err) {
|
||||
return fmt.Errorf("clear stale ec shards for volume %d on %s: %w", vid, addr, err)
|
||||
}
|
||||
glog.V(1).Infof("orphan sweep: volume %d on %s skipped: %v", vid, addr, err)
|
||||
|
||||
Reference in New Issue
Block a user