Files
seaweedfs/weed/storage/erasure_coding/verification_test.go
Chris Lu 3a8389cd68 fix(ec): verify full shard set before deleting source volume (#9490) (#9493)
* fix(ec): verify full shard set before deleting source volume (#9490)

Before this change, both the worker EC task and the shell ec.encode
command would delete the source .dat as soon as MountEcShards returned —
even if distribute/mount failed partway, leaving fewer than 14 shards
in the cluster. The deletion was logged at V(2), so by the time someone
noticed missing data the only trace was a 0-byte .dat synthesized by
disk_location at next restart.

- Worker path adds Step 6: poll VolumeEcShardsInfo on every destination,
  union the bitmaps, and refuse to call deleteOriginalVolume unless all
  TotalShardsCount distinct shard ids are observed. A failed gate leaves
  the source readonly so the next detection scan can retry.
- Shell ec.encode adds the same gate after EcBalance, walking the master
  topology with collectEcNodeShardsInfo.
- VolumeDelete RPC success and .dat/.idx unlinks now log at V(0) so any
  source destruction is traceable in default-verbosity production logs.

The EC-balance-vs-in-flight-encode race is intentionally left for a
follow-up; balance should refuse to move shards for a volume whose
encode job is not in Completed state.

* fix(ec): trim doc comments on the new shard-verification path

Drop WHAT-describing godoc on freshly added helpers; keep only the WHY
notes (query-error policy in VerifyShardsAcrossServers, the #9490
reference at the call sites).

* fix(ec): drop issue-number anchors from new comments

Issue references age poorly — the why behind each comment already
stands on its own.

* fix(ec): parametrize RequireFullShardSet on totalShards

Take totalShards as an argument instead of reading the package-level
TotalShardsCount constant. The OSS callers continue to pass 14, but the
helper is now usable with any DataShards+ParityShards ratio.

* test(plugin_workers): make fake volume server respond to VolumeEcShardsInfo

The new pre-delete verification gate calls VolumeEcShardsInfo on every
destination after mount, and the fake server's UnimplementedVolumeServer
returns Unimplemented — the verifier read that as zero shards on every
node and aborted source deletion. Build the response from recorded
mount requests so the integration test exercises the gate end-to-end.

* fix(rust/volume): log .dat/.idx unlink with size in remove_volume_files

Mirror the Go-side change in weed/storage/volume_write.go: stat each
file before removing and emit an info-level log for .dat/.idx so a
destructive call is always traceable. The OSS Rust crate previously
unlinked them silently.

* fix(ec/decode): verify regenerated .dat before deleting EC shards

After mountDecodedVolume succeeds, the previous code immediately
unmounts and deletes every EC shard. A silent failure in generate or
mount could leave the cluster with neither shards nor a valid normal
volume. Probe ReadVolumeFileStatus on the target and refuse to proceed
if dat or idx is 0 bytes.

Also make the fake volume server's VolumeEcShardsInfo reflect whichever
shard files exist on disk (seeded for tests as well as mounted via
RPC), so the new gate can be exercised end-to-end.

* fix(ec): address PR review nits in verification + fake server

- Drop unused ServerShardInventory.Sizes field.
- Skip shard ids >= MaxShardCount before bitmap Set so the ShardBits
  bound is explicit (Set already no-ops on overflow, this is for
  clarity).
- Nil-guard the fake server's VolumeEcShardsInfo so a malformed call
  doesn't panic the test process.
2026-05-13 19:29:24 -07:00

110 lines
3.1 KiB
Go

package erasure_coding
import (
"strings"
"testing"
)
// TestRequireFullShardSet_AllPresent sets every shard id in
// [0, TotalShardsCount) and expects RequireFullShardSet to return nil.
func TestRequireFullShardSet_AllPresent(t *testing.T) {
	full := ShardBits(0)
	for shard := 0; shard < TotalShardsCount; shard++ {
		full = full.Set(ShardId(shard))
	}

	err := RequireFullShardSet(42, full, TotalShardsCount)
	if err != nil {
		t.Fatalf("unexpected error for full set: %v", err)
	}
}
// TestRequireFullShardSet_ReportsMissingIds leaves shard ids 3 and 7 unset and
// expects an error whose text contains the volume id, the missing ids, and
// the present/total count.
func TestRequireFullShardSet_ReportsMissingIds(t *testing.T) {
	present := ShardBits(0)
	for shard := 0; shard < TotalShardsCount; shard++ {
		switch shard {
		case 3, 7:
			// Deliberately left out so the gate has something to report.
		default:
			present = present.Set(ShardId(shard))
		}
	}

	err := RequireFullShardSet(42, present, TotalShardsCount)
	if err == nil {
		t.Fatal("expected error for incomplete set, got nil")
	}

	text := err.Error()
	// Each entry pairs a fragment the message must contain with the failure
	// format reported when that fragment is absent.
	expectations := []struct {
		fragment string
		failFmt  string
	}{
		{"volume 42", "error should name the volume id: %s"},
		{"[3 7]", "error should list missing ids 3 and 7: %s"},
		{"12/14", "error should report 12/14 shards present: %s"},
	}
	for _, want := range expectations {
		if !strings.Contains(text, want.fragment) {
			t.Errorf(want.failFmt, text)
		}
	}
}
// TestRequireFullShardSet_EmptyBitmap passes a zero bitmap and expects an
// error whose text contains "0/14".
func TestRequireFullShardSet_EmptyBitmap(t *testing.T) {
	err := RequireFullShardSet(1, 0, TotalShardsCount)
	if err == nil {
		t.Fatal("expected error for empty bitmap")
	}

	if text := err.Error(); !strings.Contains(text, "0/14") {
		t.Errorf("error should report 0/14 shards: %s", text)
	}
}
// TestRequireFullShardSet_CustomRatio exercises a total of 9 shards (6+3):
// first with all nine ids set, then with shard 5 cleared.
func TestRequireFullShardSet_CustomRatio(t *testing.T) {
	const totalShards = 9 // 6+3 ratio

	// All nine shards present: no error expected.
	shards := ShardBits(0)
	for shard := 0; shard < totalShards; shard++ {
		shards = shards.Set(ShardId(shard))
	}
	if err := RequireFullShardSet(7, shards, totalShards); err != nil {
		t.Fatalf("unexpected error for full 6+3 set: %v", err)
	}

	// Same ratio with shard 5 removed: the error must report count and id.
	shards = shards.Clear(5)
	err := RequireFullShardSet(7, shards, totalShards)
	if err == nil {
		t.Fatal("expected error when shard 5 is missing in 6+3 ratio")
	}
	text := err.Error()
	if !strings.Contains(text, "8/9") {
		t.Errorf("error should report 8/9: %s", text)
	}
	if !strings.Contains(text, "[5]") {
		t.Errorf("error should list missing id 5: %s", text)
	}
}
// TestRequireFullShardSet_RejectsInvalidTotal expects RequireFullShardSet to
// return an error when totalShards is 0 and when it is MaxShardCount+1.
func TestRequireFullShardSet_RejectsInvalidTotal(t *testing.T) {
	if err := RequireFullShardSet(1, 0, 0); err == nil {
		t.Error("expected error for totalShards=0")
	}
	// The message has no format verbs, so t.Error is used to match the check above.
	if err := RequireFullShardSet(1, 0, MaxShardCount+1); err == nil {
		t.Error("expected error for totalShards > MaxShardCount")
	}
}
// TestSummarizeShardInventory_Deterministic builds a two-server inventory and
// expects the summary to list 10.0.0.1 before 10.0.0.2, each followed by its
// shard ids.
func TestSummarizeShardInventory_Deterministic(t *testing.T) {
	const want = "10.0.0.1:8080=[0 1 2 3] 10.0.0.2:8080=[4 5 6]"

	inventory := make(map[string]ServerShardInventory, 2)
	inventory["10.0.0.2:8080"] = ServerShardInventory{Bits: ShardBits(0).Set(4).Set(5).Set(6)}
	inventory["10.0.0.1:8080"] = ServerShardInventory{Bits: ShardBits(0).Set(0).Set(1).Set(2).Set(3)}

	if got := SummarizeShardInventory(inventory); got != want {
		t.Errorf("summary mismatch\n got: %q\n want: %q", got, want)
	}
}
// TestSummarizeShardInventory_IncludesError gives one server a QueryError and
// expects the summary to contain "ERR:" followed by the error text.
func TestSummarizeShardInventory_IncludesError(t *testing.T) {
	failing := ServerShardInventory{
		Bits:       ShardBits(0).Set(0).Set(1),
		QueryError: errStr("dial timeout"),
	}

	summary := SummarizeShardInventory(map[string]ServerShardInventory{
		"10.0.0.1:8080": failing,
	})
	if strings.Contains(summary, "ERR:dial timeout") {
		return
	}
	t.Errorf("expected error tag in summary, got %q", summary)
}
// errStr is a string-backed error value; the tests in this file use it to
// fill ServerShardInventory.QueryError.
type errStr string

// Error returns the underlying string, so errStr satisfies the error interface.
func (msg errStr) Error() string {
	return string(msg)
}