fix(shell): exclude failed EC shard copies from rebuild recoverability gate (#10043)

* fix(shell): correct volume.list -writable filter unit and comparison * fix(shell): correct volume.list -writable filter unit and comparison * chore(shell): fix typo in EC shard helper param names * fix(shell): use exact match for volume.balance -racks/-nodes filter The old strings.Contains-based filter quietly included any id that was a substring of the user-supplied flag value (e.g. -racks=rack10 also matched rack1). Replace it with an exact-match set parsed from the comma-separated flag value, and add regression tests for both -racks and -nodes paths. Also fix a small typo in the "remote storage" error returned by maybeMoveOneVolume. * fix(shell): use exact match for volume.balance -racks/-nodes filter The old strings.Contains-based filter quietly included any id that was a substring of the user-supplied flag value (e.g. -racks=rack10 also matched rack1). Replace it with an exact-match set parsed from the comma-separated flag value, and add regression tests for both -racks and -nodes paths. Also fix a small typo in the "remote storage" error returned by maybeMoveOneVolume. * refactor(shell): drop nil sentinel in splitCSVSet, use len() in callers * fix(shell): exclude failed EC shard copies from rebuild recoverability gate prepareDataToRecover incremented the remote-shard counter before the copy RPC, so in apply mode a failed VolumeEcShardsCopy was still counted toward the DataShardsCount recoverability gate. The gate could then pass with fewer real shards than required, deferring the failure to the deeper generateMissingShards/reconstruct step and reporting an inflated shard count in the "not enough shards" error. Count the remote shard only after a successful copy (apply mode) or when planning (dry-run), and rename wouldCopy to recoverableRemoteShards for clarity. Add a regression test covering an apply-mode copy failure. * fix(shell): clean up copied EC shards when the recoverability gate fails A runtime copy failure can trip the gate after earlier copies already succeeded, stranding those working shards on the rebuilder. Return the copied shard ids on the error path and run the cleanup defer even when recovery fails, so the temp shards get deleted. --------- Co-authored-by: Chris Lu <chris.lu@gmail.com>
2026-06-09 18:32:43 +00:00 · 2026-06-23 02:23:23 +08:00
parent 6f1d4af035
commit 9de9dbaa83
2 changed files with 59 additions and 16 deletions
@@ -9,6 +9,8 @@ import (
 	"github.com/seaweedfs/seaweedfs/weed/storage/erasure_coding"
 	"github.com/seaweedfs/seaweedfs/weed/storage/needle"
 	"github.com/seaweedfs/seaweedfs/weed/storage/types"
+	"google.golang.org/grpc"
+	"google.golang.org/grpc/credentials/insecure"
 )

 // topoWith builds a one-node topology snapshot where regularVids appear as
@@ -191,3 +193,43 @@ func TestPrepareDataToRecover_SelfSourceNotCopied(t *testing.T) {
 		t.Errorf("self-sourced shard 5 should be classified local, got local=%v", local)
 	}
 }
+
+func TestPrepareDataToRecover_ApplyCopyFailureDoesNotCountAsRecoverable(t *testing.T) {
+	var log bytes.Buffer
+	rebuilder := newEcNode("dc1", "rack1", "127.0.0.1:1", 100).
+		addEcVolumeAndShardsForTest(1, "c1", []erasure_coding.ShardId{0, 1, 2, 3, 4, 5, 6, 7, 8})
+	remote := newEcNode("dc1", "rack1", "127.0.0.1:2", 100).
+		addEcVolumeAndShardsForTest(1, "c1", []erasure_coding.ShardId{9})
+	erb := &ecRebuilder{
+		commandEnv: &CommandEnv{
+			env:    make(map[string]string),
+			noLock: true,
+			option: &ShellOptions{
+				GrpcDialOption: grpc.WithTransportCredentials(insecure.NewCredentials()),
+			},
+		},
+		ecNodes:      []*EcNode{rebuilder, remote},
+		writer:       &log,
+		applyChanges: true,
+	}
+
+	locations := make(EcShardLocations, erasure_coding.MaxShardCount)
+	for i := 0; i < 9; i++ {
+		locations[i] = []*EcNode{rebuilder}
+	}
+	locations[9] = []*EcNode{remote}
+
+	copied, local, err := erb.prepareDataToRecover(rebuilder, "c1", needle.VolumeId(1), locations)
+	if err == nil {
+		t.Fatalf("copy failure must leave only 9 usable shards and fail recoverability gate; copied=%v local=%v log:\n%s", copied, local, log.String())
+	}
+	if !strings.Contains(err.Error(), "9 shards are not enough") {
+		t.Fatalf("expected recoverability error to count only successful/local shards, got %v; log:\n%s", err, log.String())
+	}
+	if len(copied) != 0 {
+		t.Fatalf("failed copy must not be recorded as copied/deletable, got %v", copied)
+	}
+	if !strings.Contains(log.String(), "failed to copy 1.9") {
+		t.Fatalf("expected failed copy to be logged, got:\n%s", log.String())
+	}
+}
@@ -265,14 +265,12 @@ func (erb *ecRebuilder) rebuildOneEcVolume(collection string, volumeId needle.Vo
 	// collect shard files to rebuilder local disk
 	var generatedShardIds []erasure_coding.ShardId
 	copiedShardIds, _, err := erb.prepareDataToRecover(rebuilder, collection, volumeId, locations)
-	if err != nil {
-		return err
-	}
 	defer func() {
-		// Clean up the working copies this run actually made. In dry-run
-		// nothing was copied (copiedShardIds is empty), so this issues no
-		// delete RPC. Use a local error so a cleanup failure cannot mask a
-		// successful rebuild's return value.
+		// Clean up the working copies this run actually made, even when the
+		// recoverability gate failed after some copies already succeeded:
+		// they are temp files on the rebuilder nothing else reclaims. Dry-run
+		// copies nothing (copiedShardIds is empty), so this issues no delete
+		// RPC. Use a local error so a cleanup failure cannot mask the return.
 		if !erb.applyChanges || len(copiedShardIds) == 0 {
 			return
 		}
@@ -280,6 +278,9 @@ func (erb *ecRebuilder) rebuildOneEcVolume(collection string, volumeId needle.Vo
 			erb.write("%s delete copied ec shards %s %d.%v: %v\n", rebuilder.info.Id, collection, volumeId, copiedShardIds, derr)
 		}
 	}()
+	if err != nil {
+		return err
+	}

 	if !erb.applyChanges {
 		return nil
@@ -344,11 +345,9 @@ func (erb *ecRebuilder) prepareDataToRecover(rebuilder *EcNode, collection strin
 		}
 	}

-	// wouldCopy counts shards available on a remote holder that the rebuilder
-	// could pull. It drives the recoverability gate below independently of
-	// whether we actually copy (dry-run copies nothing), so a dry-run still
-	// reports the plan instead of erroring "not enough shards".
-	wouldCopy := 0
+	// recoverableRemoteShards counts remote shards that can contribute to the
+	// rebuild. Dry-run counts the plan; apply mode counts only successful copies.
+	recoverableRemoteShards := 0
 	for i := 0; i < targetShardCount; i++ {
 		ecNodes := locations[i]
 		shardId := erasure_coding.ShardId(i)
@@ -372,9 +371,8 @@ func (erb *ecRebuilder) prepareDataToRecover(rebuilder *EcNode, collection strin
 			continue
 		}

-		wouldCopy++
-
 		if !erb.applyChanges {
+			recoverableRemoteShards++
 			erb.write("would copy %d.%d from %s\n", volumeId, shardId, ecNodes[0].info.Id)
 			continue
 		}
@@ -395,6 +393,7 @@ func (erb *ecRebuilder) prepareDataToRecover(rebuilder *EcNode, collection strin
 			erb.write("%s failed to copy %d.%d from %s: %v\n", rebuilder.info.Id, volumeId, shardId, ecNodes[0].info.Id, copyErr)
 			continue
 		}
+		recoverableRemoteShards++
 		if needEcxFile {
 			needEcxFile = false
 		}
@@ -404,11 +403,13 @@ func (erb *ecRebuilder) prepareDataToRecover(rebuilder *EcNode, collection strin
 		copiedShardIds = append(copiedShardIds, shardId)
 	}

-	if len(localShardIds)+wouldCopy >= erasure_coding.DataShardsCount {
+	if len(localShardIds)+recoverableRemoteShards >= erasure_coding.DataShardsCount {
 		return copiedShardIds, localShardIds, nil
 	}

-	return nil, nil, fmt.Errorf("%d shards are not enough to recover volume %d", len(localShardIds)+wouldCopy, volumeId)
+	// Hand back what was copied so the caller deletes these orphaned working
+	// shards: recovery failed, but the temp files are already on the rebuilder.
+	return copiedShardIds, localShardIds, fmt.Errorf("%d shards are not enough to recover volume %d", len(localShardIds)+recoverableRemoteShards, volumeId)

 }