From 9de9dbaa835bb10a4fd01bed8b5ccd70bc08ad1c Mon Sep 17 00:00:00 2001
From: qzhello <951012707@qq.com>
Date: Tue, 23 Jun 2026 02:23:23 +0800
Subject: [PATCH] fix(shell): exclude failed EC shard copies from rebuild
 recoverability gate (#10043)

* fix(shell): correct volume.list -writable filter unit and comparison

* fix(shell): correct volume.list -writable filter unit and comparison

* chore(shell): fix typo in EC shard helper param names

* fix(shell): use exact match for volume.balance -racks/-nodes filter

The old strings.Contains-based filter quietly included any id that was a
  substring of the user-supplied flag value (e.g. -racks=rack10 also matched
  rack1). Replace it with an exact-match set parsed from the comma-separated
  flag value, and add regression tests for both -racks and -nodes paths.

  Also fix a small typo in the "remote storage" error returned by
  maybeMoveOneVolume.

* fix(shell): use exact match for volume.balance -racks/-nodes filter

The old strings.Contains-based filter quietly included any id that was a
  substring of the user-supplied flag value (e.g. -racks=rack10 also matched
  rack1). Replace it with an exact-match set parsed from the comma-separated
  flag value, and add regression tests for both -racks and -nodes paths.

  Also fix a small typo in the "remote storage" error returned by
  maybeMoveOneVolume.

* refactor(shell): drop nil sentinel in splitCSVSet, use len() in callers

* fix(shell): exclude failed EC shard copies from rebuild recoverability gate

prepareDataToRecover incremented the remote-shard counter before the copy
RPC, so in apply mode a failed VolumeEcShardsCopy was still counted toward
the DataShardsCount recoverability gate. The gate could then pass with
fewer real shards than required, deferring the failure to the deeper
generateMissingShards/reconstruct step and reporting an inflated shard
count in the "not enough shards" error.

Count the remote shard only after a successful copy (apply mode) or when
planning (dry-run), and rename wouldCopy to recoverableRemoteShards for
clarity. Add a regression test covering an apply-mode copy failure.

* fix(shell): clean up copied EC shards when the recoverability gate fails

A runtime copy failure can trip the gate after earlier copies already
succeeded, stranding those working shards on the rebuilder. Return the
copied shard ids on the error path and run the cleanup defer even when
recovery fails, so the temp shards get deleted.

---------

Co-authored-by: Chris Lu <chris.lu@gmail.com>
---
 .../command_ec_destructive_guards_test.go     | 42 +++++++++++++++++++
 weed/shell/command_ec_rebuild.go              | 33 ++++++++-------
 2 files changed, 59 insertions(+), 16 deletions(-)

diff --git a/weed/shell/command_ec_destructive_guards_test.go b/weed/shell/command_ec_destructive_guards_test.go
index 6d2e0823f..c3a124c0d 100644
--- a/weed/shell/command_ec_destructive_guards_test.go
+++ b/weed/shell/command_ec_destructive_guards_test.go
@@ -9,6 +9,8 @@ import (
 	"github.com/seaweedfs/seaweedfs/weed/storage/erasure_coding"
 	"github.com/seaweedfs/seaweedfs/weed/storage/needle"
 	"github.com/seaweedfs/seaweedfs/weed/storage/types"
+	"google.golang.org/grpc"
+	"google.golang.org/grpc/credentials/insecure"
 )
 
 // topoWith builds a one-node topology snapshot where regularVids appear as
@@ -191,3 +193,43 @@ func TestPrepareDataToRecover_SelfSourceNotCopied(t *testing.T) {
 		t.Errorf("self-sourced shard 5 should be classified local, got local=%v", local)
 	}
 }
+
+func TestPrepareDataToRecover_ApplyCopyFailureDoesNotCountAsRecoverable(t *testing.T) {
+	var log bytes.Buffer
+	rebuilder := newEcNode("dc1", "rack1", "127.0.0.1:1", 100).
+		addEcVolumeAndShardsForTest(1, "c1", []erasure_coding.ShardId{0, 1, 2, 3, 4, 5, 6, 7, 8})
+	remote := newEcNode("dc1", "rack1", "127.0.0.1:2", 100).
+		addEcVolumeAndShardsForTest(1, "c1", []erasure_coding.ShardId{9})
+	erb := &ecRebuilder{
+		commandEnv: &CommandEnv{
+			env:    make(map[string]string),
+			noLock: true,
+			option: &ShellOptions{
+				GrpcDialOption: grpc.WithTransportCredentials(insecure.NewCredentials()),
+			},
+		},
+		ecNodes:      []*EcNode{rebuilder, remote},
+		writer:       &log,
+		applyChanges: true,
+	}
+
+	locations := make(EcShardLocations, erasure_coding.MaxShardCount)
+	for i := 0; i < 9; i++ {
+		locations[i] = []*EcNode{rebuilder}
+	}
+	locations[9] = []*EcNode{remote}
+
+	copied, local, err := erb.prepareDataToRecover(rebuilder, "c1", needle.VolumeId(1), locations)
+	if err == nil {
+		t.Fatalf("copy failure must leave only 9 usable shards and fail recoverability gate; copied=%v local=%v log:\n%s", copied, local, log.String())
+	}
+	if !strings.Contains(err.Error(), "9 shards are not enough") {
+		t.Fatalf("expected recoverability error to count only successful/local shards, got %v; log:\n%s", err, log.String())
+	}
+	if len(copied) != 0 {
+		t.Fatalf("failed copy must not be recorded as copied/deletable, got %v", copied)
+	}
+	if !strings.Contains(log.String(), "failed to copy 1.9") {
+		t.Fatalf("expected failed copy to be logged, got:\n%s", log.String())
+	}
+}
diff --git a/weed/shell/command_ec_rebuild.go b/weed/shell/command_ec_rebuild.go
index 0b3a5893e..9ede3cf85 100644
--- a/weed/shell/command_ec_rebuild.go
+++ b/weed/shell/command_ec_rebuild.go
@@ -265,14 +265,12 @@ func (erb *ecRebuilder) rebuildOneEcVolume(collection string, volumeId needle.Vo
 	// collect shard files to rebuilder local disk
 	var generatedShardIds []erasure_coding.ShardId
 	copiedShardIds, _, err := erb.prepareDataToRecover(rebuilder, collection, volumeId, locations)
-	if err != nil {
-		return err
-	}
 	defer func() {
-		// Clean up the working copies this run actually made. In dry-run
-		// nothing was copied (copiedShardIds is empty), so this issues no
-		// delete RPC. Use a local error so a cleanup failure cannot mask a
-		// successful rebuild's return value.
+		// Clean up the working copies this run actually made, even when the
+		// recoverability gate failed after some copies already succeeded:
+		// they are temp files on the rebuilder nothing else reclaims. Dry-run
+		// copies nothing (copiedShardIds is empty), so this issues no delete
+		// RPC. Use a local error so a cleanup failure cannot mask the return.
 		if !erb.applyChanges || len(copiedShardIds) == 0 {
 			return
 		}
@@ -280,6 +278,9 @@ func (erb *ecRebuilder) rebuildOneEcVolume(collection string, volumeId needle.Vo
 			erb.write("%s delete copied ec shards %s %d.%v: %v\n", rebuilder.info.Id, collection, volumeId, copiedShardIds, derr)
 		}
 	}()
+	if err != nil {
+		return err
+	}
 
 	if !erb.applyChanges {
 		return nil
@@ -344,11 +345,9 @@ func (erb *ecRebuilder) prepareDataToRecover(rebuilder *EcNode, collection strin
 		}
 	}
 
-	// wouldCopy counts shards available on a remote holder that the rebuilder
-	// could pull. It drives the recoverability gate below independently of
-	// whether we actually copy (dry-run copies nothing), so a dry-run still
-	// reports the plan instead of erroring "not enough shards".
-	wouldCopy := 0
+	// recoverableRemoteShards counts remote shards that can contribute to the
+	// rebuild. Dry-run counts the plan; apply mode counts only successful copies.
+	recoverableRemoteShards := 0
 	for i := 0; i < targetShardCount; i++ {
 		ecNodes := locations[i]
 		shardId := erasure_coding.ShardId(i)
@@ -372,9 +371,8 @@ func (erb *ecRebuilder) prepareDataToRecover(rebuilder *EcNode, collection strin
 			continue
 		}
 
-		wouldCopy++
-
 		if !erb.applyChanges {
+			recoverableRemoteShards++
 			erb.write("would copy %d.%d from %s\n", volumeId, shardId, ecNodes[0].info.Id)
 			continue
 		}
@@ -395,6 +393,7 @@ func (erb *ecRebuilder) prepareDataToRecover(rebuilder *EcNode, collection strin
 			erb.write("%s failed to copy %d.%d from %s: %v\n", rebuilder.info.Id, volumeId, shardId, ecNodes[0].info.Id, copyErr)
 			continue
 		}
+		recoverableRemoteShards++
 		if needEcxFile {
 			needEcxFile = false
 		}
@@ -404,11 +403,13 @@ func (erb *ecRebuilder) prepareDataToRecover(rebuilder *EcNode, collection strin
 		copiedShardIds = append(copiedShardIds, shardId)
 	}
 
-	if len(localShardIds)+wouldCopy >= erasure_coding.DataShardsCount {
+	if len(localShardIds)+recoverableRemoteShards >= erasure_coding.DataShardsCount {
 		return copiedShardIds, localShardIds, nil
 	}
 
-	return nil, nil, fmt.Errorf("%d shards are not enough to recover volume %d", len(localShardIds)+wouldCopy, volumeId)
+	// Hand back what was copied so the caller deletes these orphaned working
+	// shards: recovery failed, but the temp files are already on the rebuilder.
+	return copiedShardIds, localShardIds, fmt.Errorf("%d shards are not enough to recover volume %d", len(localShardIds)+recoverableRemoteShards, volumeId)
 
 }