diff --git a/weed/shell/command_ec_destructive_guards_test.go b/weed/shell/command_ec_destructive_guards_test.go index 6d2e0823f..c3a124c0d 100644 --- a/weed/shell/command_ec_destructive_guards_test.go +++ b/weed/shell/command_ec_destructive_guards_test.go @@ -9,6 +9,8 @@ import ( "github.com/seaweedfs/seaweedfs/weed/storage/erasure_coding" "github.com/seaweedfs/seaweedfs/weed/storage/needle" "github.com/seaweedfs/seaweedfs/weed/storage/types" + "google.golang.org/grpc" + "google.golang.org/grpc/credentials/insecure" ) // topoWith builds a one-node topology snapshot where regularVids appear as @@ -191,3 +193,43 @@ func TestPrepareDataToRecover_SelfSourceNotCopied(t *testing.T) { t.Errorf("self-sourced shard 5 should be classified local, got local=%v", local) } } + +func TestPrepareDataToRecover_ApplyCopyFailureDoesNotCountAsRecoverable(t *testing.T) { + var log bytes.Buffer + rebuilder := newEcNode("dc1", "rack1", "127.0.0.1:1", 100). + addEcVolumeAndShardsForTest(1, "c1", []erasure_coding.ShardId{0, 1, 2, 3, 4, 5, 6, 7, 8}) + remote := newEcNode("dc1", "rack1", "127.0.0.1:2", 100). + addEcVolumeAndShardsForTest(1, "c1", []erasure_coding.ShardId{9}) + erb := &ecRebuilder{ + commandEnv: &CommandEnv{ + env: make(map[string]string), + noLock: true, + option: &ShellOptions{ + GrpcDialOption: grpc.WithTransportCredentials(insecure.NewCredentials()), + }, + }, + ecNodes: []*EcNode{rebuilder, remote}, + writer: &log, + applyChanges: true, + } + + locations := make(EcShardLocations, erasure_coding.MaxShardCount) + for i := 0; i < 9; i++ { + locations[i] = []*EcNode{rebuilder} + } + locations[9] = []*EcNode{remote} + + copied, local, err := erb.prepareDataToRecover(rebuilder, "c1", needle.VolumeId(1), locations) + if err == nil { + t.Fatalf("copy failure must leave only 9 usable shards and fail recoverability gate; copied=%v local=%v log:\n%s", copied, local, log.String()) + } + if !strings.Contains(err.Error(), "9 shards are not enough") { + t.Fatalf("expected recoverability error to count only successful/local shards, got %v; log:\n%s", err, log.String()) + } + if len(copied) != 0 { + t.Fatalf("failed copy must not be recorded as copied/deletable, got %v", copied) + } + if !strings.Contains(log.String(), "failed to copy 1.9") { + t.Fatalf("expected failed copy to be logged, got:\n%s", log.String()) + } +} diff --git a/weed/shell/command_ec_rebuild.go b/weed/shell/command_ec_rebuild.go index 0b3a5893e..9ede3cf85 100644 --- a/weed/shell/command_ec_rebuild.go +++ b/weed/shell/command_ec_rebuild.go @@ -265,14 +265,12 @@ func (erb *ecRebuilder) rebuildOneEcVolume(collection string, volumeId needle.Vo // collect shard files to rebuilder local disk var generatedShardIds []erasure_coding.ShardId copiedShardIds, _, err := erb.prepareDataToRecover(rebuilder, collection, volumeId, locations) - if err != nil { - return err - } defer func() { - // Clean up the working copies this run actually made. In dry-run - // nothing was copied (copiedShardIds is empty), so this issues no - // delete RPC. Use a local error so a cleanup failure cannot mask a - // successful rebuild's return value. + // Clean up the working copies this run actually made, even when the + // recoverability gate failed after some copies already succeeded: + // they are temp files on the rebuilder nothing else reclaims. Dry-run + // copies nothing (copiedShardIds is empty), so this issues no delete + // RPC. Use a local error so a cleanup failure cannot mask the return. if !erb.applyChanges || len(copiedShardIds) == 0 { return } @@ -280,6 +278,9 @@ func (erb *ecRebuilder) rebuildOneEcVolume(collection string, volumeId needle.Vo erb.write("%s delete copied ec shards %s %d.%v: %v\n", rebuilder.info.Id, collection, volumeId, copiedShardIds, derr) } }() + if err != nil { + return err + } if !erb.applyChanges { return nil @@ -344,11 +345,9 @@ func (erb *ecRebuilder) prepareDataToRecover(rebuilder *EcNode, collection strin } } - // wouldCopy counts shards available on a remote holder that the rebuilder - // could pull. It drives the recoverability gate below independently of - // whether we actually copy (dry-run copies nothing), so a dry-run still - // reports the plan instead of erroring "not enough shards". - wouldCopy := 0 + // recoverableRemoteShards counts remote shards that can contribute to the + // rebuild. Dry-run counts the plan; apply mode counts only successful copies. + recoverableRemoteShards := 0 for i := 0; i < targetShardCount; i++ { ecNodes := locations[i] shardId := erasure_coding.ShardId(i) @@ -372,9 +371,8 @@ func (erb *ecRebuilder) prepareDataToRecover(rebuilder *EcNode, collection strin continue } - wouldCopy++ - if !erb.applyChanges { + recoverableRemoteShards++ erb.write("would copy %d.%d from %s\n", volumeId, shardId, ecNodes[0].info.Id) continue } @@ -395,6 +393,7 @@ func (erb *ecRebuilder) prepareDataToRecover(rebuilder *EcNode, collection strin erb.write("%s failed to copy %d.%d from %s: %v\n", rebuilder.info.Id, volumeId, shardId, ecNodes[0].info.Id, copyErr) continue } + recoverableRemoteShards++ if needEcxFile { needEcxFile = false } @@ -404,11 +403,13 @@ func (erb *ecRebuilder) prepareDataToRecover(rebuilder *EcNode, collection strin copiedShardIds = append(copiedShardIds, shardId) } - if len(localShardIds)+wouldCopy >= erasure_coding.DataShardsCount { + if len(localShardIds)+recoverableRemoteShards >= erasure_coding.DataShardsCount { return copiedShardIds, localShardIds, nil } - return nil, nil, fmt.Errorf("%d shards are not enough to recover volume %d", len(localShardIds)+wouldCopy, volumeId) + // Hand back what was copied so the caller deletes these orphaned working + // shards: recovery failed, but the temp files are already on the rebuilder. + return copiedShardIds, localShardIds, fmt.Errorf("%d shards are not enough to recover volume %d", len(localShardIds)+recoverableRemoteShards, volumeId) }