From 008ea03ef54f0eddcce38ec6dcc9db69415614e0 Mon Sep 17 00:00:00 2001 From: pingqiu Date: Thu, 9 Apr 2026 17:32:37 -0700 Subject: [PATCH] fix: suppress SessionFailed after successful remote rebuild completion After RemoteRebuildIO.TransferFullBase returns, the OnAck callback has already emitted SessionCompleted and stored achievedLSN. But RebuildExecutor.Execute() continues calling sender methods which fail ("sender stopped") because the completion event already cleaned up the sender. This error propagated to ExecutePendingRebuild which emitted a spurious SessionFailed, knocking the mode back to degraded. Fix: check remoteRebuildAchieved before emitting SessionFailed. If the rebuild already completed via the ack path, log the post-completion error but suppress the SessionFailed event. Co-Authored-By: Claude Opus 4.6 (1M context) --- weed/server/block_recovery.go | 34 +++++++++++++++++++++++----------- 1 file changed, 23 insertions(+), 11 deletions(-) diff --git a/weed/server/block_recovery.go b/weed/server/block_recovery.go index 79e14d069..2b88d771e 100644 --- a/weed/server/block_recovery.go +++ b/weed/server/block_recovery.go @@ -633,17 +633,29 @@ func (rm *RecoveryManager) ExecutePendingRebuild(replicaID string, targetLSN uin replicaID, targetLSN, pe.RebuildIO) err := rt.ExecuteRebuildPlan(pe.Driver, pe.Plan, pe.RebuildIO, pe.VolumeID, pe.ReplicaID, rm) if err != nil { - glog.Warningf("recovery: rebuild execution failed for %s: %v", replicaID, err) - // Emit SessionFailed only for transport errors (dial/EOF/decode). - // Ack-driven failures (errRebuildAckFailed) already emitted SessionFailed - // through ObserveReplicaRebuildSessionAck — don't double-emit. - if !errors.Is(err, errRebuildAckFailed) && rm.bs != nil && rm.bs.v2Core != nil { - rm.bs.applyCoreEvent(engine.SessionFailed{ - ID: pe.VolumeID, - ReplicaID: replicaID, - Kind: engine.SessionRebuild, - Reason: err.Error(), - }) + // Check if the rebuild already completed via the remote ack path. + // After SessionAckCompleted, the OnAck callback emits SessionCompleted + // and stores achievedLSN in remoteRebuildAchieved. But then + // RebuildExecutor.Execute() continues calling sender methods which fail + // ("sender stopped") because the completion event already cleaned up. + // Suppress SessionFailed for these post-completion errors. + rm.mu.Lock() + _, alreadyCompleted := rm.remoteRebuildAchieved[replicaID] + rm.mu.Unlock() + if alreadyCompleted { + glog.V(0).Infof("recovery: rebuild post-completion error for %s (suppressed): %v", replicaID, err) + } else { + glog.Warningf("recovery: rebuild execution failed for %s: %v", replicaID, err) + // Emit SessionFailed only for real failures. + // Ack-driven failures (errRebuildAckFailed) already emitted SessionFailed. + if !errors.Is(err, errRebuildAckFailed) && rm.bs != nil && rm.bs.v2Core != nil { + rm.bs.applyCoreEvent(engine.SessionFailed{ + ID: pe.VolumeID, + ReplicaID: replicaID, + Kind: engine.SessionRebuild, + Reason: err.Error(), + }) + } } } return err