fix: suppress SessionFailed after successful remote rebuild completion

After RemoteRebuildIO.TransferFullBase returns, the OnAck callback has
already emitted SessionCompleted and stored achievedLSN in
remoteRebuildAchieved. But RebuildExecutor.Execute() continues calling
sender methods, which fail ("sender stopped") because the completion
event already cleaned up the sender. This error propagated to
ExecutePendingRebuild, which emitted a spurious SessionFailed, knocking
the mode back to degraded.

Fix: check remoteRebuildAchieved before emitting SessionFailed. If the
rebuild already completed via the ack path, log the post-completion
error but suppress the SessionFailed event. The error is still returned
to the caller of ExecutePendingRebuild; only the event is suppressed.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
pingqiu
2026-04-09 17:32:37 -07:00
parent 55862f1ab1
commit 008ea03ef5

View File

@@ -633,17 +633,29 @@ func (rm *RecoveryManager) ExecutePendingRebuild(replicaID string, targetLSN uin
replicaID, targetLSN, pe.RebuildIO)
err := rt.ExecuteRebuildPlan(pe.Driver, pe.Plan, pe.RebuildIO, pe.VolumeID, pe.ReplicaID, rm)
if err != nil {
glog.Warningf("recovery: rebuild execution failed for %s: %v", replicaID, err)
// Emit SessionFailed only for transport errors (dial/EOF/decode).
// Ack-driven failures (errRebuildAckFailed) already emitted SessionFailed
// through ObserveReplicaRebuildSessionAck — don't double-emit.
if !errors.Is(err, errRebuildAckFailed) && rm.bs != nil && rm.bs.v2Core != nil {
rm.bs.applyCoreEvent(engine.SessionFailed{
ID: pe.VolumeID,
ReplicaID: replicaID,
Kind: engine.SessionRebuild,
Reason: err.Error(),
})
// Check if the rebuild already completed via the remote ack path.
// After SessionAckCompleted, the OnAck callback emits SessionCompleted
// and stores achievedLSN in remoteRebuildAchieved. But then
// RebuildExecutor.Execute() continues calling sender methods which fail
// ("sender stopped") because the completion event already cleaned up.
// Suppress SessionFailed for these post-completion errors.
rm.mu.Lock()
_, alreadyCompleted := rm.remoteRebuildAchieved[replicaID]
rm.mu.Unlock()
if alreadyCompleted {
glog.V(0).Infof("recovery: rebuild post-completion error for %s (suppressed): %v", replicaID, err)
} else {
glog.Warningf("recovery: rebuild execution failed for %s: %v", replicaID, err)
// Emit SessionFailed only for real failures.
// Ack-driven failures (errRebuildAckFailed) already emitted SessionFailed.
if !errors.Is(err, errRebuildAckFailed) && rm.bs != nil && rm.bs.v2Core != nil {
rm.bs.applyCoreEvent(engine.SessionFailed{
ID: pe.VolumeID,
ReplicaID: replicaID,
Kind: engine.SessionRebuild,
Reason: err.Error(),
})
}
}
}
return err