fix: PlanRebuild targetLSN=0 when replica is degraded (CommittedLSN fallback)

Root cause: StatusSnapshot().CommittedLSN reports 0 in sync_all mode when
the replica shipper has no flushed progress (NeedsRebuild state). This is
correct for a lineage-safe committed boundary, but PlanRebuild uses
CommittedLSN as RebuildTargetLSN. With target=0, shouldStartSessionCommand
rejects the StartRebuildCommand, and the rebuild IO never executes.

Fix: PlanRebuild falls back to HeadLSN when CommittedLSN is 0. The
primary's WAL head IS the data boundary the replica needs to reach.
The fact that no replica has confirmed durability is exactly why we're
rebuilding.

Also adds command type logging to coreApplyAndLog so the tester can verify
which commands are actually emitted vs silently dropped.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
pingqiu
2026-04-09 15:35:31 -07:00
parent bc767eb9d2
commit a79cba0be7
2 changed files with 17 additions and 3 deletions

View File

@@ -144,13 +144,23 @@ func (d *RecoveryDriver) PlanRebuild(replicaID string) (*RecoveryPlan, error) {
history := d.Storage.GetRetainedHistory()
source, snapLSN := history.RebuildSourceDecision()
// RebuildTargetLSN: the primary's data boundary the replica must reach.
// CommittedLSN is the lineage-safe boundary, but during rebuild the replica
// is down — sync_all mode reports CommittedLSN=0 because no replica has
// confirmed durability. In that case, fall back to HeadLSN (the primary's
// actual data extent). The rebuild brings the replica up to the primary's head.
rebuildTarget := history.CommittedLSN
if rebuildTarget == 0 {
rebuildTarget = history.HeadLSN
}
plan := &RecoveryPlan{
ReplicaID: replicaID,
SessionID: sessID,
Outcome: OutcomeNeedsRebuild,
RebuildSource: source,
RebuildSnapshotLSN: snapLSN,
RebuildTargetLSN: history.CommittedLSN,
RebuildTargetLSN: rebuildTarget,
}
if source == RebuildSnapshotTail {

View File

@@ -879,14 +879,18 @@ func (bs *BlockService) applyCoreEvent(ev engine.Event) {
// so the VS log contains a complete trace for post-run diagnosis.
func (bs *BlockService) coreApplyAndLog(ev engine.Event) engine.ApplyResult {
// Apply the event to the core first; everything below only reads the returned result.
result := bs.v2Core.ApplyEvent(ev)
// NOTE(review): this span looks like a rendered diff hunk with its +/- markers stripped;
// the Infof line directly below is presumably the removed version and the later Infof
// (with the trailing %v) the added one — confirm against the repository.
glog.V(0).Infof("core [%s]: event=%T mode=%s pub=%v reason=%q readiness={applied=%v shipper_cfg=%v shipper_conn=%v recv=%v} boundary={durable=%d committed=%d last_barrier_ok=%v last_barrier_reason=%q} cmds=%d",
// Record the dynamic Go type (%T) of every command in result.Commands so the log
// line can show which commands were produced, not only how many.
cmdTypes := make([]string, len(result.Commands))
for i, cmd := range result.Commands {
cmdTypes[i] = fmt.Sprintf("%T", cmd)
}
// Same log line as before, with a trailing %v that prints the cmdTypes slice after the count.
glog.V(0).Infof("core [%s]: event=%T mode=%s pub=%v reason=%q readiness={applied=%v shipper_cfg=%v shipper_conn=%v recv=%v} boundary={durable=%d committed=%d last_barrier_ok=%v last_barrier_reason=%q} cmds=%d %v",
ev.VolumeID(), ev, result.Projection.Mode.Name,
result.Projection.Publication.Healthy, result.Projection.Publication.Reason,
result.Projection.Readiness.RoleApplied, result.Projection.Readiness.ShipperConfigured,
result.Projection.Readiness.ShipperConnected, result.Projection.Readiness.ReceiverReady,
result.Projection.Boundary.DurableLSN, result.Projection.Boundary.CommittedLSN,
result.Projection.Boundary.LastBarrierOK, result.Projection.Boundary.LastBarrierReason,
len(result.Commands))
len(result.Commands), cmdTypes)
// The ApplyEvent result is returned unchanged; logging does not modify it.
return result
}