fix: PlanRebuild targetLSN=0 when replica is degraded (fall back from CommittedLSN to HeadLSN)

Root cause: StatusSnapshot().CommittedLSN reports 0 in sync_all mode when the replica shipper has no flushed progress (NeedsRebuild state). This is correct for a lineage-safe committed boundary, but PlanRebuild uses CommittedLSN as RebuildTargetLSN. With target=0, shouldStartSessionCommand rejects the StartRebuildCommand, and the rebuild IO never executes. Fix: PlanRebuild falls back to HeadLSN when CommittedLSN is 0. The primary's WAL head IS the data boundary the replica needs to reach. The fact that no replica has confirmed durability is exactly why we're rebuilding. Also adds command type logging to coreApplyAndLog so a tester can verify which commands are actually emitted vs. silently dropped. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-20 08:41:29 +00:00 · 2026-04-09 15:35:31 -07:00
parent bc767eb9d2
commit a79cba0be7
2 changed files with 17 additions and 3 deletions
--- a/sw-block/engine/replication/driver.go
+++ b/sw-block/engine/replication/driver.go
@@ -144,13 +144,23 @@ func (d *RecoveryDriver) PlanRebuild(replicaID string) (*RecoveryPlan, error) {
 	history := d.Storage.GetRetainedHistory()
 	source, snapLSN := history.RebuildSourceDecision()

+	// RebuildTargetLSN: the primary's data boundary the replica must reach.
+	// CommittedLSN is the lineage-safe boundary, but during rebuild the replica
+	// is down — sync_all mode reports CommittedLSN=0 because no replica has
+	// confirmed durability. In that case, fall back to HeadLSN (the primary's
+	// actual data extent). The rebuild brings the replica up to the primary's head.
+	rebuildTarget := history.CommittedLSN
+	if rebuildTarget == 0 {
+		rebuildTarget = history.HeadLSN
+	}
+
 	plan := &RecoveryPlan{
 		ReplicaID:          replicaID,
 		SessionID:          sessID,
 		Outcome:            OutcomeNeedsRebuild,
 		RebuildSource:      source,
 		RebuildSnapshotLSN: snapLSN,
-		RebuildTargetLSN:   history.CommittedLSN,
+		RebuildTargetLSN:   rebuildTarget,
 	}

 	if source == RebuildSnapshotTail {
--- a/weed/server/volume_server_block.go
+++ b/weed/server/volume_server_block.go
@@ -879,14 +879,18 @@ func (bs *BlockService) applyCoreEvent(ev engine.Event) {
 // so the VS log contains a complete trace for post-run diagnosis.
 func (bs *BlockService) coreApplyAndLog(ev engine.Event) engine.ApplyResult {
 	result := bs.v2Core.ApplyEvent(ev)
-	glog.V(0).Infof("core [%s]: event=%T mode=%s pub=%v reason=%q readiness={applied=%v shipper_cfg=%v shipper_conn=%v recv=%v} boundary={durable=%d committed=%d last_barrier_ok=%v last_barrier_reason=%q} cmds=%d",
+	cmdTypes := make([]string, len(result.Commands))
+	for i, cmd := range result.Commands {
+		cmdTypes[i] = fmt.Sprintf("%T", cmd)
+	}
+	glog.V(0).Infof("core [%s]: event=%T mode=%s pub=%v reason=%q readiness={applied=%v shipper_cfg=%v shipper_conn=%v recv=%v} boundary={durable=%d committed=%d last_barrier_ok=%v last_barrier_reason=%q} cmds=%d %v",
 		ev.VolumeID(), ev, result.Projection.Mode.Name,
 		result.Projection.Publication.Healthy, result.Projection.Publication.Reason,
 		result.Projection.Readiness.RoleApplied, result.Projection.Readiness.ShipperConfigured,
 		result.Projection.Readiness.ShipperConnected, result.Projection.Readiness.ReceiverReady,
 		result.Projection.Boundary.DurableLSN, result.Projection.Boundary.CommittedLSN,
 		result.Projection.Boundary.LastBarrierOK, result.Projection.Boundary.LastBarrierReason,
-		len(result.Commands))
+		len(result.Commands), cmdTypes)
 	return result
 }