From a79cba0be78edab430fca11d2bb347e18d508abe Mon Sep 17 00:00:00 2001 From: pingqiu Date: Thu, 9 Apr 2026 15:35:31 -0700 Subject: [PATCH] fix: PlanRebuild targetLSN=0 when replica is degraded (CommittedLSN fallback) Root cause: StatusSnapshot().CommittedLSN reports 0 in sync_all mode when the replica shipper has no flushed progress (NeedsRebuild state). This is correct for the lineage-safe committed boundary, but PlanRebuild uses CommittedLSN as RebuildTargetLSN. With target=0, shouldStartSessionCommand rejects the StartRebuildCommand, and the rebuild IO never executes. Fix: PlanRebuild falls back to HeadLSN when CommittedLSN is 0. The primary's WAL head IS the data boundary the replica needs to reach. The fact that no replica has confirmed durability is exactly why we're rebuilding. Also adds command type logging to coreApplyAndLog so the tester can verify which commands are actually emitted vs silently dropped. Co-Authored-By: Claude Opus 4.6 (1M context) --- sw-block/engine/replication/driver.go | 12 +++++++++++- weed/server/volume_server_block.go | 8 ++++++-- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/sw-block/engine/replication/driver.go b/sw-block/engine/replication/driver.go index 74508735d..015ce39e1 100644 --- a/sw-block/engine/replication/driver.go +++ b/sw-block/engine/replication/driver.go @@ -144,13 +144,23 @@ func (d *RecoveryDriver) PlanRebuild(replicaID string) (*RecoveryPlan, error) { history := d.Storage.GetRetainedHistory() source, snapLSN := history.RebuildSourceDecision() + // RebuildTargetLSN: the primary's data boundary the replica must reach. + // CommittedLSN is the lineage-safe boundary, but during rebuild the replica + // is down — sync_all mode reports CommittedLSN=0 because no replica has + // confirmed durability. In that case, fall back to HeadLSN (the primary's + // actual data extent). The rebuild brings the replica up to the primary's head. 
+ rebuildTarget := history.CommittedLSN + if rebuildTarget == 0 { + rebuildTarget = history.HeadLSN + } + plan := &RecoveryPlan{ ReplicaID: replicaID, SessionID: sessID, Outcome: OutcomeNeedsRebuild, RebuildSource: source, RebuildSnapshotLSN: snapLSN, - RebuildTargetLSN: history.CommittedLSN, + RebuildTargetLSN: rebuildTarget, } if source == RebuildSnapshotTail { diff --git a/weed/server/volume_server_block.go b/weed/server/volume_server_block.go index 5469de4f0..934786a1c 100644 --- a/weed/server/volume_server_block.go +++ b/weed/server/volume_server_block.go @@ -879,14 +879,18 @@ func (bs *BlockService) applyCoreEvent(ev engine.Event) { // so the VS log contains a complete trace for post-run diagnosis. func (bs *BlockService) coreApplyAndLog(ev engine.Event) engine.ApplyResult { result := bs.v2Core.ApplyEvent(ev) - glog.V(0).Infof("core [%s]: event=%T mode=%s pub=%v reason=%q readiness={applied=%v shipper_cfg=%v shipper_conn=%v recv=%v} boundary={durable=%d committed=%d last_barrier_ok=%v last_barrier_reason=%q} cmds=%d", + cmdTypes := make([]string, len(result.Commands)) + for i, cmd := range result.Commands { + cmdTypes[i] = fmt.Sprintf("%T", cmd) + } + glog.V(0).Infof("core [%s]: event=%T mode=%s pub=%v reason=%q readiness={applied=%v shipper_cfg=%v shipper_conn=%v recv=%v} boundary={durable=%d committed=%d last_barrier_ok=%v last_barrier_reason=%q} cmds=%d %v", ev.VolumeID(), ev, result.Projection.Mode.Name, result.Projection.Publication.Healthy, result.Projection.Publication.Reason, result.Projection.Readiness.RoleApplied, result.Projection.Readiness.ShipperConfigured, result.Projection.Readiness.ShipperConnected, result.Projection.Readiness.ReceiverReady, result.Projection.Boundary.DurableLSN, result.Projection.Boundary.CommittedLSN, result.Projection.Boundary.LastBarrierOK, result.Projection.Boundary.LastBarrierReason, - len(result.Commands)) + len(result.Commands), cmdTypes) return result }