mirror of
https://github.com/seaweedfs/seaweedfs.git
synced 2026-05-20 08:41:29 +00:00
fix: PlanRebuild targetLSN=0 when replica is degraded (CommittedLSN fallback)
Root cause: StatusSnapshot().CommittedLSN reports 0 in sync_all mode when the replica shipper has no flushed progress (NeedsRebuild state). This is correct for the lineage-safe committed boundary, but PlanRebuild uses CommittedLSN as RebuildTargetLSN. With target=0, shouldStartSessionCommand rejects the StartRebuildCommand, and the rebuild IO never executes. Fix: PlanRebuild falls back to HeadLSN when CommittedLSN is 0. The primary's WAL head IS the data boundary the replica needs to reach. The fact that no replica has confirmed durability is exactly why we're rebuilding. Also adds command type logging to coreApplyAndLog so the tester can verify which commands are actually emitted vs silently dropped. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -144,13 +144,23 @@ func (d *RecoveryDriver) PlanRebuild(replicaID string) (*RecoveryPlan, error) {
|
||||
history := d.Storage.GetRetainedHistory()
|
||||
source, snapLSN := history.RebuildSourceDecision()
|
||||
|
||||
// RebuildTargetLSN: the primary's data boundary the replica must reach.
|
||||
// CommittedLSN is the lineage-safe boundary, but during rebuild the replica
|
||||
// is down — sync_all mode reports CommittedLSN=0 because no replica has
|
||||
// confirmed durability. In that case, fall back to HeadLSN (the primary's
|
||||
// actual data extent). The rebuild brings the replica up to the primary's head.
|
||||
rebuildTarget := history.CommittedLSN
|
||||
if rebuildTarget == 0 {
|
||||
rebuildTarget = history.HeadLSN
|
||||
}
|
||||
|
||||
plan := &RecoveryPlan{
|
||||
ReplicaID: replicaID,
|
||||
SessionID: sessID,
|
||||
Outcome: OutcomeNeedsRebuild,
|
||||
RebuildSource: source,
|
||||
RebuildSnapshotLSN: snapLSN,
|
||||
RebuildTargetLSN: history.CommittedLSN,
|
||||
RebuildTargetLSN: rebuildTarget,
|
||||
}
|
||||
|
||||
if source == RebuildSnapshotTail {
|
||||
|
||||
@@ -879,14 +879,18 @@ func (bs *BlockService) applyCoreEvent(ev engine.Event) {
|
||||
// so the VS log contains a complete trace for post-run diagnosis.
|
||||
func (bs *BlockService) coreApplyAndLog(ev engine.Event) engine.ApplyResult {
|
||||
result := bs.v2Core.ApplyEvent(ev)
|
||||
glog.V(0).Infof("core [%s]: event=%T mode=%s pub=%v reason=%q readiness={applied=%v shipper_cfg=%v shipper_conn=%v recv=%v} boundary={durable=%d committed=%d last_barrier_ok=%v last_barrier_reason=%q} cmds=%d",
|
||||
cmdTypes := make([]string, len(result.Commands))
|
||||
for i, cmd := range result.Commands {
|
||||
cmdTypes[i] = fmt.Sprintf("%T", cmd)
|
||||
}
|
||||
glog.V(0).Infof("core [%s]: event=%T mode=%s pub=%v reason=%q readiness={applied=%v shipper_cfg=%v shipper_conn=%v recv=%v} boundary={durable=%d committed=%d last_barrier_ok=%v last_barrier_reason=%q} cmds=%d %v",
|
||||
ev.VolumeID(), ev, result.Projection.Mode.Name,
|
||||
result.Projection.Publication.Healthy, result.Projection.Publication.Reason,
|
||||
result.Projection.Readiness.RoleApplied, result.Projection.Readiness.ShipperConfigured,
|
||||
result.Projection.Readiness.ShipperConnected, result.Projection.Readiness.ReceiverReady,
|
||||
result.Projection.Boundary.DurableLSN, result.Projection.Boundary.CommittedLSN,
|
||||
result.Projection.Boundary.LastBarrierOK, result.Projection.Boundary.LastBarrierReason,
|
||||
len(result.Commands))
|
||||
len(result.Commands), cmdTypes)
|
||||
return result
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user