mirror of
https://github.com/seaweedfs/seaweedfs.git
synced 2026-05-20 08:41:29 +00:00
feat: separate CommittedLSN from CheckpointLSN, close catch-up ONE CHAIN (Phase 08 P2)
CommittedLSN separation:
- StatusSnapshot().CommittedLSN = nextLSN-1 (WAL head) for sync_all
- Was: flusher.CheckpointLSN() (collapsed catch-up window to zero)
- Now: entries between checkpoint and head are committed but unflushed
- Creates real catch-up window: TailLSN=5 < replica=6 < CommittedLSN=10

Catch-up ONE CHAIN PROVEN:
assignment → PlanRecovery(replica=6) → OutcomeCatchUp → CatchUpExecutor(IO=v2bridge) → StreamWALEntries(6,10) → real ScanFrom from disk → engine progress → InSync → pinner.ActiveHoldCount()==0

Both chains now closed:
- Catch-up: plan → executor(IO) → v2bridge → blockvol → complete
- Rebuild: plan → executor(IO) → v2bridge → blockvol → complete

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -899,7 +899,7 @@ type V2StatusSnapshot struct {
|
||||
//
|
||||
// WALHeadLSN ← nextLSN - 1 (last written LSN)
|
||||
// WALTailLSN ← super.WALCheckpointLSN (LSN boundary, not byte offset)
|
||||
// CommittedLSN ← flusher.CheckpointLSN() (V1 interim: barrier-confirmed + flushed)
|
||||
// CommittedLSN ← nextLSN - 1 (for sync_all: every write is barrier-confirmed)
|
||||
// CheckpointLSN ← super.WALCheckpointLSN (durable base image)
|
||||
// CheckpointTrusted ← super.Validate() == nil (superblock integrity)
|
||||
func (v *BlockVol) StatusSnapshot() V2StatusSnapshot {
|
||||
@@ -910,15 +910,13 @@ func (v *BlockVol) StatusSnapshot() V2StatusSnapshot {
|
||||
|
||||
// WALTailLSN: the oldest retained LSN boundary for recovery classification.
|
||||
// Entries with LSN > WALTailLSN are guaranteed in the WAL.
|
||||
// Entries with LSN <= WALTailLSN have been checkpointed and WAL space
|
||||
// may be reused. This is an LSN (not a physical byte offset).
|
||||
walTailLSN := v.super.WALCheckpointLSN
|
||||
|
||||
// CommittedLSN: V1 interim mapping. committed = checkpointed after flush.
|
||||
var committedLSN uint64
|
||||
if v.flusher != nil {
|
||||
committedLSN = v.flusher.CheckpointLSN()
|
||||
}
|
||||
// CommittedLSN: for sync_all mode, every write is barrier-confirmed
|
||||
// before returning. So WALHeadLSN (nextLSN-1) IS the committed boundary.
|
||||
// This separates CommittedLSN from CheckpointLSN — entries between
|
||||
// checkpoint and head are committed but not yet flushed to extent.
|
||||
committedLSN := headLSN
|
||||
|
||||
return V2StatusSnapshot{
|
||||
WALHeadLSN: headLSN,
|
||||
|
||||
@@ -68,20 +68,16 @@ func TestP2_CatchUpClosure_OneChain(t *testing.T) {
|
||||
t.Logf("catch-up: head=%d tail=%d committed=%d checkpoint=%d",
|
||||
state.WALHeadLSN, state.WALTailLSN, state.CommittedLSN, state.CheckpointLSN)
|
||||
|
||||
// Precondition: head > committed (entries above checkpoint exist).
|
||||
if state.WALHeadLSN <= state.CommittedLSN {
|
||||
t.Fatalf("need entries above checkpoint: head=%d committed=%d", state.WALHeadLSN, state.CommittedLSN)
|
||||
// Precondition: CommittedLSN > TailLSN (catch-up window exists).
|
||||
if state.CommittedLSN <= state.WALTailLSN {
|
||||
t.Fatalf("no catch-up window: committed=%d tail=%d", state.CommittedLSN, state.WALTailLSN)
|
||||
}
|
||||
|
||||
// Step 1: assignment.
|
||||
driver.Orchestrator.ProcessAssignment(makeIntent(ca, 1, "replica"))
|
||||
|
||||
// Step 2: plan — replica at committedLSN = ZeroGap (V1 interim).
|
||||
// Replica at LESS than committedLSN → CatchUp.
|
||||
replicaLSN := state.CommittedLSN - 1
|
||||
if replicaLSN == 0 && state.CommittedLSN > 1 {
|
||||
replicaLSN = state.CommittedLSN - 1
|
||||
}
|
||||
// Step 2: plan — replica WITHIN the catch-up window (between tail and committed).
|
||||
replicaLSN := state.WALTailLSN + 1 // just above tail, within window
|
||||
|
||||
plan, err := driver.PlanRecovery("vol1/vs2", replicaLSN)
|
||||
if err != nil {
|
||||
@@ -111,11 +107,7 @@ func TestP2_CatchUpClosure_OneChain(t *testing.T) {
|
||||
|
||||
t.Log("catch-up: ONE CHAIN proven: plan → CatchUpExecutor → complete → InSync → pins released")
|
||||
} else {
|
||||
// V1 interim: CommittedLSN = TailLSN after flush.
|
||||
// No gap between tail and committed → OutcomeCatchUp structurally unreachable.
|
||||
// This is a known V1 limitation, NOT a test failure.
|
||||
t.Skipf("catch-up: V1 interim → %s (replica=%d committed=%d tail=%d). "+
|
||||
"One-chain wiring exists but V1 model prevents OutcomeCatchUp when committed=tail.",
|
||||
t.Fatalf("catch-up: unexpected outcome=%s (replica=%d committed=%d tail=%d)",
|
||||
plan.Outcome, replicaLSN, state.CommittedLSN, state.WALTailLSN)
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user