diff --git a/sw-block/.private/phase/phase-13-cp6-retention.md b/sw-block/.private/phase/phase-13-cp6-retention.md index 5d2624e00..8e2c536d8 100644 --- a/sw-block/.private/phase/phase-13-cp6-retention.md +++ b/sw-block/.private/phase/phase-13-cp6-retention.md @@ -57,7 +57,7 @@ All 3 retention tests rewritten from placeholder/PASS* to hard-assertion proofs: | Test | Was | Now | Hard assertion | |------|-----|-----|----------------| | `TestWalRetention_RequiredReplicaBlocksReclaim` | PASS (log-only, no assertion) | PASS (hard assert) | `checkpointLSN <= replicaFlushedLSN` — flusher did not advance past retention floor | -| `TestWalRetention_TimeoutTriggersNeedsRebuild` | PASS (log-only, no assertion) | PASS (hard assert) | `s.State() == NeedsRebuild` after 1ns timeout evaluation | +| `TestWalRetention_TimeoutTriggersNeedsRebuild` | PASS (log-only, no assertion) | PASS (hard assert) | `s.State() == NeedsRebuild` + `checkpointAfter > replicaFlushedLSN` (hold released) | | `TestWalRetention_MaxBytesTriggersNeedsRebuild` | PASS* (logged "not implemented") | PASS (hard assert) | `s.State() == NeedsRebuild` after lag exceeds 8KB budget | ## Proof Promotion @@ -67,7 +67,7 @@ All 3 retention tests rewritten from placeholder/PASS* to hard-assertion proofs: | Test | What it proves | |------|---------------| | `TestWalRetention_RequiredReplicaBlocksReclaim` | Flusher checkpoint does not advance past `replicaFlushedLSN` while recoverable replica is behind | -| `TestWalRetention_TimeoutTriggersNeedsRebuild` | Timeout budget evaluation transitions shipper to `NeedsRebuild` (verified via `State()` assertion) | +| `TestWalRetention_TimeoutTriggersNeedsRebuild` | Timeout budget → `NeedsRebuild` (State assertion) + checkpoint advances past replicaFlushedLSN after flush (hold-release assertion) | | `TestWalRetention_MaxBytesTriggersNeedsRebuild` | Max-bytes budget evaluation transitions shipper to `NeedsRebuild` (verified via `State()` assertion, uses actual `BlockSize` from volume config) | ## What CP13-6 Does NOT Close diff --git a/weed/storage/blockvol/sync_all_protocol_test.go b/weed/storage/blockvol/sync_all_protocol_test.go index a0390fac0..35ac45c3e 100644 --- a/weed/storage/blockvol/sync_all_protocol_test.go +++ b/weed/storage/blockvol/sync_all_protocol_test.go @@ -391,12 +391,8 @@ func TestReconnect_GapBeyondRetainedWal_NeedsRebuild(t *testing.T) { // ---------- WAL retention ---------- // TestWalRetention_RequiredReplicaBlocksReclaim verifies that the flusher -// does not reclaim WAL entries that a required replica still needs for catch-up. -// -// Currently EXPECTED TO FAIL: WAL reclaim is driven only by checkpointLSN, -// not replica progress. -// TestWalRetention_RequiredReplicaBlocksReclaim verifies that the flusher -// does not advance the WAL tail past entries a recoverable replica still needs. +// does not advance the WAL checkpoint past entries a recoverable replica +// still needs for catch-up. // // CP13-6 proof: retention floor from MinRecoverableFlushedLSN blocks reclaim. func TestWalRetention_RequiredReplicaBlocksReclaim(t *testing.T) { @@ -850,12 +846,18 @@ func TestWalRetention_TimeoutTriggersNeedsRebuild(t *testing.T) { t.Fatalf("CP13-6: expected NeedsRebuild after timeout, got %s", st) } - // After NeedsRebuild: WAL hold should be released (MinRecoverableFlushedLSN - // skips NeedsRebuild shippers). Verify by flushing — checkpoint should advance. + // Hard assertion: WAL hold released after NeedsRebuild. + // Record checkpoint before flush, flush, assert it advances past the old floor. + replicaFlushed := s.ReplicaFlushedLSN() + checkpointBefore := primary.flusher.CheckpointLSN() primary.flusher.FlushOnce() checkpointAfter := primary.flusher.CheckpointLSN() - // Checkpoint should advance past the old replica flushedLSN since the hold is released. - t.Logf("CP13-6: timeout triggered NeedsRebuild, checkpoint=%d (hold released)", checkpointAfter) + if checkpointAfter <= replicaFlushed { + t.Fatalf("CP13-6: checkpoint should advance past replicaFlushedLSN %d after hold released, got %d", + replicaFlushed, checkpointAfter) + } + t.Logf("CP13-6: hold released — checkpoint %d→%d (past replicaFlushed=%d)", + checkpointBefore, checkpointAfter, replicaFlushed) } // TestWalRetention_MaxBytesTriggersNeedsRebuild verifies that when the