From 0ce5aa32e91539bf784657834a0f7edee3cbb41d Mon Sep 17 00:00:00 2001 From: pingqiu Date: Thu, 2 Apr 2026 23:59:44 -0700 Subject: [PATCH] =?UTF-8?q?fix:=20CP13-6=20rev3=20=E2=80=94=20hard=20hold-?= =?UTF-8?q?release=20assertion=20+=20stale=20comment=20cleanup?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. TestWalRetention_TimeoutTriggersNeedsRebuild: add hard assertion that checkpoint advances past replicaFlushedLSN after NeedsRebuild (proves hold is actually released, not just state transition) 2. TestWalRetention_RequiredReplicaBlocksReclaim: remove stale "EXPECTED TO FAIL" / duplicate comment block Co-Authored-By: Claude Opus 4.6 (1M context) --- .../.private/phase/phase-13-cp6-retention.md | 4 ++-- .../blockvol/sync_all_protocol_test.go | 22 ++++++++++--------- 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/sw-block/.private/phase/phase-13-cp6-retention.md b/sw-block/.private/phase/phase-13-cp6-retention.md index 5d2624e00..8e2c536d8 100644 --- a/sw-block/.private/phase/phase-13-cp6-retention.md +++ b/sw-block/.private/phase/phase-13-cp6-retention.md @@ -57,7 +57,7 @@ All 3 retention tests rewritten from placeholder/PASS* to hard-assertion proofs: | Test | Was | Now | Hard assertion | |------|-----|-----|----------------| | `TestWalRetention_RequiredReplicaBlocksReclaim` | PASS (log-only, no assertion) | PASS (hard assert) | `checkpointLSN <= replicaFlushedLSN` — flusher did not advance past retention floor | -| `TestWalRetention_TimeoutTriggersNeedsRebuild` | PASS (log-only, no assertion) | PASS (hard assert) | `s.State() == NeedsRebuild` after 1ns timeout evaluation | +| `TestWalRetention_TimeoutTriggersNeedsRebuild` | PASS (log-only, no assertion) | PASS (hard assert) | `s.State() == NeedsRebuild` + `checkpointAfter > replicaFlushedLSN` (hold released) | | `TestWalRetention_MaxBytesTriggersNeedsRebuild` | PASS* (logged "not implemented") | PASS (hard assert) | `s.State() == NeedsRebuild` after lag exceeds 8KB budget | ## Proof Promotion @@ -67,7 +67,7 @@ All 3 retention tests rewritten from placeholder/PASS* to hard-assertion proofs: | Test | What it proves | |------|---------------| | `TestWalRetention_RequiredReplicaBlocksReclaim` | Flusher checkpoint does not advance past `replicaFlushedLSN` while recoverable replica is behind | -| `TestWalRetention_TimeoutTriggersNeedsRebuild` | Timeout budget evaluation transitions shipper to `NeedsRebuild` (verified via `State()` assertion) | +| `TestWalRetention_TimeoutTriggersNeedsRebuild` | Timeout budget → `NeedsRebuild` (State assertion) + checkpoint advances past replicaFlushedLSN after flush (hold-release assertion) | | `TestWalRetention_MaxBytesTriggersNeedsRebuild` | Max-bytes budget evaluation transitions shipper to `NeedsRebuild` (verified via `State()` assertion, uses actual `BlockSize` from volume config) | ## What CP13-6 Does NOT Close diff --git a/weed/storage/blockvol/sync_all_protocol_test.go b/weed/storage/blockvol/sync_all_protocol_test.go index a0390fac0..35ac45c3e 100644 --- a/weed/storage/blockvol/sync_all_protocol_test.go +++ b/weed/storage/blockvol/sync_all_protocol_test.go @@ -391,12 +391,8 @@ func TestReconnect_GapBeyondRetainedWal_NeedsRebuild(t *testing.T) { // ---------- WAL retention ---------- // TestWalRetention_RequiredReplicaBlocksReclaim verifies that the flusher -// does not reclaim WAL entries that a required replica still needs for catch-up. -// -// Currently EXPECTED TO FAIL: WAL reclaim is driven only by checkpointLSN, -// not replica progress. -// TestWalRetention_RequiredReplicaBlocksReclaim verifies that the flusher -// does not advance the WAL tail past entries a recoverable replica still needs. +// does not advance the WAL checkpoint past entries a recoverable replica +// still needs for catch-up. // // CP13-6 proof: retention floor from MinRecoverableFlushedLSN blocks reclaim. func TestWalRetention_RequiredReplicaBlocksReclaim(t *testing.T) { @@ -850,12 +846,18 @@ func TestWalRetention_TimeoutTriggersNeedsRebuild(t *testing.T) { t.Fatalf("CP13-6: expected NeedsRebuild after timeout, got %s", st) } - // After NeedsRebuild: WAL hold should be released (MinRecoverableFlushedLSN - // skips NeedsRebuild shippers). Verify by flushing — checkpoint should advance. + // Hard assertion: WAL hold released after NeedsRebuild. + // Record checkpoint before flush, flush, assert it advances past the old floor. + replicaFlushed := s.ReplicaFlushedLSN() + checkpointBefore := primary.flusher.CheckpointLSN() primary.flusher.FlushOnce() checkpointAfter := primary.flusher.CheckpointLSN() - // Checkpoint should advance past the old replica flushedLSN since the hold is released. - t.Logf("CP13-6: timeout triggered NeedsRebuild, checkpoint=%d (hold released)", checkpointAfter) + if checkpointAfter <= replicaFlushed { + t.Fatalf("CP13-6: checkpoint should advance past replicaFlushedLSN %d after hold released, got %d", + replicaFlushed, checkpointAfter) + } + t.Logf("CP13-6: hold released — checkpoint %d→%d (past replicaFlushed=%d)", + checkpointBefore, checkpointAfter, replicaFlushed) } // TestWalRetention_MaxBytesTriggersNeedsRebuild verifies that when the