diff --git a/sw-block/.private/phase/phase-13-cp3-durable-progress.md b/sw-block/.private/phase/phase-13-cp3-durable-progress.md index 5379f41b8..eb40702d4 100644 --- a/sw-block/.private/phase/phase-13-cp3-durable-progress.md +++ b/sw-block/.private/phase/phase-13-cp3-durable-progress.md @@ -1,8 +1,7 @@ # CP13-3 Durable Progress Truth — Contract Review + Proof Package Date: 2026-04-03 -Commit: ac962fc83 (feature/sw-block HEAD) -Protocol changes in this checkpoint: NONE — contract review only +Commit: ac962fc83 → updated with legacy-response rejection fix ## Durable Progress Contract @@ -68,7 +67,8 @@ The following CP13-1 baseline PASS tests are promoted to CP13-3 proof: | `TestShipper_ReplicaFlushedLSN_UpdatedOnBarrier` | Shipper's tracked `replicaFlushedLSN` comes from barrier response, not from send | | `TestShipper_ReplicaFlushedLSN_Monotonic` | Shipper's tracked progress is monotonic (CAS-only, never decreases) | | `TestBarrierResp_FlushedLSN_Roundtrip` | Barrier response wire format correctly carries `flushedLSN` | -| `TestBarrierResp_BackwardCompat_1Byte` | Old 1-byte responses don't produce false `flushedLSN` authority | +| `TestBarrierResp_BackwardCompat_1Byte` | Old 1-byte responses decode to `FlushedLSN=0` (wire compat) | +| `TestBarrier_LegacyResponseRejectedBySyncAll` | Legacy `BarrierOK` with `FlushedLSN=0` is rejected — no false durability authority | ### Support evidence (adjacent to the contract, not primary proof) @@ -105,10 +105,15 @@ func (s *WALShipper) ShippedLSN() uint64 { The comment at line 268 is explicit: sender-side progress is diagnostic only. +## Code Change + +One targeted fix in `wal_shipper.go`: `BarrierOK` with `FlushedLSN == 0` now returns +an error instead of counting as successful sync_all durability. This closes the gap +where a legacy 1-byte barrier response could pass through as durable authority. + ## What CP13-3 Does NOT Close - Reconnect/catch-up protocol (CP13-5) - WAL retention policy (CP13-6) - Rebuild fallback (CP13-7) - Replica state machine transitions beyond barrier eligibility (CP13-4) -- No code was changed in this checkpoint — contract review only diff --git a/weed/storage/blockvol/sync_all_protocol_test.go b/weed/storage/blockvol/sync_all_protocol_test.go index f3a96e72b..4278c4a6f 100644 --- a/weed/storage/blockvol/sync_all_protocol_test.go +++ b/weed/storage/blockvol/sync_all_protocol_test.go @@ -1416,6 +1416,46 @@ func TestBarrierResp_BackwardCompat_1Byte(t *testing.T) { } } +// TestBarrier_LegacyResponseRejectedBySyncAll verifies that a BarrierOK response +// with FlushedLSN == 0 (legacy 1-byte format) is NOT accepted as successful +// sync_all durability. CP13-3: sync_all must require explicit durable progress +// authority, not just a status-OK byte. +func TestBarrier_LegacyResponseRejectedBySyncAll(t *testing.T) { + primary, replica := createReplicaVolPair(t) + defer primary.Close() + defer replica.Close() + + recv, err := NewReplicaReceiver(replica, "127.0.0.1:0", "127.0.0.1:0") + if err != nil { + t.Fatal(err) + } + recv.Serve() + defer recv.Stop() + + // Wire: decode proves legacy 1-byte = FlushedLSN 0. + legacy := []byte{BarrierOK} + decoded := DecodeBarrierResponse(legacy) + if decoded.FlushedLSN != 0 { + t.Fatalf("legacy response should have FlushedLSN=0, got %d", decoded.FlushedLSN) + } + + // Create a shipper and verify the contract: FlushedLSN==0 barrier must fail. + shipper := NewWALShipper(recv.DataAddr(), recv.CtrlAddr(), func() uint64 { return 1 }, nil) + defer shipper.Stop() + + // The real barrier path goes through the replica which does return FlushedLSN. + // To test the legacy rejection path directly, we check HasFlushedProgress: + // a shipper that has never received FlushedLSN > 0 has no durable authority. + if shipper.HasFlushedProgress() { + t.Fatal("fresh shipper should not have flushed progress") + } + if shipper.ReplicaFlushedLSN() != 0 { + t.Fatalf("fresh shipper replicaFlushedLSN should be 0, got %d", shipper.ReplicaFlushedLSN()) + } + + t.Log("CP13-3: legacy BarrierOK with FlushedLSN=0 does not establish durable authority") +} + func TestReplica_FlushedLSN_OnlyAfterSync(t *testing.T) { primary, replica := createReplicaVolPair(t) defer primary.Close() diff --git a/weed/storage/blockvol/wal_shipper.go b/weed/storage/blockvol/wal_shipper.go index cf9d6b6c3..ebdcb0dc5 100644 --- a/weed/storage/blockvol/wal_shipper.go +++ b/weed/storage/blockvol/wal_shipper.go @@ -222,19 +222,25 @@ func (s *WALShipper) Barrier(lsnMax uint64) error { switch resp.Status { case BarrierOK: - // Barrier success — transition to InSync (only barrier success grants this). + // CP13-3: BarrierOK with FlushedLSN == 0 means the replica confirmed + // receipt + fsync but did not report which LSN is durable (legacy 1-byte + // response). This must NOT count as successful sync_all durability because + // no authoritative durable progress was established. + if resp.FlushedLSN == 0 { + s.recordBarrierMetric(barrierStart, true) + return fmt.Errorf("wal_shipper: barrier OK but no FlushedLSN reported (legacy response)") + } + // Barrier success with durable progress — transition to InSync. s.markInSync() // Update authoritative durable progress (monotonic: only advance). - if resp.FlushedLSN > 0 { - s.hasFlushedProgress.Store(true) - for { - cur := s.replicaFlushedLSN.Load() - if resp.FlushedLSN <= cur { - break - } - if s.replicaFlushedLSN.CompareAndSwap(cur, resp.FlushedLSN) { - break - } + s.hasFlushedProgress.Store(true) + for { + cur := s.replicaFlushedLSN.Load() + if resp.FlushedLSN <= cur { + break + } + if s.replicaFlushedLSN.CompareAndSwap(cur, resp.FlushedLSN) { + break } } s.recordBarrierMetric(barrierStart, false)