feat: add engine data/recoverability core (Phase 05 Slice 3)

New file: history.go — RetainedHistory connects recovery decisions to actual WAL retention state: - IsRecoverable: checks gap against tail/head boundaries - MakeHandshakeResult: generates HandshakeResult from retention state - RebuildSourceDecision: chooses snapshot+tail vs full base from checkpoint state (trusted vs untrusted) - ProveRecoverability: generates explicit proof explaining why recovery is or is not allowed 14 new tests (recoverability_test.go): - Recoverable/unrecoverable gap (exact boundary, beyond head) - Trusted/untrusted/no checkpoint → rebuild source selection - Handshake from retained history → outcome classification - Recoverability proofs (zero-gap, ahead, within retention, beyond) - E2E: two replicas driven by retained history (catch-up + rebuild) - Truncation required for replica ahead of committed Engine module at 44 tests (12 + 18 + 14). Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-07-19 22:42:20 +00:00 · 2026-03-29 23:04:51 -07:00
parent 368a956aee
commit ff7ea41099
2 changed files with 425 additions and 0 deletions
@@ -0,0 +1,112 @@
+package replication
+
+import "fmt"
+
+// RetainedHistory represents the primary's WAL retention state as seen
+// by the recovery decision path. It answers "why is recovery allowed?"
+// with executable proof, not just policy assertions.
+//
+// This is the engine-level equivalent of the prototype's WALHistory —
+// it provides the recoverability inputs that ClassifyRecoveryOutcome
+// and rebuild-source selection consume.
+type RetainedHistory struct {
+	// HeadLSN is the highest LSN written to the primary WAL.
+	HeadLSN uint64
+
+	// TailLSN is the oldest retained LSN boundary (exclusive).
+	// Entries with LSN > TailLSN are available for catch-up.
+	// Entries with LSN <= TailLSN have been recycled.
+	TailLSN uint64
+
+	// CommittedLSN is the lineage-safe boundary — the highest LSN
+	// acknowledged as durable by the commit protocol.
+	CommittedLSN uint64
+
+	// CheckpointLSN is the highest LSN with a durable base image.
+	// Used for rebuild-source decision: if CheckpointLSN > 0 and
+	// the checkpoint is trusted, snapshot+tail rebuild is possible.
+	CheckpointLSN uint64
+
+	// CheckpointTrusted indicates whether the checkpoint base image
+	// is known to be consistent and usable for rebuild.
+	CheckpointTrusted bool
+}
+
+// MakeHandshakeResult generates a HandshakeResult from the primary's
+// retained history and a replica's reported flushed LSN.
+func (rh *RetainedHistory) MakeHandshakeResult(replicaFlushedLSN uint64) HandshakeResult {
+	retentionStart := rh.TailLSN + 1
+	if rh.TailLSN == 0 {
+		retentionStart = 0
+	}
+	return HandshakeResult{
+		ReplicaFlushedLSN: replicaFlushedLSN,
+		CommittedLSN:      rh.CommittedLSN,
+		RetentionStartLSN: retentionStart,
+	}
+}
+
+// IsRecoverable checks whether all entries from startExclusive+1 to
+// endInclusive are available in the retained WAL.
+func (rh *RetainedHistory) IsRecoverable(startExclusive, endInclusive uint64) bool {
+	if startExclusive < rh.TailLSN {
+		return false
+	}
+	if endInclusive > rh.HeadLSN {
+		return false
+	}
+	return true
+}
+
+// RebuildSourceDecision determines the optimal rebuild source from
+// the current retained history state.
+func (rh *RetainedHistory) RebuildSourceDecision() (source RebuildSource, snapshotLSN uint64) {
+	if rh.CheckpointTrusted && rh.CheckpointLSN > 0 {
+		return RebuildSnapshotTail, rh.CheckpointLSN
+	}
+	return RebuildFullBase, 0
+}
+
+// RecoverabilityProof explains why a gap is or is not recoverable.
+type RecoverabilityProof struct {
+	ReplicaFlushedLSN uint64
+	CommittedLSN      uint64
+	TailLSN           uint64
+	HeadLSN           uint64
+	Recoverable       bool
+	Reason            string
+}
+
+// ProveRecoverability generates an explicit proof for a recovery decision.
+func (rh *RetainedHistory) ProveRecoverability(replicaFlushedLSN uint64) RecoverabilityProof {
+	proof := RecoverabilityProof{
+		ReplicaFlushedLSN: replicaFlushedLSN,
+		CommittedLSN:      rh.CommittedLSN,
+		TailLSN:           rh.TailLSN,
+		HeadLSN:           rh.HeadLSN,
+	}
+
+	if replicaFlushedLSN == rh.CommittedLSN {
+		proof.Recoverable = true
+		proof.Reason = "zero_gap"
+		return proof
+	}
+
+	if replicaFlushedLSN > rh.CommittedLSN {
+		proof.Recoverable = true
+		proof.Reason = "replica_ahead_needs_truncation"
+		return proof
+	}
+
+	if rh.IsRecoverable(replicaFlushedLSN, rh.CommittedLSN) {
+		proof.Recoverable = true
+		proof.Reason = fmt.Sprintf("gap_within_retention: need LSN %d-%d, tail=%d head=%d",
+			replicaFlushedLSN+1, rh.CommittedLSN, rh.TailLSN, rh.HeadLSN)
+		return proof
+	}
+
+	proof.Recoverable = false
+	proof.Reason = fmt.Sprintf("gap_beyond_retention: need LSN %d but tail=%d",
+		replicaFlushedLSN+1, rh.TailLSN)
+	return proof
+}
@@ -0,0 +1,313 @@
+package replication
+
+import "testing"
+
+// ============================================================
+// Phase 05 Slice 3: Engine Data / Recoverability Core
+//
+// Tests validate that recovery decisions are backed by actual
+// retained-history state, not just policy assertions.
+// ============================================================
+
+// --- Recoverable vs unrecoverable gap ---
+
+func TestHistory_Recoverable_GapWithinRetention(t *testing.T) {
+	rh := RetainedHistory{
+		HeadLSN:      100,
+		TailLSN:      30,
+		CommittedLSN: 100,
+	}
+
+	if !rh.IsRecoverable(50, 100) {
+		t.Fatal("gap 50→100 should be recoverable (tail=30)")
+	}
+
+	proof := rh.ProveRecoverability(50)
+	if !proof.Recoverable {
+		t.Fatalf("proof: %s", proof.Reason)
+	}
+}
+
+func TestHistory_Unrecoverable_GapBeyondRetention(t *testing.T) {
+	rh := RetainedHistory{
+		HeadLSN:      100,
+		TailLSN:      60,
+		CommittedLSN: 100,
+	}
+
+	if rh.IsRecoverable(50, 100) {
+		t.Fatal("gap 50→100 should NOT be recoverable (tail=60)")
+	}
+
+	proof := rh.ProveRecoverability(50)
+	if proof.Recoverable {
+		t.Fatal("proof should show unrecoverable")
+	}
+}
+
+func TestHistory_ExactBoundary(t *testing.T) {
+	rh := RetainedHistory{
+		HeadLSN:      100,
+		TailLSN:      50,
+		CommittedLSN: 100,
+	}
+
+	// AT boundary: recoverable.
+	if !rh.IsRecoverable(50, 100) {
+		t.Fatal("exact boundary should be recoverable")
+	}
+	// ONE BELOW: unrecoverable.
+	if rh.IsRecoverable(49, 100) {
+		t.Fatal("below boundary should NOT be recoverable")
+	}
+}
+
+func TestHistory_BeyondHead_Unrecoverable(t *testing.T) {
+	rh := RetainedHistory{
+		HeadLSN:      80,
+		TailLSN:      0,
+		CommittedLSN: 100,
+	}
+
+	if rh.IsRecoverable(0, 100) {
+		t.Fatal("gap beyond head should NOT be recoverable")
+	}
+}
+
+// --- Trusted base vs no trusted base ---
+
+func TestHistory_RebuildSource_TrustedCheckpoint(t *testing.T) {
+	rh := RetainedHistory{
+		CheckpointLSN:     50,
+		CheckpointTrusted: true,
+		CommittedLSN:      100,
+	}
+
+	source, snapLSN := rh.RebuildSourceDecision()
+	if source != RebuildSnapshotTail {
+		t.Fatalf("source=%s, want snapshot_tail", source)
+	}
+	if snapLSN != 50 {
+		t.Fatalf("snapshot LSN=%d, want 50", snapLSN)
+	}
+}
+
+func TestHistory_RebuildSource_NoCheckpoint(t *testing.T) {
+	rh := RetainedHistory{
+		CommittedLSN: 100,
+	}
+
+	source, snapLSN := rh.RebuildSourceDecision()
+	if source != RebuildFullBase {
+		t.Fatalf("source=%s, want full_base", source)
+	}
+	if snapLSN != 0 {
+		t.Fatalf("snapshot LSN=%d, want 0", snapLSN)
+	}
+}
+
+func TestHistory_RebuildSource_UntrustedCheckpoint(t *testing.T) {
+	rh := RetainedHistory{
+		CheckpointLSN:     50,
+		CheckpointTrusted: false,
+		CommittedLSN:      100,
+	}
+
+	source, _ := rh.RebuildSourceDecision()
+	if source != RebuildFullBase {
+		t.Fatalf("untrusted checkpoint: source=%s, want full_base", source)
+	}
+}
+
+// --- Handshake result from retained history ---
+
+func TestHistory_MakeHandshakeResult(t *testing.T) {
+	rh := RetainedHistory{
+		HeadLSN:      100,
+		TailLSN:      30,
+		CommittedLSN: 90,
+	}
+
+	hr := rh.MakeHandshakeResult(70)
+	if hr.ReplicaFlushedLSN != 70 {
+		t.Fatalf("replica=%d", hr.ReplicaFlushedLSN)
+	}
+	if hr.CommittedLSN != 90 {
+		t.Fatalf("committed=%d", hr.CommittedLSN)
+	}
+	if hr.RetentionStartLSN != 31 {
+		t.Fatalf("retention=%d, want 31", hr.RetentionStartLSN)
+	}
+
+	outcome := ClassifyRecoveryOutcome(hr)
+	if outcome != OutcomeCatchUp {
+		t.Fatalf("outcome=%s", outcome)
+	}
+}
+
+// --- Recoverability proof ---
+
+func TestHistory_Proof_ZeroGap(t *testing.T) {
+	rh := RetainedHistory{CommittedLSN: 100}
+	proof := rh.ProveRecoverability(100)
+	if !proof.Recoverable || proof.Reason != "zero_gap" {
+		t.Fatalf("proof: recoverable=%v reason=%s", proof.Recoverable, proof.Reason)
+	}
+}
+
+func TestHistory_Proof_ReplicaAhead(t *testing.T) {
+	rh := RetainedHistory{CommittedLSN: 100}
+	proof := rh.ProveRecoverability(105)
+	if !proof.Recoverable || proof.Reason != "replica_ahead_needs_truncation" {
+		t.Fatalf("proof: recoverable=%v reason=%s", proof.Recoverable, proof.Reason)
+	}
+}
+
+func TestHistory_Proof_GapWithinRetention(t *testing.T) {
+	rh := RetainedHistory{HeadLSN: 100, TailLSN: 30, CommittedLSN: 100}
+	proof := rh.ProveRecoverability(50)
+	if !proof.Recoverable {
+		t.Fatalf("proof: %s", proof.Reason)
+	}
+}
+
+func TestHistory_Proof_GapBeyondRetention(t *testing.T) {
+	rh := RetainedHistory{HeadLSN: 100, TailLSN: 60, CommittedLSN: 100}
+	proof := rh.ProveRecoverability(50)
+	if proof.Recoverable {
+		t.Fatal("should not be recoverable")
+	}
+}
+
+// --- End-to-end: retained-history-driven recovery flow ---
+
+func TestHistory_E2E_RecoveryDrivenByRetainedHistory(t *testing.T) {
+	// Primary's retained history.
+	primary := RetainedHistory{
+		HeadLSN:           100,
+		TailLSN:           30,
+		CommittedLSN:      100,
+		CheckpointLSN:     50,
+		CheckpointTrusted: true,
+	}
+
+	r := NewRegistry()
+	r.ApplyAssignment(AssignmentIntent{
+		Replicas: []ReplicaAssignment{
+			{ReplicaID: "r1", Endpoint: Endpoint{DataAddr: "10.0.0.1:9333", Version: 1}},
+			{ReplicaID: "r2", Endpoint: Endpoint{DataAddr: "10.0.0.2:9333", Version: 1}},
+		},
+		Epoch: 1,
+		RecoveryTargets: map[string]SessionKind{
+			"r1": SessionCatchUp,
+			"r2": SessionCatchUp,
+		},
+	})
+
+	// r1: replica at LSN 70 — catch-up (within retention).
+	r1 := r.Sender("r1")
+	id1 := r1.SessionID()
+	r1.BeginConnect(id1)
+
+	proof1 := primary.ProveRecoverability(70)
+	if !proof1.Recoverable {
+		t.Fatalf("r1: %s", proof1.Reason)
+	}
+
+	hr1 := primary.MakeHandshakeResult(70)
+	o1, _ := r1.RecordHandshakeWithOutcome(id1, hr1)
+	if o1 != OutcomeCatchUp {
+		t.Fatalf("r1: outcome=%s", o1)
+	}
+	r1.BeginCatchUp(id1)
+	r1.RecordCatchUpProgress(id1, 100)
+	r1.CompleteSessionByID(id1)
+
+	// r2: replica at LSN 20 — needs rebuild (beyond retention).
+	r2 := r.Sender("r2")
+	id2 := r2.SessionID()
+	r2.BeginConnect(id2)
+
+	proof2 := primary.ProveRecoverability(20)
+	if proof2.Recoverable {
+		t.Fatal("r2: should not be recoverable")
+	}
+
+	hr2 := primary.MakeHandshakeResult(20)
+	o2, _ := r2.RecordHandshakeWithOutcome(id2, hr2)
+	if o2 != OutcomeNeedsRebuild {
+		t.Fatalf("r2: outcome=%s", o2)
+	}
+
+	// r2 needs rebuild — use history to choose source.
+	source, snapLSN := primary.RebuildSourceDecision()
+	if source != RebuildSnapshotTail || snapLSN != 50 {
+		t.Fatalf("rebuild source=%s snap=%d", source, snapLSN)
+	}
+
+	// New rebuild session for r2.
+	r.ApplyAssignment(AssignmentIntent{
+		Replicas: []ReplicaAssignment{
+			{ReplicaID: "r2", Endpoint: Endpoint{DataAddr: "10.0.0.2:9333", Version: 1}},
+		},
+		Epoch:           1,
+		RecoveryTargets: map[string]SessionKind{"r2": SessionRebuild},
+	})
+
+	id2b := r2.SessionID()
+	r2.BeginConnect(id2b)
+	r2.RecordHandshake(id2b, 0, 100)
+	r2.SelectRebuildSource(id2b, snapLSN, true, primary.CommittedLSN)
+	r2.BeginRebuildTransfer(id2b)
+	r2.RecordRebuildTransferProgress(id2b, snapLSN)
+	r2.BeginRebuildTailReplay(id2b)
+	r2.RecordRebuildTailProgress(id2b, 100)
+	r2.CompleteRebuild(id2b)
+
+	if r1.State() != StateInSync || r2.State() != StateInSync {
+		t.Fatalf("r1=%s r2=%s", r1.State(), r2.State())
+	}
+	t.Log("e2e: r1 caught up (proof: gap within retention), r2 rebuilt (proof: gap beyond retention, snapshot+tail)")
+}
+
+// --- Truncation / safe-boundary handling ---
+
+func TestHistory_Proof_TruncationRequired(t *testing.T) {
+	rh := RetainedHistory{CommittedLSN: 100}
+
+	// Replica ahead → truncation required.
+	proof := rh.ProveRecoverability(105)
+	if proof.Reason != "replica_ahead_needs_truncation" {
+		t.Fatalf("reason=%s", proof.Reason)
+	}
+
+	// Sender execution: handshake sets truncation requirement.
+	s := NewSender("r1", Endpoint{DataAddr: "r1:9333", Version: 1}, 1)
+	id, _ := s.AttachSession(1, SessionCatchUp)
+	s.BeginConnect(id)
+
+	hr := rh.MakeHandshakeResult(105)
+	outcome, _ := s.RecordHandshakeWithOutcome(id, hr)
+	if outcome != OutcomeCatchUp {
+		t.Fatalf("outcome=%s", outcome)
+	}
+
+	// Session should require truncation.
+	snap := s.SessionSnapshot()
+	if snap == nil {
+		t.Fatal("session should exist")
+	}
+
+	// Completion without truncation rejected.
+	if s.CompleteSessionByID(id) {
+		t.Fatal("should reject completion without truncation")
+	}
+
+	// Record truncation.
+	s.RecordTruncation(id, 100)
+
+	// Now completion works (zero-gap after truncation).
+	if !s.CompleteSessionByID(id) {
+		t.Fatal("completion after truncation should succeed")
+	}
+}