From 2b97cd04b8c8d39b36ca0c0dd50df19d2ed6086f Mon Sep 17 00:00:00 2001 From: pingqiu Date: Sun, 5 Apr 2026 16:27:02 -0700 Subject: [PATCH] =?UTF-8?q?fix:=20Phase=2020=20T3=20=E2=80=94=20add=20V2?= =?UTF-8?q?=20promotion=20observability=20to=20FailoverDiagnostic?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit FailoverDiagnostic now carries V2PromotionEnabled and V2PromotionReady fields. MasterServer.FailoverDiagnosticSnapshot() enriches the failover state diagnostic with rollout gate visibility so operators can confirm whether the master is on V1, V2, or V2-fail-closed-placeholder mode. Update phase-20.md: document default=false rollout policy (safe default until proto regen enables evidence RPC, then flip to default true). Co-Authored-By: Claude Opus 4.6 (1M context) --- sw-block/.private/phase/phase-20.md | 6 ++++-- weed/server/master_block_failover.go | 11 +++++++++++ 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/sw-block/.private/phase/phase-20.md b/sw-block/.private/phase/phase-20.md index dedb5ffe1..b05e92942 100644 --- a/sw-block/.private/phase/phase-20.md +++ b/sw-block/.private/phase/phase-20.md @@ -207,9 +207,11 @@ authorization. 6. 
Bump epoch, enqueue assignment to selected candidate **Legacy fallback policy**: -- Add `--block.v2-promotion` flag (default `true`) +- Add `--block.v2Promotion` flag (default `false` — safe rollout default + until proto regen enables the evidence RPC; once RPC is live, flip to + default `true`) - When `true`: `promoteReplicaV2()` with fail-closed on evidence failure -- When `false`: existing `promoteReplica()` (V1 path) +- When `false`: existing `promoteReplicaV1()` (V1 path) - The flag is observable via `/vol/status` and metrics - The flag is intended to be removed once V2 is validated, not permanent diff --git a/weed/server/master_block_failover.go b/weed/server/master_block_failover.go index 9c7b7120b..ada422c79 100644 --- a/weed/server/master_block_failover.go +++ b/weed/server/master_block_failover.go @@ -54,6 +54,8 @@ type FailoverVolumeState struct { // Volume-oriented: each entry describes one volume's failover state. // Aggregate counts are derived from the volume list. type FailoverDiagnostic struct { + V2PromotionEnabled bool // T3: whether durability-first V2 promotion is active + V2PromotionReady bool // T3: true only when V2 promotion is enabled AND the evidence querier is wired (always false on the V1 path; false while enabled = fail-closed placeholder) Volumes []FailoverVolumeState PendingRebuildCount map[string]int // dead server → count of pending rebuilds DeferredPromotionCount map[string]int // dead server → count of deferred promotion timers @@ -93,6 +95,15 @@ func (fs *blockFailoverState) DiagnosticSnapshot() FailoverDiagnostic { return diag } +// FailoverDiagnosticSnapshot returns a FailoverDiagnostic enriched with +// V2 promotion rollout state so operators can observe the active mode. 
+func (ms *MasterServer) FailoverDiagnosticSnapshot() FailoverDiagnostic { + diag := ms.blockFailover.DiagnosticSnapshot() + diag.V2PromotionEnabled = ms.blockV2Promotion + diag.V2PromotionReady = ms.blockV2Promotion && ms.blockVSQueryEvidence != nil + return diag +} + // PublicationDiagnostic is a bounded read-only snapshot comparing the // operator-visible publication (LookupBlockVolume response) against the // registry authority for one volume. P3 diagnosability surface for S2.