From ceb68cc66baab043b7e85292cff22ee01efa2acf Mon Sep 17 00:00:00 2001 From: pingqiu Date: Sun, 5 Apr 2026 18:49:37 -0700 Subject: [PATCH] =?UTF-8?q?fix:=20Phase=2020=20T5=20=E2=80=94=20RF2=20miss?= =?UTF-8?q?ing=20replica=20degraded=20+=20transport=20signal=20+=20API=20s?= =?UTF-8?q?urface?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix three tester findings on T5: 1. RF2 with missing replicas now reports "degraded" instead of "no_replicas". Only RF=1 with no replicas returns "no_replicas". Missing replica in an RF2 set is a degraded cluster state. 2. TransportDegraded signal now incorporated: if master-observed transport is degraded, ClusterReplicationMode is at least "degraded" regardless of individual replica health. 3. API surface exposure: EngineProjectionMode and ClusterReplicationMode now appear on blockapi.VolumeInfo and are populated in entryToVolumeInfo(). Operators can consume both through GET /block/volume/{name} with distinct JSON field names. 12 tests: keepup, catching_up, stale degraded, LSN gap needs_rebuild, rebuilding role, RF1 no_replicas, RF2 missing degraded, transport degraded, distinctness, heartbeat update, worst dominates, API surface distinct naming. Co-Authored-By: Claude Opus 4.6 (1M context) --- weed/server/master_block_cluster_mode_test.go | 90 +++++++++++++++++-- weed/server/master_block_registry.go | 17 +++- weed/server/master_server_handlers_block.go | 6 +- weed/storage/blockvol/blockapi/types.go | 6 +- 4 files changed, 109 insertions(+), 10 deletions(-) diff --git a/weed/server/master_block_cluster_mode_test.go b/weed/server/master_block_cluster_mode_test.go index 78a69d2b2..1f3d5b9b0 100644 --- a/weed/server/master_block_cluster_mode_test.go +++ b/weed/server/master_block_cluster_mode_test.go @@ -119,19 +119,58 @@ func TestT5_RebuildingRole_NeedsRebuild(t *testing.T) { } } -func TestT5_NoReplicas_NoReplicasMode(t *testing.T) { +func TestT5_RF1_NoReplicas(t *testing.T) { r := NewBlockVolumeRegistry() if err := r.Register(&BlockVolumeEntry{ - Name: "vol-crm-none", VolumeServer: "primary:8080", - Path: "/data/vol-crm-none.blk", Status: StatusActive, + Name: "vol-crm-rf1", VolumeServer: "primary:8080", + Path: "/data/vol-crm-rf1.blk", Status: StatusActive, Role: blockvol.RoleToWire(blockvol.RolePrimary), ReplicaFactor: 1, }); err != nil { t.Fatalf("register: %v", err) } - entry, _ := r.Lookup("vol-crm-none") + entry, _ := r.Lookup("vol-crm-rf1") if entry.ClusterReplicationMode != "no_replicas" { - t.Fatalf("ClusterReplicationMode=%q, want %q", entry.ClusterReplicationMode, "no_replicas") + t.Fatalf("ClusterReplicationMode=%q, want %q for RF=1", entry.ClusterReplicationMode, "no_replicas") + } +} + +func TestT5_RF2_MissingReplica_Degraded(t *testing.T) { + r := NewBlockVolumeRegistry() + if err := r.Register(&BlockVolumeEntry{ + Name: "vol-crm-missing", VolumeServer: "primary:8080", + Path: "/data/vol-crm-missing.blk", Status: StatusActive, + Role: blockvol.RoleToWire(blockvol.RolePrimary), ReplicaFactor: 2, + // RF=2 but no replicas registered → degraded, not "no_replicas" + }); err != nil { + t.Fatalf("register: %v", err) + } + + entry, _ := r.Lookup("vol-crm-missing") + if entry.ClusterReplicationMode != "degraded" { + t.Fatalf("ClusterReplicationMode=%q, want %q for RF=2 with missing replica", entry.ClusterReplicationMode, "degraded") + } +} + +func TestT5_TransportDegraded_Degraded(t *testing.T) { + r := NewBlockVolumeRegistry() + if err := r.Register(&BlockVolumeEntry{ + Name: "vol-crm-transport", VolumeServer: "primary:8080", + Path: "/data/vol-crm-transport.blk", Status: StatusActive, + Role: blockvol.RoleToWire(blockvol.RolePrimary), ReplicaFactor: 2, + WALHeadLSN: 100, TransportDegraded: true, + Replicas: []ReplicaInfo{{ + Server: "replica:8080", Path: "/data/vol-crm-transport.blk", + HealthScore: 1.0, WALHeadLSN: 100, Ready: true, + Role: blockvol.RoleToWire(blockvol.RoleReplica), LastHeartbeat: time.Now(), + }}, + }); err != nil { + t.Fatalf("register: %v", err) + } + + entry, _ := r.Lookup("vol-crm-transport") + if entry.ClusterReplicationMode != "degraded" { + t.Fatalf("ClusterReplicationMode=%q, want %q for transport-degraded", entry.ClusterReplicationMode, "degraded") } } @@ -235,3 +274,44 @@ func TestT5_WorstReplicaDominates(t *testing.T) { t.Fatalf("ClusterReplicationMode=%q, want needs_rebuild (worst dominates)", entry.ClusterReplicationMode) } } + +func TestT5_APISurface_DistinctNaming(t *testing.T) { + r := NewBlockVolumeRegistry() + r.MarkBlockCapable("primary:8080") + if err := r.Register(&BlockVolumeEntry{ + Name: "vol-crm-api", VolumeServer: "primary:8080", + Path: "/data/vol-crm-api.blk", Status: StatusActive, + Role: blockvol.RoleToWire(blockvol.RolePrimary), ReplicaFactor: 2, + WALHeadLSN: 100, + EngineProjectionMode: "publish_healthy", + HasEngineProjectionMode: true, + Replicas: []ReplicaInfo{{ + Server: "replica:8080", Path: "/data/vol-crm-api.blk", + HealthScore: 1.0, WALHeadLSN: 80, Ready: true, + Role: blockvol.RoleToWire(blockvol.RoleReplica), LastHeartbeat: time.Now(), + }}, + }); err != nil { + t.Fatalf("register: %v", err) + } + + entry, _ := r.Lookup("vol-crm-api") + info := entryToVolumeInfo(&entry, true) + + // All three mode fields must be present and distinct. + if info.VolumeMode == "" { + t.Fatal("VolumeMode missing from API surface") + } + if info.EngineProjectionMode == "" { + t.Fatal("EngineProjectionMode missing from API surface") + } + if info.ClusterReplicationMode == "" { + t.Fatal("ClusterReplicationMode missing from API surface") + } + // EngineProjectionMode is VS-local (publish_healthy). + // ClusterReplicationMode is master-computed (catching_up because replica behind). + // They must differ in this scenario. + if info.EngineProjectionMode == info.ClusterReplicationMode { + t.Fatalf("EngineProjectionMode=%q should differ from ClusterReplicationMode=%q on API surface", + info.EngineProjectionMode, info.ClusterReplicationMode) + } +} diff --git a/weed/server/master_block_registry.go b/weed/server/master_block_registry.go index e7806c269..568f2d76f 100644 --- a/weed/server/master_block_registry.go +++ b/weed/server/master_block_registry.go @@ -188,11 +188,26 @@ func (e *BlockVolumeEntry) recomputeReplicaState() { // // Monotonic: worst replica state dominates the cluster mode. func (e *BlockVolumeEntry) computeClusterReplicationMode() string { - if len(e.Replicas) == 0 { + rf := e.ReplicaFactor + if rf == 0 { + rf = 1 + } + // RF=1: no replication configured — not an RF2 judgment. + if rf <= 1 && len(e.Replicas) == 0 { return "no_replicas" } + // RF>1 but no replicas registered: the set is degraded (missing replica). + if len(e.Replicas) == 0 { + return "degraded" + } worst := "keepup" + + // Incorporate master-observed transport degradation signal. + if e.TransportDegraded { + worst = worseClusterMode(worst, "degraded") + } + for _, ri := range e.Replicas { replicaMode := evaluateReplicaHealth(ri, e.WALHeadLSN) worst = worseClusterMode(worst, replicaMode) diff --git a/weed/server/master_server_handlers_block.go b/weed/server/master_server_handlers_block.go index 1e9114f85..1ea32c76c 100644 --- a/weed/server/master_server_handlers_block.go +++ b/weed/server/master_server_handlers_block.go @@ -432,8 +432,10 @@ func entryToVolumeInfo(e *BlockVolumeEntry, primaryAlive bool) blockapi.VolumeIn NvmeAddr: e.NvmeAddr, NQN: e.NQN, HealthState: surface.HealthState, - VolumeMode: surface.VolumeMode, - VolumeModeReason: surface.VolumeModeReason, + VolumeMode: surface.VolumeMode, + VolumeModeReason: surface.VolumeModeReason, + EngineProjectionMode: e.EngineProjectionMode, + ClusterReplicationMode: e.ClusterReplicationMode, } for _, ri := range e.Replicas { info.Replicas = append(info.Replicas, blockapi.ReplicaDetail{ diff --git a/weed/storage/blockvol/blockapi/types.go b/weed/storage/blockvol/blockapi/types.go index 23713bc87..fcd705bf2 100644 --- a/weed/storage/blockvol/blockapi/types.go +++ b/weed/storage/blockvol/blockapi/types.go @@ -46,8 +46,10 @@ type VolumeInfo struct { // CP11B-4: Operator-facing health state. HealthState string `json:"health_state"` // "healthy", "degraded", "rebuilding", "unsafe" // CP13-9: Normalized volume mode for constrained-runtime surfaces. - VolumeMode string `json:"volume_mode,omitempty"` // "allocated_only", "bootstrap_pending", "publish_healthy", "degraded", "needs_rebuild" - VolumeModeReason string `json:"volume_mode_reason,omitempty"` + VolumeMode string `json:"volume_mode,omitempty"` // "allocated_only", "bootstrap_pending", "publish_healthy", "degraded", "needs_rebuild" + VolumeModeReason string `json:"volume_mode_reason,omitempty"` + EngineProjectionMode string `json:"engine_projection_mode,omitempty"` // T1: VS-local V2 engine projection + ClusterReplicationMode string `json:"cluster_replication_mode,omitempty"` // T5: master-owned cluster RF2 health } // ResolvedPolicyResponse is the response for POST /block/volume/resolve.