fix: Phase 20 T5 — RF2 missing replica degraded + transport signal + API surface

Fix three tester findings on T5:

1. RF2 with missing replicas now reports "degraded" instead of
   "no_replicas". Only RF=1 with no replicas returns "no_replicas".
   Missing replica in an RF2 set is a degraded cluster state.

2. TransportDegraded signal now incorporated: if master-observed
   transport is degraded, ClusterReplicationMode is at least
   "degraded" regardless of individual replica health.

3. API surface exposure: EngineProjectionMode and
   ClusterReplicationMode now appear on blockapi.VolumeInfo and are
   populated in entryToVolumeInfo(). Operators can consume both
   through GET /block/volume/{name} with distinct JSON field names.

12 tests: keepup, catching_up, stale degraded, LSN gap needs_rebuild,
rebuilding role, RF1 no_replicas, RF2 missing degraded, transport
degraded, distinctness, heartbeat update, worst dominates, API
surface distinct naming.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
pingqiu
2026-04-05 18:49:37 -07:00
parent 013f3e7ccb
commit ceb68cc66b
4 changed files with 109 additions and 10 deletions

View File

@@ -119,19 +119,58 @@ func TestT5_RebuildingRole_NeedsRebuild(t *testing.T) {
}
}
func TestT5_NoReplicas_NoReplicasMode(t *testing.T) {
func TestT5_RF1_NoReplicas(t *testing.T) {
r := NewBlockVolumeRegistry()
if err := r.Register(&BlockVolumeEntry{
Name: "vol-crm-none", VolumeServer: "primary:8080",
Path: "/data/vol-crm-none.blk", Status: StatusActive,
Name: "vol-crm-rf1", VolumeServer: "primary:8080",
Path: "/data/vol-crm-rf1.blk", Status: StatusActive,
Role: blockvol.RoleToWire(blockvol.RolePrimary), ReplicaFactor: 1,
}); err != nil {
t.Fatalf("register: %v", err)
}
entry, _ := r.Lookup("vol-crm-none")
entry, _ := r.Lookup("vol-crm-rf1")
if entry.ClusterReplicationMode != "no_replicas" {
t.Fatalf("ClusterReplicationMode=%q, want %q", entry.ClusterReplicationMode, "no_replicas")
t.Fatalf("ClusterReplicationMode=%q, want %q for RF=1", entry.ClusterReplicationMode, "no_replicas")
}
}
// TestT5_RF2_MissingReplica_Degraded registers a primary entry with
// ReplicaFactor 2 and an empty Replicas list, then checks that the entry
// returned by Lookup reports ClusterReplicationMode "degraded" (and not
// "no_replicas").
func TestT5_RF2_MissingReplica_Degraded(t *testing.T) {
	const (
		volName = "vol-crm-missing"
		want    = "degraded"
	)

	reg := NewBlockVolumeRegistry()

	// RF=2 but no replicas registered → degraded, not "no_replicas".
	vol := &BlockVolumeEntry{
		Name:          volName,
		VolumeServer:  "primary:8080",
		Path:          "/data/vol-crm-missing.blk",
		Status:        StatusActive,
		Role:          blockvol.RoleToWire(blockvol.RolePrimary),
		ReplicaFactor: 2,
	}
	if err := reg.Register(vol); err != nil {
		t.Fatalf("register: %v", err)
	}

	got, _ := reg.Lookup(volName)
	if mode := got.ClusterReplicationMode; mode != want {
		t.Fatalf("ClusterReplicationMode=%q, want %q for RF=2 with missing replica", mode, want)
	}
}
// TestT5_TransportDegraded_Degraded registers an RF=2 primary whose single
// replica has the same WALHeadLSN as the primary (100), is Ready, and has a
// fresh heartbeat, but with TransportDegraded set on the entry. It then
// checks that the looked-up entry reports ClusterReplicationMode "degraded".
func TestT5_TransportDegraded_Degraded(t *testing.T) {
	const (
		volName = "vol-crm-transport"
		volPath = "/data/vol-crm-transport.blk"
		want    = "degraded"
	)

	reg := NewBlockVolumeRegistry()

	vol := &BlockVolumeEntry{
		Name:              volName,
		VolumeServer:      "primary:8080",
		Path:              volPath,
		Status:            StatusActive,
		Role:              blockvol.RoleToWire(blockvol.RolePrimary),
		ReplicaFactor:     2,
		WALHeadLSN:        100,
		TransportDegraded: true, // the only degradation signal set in this fixture
		Replicas: []ReplicaInfo{
			{
				Server:        "replica:8080",
				Path:          volPath,
				HealthScore:   1.0,
				WALHeadLSN:    100,
				Ready:         true,
				Role:          blockvol.RoleToWire(blockvol.RoleReplica),
				LastHeartbeat: time.Now(),
			},
		},
	}
	if err := reg.Register(vol); err != nil {
		t.Fatalf("register: %v", err)
	}

	got, _ := reg.Lookup(volName)
	if mode := got.ClusterReplicationMode; mode != want {
		t.Fatalf("ClusterReplicationMode=%q, want %q for transport-degraded", mode, want)
	}
}
@@ -235,3 +274,44 @@ func TestT5_WorstReplicaDominates(t *testing.T) {
t.Fatalf("ClusterReplicationMode=%q, want needs_rebuild (worst dominates)", entry.ClusterReplicationMode)
}
}
// TestT5_APISurface_DistinctNaming registers an RF=2 primary with an explicit
// EngineProjectionMode and a replica whose WALHeadLSN (80) trails the
// primary's (100), converts the looked-up entry with entryToVolumeInfo, and
// checks that VolumeMode, EngineProjectionMode and ClusterReplicationMode are
// all non-empty and that the latter two carry different values.
func TestT5_APISurface_DistinctNaming(t *testing.T) {
	const (
		volName = "vol-crm-api"
		volPath = "/data/vol-crm-api.blk"
		primary = "primary:8080"
	)

	reg := NewBlockVolumeRegistry()
	reg.MarkBlockCapable(primary)

	vol := &BlockVolumeEntry{
		Name:                    volName,
		VolumeServer:            primary,
		Path:                    volPath,
		Status:                  StatusActive,
		Role:                    blockvol.RoleToWire(blockvol.RolePrimary),
		ReplicaFactor:           2,
		WALHeadLSN:              100,
		EngineProjectionMode:    "publish_healthy",
		HasEngineProjectionMode: true,
		Replicas: []ReplicaInfo{
			{
				Server:        "replica:8080",
				Path:          volPath,
				HealthScore:   1.0,
				WALHeadLSN:    80,
				Ready:         true,
				Role:          blockvol.RoleToWire(blockvol.RoleReplica),
				LastHeartbeat: time.Now(),
			},
		},
	}
	if err := reg.Register(vol); err != nil {
		t.Fatalf("register: %v", err)
	}

	got, _ := reg.Lookup(volName)
	info := entryToVolumeInfo(&got, true)

	// Every mode field must be populated on the API struct; checked in the
	// order VolumeMode, EngineProjectionMode, ClusterReplicationMode.
	present := []struct {
		value   string
		failure string
	}{
		{info.VolumeMode, "VolumeMode missing from API surface"},
		{info.EngineProjectionMode, "EngineProjectionMode missing from API surface"},
		{info.ClusterReplicationMode, "ClusterReplicationMode missing from API surface"},
	}
	for _, field := range present {
		if field.value == "" {
			t.Fatal(field.failure)
		}
	}

	// EngineProjectionMode is the VS-local value set above (publish_healthy).
	// ClusterReplicationMode is master-computed; it is expected to be
	// catching_up here because the replica is behind, so the two must differ.
	engine, cluster := info.EngineProjectionMode, info.ClusterReplicationMode
	if engine == cluster {
		t.Fatalf("EngineProjectionMode=%q should differ from ClusterReplicationMode=%q on API surface",
			engine, cluster)
	}
}

View File

@@ -188,11 +188,26 @@ func (e *BlockVolumeEntry) recomputeReplicaState() {
//
// Monotonic: worst replica state dominates the cluster mode.
func (e *BlockVolumeEntry) computeClusterReplicationMode() string {
if len(e.Replicas) == 0 {
rf := e.ReplicaFactor
if rf == 0 {
rf = 1
}
// RF=1: no replication configured — not an RF2 judgment.
if rf <= 1 && len(e.Replicas) == 0 {
return "no_replicas"
}
// RF>1 but no replicas registered: the set is degraded (missing replica).
if len(e.Replicas) == 0 {
return "degraded"
}
worst := "keepup"
// Incorporate master-observed transport degradation signal.
if e.TransportDegraded {
worst = worseClusterMode(worst, "degraded")
}
for _, ri := range e.Replicas {
replicaMode := evaluateReplicaHealth(ri, e.WALHeadLSN)
worst = worseClusterMode(worst, replicaMode)

View File

@@ -432,8 +432,10 @@ func entryToVolumeInfo(e *BlockVolumeEntry, primaryAlive bool) blockapi.VolumeIn
NvmeAddr: e.NvmeAddr,
NQN: e.NQN,
HealthState: surface.HealthState,
VolumeMode: surface.VolumeMode,
VolumeModeReason: surface.VolumeModeReason,
VolumeMode: surface.VolumeMode,
VolumeModeReason: surface.VolumeModeReason,
EngineProjectionMode: e.EngineProjectionMode,
ClusterReplicationMode: e.ClusterReplicationMode,
}
for _, ri := range e.Replicas {
info.Replicas = append(info.Replicas, blockapi.ReplicaDetail{

View File

@@ -46,8 +46,10 @@ type VolumeInfo struct {
// CP11B-4: Operator-facing health state.
HealthState string `json:"health_state"` // "healthy", "degraded", "rebuilding", "unsafe"
// CP13-9: Normalized volume mode for constrained-runtime surfaces.
VolumeMode string `json:"volume_mode,omitempty"` // "allocated_only", "bootstrap_pending", "publish_healthy", "degraded", "needs_rebuild"
VolumeModeReason string `json:"volume_mode_reason,omitempty"`
VolumeMode string `json:"volume_mode,omitempty"` // "allocated_only", "bootstrap_pending", "publish_healthy", "degraded", "needs_rebuild"
VolumeModeReason string `json:"volume_mode_reason,omitempty"`
EngineProjectionMode string `json:"engine_projection_mode,omitempty"` // T1: VS-local V2 engine projection
ClusterReplicationMode string `json:"cluster_replication_mode,omitempty"` // T5: master-owned cluster RF2 health
}
// ResolvedPolicyResponse is the response for POST /block/volume/resolve.