mirror of
https://github.com/seaweedfs/seaweedfs.git
synced 2026-05-22 09:41:28 +00:00
fix: Phase 20 T5 — RF2 missing replica degraded + transport signal + API surface
Fix three tester findings on T5:
1. A volume with ReplicaFactor > 1 and no registered replicas now
reports "degraded" instead of "no_replicas". Only RF=1 (or an unset
ReplicaFactor, which is treated as 1) with no replicas returns
"no_replicas". A missing replica in an RF2 set is a degraded cluster state.
2. The TransportDegraded signal is now incorporated: if the
master-observed transport is degraded, ClusterReplicationMode is at
least "degraded" regardless of individual replica health. (The RF=1
"no_replicas" early return is evaluated before this check.)
3. API surface exposure: EngineProjectionMode and
ClusterReplicationMode now appear on blockapi.VolumeInfo and are
populated in entryToVolumeInfo(). Operators can consume both
through GET /block/volume/{name} with distinct JSON field names.
12 tests: keepup, catching_up, stale degraded, LSN gap needs_rebuild,
rebuilding role, RF1 no_replicas, RF2 missing degraded, transport
degraded, distinctness, heartbeat update, worst dominates, API
surface distinct naming.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -119,19 +119,58 @@ func TestT5_RebuildingRole_NeedsRebuild(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// NOTE(review): this hunk appears to interleave pre-change lines
// (TestT5_NoReplicas_NoReplicasMode, "vol-crm-none") with their post-change
// replacements (TestT5_RF1_NoReplicas, "vol-crm-rf1"); the diff +/- markers
// are not visible here — confirm against the repository before relying on
// either variant.
//
// Both variants register a primary entry with ReplicaFactor 1 and no Replicas
// and expect ClusterReplicationMode to be "no_replicas".
func TestT5_NoReplicas_NoReplicasMode(t *testing.T) {
|
||||
func TestT5_RF1_NoReplicas(t *testing.T) {
|
||||
r := NewBlockVolumeRegistry()
|
||||
if err := r.Register(&BlockVolumeEntry{
|
||||
Name: "vol-crm-none", VolumeServer: "primary:8080",
|
||||
Path: "/data/vol-crm-none.blk", Status: StatusActive,
|
||||
Name: "vol-crm-rf1", VolumeServer: "primary:8080",
|
||||
Path: "/data/vol-crm-rf1.blk", Status: StatusActive,
|
||||
Role: blockvol.RoleToWire(blockvol.RolePrimary), ReplicaFactor: 1,
|
||||
}); err != nil {
|
||||
t.Fatalf("register: %v", err)
|
||||
}
|
||||
|
|
||||
// The second return value of Lookup is discarded.
// NOTE(review): presumably a "found" flag — confirm against Lookup.
entry, _ := r.Lookup("vol-crm-none")
|
||||
entry, _ := r.Lookup("vol-crm-rf1")
|
||||
if entry.ClusterReplicationMode != "no_replicas" {
|
||||
t.Fatalf("ClusterReplicationMode=%q, want %q", entry.ClusterReplicationMode, "no_replicas")
|
||||
t.Fatalf("ClusterReplicationMode=%q, want %q for RF=1", entry.ClusterReplicationMode, "no_replicas")
|
||||
}
|
||||
}
|
||||
|
||||
// TestT5_RF2_MissingReplica_Degraded registers a primary entry with
// ReplicaFactor 2 and no Replicas, then checks that the looked-up entry
// reports ClusterReplicationMode "degraded" rather than "no_replicas".
func TestT5_RF2_MissingReplica_Degraded(t *testing.T) {
|
||||
r := NewBlockVolumeRegistry()
|
||||
if err := r.Register(&BlockVolumeEntry{
|
||||
Name: "vol-crm-missing", VolumeServer: "primary:8080",
|
||||
Path: "/data/vol-crm-missing.blk", Status: StatusActive,
|
||||
Role: blockvol.RoleToWire(blockvol.RolePrimary), ReplicaFactor: 2,
|
||||
// RF=2 but no replicas registered → degraded, not "no_replicas"
|
||||
}); err != nil {
|
||||
t.Fatalf("register: %v", err)
|
||||
}
|
||||
|
|
||||
// The second return value of Lookup is discarded.
// NOTE(review): presumably a "found" flag — confirm against Lookup.
entry, _ := r.Lookup("vol-crm-missing")
|
||||
if entry.ClusterReplicationMode != "degraded" {
|
||||
t.Fatalf("ClusterReplicationMode=%q, want %q for RF=2 with missing replica", entry.ClusterReplicationMode, "degraded")
|
||||
}
|
||||
}
|
||||
|
||||
// TestT5_TransportDegraded_Degraded registers an RF=2 primary with
// TransportDegraded set and a single replica, then checks that the
// looked-up entry reports ClusterReplicationMode "degraded".
func TestT5_TransportDegraded_Degraded(t *testing.T) {
|
||||
r := NewBlockVolumeRegistry()
|
||||
if err := r.Register(&BlockVolumeEntry{
|
||||
Name: "vol-crm-transport", VolumeServer: "primary:8080",
|
||||
Path: "/data/vol-crm-transport.blk", Status: StatusActive,
|
||||
Role: blockvol.RoleToWire(blockvol.RolePrimary), ReplicaFactor: 2,
|
||||
WALHeadLSN: 100, TransportDegraded: true,
|
||||
// The replica's WALHeadLSN equals the primary's, it is Ready, and its
// heartbeat is current, so TransportDegraded is the only non-nominal
// input set here. NOTE(review): presumably this replica would evaluate
// as "keepup" on its own — evaluateReplicaHealth is not visible; confirm.
Replicas: []ReplicaInfo{{
|
||||
Server: "replica:8080", Path: "/data/vol-crm-transport.blk",
|
||||
HealthScore: 1.0, WALHeadLSN: 100, Ready: true,
|
||||
Role: blockvol.RoleToWire(blockvol.RoleReplica), LastHeartbeat: time.Now(),
|
||||
}},
|
||||
}); err != nil {
|
||||
t.Fatalf("register: %v", err)
|
||||
}
|
||||
|
|
||||
// The second return value of Lookup is discarded.
// NOTE(review): presumably a "found" flag — confirm against Lookup.
entry, _ := r.Lookup("vol-crm-transport")
|
||||
if entry.ClusterReplicationMode != "degraded" {
|
||||
t.Fatalf("ClusterReplicationMode=%q, want %q for transport-degraded", entry.ClusterReplicationMode, "degraded")
|
||||
}
|
||||
}
|
||||
|
||||
@@ -235,3 +274,44 @@ func TestT5_WorstReplicaDominates(t *testing.T) {
|
||||
t.Fatalf("ClusterReplicationMode=%q, want needs_rebuild (worst dominates)", entry.ClusterReplicationMode)
|
||||
}
|
||||
}
|
||||
|
||||
// TestT5_APISurface_DistinctNaming registers an RF=2 primary whose
// EngineProjectionMode is "publish_healthy" and whose single replica has a
// lower WALHeadLSN (80) than the primary (100), converts the looked-up entry
// with entryToVolumeInfo, and checks that VolumeMode, EngineProjectionMode and
// ClusterReplicationMode are all non-empty and that the latter two differ.
// The exact ClusterReplicationMode value is not asserted.
func TestT5_APISurface_DistinctNaming(t *testing.T) {
|
||||
r := NewBlockVolumeRegistry()
|
||||
// NOTE(review): presumably needed so the registry treats this server as
// block-capable for the calls below — confirm against MarkBlockCapable.
r.MarkBlockCapable("primary:8080")
|
||||
if err := r.Register(&BlockVolumeEntry{
|
||||
Name: "vol-crm-api", VolumeServer: "primary:8080",
|
||||
Path: "/data/vol-crm-api.blk", Status: StatusActive,
|
||||
Role: blockvol.RoleToWire(blockvol.RolePrimary), ReplicaFactor: 2,
|
||||
WALHeadLSN: 100,
|
||||
EngineProjectionMode: "publish_healthy",
|
||||
HasEngineProjectionMode: true,
|
||||
Replicas: []ReplicaInfo{{
|
||||
Server: "replica:8080", Path: "/data/vol-crm-api.blk",
|
||||
HealthScore: 1.0, WALHeadLSN: 80, Ready: true,
|
||||
Role: blockvol.RoleToWire(blockvol.RoleReplica), LastHeartbeat: time.Now(),
|
||||
}},
|
||||
}); err != nil {
|
||||
t.Fatalf("register: %v", err)
|
||||
}
|
||||
|
|
||||
// The second return value of Lookup is discarded.
// NOTE(review): presumably a "found" flag — confirm against Lookup.
entry, _ := r.Lookup("vol-crm-api")
|
||||
// The second argument is entryToVolumeInfo's primaryAlive parameter.
info := entryToVolumeInfo(&entry, true)
|
||||
|
|
||||
// All three mode fields must be present and distinct.
|
||||
if info.VolumeMode == "" {
|
||||
t.Fatal("VolumeMode missing from API surface")
|
||||
}
|
||||
if info.EngineProjectionMode == "" {
|
||||
t.Fatal("EngineProjectionMode missing from API surface")
|
||||
}
|
||||
if info.ClusterReplicationMode == "" {
|
||||
t.Fatal("ClusterReplicationMode missing from API surface")
|
||||
}
|
||||
// EngineProjectionMode is VS-local (publish_healthy).
|
||||
// ClusterReplicationMode is master-computed (catching_up because replica behind).
|
||||
// They must differ in this scenario.
|
||||
if info.EngineProjectionMode == info.ClusterReplicationMode {
|
||||
t.Fatalf("EngineProjectionMode=%q should differ from ClusterReplicationMode=%q on API surface",
|
||||
info.EngineProjectionMode, info.ClusterReplicationMode)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -188,11 +188,26 @@ func (e *BlockVolumeEntry) recomputeReplicaState() {
|
||||
//
|
||||
// Monotonic: worst replica state dominates the cluster mode.
|
||||
func (e *BlockVolumeEntry) computeClusterReplicationMode() string {
|
||||
if len(e.Replicas) == 0 {
|
||||
rf := e.ReplicaFactor
|
||||
if rf == 0 {
|
||||
rf = 1
|
||||
}
|
||||
// RF=1: no replication configured — not an RF2 judgment.
|
||||
if rf <= 1 && len(e.Replicas) == 0 {
|
||||
return "no_replicas"
|
||||
}
|
||||
// RF>1 but no replicas registered: the set is degraded (missing replica).
|
||||
if len(e.Replicas) == 0 {
|
||||
return "degraded"
|
||||
}
|
||||
|
||||
worst := "keepup"
|
||||
|
||||
// Incorporate master-observed transport degradation signal.
|
||||
if e.TransportDegraded {
|
||||
worst = worseClusterMode(worst, "degraded")
|
||||
}
|
||||
|
||||
for _, ri := range e.Replicas {
|
||||
replicaMode := evaluateReplicaHealth(ri, e.WALHeadLSN)
|
||||
worst = worseClusterMode(worst, replicaMode)
|
||||
|
||||
@@ -432,8 +432,10 @@ func entryToVolumeInfo(e *BlockVolumeEntry, primaryAlive bool) blockapi.VolumeIn
|
||||
NvmeAddr: e.NvmeAddr,
|
||||
NQN: e.NQN,
|
||||
HealthState: surface.HealthState,
|
||||
VolumeMode: surface.VolumeMode,
|
||||
VolumeModeReason: surface.VolumeModeReason,
|
||||
VolumeMode: surface.VolumeMode,
|
||||
VolumeModeReason: surface.VolumeModeReason,
|
||||
EngineProjectionMode: e.EngineProjectionMode,
|
||||
ClusterReplicationMode: e.ClusterReplicationMode,
|
||||
}
|
||||
for _, ri := range e.Replicas {
|
||||
info.Replicas = append(info.Replicas, blockapi.ReplicaDetail{
|
||||
|
||||
@@ -46,8 +46,10 @@ type VolumeInfo struct {
|
||||
// CP11B-4: Operator-facing health state.
|
||||
HealthState string `json:"health_state"` // "healthy", "degraded", "rebuilding", "unsafe"
|
||||
// CP13-9: Normalized volume mode for constrained-runtime surfaces.
|
||||
VolumeMode string `json:"volume_mode,omitempty"` // "allocated_only", "bootstrap_pending", "publish_healthy", "degraded", "needs_rebuild"
|
||||
VolumeModeReason string `json:"volume_mode_reason,omitempty"`
|
||||
VolumeMode string `json:"volume_mode,omitempty"` // "allocated_only", "bootstrap_pending", "publish_healthy", "degraded", "needs_rebuild"
|
||||
VolumeModeReason string `json:"volume_mode_reason,omitempty"`
|
||||
EngineProjectionMode string `json:"engine_projection_mode,omitempty"` // T1: VS-local V2 engine projection
|
||||
ClusterReplicationMode string `json:"cluster_replication_mode,omitempty"` // T5: master-owned cluster RF2 health
|
||||
}
|
||||
|
||||
// ResolvedPolicyResponse is the response for POST /block/volume/resolve.
|
||||
|
||||
Reference in New Issue
Block a user