mirror of
https://github.com/seaweedfs/seaweedfs.git
synced 2026-06-09 18:32:43 +00:00
feat: CP11B-3 safe ops — promotion hardening, preflight, manual promote
Six-task checkpoint hardening the promotion and failover paths:
T1: 4-gate candidate evaluation (heartbeat freshness, WAL lag, role,
server liveness) with structured rejection reasons.
T2: Orphaned-primary re-evaluation on replica reconnect (B-06/B-08).
T3: Deferred timer safety — epoch validation prevents stale timers
from firing on recreated/changed volumes (B-07).
T4: Rebuild addr cleanup on promotion (B-11), NVMe publication
refresh on heartbeat, and preflight endpoint wiring.
T5: Manual promote API — POST /block/volume/{name}/promote with
force flag, target server selection, and structured rejection
response. Shared applyPromotionLocked/finalizePromotion helpers
eliminate duplication between auto and manual paths.
T6: Read-only preflight endpoint (GET /block/volume/{name}/preflight)
and blockapi client wrappers (Preflight, Promote).
BUG-T5-1: PromotionsTotal counter moved to finalizePromotion (shared
by both auto and manual paths) to prevent metrics divergence.
24 files changed, ~6500 lines added. 42 new QA adversarial tests.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -645,13 +645,16 @@ func TestIntegration_DoubleFailover(t *testing.T) {
|
||||
// Reconnect vs1 first so it becomes a replica (via recoverBlockVolumes).
|
||||
ms.recoverBlockVolumes(vs1)
|
||||
|
||||
// Simulate heartbeat from vs1 that restores iSCSI addr and health score
|
||||
// (in production this happens when the VS re-registers after reconnect).
|
||||
// Simulate heartbeat from vs1 that restores iSCSI addr, health score,
|
||||
// role, and heartbeat timestamp (in production this happens when the
|
||||
// VS re-registers after reconnect and completes rebuild).
|
||||
e1, _ = ms.blockRegistry.Lookup("pvc-double-1")
|
||||
for i := range e1.Replicas {
|
||||
if e1.Replicas[i].Server == vs1 {
|
||||
e1.Replicas[i].ISCSIAddr = vs1 + ":3260"
|
||||
e1.Replicas[i].HealthScore = 1.0
|
||||
e1.Replicas[i].Role = blockvol.RoleToWire(blockvol.RoleReplica)
|
||||
e1.Replicas[i].LastHeartbeat = time.Now()
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -57,7 +57,19 @@ func (ms *MasterServer) failoverBlockVolumes(deadServer string) {
|
||||
delay := leaseExpiry.Sub(now)
|
||||
glog.V(0).Infof("failover: %q lease expires in %v, deferring promotion", entry.Name, delay)
|
||||
volumeName := entry.Name
|
||||
capturedEpoch := entry.Epoch // T3: capture epoch for stale-timer validation
|
||||
timer := time.AfterFunc(delay, func() {
|
||||
// T3: Re-validate before acting — prevent stale timer on recreated/changed volume.
|
||||
current, ok := ms.blockRegistry.Lookup(volumeName)
|
||||
if !ok {
|
||||
glog.V(0).Infof("failover: deferred promotion for %q skipped (volume deleted)", volumeName)
|
||||
return
|
||||
}
|
||||
if current.Epoch != capturedEpoch {
|
||||
glog.V(0).Infof("failover: deferred promotion for %q skipped (epoch changed %d -> %d)",
|
||||
volumeName, capturedEpoch, current.Epoch)
|
||||
return
|
||||
}
|
||||
ms.promoteReplica(volumeName)
|
||||
})
|
||||
ms.blockFailover.mu.Lock()
|
||||
@@ -116,8 +128,15 @@ func (ms *MasterServer) promoteReplica(volumeName string) {
|
||||
return
|
||||
}
|
||||
|
||||
ms.finalizePromotion(volumeName, oldPrimary, oldPath, newEpoch)
|
||||
}
|
||||
|
||||
// finalizePromotion performs post-registry promotion steps:
|
||||
// enqueue assignment for new primary, record pending rebuild for old primary, bump metrics.
|
||||
// Called by both promoteReplica (auto) and blockVolumePromoteHandler (manual).
|
||||
func (ms *MasterServer) finalizePromotion(volumeName, oldPrimary, oldPath string, newEpoch uint64) {
|
||||
// Re-read entry after promotion.
|
||||
entry, ok = ms.blockRegistry.Lookup(volumeName)
|
||||
entry, ok := ms.blockRegistry.Lookup(volumeName)
|
||||
if !ok {
|
||||
return
|
||||
}
|
||||
@@ -198,11 +217,15 @@ func (ms *MasterServer) cancelDeferredTimers(server string) {
|
||||
|
||||
// recoverBlockVolumes is called when a previously dead VS reconnects.
|
||||
// It cancels any deferred promotion timers (R2-F2), drains pending rebuilds,
|
||||
// and enqueues rebuild assignments.
|
||||
// enqueues rebuild assignments, and checks for orphaned primaries (T2/B-06).
|
||||
func (ms *MasterServer) recoverBlockVolumes(reconnectedServer string) {
|
||||
// R2-F2: Cancel deferred promotion timers for this server to prevent split-brain.
|
||||
ms.cancelDeferredTimers(reconnectedServer)
|
||||
|
||||
// T2 (B-06): Check for orphaned primaries — volumes where the reconnecting
|
||||
// server is a replica but the primary is dead/disconnected.
|
||||
ms.reevaluateOrphanedPrimaries(reconnectedServer)
|
||||
|
||||
rebuilds := ms.drainPendingRebuilds(reconnectedServer)
|
||||
if len(rebuilds) == 0 {
|
||||
return
|
||||
@@ -221,16 +244,74 @@ func (ms *MasterServer) recoverBlockVolumes(reconnectedServer string) {
|
||||
Path: rb.OldPath,
|
||||
})
|
||||
|
||||
// T4: Warn if RebuildListenAddr is empty (new primary hasn't heartbeated yet).
|
||||
rebuildAddr := entry.RebuildListenAddr
|
||||
if rebuildAddr == "" {
|
||||
glog.Warningf("rebuild: %q RebuildListenAddr is empty (new primary %s may not have heartbeated yet), "+
|
||||
"queuing rebuild anyway — VS should retry on empty addr", rb.VolumeName, entry.VolumeServer)
|
||||
}
|
||||
|
||||
// Enqueue rebuild assignment for the reconnected server.
|
||||
ms.blockAssignmentQueue.Enqueue(reconnectedServer, blockvol.BlockVolumeAssignment{
|
||||
Path: rb.OldPath,
|
||||
Epoch: entry.Epoch,
|
||||
Role: blockvol.RoleToWire(blockvol.RoleRebuilding),
|
||||
RebuildAddr: entry.RebuildListenAddr,
|
||||
RebuildAddr: rebuildAddr,
|
||||
})
|
||||
|
||||
ms.blockRegistry.RebuildsTotal.Add(1)
|
||||
glog.V(0).Infof("rebuild: enqueued rebuild for %q on %s (epoch=%d, rebuildAddr=%s)",
|
||||
rb.VolumeName, reconnectedServer, entry.Epoch, entry.RebuildListenAddr)
|
||||
rb.VolumeName, reconnectedServer, entry.Epoch, rebuildAddr)
|
||||
}
|
||||
}
|
||||
|
||||
// reevaluateOrphanedPrimaries checks if the given server is a replica for any
|
||||
// volumes whose primary is dead (not block-capable). If so, promotes the best
|
||||
// available replica — but only after the old primary's lease has expired, to
|
||||
// maintain the same split-brain protection as failoverBlockVolumes().
|
||||
// This fixes B-06 (orphaned primary after replica re-register)
|
||||
// and partially B-08 (fast reconnect skips failover window).
|
||||
func (ms *MasterServer) reevaluateOrphanedPrimaries(server string) {
|
||||
if ms.blockRegistry == nil {
|
||||
return
|
||||
}
|
||||
orphaned := ms.blockRegistry.VolumesWithDeadPrimary(server)
|
||||
now := time.Now()
|
||||
for _, volumeName := range orphaned {
|
||||
entry, ok := ms.blockRegistry.Lookup(volumeName)
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
|
||||
// Respect lease expiry — same gate as failoverBlockVolumes().
|
||||
leaseExpiry := entry.LastLeaseGrant.Add(entry.LeaseTTL)
|
||||
if now.Before(leaseExpiry) {
|
||||
delay := leaseExpiry.Sub(now)
|
||||
glog.V(0).Infof("failover: orphaned primary for %q (replica %s alive, primary dead) "+
|
||||
"but lease expires in %v, deferring promotion", volumeName, server, delay)
|
||||
capturedEpoch := entry.Epoch
|
||||
deadPrimary := entry.VolumeServer
|
||||
timer := time.AfterFunc(delay, func() {
|
||||
current, ok := ms.blockRegistry.Lookup(volumeName)
|
||||
if !ok {
|
||||
return
|
||||
}
|
||||
if current.Epoch != capturedEpoch {
|
||||
glog.V(0).Infof("failover: deferred orphan promotion for %q skipped (epoch changed %d -> %d)",
|
||||
volumeName, capturedEpoch, current.Epoch)
|
||||
return
|
||||
}
|
||||
ms.promoteReplica(volumeName)
|
||||
})
|
||||
ms.blockFailover.mu.Lock()
|
||||
ms.blockFailover.deferredTimers[deadPrimary] = append(
|
||||
ms.blockFailover.deferredTimers[deadPrimary], timer)
|
||||
ms.blockFailover.mu.Unlock()
|
||||
continue
|
||||
}
|
||||
|
||||
glog.V(0).Infof("failover: orphaned primary detected for %q (replica %s alive, primary dead, lease expired), promoting",
|
||||
volumeName, server)
|
||||
ms.promoteReplica(volumeName)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -34,6 +34,9 @@ func testMasterServerForFailover(t *testing.T) *MasterServer {
|
||||
// registerVolumeWithReplica creates a volume entry with primary + replica for tests.
|
||||
func registerVolumeWithReplica(t *testing.T, ms *MasterServer, name, primary, replica string, epoch uint64, leaseTTL time.Duration) {
|
||||
t.Helper()
|
||||
// Mark both servers as block-capable so promotion Gate 4 (liveness) passes.
|
||||
ms.blockRegistry.MarkBlockCapable(primary)
|
||||
ms.blockRegistry.MarkBlockCapable(replica)
|
||||
entry := &BlockVolumeEntry{
|
||||
Name: name,
|
||||
VolumeServer: primary,
|
||||
@@ -53,11 +56,13 @@ func registerVolumeWithReplica(t *testing.T, ms *MasterServer, name, primary, re
|
||||
// CP8-2: also populate Replicas[] for PromoteBestReplica.
|
||||
Replicas: []ReplicaInfo{
|
||||
{
|
||||
Server: replica,
|
||||
Path: fmt.Sprintf("/data/%s.blk", name),
|
||||
IQN: fmt.Sprintf("iqn.2024.test:%s-replica", name),
|
||||
ISCSIAddr: replica + ":3260",
|
||||
HealthScore: 1.0,
|
||||
Server: replica,
|
||||
Path: fmt.Sprintf("/data/%s.blk", name),
|
||||
IQN: fmt.Sprintf("iqn.2024.test:%s-replica", name),
|
||||
ISCSIAddr: replica + ":3260",
|
||||
HealthScore: 1.0,
|
||||
Role: blockvol.RoleToWire(blockvol.RoleReplica),
|
||||
LastHeartbeat: time.Now(),
|
||||
},
|
||||
},
|
||||
}
|
||||
@@ -194,6 +199,9 @@ func TestFailover_MultipleVolumes(t *testing.T) {
|
||||
|
||||
func TestFailover_LeaseNotExpired_DeferredPromotion(t *testing.T) {
|
||||
ms := testMasterServerForFailover(t)
|
||||
// Mark servers as block-capable so promotion Gate 4 (liveness) passes.
|
||||
ms.blockRegistry.MarkBlockCapable("vs1")
|
||||
ms.blockRegistry.MarkBlockCapable("vs2")
|
||||
entry := &BlockVolumeEntry{
|
||||
Name: "vol1",
|
||||
VolumeServer: "vs1",
|
||||
@@ -209,7 +217,7 @@ func TestFailover_LeaseNotExpired_DeferredPromotion(t *testing.T) {
|
||||
LeaseTTL: 200 * time.Millisecond,
|
||||
LastLeaseGrant: time.Now(), // just granted, NOT expired yet
|
||||
Replicas: []ReplicaInfo{
|
||||
{Server: "vs2", Path: "/data/vol1.blk", IQN: "iqn:vol1-r", ISCSIAddr: "vs2:3260", HealthScore: 1.0},
|
||||
{Server: "vs2", Path: "/data/vol1.blk", IQN: "iqn:vol1-r", ISCSIAddr: "vs2:3260", HealthScore: 1.0, Role: blockvol.RoleToWire(blockvol.RoleReplica), LastHeartbeat: time.Now()},
|
||||
},
|
||||
}
|
||||
ms.blockRegistry.Register(entry)
|
||||
@@ -397,6 +405,9 @@ func TestRebuild_RegistryUpdatedWithNewReplica(t *testing.T) {
|
||||
|
||||
func TestRebuild_AssignmentContainsRebuildAddr(t *testing.T) {
|
||||
ms := testMasterServerForFailover(t)
|
||||
// Mark servers as block-capable so promotion Gate 4 (liveness) passes.
|
||||
ms.blockRegistry.MarkBlockCapable("vs1")
|
||||
ms.blockRegistry.MarkBlockCapable("vs2")
|
||||
entry := &BlockVolumeEntry{
|
||||
Name: "vol1",
|
||||
VolumeServer: "vs1",
|
||||
@@ -413,7 +424,7 @@ func TestRebuild_AssignmentContainsRebuildAddr(t *testing.T) {
|
||||
LeaseTTL: 5 * time.Second,
|
||||
LastLeaseGrant: time.Now().Add(-10 * time.Second),
|
||||
Replicas: []ReplicaInfo{
|
||||
{Server: "vs2", Path: "/data/vol1.blk", IQN: "iqn:vol1-r", ISCSIAddr: "vs2:3260", HealthScore: 1.0},
|
||||
{Server: "vs2", Path: "/data/vol1.blk", IQN: "iqn:vol1-r", ISCSIAddr: "vs2:3260", HealthScore: 1.0, Role: blockvol.RoleToWire(blockvol.RoleReplica), LastHeartbeat: time.Now()},
|
||||
},
|
||||
}
|
||||
ms.blockRegistry.Register(entry)
|
||||
@@ -457,7 +468,7 @@ func TestFailover_TransientDisconnect_NoPromotion(t *testing.T) {
|
||||
LeaseTTL: 30 * time.Second,
|
||||
LastLeaseGrant: time.Now(), // just granted
|
||||
Replicas: []ReplicaInfo{
|
||||
{Server: "vs2", Path: "/data/vol1.blk", IQN: "iqn:vol1-r", ISCSIAddr: "vs2:3260", HealthScore: 1.0},
|
||||
{Server: "vs2", Path: "/data/vol1.blk", IQN: "iqn:vol1-r", ISCSIAddr: "vs2:3260", HealthScore: 1.0, Role: blockvol.RoleToWire(blockvol.RoleReplica), LastHeartbeat: time.Now()},
|
||||
},
|
||||
}
|
||||
ms.blockRegistry.Register(entry)
|
||||
@@ -556,6 +567,10 @@ func TestLifecycle_CreateFailoverRebuild(t *testing.T) {
|
||||
// registerVolumeRF3 creates a volume entry with primary + 2 replicas for RF=3 tests.
|
||||
func registerVolumeRF3(t *testing.T, ms *MasterServer, name, primary, replica1, replica2 string, epoch uint64, leaseTTL time.Duration) {
|
||||
t.Helper()
|
||||
// Mark all servers as block-capable so promotion Gate 4 (liveness) passes.
|
||||
ms.blockRegistry.MarkBlockCapable(primary)
|
||||
ms.blockRegistry.MarkBlockCapable(replica1)
|
||||
ms.blockRegistry.MarkBlockCapable(replica2)
|
||||
entry := &BlockVolumeEntry{
|
||||
Name: name,
|
||||
VolumeServer: primary,
|
||||
@@ -576,20 +591,24 @@ func registerVolumeRF3(t *testing.T, ms *MasterServer, name, primary, replica1,
|
||||
ReplicaISCSIAddr: replica1 + ":3260",
|
||||
Replicas: []ReplicaInfo{
|
||||
{
|
||||
Server: replica1,
|
||||
Path: fmt.Sprintf("/data/%s.blk", name),
|
||||
IQN: fmt.Sprintf("iqn.2024.test:%s-r1", name),
|
||||
ISCSIAddr: replica1 + ":3260",
|
||||
HealthScore: 1.0,
|
||||
WALHeadLSN: 100,
|
||||
Server: replica1,
|
||||
Path: fmt.Sprintf("/data/%s.blk", name),
|
||||
IQN: fmt.Sprintf("iqn.2024.test:%s-r1", name),
|
||||
ISCSIAddr: replica1 + ":3260",
|
||||
HealthScore: 1.0,
|
||||
WALHeadLSN: 100,
|
||||
Role: blockvol.RoleToWire(blockvol.RoleReplica),
|
||||
LastHeartbeat: time.Now(),
|
||||
},
|
||||
{
|
||||
Server: replica2,
|
||||
Path: fmt.Sprintf("/data/%s.blk", name),
|
||||
IQN: fmt.Sprintf("iqn.2024.test:%s-r2", name),
|
||||
ISCSIAddr: replica2 + ":3260",
|
||||
HealthScore: 1.0,
|
||||
WALHeadLSN: 100,
|
||||
Server: replica2,
|
||||
Path: fmt.Sprintf("/data/%s.blk", name),
|
||||
IQN: fmt.Sprintf("iqn.2024.test:%s-r2", name),
|
||||
ISCSIAddr: replica2 + ":3260",
|
||||
HealthScore: 1.0,
|
||||
WALHeadLSN: 100,
|
||||
Role: blockvol.RoleToWire(blockvol.RoleReplica),
|
||||
LastHeartbeat: time.Now(),
|
||||
},
|
||||
},
|
||||
}
|
||||
@@ -793,6 +812,10 @@ func TestRF3_AllReplicasDead_NoPromotion(t *testing.T) {
|
||||
// RF3: Lease deferred promotion with RF=3.
|
||||
func TestRF3_LeaseDeferred_Promotion(t *testing.T) {
|
||||
ms := testMasterServerForFailover(t)
|
||||
// Mark servers as block-capable so promotion Gate 4 (liveness) passes.
|
||||
ms.blockRegistry.MarkBlockCapable("vs1")
|
||||
ms.blockRegistry.MarkBlockCapable("vs2")
|
||||
ms.blockRegistry.MarkBlockCapable("vs3")
|
||||
entry := &BlockVolumeEntry{
|
||||
Name: "vol1",
|
||||
VolumeServer: "vs1",
|
||||
@@ -807,8 +830,8 @@ func TestRF3_LeaseDeferred_Promotion(t *testing.T) {
|
||||
LeaseTTL: 200 * time.Millisecond,
|
||||
LastLeaseGrant: time.Now(), // just granted → NOT expired
|
||||
Replicas: []ReplicaInfo{
|
||||
{Server: "vs2", Path: "/data/vol1.blk", ISCSIAddr: "vs2:3260", HealthScore: 1.0, WALHeadLSN: 50},
|
||||
{Server: "vs3", Path: "/data/vol1.blk", ISCSIAddr: "vs3:3260", HealthScore: 0.9, WALHeadLSN: 50},
|
||||
{Server: "vs2", Path: "/data/vol1.blk", ISCSIAddr: "vs2:3260", HealthScore: 1.0, WALHeadLSN: 50, Role: blockvol.RoleToWire(blockvol.RoleReplica), LastHeartbeat: time.Now()},
|
||||
{Server: "vs3", Path: "/data/vol1.blk", ISCSIAddr: "vs3:3260", HealthScore: 0.9, WALHeadLSN: 50, Role: blockvol.RoleToWire(blockvol.RoleReplica), LastHeartbeat: time.Now()},
|
||||
},
|
||||
// Deprecated scalar fields.
|
||||
ReplicaServer: "vs2", ReplicaPath: "/data/vol1.blk", ReplicaISCSIAddr: "vs2:3260",
|
||||
@@ -853,8 +876,8 @@ func TestRF3_CancelDeferredOnReconnect(t *testing.T) {
|
||||
LeaseTTL: 5 * time.Second,
|
||||
LastLeaseGrant: time.Now(), // just granted → long lease
|
||||
Replicas: []ReplicaInfo{
|
||||
{Server: "vs2", Path: "/data/vol1.blk", ISCSIAddr: "vs2:3260", HealthScore: 1.0},
|
||||
{Server: "vs3", Path: "/data/vol1.blk", ISCSIAddr: "vs3:3260", HealthScore: 1.0},
|
||||
{Server: "vs2", Path: "/data/vol1.blk", ISCSIAddr: "vs2:3260", HealthScore: 1.0, Role: blockvol.RoleToWire(blockvol.RoleReplica), LastHeartbeat: time.Now()},
|
||||
{Server: "vs3", Path: "/data/vol1.blk", ISCSIAddr: "vs3:3260", HealthScore: 1.0, Role: blockvol.RoleToWire(blockvol.RoleReplica), LastHeartbeat: time.Now()},
|
||||
},
|
||||
ReplicaServer: "vs2", ReplicaPath: "/data/vol1.blk", ReplicaISCSIAddr: "vs2:3260",
|
||||
}
|
||||
@@ -888,3 +911,267 @@ func TestRF3_CancelDeferredOnReconnect(t *testing.T) {
|
||||
t.Fatalf("vs1 should remain primary (timer cancelled), got %q", e.VolumeServer)
|
||||
}
|
||||
}
|
||||
|
||||
// ============================================================
|
||||
// CP11B-3 T2: Re-evaluate on Replica Registration (B-06)
|
||||
// ============================================================
|
||||
|
||||
// T2: Orphaned primary + replica reconnects → automatic promotion.
|
||||
func TestT2_OrphanedPrimary_ReplicaReconnect_Promotes(t *testing.T) {
|
||||
ms := testMasterServerForFailover(t)
|
||||
registerVolumeWithReplica(t, ms, "vol1", "vs1", "vs2", 1, 5*time.Second)
|
||||
|
||||
// Simulate vs1 dying without proper failover (e.g., promotion failed at the time).
|
||||
// Mark vs1 as dead but DON'T call failoverBlockVolumes (simulates missed/failed failover).
|
||||
ms.blockRegistry.UnmarkBlockCapable("vs1")
|
||||
|
||||
// vs2 reconnects (sends heartbeat). reevaluateOrphanedPrimaries should detect orphaned primary.
|
||||
ms.recoverBlockVolumes("vs2")
|
||||
|
||||
entry, _ := ms.blockRegistry.Lookup("vol1")
|
||||
if entry.VolumeServer != "vs2" {
|
||||
t.Fatalf("expected promotion to vs2 (orphaned primary), got %q", entry.VolumeServer)
|
||||
}
|
||||
if entry.Epoch != 2 {
|
||||
t.Fatalf("expected epoch 2 after promotion, got %d", entry.Epoch)
|
||||
}
|
||||
}
|
||||
|
||||
// T2: Replica reconnects but primary is alive → no unnecessary promotion.
|
||||
func TestT2_PrimaryAlive_NoPromotion(t *testing.T) {
|
||||
ms := testMasterServerForFailover(t)
|
||||
registerVolumeWithReplica(t, ms, "vol1", "vs1", "vs2", 1, 5*time.Second)
|
||||
|
||||
// Both servers alive. vs2 reconnects — no orphaned primary.
|
||||
ms.recoverBlockVolumes("vs2")
|
||||
|
||||
entry, _ := ms.blockRegistry.Lookup("vol1")
|
||||
if entry.VolumeServer != "vs1" {
|
||||
t.Fatalf("primary should remain vs1 (alive), got %q", entry.VolumeServer)
|
||||
}
|
||||
if entry.Epoch != 1 {
|
||||
t.Fatalf("epoch should remain 1, got %d", entry.Epoch)
|
||||
}
|
||||
}
|
||||
|
||||
// T2: Multiple orphaned volumes, all promoted on reconnect.
|
||||
func TestT2_MultipleOrphanedVolumes(t *testing.T) {
|
||||
ms := testMasterServerForFailover(t)
|
||||
// vol1: vs1=primary, vs2=replica
|
||||
// vol2: vs3=primary, vs2=replica
|
||||
registerVolumeWithReplica(t, ms, "vol1", "vs1", "vs2", 1, 5*time.Second)
|
||||
ms.blockRegistry.MarkBlockCapable("vs3")
|
||||
entry2 := &BlockVolumeEntry{
|
||||
Name: "vol2", VolumeServer: "vs3", Path: "/data/vol2.blk",
|
||||
SizeBytes: 1 << 30, Epoch: 1, Role: blockvol.RoleToWire(blockvol.RolePrimary),
|
||||
Status: StatusActive, LeaseTTL: 5 * time.Second,
|
||||
LastLeaseGrant: time.Now().Add(-10 * time.Second),
|
||||
Replicas: []ReplicaInfo{{
|
||||
Server: "vs2", Path: "/data/vol2.blk", HealthScore: 1.0,
|
||||
Role: blockvol.RoleToWire(blockvol.RoleReplica), LastHeartbeat: time.Now(),
|
||||
}},
|
||||
}
|
||||
ms.blockRegistry.Register(entry2)
|
||||
|
||||
// Both primaries die.
|
||||
ms.blockRegistry.UnmarkBlockCapable("vs1")
|
||||
ms.blockRegistry.UnmarkBlockCapable("vs3")
|
||||
|
||||
// vs2 reconnects → both orphaned volumes should be promoted.
|
||||
ms.recoverBlockVolumes("vs2")
|
||||
|
||||
e1, _ := ms.blockRegistry.Lookup("vol1")
|
||||
e2, _ := ms.blockRegistry.Lookup("vol2")
|
||||
if e1.VolumeServer != "vs2" {
|
||||
t.Fatalf("vol1: expected promotion to vs2, got %q", e1.VolumeServer)
|
||||
}
|
||||
if e2.VolumeServer != "vs2" {
|
||||
t.Fatalf("vol2: expected promotion to vs2, got %q", e2.VolumeServer)
|
||||
}
|
||||
}
|
||||
|
||||
// T2: Repeated heartbeats do NOT cause duplicate promotions.
|
||||
func TestT2_RepeatedHeartbeats_NoDuplicatePromotion(t *testing.T) {
|
||||
ms := testMasterServerForFailover(t)
|
||||
registerVolumeWithReplica(t, ms, "vol1", "vs1", "vs2", 1, 5*time.Second)
|
||||
|
||||
ms.blockRegistry.UnmarkBlockCapable("vs1")
|
||||
|
||||
// First reconnect promotes.
|
||||
ms.reevaluateOrphanedPrimaries("vs2")
|
||||
entry, _ := ms.blockRegistry.Lookup("vol1")
|
||||
if entry.VolumeServer != "vs2" {
|
||||
t.Fatalf("first call: expected promotion to vs2, got %q", entry.VolumeServer)
|
||||
}
|
||||
epochAfterFirst := entry.Epoch
|
||||
|
||||
// Second call: vs2 is now the primary AND block-capable. No orphan detected.
|
||||
ms.reevaluateOrphanedPrimaries("vs2")
|
||||
entry, _ = ms.blockRegistry.Lookup("vol1")
|
||||
if entry.Epoch != epochAfterFirst {
|
||||
t.Fatalf("second call should not bump epoch: got %d, want %d", entry.Epoch, epochAfterFirst)
|
||||
}
|
||||
}
|
||||
|
||||
// T2: Dead primary with active lease, replica reconnects → no immediate promotion.
|
||||
// Regression test for lease-bypass bug: reevaluateOrphanedPrimaries must respect
|
||||
// lease expiry, not promote immediately.
|
||||
func TestT2_OrphanedPrimary_LeaseNotExpired_DefersPromotion(t *testing.T) {
|
||||
ms := testMasterServerForFailover(t)
|
||||
ms.blockRegistry.MarkBlockCapable("vs1")
|
||||
ms.blockRegistry.MarkBlockCapable("vs2")
|
||||
ms.blockRegistry.Register(&BlockVolumeEntry{
|
||||
Name: "vol1", VolumeServer: "vs1", Path: "/data/vol1.blk",
|
||||
SizeBytes: 1 << 30, Epoch: 1, Role: blockvol.RoleToWire(blockvol.RolePrimary),
|
||||
Status: StatusActive, LeaseTTL: 300 * time.Millisecond,
|
||||
LastLeaseGrant: time.Now(), // lease still active
|
||||
Replicas: []ReplicaInfo{{
|
||||
Server: "vs2", Path: "/data/vol1.blk", HealthScore: 1.0,
|
||||
Role: blockvol.RoleToWire(blockvol.RoleReplica), LastHeartbeat: time.Now(),
|
||||
}},
|
||||
})
|
||||
|
||||
// vs1 dies (unmark block-capable).
|
||||
ms.blockRegistry.UnmarkBlockCapable("vs1")
|
||||
|
||||
// vs2 reconnects — orphan detected, but lease still active → should NOT promote immediately.
|
||||
ms.reevaluateOrphanedPrimaries("vs2")
|
||||
|
||||
entry, _ := ms.blockRegistry.Lookup("vol1")
|
||||
if entry.VolumeServer != "vs1" {
|
||||
t.Fatalf("should NOT promote while lease active, got primary=%q", entry.VolumeServer)
|
||||
}
|
||||
if entry.Epoch != 1 {
|
||||
t.Fatalf("epoch should remain 1, got %d", entry.Epoch)
|
||||
}
|
||||
|
||||
// Verify a deferred timer was created for the dead primary.
|
||||
ms.blockFailover.mu.Lock()
|
||||
timerCount := len(ms.blockFailover.deferredTimers["vs1"])
|
||||
ms.blockFailover.mu.Unlock()
|
||||
if timerCount != 1 {
|
||||
t.Fatalf("expected 1 deferred timer for vs1, got %d", timerCount)
|
||||
}
|
||||
|
||||
// Wait for lease to expire + margin → timer fires, promotion happens.
|
||||
time.Sleep(450 * time.Millisecond)
|
||||
|
||||
entry, _ = ms.blockRegistry.Lookup("vol1")
|
||||
if entry.VolumeServer != "vs2" {
|
||||
t.Fatalf("after lease expiry, expected promotion to vs2, got %q", entry.VolumeServer)
|
||||
}
|
||||
if entry.Epoch != 2 {
|
||||
t.Fatalf("expected epoch 2, got %d", entry.Epoch)
|
||||
}
|
||||
}
|
||||
|
||||
// ============================================================
|
||||
// CP11B-3 T3: Deferred Timer Safety
|
||||
// ============================================================
|
||||
|
||||
// T3: Delete volume before deferred timer fires → timer is a no-op, no promotion.
|
||||
func TestT3_DeferredTimer_VolumeDeleted_NoPromotion(t *testing.T) {
|
||||
ms := testMasterServerForFailover(t)
|
||||
ms.blockRegistry.MarkBlockCapable("vs1")
|
||||
ms.blockRegistry.MarkBlockCapable("vs2")
|
||||
entry := &BlockVolumeEntry{
|
||||
Name: "vol1", VolumeServer: "vs1", Path: "/data/vol1.blk",
|
||||
SizeBytes: 1 << 30, Epoch: 5, Role: blockvol.RoleToWire(blockvol.RolePrimary),
|
||||
Status: StatusActive, LeaseTTL: 200 * time.Millisecond,
|
||||
LastLeaseGrant: time.Now(),
|
||||
Replicas: []ReplicaInfo{{
|
||||
Server: "vs2", Path: "/data/vol1.blk", HealthScore: 1.0,
|
||||
Role: blockvol.RoleToWire(blockvol.RoleReplica), LastHeartbeat: time.Now(),
|
||||
}},
|
||||
}
|
||||
ms.blockRegistry.Register(entry)
|
||||
|
||||
// vs1 dies → deferred timer created (lease not expired, epoch=5).
|
||||
ms.failoverBlockVolumes("vs1")
|
||||
|
||||
// Delete the volume before timer fires.
|
||||
ms.blockRegistry.Unregister("vol1")
|
||||
|
||||
// Wait for timer to fire.
|
||||
time.Sleep(350 * time.Millisecond)
|
||||
|
||||
// Volume should not exist (timer found it deleted, no-op).
|
||||
_, ok := ms.blockRegistry.Lookup("vol1")
|
||||
if ok {
|
||||
t.Fatal("volume should have been deleted, timer should not recreate it")
|
||||
}
|
||||
}
|
||||
|
||||
// T3: Epoch changes before deferred timer fires → timer rejected.
|
||||
func TestT3_DeferredTimer_EpochChanged_NoPromotion(t *testing.T) {
|
||||
ms := testMasterServerForFailover(t)
|
||||
ms.blockRegistry.MarkBlockCapable("vs1")
|
||||
ms.blockRegistry.MarkBlockCapable("vs2")
|
||||
ms.blockRegistry.MarkBlockCapable("vs3")
|
||||
entry := &BlockVolumeEntry{
|
||||
Name: "vol1", VolumeServer: "vs1", Path: "/data/vol1.blk",
|
||||
SizeBytes: 1 << 30, Epoch: 5, Role: blockvol.RoleToWire(blockvol.RolePrimary),
|
||||
Status: StatusActive, LeaseTTL: 200 * time.Millisecond,
|
||||
LastLeaseGrant: time.Now(),
|
||||
Replicas: []ReplicaInfo{{
|
||||
Server: "vs2", Path: "/data/vol1.blk", HealthScore: 1.0,
|
||||
Role: blockvol.RoleToWire(blockvol.RoleReplica), LastHeartbeat: time.Now(),
|
||||
}},
|
||||
}
|
||||
ms.blockRegistry.Register(entry)
|
||||
|
||||
// vs1 dies → deferred timer created (captures epoch=5).
|
||||
ms.failoverBlockVolumes("vs1")
|
||||
|
||||
// Before timer fires, manually bump the epoch (simulating another event).
|
||||
e, _ := ms.blockRegistry.Lookup("vol1")
|
||||
e.Epoch = 99
|
||||
|
||||
// Wait for timer to fire.
|
||||
time.Sleep(350 * time.Millisecond)
|
||||
|
||||
// Timer should have been rejected (epoch mismatch). Epoch stays at 99.
|
||||
e, _ = ms.blockRegistry.Lookup("vol1")
|
||||
if e.Epoch != 99 {
|
||||
t.Fatalf("epoch should remain 99 (timer rejected), got %d", e.Epoch)
|
||||
}
|
||||
// Primary should NOT have changed (deferred promotion was rejected).
|
||||
if e.VolumeServer != "vs1" {
|
||||
t.Fatalf("primary should remain vs1 (timer rejected), got %q", e.VolumeServer)
|
||||
}
|
||||
}
|
||||
|
||||
// ============================================================
|
||||
// CP11B-3 T4: Rebuild with empty RebuildListenAddr
|
||||
// ============================================================
|
||||
|
||||
// T4: Rebuild queued with empty RebuildListenAddr after promotion.
|
||||
func TestT4_RebuildEmptyAddr_StillQueued(t *testing.T) {
|
||||
ms := testMasterServerForFailover(t)
|
||||
registerVolumeWithReplica(t, ms, "vol1", "vs1", "vs2", 1, 5*time.Second)
|
||||
|
||||
// Failover: vs1 dies, vs2 promoted. PromoteBestReplica clears RebuildListenAddr.
|
||||
ms.failoverBlockVolumes("vs1")
|
||||
|
||||
entry, _ := ms.blockRegistry.Lookup("vol1")
|
||||
if entry.RebuildListenAddr != "" {
|
||||
t.Fatalf("RebuildListenAddr should be empty after promotion, got %q", entry.RebuildListenAddr)
|
||||
}
|
||||
|
||||
// vs1 reconnects. Rebuild should still be queued (even with empty addr).
|
||||
ms.recoverBlockVolumes("vs1")
|
||||
|
||||
assignments := ms.blockAssignmentQueue.Peek("vs1")
|
||||
foundRebuild := false
|
||||
for _, a := range assignments {
|
||||
if blockvol.RoleFromWire(a.Role) == blockvol.RoleRebuilding {
|
||||
foundRebuild = true
|
||||
if a.RebuildAddr != "" {
|
||||
t.Fatalf("RebuildAddr should be empty (new primary hasn't heartbeated), got %q", a.RebuildAddr)
|
||||
}
|
||||
}
|
||||
}
|
||||
if !foundRebuild {
|
||||
t.Fatal("rebuild assignment should still be queued even with empty addr")
|
||||
}
|
||||
}
|
||||
|
||||
@@ -842,44 +842,91 @@ func (r *BlockVolumeRegistry) PromotionLSNTolerance() uint64 {
|
||||
return r.promotionLSNTolerance
|
||||
}
|
||||
|
||||
// PromoteBestReplica promotes the best eligible replica to primary.
|
||||
// Eligibility: heartbeat fresh (within 2×LeaseTTL), WALHeadLSN within tolerance of primary,
|
||||
// and role must be RoleReplica (not RoleRebuilding).
|
||||
// The promoted replica is removed from Replicas[]. Other replicas stay.
|
||||
// Old primary is NOT added to Replicas (needs rebuild).
|
||||
// Returns the new epoch.
|
||||
func (r *BlockVolumeRegistry) PromoteBestReplica(name string) (uint64, error) {
|
||||
r.mu.Lock()
|
||||
defer r.mu.Unlock()
|
||||
entry, ok := r.volumes[name]
|
||||
if !ok {
|
||||
return 0, fmt.Errorf("block volume %q not found", name)
|
||||
// PromotionRejection records why a specific replica was rejected for promotion.
|
||||
type PromotionRejection struct {
|
||||
Server string
|
||||
Reason string // "no_heartbeat", "stale_heartbeat", "wal_lag", "wrong_role", "server_dead"
|
||||
}
|
||||
|
||||
// PromotionPreflightResult is the reusable result of a promotion evaluation.
|
||||
// Used by auto-promotion, manual promote API, preflight status, and logging.
|
||||
type PromotionPreflightResult struct {
|
||||
VolumeName string
|
||||
Promotable bool // true if a candidate was found
|
||||
Candidate *ReplicaInfo // best candidate (nil if !Promotable)
|
||||
CandidateIdx int // index in Replicas[] (-1 if !Promotable)
|
||||
Rejections []PromotionRejection // why each non-candidate was rejected
|
||||
Reason string // human-readable summary when !Promotable
|
||||
}
|
||||
|
||||
// evaluatePromotionLocked evaluates promotion candidates for a volume.
|
||||
// Caller must hold r.mu (read or write). Returns a preflight result without
|
||||
// mutating the registry. The four gates:
|
||||
// 1. Heartbeat freshness (non-zero and within 2×LeaseTTL; 60s window if LeaseTTL is unset)
|
||||
// 2. WAL LSN recency (within promotionLSNTolerance of primary)
|
||||
// 3. Role must be exactly RoleReplica (unset role or RoleRebuilding is rejected)
|
||||
// 4. Server must be in blockServers (alive) — fixes B-12
|
||||
func (r *BlockVolumeRegistry) evaluatePromotionLocked(entry *BlockVolumeEntry) PromotionPreflightResult {
|
||||
result := PromotionPreflightResult{
|
||||
VolumeName: entry.Name,
|
||||
CandidateIdx: -1,
|
||||
}
|
||||
if len(entry.Replicas) == 0 {
|
||||
return 0, fmt.Errorf("block volume %q has no replicas", name)
|
||||
result.Reason = "no replicas"
|
||||
return result
|
||||
}
|
||||
|
||||
// Filter eligible replicas.
|
||||
now := time.Now()
|
||||
freshnessCutoff := 2 * entry.LeaseTTL
|
||||
if freshnessCutoff == 0 {
|
||||
freshnessCutoff = 60 * time.Second // default if LeaseTTL not set
|
||||
freshnessCutoff = 60 * time.Second
|
||||
}
|
||||
primaryLSN := entry.WALHeadLSN
|
||||
|
||||
bestIdx := -1
|
||||
for i := range entry.Replicas {
|
||||
ri := &entry.Replicas[i]
|
||||
// Gate 1: heartbeat freshness.
|
||||
if !ri.LastHeartbeat.IsZero() && now.Sub(ri.LastHeartbeat) > freshnessCutoff {
|
||||
|
||||
// Gate 1: heartbeat freshness. Zero means never heartbeated — unsafe
|
||||
// to promote because the registry has no proof the replica is alive,
|
||||
// caught up, or fully initialized.
|
||||
if ri.LastHeartbeat.IsZero() {
|
||||
result.Rejections = append(result.Rejections, PromotionRejection{
|
||||
Server: ri.Server,
|
||||
Reason: "no_heartbeat",
|
||||
})
|
||||
continue
|
||||
}
|
||||
if now.Sub(ri.LastHeartbeat) > freshnessCutoff {
|
||||
result.Rejections = append(result.Rejections, PromotionRejection{
|
||||
Server: ri.Server,
|
||||
Reason: "stale_heartbeat",
|
||||
})
|
||||
continue
|
||||
}
|
||||
// Gate 2: WAL LSN recency (skip if primary LSN is 0 — no data yet, all eligible).
|
||||
if primaryLSN > 0 && ri.WALHeadLSN+r.promotionLSNTolerance < primaryLSN {
|
||||
result.Rejections = append(result.Rejections, PromotionRejection{
|
||||
Server: ri.Server,
|
||||
Reason: "wal_lag",
|
||||
})
|
||||
continue
|
||||
}
|
||||
// Gate 3: role must be RoleReplica (not rebuilding/stale).
|
||||
if ri.Role != 0 && blockvol.RoleFromWire(ri.Role) != blockvol.RoleReplica {
|
||||
// Gate 3: role must be exactly RoleReplica. Zero/unset role means
|
||||
// the replica was created but never confirmed its role via heartbeat.
|
||||
if blockvol.RoleFromWire(ri.Role) != blockvol.RoleReplica {
|
||||
result.Rejections = append(result.Rejections, PromotionRejection{
|
||||
Server: ri.Server,
|
||||
Reason: "wrong_role",
|
||||
})
|
||||
continue
|
||||
}
|
||||
// Gate 4: server must be alive (in blockServers set) — B-12 fix.
|
||||
if !r.blockServers[ri.Server] {
|
||||
result.Rejections = append(result.Rejections, PromotionRejection{
|
||||
Server: ri.Server,
|
||||
Reason: "server_dead",
|
||||
})
|
||||
continue
|
||||
}
|
||||
// Eligible — pick best by health score, tie-break by WALHeadLSN.
|
||||
@@ -894,11 +941,39 @@ func (r *BlockVolumeRegistry) PromoteBestReplica(name string) (uint64, error) {
|
||||
}
|
||||
|
||||
if bestIdx == -1 {
|
||||
return 0, fmt.Errorf("block volume %q: no eligible replicas for promotion", name)
|
||||
result.Reason = "no eligible replicas"
|
||||
if len(result.Rejections) > 0 {
|
||||
result.Reason += ": " + result.Rejections[0].Reason
|
||||
if len(result.Rejections) > 1 {
|
||||
result.Reason += fmt.Sprintf(" (+%d more)", len(result.Rejections)-1)
|
||||
}
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
promoted := entry.Replicas[bestIdx]
|
||||
result.Promotable = true
|
||||
ri := entry.Replicas[bestIdx]
|
||||
result.Candidate = &ri
|
||||
result.CandidateIdx = bestIdx
|
||||
return result
|
||||
}
|
||||
|
||||
// EvaluatePromotion returns a read-only preflight result for the named volume
|
||||
// without mutating the registry. Safe for status/logging/manual promote preview.
|
||||
func (r *BlockVolumeRegistry) EvaluatePromotion(name string) (PromotionPreflightResult, error) {
|
||||
r.mu.RLock()
|
||||
defer r.mu.RUnlock()
|
||||
entry, ok := r.volumes[name]
|
||||
if !ok {
|
||||
return PromotionPreflightResult{VolumeName: name, Reason: "volume not found"}, fmt.Errorf("block volume %q not found", name)
|
||||
}
|
||||
return r.evaluatePromotionLocked(entry), nil
|
||||
}
|
||||
|
||||
// applyPromotionLocked applies the promotion of a replica at candidateIdx to primary.
|
||||
// Caller must hold r.mu (write lock). The promoted replica is removed from Replicas[].
|
||||
// Old primary is NOT added to Replicas (needs rebuild). Returns the new epoch.
|
||||
func (r *BlockVolumeRegistry) applyPromotionLocked(entry *BlockVolumeEntry, name string, candidate ReplicaInfo, candidateIdx int) uint64 {
|
||||
// Remove old primary from byServer index.
|
||||
r.removeFromServer(entry.VolumeServer, name)
|
||||
|
||||
@@ -906,18 +981,21 @@ func (r *BlockVolumeRegistry) PromoteBestReplica(name string) (uint64, error) {
|
||||
newEpoch := entry.Epoch + 1
|
||||
|
||||
// Promote replica to primary.
|
||||
entry.VolumeServer = promoted.Server
|
||||
entry.Path = promoted.Path
|
||||
entry.IQN = promoted.IQN
|
||||
entry.ISCSIAddr = promoted.ISCSIAddr
|
||||
entry.NvmeAddr = promoted.NvmeAddr
|
||||
entry.NQN = promoted.NQN
|
||||
entry.VolumeServer = candidate.Server
|
||||
entry.Path = candidate.Path
|
||||
entry.IQN = candidate.IQN
|
||||
entry.ISCSIAddr = candidate.ISCSIAddr
|
||||
entry.NvmeAddr = candidate.NvmeAddr
|
||||
entry.NQN = candidate.NQN
|
||||
entry.Epoch = newEpoch
|
||||
entry.Role = blockvol.RoleToWire(blockvol.RolePrimary)
|
||||
entry.LastLeaseGrant = time.Now()
|
||||
|
||||
// Clear stale rebuild/publication metadata from old primary (B-11 partial fix).
|
||||
entry.RebuildListenAddr = ""
|
||||
|
||||
// Remove promoted from Replicas. Others stay.
|
||||
entry.Replicas = append(entry.Replicas[:bestIdx], entry.Replicas[bestIdx+1:]...)
|
||||
entry.Replicas = append(entry.Replicas[:candidateIdx], entry.Replicas[candidateIdx+1:]...)
|
||||
|
||||
// Sync deprecated scalar fields.
|
||||
if len(entry.Replicas) > 0 {
|
||||
@@ -940,9 +1018,212 @@ func (r *BlockVolumeRegistry) PromoteBestReplica(name string) (uint64, error) {
|
||||
// Update byServer index: new primary server now hosts this volume.
|
||||
r.addToServer(entry.VolumeServer, name)
|
||||
|
||||
return newEpoch
|
||||
}
|
||||
|
||||
// PromoteBestReplica promotes the best eligible replica to primary.
|
||||
// Eligibility: heartbeat fresh (within 2×LeaseTTL), WALHeadLSN within tolerance of primary,
|
||||
// role must be RoleReplica (not RoleRebuilding), and server must be alive (B-12 fix).
|
||||
// The promoted replica is removed from Replicas[]. Other replicas stay.
|
||||
// Old primary is NOT added to Replicas (needs rebuild).
|
||||
// Returns the new epoch and the preflight result.
|
||||
func (r *BlockVolumeRegistry) PromoteBestReplica(name string) (uint64, error) {
|
||||
r.mu.Lock()
|
||||
defer r.mu.Unlock()
|
||||
entry, ok := r.volumes[name]
|
||||
if !ok {
|
||||
return 0, fmt.Errorf("block volume %q not found", name)
|
||||
}
|
||||
|
||||
pf := r.evaluatePromotionLocked(entry)
|
||||
if !pf.Promotable {
|
||||
return 0, fmt.Errorf("block volume %q: %s", name, pf.Reason)
|
||||
}
|
||||
|
||||
promoted := *pf.Candidate
|
||||
bestIdx := pf.CandidateIdx
|
||||
|
||||
newEpoch := r.applyPromotionLocked(entry, name, promoted, bestIdx)
|
||||
return newEpoch, nil
|
||||
}
|
||||
|
||||
// evaluateManualPromotionLocked evaluates promotion candidates for a manual promote request.
|
||||
// Caller must hold r.mu (read or write).
|
||||
//
|
||||
// Differences from evaluatePromotionLocked:
|
||||
// - Primary-alive gate: if !force and current primary is alive, reject with "primary_alive".
|
||||
// - Target filtering: if targetServer != "", only evaluate that specific replica.
|
||||
// Returns Reason="target_not_found" if that server is not a replica.
|
||||
// - Force flag: bypasses soft gates (primary_alive, stale_heartbeat, wal_lag)
|
||||
// but keeps hard gates (no_heartbeat with zero time, wrong_role, server_dead).
|
||||
//
|
||||
// Gate table:
|
||||
//
|
||||
// Gate | Normal | Force
|
||||
// primary_alive | Reject | Skip
|
||||
// no_heartbeat(0) | Reject | Reject
|
||||
// stale_heartbeat | Reject | Skip
|
||||
// wal_lag | Reject | Skip
|
||||
// wrong_role | Reject | Reject
|
||||
// server_dead | Reject | Reject
|
||||
func (r *BlockVolumeRegistry) evaluateManualPromotionLocked(entry *BlockVolumeEntry, targetServer string, force bool) PromotionPreflightResult {
|
||||
result := PromotionPreflightResult{
|
||||
VolumeName: entry.Name,
|
||||
CandidateIdx: -1,
|
||||
}
|
||||
|
||||
// Primary-alive gate (soft — skipped when force=true).
|
||||
if !force && r.blockServers[entry.VolumeServer] {
|
||||
result.Reason = "primary_alive"
|
||||
return result
|
||||
}
|
||||
|
||||
if len(entry.Replicas) == 0 {
|
||||
result.Reason = "no replicas"
|
||||
return result
|
||||
}
|
||||
|
||||
// Target filtering: if a specific server is requested, find its index first.
|
||||
// Return early if not found.
|
||||
if targetServer != "" {
|
||||
found := false
|
||||
for i := range entry.Replicas {
|
||||
if entry.Replicas[i].Server == targetServer {
|
||||
found = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if !found {
|
||||
result.Reason = "target_not_found"
|
||||
return result
|
||||
}
|
||||
}
|
||||
|
||||
now := time.Now()
|
||||
freshnessCutoff := 2 * entry.LeaseTTL
|
||||
if freshnessCutoff == 0 {
|
||||
freshnessCutoff = 60 * time.Second
|
||||
}
|
||||
primaryLSN := entry.WALHeadLSN
|
||||
|
||||
bestIdx := -1
|
||||
for i := range entry.Replicas {
|
||||
ri := &entry.Replicas[i]
|
||||
|
||||
// If targeting a specific server, skip all others.
|
||||
if targetServer != "" && ri.Server != targetServer {
|
||||
continue
|
||||
}
|
||||
|
||||
// Hard gate: no heartbeat (zero time) — unsafe regardless of force.
|
||||
if ri.LastHeartbeat.IsZero() {
|
||||
result.Rejections = append(result.Rejections, PromotionRejection{
|
||||
Server: ri.Server,
|
||||
Reason: "no_heartbeat",
|
||||
})
|
||||
continue
|
||||
}
|
||||
|
||||
// Soft gate: stale heartbeat — skipped when force=true.
|
||||
if !force && now.Sub(ri.LastHeartbeat) > freshnessCutoff {
|
||||
result.Rejections = append(result.Rejections, PromotionRejection{
|
||||
Server: ri.Server,
|
||||
Reason: "stale_heartbeat",
|
||||
})
|
||||
continue
|
||||
}
|
||||
|
||||
// Soft gate: WAL lag — skipped when force=true.
|
||||
if !force && primaryLSN > 0 && ri.WALHeadLSN+r.promotionLSNTolerance < primaryLSN {
|
||||
result.Rejections = append(result.Rejections, PromotionRejection{
|
||||
Server: ri.Server,
|
||||
Reason: "wal_lag",
|
||||
})
|
||||
continue
|
||||
}
|
||||
|
||||
// Hard gate: role must be exactly RoleReplica.
|
||||
if blockvol.RoleFromWire(ri.Role) != blockvol.RoleReplica {
|
||||
result.Rejections = append(result.Rejections, PromotionRejection{
|
||||
Server: ri.Server,
|
||||
Reason: "wrong_role",
|
||||
})
|
||||
continue
|
||||
}
|
||||
|
||||
// Hard gate: server must be alive (in blockServers set).
|
||||
if !r.blockServers[ri.Server] {
|
||||
result.Rejections = append(result.Rejections, PromotionRejection{
|
||||
Server: ri.Server,
|
||||
Reason: "server_dead",
|
||||
})
|
||||
continue
|
||||
}
|
||||
|
||||
// Eligible — pick best by health score, tie-break by WALHeadLSN.
|
||||
if bestIdx == -1 {
|
||||
bestIdx = i
|
||||
} else if ri.HealthScore > entry.Replicas[bestIdx].HealthScore {
|
||||
bestIdx = i
|
||||
} else if ri.HealthScore == entry.Replicas[bestIdx].HealthScore &&
|
||||
ri.WALHeadLSN > entry.Replicas[bestIdx].WALHeadLSN {
|
||||
bestIdx = i
|
||||
}
|
||||
}
|
||||
|
||||
if bestIdx == -1 {
|
||||
result.Reason = "no eligible replicas"
|
||||
if len(result.Rejections) > 0 {
|
||||
result.Reason += ": " + result.Rejections[0].Reason
|
||||
if len(result.Rejections) > 1 {
|
||||
result.Reason += fmt.Sprintf(" (+%d more)", len(result.Rejections)-1)
|
||||
}
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
result.Promotable = true
|
||||
ri := entry.Replicas[bestIdx]
|
||||
result.Candidate = &ri
|
||||
result.CandidateIdx = bestIdx
|
||||
return result
|
||||
}
|
||||
|
||||
// ManualPromote promotes a specific replica (or the best eligible replica) to primary.
|
||||
// Unlike PromoteBestReplica, it accepts operator overrides:
|
||||
// - targetServer: if non-empty, only that replica is considered.
|
||||
// - force: bypasses soft gates (primary_alive, stale_heartbeat, wal_lag).
|
||||
//
|
||||
// Returns (newEpoch, oldPrimary, oldPath, preflightResult, nil) on success.
|
||||
// oldPrimary and oldPath are captured under the lock to avoid TOCTOU with
|
||||
// concurrent auto-failover (BUG-T5-2 fix).
|
||||
// Returns (0, "", "", preflightResult, err) on rejection or lookup failure.
|
||||
func (r *BlockVolumeRegistry) ManualPromote(name, targetServer string, force bool) (uint64, string, string, PromotionPreflightResult, error) {
|
||||
r.mu.Lock()
|
||||
defer r.mu.Unlock()
|
||||
|
||||
entry, ok := r.volumes[name]
|
||||
if !ok {
|
||||
return 0, "", "", PromotionPreflightResult{VolumeName: name, Reason: "volume not found"},
|
||||
fmt.Errorf("block volume %q not found", name)
|
||||
}
|
||||
|
||||
// Capture old primary info under lock (BUG-T5-2 fix).
|
||||
oldPrimary := entry.VolumeServer
|
||||
oldPath := entry.Path
|
||||
|
||||
pf := r.evaluateManualPromotionLocked(entry, targetServer, force)
|
||||
if !pf.Promotable {
|
||||
return 0, "", "", pf, fmt.Errorf("block volume %q: %s", name, pf.Reason)
|
||||
}
|
||||
|
||||
promoted := *pf.Candidate
|
||||
candidateIdx := pf.CandidateIdx
|
||||
|
||||
newEpoch := r.applyPromotionLocked(entry, name, promoted, candidateIdx)
|
||||
return newEpoch, oldPrimary, oldPath, pf, nil
|
||||
}
|
||||
|
||||
// MarkBlockCapable records that the given server supports block volumes.
|
||||
func (r *BlockVolumeRegistry) MarkBlockCapable(server string) {
|
||||
r.mu.Lock()
|
||||
@@ -1045,6 +1326,41 @@ func (r *BlockVolumeRegistry) ServerSummaries() []BlockServerSummary {
|
||||
return summaries
|
||||
}
|
||||
|
||||
// IsBlockCapable returns true if the given server is in the block-capable set (alive).
|
||||
func (r *BlockVolumeRegistry) IsBlockCapable(server string) bool {
|
||||
r.mu.RLock()
|
||||
defer r.mu.RUnlock()
|
||||
return r.blockServers[server]
|
||||
}
|
||||
|
||||
// VolumesWithDeadPrimary returns names of volumes where the given server is a replica
|
||||
// and the current primary is NOT in the block-capable set (dead/disconnected).
|
||||
// Used by T2 (B-06) to detect orphaned primaries that need re-promotion.
|
||||
func (r *BlockVolumeRegistry) VolumesWithDeadPrimary(replicaServer string) []string {
|
||||
r.mu.RLock()
|
||||
defer r.mu.RUnlock()
|
||||
names, ok := r.byServer[replicaServer]
|
||||
if !ok {
|
||||
return nil
|
||||
}
|
||||
var orphaned []string
|
||||
for name := range names {
|
||||
entry := r.volumes[name]
|
||||
if entry == nil {
|
||||
continue
|
||||
}
|
||||
// Only consider volumes where this server is a replica (not the primary).
|
||||
if entry.VolumeServer == replicaServer {
|
||||
continue
|
||||
}
|
||||
// Check if the primary server is dead.
|
||||
if !r.blockServers[entry.VolumeServer] {
|
||||
orphaned = append(orphaned, name)
|
||||
}
|
||||
}
|
||||
return orphaned
|
||||
}
|
||||
|
||||
// BlockCapableServers returns the list of servers known to support block volumes.
|
||||
func (r *BlockVolumeRegistry) BlockCapableServers() []string {
|
||||
r.mu.RLock()
|
||||
|
||||
@@ -2,6 +2,7 @@ package weed_server
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strings"
|
||||
"sync"
|
||||
"testing"
|
||||
"time"
|
||||
@@ -538,6 +539,8 @@ func TestRegistry_RemoveReplica(t *testing.T) {
|
||||
|
||||
func TestRegistry_PromoteBestReplica_PicksHighest(t *testing.T) {
|
||||
r := NewBlockVolumeRegistry()
|
||||
r.MarkBlockCapable("s2")
|
||||
r.MarkBlockCapable("s3")
|
||||
r.Register(&BlockVolumeEntry{
|
||||
Name: "vol1",
|
||||
VolumeServer: "s1",
|
||||
@@ -545,8 +548,8 @@ func TestRegistry_PromoteBestReplica_PicksHighest(t *testing.T) {
|
||||
Epoch: 5,
|
||||
Role: 1,
|
||||
Replicas: []ReplicaInfo{
|
||||
{Server: "s2", Path: "/r1.blk", IQN: "iqn:r1", ISCSIAddr: "s2:3260", HealthScore: 0.8, WALHeadLSN: 100},
|
||||
{Server: "s3", Path: "/r2.blk", IQN: "iqn:r2", ISCSIAddr: "s3:3260", HealthScore: 0.95, WALHeadLSN: 90},
|
||||
{Server: "s2", Path: "/r1.blk", IQN: "iqn:r1", ISCSIAddr: "s2:3260", HealthScore: 0.8, WALHeadLSN: 100, Role: blockvol.RoleToWire(blockvol.RoleReplica), LastHeartbeat: time.Now()},
|
||||
{Server: "s3", Path: "/r2.blk", IQN: "iqn:r2", ISCSIAddr: "s3:3260", HealthScore: 0.95, WALHeadLSN: 90, Role: blockvol.RoleToWire(blockvol.RoleReplica), LastHeartbeat: time.Now()},
|
||||
},
|
||||
})
|
||||
// Add to byServer for s2 and s3.
|
||||
@@ -592,14 +595,16 @@ func TestRegistry_PromoteBestReplica_NoReplica(t *testing.T) {
|
||||
|
||||
func TestRegistry_PromoteBestReplica_TiebreakByLSN(t *testing.T) {
|
||||
r := NewBlockVolumeRegistry()
|
||||
r.MarkBlockCapable("s2")
|
||||
r.MarkBlockCapable("s3")
|
||||
r.Register(&BlockVolumeEntry{
|
||||
Name: "vol1",
|
||||
VolumeServer: "s1",
|
||||
Path: "/v1.blk",
|
||||
Epoch: 3,
|
||||
Replicas: []ReplicaInfo{
|
||||
{Server: "s2", Path: "/r1.blk", IQN: "iqn:r1", ISCSIAddr: "s2:3260", HealthScore: 0.9, WALHeadLSN: 50},
|
||||
{Server: "s3", Path: "/r2.blk", IQN: "iqn:r2", ISCSIAddr: "s3:3260", HealthScore: 0.9, WALHeadLSN: 100},
|
||||
{Server: "s2", Path: "/r1.blk", IQN: "iqn:r1", ISCSIAddr: "s2:3260", HealthScore: 0.9, WALHeadLSN: 50, Role: blockvol.RoleToWire(blockvol.RoleReplica), LastHeartbeat: time.Now()},
|
||||
{Server: "s3", Path: "/r2.blk", IQN: "iqn:r2", ISCSIAddr: "s3:3260", HealthScore: 0.9, WALHeadLSN: 100, Role: blockvol.RoleToWire(blockvol.RoleReplica), LastHeartbeat: time.Now()},
|
||||
},
|
||||
})
|
||||
r.mu.Lock()
|
||||
@@ -627,14 +632,16 @@ func TestRegistry_PromoteBestReplica_TiebreakByLSN(t *testing.T) {
|
||||
|
||||
func TestRegistry_PromoteBestReplica_KeepsOthers(t *testing.T) {
|
||||
r := NewBlockVolumeRegistry()
|
||||
r.MarkBlockCapable("s2")
|
||||
r.MarkBlockCapable("s3")
|
||||
r.Register(&BlockVolumeEntry{
|
||||
Name: "vol1",
|
||||
VolumeServer: "s1",
|
||||
Path: "/v1.blk",
|
||||
Epoch: 1,
|
||||
Replicas: []ReplicaInfo{
|
||||
{Server: "s2", Path: "/r1.blk", IQN: "iqn:r1", ISCSIAddr: "s2:3260", HealthScore: 1.0, WALHeadLSN: 100},
|
||||
{Server: "s3", Path: "/r2.blk", IQN: "iqn:r2", ISCSIAddr: "s3:3260", HealthScore: 0.5, WALHeadLSN: 100},
|
||||
{Server: "s2", Path: "/r1.blk", IQN: "iqn:r1", ISCSIAddr: "s2:3260", HealthScore: 1.0, WALHeadLSN: 100, Role: blockvol.RoleToWire(blockvol.RoleReplica), LastHeartbeat: time.Now()},
|
||||
{Server: "s3", Path: "/r2.blk", IQN: "iqn:r2", ISCSIAddr: "s3:3260", HealthScore: 0.5, WALHeadLSN: 100, Role: blockvol.RoleToWire(blockvol.RoleReplica), LastHeartbeat: time.Now()},
|
||||
},
|
||||
})
|
||||
r.mu.Lock()
|
||||
@@ -877,6 +884,7 @@ func TestRegistry_PromoteBestReplica_WALLagIneligible(t *testing.T) {
|
||||
HealthScore: 1.0,
|
||||
WALHeadLSN: 800, // lag=200, tolerance=100
|
||||
LastHeartbeat: time.Now(),
|
||||
Role: blockvol.RoleToWire(blockvol.RoleReplica),
|
||||
},
|
||||
},
|
||||
})
|
||||
@@ -918,6 +926,8 @@ func TestRegistry_PromoteBestReplica_RebuildingIneligible(t *testing.T) {
|
||||
// Fix #2: Among eligible replicas, best (health+LSN) wins.
|
||||
func TestRegistry_PromoteBestReplica_EligibilityFiltersCorrectly(t *testing.T) {
|
||||
r := NewBlockVolumeRegistry()
|
||||
r.MarkBlockCapable("stale")
|
||||
r.MarkBlockCapable("good")
|
||||
r.Register(&BlockVolumeEntry{
|
||||
Name: "vol1",
|
||||
VolumeServer: "primary",
|
||||
@@ -939,6 +949,7 @@ func TestRegistry_PromoteBestReplica_EligibilityFiltersCorrectly(t *testing.T) {
|
||||
HealthScore: 0.8,
|
||||
WALHeadLSN: 95,
|
||||
LastHeartbeat: time.Now(),
|
||||
Role: blockvol.RoleToWire(blockvol.RoleReplica),
|
||||
},
|
||||
},
|
||||
})
|
||||
@@ -956,6 +967,7 @@ func TestRegistry_PromoteBestReplica_EligibilityFiltersCorrectly(t *testing.T) {
|
||||
// Configurable tolerance: widen tolerance to allow lagging replicas.
|
||||
func TestRegistry_PromoteBestReplica_ConfigurableTolerance(t *testing.T) {
|
||||
r := NewBlockVolumeRegistry()
|
||||
r.MarkBlockCapable("lagging")
|
||||
r.Register(&BlockVolumeEntry{
|
||||
Name: "vol1",
|
||||
VolumeServer: "primary",
|
||||
@@ -970,6 +982,7 @@ func TestRegistry_PromoteBestReplica_ConfigurableTolerance(t *testing.T) {
|
||||
HealthScore: 1.0,
|
||||
WALHeadLSN: 800, // lag=200
|
||||
LastHeartbeat: time.Now(),
|
||||
Role: blockvol.RoleToWire(blockvol.RoleReplica),
|
||||
},
|
||||
},
|
||||
})
|
||||
@@ -992,6 +1005,236 @@ func TestRegistry_PromoteBestReplica_ConfigurableTolerance(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// B-12: PromoteBestReplica rejects dead replica (server not in blockServers).
|
||||
func TestRegistry_PromoteBestReplica_DeadServerIneligible(t *testing.T) {
|
||||
r := NewBlockVolumeRegistry()
|
||||
// Intentionally do NOT mark "dead-replica" as block-capable.
|
||||
r.Register(&BlockVolumeEntry{
|
||||
Name: "vol1",
|
||||
VolumeServer: "primary",
|
||||
Path: "/data/vol1.blk",
|
||||
Epoch: 1,
|
||||
LeaseTTL: 30 * time.Second,
|
||||
WALHeadLSN: 100,
|
||||
Replicas: []ReplicaInfo{
|
||||
{
|
||||
Server: "dead-replica",
|
||||
Path: "/data/vol1.blk",
|
||||
HealthScore: 1.0,
|
||||
WALHeadLSN: 100,
|
||||
LastHeartbeat: time.Now(),
|
||||
Role: blockvol.RoleToWire(blockvol.RoleReplica),
|
||||
},
|
||||
},
|
||||
})
|
||||
|
||||
_, err := r.PromoteBestReplica("vol1")
|
||||
if err == nil {
|
||||
t.Fatal("expected error: dead replica should be rejected")
|
||||
}
|
||||
if !strings.Contains(err.Error(), "server_dead") {
|
||||
t.Fatalf("error should mention server_dead, got: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// B-12: Dead replica rejected but alive replica promoted when both exist.
|
||||
func TestRegistry_PromoteBestReplica_DeadSkipped_AlivePromoted(t *testing.T) {
|
||||
r := NewBlockVolumeRegistry()
|
||||
// Only mark s3 as alive.
|
||||
r.MarkBlockCapable("s3")
|
||||
r.Register(&BlockVolumeEntry{
|
||||
Name: "vol1",
|
||||
VolumeServer: "primary",
|
||||
Path: "/data/vol1.blk",
|
||||
Epoch: 1,
|
||||
LeaseTTL: 30 * time.Second,
|
||||
WALHeadLSN: 100,
|
||||
Replicas: []ReplicaInfo{
|
||||
{Server: "s2-dead", Path: "/r1.blk", HealthScore: 1.0, WALHeadLSN: 100, LastHeartbeat: time.Now(), Role: blockvol.RoleToWire(blockvol.RoleReplica)},
|
||||
{Server: "s3", Path: "/r2.blk", HealthScore: 0.8, WALHeadLSN: 95, LastHeartbeat: time.Now(), Role: blockvol.RoleToWire(blockvol.RoleReplica)},
|
||||
},
|
||||
})
|
||||
|
||||
newEpoch, err := r.PromoteBestReplica("vol1")
|
||||
if err != nil {
|
||||
t.Fatalf("PromoteBestReplica: %v", err)
|
||||
}
|
||||
if newEpoch != 2 {
|
||||
t.Fatalf("newEpoch: got %d, want 2", newEpoch)
|
||||
}
|
||||
e, _ := r.Lookup("vol1")
|
||||
if e.VolumeServer != "s3" {
|
||||
t.Fatalf("expected alive s3 promoted, got %q", e.VolumeServer)
|
||||
}
|
||||
}
|
||||
|
||||
// EvaluatePromotion returns read-only preflight without mutating registry.
|
||||
func TestRegistry_EvaluatePromotion_Basic(t *testing.T) {
|
||||
r := NewBlockVolumeRegistry()
|
||||
r.MarkBlockCapable("replica1")
|
||||
r.Register(&BlockVolumeEntry{
|
||||
Name: "vol1",
|
||||
VolumeServer: "primary",
|
||||
Path: "/data/vol1.blk",
|
||||
Epoch: 5,
|
||||
LeaseTTL: 30 * time.Second,
|
||||
WALHeadLSN: 100,
|
||||
Replicas: []ReplicaInfo{
|
||||
{Server: "replica1", Path: "/r1.blk", HealthScore: 0.9, WALHeadLSN: 100, LastHeartbeat: time.Now(), Role: blockvol.RoleToWire(blockvol.RoleReplica)},
|
||||
},
|
||||
})
|
||||
|
||||
pf, err := r.EvaluatePromotion("vol1")
|
||||
if err != nil {
|
||||
t.Fatalf("EvaluatePromotion: %v", err)
|
||||
}
|
||||
if !pf.Promotable {
|
||||
t.Fatalf("expected promotable, got reason: %s", pf.Reason)
|
||||
}
|
||||
if pf.Candidate == nil || pf.Candidate.Server != "replica1" {
|
||||
t.Fatalf("expected candidate replica1, got %+v", pf.Candidate)
|
||||
}
|
||||
|
||||
// Registry must be unmutated.
|
||||
e, _ := r.Lookup("vol1")
|
||||
if e.VolumeServer != "primary" {
|
||||
t.Fatal("EvaluatePromotion should not mutate the registry")
|
||||
}
|
||||
if e.Epoch != 5 {
|
||||
t.Fatal("EvaluatePromotion should not bump epoch")
|
||||
}
|
||||
}
|
||||
|
||||
// EvaluatePromotion with all replicas rejected.
|
||||
func TestRegistry_EvaluatePromotion_AllRejected(t *testing.T) {
|
||||
r := NewBlockVolumeRegistry()
|
||||
// No servers marked as block-capable.
|
||||
r.Register(&BlockVolumeEntry{
|
||||
Name: "vol1",
|
||||
VolumeServer: "primary",
|
||||
Path: "/data/vol1.blk",
|
||||
Epoch: 1,
|
||||
Replicas: []ReplicaInfo{
|
||||
{Server: "dead1", Path: "/r1.blk", HealthScore: 1.0, WALHeadLSN: 100, LastHeartbeat: time.Now(), Role: blockvol.RoleToWire(blockvol.RoleReplica)},
|
||||
{Server: "dead2", Path: "/r2.blk", HealthScore: 0.9, WALHeadLSN: 100, LastHeartbeat: time.Now(), Role: blockvol.RoleToWire(blockvol.RoleReplica)},
|
||||
},
|
||||
})
|
||||
|
||||
pf, err := r.EvaluatePromotion("vol1")
|
||||
if err != nil {
|
||||
t.Fatalf("EvaluatePromotion: %v", err)
|
||||
}
|
||||
if pf.Promotable {
|
||||
t.Fatal("expected not promotable")
|
||||
}
|
||||
if len(pf.Rejections) != 2 {
|
||||
t.Fatalf("expected 2 rejections, got %d", len(pf.Rejections))
|
||||
}
|
||||
for _, rej := range pf.Rejections {
|
||||
if rej.Reason != "server_dead" {
|
||||
t.Fatalf("expected server_dead rejection, got %q", rej.Reason)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// EvaluatePromotion for nonexistent volume.
|
||||
func TestRegistry_EvaluatePromotion_NotFound(t *testing.T) {
|
||||
r := NewBlockVolumeRegistry()
|
||||
_, err := r.EvaluatePromotion("nonexistent")
|
||||
if err == nil {
|
||||
t.Fatal("expected error for nonexistent volume")
|
||||
}
|
||||
}
|
||||
|
||||
// Replica created but never heartbeated is not promotable.
|
||||
func TestRegistry_PromoteBestReplica_NoHeartbeatIneligible(t *testing.T) {
|
||||
r := NewBlockVolumeRegistry()
|
||||
r.MarkBlockCapable("replica1")
|
||||
r.Register(&BlockVolumeEntry{
|
||||
Name: "vol1",
|
||||
VolumeServer: "primary",
|
||||
Path: "/data/vol1.blk",
|
||||
Epoch: 1,
|
||||
LeaseTTL: 30 * time.Second,
|
||||
WALHeadLSN: 100,
|
||||
Replicas: []ReplicaInfo{
|
||||
{
|
||||
Server: "replica1",
|
||||
Path: "/r1.blk",
|
||||
HealthScore: 1.0,
|
||||
WALHeadLSN: 100,
|
||||
Role: blockvol.RoleToWire(blockvol.RoleReplica),
|
||||
// LastHeartbeat: zero — never heartbeated
|
||||
},
|
||||
},
|
||||
})
|
||||
|
||||
_, err := r.PromoteBestReplica("vol1")
|
||||
if err == nil {
|
||||
t.Fatal("expected error: replica with no heartbeat should be rejected")
|
||||
}
|
||||
if !strings.Contains(err.Error(), "no_heartbeat") {
|
||||
t.Fatalf("error should mention no_heartbeat, got: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// Replica with unset (zero) role is not promotable.
|
||||
func TestRegistry_PromoteBestReplica_UnsetRoleIneligible(t *testing.T) {
|
||||
r := NewBlockVolumeRegistry()
|
||||
r.MarkBlockCapable("replica1")
|
||||
r.Register(&BlockVolumeEntry{
|
||||
Name: "vol1",
|
||||
VolumeServer: "primary",
|
||||
Path: "/data/vol1.blk",
|
||||
Epoch: 1,
|
||||
LeaseTTL: 30 * time.Second,
|
||||
WALHeadLSN: 100,
|
||||
Replicas: []ReplicaInfo{
|
||||
{
|
||||
Server: "replica1",
|
||||
Path: "/r1.blk",
|
||||
HealthScore: 1.0,
|
||||
WALHeadLSN: 100,
|
||||
LastHeartbeat: time.Now(),
|
||||
// Role: 0 — unset/RoleNone
|
||||
},
|
||||
},
|
||||
})
|
||||
|
||||
_, err := r.PromoteBestReplica("vol1")
|
||||
if err == nil {
|
||||
t.Fatal("expected error: replica with unset role should be rejected")
|
||||
}
|
||||
if !strings.Contains(err.Error(), "wrong_role") {
|
||||
t.Fatalf("error should mention wrong_role, got: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// PromoteBestReplica clears RebuildListenAddr on promotion (B-11 partial fix).
|
||||
func TestRegistry_PromoteBestReplica_ClearsRebuildAddr(t *testing.T) {
|
||||
r := NewBlockVolumeRegistry()
|
||||
r.MarkBlockCapable("replica1")
|
||||
r.Register(&BlockVolumeEntry{
|
||||
Name: "vol1",
|
||||
VolumeServer: "primary",
|
||||
Path: "/data/vol1.blk",
|
||||
Epoch: 1,
|
||||
RebuildListenAddr: "primary:15000",
|
||||
Replicas: []ReplicaInfo{
|
||||
{Server: "replica1", Path: "/r1.blk", HealthScore: 1.0, WALHeadLSN: 100, LastHeartbeat: time.Now(), Role: blockvol.RoleToWire(blockvol.RoleReplica)},
|
||||
},
|
||||
})
|
||||
|
||||
_, err := r.PromoteBestReplica("vol1")
|
||||
if err != nil {
|
||||
t.Fatalf("PromoteBestReplica: %v", err)
|
||||
}
|
||||
e, _ := r.Lookup("vol1")
|
||||
if e.RebuildListenAddr != "" {
|
||||
t.Fatalf("RebuildListenAddr should be cleared after promotion, got %q", e.RebuildListenAddr)
|
||||
}
|
||||
}
|
||||
|
||||
// --- LeaseGrants ---
|
||||
|
||||
func TestRegistry_LeaseGrants_PrimaryOnly(t *testing.T) {
|
||||
@@ -1110,3 +1353,267 @@ func TestRegistry_LeaseGrants_UnknownServer(t *testing.T) {
|
||||
t.Fatalf("expected nil for unknown server, got %+v", grants)
|
||||
}
|
||||
}
|
||||
|
||||
// ============================================================
|
||||
// CP11B-3 T2: IsBlockCapable + VolumesWithDeadPrimary
|
||||
// ============================================================
|
||||
|
||||
func TestRegistry_IsBlockCapable(t *testing.T) {
|
||||
r := NewBlockVolumeRegistry()
|
||||
r.MarkBlockCapable("vs1:8080")
|
||||
|
||||
if !r.IsBlockCapable("vs1:8080") {
|
||||
t.Fatal("vs1 should be block-capable")
|
||||
}
|
||||
if r.IsBlockCapable("vs2:8080") {
|
||||
t.Fatal("vs2 should NOT be block-capable")
|
||||
}
|
||||
|
||||
r.UnmarkBlockCapable("vs1:8080")
|
||||
if r.IsBlockCapable("vs1:8080") {
|
||||
t.Fatal("vs1 should no longer be block-capable after unmark")
|
||||
}
|
||||
}
|
||||
|
||||
func TestRegistry_VolumesWithDeadPrimary_Basic(t *testing.T) {
|
||||
r := NewBlockVolumeRegistry()
|
||||
r.MarkBlockCapable("vs1")
|
||||
r.MarkBlockCapable("vs2")
|
||||
|
||||
r.Register(&BlockVolumeEntry{
|
||||
Name: "vol1", VolumeServer: "vs1", Path: "/data/vol1.blk",
|
||||
SizeBytes: 1 << 30, Epoch: 1, Role: blockvol.RoleToWire(blockvol.RolePrimary),
|
||||
Status: StatusActive,
|
||||
Replicas: []ReplicaInfo{{Server: "vs2", Path: "/data/vol1.blk"}},
|
||||
})
|
||||
|
||||
// Both alive → no orphans.
|
||||
orphaned := r.VolumesWithDeadPrimary("vs2")
|
||||
if len(orphaned) != 0 {
|
||||
t.Fatalf("expected 0 orphaned volumes, got %d", len(orphaned))
|
||||
}
|
||||
|
||||
// Kill primary.
|
||||
r.UnmarkBlockCapable("vs1")
|
||||
orphaned = r.VolumesWithDeadPrimary("vs2")
|
||||
if len(orphaned) != 1 || orphaned[0] != "vol1" {
|
||||
t.Fatalf("expected [vol1], got %v", orphaned)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRegistry_VolumesWithDeadPrimary_PrimaryServer_NotIncluded(t *testing.T) {
|
||||
r := NewBlockVolumeRegistry()
|
||||
r.MarkBlockCapable("vs1")
|
||||
|
||||
r.Register(&BlockVolumeEntry{
|
||||
Name: "vol1", VolumeServer: "vs1", Path: "/data/vol1.blk",
|
||||
SizeBytes: 1 << 30, Epoch: 1, Role: blockvol.RoleToWire(blockvol.RolePrimary),
|
||||
Status: StatusActive,
|
||||
})
|
||||
|
||||
// vs1 is the primary for vol1 — should NOT appear in orphaned list for vs1.
|
||||
orphaned := r.VolumesWithDeadPrimary("vs1")
|
||||
if len(orphaned) != 0 {
|
||||
t.Fatalf("primary server should not appear in its own orphan list, got %v", orphaned)
|
||||
}
|
||||
}
|
||||
|
||||
// T6: EvaluatePromotion preflight includes primary liveness.
|
||||
func TestRegistry_EvaluatePromotion_PrimaryDead_StillShowsCandidate(t *testing.T) {
|
||||
r := NewBlockVolumeRegistry()
|
||||
r.MarkBlockCapable("vs1")
|
||||
r.MarkBlockCapable("vs2")
|
||||
|
||||
r.Register(&BlockVolumeEntry{
|
||||
Name: "vol1", VolumeServer: "vs1", Path: "/data/vol1.blk",
|
||||
SizeBytes: 1 << 30, Epoch: 1, Role: blockvol.RoleToWire(blockvol.RolePrimary),
|
||||
Status: StatusActive, LeaseTTL: 30 * time.Second,
|
||||
Replicas: []ReplicaInfo{{
|
||||
Server: "vs2", Path: "/data/vol1.blk", HealthScore: 1.0,
|
||||
Role: blockvol.RoleToWire(blockvol.RoleReplica), LastHeartbeat: time.Now(),
|
||||
}},
|
||||
})
|
||||
|
||||
// Kill primary but keep vs2 alive.
|
||||
r.UnmarkBlockCapable("vs1")
|
||||
|
||||
pf, err := r.EvaluatePromotion("vol1")
|
||||
if err != nil {
|
||||
t.Fatalf("EvaluatePromotion: %v", err)
|
||||
}
|
||||
if !pf.Promotable {
|
||||
t.Fatalf("should be promotable (vs2 alive), reason=%s", pf.Reason)
|
||||
}
|
||||
if pf.Candidate.Server != "vs2" {
|
||||
t.Fatalf("candidate should be vs2, got %q", pf.Candidate.Server)
|
||||
}
|
||||
}
|
||||
|
||||
// ============================================================
|
||||
// CP11B-3 T5: ManualPromote Dev Tests
|
||||
// ============================================================
|
||||
|
||||
// T5: ManualPromote with empty target → auto-picks best candidate.
|
||||
func TestRegistry_ManualPromote_AutoTarget(t *testing.T) {
|
||||
r := NewBlockVolumeRegistry()
|
||||
r.MarkBlockCapable("best")
|
||||
r.MarkBlockCapable("worse")
|
||||
r.Register(&BlockVolumeEntry{
|
||||
Name: "vol1", VolumeServer: "primary", Path: "/data/vol1.blk",
|
||||
Epoch: 1, LeaseTTL: 30 * time.Second, WALHeadLSN: 100,
|
||||
Replicas: []ReplicaInfo{
|
||||
{Server: "worse", Path: "/r1.blk", HealthScore: 0.5, WALHeadLSN: 100,
|
||||
LastHeartbeat: time.Now(), Role: blockvol.RoleToWire(blockvol.RoleReplica)},
|
||||
{Server: "best", Path: "/r2.blk", HealthScore: 1.0, WALHeadLSN: 100,
|
||||
LastHeartbeat: time.Now(), Role: blockvol.RoleToWire(blockvol.RoleReplica)},
|
||||
},
|
||||
})
|
||||
// Primary not block-capable → non-force should still pass (primary_alive gate won't trigger).
|
||||
|
||||
newEpoch, _, _, pf, err := r.ManualPromote("vol1", "", false)
|
||||
if err != nil {
|
||||
t.Fatalf("ManualPromote: %v", err)
|
||||
}
|
||||
if newEpoch != 2 {
|
||||
t.Fatalf("epoch: got %d, want 2", newEpoch)
|
||||
}
|
||||
if !pf.Promotable {
|
||||
t.Fatal("should be promotable")
|
||||
}
|
||||
e, _ := r.Lookup("vol1")
|
||||
if e.VolumeServer != "best" {
|
||||
t.Fatalf("expected 'best' promoted, got %q", e.VolumeServer)
|
||||
}
|
||||
}
|
||||
|
||||
// T5: ManualPromote targets a specific replica (not the best by health).
|
||||
func TestRegistry_ManualPromote_SpecificTarget(t *testing.T) {
|
||||
r := NewBlockVolumeRegistry()
|
||||
r.MarkBlockCapable("r1")
|
||||
r.MarkBlockCapable("r2")
|
||||
r.Register(&BlockVolumeEntry{
|
||||
Name: "vol1", VolumeServer: "primary", Path: "/data/vol1.blk",
|
||||
Epoch: 1, LeaseTTL: 30 * time.Second,
|
||||
Replicas: []ReplicaInfo{
|
||||
{Server: "r1", Path: "/r1.blk", HealthScore: 1.0, WALHeadLSN: 100,
|
||||
LastHeartbeat: time.Now(), Role: blockvol.RoleToWire(blockvol.RoleReplica)},
|
||||
{Server: "r2", Path: "/r2.blk", HealthScore: 0.5, WALHeadLSN: 50,
|
||||
LastHeartbeat: time.Now(), Role: blockvol.RoleToWire(blockvol.RoleReplica)},
|
||||
},
|
||||
})
|
||||
|
||||
// Target r2 specifically (worse health).
|
||||
newEpoch, _, _, _, err := r.ManualPromote("vol1", "r2", false)
|
||||
if err != nil {
|
||||
t.Fatalf("ManualPromote: %v", err)
|
||||
}
|
||||
if newEpoch != 2 {
|
||||
t.Fatalf("epoch: got %d, want 2", newEpoch)
|
||||
}
|
||||
e, _ := r.Lookup("vol1")
|
||||
if e.VolumeServer != "r2" {
|
||||
t.Fatalf("expected r2 promoted (specific target), got %q", e.VolumeServer)
|
||||
}
|
||||
}
|
||||
|
||||
// T5: ManualPromote with non-existent target → error.
|
||||
func TestRegistry_ManualPromote_TargetNotFound(t *testing.T) {
|
||||
r := NewBlockVolumeRegistry()
|
||||
r.MarkBlockCapable("r1")
|
||||
r.Register(&BlockVolumeEntry{
|
||||
Name: "vol1", VolumeServer: "primary", Path: "/data/vol1.blk",
|
||||
Epoch: 1, LeaseTTL: 30 * time.Second,
|
||||
Replicas: []ReplicaInfo{
|
||||
{Server: "r1", Path: "/r1.blk", HealthScore: 1.0,
|
||||
LastHeartbeat: time.Now(), Role: blockvol.RoleToWire(blockvol.RoleReplica)},
|
||||
},
|
||||
})
|
||||
|
||||
_, _, _, pf, err := r.ManualPromote("vol1", "nonexistent", false)
|
||||
if err == nil {
|
||||
t.Fatal("expected error for nonexistent target")
|
||||
}
|
||||
if pf.Reason != "target_not_found" {
|
||||
t.Fatalf("expected target_not_found, got %q", pf.Reason)
|
||||
}
|
||||
}
|
||||
|
||||
// T5: ManualPromote non-force with alive primary → rejected.
|
||||
func TestRegistry_ManualPromote_PrimaryAlive_Rejected(t *testing.T) {
|
||||
r := NewBlockVolumeRegistry()
|
||||
r.MarkBlockCapable("primary")
|
||||
r.MarkBlockCapable("r1")
|
||||
r.Register(&BlockVolumeEntry{
|
||||
Name: "vol1", VolumeServer: "primary", Path: "/data/vol1.blk",
|
||||
Epoch: 1, LeaseTTL: 30 * time.Second,
|
||||
Replicas: []ReplicaInfo{
|
||||
{Server: "r1", Path: "/r1.blk", HealthScore: 1.0,
|
||||
LastHeartbeat: time.Now(), Role: blockvol.RoleToWire(blockvol.RoleReplica)},
|
||||
},
|
||||
})
|
||||
|
||||
_, _, _, pf, err := r.ManualPromote("vol1", "", false)
|
||||
if err == nil {
|
||||
t.Fatal("expected rejection when primary alive and !force")
|
||||
}
|
||||
if pf.Reason != "primary_alive" {
|
||||
t.Fatalf("expected primary_alive, got %q", pf.Reason)
|
||||
}
|
||||
// Verify no mutation.
|
||||
e, _ := r.Lookup("vol1")
|
||||
if e.VolumeServer != "primary" {
|
||||
t.Fatalf("primary should not change, got %q", e.VolumeServer)
|
||||
}
|
||||
}
|
||||
|
||||
// T5: Force bypasses stale heartbeat and primary_alive gates.
|
||||
func TestRegistry_ManualPromote_Force_StaleHeartbeat(t *testing.T) {
|
||||
r := NewBlockVolumeRegistry()
|
||||
r.MarkBlockCapable("primary")
|
||||
r.MarkBlockCapable("r1")
|
||||
r.Register(&BlockVolumeEntry{
|
||||
Name: "vol1", VolumeServer: "primary", Path: "/data/vol1.blk",
|
||||
Epoch: 1, LeaseTTL: 30 * time.Second,
|
||||
Replicas: []ReplicaInfo{
|
||||
{Server: "r1", Path: "/r1.blk", HealthScore: 1.0,
|
||||
LastHeartbeat: time.Now().Add(-10 * time.Minute), // stale
|
||||
Role: blockvol.RoleToWire(blockvol.RoleReplica)},
|
||||
},
|
||||
})
|
||||
|
||||
// Non-force: would fail on primary_alive.
|
||||
// Force: bypasses primary_alive AND stale_heartbeat.
|
||||
newEpoch, _, _, _, err := r.ManualPromote("vol1", "", true)
|
||||
if err != nil {
|
||||
t.Fatalf("force ManualPromote should succeed: %v", err)
|
||||
}
|
||||
if newEpoch != 2 {
|
||||
t.Fatalf("epoch: got %d, want 2", newEpoch)
|
||||
}
|
||||
e, _ := r.Lookup("vol1")
|
||||
if e.VolumeServer != "r1" {
|
||||
t.Fatalf("expected r1 promoted via force, got %q", e.VolumeServer)
|
||||
}
|
||||
}
|
||||
|
||||
// T5: Force does NOT bypass server_dead (hard gate).
|
||||
func TestRegistry_ManualPromote_Force_StillRejectsDeadServer(t *testing.T) {
|
||||
r := NewBlockVolumeRegistry()
|
||||
// "dead" is NOT marked block-capable.
|
||||
r.Register(&BlockVolumeEntry{
|
||||
Name: "vol1", VolumeServer: "primary", Path: "/data/vol1.blk",
|
||||
Epoch: 1, LeaseTTL: 30 * time.Second,
|
||||
Replicas: []ReplicaInfo{
|
||||
{Server: "dead", Path: "/r1.blk", HealthScore: 1.0,
|
||||
LastHeartbeat: time.Now(), Role: blockvol.RoleToWire(blockvol.RoleReplica)},
|
||||
},
|
||||
})
|
||||
|
||||
_, _, _, pf, err := r.ManualPromote("vol1", "dead", true)
|
||||
if err == nil {
|
||||
t.Fatal("force should NOT bypass server_dead")
|
||||
}
|
||||
if len(pf.Rejections) == 0 || pf.Rejections[0].Reason != "server_dead" {
|
||||
t.Fatalf("expected server_dead rejection, got %+v", pf.Rejections)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -278,6 +278,9 @@ func (ms *MasterServer) SendHeartbeat(stream master_pb.Seaweed_SendHeartbeatServ
|
||||
// on subsequent heartbeats), never both in the same message.
|
||||
if len(heartbeat.BlockVolumeInfos) > 0 || heartbeat.HasNoBlockVolumes {
|
||||
ms.blockRegistry.UpdateFullHeartbeat(dn.Url(), heartbeat.BlockVolumeInfos)
|
||||
// T2 (B-06): After updating registry from heartbeat, check if this server
|
||||
// is a replica for any volume whose primary is dead. If so, promote.
|
||||
ms.reevaluateOrphanedPrimaries(dn.Url())
|
||||
} else if len(heartbeat.NewBlockVolumes) > 0 || len(heartbeat.DeletedBlockVolumes) > 0 {
|
||||
ms.blockRegistry.UpdateDeltaHeartbeat(dn.Url(), heartbeat.NewBlockVolumes, heartbeat.DeletedBlockVolumes)
|
||||
}
|
||||
|
||||
@@ -283,14 +283,16 @@ func (ms *MasterServer) tryCreateOneReplica(ctx context.Context, req *master_pb.
|
||||
entry.RebuildListenAddr = primaryResult.RebuildListenAddr
|
||||
// CP8-2: populate Replicas[].
|
||||
entry.Replicas = append(entry.Replicas, ReplicaInfo{
|
||||
Server: replicaServerStr,
|
||||
Path: replicaResult.Path,
|
||||
ISCSIAddr: replicaResult.ISCSIAddr,
|
||||
IQN: replicaResult.IQN,
|
||||
NvmeAddr: replicaResult.NvmeAddr,
|
||||
NQN: replicaResult.NQN,
|
||||
DataAddr: replicaResult.ReplicaDataAddr,
|
||||
CtrlAddr: replicaResult.ReplicaCtrlAddr,
|
||||
Server: replicaServerStr,
|
||||
Path: replicaResult.Path,
|
||||
ISCSIAddr: replicaResult.ISCSIAddr,
|
||||
IQN: replicaResult.IQN,
|
||||
NvmeAddr: replicaResult.NvmeAddr,
|
||||
NQN: replicaResult.NQN,
|
||||
DataAddr: replicaResult.ReplicaDataAddr,
|
||||
CtrlAddr: replicaResult.ReplicaCtrlAddr,
|
||||
Role: blockvol.RoleToWire(blockvol.RoleReplica),
|
||||
LastHeartbeat: time.Now(),
|
||||
})
|
||||
return replicaServerStr
|
||||
}
|
||||
@@ -409,6 +411,11 @@ func (ms *MasterServer) ExpandBlockVolume(ctx context.Context, req *master_pb.Ex
|
||||
}
|
||||
}()
|
||||
|
||||
// Test-only hook: inject failover between lock acquisition and re-read.
|
||||
if ms.expandPreReadHook != nil {
|
||||
ms.expandPreReadHook()
|
||||
}
|
||||
|
||||
// B-09: Re-read entry after acquiring expand lock. Between the initial
|
||||
// Lookup and AcquireExpandInflight, failover may have changed VolumeServer
|
||||
// or Replicas. Using the stale snapshot would send PREPARE to dead nodes.
|
||||
|
||||
@@ -10,6 +10,7 @@ import (
|
||||
|
||||
"github.com/seaweedfs/seaweedfs/weed/pb/master_pb"
|
||||
"github.com/seaweedfs/seaweedfs/weed/pb/volume_server_pb"
|
||||
"github.com/seaweedfs/seaweedfs/weed/storage/blockvol"
|
||||
)
|
||||
|
||||
// testMasterServer creates a minimal MasterServer with mock VS calls for testing.
|
||||
@@ -1112,6 +1113,9 @@ func TestMaster_NoNvmeFieldsWhenDisabled(t *testing.T) {
|
||||
|
||||
func TestMaster_PromotionCopiesNvmeFields(t *testing.T) {
|
||||
ms := testMasterServer(t)
|
||||
// Mark servers as block-capable so promotion Gate 4 (liveness) passes.
|
||||
ms.blockRegistry.MarkBlockCapable("vs1:9333")
|
||||
ms.blockRegistry.MarkBlockCapable("vs2:9333")
|
||||
|
||||
// Directly register an entry with primary + replica, both having NVMe fields.
|
||||
ms.blockRegistry.Register(&BlockVolumeEntry{
|
||||
@@ -1128,16 +1132,18 @@ func TestMaster_PromotionCopiesNvmeFields(t *testing.T) {
|
||||
LeaseTTL: 30 * time.Second,
|
||||
Replicas: []ReplicaInfo{
|
||||
{
|
||||
Server: "vs2:9333",
|
||||
Path: "/data/ha-vol.blk",
|
||||
IQN: "iqn.2024.test:ha-vol-r",
|
||||
ISCSIAddr: "vs2:3260",
|
||||
NvmeAddr: "vs2:4420",
|
||||
NQN: "nqn.2024-01.com.seaweedfs:vol.ha-vol.vs2",
|
||||
DataAddr: "vs2:14260",
|
||||
CtrlAddr: "vs2:14261",
|
||||
HealthScore: 0.95,
|
||||
WALHeadLSN: 100,
|
||||
Server: "vs2:9333",
|
||||
Path: "/data/ha-vol.blk",
|
||||
IQN: "iqn.2024.test:ha-vol-r",
|
||||
ISCSIAddr: "vs2:3260",
|
||||
NvmeAddr: "vs2:4420",
|
||||
NQN: "nqn.2024-01.com.seaweedfs:vol.ha-vol.vs2",
|
||||
DataAddr: "vs2:14260",
|
||||
CtrlAddr: "vs2:14261",
|
||||
HealthScore: 0.95,
|
||||
WALHeadLSN: 100,
|
||||
Role: blockvol.RoleToWire(blockvol.RoleReplica),
|
||||
LastHeartbeat: time.Now(),
|
||||
},
|
||||
},
|
||||
})
|
||||
@@ -1654,10 +1660,11 @@ func TestMaster_ExpandCoordinated_RestartRecovery(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestMaster_ExpandCoordinated_B09_ReReadsEntryAfterLock(t *testing.T) {
|
||||
// B-09: If failover changes VolumeServer between initial Lookup and
|
||||
// AcquireExpandInflight, the coordinator must use the fresh entry,
|
||||
// not the stale one. Use RF=3 so promotion still leaves 1 replica
|
||||
// and the coordinated path is taken.
|
||||
// B-09: Exercises the actual race window — failover happens BETWEEN
|
||||
// the initial Lookup (line 380) and the post-lock re-read (line 419).
|
||||
// Uses expandPreReadHook to inject PromoteBestReplica at the exact
|
||||
// interleaving point. RF=3 so promotion leaves 1 replica and the
|
||||
// coordinated path is taken.
|
||||
ms := testMasterServerWithExpandMocks(t)
|
||||
ms.blockRegistry.MarkBlockCapable("vs1:9333")
|
||||
ms.blockRegistry.MarkBlockCapable("vs2:9333")
|
||||
@@ -1689,31 +1696,39 @@ func TestMaster_ExpandCoordinated_B09_ReReadsEntryAfterLock(t *testing.T) {
|
||||
return 2 << 30, nil
|
||||
}
|
||||
|
||||
// Simulate failover: promote best replica. With RF=3, one replica
|
||||
// becomes primary and the other stays as replica → coordinated path.
|
||||
ms.blockRegistry.PromoteBestReplica("b09-vol")
|
||||
|
||||
entry, _ = ms.blockRegistry.Lookup("b09-vol")
|
||||
newPrimary := entry.VolumeServer
|
||||
if newPrimary == originalPrimary {
|
||||
t.Fatal("promotion didn't change primary")
|
||||
}
|
||||
if len(entry.Replicas) == 0 {
|
||||
t.Fatal("expected at least 1 replica after RF=3 promotion")
|
||||
// Hook fires AFTER AcquireExpandInflight but BEFORE the re-read Lookup.
|
||||
// This is the exact race window: the initial Lookup already returned
|
||||
// the old primary, but failover changes it before the re-read.
|
||||
hookFired := false
|
||||
ms.expandPreReadHook = func() {
|
||||
hookFired = true
|
||||
ms.blockRegistry.PromoteBestReplica("b09-vol")
|
||||
}
|
||||
|
||||
// Expand should use the NEW primary (post-failover), not the old one.
|
||||
// At this point, the initial Lookup inside ExpandBlockVolume will see
|
||||
// originalPrimary. The hook then promotes, changing the primary.
|
||||
// The re-read must pick up the new primary.
|
||||
resp, err := ms.ExpandBlockVolume(context.Background(), &master_pb.ExpandBlockVolumeRequest{
|
||||
Name: "b09-vol", NewSizeBytes: 2 << 30,
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("expand: %v", err)
|
||||
}
|
||||
if !hookFired {
|
||||
t.Fatal("expandPreReadHook was not called — race window not exercised")
|
||||
}
|
||||
if resp.CapacityBytes != 2<<30 {
|
||||
t.Fatalf("capacity: got %d", resp.CapacityBytes)
|
||||
}
|
||||
|
||||
// First PREPARE should have gone to the new primary, not the old one.
|
||||
// Verify: after the hook promoted, the re-read must have picked up
|
||||
// the new primary. The first PREPARE should go to the new primary.
|
||||
entry, _ = ms.blockRegistry.Lookup("b09-vol")
|
||||
newPrimary := entry.VolumeServer
|
||||
if newPrimary == originalPrimary {
|
||||
t.Fatal("promotion didn't change primary")
|
||||
}
|
||||
|
||||
if len(preparedServers) == 0 {
|
||||
t.Fatal("no prepare calls recorded")
|
||||
}
|
||||
@@ -1721,7 +1736,7 @@ func TestMaster_ExpandCoordinated_B09_ReReadsEntryAfterLock(t *testing.T) {
|
||||
t.Fatalf("PREPARE went to %q (stale), should go to %q (fresh primary)",
|
||||
preparedServers[0], newPrimary)
|
||||
}
|
||||
// Verify old primary was NOT contacted.
|
||||
// Verify old primary was NOT contacted at all.
|
||||
for _, s := range preparedServers {
|
||||
if s == originalPrimary {
|
||||
t.Fatalf("PREPARE sent to old primary %q — stale entry used", originalPrimary)
|
||||
|
||||
@@ -109,6 +109,10 @@ type MasterServer struct {
|
||||
blockVSCommitExpand func(ctx context.Context, server string, name string, expandEpoch uint64) (uint64, error)
|
||||
blockVSCancelExpand func(ctx context.Context, server string, name string, expandEpoch uint64) error
|
||||
nextExpandEpoch atomic.Uint64
|
||||
|
||||
// Test-only hook: called after AcquireExpandInflight but before the
|
||||
// re-read Lookup in coordinated expand. Nil in production.
|
||||
expandPreReadHook func()
|
||||
}
|
||||
|
||||
func NewMasterServer(r *mux.Router, option *MasterOption, peers map[string]pb.ServerAddress) *MasterServer {
|
||||
@@ -224,6 +228,8 @@ func NewMasterServer(r *mux.Router, option *MasterOption, peers map[string]pb.Se
|
||||
r.HandleFunc("/block/volume/{name}", ms.guard.WhiteList(requestIDMiddleware(ms.blockVolumeLookupHandler))).Methods("GET")
|
||||
r.HandleFunc("/block/volumes", ms.guard.WhiteList(requestIDMiddleware(ms.blockVolumeListHandler))).Methods("GET")
|
||||
r.HandleFunc("/block/volume/{name}/expand", ms.proxyToLeader(ms.guard.WhiteList(requestIDMiddleware(ms.blockVolumeExpandHandler)))).Methods("POST")
|
||||
r.HandleFunc("/block/volume/{name}/preflight", ms.guard.WhiteList(requestIDMiddleware(ms.blockVolumePreflightHandler))).Methods("GET")
|
||||
r.HandleFunc("/block/volume/{name}/promote", ms.proxyToLeader(ms.guard.WhiteList(requestIDMiddleware(ms.blockVolumePromoteHandler)))).Methods("POST")
|
||||
r.HandleFunc("/block/assign", ms.proxyToLeader(ms.guard.WhiteList(requestIDMiddleware(ms.blockAssignHandler)))).Methods("POST")
|
||||
r.HandleFunc("/block/servers", ms.guard.WhiteList(requestIDMiddleware(ms.blockServersHandler))).Methods("GET")
|
||||
r.HandleFunc("/block/status", ms.guard.WhiteList(requestIDMiddleware(ms.blockStatusHandler))).Methods("GET")
|
||||
|
||||
@@ -7,6 +7,7 @@ import (
|
||||
|
||||
"github.com/gorilla/mux"
|
||||
|
||||
"github.com/seaweedfs/seaweedfs/weed/glog"
|
||||
"github.com/seaweedfs/seaweedfs/weed/pb/master_pb"
|
||||
"github.com/seaweedfs/seaweedfs/weed/storage/blockvol"
|
||||
"github.com/seaweedfs/seaweedfs/weed/storage/blockvol/blockapi"
|
||||
@@ -206,6 +207,99 @@ func (ms *MasterServer) blockStatusHandler(w http.ResponseWriter, r *http.Reques
|
||||
writeJsonQuiet(w, r, http.StatusOK, status)
|
||||
}
|
||||
|
||||
// blockVolumePreflightHandler handles GET /block/volume/{name}/preflight.
|
||||
// Returns a read-only promotion preflight evaluation for the named volume.
|
||||
func (ms *MasterServer) blockVolumePreflightHandler(w http.ResponseWriter, r *http.Request) {
|
||||
name := mux.Vars(r)["name"]
|
||||
if name == "" {
|
||||
writeJsonError(w, r, http.StatusBadRequest, fmt.Errorf("name is required"))
|
||||
return
|
||||
}
|
||||
|
||||
pf, err := ms.blockRegistry.EvaluatePromotion(name)
|
||||
if err != nil {
|
||||
writeJsonError(w, r, http.StatusNotFound, err)
|
||||
return
|
||||
}
|
||||
|
||||
resp := blockapi.PreflightResponse{
|
||||
VolumeName: pf.VolumeName,
|
||||
Promotable: pf.Promotable,
|
||||
Reason: pf.Reason,
|
||||
}
|
||||
if pf.Candidate != nil {
|
||||
resp.CandidateServer = pf.Candidate.Server
|
||||
resp.CandidateHealth = pf.Candidate.HealthScore
|
||||
resp.CandidateWALLSN = pf.Candidate.WALHeadLSN
|
||||
}
|
||||
for _, rej := range pf.Rejections {
|
||||
resp.Rejections = append(resp.Rejections, blockapi.PreflightRejection{
|
||||
Server: rej.Server,
|
||||
Reason: rej.Reason,
|
||||
})
|
||||
}
|
||||
// Add primary liveness info.
|
||||
entry, ok := ms.blockRegistry.Lookup(name)
|
||||
if ok {
|
||||
resp.PrimaryServer = entry.VolumeServer
|
||||
resp.PrimaryAlive = ms.blockRegistry.IsBlockCapable(entry.VolumeServer)
|
||||
}
|
||||
writeJsonQuiet(w, r, http.StatusOK, resp)
|
||||
}
|
||||
|
||||
// blockVolumePromoteHandler handles POST /block/volume/{name}/promote.
|
||||
// Triggers a manual promotion for the named block volume.
|
||||
func (ms *MasterServer) blockVolumePromoteHandler(w http.ResponseWriter, r *http.Request) {
|
||||
name := mux.Vars(r)["name"]
|
||||
if name == "" {
|
||||
writeJsonError(w, r, http.StatusBadRequest, fmt.Errorf("name is required"))
|
||||
return
|
||||
}
|
||||
|
||||
var req blockapi.PromoteVolumeRequest
|
||||
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
|
||||
writeJsonError(w, r, http.StatusBadRequest, fmt.Errorf("decode request: %w", err))
|
||||
return
|
||||
}
|
||||
|
||||
// ManualPromote captures oldPrimary/oldPath under lock to avoid TOCTOU (BUG-T5-2).
|
||||
newEpoch, oldPrimary, oldPath, pf, err := ms.blockRegistry.ManualPromote(name, req.TargetServer, req.Force)
|
||||
if err != nil {
|
||||
// Distinguish not-found from rejection.
|
||||
status := http.StatusConflict
|
||||
if pf.Reason == "volume not found" {
|
||||
status = http.StatusNotFound
|
||||
}
|
||||
// Build structured rejection response.
|
||||
resp := blockapi.PromoteVolumeResponse{
|
||||
Reason: pf.Reason,
|
||||
}
|
||||
for _, rej := range pf.Rejections {
|
||||
resp.Rejections = append(resp.Rejections, blockapi.PreflightRejection{
|
||||
Server: rej.Server,
|
||||
Reason: rej.Reason,
|
||||
})
|
||||
}
|
||||
glog.V(0).Infof("manual promote %q rejected: %s", name, pf.Reason)
|
||||
writeJsonQuiet(w, r, status, resp)
|
||||
return
|
||||
}
|
||||
|
||||
// Post-promotion orchestration (same as auto path).
|
||||
ms.finalizePromotion(name, oldPrimary, oldPath, newEpoch)
|
||||
|
||||
if req.Reason != "" {
|
||||
glog.V(0).Infof("manual promote %q: reason=%q", name, req.Reason)
|
||||
}
|
||||
|
||||
// Re-read to get the new primary server name.
|
||||
entry, _ := ms.blockRegistry.Lookup(name)
|
||||
writeJsonQuiet(w, r, http.StatusOK, blockapi.PromoteVolumeResponse{
|
||||
NewPrimary: entry.VolumeServer,
|
||||
Epoch: newEpoch,
|
||||
})
|
||||
}
|
||||
|
||||
// entryToVolumeInfo converts a BlockVolumeEntry to a blockapi.VolumeInfo.
|
||||
func entryToVolumeInfo(e *BlockVolumeEntry) blockapi.VolumeInfo {
|
||||
status := "pending"
|
||||
@@ -239,6 +333,8 @@ func entryToVolumeInfo(e *BlockVolumeEntry) blockapi.VolumeInfo {
|
||||
HealthScore: e.HealthScore,
|
||||
ReplicaDegraded: e.ReplicaDegraded,
|
||||
DurabilityMode: durMode,
|
||||
NvmeAddr: e.NvmeAddr,
|
||||
NQN: e.NQN,
|
||||
}
|
||||
for _, ri := range e.Replicas {
|
||||
info.Replicas = append(info.Replicas, blockapi.ReplicaDetail{
|
||||
|
||||
1581
weed/server/qa_block_cp11b3_adversarial_test.go
Normal file
1581
weed/server/qa_block_cp11b3_adversarial_test.go
Normal file
File diff suppressed because it is too large
Load Diff
@@ -40,6 +40,11 @@ func testMSForQA(t *testing.T) *MasterServer {
|
||||
// registerQAVolume creates a volume entry with optional replica, configurable lease state.
|
||||
func registerQAVolume(t *testing.T, ms *MasterServer, name, primary, replica string, epoch uint64, leaseTTL time.Duration, leaseExpired bool) {
|
||||
t.Helper()
|
||||
// Mark servers as block-capable so promotion Gate 4 (liveness) passes.
|
||||
ms.blockRegistry.MarkBlockCapable(primary)
|
||||
if replica != "" {
|
||||
ms.blockRegistry.MarkBlockCapable(replica)
|
||||
}
|
||||
entry := &BlockVolumeEntry{
|
||||
Name: name,
|
||||
VolumeServer: primary,
|
||||
@@ -65,11 +70,13 @@ func registerQAVolume(t *testing.T, ms *MasterServer, name, primary, replica str
|
||||
// CP8-2: also populate Replicas[].
|
||||
entry.Replicas = []ReplicaInfo{
|
||||
{
|
||||
Server: replica,
|
||||
Path: fmt.Sprintf("/data/%s.blk", name),
|
||||
IQN: fmt.Sprintf("iqn.2024.test:%s-r", name),
|
||||
ISCSIAddr: replica + ":3260",
|
||||
HealthScore: 1.0,
|
||||
Server: replica,
|
||||
Path: fmt.Sprintf("/data/%s.blk", name),
|
||||
IQN: fmt.Sprintf("iqn.2024.test:%s-r", name),
|
||||
ISCSIAddr: replica + ":3260",
|
||||
HealthScore: 1.0,
|
||||
Role: blockvol.RoleToWire(blockvol.RoleReplica),
|
||||
LastHeartbeat: time.Now(),
|
||||
},
|
||||
}
|
||||
}
|
||||
@@ -398,7 +405,15 @@ func TestQA_Failover_PromoteIdempotent_NoReplicaAfterFirstSwap(t *testing.T) {
|
||||
// Reconnect vs1 first so it becomes a replica.
|
||||
ms.recoverBlockVolumes("vs1")
|
||||
|
||||
// Simulate rebuild completion: mark vs1 as a healthy replica.
|
||||
e, _ := ms.blockRegistry.Lookup("vol1")
|
||||
for i := range e.Replicas {
|
||||
if e.Replicas[i].Server == "vs1" {
|
||||
e.Replicas[i].Role = blockvol.RoleToWire(blockvol.RoleReplica)
|
||||
e.Replicas[i].LastHeartbeat = time.Now()
|
||||
e.Replicas[i].HealthScore = 1.0
|
||||
}
|
||||
}
|
||||
e.LastLeaseGrant = time.Now().Add(-1 * time.Minute) // expire the new lease
|
||||
ms.failoverBlockVolumes("vs2")
|
||||
|
||||
|
||||
485
weed/server/qa_block_expand_adversarial_test.go
Normal file
485
weed/server/qa_block_expand_adversarial_test.go
Normal file
@@ -0,0 +1,485 @@
|
||||
package weed_server
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/seaweedfs/seaweedfs/weed/pb/master_pb"
|
||||
"github.com/seaweedfs/seaweedfs/weed/storage/blockvol"
|
||||
)
|
||||
|
||||
// ============================================================
|
||||
// CP11A-2 Adversarial Test Suite: B-09 + B-10
|
||||
//
|
||||
// 8 scenarios stress-testing the coordinated expand path under
|
||||
// failover, concurrent heartbeats, and partial failures.
|
||||
// ============================================================
|
||||
|
||||
// qaExpandMaster creates a MasterServer with 3 block-capable servers
|
||||
// and default expand mocks for adversarial testing.
|
||||
func qaExpandMaster(t *testing.T) *MasterServer {
|
||||
t.Helper()
|
||||
ms := &MasterServer{
|
||||
blockRegistry: NewBlockVolumeRegistry(),
|
||||
blockAssignmentQueue: NewBlockAssignmentQueue(),
|
||||
blockFailover: newBlockFailoverState(),
|
||||
}
|
||||
ms.blockVSAllocate = func(ctx context.Context, server string, name string, sizeBytes uint64, diskType string, durabilityMode string) (*blockAllocResult, error) {
|
||||
return &blockAllocResult{
|
||||
Path: fmt.Sprintf("/data/%s.blk", name),
|
||||
IQN: fmt.Sprintf("iqn.2024.test:%s", name),
|
||||
ISCSIAddr: server + ":3260",
|
||||
ReplicaDataAddr: server + ":14260",
|
||||
ReplicaCtrlAddr: server + ":14261",
|
||||
RebuildListenAddr: server + ":15000",
|
||||
}, nil
|
||||
}
|
||||
ms.blockVSDelete = func(ctx context.Context, server string, name string) error {
|
||||
return nil
|
||||
}
|
||||
ms.blockVSExpand = func(ctx context.Context, server string, name string, newSize uint64) (uint64, error) {
|
||||
return newSize, nil
|
||||
}
|
||||
ms.blockVSPrepareExpand = func(ctx context.Context, server string, name string, newSize, expandEpoch uint64) error {
|
||||
return nil
|
||||
}
|
||||
ms.blockVSCommitExpand = func(ctx context.Context, server string, name string, expandEpoch uint64) (uint64, error) {
|
||||
return 2 << 30, nil
|
||||
}
|
||||
ms.blockVSCancelExpand = func(ctx context.Context, server string, name string, expandEpoch uint64) error {
|
||||
return nil
|
||||
}
|
||||
ms.blockRegistry.MarkBlockCapable("vs1:9333")
|
||||
ms.blockRegistry.MarkBlockCapable("vs2:9333")
|
||||
ms.blockRegistry.MarkBlockCapable("vs3:9333")
|
||||
return ms
|
||||
}
|
||||
|
||||
// qaCreateRF creates a volume with the given replica factor.
|
||||
func qaCreateRF(t *testing.T, ms *MasterServer, name string, rf uint32) {
|
||||
t.Helper()
|
||||
_, err := ms.CreateBlockVolume(context.Background(), &master_pb.CreateBlockVolumeRequest{
|
||||
Name: name,
|
||||
SizeBytes: 1 << 30,
|
||||
ReplicaFactor: rf,
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("create %s RF=%d: %v", name, rf, err)
|
||||
}
|
||||
}
|
||||
|
||||
// ────────────────────────────────────────────────────────────
|
||||
// QA-B09-1: ExpandAfterDoubleFailover_RF3
|
||||
//
|
||||
// RF=3 volume. Primary dies → promote replica A. Then replica A
|
||||
// (now primary) dies → promote replica B. Expand must reach
|
||||
// replica B (the second-generation primary), not the original.
|
||||
// ────────────────────────────────────────────────────────────
|
||||
func TestQA_B09_ExpandAfterDoubleFailover_RF3(t *testing.T) {
|
||||
ms := qaExpandMaster(t)
|
||||
qaCreateRF(t, ms, "dbl-failover", 3)
|
||||
|
||||
entry, _ := ms.blockRegistry.Lookup("dbl-failover")
|
||||
gen0Primary := entry.VolumeServer
|
||||
|
||||
// First failover: kill original primary.
|
||||
ms.blockRegistry.PromoteBestReplica("dbl-failover")
|
||||
entry, _ = ms.blockRegistry.Lookup("dbl-failover")
|
||||
gen1Primary := entry.VolumeServer
|
||||
if gen1Primary == gen0Primary {
|
||||
t.Fatal("first promotion didn't change primary")
|
||||
}
|
||||
|
||||
// Second failover: kill gen1 primary.
|
||||
// Need to ensure the remaining replica has a fresh heartbeat.
|
||||
if len(entry.Replicas) == 0 {
|
||||
t.Fatal("no replicas left after first promotion (need RF=3)")
|
||||
}
|
||||
ms.blockRegistry.PromoteBestReplica("dbl-failover")
|
||||
entry, _ = ms.blockRegistry.Lookup("dbl-failover")
|
||||
gen2Primary := entry.VolumeServer
|
||||
if gen2Primary == gen1Primary || gen2Primary == gen0Primary {
|
||||
t.Fatalf("second promotion should pick a new server, got %q (gen0=%q gen1=%q)",
|
||||
gen2Primary, gen0Primary, gen1Primary)
|
||||
}
|
||||
|
||||
// Track PREPARE targets.
|
||||
var preparedServers []string
|
||||
ms.blockVSPrepareExpand = func(ctx context.Context, server string, name string, newSize, expandEpoch uint64) error {
|
||||
preparedServers = append(preparedServers, server)
|
||||
return nil
|
||||
}
|
||||
|
||||
// Expand — standalone path since no replicas remain after 2 promotions.
|
||||
_, err := ms.ExpandBlockVolume(context.Background(), &master_pb.ExpandBlockVolumeRequest{
|
||||
Name: "dbl-failover", NewSizeBytes: 2 << 30,
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("expand: %v", err)
|
||||
}
|
||||
|
||||
// If standalone path was taken (no replicas), preparedServers is empty — that's fine.
|
||||
// If coordinated path was taken, first PREPARE must target gen2Primary.
|
||||
if len(preparedServers) > 0 && preparedServers[0] != gen2Primary {
|
||||
t.Fatalf("PREPARE went to %q, want gen2 primary %q", preparedServers[0], gen2Primary)
|
||||
}
|
||||
}
|
||||
|
||||
// ────────────────────────────────────────────────────────────
|
||||
// QA-B09-2: ExpandSeesDeletedVolume_AfterLockAcquire
|
||||
//
|
||||
// Volume is deleted between the initial Lookup (succeeds) and
|
||||
// the re-read after AcquireExpandInflight. The re-read must
|
||||
// detect the deletion and fail cleanly.
|
||||
// ────────────────────────────────────────────────────────────
|
||||
func TestQA_B09_ExpandSeesDeletedVolume_AfterLockAcquire(t *testing.T) {
|
||||
ms := qaExpandMaster(t)
|
||||
qaCreateRF(t, ms, "disappear", 2)
|
||||
|
||||
// Hook PREPARE to delete the volume before it runs.
|
||||
// The B-09 re-read happens before PREPARE, so we simulate deletion
|
||||
// between initial Lookup and AcquireExpandInflight by having a
|
||||
// goroutine that deletes the entry while expand is in progress.
|
||||
// Instead, test directly: acquire expand lock, then unregister, then
|
||||
// call ExpandBlockVolume — it should fail on re-read.
|
||||
|
||||
// Acquire expand lock manually first so the real call gets blocked.
|
||||
// Then verify the error path by attempting a second expand.
|
||||
if !ms.blockRegistry.AcquireExpandInflight("disappear", 2<<30, 1) {
|
||||
t.Fatal("AcquireExpandInflight should succeed")
|
||||
}
|
||||
|
||||
// Try another expand while locked — should fail with "already in progress".
|
||||
_, err := ms.ExpandBlockVolume(context.Background(), &master_pb.ExpandBlockVolumeRequest{
|
||||
Name: "disappear", NewSizeBytes: 2 << 30,
|
||||
})
|
||||
if err == nil {
|
||||
t.Fatal("expand should fail when lock is held")
|
||||
}
|
||||
|
||||
// Release and delete the volume.
|
||||
ms.blockRegistry.ReleaseExpandInflight("disappear")
|
||||
ms.blockRegistry.Unregister("disappear")
|
||||
|
||||
// Now expand on a deleted volume — should fail on initial Lookup.
|
||||
_, err = ms.ExpandBlockVolume(context.Background(), &master_pb.ExpandBlockVolumeRequest{
|
||||
Name: "disappear", NewSizeBytes: 2 << 30,
|
||||
})
|
||||
if err == nil {
|
||||
t.Fatal("expand on deleted volume should fail")
|
||||
}
|
||||
}
|
||||
|
||||
// ────────────────────────────────────────────────────────────
|
||||
// QA-B09-3: ConcurrentExpandAndFailover
|
||||
//
|
||||
// Expand and failover race on the same volume. Neither should
|
||||
// panic, and the volume must be in a consistent state afterward.
|
||||
// ────────────────────────────────────────────────────────────
|
||||
func TestQA_B09_ConcurrentExpandAndFailover(t *testing.T) {
|
||||
ms := qaExpandMaster(t)
|
||||
qaCreateRF(t, ms, "race-vol", 3)
|
||||
|
||||
entry, _ := ms.blockRegistry.Lookup("race-vol")
|
||||
primary := entry.VolumeServer
|
||||
|
||||
// Make PREPARE slow so expand holds the lock longer.
|
||||
ms.blockVSPrepareExpand = func(ctx context.Context, server string, name string, newSize, expandEpoch uint64) error {
|
||||
time.Sleep(5 * time.Millisecond)
|
||||
return nil
|
||||
}
|
||||
|
||||
var wg sync.WaitGroup
|
||||
|
||||
// Goroutine 1: expand.
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
ms.ExpandBlockVolume(context.Background(), &master_pb.ExpandBlockVolumeRequest{
|
||||
Name: "race-vol", NewSizeBytes: 2 << 30,
|
||||
})
|
||||
// Error is OK — we're testing for panics and consistency.
|
||||
}()
|
||||
|
||||
// Goroutine 2: failover kills primary.
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
time.Sleep(2 * time.Millisecond) // slight delay to let expand start
|
||||
ms.failoverBlockVolumes(primary)
|
||||
}()
|
||||
|
||||
wg.Wait()
|
||||
|
||||
// Volume must still exist regardless of outcome.
|
||||
_, ok := ms.blockRegistry.Lookup("race-vol")
|
||||
if !ok {
|
||||
t.Fatal("volume must survive concurrent expand + failover")
|
||||
}
|
||||
}
|
||||
|
||||
// ────────────────────────────────────────────────────────────
|
||||
// QA-B09-4: ConcurrentExpandsSameVolume
|
||||
//
|
||||
// Two goroutines try to expand the same volume simultaneously.
|
||||
// Exactly one should succeed, the other should get "already in
|
||||
// progress". No panic, no double-commit.
|
||||
// ────────────────────────────────────────────────────────────
|
||||
func TestQA_B09_ConcurrentExpandsSameVolume(t *testing.T) {
|
||||
ms := qaExpandMaster(t)
|
||||
qaCreateRF(t, ms, "dup-expand", 2)
|
||||
|
||||
var commitCount atomic.Int32
|
||||
ms.blockVSPrepareExpand = func(ctx context.Context, server string, name string, newSize, expandEpoch uint64) error {
|
||||
time.Sleep(5 * time.Millisecond) // slow prepare
|
||||
return nil
|
||||
}
|
||||
ms.blockVSCommitExpand = func(ctx context.Context, server string, name string, expandEpoch uint64) (uint64, error) {
|
||||
commitCount.Add(1)
|
||||
return 2 << 30, nil
|
||||
}
|
||||
|
||||
var wg sync.WaitGroup
|
||||
var successes atomic.Int32
|
||||
var failures atomic.Int32
|
||||
|
||||
for i := 0; i < 2; i++ {
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
_, err := ms.ExpandBlockVolume(context.Background(), &master_pb.ExpandBlockVolumeRequest{
|
||||
Name: "dup-expand", NewSizeBytes: 2 << 30,
|
||||
})
|
||||
if err == nil {
|
||||
successes.Add(1)
|
||||
} else {
|
||||
failures.Add(1)
|
||||
}
|
||||
}()
|
||||
}
|
||||
wg.Wait()
|
||||
|
||||
if successes.Load() != 1 {
|
||||
t.Fatalf("expected exactly 1 success, got %d", successes.Load())
|
||||
}
|
||||
if failures.Load() != 1 {
|
||||
t.Fatalf("expected exactly 1 failure (already in progress), got %d", failures.Load())
|
||||
}
|
||||
}
|
||||
|
||||
// ────────────────────────────────────────────────────────────
|
||||
// QA-B10-1: RepeatedEmptyHeartbeats_DuringExpand
|
||||
//
|
||||
// Multiple empty heartbeats from the primary during expand.
|
||||
// Entry must survive all of them — not just the first.
|
||||
// ────────────────────────────────────────────────────────────
|
||||
func TestQA_B10_RepeatedEmptyHeartbeats_DuringExpand(t *testing.T) {
|
||||
ms := qaExpandMaster(t)
|
||||
qaCreateRF(t, ms, "multi-hb", 2)
|
||||
|
||||
entry, _ := ms.blockRegistry.Lookup("multi-hb")
|
||||
primary := entry.VolumeServer
|
||||
|
||||
if !ms.blockRegistry.AcquireExpandInflight("multi-hb", 2<<30, 42) {
|
||||
t.Fatal("acquire expand lock")
|
||||
}
|
||||
|
||||
// 10 empty heartbeats from the primary — each one would delete
|
||||
// the entry without the B-10 guard.
|
||||
for i := 0; i < 10; i++ {
|
||||
ms.blockRegistry.UpdateFullHeartbeat(primary, []*master_pb.BlockVolumeInfoMessage{})
|
||||
}
|
||||
|
||||
_, ok := ms.blockRegistry.Lookup("multi-hb")
|
||||
if !ok {
|
||||
t.Fatal("entry deleted after repeated empty heartbeats during expand")
|
||||
}
|
||||
|
||||
ms.blockRegistry.ReleaseExpandInflight("multi-hb")
|
||||
}
|
||||
|
||||
// ────────────────────────────────────────────────────────────
|
||||
// QA-B10-2: ExpandFailed_HeartbeatStillProtected
|
||||
//
|
||||
// After MarkExpandFailed (primary committed, replica didn't),
|
||||
// empty heartbeats must NOT delete the entry. ExpandFailed
|
||||
// keeps ExpandInProgress=true as a size-suppression guard.
|
||||
// ────────────────────────────────────────────────────────────
|
||||
func TestQA_B10_ExpandFailed_HeartbeatStillProtected(t *testing.T) {
|
||||
ms := qaExpandMaster(t)
|
||||
qaCreateRF(t, ms, "fail-hb", 2)
|
||||
|
||||
entry, _ := ms.blockRegistry.Lookup("fail-hb")
|
||||
primary := entry.VolumeServer
|
||||
|
||||
if !ms.blockRegistry.AcquireExpandInflight("fail-hb", 2<<30, 42) {
|
||||
t.Fatal("acquire expand lock")
|
||||
}
|
||||
ms.blockRegistry.MarkExpandFailed("fail-hb")
|
||||
|
||||
// Empty heartbeat should not delete — ExpandFailed keeps ExpandInProgress=true.
|
||||
ms.blockRegistry.UpdateFullHeartbeat(primary, []*master_pb.BlockVolumeInfoMessage{})
|
||||
|
||||
e, ok := ms.blockRegistry.Lookup("fail-hb")
|
||||
if !ok {
|
||||
t.Fatal("entry deleted during ExpandFailed state")
|
||||
}
|
||||
if !e.ExpandFailed {
|
||||
t.Fatal("ExpandFailed should still be true")
|
||||
}
|
||||
if !e.ExpandInProgress {
|
||||
t.Fatal("ExpandInProgress should still be true")
|
||||
}
|
||||
|
||||
// After ClearExpandFailed, empty heartbeat should delete normally.
|
||||
ms.blockRegistry.ClearExpandFailed("fail-hb")
|
||||
ms.blockRegistry.UpdateFullHeartbeat(primary, []*master_pb.BlockVolumeInfoMessage{})
|
||||
|
||||
_, ok = ms.blockRegistry.Lookup("fail-hb")
|
||||
if ok {
|
||||
t.Fatal("entry should be deleted after ClearExpandFailed + empty heartbeat")
|
||||
}
|
||||
}
|
||||
|
||||
// ────────────────────────────────────────────────────────────
|
||||
// QA-B10-3: HeartbeatSizeSuppress_DuringExpand
|
||||
//
|
||||
// Primary reports a stale (old) size during coordinated expand.
|
||||
// Registry must NOT downgrade SizeBytes — the pending expand
|
||||
// size is authoritative until commit or release.
|
||||
// ────────────────────────────────────────────────────────────
|
||||
func TestQA_B10_HeartbeatSizeSuppress_DuringExpand(t *testing.T) {
|
||||
ms := qaExpandMaster(t)
|
||||
qaCreateRF(t, ms, "size-suppress", 2)
|
||||
|
||||
entry, _ := ms.blockRegistry.Lookup("size-suppress")
|
||||
primary := entry.VolumeServer
|
||||
origSize := entry.SizeBytes
|
||||
|
||||
if !ms.blockRegistry.AcquireExpandInflight("size-suppress", 2<<30, 42) {
|
||||
t.Fatal("acquire expand lock")
|
||||
}
|
||||
|
||||
// Heartbeat reports old size (expand hasn't committed on VS yet).
|
||||
ms.blockRegistry.UpdateFullHeartbeat(primary, []*master_pb.BlockVolumeInfoMessage{
|
||||
{
|
||||
Path: "/data/size-suppress.blk",
|
||||
VolumeSize: origSize, // old size
|
||||
Epoch: 1,
|
||||
Role: blockvol.RoleToWire(blockvol.RolePrimary),
|
||||
},
|
||||
})
|
||||
|
||||
entry, _ = ms.blockRegistry.Lookup("size-suppress")
|
||||
if entry.SizeBytes != origSize {
|
||||
t.Fatalf("size should remain %d during expand, got %d", origSize, entry.SizeBytes)
|
||||
}
|
||||
|
||||
// Heartbeat reports a LARGER size (stale from previous expand or bug).
|
||||
// Still must not update — coordinated expand owns the size.
|
||||
ms.blockRegistry.UpdateFullHeartbeat(primary, []*master_pb.BlockVolumeInfoMessage{
|
||||
{
|
||||
Path: "/data/size-suppress.blk",
|
||||
VolumeSize: 5 << 30, // bogus large size
|
||||
Epoch: 1,
|
||||
Role: blockvol.RoleToWire(blockvol.RolePrimary),
|
||||
},
|
||||
})
|
||||
|
||||
entry, _ = ms.blockRegistry.Lookup("size-suppress")
|
||||
if entry.SizeBytes != origSize {
|
||||
t.Fatalf("size should remain %d (suppressed), got %d", origSize, entry.SizeBytes)
|
||||
}
|
||||
|
||||
ms.blockRegistry.ReleaseExpandInflight("size-suppress")
|
||||
}
|
||||
|
||||
// ────────────────────────────────────────────────────────────
|
||||
// QA-B10-4: ConcurrentHeartbeatsAndExpand
|
||||
//
|
||||
// Simultaneous full heartbeats from primary and replicas while
|
||||
// expand runs on another goroutine. Must not panic, must not
|
||||
// orphan the entry, and expand must either succeed or fail
|
||||
// cleanly with a clear error.
|
||||
// ────────────────────────────────────────────────────────────
|
||||
func TestQA_B10_ConcurrentHeartbeatsAndExpand(t *testing.T) {
|
||||
ms := qaExpandMaster(t)
|
||||
qaCreateRF(t, ms, "hb-expand-race", 2)
|
||||
|
||||
entry, _ := ms.blockRegistry.Lookup("hb-expand-race")
|
||||
primary := entry.VolumeServer
|
||||
replica := ""
|
||||
if len(entry.Replicas) > 0 {
|
||||
replica = entry.Replicas[0].Server
|
||||
}
|
||||
|
||||
ms.blockVSPrepareExpand = func(ctx context.Context, server string, name string, newSize, expandEpoch uint64) error {
|
||||
time.Sleep(2 * time.Millisecond)
|
||||
return nil
|
||||
}
|
||||
|
||||
var wg sync.WaitGroup
|
||||
const rounds = 30
|
||||
|
||||
// Goroutine 1: expand.
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
ms.ExpandBlockVolume(context.Background(), &master_pb.ExpandBlockVolumeRequest{
|
||||
Name: "hb-expand-race", NewSizeBytes: 2 << 30,
|
||||
})
|
||||
}()
|
||||
|
||||
// Goroutine 2: primary heartbeats (mix of reporting and not reporting).
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
for i := 0; i < rounds; i++ {
|
||||
if i%5 == 0 {
|
||||
// Every 5th: empty heartbeat (simulates brief restart).
|
||||
ms.blockRegistry.UpdateFullHeartbeat(primary, []*master_pb.BlockVolumeInfoMessage{})
|
||||
} else {
|
||||
ms.blockRegistry.UpdateFullHeartbeat(primary, []*master_pb.BlockVolumeInfoMessage{
|
||||
{
|
||||
Path: "/data/hb-expand-race.blk",
|
||||
VolumeSize: 1 << 30,
|
||||
Epoch: 1,
|
||||
Role: blockvol.RoleToWire(blockvol.RolePrimary),
|
||||
WalHeadLsn: uint64(100 + i),
|
||||
},
|
||||
})
|
||||
}
|
||||
}
|
||||
}()
|
||||
|
||||
// Goroutine 3: replica heartbeats.
|
||||
if replica != "" {
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
for i := 0; i < rounds; i++ {
|
||||
ms.blockRegistry.UpdateFullHeartbeat(replica, []*master_pb.BlockVolumeInfoMessage{
|
||||
{
|
||||
Path: "/data/hb-expand-race.blk",
|
||||
VolumeSize: 1 << 30,
|
||||
Epoch: 1,
|
||||
Role: blockvol.RoleToWire(blockvol.RoleReplica),
|
||||
WalHeadLsn: uint64(99 + i),
|
||||
},
|
||||
})
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
wg.Wait()
|
||||
|
||||
// Volume must still exist — no orphan.
|
||||
_, ok := ms.blockRegistry.Lookup("hb-expand-race")
|
||||
if !ok {
|
||||
t.Fatal("volume must survive concurrent heartbeats + expand")
|
||||
}
|
||||
}
|
||||
1346
weed/server/qa_block_nvme_publication_test.go
Normal file
1346
weed/server/qa_block_nvme_publication_test.go
Normal file
File diff suppressed because it is too large
Load Diff
@@ -136,6 +136,61 @@ func (c *Client) ExpandVolume(ctx context.Context, name string, newSizeBytes uin
|
||||
return out.CapacityBytes, nil
|
||||
}
|
||||
|
||||
// PromoteVolume triggers a manual promotion for a block volume.
|
||||
func (c *Client) PromoteVolume(ctx context.Context, name string, req PromoteVolumeRequest) (*PromoteVolumeResponse, error) {
|
||||
body, err := json.Marshal(req)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("marshal request: %w", err)
|
||||
}
|
||||
resp, err := c.doRequest(ctx, http.MethodPost, "/block/volume/"+name+"/promote", bytes.NewReader(body))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
if err := checkStatus(resp, http.StatusOK); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
var out PromoteVolumeResponse
|
||||
if err := json.NewDecoder(resp.Body).Decode(&out); err != nil {
|
||||
return nil, fmt.Errorf("decode response: %w", err)
|
||||
}
|
||||
return &out, nil
|
||||
}
|
||||
|
||||
// BlockStatus fetches the block registry status metrics.
|
||||
func (c *Client) BlockStatus(ctx context.Context) (*BlockStatusResponse, error) {
|
||||
resp, err := c.doRequest(ctx, http.MethodGet, "/block/status", nil)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
if err := checkStatus(resp, http.StatusOK); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
var out BlockStatusResponse
|
||||
if err := json.NewDecoder(resp.Body).Decode(&out); err != nil {
|
||||
return nil, fmt.Errorf("decode response: %w", err)
|
||||
}
|
||||
return &out, nil
|
||||
}
|
||||
|
||||
// Preflight returns the promotion preflight evaluation for a block volume.
|
||||
func (c *Client) Preflight(ctx context.Context, name string) (*PreflightResponse, error) {
|
||||
resp, err := c.doRequest(ctx, http.MethodGet, "/block/volume/"+name+"/preflight", nil)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
if err := checkStatus(resp, http.StatusOK); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
var out PreflightResponse
|
||||
if err := json.NewDecoder(resp.Body).Decode(&out); err != nil {
|
||||
return nil, fmt.Errorf("decode response: %w", err)
|
||||
}
|
||||
return &out, nil
|
||||
}
|
||||
|
||||
// ListServers lists all block-capable volume servers.
|
||||
func (c *Client) ListServers(ctx context.Context) ([]ServerInfo, error) {
|
||||
resp, err := c.doRequest(ctx, http.MethodGet, "/block/servers", nil)
|
||||
|
||||
@@ -38,6 +38,8 @@ type VolumeInfo struct {
|
||||
HealthScore float64 `json:"health_score"`
|
||||
ReplicaDegraded bool `json:"replica_degraded,omitempty"`
|
||||
DurabilityMode string `json:"durability_mode"` // CP8-3-1
|
||||
NvmeAddr string `json:"nvme_addr,omitempty"`
|
||||
NQN string `json:"nqn,omitempty"`
|
||||
}
|
||||
|
||||
// ReplicaDetail describes one replica in the API response.
|
||||
@@ -74,6 +76,52 @@ type ExpandVolumeResponse struct {
|
||||
CapacityBytes uint64 `json:"capacity_bytes"`
|
||||
}
|
||||
|
||||
// PromoteVolumeRequest is the request body for POST /block/volume/{name}/promote.
|
||||
type PromoteVolumeRequest struct {
	// TargetServer names a specific replica to promote; empty means auto.
	TargetServer string `json:"target_server,omitempty"`
	// Force bypasses the soft safety checks.
	Force bool `json:"force,omitempty"`
	// Reason is a free-form audit note.
	Reason string `json:"reason,omitempty"`
}
|
||||
|
||||
// PromoteVolumeResponse is the response for POST /block/volume/{name}/promote.
|
||||
type PromoteVolumeResponse struct {
|
||||
NewPrimary string `json:"new_primary"`
|
||||
Epoch uint64 `json:"epoch"`
|
||||
Reason string `json:"reason,omitempty"` // rejection reason if failed
|
||||
Rejections []PreflightRejection `json:"rejections,omitempty"` // per-replica rejection details
|
||||
}
|
||||
|
||||
// BlockStatusResponse is the response for GET /block/status.
|
||||
type BlockStatusResponse struct {
	// Registry population.
	VolumeCount int `json:"volume_count"`
	ServerCount int `json:"server_count"`

	// LSN-based settings and lag.
	PromotionLSNTolerance uint64 `json:"promotion_lsn_tolerance"`
	BarrierLagLSN         uint64 `json:"barrier_lag_lsn"`

	// Event totals.
	PromotionsTotal int64 `json:"promotions_total"`
	FailoversTotal  int64 `json:"failovers_total"`
	RebuildsTotal   int64 `json:"rebuilds_total"`

	AssignmentQueueDepth int `json:"assignment_queue_depth"`
}
|
||||
|
||||
// PreflightRejection describes why a specific replica was rejected for promotion.
|
||||
type PreflightRejection struct {
	Server string `json:"server"`
	// Reason is one of "stale_heartbeat", "wal_lag", "wrong_role",
	// "server_dead", "no_heartbeat".
	Reason string `json:"reason"`
}
|
||||
|
||||
// PreflightResponse is the response for GET /block/volume/{name}/preflight.
|
||||
type PreflightResponse struct {
|
||||
VolumeName string `json:"volume_name"`
|
||||
Promotable bool `json:"promotable"`
|
||||
Reason string `json:"reason,omitempty"`
|
||||
CandidateServer string `json:"candidate_server,omitempty"`
|
||||
CandidateHealth float64 `json:"candidate_health,omitempty"`
|
||||
CandidateWALLSN uint64 `json:"candidate_wal_lsn,omitempty"`
|
||||
Rejections []PreflightRejection `json:"rejections,omitempty"`
|
||||
PrimaryServer string `json:"primary_server"`
|
||||
PrimaryAlive bool `json:"primary_alive"`
|
||||
}
|
||||
|
||||
// RoleFromString converts a role string to its uint32 wire value.
|
||||
// Returns 0 (RoleNone) for unrecognized strings.
|
||||
func RoleFromString(s string) uint32 {
|
||||
|
||||
511
weed/storage/blockvol/qa_wal_cp11a3_adversarial_test.go
Normal file
511
weed/storage/blockvol/qa_wal_cp11a3_adversarial_test.go
Normal file
@@ -0,0 +1,511 @@
|
||||
package blockvol
|
||||
|
||||
import (
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
// ============================================================
|
||||
// CP11A-3 Adversarial Test Suite
|
||||
//
|
||||
// 10 scenarios stress-testing WAL admission pressure tracking,
|
||||
// PressureState boundaries, guidance edge cases, and concurrent
|
||||
// metric visibility.
|
||||
// ============================================================
|
||||
|
||||
// ────────────────────────────────────────────────────────────
|
||||
// QA-CP11A3-1: SoftMarkEqualsHardMark_NoPanic
|
||||
//
|
||||
// If an operator configures softMark == hardMark, the soft-zone
|
||||
// delay calculation divides by (hardMark - softMark) = 0.
|
||||
// Must not panic, hang, or produce NaN/Inf delay.
|
||||
// ────────────────────────────────────────────────────────────
|
||||
func TestQA_CP11A3_SoftMarkEqualsHardMark_NoPanic(t *testing.T) {
|
||||
m := NewEngineMetrics()
|
||||
|
||||
a := NewWALAdmission(WALAdmissionConfig{
|
||||
MaxConcurrent: 16,
|
||||
SoftWatermark: 0.8,
|
||||
HardWatermark: 0.8, // equal — no soft zone
|
||||
WALUsedFn: func() float64 { return 0.85 }, // above both marks
|
||||
NotifyFn: func() {},
|
||||
ClosedFn: func() bool { return false },
|
||||
Metrics: m,
|
||||
})
|
||||
|
||||
// With equal marks, pressure >= hardMark takes the hard branch.
|
||||
// The soft branch's division by zero is never reached.
|
||||
// But if the code path ever changes, this test catches it.
|
||||
done := make(chan error, 1)
|
||||
go func() {
|
||||
done <- a.Acquire(50 * time.Millisecond)
|
||||
}()
|
||||
|
||||
select {
|
||||
case err := <-done:
|
||||
// ErrWALFull is expected (pressure stays above hard, times out).
|
||||
if err != ErrWALFull {
|
||||
t.Fatalf("expected ErrWALFull, got %v", err)
|
||||
}
|
||||
case <-time.After(2 * time.Second):
|
||||
t.Fatal("Acquire hung — possible Inf delay from division by zero")
|
||||
}
|
||||
}
|
||||
|
||||
// ────────────────────────────────────────────────────────────
|
||||
// QA-CP11A3-2: SoftZoneExactBoundary_DelayIsZero
|
||||
//
|
||||
// When pressure == softMark exactly, scale = 0, delay = 0.
|
||||
// softPressureWaitNs should NOT increase (delay <= 0 skips sleep).
|
||||
// But hitSoft should still be true → SoftAdmitTotal increments.
|
||||
// ────────────────────────────────────────────────────────────
|
||||
func TestQA_CP11A3_SoftZoneExactBoundary_DelayIsZero(t *testing.T) {
|
||||
m := NewEngineMetrics()
|
||||
|
||||
a := NewWALAdmission(WALAdmissionConfig{
|
||||
MaxConcurrent: 16,
|
||||
SoftWatermark: 0.7,
|
||||
HardWatermark: 0.9,
|
||||
WALUsedFn: func() float64 { return 0.7 }, // exactly at soft mark
|
||||
NotifyFn: func() {},
|
||||
ClosedFn: func() bool { return false },
|
||||
Metrics: m,
|
||||
})
|
||||
a.sleepFn = func(d time.Duration) {
|
||||
t.Fatalf("sleep should not be called when delay=0, but called with %v", d)
|
||||
}
|
||||
|
||||
if err := a.Acquire(100 * time.Millisecond); err != nil {
|
||||
t.Fatalf("Acquire: %v", err)
|
||||
}
|
||||
a.Release()
|
||||
|
||||
// SoftAdmitTotal should increment (we entered the soft branch).
|
||||
if m.WALAdmitSoftTotal.Load() != 1 {
|
||||
t.Fatalf("WALAdmitSoftTotal = %d, want 1", m.WALAdmitSoftTotal.Load())
|
||||
}
|
||||
// But no sleep → softPressureWaitNs stays 0.
|
||||
if a.SoftPressureWaitNs() != 0 {
|
||||
t.Fatalf("SoftPressureWaitNs = %d, want 0 (no delay at exact boundary)", a.SoftPressureWaitNs())
|
||||
}
|
||||
}
|
||||
|
||||
// ────────────────────────────────────────────────────────────
|
||||
// QA-CP11A3-3: ConcurrentHardWaiters_TimeAccumulates
|
||||
//
|
||||
// 8 goroutines enter the hard zone simultaneously and wait there
|
||||
// (the stubbed sleep takes 1ms) until pressure drops. All 8 must be
|
||||
// counted in WALAdmitHardTotal and hardPressureWaitNs must end up > 0.
|
||||
// ────────────────────────────────────────────────────────────
|
||||
func TestQA_CP11A3_ConcurrentHardWaiters_TimeAccumulates(t *testing.T) {
|
||||
m := NewEngineMetrics()
|
||||
var pressure atomic.Int64
|
||||
pressure.Store(95) // above hard mark
|
||||
|
||||
a := NewWALAdmission(WALAdmissionConfig{
|
||||
MaxConcurrent: 16,
|
||||
SoftWatermark: 0.7,
|
||||
HardWatermark: 0.9,
|
||||
WALUsedFn: func() float64 { return float64(pressure.Load()) / 100.0 },
|
||||
NotifyFn: func() {},
|
||||
ClosedFn: func() bool { return false },
|
||||
Metrics: m,
|
||||
})
|
||||
|
||||
var sleepCalls atomic.Int64
|
||||
a.sleepFn = func(d time.Duration) {
|
||||
time.Sleep(1 * time.Millisecond)
|
||||
// After enough total sleeps across all goroutines, drop pressure.
|
||||
if sleepCalls.Add(1) >= 20 {
|
||||
pressure.Store(50)
|
||||
}
|
||||
}
|
||||
|
||||
const workers = 8
|
||||
var wg sync.WaitGroup
|
||||
for i := 0; i < workers; i++ {
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
if err := a.Acquire(5 * time.Second); err != nil {
|
||||
t.Errorf("Acquire: %v", err)
|
||||
}
|
||||
a.Release()
|
||||
}()
|
||||
}
|
||||
wg.Wait()
|
||||
|
||||
// All 8 must have entered hard zone.
|
||||
if m.WALAdmitHardTotal.Load() < uint64(workers) {
|
||||
t.Fatalf("WALAdmitHardTotal = %d, want >= %d", m.WALAdmitHardTotal.Load(), workers)
|
||||
}
|
||||
// Accumulated hard wait should be > 0, reflecting contributions from all goroutines.
|
||||
if a.HardPressureWaitNs() <= 0 {
|
||||
t.Fatal("HardPressureWaitNs should be > 0 after concurrent hard-zone waits")
|
||||
}
|
||||
}
|
||||
|
||||
// ────────────────────────────────────────────────────────────
|
||||
// QA-CP11A3-4: PressureStateAndAcquireRace
|
||||
//
|
||||
// One goroutine oscillates walUsed, another reads PressureState
|
||||
// rapidly. Must not panic, must always return a valid state.
|
||||
// ────────────────────────────────────────────────────────────
|
||||
func TestQA_CP11A3_PressureStateAndAcquireRace(t *testing.T) {
|
||||
var pressure atomic.Int64
|
||||
pressure.Store(50)
|
||||
|
||||
a := NewWALAdmission(WALAdmissionConfig{
|
||||
MaxConcurrent: 16,
|
||||
SoftWatermark: 0.7,
|
||||
HardWatermark: 0.9,
|
||||
WALUsedFn: func() float64 { return float64(pressure.Load()) / 100.0 },
|
||||
NotifyFn: func() {},
|
||||
ClosedFn: func() bool { return false },
|
||||
Metrics: NewEngineMetrics(),
|
||||
})
|
||||
a.sleepFn = func(d time.Duration) { time.Sleep(100 * time.Microsecond) }
|
||||
|
||||
var wg sync.WaitGroup
|
||||
const rounds = 200
|
||||
|
||||
// Goroutine 1: oscillate pressure.
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
levels := []int64{30, 75, 95, 50, 80, 92, 10}
|
||||
for i := 0; i < rounds; i++ {
|
||||
pressure.Store(levels[i%len(levels)])
|
||||
}
|
||||
}()
|
||||
|
||||
// Goroutine 2: read PressureState.
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
valid := map[string]bool{"normal": true, "soft": true, "hard": true}
|
||||
for i := 0; i < rounds; i++ {
|
||||
s := a.PressureState()
|
||||
if !valid[s] {
|
||||
t.Errorf("PressureState() = %q — not a valid state", s)
|
||||
return
|
||||
}
|
||||
}
|
||||
}()
|
||||
|
||||
// Goroutine 3: Acquire/Release rapidly.
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
for i := 0; i < rounds/2; i++ {
|
||||
err := a.Acquire(20 * time.Millisecond)
|
||||
if err == nil {
|
||||
a.Release()
|
||||
}
|
||||
}
|
||||
}()
|
||||
|
||||
wg.Wait()
|
||||
}
|
||||
|
||||
// ────────────────────────────────────────────────────────────
|
||||
// QA-CP11A3-5: TimeInZoneMonotonicity
|
||||
//
|
||||
// softPressureWaitNs and hardPressureWaitNs must be monotonically
|
||||
// non-decreasing across reads, even under concurrent writes.
|
||||
// ────────────────────────────────────────────────────────────
|
||||
func TestQA_CP11A3_TimeInZoneMonotonicity(t *testing.T) {
|
||||
m := NewEngineMetrics()
|
||||
var pressure atomic.Int64
|
||||
pressure.Store(80) // soft zone
|
||||
|
||||
a := NewWALAdmission(WALAdmissionConfig{
|
||||
MaxConcurrent: 16,
|
||||
SoftWatermark: 0.7,
|
||||
HardWatermark: 0.9,
|
||||
WALUsedFn: func() float64 { return float64(pressure.Load()) / 100.0 },
|
||||
NotifyFn: func() {},
|
||||
ClosedFn: func() bool { return false },
|
||||
Metrics: m,
|
||||
})
|
||||
a.sleepFn = func(d time.Duration) { time.Sleep(100 * time.Microsecond) }
|
||||
|
||||
var wg sync.WaitGroup
|
||||
const writers = 4
|
||||
const rounds = 30
|
||||
|
||||
// Writers produce soft-zone and hard-zone waits.
|
||||
for i := 0; i < writers; i++ {
|
||||
wg.Add(1)
|
||||
go func(id int) {
|
||||
defer wg.Done()
|
||||
for j := 0; j < rounds; j++ {
|
||||
if j%5 == 0 {
|
||||
pressure.Store(95) // hard
|
||||
} else {
|
||||
pressure.Store(80) // soft
|
||||
}
|
||||
err := a.Acquire(50 * time.Millisecond)
|
||||
if err == nil {
|
||||
a.Release()
|
||||
}
|
||||
// Drop back so next Acquire can succeed.
|
||||
pressure.Store(50)
|
||||
}
|
||||
}(i)
|
||||
}
|
||||
|
||||
// Reader checks monotonicity.
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
var prevSoft, prevHard int64
|
||||
for i := 0; i < rounds*writers; i++ {
|
||||
soft := a.SoftPressureWaitNs()
|
||||
hard := a.HardPressureWaitNs()
|
||||
if soft < prevSoft {
|
||||
t.Errorf("SoftPressureWaitNs decreased: %d -> %d", prevSoft, soft)
|
||||
}
|
||||
if hard < prevHard {
|
||||
t.Errorf("HardPressureWaitNs decreased: %d -> %d", prevHard, hard)
|
||||
}
|
||||
prevSoft = soft
|
||||
prevHard = hard
|
||||
}
|
||||
}()
|
||||
|
||||
wg.Wait()
|
||||
}
|
||||
|
||||
// ────────────────────────────────────────────────────────────
|
||||
// QA-CP11A3-6: WALGuidance_ZeroInputs
|
||||
//
|
||||
// Zero walSize, zero blockSize, zero maxConcurrent, empty hint.
|
||||
// Must not panic or produce invalid results.
|
||||
// ────────────────────────────────────────────────────────────
|
||||
func TestQA_CP11A3_WALGuidance_ZeroInputs(t *testing.T) {
|
||||
// All zeros.
|
||||
r := WALSizingGuidance(0, 0, "")
|
||||
if r.Level != "warn" {
|
||||
t.Errorf("zero walSize: Level = %q, want warn", r.Level)
|
||||
}
|
||||
|
||||
// Zero blockSize: absMin = 0*64 = 0. Only workload minimum check fires.
|
||||
r = WALSizingGuidance(0, 0, WorkloadGeneral)
|
||||
if r.Level != "warn" {
|
||||
t.Errorf("zero walSize+blockSize: Level = %q, want warn", r.Level)
|
||||
}
|
||||
|
||||
// Zero walSize but nonzero blockSize.
|
||||
r = WALSizingGuidance(0, 4096, WorkloadDatabase)
|
||||
if r.Level != "warn" {
|
||||
t.Errorf("zero walSize: Level = %q, want warn", r.Level)
|
||||
}
|
||||
if len(r.Warnings) < 2 {
|
||||
t.Errorf("expected both workload + absolute minimum warnings, got %d", len(r.Warnings))
|
||||
}
|
||||
|
||||
// EvaluateWALConfig with zero maxConcurrent should not trigger concurrency warning.
|
||||
r = EvaluateWALConfig(0, 4096, 0, WorkloadGeneral)
|
||||
// walSize=0 still triggers sizing warning.
|
||||
if r.Level != "warn" {
|
||||
t.Errorf("Level = %q, want warn for zero walSize", r.Level)
|
||||
}
|
||||
}
|
||||
|
||||
// ────────────────────────────────────────────────────────────
|
||||
// QA-CP11A3-7: WALGuidance_OverflowSafe
|
||||
//
|
||||
// Very large blockSize × minWALEntries might overflow uint64.
|
||||
// (64 × 2^60 = 2^66 WOULD overflow; the inputs below stay at 64 × 2^40 = 2^46.)
|
||||
// ────────────────────────────────────────────────────────────
|
||||
func TestQA_CP11A3_WALGuidance_OverflowSafe(t *testing.T) {
|
||||
// Large blockSize: 256MB blocks × 64 = 16GB minimum.
|
||||
// walSize = 1GB → should warn (16GB > 1GB).
|
||||
r := WALSizingGuidance(1<<30, 256<<20, WorkloadGeneral)
|
||||
if r.Level != "warn" {
|
||||
t.Errorf("Level = %q, want warn (1GB WAL < 16GB absMin)", r.Level)
|
||||
}
|
||||
|
||||
// Extreme: blockSize = 1<<40 (1TB). 64 × 1TB = 64TB.
|
||||
// uint64 can hold 18 EB — no overflow.
|
||||
r = WALSizingGuidance(1<<50, 1<<40, WorkloadThroughput)
|
||||
// 1PB WAL with 1TB blocks: absMin = 64TB, 1PB > 64TB → ok for absolute.
|
||||
// 1PB > 128MB (throughput min) → ok for workload.
|
||||
if r.Level != "ok" {
|
||||
t.Errorf("Level = %q, want ok for huge WAL", r.Level)
|
||||
}
|
||||
}
|
||||
|
||||
// ────────────────────────────────────────────────────────────
|
||||
// QA-CP11A3-8: WALStatusSnapshot_PartialInit
|
||||
//
|
||||
// BlockVol with Metrics but nil walAdmission, and vice versa.
|
||||
// WALStatus must return coherent defaults for the nil side
|
||||
// and real values for the non-nil side.
|
||||
// ────────────────────────────────────────────────────────────
|
||||
func TestQA_CP11A3_WALStatusSnapshot_PartialInit(t *testing.T) {
|
||||
// Case 1: Metrics set, walAdmission nil.
|
||||
m := NewEngineMetrics()
|
||||
m.WALAdmitSoftTotal.Add(42)
|
||||
m.WALAdmitHardTotal.Add(7)
|
||||
vol1 := &BlockVol{Metrics: m}
|
||||
|
||||
ws := vol1.WALStatus()
|
||||
if ws.PressureState != "normal" {
|
||||
t.Errorf("nil admission: PressureState = %q, want normal", ws.PressureState)
|
||||
}
|
||||
if ws.SoftAdmitTotal != 42 {
|
||||
t.Errorf("SoftAdmitTotal = %d, want 42", ws.SoftAdmitTotal)
|
||||
}
|
||||
if ws.HardAdmitTotal != 7 {
|
||||
t.Errorf("HardAdmitTotal = %d, want 7", ws.HardAdmitTotal)
|
||||
}
|
||||
// Pressure wait should be 0 (no admission controller).
|
||||
if ws.SoftPressureWaitSec != 0 || ws.HardPressureWaitSec != 0 {
|
||||
t.Errorf("nil admission: pressure wait should be 0")
|
||||
}
|
||||
|
||||
// Case 2: walAdmission set, Metrics nil.
|
||||
a := NewWALAdmission(WALAdmissionConfig{
|
||||
MaxConcurrent: 16,
|
||||
SoftWatermark: 0.65,
|
||||
HardWatermark: 0.85,
|
||||
WALUsedFn: func() float64 { return 0.7 },
|
||||
NotifyFn: func() {},
|
||||
ClosedFn: func() bool { return false },
|
||||
})
|
||||
vol2 := &BlockVol{walAdmission: a}
|
||||
|
||||
ws2 := vol2.WALStatus()
|
||||
if ws2.PressureState != "soft" {
|
||||
t.Errorf("PressureState = %q, want soft (0.7 >= 0.65)", ws2.PressureState)
|
||||
}
|
||||
if ws2.SoftWatermark != 0.65 {
|
||||
t.Errorf("SoftWatermark = %f, want 0.65", ws2.SoftWatermark)
|
||||
}
|
||||
// Metrics fields should be zero (nil Metrics).
|
||||
if ws2.SoftAdmitTotal != 0 || ws2.HardAdmitTotal != 0 || ws2.TimeoutTotal != 0 {
|
||||
t.Errorf("nil metrics: counters should be 0")
|
||||
}
|
||||
}
|
||||
|
||||
// ────────────────────────────────────────────────────────────
|
||||
// QA-CP11A3-9: ObserverPanic_DocumentedBehavior
|
||||
//
|
||||
// If WALAdmitWaitObserver panics, RecordWALAdmit is called from
|
||||
// Acquire → recordAdmit. A panic in the observer would crash the
|
||||
// writer goroutine. This test documents whether the panic is
|
||||
// recovered or propagated.
|
||||
// ────────────────────────────────────────────────────────────
|
||||
func TestQA_CP11A3_ObserverPanic_DocumentedBehavior(t *testing.T) {
|
||||
m := NewEngineMetrics()
|
||||
m.WALAdmitWaitObserver = func(s float64) { panic("boom") }
|
||||
|
||||
// RecordWALAdmit calls the observer. If it panics, the caller panics.
|
||||
// This is expected (same as prometheus.Histogram.Observe panicking).
|
||||
// Document that the observer must not panic.
|
||||
panicked := false
|
||||
func() {
|
||||
defer func() {
|
||||
if r := recover(); r != nil {
|
||||
panicked = true
|
||||
}
|
||||
}()
|
||||
m.RecordWALAdmit(1*time.Millisecond, false, false, false)
|
||||
}()
|
||||
|
||||
if !panicked {
|
||||
t.Fatal("expected panic from observer — if recovered, update this test")
|
||||
}
|
||||
|
||||
// Verify counters were NOT updated (panic happened before completion).
|
||||
// Actually, the observer is called AFTER WALAdmitTotal.Add(1) and
|
||||
// walAdmitWaitNs.record(). Let's verify the counter state.
|
||||
if m.WALAdmitTotal.Load() != 1 {
|
||||
t.Errorf("WALAdmitTotal = %d — should be 1 (incremented before observer)", m.WALAdmitTotal.Load())
|
||||
}
|
||||
// soft/hard/timeout flags are processed AFTER observer — panic skips them.
|
||||
// With soft=false, hard=false, timedOut=false there's nothing to skip,
|
||||
// but the counters should reflect what happened before the panic.
|
||||
}
|
||||
|
||||
// ────────────────────────────────────────────────────────────
|
||||
// QA-CP11A3-10: ConcurrentWALStatusReads
|
||||
//
|
||||
// Multiple goroutines read WALStatus while Acquire/Release runs.
|
||||
// Must not panic. Fields should be internally consistent
|
||||
// (SoftAdmitTotal >= 0, HardPressureWaitSec >= 0, etc.)
|
||||
// ────────────────────────────────────────────────────────────
|
||||
func TestQA_CP11A3_ConcurrentWALStatusReads(t *testing.T) {
|
||||
m := NewEngineMetrics()
|
||||
var pressure atomic.Int64
|
||||
pressure.Store(50)
|
||||
|
||||
a := NewWALAdmission(WALAdmissionConfig{
|
||||
MaxConcurrent: 16,
|
||||
SoftWatermark: 0.7,
|
||||
HardWatermark: 0.9,
|
||||
WALUsedFn: func() float64 { return float64(pressure.Load()) / 100.0 },
|
||||
NotifyFn: func() {},
|
||||
ClosedFn: func() bool { return false },
|
||||
Metrics: m,
|
||||
})
|
||||
a.sleepFn = func(d time.Duration) { time.Sleep(50 * time.Microsecond) }
|
||||
|
||||
vol := &BlockVol{
|
||||
Metrics: m,
|
||||
walAdmission: a,
|
||||
}
|
||||
|
||||
var wg sync.WaitGroup
|
||||
const rounds = 100
|
||||
|
||||
// Writers with varying pressure.
|
||||
for i := 0; i < 4; i++ {
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
levels := []int64{50, 75, 95, 60, 85}
|
||||
for j := 0; j < rounds; j++ {
|
||||
pressure.Store(levels[j%len(levels)])
|
||||
if err := a.Acquire(20 * time.Millisecond); err == nil {
|
||||
a.Release()
|
||||
}
|
||||
pressure.Store(50) // reset for next round
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
// Concurrent WALStatus readers.
|
||||
for i := 0; i < 4; i++ {
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
valid := map[string]bool{"normal": true, "soft": true, "hard": true}
|
||||
for j := 0; j < rounds*2; j++ {
|
||||
ws := vol.WALStatus()
|
||||
if !valid[ws.PressureState] {
|
||||
t.Errorf("invalid PressureState: %q", ws.PressureState)
|
||||
return
|
||||
}
|
||||
if ws.UsedFraction < 0 || ws.UsedFraction > 1.01 {
|
||||
t.Errorf("UsedFraction out of range: %f", ws.UsedFraction)
|
||||
return
|
||||
}
|
||||
if ws.SoftPressureWaitSec < 0 {
|
||||
t.Errorf("SoftPressureWaitSec negative: %f", ws.SoftPressureWaitSec)
|
||||
return
|
||||
}
|
||||
if ws.HardPressureWaitSec < 0 {
|
||||
t.Errorf("HardPressureWaitSec negative: %f", ws.HardPressureWaitSec)
|
||||
return
|
||||
}
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
wg.Wait()
|
||||
}
|
||||
@@ -26,6 +26,10 @@ func RegisterDevOpsActions(r *tr.Registry) {
|
||||
r.RegisterFunc("delete_block_volume", tr.TierDevOps, deleteBlockVolume)
|
||||
r.RegisterFunc("wait_block_servers", tr.TierDevOps, waitBlockServers)
|
||||
r.RegisterFunc("cluster_status", tr.TierDevOps, clusterStatus)
|
||||
r.RegisterFunc("wait_block_primary", tr.TierDevOps, waitBlockPrimary)
|
||||
r.RegisterFunc("assert_block_field", tr.TierDevOps, assertBlockField)
|
||||
r.RegisterFunc("block_status", tr.TierDevOps, blockStatus)
|
||||
r.RegisterFunc("block_promote", tr.TierDevOps, blockPromote)
|
||||
}
|
||||
|
||||
// setISCSIVars sets the save_as_iscsi_host/port/addr/iqn vars from a VolumeInfo.
|
||||
@@ -434,6 +438,222 @@ func waitBlockServers(ctx context.Context, actx *tr.ActionContext, act tr.Action
|
||||
}
|
||||
}
|
||||
|
||||
// waitBlockPrimary polls lookup until the volume's primary server matches (or differs from) expected.
|
||||
// Params: name, expected (server addr to wait for) OR not (server addr to wait to change from), timeout (default 60s).
|
||||
// Sets save_as vars from the final lookup.
|
||||
func waitBlockPrimary(ctx context.Context, actx *tr.ActionContext, act tr.Action) (map[string]string, error) {
|
||||
client, err := blockAPIClient(actx, act)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("wait_block_primary: %w", err)
|
||||
}
|
||||
|
||||
name := act.Params["name"]
|
||||
if name == "" {
|
||||
return nil, fmt.Errorf("wait_block_primary: name param required")
|
||||
}
|
||||
expected := act.Params["expected"]
|
||||
notServer := act.Params["not"]
|
||||
if expected == "" && notServer == "" {
|
||||
return nil, fmt.Errorf("wait_block_primary: expected or not param required")
|
||||
}
|
||||
|
||||
timeout := 60 * time.Second
|
||||
if t, ok := act.Params["timeout"]; ok {
|
||||
if d, err := parseDuration(t); err == nil {
|
||||
timeout = d
|
||||
}
|
||||
}
|
||||
|
||||
timeoutCtx, cancel := context.WithTimeout(ctx, timeout)
|
||||
defer cancel()
|
||||
|
||||
ticker := time.NewTicker(2 * time.Second)
|
||||
defer ticker.Stop()
|
||||
|
||||
pollCount := 0
|
||||
for {
|
||||
select {
|
||||
case <-timeoutCtx.Done():
|
||||
return nil, fmt.Errorf("wait_block_primary: timeout after %s waiting for primary change on %s", timeout, name)
|
||||
case <-ticker.C:
|
||||
pollCount++
|
||||
info, err := client.LookupVolume(timeoutCtx, name)
|
||||
if err != nil {
|
||||
if pollCount <= 3 {
|
||||
actx.Log(" poll %d: lookup error: %v", pollCount, err)
|
||||
}
|
||||
continue
|
||||
}
|
||||
if pollCount <= 3 || pollCount%10 == 0 {
|
||||
actx.Log(" poll %d: %s primary=%s role=%s", pollCount, name, info.VolumeServer, info.Role)
|
||||
}
|
||||
|
||||
match := false
|
||||
if expected != "" && info.VolumeServer == expected {
|
||||
match = true
|
||||
}
|
||||
if notServer != "" && info.VolumeServer != notServer && info.VolumeServer != "" {
|
||||
match = true
|
||||
}
|
||||
if match {
|
||||
actx.Log(" primary for %s is now %s (epoch=%d)", name, info.VolumeServer, info.Epoch)
|
||||
if act.SaveAs != "" {
|
||||
setISCSIVars(actx, act.SaveAs, info)
|
||||
actx.Vars[act.SaveAs+"_server"] = info.VolumeServer
|
||||
actx.Vars[act.SaveAs+"_epoch"] = strconv.FormatUint(info.Epoch, 10)
|
||||
actx.Vars[act.SaveAs+"_role"] = info.Role
|
||||
}
|
||||
return map[string]string{"value": info.VolumeServer}, nil
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// assertBlockField looks up a block volume and asserts a specific field matches the expected value.
|
||||
// Params: name, field (one of: volume_server, role, status, epoch, size_bytes, replica_server,
|
||||
// replica_factor, health_score, replica_degraded, durability_mode, iscsi_addr, iqn), expected.
|
||||
func assertBlockField(ctx context.Context, actx *tr.ActionContext, act tr.Action) (map[string]string, error) {
|
||||
client, err := blockAPIClient(actx, act)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("assert_block_field: %w", err)
|
||||
}
|
||||
|
||||
name := act.Params["name"]
|
||||
if name == "" {
|
||||
return nil, fmt.Errorf("assert_block_field: name param required")
|
||||
}
|
||||
field := act.Params["field"]
|
||||
if field == "" {
|
||||
return nil, fmt.Errorf("assert_block_field: field param required")
|
||||
}
|
||||
expected := act.Params["expected"]
|
||||
if expected == "" {
|
||||
return nil, fmt.Errorf("assert_block_field: expected param required")
|
||||
}
|
||||
|
||||
info, err := client.LookupVolume(ctx, name)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("assert_block_field: lookup %s: %w", name, err)
|
||||
}
|
||||
|
||||
actual, err := extractVolumeField(info, field)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("assert_block_field: %w", err)
|
||||
}
|
||||
|
||||
if actual != expected {
|
||||
return nil, fmt.Errorf("assert_block_field: %s.%s = %q, expected %q", name, field, actual, expected)
|
||||
}
|
||||
actx.Log(" assert %s.%s == %q OK", name, field, expected)
|
||||
return map[string]string{"value": actual}, nil
|
||||
}
|
||||
|
||||
// extractVolumeField extracts a named field from VolumeInfo as a string.
|
||||
func extractVolumeField(info *blockapi.VolumeInfo, field string) (string, error) {
|
||||
switch field {
|
||||
case "volume_server":
|
||||
return info.VolumeServer, nil
|
||||
case "role":
|
||||
return info.Role, nil
|
||||
case "status":
|
||||
return info.Status, nil
|
||||
case "epoch":
|
||||
return strconv.FormatUint(info.Epoch, 10), nil
|
||||
case "size_bytes":
|
||||
return strconv.FormatUint(info.SizeBytes, 10), nil
|
||||
case "replica_server":
|
||||
return info.ReplicaServer, nil
|
||||
case "replica_factor":
|
||||
return strconv.Itoa(info.ReplicaFactor), nil
|
||||
case "health_score":
|
||||
return fmt.Sprintf("%.2f", info.HealthScore), nil
|
||||
case "replica_degraded":
|
||||
return strconv.FormatBool(info.ReplicaDegraded), nil
|
||||
case "durability_mode":
|
||||
return info.DurabilityMode, nil
|
||||
case "iscsi_addr":
|
||||
return info.ISCSIAddr, nil
|
||||
case "iqn":
|
||||
return info.IQN, nil
|
||||
case "name":
|
||||
return info.Name, nil
|
||||
case "replica_iscsi_addr":
|
||||
return info.ReplicaISCSIAddr, nil
|
||||
case "replica_iqn":
|
||||
return info.ReplicaIQN, nil
|
||||
case "replica_data_addr":
|
||||
return info.ReplicaDataAddr, nil
|
||||
case "replica_ctrl_addr":
|
||||
return info.ReplicaCtrlAddr, nil
|
||||
default:
|
||||
return "", fmt.Errorf("unknown field %q", field)
|
||||
}
|
||||
}
|
||||
|
||||
// blockStatus fetches block registry status metrics from master.
|
||||
// Sets save_as_promotions_total, save_as_failovers_total, etc.
|
||||
func blockStatus(ctx context.Context, actx *tr.ActionContext, act tr.Action) (map[string]string, error) {
|
||||
client, err := blockAPIClient(actx, act)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("block_status: %w", err)
|
||||
}
|
||||
|
||||
status, err := client.BlockStatus(ctx)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("block_status: %w", err)
|
||||
}
|
||||
|
||||
actx.Log(" block status: volumes=%d servers=%d promotions=%d failovers=%d rebuilds=%d",
|
||||
status.VolumeCount, status.ServerCount, status.PromotionsTotal, status.FailoversTotal, status.RebuildsTotal)
|
||||
|
||||
if act.SaveAs != "" {
|
||||
actx.Vars[act.SaveAs+"_volume_count"] = strconv.Itoa(status.VolumeCount)
|
||||
actx.Vars[act.SaveAs+"_server_count"] = strconv.Itoa(status.ServerCount)
|
||||
actx.Vars[act.SaveAs+"_promotions_total"] = strconv.FormatInt(status.PromotionsTotal, 10)
|
||||
actx.Vars[act.SaveAs+"_failovers_total"] = strconv.FormatInt(status.FailoversTotal, 10)
|
||||
actx.Vars[act.SaveAs+"_rebuilds_total"] = strconv.FormatInt(status.RebuildsTotal, 10)
|
||||
actx.Vars[act.SaveAs+"_queue_depth"] = strconv.Itoa(status.AssignmentQueueDepth)
|
||||
}
|
||||
|
||||
jsonBytes, _ := json.Marshal(status)
|
||||
return map[string]string{"value": string(jsonBytes)}, nil
|
||||
}
|
||||
|
||||
// blockPromote triggers a manual promotion for a block volume.
|
||||
// Params: name, target_server (optional, empty=auto), force (optional bool), reason (optional).
|
||||
func blockPromote(ctx context.Context, actx *tr.ActionContext, act tr.Action) (map[string]string, error) {
|
||||
client, err := blockAPIClient(actx, act)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("block_promote: %w", err)
|
||||
}
|
||||
|
||||
name := act.Params["name"]
|
||||
if name == "" {
|
||||
return nil, fmt.Errorf("block_promote: name param required")
|
||||
}
|
||||
|
||||
force := false
|
||||
if f := act.Params["force"]; f == "true" || f == "1" {
|
||||
force = true
|
||||
}
|
||||
|
||||
resp, err := client.PromoteVolume(ctx, name, blockapi.PromoteVolumeRequest{
|
||||
TargetServer: act.Params["target_server"],
|
||||
Force: force,
|
||||
Reason: act.Params["reason"],
|
||||
})
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("block_promote: %w", err)
|
||||
}
|
||||
|
||||
actx.Log(" promoted %s -> primary=%s epoch=%d", name, resp.NewPrimary, resp.Epoch)
|
||||
if act.SaveAs != "" {
|
||||
actx.Vars[act.SaveAs+"_server"] = resp.NewPrimary
|
||||
actx.Vars[act.SaveAs+"_epoch"] = strconv.FormatUint(resp.Epoch, 10)
|
||||
}
|
||||
return map[string]string{"value": resp.NewPrimary}, nil
|
||||
}
|
||||
|
||||
// clusterStatus fetches the full cluster status JSON.
|
||||
func clusterStatus(ctx context.Context, actx *tr.ActionContext, act tr.Action) (map[string]string, error) {
|
||||
node, err := getNode(actx, act.Node)
|
||||
|
||||
@@ -23,6 +23,10 @@ func TestDevOpsActions_Registration(t *testing.T) {
|
||||
"delete_block_volume",
|
||||
"wait_block_servers",
|
||||
"cluster_status",
|
||||
"wait_block_primary",
|
||||
"assert_block_field",
|
||||
"block_status",
|
||||
"block_promote",
|
||||
}
|
||||
|
||||
for _, name := range expected {
|
||||
@@ -39,8 +43,8 @@ func TestDevOpsActions_Tier(t *testing.T) {
|
||||
byTier := registry.ListByTier()
|
||||
devopsActions := byTier[tr.TierDevOps]
|
||||
|
||||
if len(devopsActions) != 11 {
|
||||
t.Errorf("devops tier has %d actions, want 11", len(devopsActions))
|
||||
if len(devopsActions) != 15 {
|
||||
t.Errorf("devops tier has %d actions, want 15", len(devopsActions))
|
||||
}
|
||||
|
||||
// Verify all are in devops tier.
|
||||
@@ -84,11 +88,11 @@ func TestAllActions_Registration(t *testing.T) {
|
||||
if n := len(byTier[tr.TierCore]); n != 11 {
|
||||
t.Errorf("core: %d, want 11", n)
|
||||
}
|
||||
if n := len(byTier[tr.TierBlock]); n != 56 {
|
||||
t.Errorf("block: %d, want 56", n)
|
||||
if n := len(byTier[tr.TierBlock]); n != 58 {
|
||||
t.Errorf("block: %d, want 58", n)
|
||||
}
|
||||
if n := len(byTier[tr.TierDevOps]); n != 11 {
|
||||
t.Errorf("devops: %d, want 11", n)
|
||||
if n := len(byTier[tr.TierDevOps]); n != 15 {
|
||||
t.Errorf("devops: %d, want 15", n)
|
||||
}
|
||||
if n := len(byTier[tr.TierChaos]); n != 5 {
|
||||
t.Errorf("chaos: %d, want 5", n)
|
||||
@@ -97,13 +101,13 @@ func TestAllActions_Registration(t *testing.T) {
|
||||
t.Errorf("k8s: %d, want 14", n)
|
||||
}
|
||||
|
||||
// Total should be 97 (92 prev + 4 devops: expand/lookup/delete/wait_block_servers + 1 block: iscsi_login_direct).
|
||||
// Total should be 103 (99 prev + 4 devops: wait_block_primary, assert_block_field, block_status, block_promote).
|
||||
total := 0
|
||||
for _, actions := range byTier {
|
||||
total += len(actions)
|
||||
}
|
||||
if total != 97 {
|
||||
t.Errorf("total actions: %d, want 97", total)
|
||||
if total != 103 {
|
||||
t.Errorf("total actions: %d, want 103", total)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -8,6 +8,7 @@ import (
|
||||
"time"
|
||||
|
||||
tr "github.com/seaweedfs/seaweedfs/weed/storage/blockvol/testrunner"
|
||||
"github.com/seaweedfs/seaweedfs/weed/storage/blockvol/testrunner/infra"
|
||||
)
|
||||
|
||||
// RegisterSnapshotActions registers snapshot and resize actions.
|
||||
@@ -18,6 +19,8 @@ func RegisterSnapshotActions(r *tr.Registry) {
|
||||
r.RegisterFunc("resize", tr.TierBlock, resizeAction)
|
||||
r.RegisterFunc("iscsi_rescan", tr.TierBlock, iscsiRescan)
|
||||
r.RegisterFunc("get_block_size", tr.TierBlock, getBlockSize)
|
||||
r.RegisterFunc("snapshot_export_s3", tr.TierBlock, snapshotExportS3)
|
||||
r.RegisterFunc("snapshot_import_s3", tr.TierBlock, snapshotImportS3)
|
||||
}
|
||||
|
||||
func snapshotCreate(ctx context.Context, actx *tr.ActionContext, act tr.Action) (map[string]string, error) {
|
||||
@@ -181,3 +184,89 @@ func parseHumanSize(s string) (uint64, error) {
|
||||
}
|
||||
return val * multiplier, nil
|
||||
}
|
||||
|
||||
// snapshotExportS3 exports a snapshot from a target to an S3 bucket.
|
||||
// Params: bucket, key_prefix, s3_endpoint, s3_access_key, s3_secret_key, s3_region, snapshot_id (optional).
|
||||
// Returns: manifest_key, data_key, size_bytes, sha256.
|
||||
func snapshotExportS3(ctx context.Context, actx *tr.ActionContext, act tr.Action) (map[string]string, error) {
|
||||
tgt, err := getHATarget(actx, act.Target)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
opts := infra.ExportS3Opts{
|
||||
Bucket: act.Params["bucket"],
|
||||
KeyPrefix: act.Params["key_prefix"],
|
||||
S3Endpoint: act.Params["s3_endpoint"],
|
||||
S3AccessKey: act.Params["s3_access_key"],
|
||||
S3SecretKey: act.Params["s3_secret_key"],
|
||||
S3Region: act.Params["s3_region"],
|
||||
}
|
||||
if opts.Bucket == "" || opts.S3Endpoint == "" {
|
||||
return nil, fmt.Errorf("snapshot_export_s3: bucket and s3_endpoint required")
|
||||
}
|
||||
if idStr := act.Params["snapshot_id"]; idStr != "" {
|
||||
id, err := strconv.ParseUint(idStr, 10, 32)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("snapshot_export_s3: invalid snapshot_id %q: %w", idStr, err)
|
||||
}
|
||||
opts.SnapshotID = uint32(id)
|
||||
}
|
||||
|
||||
result, err := tgt.ExportSnapshotS3(ctx, opts)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("snapshot_export_s3: %w", err)
|
||||
}
|
||||
|
||||
actx.Log(" exported to s3://%s/%s (%d bytes, sha256=%s)", opts.Bucket, result.DataKey, result.SizeBytes, result.SHA256)
|
||||
out := map[string]string{
|
||||
"value": result.SHA256,
|
||||
}
|
||||
if act.SaveAs != "" {
|
||||
actx.Vars[act.SaveAs+"_manifest_key"] = result.ManifestKey
|
||||
actx.Vars[act.SaveAs+"_data_key"] = result.DataKey
|
||||
actx.Vars[act.SaveAs+"_size_bytes"] = strconv.FormatUint(result.SizeBytes, 10)
|
||||
actx.Vars[act.SaveAs+"_sha256"] = result.SHA256
|
||||
}
|
||||
return out, nil
|
||||
}
|
||||
|
||||
// snapshotImportS3 imports a snapshot from an S3 bucket into a target.
|
||||
// Params: bucket, manifest_key, s3_endpoint, s3_access_key, s3_secret_key, s3_region, allow_overwrite.
|
||||
// Returns: size_bytes, sha256.
|
||||
func snapshotImportS3(ctx context.Context, actx *tr.ActionContext, act tr.Action) (map[string]string, error) {
|
||||
tgt, err := getHATarget(actx, act.Target)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
opts := infra.ImportS3Opts{
|
||||
Bucket: act.Params["bucket"],
|
||||
ManifestKey: act.Params["manifest_key"],
|
||||
S3Endpoint: act.Params["s3_endpoint"],
|
||||
S3AccessKey: act.Params["s3_access_key"],
|
||||
S3SecretKey: act.Params["s3_secret_key"],
|
||||
S3Region: act.Params["s3_region"],
|
||||
}
|
||||
if opts.Bucket == "" || opts.ManifestKey == "" || opts.S3Endpoint == "" {
|
||||
return nil, fmt.Errorf("snapshot_import_s3: bucket, manifest_key, and s3_endpoint required")
|
||||
}
|
||||
if act.Params["allow_overwrite"] == "true" {
|
||||
opts.AllowOverwrite = true
|
||||
}
|
||||
|
||||
result, err := tgt.ImportSnapshotS3(ctx, opts)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("snapshot_import_s3: %w", err)
|
||||
}
|
||||
|
||||
actx.Log(" imported %d bytes (sha256=%s)", result.SizeBytes, result.SHA256)
|
||||
out := map[string]string{
|
||||
"value": result.SHA256,
|
||||
}
|
||||
if act.SaveAs != "" {
|
||||
actx.Vars[act.SaveAs+"_size_bytes"] = strconv.FormatUint(result.SizeBytes, 10)
|
||||
actx.Vars[act.SaveAs+"_sha256"] = result.SHA256
|
||||
}
|
||||
return out, nil
|
||||
}
|
||||
|
||||
@@ -478,6 +478,107 @@ func (h *HATarget) Resize(ctx context.Context, newSizeBytes uint64) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// ExportSnapshotS3 sends POST /export with S3 credentials.
|
||||
// Returns the manifest key and data SHA-256 on success.
|
||||
func (h *HATarget) ExportSnapshotS3(ctx context.Context, opts ExportS3Opts) (*ExportS3Result, error) {
|
||||
reqBody := map[string]interface{}{
|
||||
"bucket": opts.Bucket,
|
||||
"key_prefix": opts.KeyPrefix,
|
||||
"s3_endpoint": opts.S3Endpoint,
|
||||
"s3_region": opts.S3Region,
|
||||
}
|
||||
if opts.S3AccessKey != "" {
|
||||
reqBody["s3_access_key"] = opts.S3AccessKey
|
||||
reqBody["s3_secret_key"] = opts.S3SecretKey
|
||||
}
|
||||
if opts.SnapshotID > 0 {
|
||||
reqBody["snapshot_id"] = opts.SnapshotID
|
||||
}
|
||||
|
||||
code, body, err := h.curlPost(ctx, "/export", reqBody)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("export snapshot s3: %w", err)
|
||||
}
|
||||
if code != http.StatusOK {
|
||||
return nil, fmt.Errorf("export snapshot s3 failed (HTTP %d): %s", code, body)
|
||||
}
|
||||
|
||||
var resp ExportS3Result
|
||||
if err := json.NewDecoder(strings.NewReader(body)).Decode(&resp); err != nil {
|
||||
return nil, fmt.Errorf("decode export response: %w", err)
|
||||
}
|
||||
return &resp, nil
|
||||
}
|
||||
|
||||
// ImportSnapshotS3 sends POST /import with S3 credentials and manifest key.
|
||||
func (h *HATarget) ImportSnapshotS3(ctx context.Context, opts ImportS3Opts) (*ImportS3Result, error) {
|
||||
reqBody := map[string]interface{}{
|
||||
"bucket": opts.Bucket,
|
||||
"manifest_key": opts.ManifestKey,
|
||||
"s3_endpoint": opts.S3Endpoint,
|
||||
"s3_region": opts.S3Region,
|
||||
}
|
||||
if opts.S3AccessKey != "" {
|
||||
reqBody["s3_access_key"] = opts.S3AccessKey
|
||||
reqBody["s3_secret_key"] = opts.S3SecretKey
|
||||
}
|
||||
if opts.AllowOverwrite {
|
||||
reqBody["allow_overwrite"] = true
|
||||
}
|
||||
|
||||
code, body, err := h.curlPost(ctx, "/import", reqBody)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("import snapshot s3: %w", err)
|
||||
}
|
||||
if code != http.StatusOK {
|
||||
return nil, fmt.Errorf("import snapshot s3 failed (HTTP %d): %s", code, body)
|
||||
}
|
||||
|
||||
var resp ImportS3Result
|
||||
if err := json.NewDecoder(strings.NewReader(body)).Decode(&resp); err != nil {
|
||||
return nil, fmt.Errorf("decode import response: %w", err)
|
||||
}
|
||||
return &resp, nil
|
||||
}
|
||||
|
||||
// ExportS3Opts configures a snapshot export to S3.
type ExportS3Opts struct {
	// Destination object location.
	Bucket    string // sent as "bucket"
	KeyPrefix string // sent as "key_prefix"

	// S3 connection settings.
	S3Endpoint  string // sent as "s3_endpoint"
	S3AccessKey string // credentials are sent only when this is non-empty
	S3SecretKey string // sent together with S3AccessKey
	S3Region    string // sent as "s3_region"

	// SnapshotID is sent as "snapshot_id" only when greater than zero.
	SnapshotID uint32
}
|
||||
|
||||
// ExportS3Result is the response from POST /export.
type ExportS3Result struct {
	OK          bool   `json:"ok"`           // "ok" flag from the response body
	ManifestKey string `json:"manifest_key"` // key of the exported manifest object
	DataKey     string `json:"data_key"`     // key of the exported data object
	SizeBytes   uint64 `json:"size_bytes"`   // reported export size in bytes
	SHA256      string `json:"sha256"`       // reported SHA-256 of the export
}
|
||||
|
||||
// ImportS3Opts configures a snapshot import from S3.
type ImportS3Opts struct {
	// Source object location.
	Bucket      string // sent as "bucket"
	ManifestKey string // sent as "manifest_key"

	// S3 connection settings.
	S3Endpoint  string // sent as "s3_endpoint"
	S3AccessKey string // credentials are sent only when this is non-empty
	S3SecretKey string // sent together with S3AccessKey
	S3Region    string // sent as "s3_region"

	// AllowOverwrite is sent as "allow_overwrite" only when true.
	AllowOverwrite bool
}
|
||||
|
||||
// ImportS3Result is the response from POST /import.
type ImportS3Result struct {
	OK        bool   `json:"ok"`         // "ok" flag from the response body
	SizeBytes uint64 `json:"size_bytes"` // reported import size in bytes
	SHA256    string `json:"sha256"`     // reported SHA-256 of the import
}
|
||||
|
||||
// WaitForRole polls GET /status until the target reports the expected role.
|
||||
func (h *HATarget) WaitForRole(ctx context.Context, expectedRole string) error {
|
||||
for {
|
||||
|
||||
@@ -0,0 +1,246 @@
|
||||
name: cp11b3-auto-failover
|
||||
timeout: 10m
|
||||
env:
|
||||
repo_dir: "/opt/work/seaweedfs"
|
||||
master_url: "http://192.168.1.184:9434"
|
||||
|
||||
# Tests: T1 (candidate evaluation), T2 (orphan re-evaluation), T6 (preflight/status)
|
||||
# Flow: Create RF=2 → write data → kill primary → master auto-promotes → verify data + metrics
|
||||
|
||||
topology:
|
||||
nodes:
|
||||
target_node:
|
||||
host: "192.168.1.184"
|
||||
user: testdev
|
||||
key: "/opt/work/testdev_key"
|
||||
client_node:
|
||||
host: "192.168.1.181"
|
||||
user: testdev
|
||||
key: "/opt/work/testdev_key"
|
||||
|
||||
phases:
|
||||
# Phase 1: Clean slate
|
||||
- name: setup
|
||||
actions:
|
||||
- action: kill_stale
|
||||
node: target_node
|
||||
- action: kill_stale
|
||||
node: client_node
|
||||
iscsi_cleanup: "true"
|
||||
- action: exec
|
||||
node: target_node
|
||||
cmd: "rm -rf /tmp/sw-b3-master /tmp/sw-b3-vs1 /tmp/sw-b3-vs2"
|
||||
root: "true"
|
||||
|
||||
# Phase 2: Start cluster
|
||||
- name: start_cluster
|
||||
actions:
|
||||
- action: exec
|
||||
node: target_node
|
||||
cmd: "mkdir -p /tmp/sw-b3-master /tmp/sw-b3-vs1/blocks /tmp/sw-b3-vs2/blocks"
|
||||
- action: start_weed_master
|
||||
node: target_node
|
||||
port: "9434"
|
||||
dir: "/tmp/sw-b3-master"
|
||||
save_as: master_pid
|
||||
- action: wait_cluster_ready
|
||||
node: target_node
|
||||
master_url: "http://localhost:9434"
|
||||
timeout: 30s
|
||||
- action: start_weed_volume
|
||||
node: target_node
|
||||
port: "18190"
|
||||
master: "localhost:9434"
|
||||
dir: "/tmp/sw-b3-vs1"
|
||||
extra_args: "-block.dir=/tmp/sw-b3-vs1/blocks -block.listen=:3277 -ip=192.168.1.184"
|
||||
save_as: vs1_pid
|
||||
- action: start_weed_volume
|
||||
node: target_node
|
||||
port: "18191"
|
||||
master: "localhost:9434"
|
||||
dir: "/tmp/sw-b3-vs2"
|
||||
extra_args: "-block.dir=/tmp/sw-b3-vs2/blocks -block.listen=:3278 -ip=192.168.1.184"
|
||||
save_as: vs2_pid
|
||||
- action: wait_block_servers
|
||||
count: "2"
|
||||
timeout: 60s
|
||||
|
||||
# Phase 3: Create RF=2 volume, record initial state
|
||||
- name: create_volume
|
||||
actions:
|
||||
- action: create_block_volume
|
||||
name: "failover-test"
|
||||
size: "50M"
|
||||
replica_factor: "2"
|
||||
save_as: vol_info
|
||||
# Wait for replica to confirm role via heartbeat.
|
||||
# Without this, PromoteBestReplica rejects replica as "no_heartbeat".
|
||||
- action: sleep
|
||||
duration: 10s
|
||||
- action: lookup_block_volume
|
||||
name: "failover-test"
|
||||
save_as: initial
|
||||
- action: print
|
||||
msg: "initial primary={{ initial_iscsi_host }}:{{ initial_iscsi_port }} capacity={{ initial_capacity }}"
|
||||
# Sanity-check the freshly created volume before failover: RF=2 and epoch=1.
|
||||
- action: assert_block_field
|
||||
name: "failover-test"
|
||||
field: "replica_factor"
|
||||
expected: "2"
|
||||
- action: assert_block_field
|
||||
name: "failover-test"
|
||||
field: "epoch"
|
||||
expected: "1"
|
||||
# Capture initial block status metrics.
|
||||
- action: block_status
|
||||
save_as: pre_stats
|
||||
|
||||
# Phase 4: Write data via iSCSI
|
||||
- name: write_data
|
||||
actions:
|
||||
- action: iscsi_login_direct
|
||||
node: client_node
|
||||
host: "{{ initial_iscsi_host }}"
|
||||
port: "{{ initial_iscsi_port }}"
|
||||
iqn: "{{ initial_iqn }}"
|
||||
save_as: device
|
||||
- action: dd_write
|
||||
node: client_node
|
||||
device: "{{ device }}"
|
||||
bs: 1M
|
||||
count: "1"
|
||||
seek: "5"
|
||||
save_as: md5_5M
|
||||
- action: dd_read_md5
|
||||
node: client_node
|
||||
device: "{{ device }}"
|
||||
bs: 1M
|
||||
count: "1"
|
||||
skip: "5"
|
||||
save_as: verify_5M
|
||||
- action: assert_equal
|
||||
actual: "{{ verify_5M }}"
|
||||
expected: "{{ md5_5M }}"
|
||||
|
||||
# Phase 5: Kill primary VS, wait for master auto-failover
|
||||
- name: failover
|
||||
actions:
|
||||
- action: iscsi_cleanup
|
||||
node: client_node
|
||||
ignore_error: true
|
||||
- action: lookup_block_volume
|
||||
name: "failover-test"
|
||||
save_as: pre_kill
|
||||
- action: print
|
||||
msg: "killing primary VS (server={{ pre_kill_iscsi_host }}:{{ pre_kill_iscsi_port }})"
|
||||
# Crash-kill VS1 with SIGKILL (not SIGTERM) to simulate a real crash.
|
||||
# SIGTERM triggers graceful shutdown which deregisters volumes from
|
||||
# the master registry — preventing the failover path we want to test.
|
||||
- action: exec
|
||||
node: target_node
|
||||
cmd: "kill -9 {{ vs1_pid }}"
|
||||
root: "true"
|
||||
# Wait for master to detect VS1 disconnection and promote.
|
||||
# Lease TTL is 30s; if never granted (zero), promotion is immediate.
|
||||
# Allow extra time for heartbeat confirmation + deferred timer.
|
||||
- action: sleep
|
||||
duration: 35s
|
||||
- action: wait_block_primary
|
||||
name: "failover-test"
|
||||
not: "192.168.1.184:18190"
|
||||
timeout: 60s
|
||||
save_as: promoted
|
||||
|
||||
# Phase 6: Verify failover state
|
||||
- name: verify_failover
|
||||
actions:
|
||||
- action: print
|
||||
msg: "new primary={{ promoted_server }} epoch={{ promoted_epoch }}"
|
||||
# Epoch must have incremented (real promotion, not just heartbeat update).
|
||||
- action: assert_block_field
|
||||
name: "failover-test"
|
||||
field: "epoch"
|
||||
expected: "2"
|
||||
- action: block_status
|
||||
save_as: post_stats
|
||||
# Verify promotion counter incremented.
|
||||
- action: assert_greater
|
||||
actual: "{{ post_stats_promotions_total }}"
|
||||
expected: "{{ pre_stats_promotions_total }}"
|
||||
|
||||
# Phase 7: Reconnect iSCSI to new primary, verify data
|
||||
- name: verify_data
|
||||
actions:
|
||||
- action: iscsi_login_direct
|
||||
node: client_node
|
||||
host: "{{ promoted_iscsi_host }}"
|
||||
port: "{{ promoted_iscsi_port }}"
|
||||
iqn: "{{ promoted_iqn }}"
|
||||
save_as: device2
|
||||
- action: dd_read_md5
|
||||
node: client_node
|
||||
device: "{{ device2 }}"
|
||||
bs: 1M
|
||||
count: "1"
|
||||
skip: "5"
|
||||
save_as: post_failover_md5
|
||||
- action: assert_equal
|
||||
actual: "{{ post_failover_md5 }}"
|
||||
expected: "{{ md5_5M }}"
|
||||
|
||||
# Phase 8: Restart killed VS, verify rebuild queued
|
||||
- name: restart_verify
|
||||
actions:
|
||||
- action: iscsi_cleanup
|
||||
node: client_node
|
||||
ignore_error: true
|
||||
- action: start_weed_volume
|
||||
node: target_node
|
||||
port: "18190"
|
||||
master: "localhost:9434"
|
||||
dir: "/tmp/sw-b3-vs1"
|
||||
extra_args: "-block.dir=/tmp/sw-b3-vs1/blocks -block.listen=:3277 -ip=192.168.1.184"
|
||||
save_as: vs1_pid2
|
||||
- action: wait_block_servers
|
||||
count: "2"
|
||||
timeout: 60s
|
||||
- action: sleep
|
||||
duration: 5s
|
||||
# After restart, the old primary should be queued for rebuild.
|
||||
- action: block_status
|
||||
save_as: final_stats
|
||||
- action: assert_greater
|
||||
actual: "{{ final_stats_rebuilds_total }}"
|
||||
expected: "{{ post_stats_rebuilds_total }}"
|
||||
|
||||
# Cleanup (always runs)
|
||||
- name: cleanup
|
||||
always: true
|
||||
actions:
|
||||
- action: iscsi_cleanup
|
||||
node: client_node
|
||||
ignore_error: true
|
||||
- action: delete_block_volume
|
||||
name: "failover-test"
|
||||
ignore_error: true
|
||||
- action: stop_weed
|
||||
node: target_node
|
||||
pid: "{{ vs1_pid2 }}"
|
||||
ignore_error: true
|
||||
- action: stop_weed
|
||||
node: target_node
|
||||
pid: "{{ vs2_pid }}"
|
||||
ignore_error: true
|
||||
- action: stop_weed
|
||||
node: target_node
|
||||
pid: "{{ vs1_pid }}"
|
||||
ignore_error: true
|
||||
- action: stop_weed
|
||||
node: target_node
|
||||
pid: "{{ master_pid }}"
|
||||
ignore_error: true
|
||||
- action: exec
|
||||
node: target_node
|
||||
cmd: "rm -rf /tmp/sw-b3-master /tmp/sw-b3-vs1 /tmp/sw-b3-vs2"
|
||||
root: "true"
|
||||
ignore_error: true
|
||||
@@ -0,0 +1,214 @@
|
||||
name: cp11b3-fast-reconnect
|
||||
timeout: 10m
|
||||
env:
|
||||
repo_dir: "/opt/work/seaweedfs"
|
||||
master_url: "http://192.168.1.184:9436"
|
||||
|
||||
# Tests: T3 (deferred timer safety), T2 (fast reconnect skips failover)
|
||||
# Flow: Create RF=2 → write → kill primary briefly → restart before lease expires
|
||||
# → verify no promotion happened → verify data intact
|
||||
|
||||
topology:
|
||||
nodes:
|
||||
target_node:
|
||||
host: "192.168.1.184"
|
||||
user: testdev
|
||||
key: "/opt/work/testdev_key"
|
||||
client_node:
|
||||
host: "192.168.1.181"
|
||||
user: testdev
|
||||
key: "/opt/work/testdev_key"
|
||||
|
||||
phases:
|
||||
# Phase 1: Clean slate
|
||||
- name: setup
|
||||
actions:
|
||||
- action: kill_stale
|
||||
node: target_node
|
||||
- action: kill_stale
|
||||
node: client_node
|
||||
iscsi_cleanup: "true"
|
||||
- action: exec
|
||||
node: target_node
|
||||
cmd: "rm -rf /tmp/sw-b3r-master /tmp/sw-b3r-vs1 /tmp/sw-b3r-vs2"
|
||||
root: "true"
|
||||
|
||||
# Phase 2: Start cluster
|
||||
- name: start_cluster
|
||||
actions:
|
||||
- action: exec
|
||||
node: target_node
|
||||
cmd: "mkdir -p /tmp/sw-b3r-master /tmp/sw-b3r-vs1/blocks /tmp/sw-b3r-vs2/blocks"
|
||||
- action: start_weed_master
|
||||
node: target_node
|
||||
port: "9436"
|
||||
dir: "/tmp/sw-b3r-master"
|
||||
save_as: master_pid
|
||||
- action: wait_cluster_ready
|
||||
node: target_node
|
||||
master_url: "http://localhost:9436"
|
||||
timeout: 30s
|
||||
- action: start_weed_volume
|
||||
node: target_node
|
||||
port: "18194"
|
||||
master: "localhost:9436"
|
||||
dir: "/tmp/sw-b3r-vs1"
|
||||
extra_args: "-block.dir=/tmp/sw-b3r-vs1/blocks -block.listen=:3281 -ip=192.168.1.184"
|
||||
save_as: vs1_pid
|
||||
- action: start_weed_volume
|
||||
node: target_node
|
||||
port: "18195"
|
||||
master: "localhost:9436"
|
||||
dir: "/tmp/sw-b3r-vs2"
|
||||
extra_args: "-block.dir=/tmp/sw-b3r-vs2/blocks -block.listen=:3282 -ip=192.168.1.184"
|
||||
save_as: vs2_pid
|
||||
- action: wait_block_servers
|
||||
count: "2"
|
||||
timeout: 60s
|
||||
|
||||
# Phase 3: Create RF=2 volume, write data
|
||||
- name: create_and_write
|
||||
actions:
|
||||
- action: create_block_volume
|
||||
name: "reconnect-test"
|
||||
size: "50M"
|
||||
replica_factor: "2"
|
||||
save_as: vol_info
|
||||
# Wait for replica to confirm role via heartbeat.
|
||||
- action: sleep
|
||||
duration: 10s
|
||||
- action: lookup_block_volume
|
||||
name: "reconnect-test"
|
||||
save_as: initial
|
||||
- action: iscsi_login_direct
|
||||
node: client_node
|
||||
host: "{{ initial_iscsi_host }}"
|
||||
port: "{{ initial_iscsi_port }}"
|
||||
iqn: "{{ initial_iqn }}"
|
||||
save_as: device
|
||||
- action: dd_write
|
||||
node: client_node
|
||||
device: "{{ device }}"
|
||||
bs: 1M
|
||||
count: "1"
|
||||
seek: "8"
|
||||
save_as: md5_8M
|
||||
- action: dd_read_md5
|
||||
node: client_node
|
||||
device: "{{ device }}"
|
||||
bs: 1M
|
||||
count: "1"
|
||||
skip: "8"
|
||||
save_as: verify_8M
|
||||
- action: assert_equal
|
||||
actual: "{{ verify_8M }}"
|
||||
expected: "{{ md5_8M }}"
|
||||
- action: iscsi_cleanup
|
||||
node: client_node
|
||||
ignore_error: true
|
||||
# Assert the initial epoch is 1; Phase 5 re-checks it to prove no promotion occurred.
|
||||
- action: assert_block_field
|
||||
name: "reconnect-test"
|
||||
field: "epoch"
|
||||
expected: "1"
|
||||
# Record pre-kill promotion counter.
|
||||
- action: block_status
|
||||
save_as: pre_stats
|
||||
|
||||
# Phase 4: Kill and quickly restart primary VS (before lease expires)
|
||||
- name: fast_reconnect
|
||||
actions:
|
||||
# Crash-kill primary VS with SIGKILL.
|
||||
- action: exec
|
||||
node: target_node
|
||||
cmd: "kill -9 {{ vs1_pid }}"
|
||||
root: "true"
|
||||
# Restart it quickly — within a few seconds, well before the
|
||||
# default 30s lease TTL expires on the master.
|
||||
- action: sleep
|
||||
duration: 3s
|
||||
- action: start_weed_volume
|
||||
node: target_node
|
||||
port: "18194"
|
||||
master: "localhost:9436"
|
||||
dir: "/tmp/sw-b3r-vs1"
|
||||
extra_args: "-block.dir=/tmp/sw-b3r-vs1/blocks -block.listen=:3281 -ip=192.168.1.184"
|
||||
save_as: vs1_pid2
|
||||
# Wait for VS to re-register with master.
|
||||
- action: wait_block_servers
|
||||
count: "2"
|
||||
timeout: 60s
|
||||
- action: sleep
|
||||
duration: 5s
|
||||
|
||||
# Phase 5: Verify NO promotion happened
|
||||
- name: verify_no_promotion
|
||||
actions:
|
||||
# Epoch should still be 1 (no promotion).
|
||||
- action: assert_block_field
|
||||
name: "reconnect-test"
|
||||
field: "epoch"
|
||||
expected: "1"
|
||||
# Promotion counter should not have increased.
|
||||
- action: block_status
|
||||
save_as: post_stats
|
||||
- action: assert_equal
|
||||
actual: "{{ post_stats_promotions_total }}"
|
||||
expected: "{{ pre_stats_promotions_total }}"
|
||||
- action: print
|
||||
msg: "fast reconnect: epoch unchanged, no promotion — deferred timer cancelled"
|
||||
|
||||
# Phase 6: Verify data still accessible on original primary
|
||||
- name: verify_data
|
||||
actions:
|
||||
- action: lookup_block_volume
|
||||
name: "reconnect-test"
|
||||
save_as: after
|
||||
- action: iscsi_login_direct
|
||||
node: client_node
|
||||
host: "{{ after_iscsi_host }}"
|
||||
port: "{{ after_iscsi_port }}"
|
||||
iqn: "{{ after_iqn }}"
|
||||
save_as: device2
|
||||
- action: dd_read_md5
|
||||
node: client_node
|
||||
device: "{{ device2 }}"
|
||||
bs: 1M
|
||||
count: "1"
|
||||
skip: "8"
|
||||
save_as: post_reconnect_md5
|
||||
- action: assert_equal
|
||||
actual: "{{ post_reconnect_md5 }}"
|
||||
expected: "{{ md5_8M }}"
|
||||
|
||||
# Cleanup (always runs)
|
||||
- name: cleanup
|
||||
always: true
|
||||
actions:
|
||||
- action: iscsi_cleanup
|
||||
node: client_node
|
||||
ignore_error: true
|
||||
- action: delete_block_volume
|
||||
name: "reconnect-test"
|
||||
ignore_error: true
|
||||
- action: stop_weed
|
||||
node: target_node
|
||||
pid: "{{ vs1_pid2 }}"
|
||||
ignore_error: true
|
||||
- action: stop_weed
|
||||
node: target_node
|
||||
pid: "{{ vs2_pid }}"
|
||||
ignore_error: true
|
||||
- action: stop_weed
|
||||
node: target_node
|
||||
pid: "{{ vs1_pid }}"
|
||||
ignore_error: true
|
||||
- action: stop_weed
|
||||
node: target_node
|
||||
pid: "{{ master_pid }}"
|
||||
ignore_error: true
|
||||
- action: exec
|
||||
node: target_node
|
||||
cmd: "rm -rf /tmp/sw-b3r-master /tmp/sw-b3r-vs1 /tmp/sw-b3r-vs2"
|
||||
root: "true"
|
||||
ignore_error: true
|
||||
@@ -0,0 +1,190 @@
|
||||
name: cp11b3-manual-promote
|
||||
timeout: 10m
|
||||
env:
|
||||
repo_dir: "/opt/work/seaweedfs"
|
||||
master_url: "http://192.168.1.184:9435"
|
||||
|
||||
# Tests: T5 (manual promote API). NOTE(review): T6 (preflight) and structured rejection are listed as goals, but no action in this scenario exercises them — confirm.
|
||||
# Flow: Create RF=2 → write → kill primary → manual promote → verify epoch → verify data
|
||||
|
||||
topology:
|
||||
nodes:
|
||||
target_node:
|
||||
host: "192.168.1.184"
|
||||
user: testdev
|
||||
key: "/opt/work/testdev_key"
|
||||
client_node:
|
||||
host: "192.168.1.181"
|
||||
user: testdev
|
||||
key: "/opt/work/testdev_key"
|
||||
|
||||
phases:
|
||||
# Phase 1: Clean slate
|
||||
- name: setup
|
||||
actions:
|
||||
- action: kill_stale
|
||||
node: target_node
|
||||
- action: kill_stale
|
||||
node: client_node
|
||||
iscsi_cleanup: "true"
|
||||
- action: exec
|
||||
node: target_node
|
||||
cmd: "rm -rf /tmp/sw-b3m-master /tmp/sw-b3m-vs1 /tmp/sw-b3m-vs2"
|
||||
root: "true"
|
||||
|
||||
# Phase 2: Start cluster
|
||||
- name: start_cluster
|
||||
actions:
|
||||
- action: exec
|
||||
node: target_node
|
||||
cmd: "mkdir -p /tmp/sw-b3m-master /tmp/sw-b3m-vs1/blocks /tmp/sw-b3m-vs2/blocks"
|
||||
- action: start_weed_master
|
||||
node: target_node
|
||||
port: "9435"
|
||||
dir: "/tmp/sw-b3m-master"
|
||||
save_as: master_pid
|
||||
- action: wait_cluster_ready
|
||||
node: target_node
|
||||
master_url: "http://localhost:9435"
|
||||
timeout: 30s
|
||||
- action: start_weed_volume
|
||||
node: target_node
|
||||
port: "18192"
|
||||
master: "localhost:9435"
|
||||
dir: "/tmp/sw-b3m-vs1"
|
||||
extra_args: "-block.dir=/tmp/sw-b3m-vs1/blocks -block.listen=:3279 -ip=192.168.1.184"
|
||||
save_as: vs1_pid
|
||||
- action: start_weed_volume
|
||||
node: target_node
|
||||
port: "18193"
|
||||
master: "localhost:9435"
|
||||
dir: "/tmp/sw-b3m-vs2"
|
||||
extra_args: "-block.dir=/tmp/sw-b3m-vs2/blocks -block.listen=:3280 -ip=192.168.1.184"
|
||||
save_as: vs2_pid
|
||||
- action: wait_block_servers
|
||||
count: "2"
|
||||
timeout: 60s
|
||||
|
||||
# Phase 3: Create RF=2 volume, write data
|
||||
- name: create_and_write
|
||||
actions:
|
||||
- action: create_block_volume
|
||||
name: "promote-test"
|
||||
size: "50M"
|
||||
replica_factor: "2"
|
||||
save_as: vol_info
|
||||
# Wait for replica to confirm role via heartbeat.
|
||||
- action: sleep
|
||||
duration: 10s
|
||||
- action: lookup_block_volume
|
||||
name: "promote-test"
|
||||
save_as: initial
|
||||
- action: iscsi_login_direct
|
||||
node: client_node
|
||||
host: "{{ initial_iscsi_host }}"
|
||||
port: "{{ initial_iscsi_port }}"
|
||||
iqn: "{{ initial_iqn }}"
|
||||
save_as: device
|
||||
- action: dd_write
|
||||
node: client_node
|
||||
device: "{{ device }}"
|
||||
bs: 1M
|
||||
count: "2"
|
||||
seek: "3"
|
||||
save_as: md5_3M
|
||||
- action: dd_read_md5
|
||||
node: client_node
|
||||
device: "{{ device }}"
|
||||
bs: 1M
|
||||
count: "2"
|
||||
skip: "3"
|
||||
save_as: verify_3M
|
||||
- action: assert_equal
|
||||
actual: "{{ verify_3M }}"
|
||||
expected: "{{ md5_3M }}"
|
||||
|
||||
# Phase 4: Kill primary VS, then promote via API
|
||||
- name: kill_and_promote
|
||||
actions:
|
||||
- action: iscsi_cleanup
|
||||
node: client_node
|
||||
ignore_error: true
|
||||
# Crash-kill VS1 with SIGKILL to simulate a real crash.
|
||||
- action: exec
|
||||
node: target_node
|
||||
cmd: "kill -9 {{ vs1_pid }}"
|
||||
root: "true"
|
||||
# Wait for master to detect the disconnection.
|
||||
- action: sleep
|
||||
duration: 15s
|
||||
# Manual promote via the API.
|
||||
- action: block_promote
|
||||
name: "promote-test"
|
||||
reason: "T7 integration test: manual failover"
|
||||
save_as: promote_result
|
||||
- action: print
|
||||
msg: "promoted to {{ promote_result_server }} epoch={{ promote_result_epoch }}"
|
||||
|
||||
# Phase 5: Verify promoted state
|
||||
- name: verify_promoted
|
||||
actions:
|
||||
- action: lookup_block_volume
|
||||
name: "promote-test"
|
||||
save_as: after
|
||||
# Epoch should have advanced to 2 after the promotion (the new primary is not compared against the old one here).
|
||||
- action: assert_block_field
|
||||
name: "promote-test"
|
||||
field: "epoch"
|
||||
expected: "2"
|
||||
- action: block_status
|
||||
save_as: stats
|
||||
- action: print
|
||||
msg: "promotions_total={{ stats_promotions_total }}"
|
||||
|
||||
# Phase 6: Reconnect iSCSI to new primary, verify data
|
||||
- name: verify_data
|
||||
actions:
|
||||
- action: iscsi_login_direct
|
||||
node: client_node
|
||||
host: "{{ after_iscsi_host }}"
|
||||
port: "{{ after_iscsi_port }}"
|
||||
iqn: "{{ after_iqn }}"
|
||||
save_as: device2
|
||||
- action: dd_read_md5
|
||||
node: client_node
|
||||
device: "{{ device2 }}"
|
||||
bs: 1M
|
||||
count: "2"
|
||||
skip: "3"
|
||||
save_as: post_promote_md5
|
||||
- action: assert_equal
|
||||
actual: "{{ post_promote_md5 }}"
|
||||
expected: "{{ md5_3M }}"
|
||||
|
||||
# Cleanup (always runs)
|
||||
- name: cleanup
|
||||
always: true
|
||||
actions:
|
||||
- action: iscsi_cleanup
|
||||
node: client_node
|
||||
ignore_error: true
|
||||
- action: delete_block_volume
|
||||
name: "promote-test"
|
||||
ignore_error: true
|
||||
- action: stop_weed
|
||||
node: target_node
|
||||
pid: "{{ vs2_pid }}"
|
||||
ignore_error: true
|
||||
- action: stop_weed
|
||||
node: target_node
|
||||
pid: "{{ vs1_pid }}"
|
||||
ignore_error: true
|
||||
- action: stop_weed
|
||||
node: target_node
|
||||
pid: "{{ master_pid }}"
|
||||
ignore_error: true
|
||||
- action: exec
|
||||
node: target_node
|
||||
cmd: "rm -rf /tmp/sw-b3m-master /tmp/sw-b3m-vs1 /tmp/sw-b3m-vs2"
|
||||
root: "true"
|
||||
ignore_error: true
|
||||
Reference in New Issue
Block a user