mirror of
https://github.com/seaweedfs/seaweedfs.git
synced 2026-05-22 17:51:30 +00:00
Acceptance rows closed: - WriteLBA/SyncCache contract: code comments document write-back vs durability fence semantics - RF=2 stable identity: v2bridge always uses SetReplicaAddrs (preserves ServerID); blockcmd dispatcher also fixed to use setupPrimaryReplicationMulti; test asserts exact expected ReplicaID="vs-2" (not just non-empty) - Tests treating WriteLBA as commit: replica_read_test rewritten with SyncCache as durability fence - publish_healthy contract: 3 gate tests with hard assertions including gate 3 (PrimaryShipperConnected) - SetReplicaAddr deprecation warning added - WALShipper.ReplicaID() getter added for identity verification Test runner enhancements: - sw-test-runner suite command: build → deploy → run N scenarios in one invocation with --skip-deploy support - Suite YAML definitions for T6 Stage 0 and Stage 1 - deploy action: kill stale processes, clean dirs, cross-compile, upload - run-phase20-t6.ps1 PowerShell script (deprecated by suite command) Engine/runtime fixes: - Recovery executor nil-safety improvements - Recovery bundle BuildRecoveryBundle defensive checks - ShipperGroup MinReplicaFlushedLSNAll surface Docs: acceptance checklist refined, test matrix updated, T6 runbook, engine maintainer tutorial, design README updated. 26 files changed, ~1600 insertions. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
104 lines
3.2 KiB
Go
104 lines
3.2 KiB
Go
package runtime
|
|
|
|
import (
|
|
"errors"
|
|
"strings"
|
|
|
|
engine "github.com/seaweedfs/seaweedfs/sw-block/engine/replication"
|
|
)
|
|
|
|
// RecoveryCallbacks is the host-side callback interface for recovery execution.
|
|
// The runtime helper drives plan execution; the host supplies concrete
|
|
// IO bindings and receives completion notifications.
|
|
type RecoveryCallbacks interface {
|
|
// OnRecoveryProgress is called when replay/rebuild reaches an explicit
|
|
// achieved boundary but the recovery session is not yet fully closed.
|
|
OnRecoveryProgress(volumeID, replicaID string, achievedLSN uint64)
|
|
|
|
// OnCatchUpCompleted is called after successful catch-up execution.
|
|
OnCatchUpCompleted(volumeID, replicaID string, achievedLSN uint64)
|
|
|
|
// OnCatchUpFailed is called when catch-up execution fails with a
|
|
// classified reason that the host may need to surface into core events.
|
|
OnCatchUpFailed(volumeID, replicaID, reason string)
|
|
|
|
// OnRebuildCompleted is called after successful rebuild execution.
|
|
// The host should read the post-rebuild snapshot and emit the
|
|
// appropriate core event.
|
|
OnRebuildCompleted(volumeID, replicaID string, plan *engine.RecoveryPlan)
|
|
}
|
|
|
|
// ExecuteCatchUpPlan runs a catch-up plan using the supplied IO binding
|
|
// and notifies the host on completion. Returns an error if execution fails.
|
|
func ExecuteCatchUpPlan(
|
|
driver *engine.RecoveryDriver,
|
|
plan *engine.RecoveryPlan,
|
|
io engine.CatchUpIO,
|
|
volumeID string,
|
|
replicaID string,
|
|
callbacks RecoveryCallbacks,
|
|
) error {
|
|
exec := engine.NewCatchUpExecutor(driver, plan)
|
|
exec.IO = io
|
|
if err := exec.Execute(nil, 0); err != nil {
|
|
if callbacks != nil {
|
|
callbacks.OnCatchUpFailed(volumeID, replicaID, classifyCatchUpFailure(err))
|
|
}
|
|
return err
|
|
}
|
|
if callbacks != nil {
|
|
achievedLSN := plan.CatchUpTarget
|
|
if achievedLSN == 0 {
|
|
achievedLSN = plan.CatchUpStartLSN
|
|
}
|
|
callbacks.OnRecoveryProgress(volumeID, replicaID, achievedLSN)
|
|
callbacks.OnCatchUpCompleted(volumeID, replicaID, achievedLSN)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// ExecuteRebuildPlan runs a rebuild plan using the supplied IO binding
|
|
// and notifies the host on completion. Returns an error if execution fails.
|
|
func ExecuteRebuildPlan(
|
|
driver *engine.RecoveryDriver,
|
|
plan *engine.RecoveryPlan,
|
|
io engine.RebuildIO,
|
|
volumeID string,
|
|
replicaID string,
|
|
callbacks RecoveryCallbacks,
|
|
) error {
|
|
exec := engine.NewRebuildExecutor(driver, plan)
|
|
exec.IO = io
|
|
if err := exec.Execute(); err != nil {
|
|
return err
|
|
}
|
|
if callbacks != nil {
|
|
callbacks.OnRecoveryProgress(volumeID, replicaID, plan.RebuildTargetLSN)
|
|
callbacks.OnRebuildCompleted(volumeID, replicaID, plan)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func classifyCatchUpFailure(err error) string {
|
|
if err == nil {
|
|
return ""
|
|
}
|
|
msg := err.Error()
|
|
switch {
|
|
case errors.Is(err, engine.ErrTruncationUnsafe):
|
|
return "truncation_unsafe"
|
|
case strings.Contains(msg, "WAL recycled"):
|
|
return "retention_lost"
|
|
case strings.Contains(msg, "duration_exceeded"):
|
|
return "catchup_duration_exceeded"
|
|
case strings.Contains(msg, "progress_stalled"):
|
|
return "catchup_progress_stalled"
|
|
case strings.Contains(msg, "entries_limit_exceeded"):
|
|
return "catchup_entries_limit_exceeded"
|
|
case strings.Contains(msg, "budget violation"):
|
|
return "catchup_budget_exceeded"
|
|
default:
|
|
return ""
|
|
}
|
|
}
|