fix: CP13-2 — use advertisedIP (routable), not localServerID (opaque)

Bug: setupReplicaReceiver derived the advertised host from localServerID,
which can be an opaque string (from -id flag, e.g., "my-custom-server-id").
This would publish unusable endpoints like "my-custom-server-id:14260".

Fix:
- volume_server_block.go: add advertisedIP field (always a real IP from
  -ip flag), use it instead of localServerID for replica canonicalization
- volume.go: wire *v.ip → blockService.SetAdvertisedIP() at startup
- blockvol.go: StartReplicaReceiver variadic advertisedHost unchanged

Proof (sync_all_bug_test.go TestBug3, 4 sub-cases):
- fallback: wildcard bind without advertisedHost → outbound-IP
- advertisedHost: explicit IP appears in exported addresses
- StartReplicaReceiver_API: public API forwards host correctly
- opaque_identity_not_routable: proves opaque string produces
  non-routable address, confirming production must use advertisedIP

Identity vs transport separation preserved:
- localServerID: stable identity for V2 control (may be opaque)
- advertisedIP: routable IP for transport endpoints (always real IP)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
pingqiu
2026-04-02 20:51:47 -07:00
parent 2d47383df7
commit 4bdf6c604e
4 changed files with 51 additions and 12 deletions

View File

@@ -347,6 +347,7 @@ func (v VolumeServerOptions) startVolumeServer(volumeFolders, maxVolumeCounts, v
// volume server. One identity truth across VS, block service, control
// bridge, and runtime ownership.
blockService.SetServerID(volumeServerId)
blockService.SetAdvertisedIP(*v.ip) // CP13-2: routable IP, not opaque ID
volumeServer.SetBlockService(blockService)
}

View File

@@ -84,6 +84,7 @@ func newSoakSetup(t *testing.T) *soakSetup {
blockDir: filepath.Join(dir, "vs1_9333"),
listenAddr: "127.0.0.1:3260",
localServerID: "vs1:9333",
advertisedIP: "127.0.0.1",
v2Bridge: v2bridge.NewControlBridge(),
v2Orchestrator: engine.NewRecoveryOrchestrator(),
replStates: make(map[string]*volReplState),

View File

@@ -59,9 +59,14 @@ type BlockService struct {
lastAssignMu sync.RWMutex
lastAssign map[string]lastAppliedAssignment
// localServerID: stable identity for this volume server.
// INTERIM: uses listenAddr (transport-shaped). Should be replaced
// with a registry-assigned stable server ID in a later hardening pass.
// May be an opaque string (from -id flag) or ip:port (default fallback).
// NOT guaranteed to be a routable address — do not use for transport endpoints.
localServerID string
// advertisedIP: routable IP for this volume server (from -ip flag or auto-detected).
// Used by CP13-2 to canonicalize wildcard-bind replica listener addresses to
// routable ip:port. This is always a real IP, never an opaque identity string.
advertisedIP string
}
// V2Orchestrator returns the V2 engine orchestrator for inspection/testing.
@@ -70,12 +75,18 @@ func (bs *BlockService) V2Orchestrator() *engine.RecoveryOrchestrator {
}
// SetServerID sets the stable server identity for V2 control semantics.
// Should be called with the gRPC address that the master knows this VS by,
// replacing the interim listenAddr-based identity.
// This may be an opaque string (from -id flag) — not guaranteed routable.
func (bs *BlockService) SetServerID(id string) {
bs.localServerID = id
}
// SetAdvertisedIP sets the routable IP for replica endpoint canonicalization.
// This must be a real IP address (from -ip flag or auto-detected), never an
// opaque server identity. Called at startup from volume.go.
func (bs *BlockService) SetAdvertisedIP(ip string) {
bs.advertisedIP = ip
}
// WireStateChangeNotify sets up shipper state change callbacks on all
// registered volumes so that degradation/recovery triggers an immediate
// heartbeat via the provided channel. Non-blocking send (buffered chan 1).
@@ -525,16 +536,19 @@ func (bs *BlockService) setupPrimaryReplicationMulti(path string, addrs []blockv
// setupReplicaReceiver starts the replica WAL receiver.
func (bs *BlockService) setupReplicaReceiver(path, dataAddr, ctrlAddr string) {
// CP13-2: Pass the canonical host from localServerID so wildcard-bind
// listeners resolve to the authoritative routable IP, not outbound-IP fallback.
advertisedHost := bs.localServerID
if idx := strings.LastIndex(advertisedHost, ":"); idx >= 0 {
advertisedHost = advertisedHost[:idx]
}
// CP13-2: Pass the routable advertisedIP (from -ip flag, NOT from -id/serverID)
// so wildcard-bind listeners resolve to a real IP, not an opaque identity string.
var canonDataAddr, canonCtrlAddr string
advIP := bs.advertisedIP
if err := bs.blockStore.WithVolume(path, func(vol *blockvol.BlockVol) error {
if err := vol.StartReplicaReceiver(dataAddr, ctrlAddr, advertisedHost); err != nil {
return err
if advIP != "" {
if err := vol.StartReplicaReceiver(dataAddr, ctrlAddr, advIP); err != nil {
return err
}
} else {
if err := vol.StartReplicaReceiver(dataAddr, ctrlAddr); err != nil {
return err
}
}
// Read back canonical addresses from the receiver.
if vol.ReplicaReceiverAddr() != nil {

View File

@@ -104,6 +104,29 @@ func TestBug3_ReplicaAddr_MustBeIPPort_WildcardBind(t *testing.T) {
t.Fatalf("CtrlAddr %q does not use advertisedHost 10.0.0.99", info.CtrlAddr)
}
})
// Case 4: opaque non-IP advertisedHost must NOT produce a usable routable address.
// CP13-2: this proves that passing an opaque server identity (e.g., from -id flag)
// as the advertised host would produce a non-routable address. The production path
// uses advertisedIP (from -ip flag), not localServerID, to prevent this.
t.Run("opaque_identity_not_routable", func(t *testing.T) {
vol3 := createTestVol(t)
defer vol3.Close()
// Pass an opaque identity string — NOT an IP address.
if err := vol3.StartReplicaReceiver(":0", ":0", "my-custom-server-id"); err != nil {
t.Fatal(err)
}
info := vol3.ReplicaReceiverAddr()
if info == nil {
t.Fatal("ReplicaReceiverAddr() returned nil")
}
// The address will be "my-custom-server-id:port" — NOT parseable as a routable IP.
ip := net.ParseIP(strings.Split(info.DataAddr, ":")[0])
if ip != nil {
t.Fatalf("opaque identity should NOT produce a parseable IP, got %s", info.DataAddr)
}
t.Logf("correctly non-routable: %s (production uses advertisedIP, not serverID)", info.DataAddr)
})
}
// --- Bug 2: LSN gap after shipper degradation ---