mirror of
https://github.com/seaweedfs/seaweedfs.git
synced 2026-05-23 18:21:28 +00:00
feat: CP13-2 — canonical replica addressing on production truth surface
Problem: StartReplicaReceiver didn't forward advertisedHost to NewReplicaReceiver, so wildcard-bind listeners relied on outbound-IP fallback for canonicalization. On multi-NIC hosts this could select the wrong interface, leaking non-routable addresses into replication truth. Fix: - blockvol.go: StartReplicaReceiver now accepts optional advertisedHost variadic param and forwards it to NewReplicaReceiver - volume_server_block.go: setupReplicaReceiver extracts host from localServerID (the canonical VS identity) and passes it as advertisedHost — wildcard-bind addresses now resolve to the authoritative server IP, not outbound-IP fallback Proof (sync_all_bug_test.go TestBug3, upgraded from PASS* to PASS): - fallback: wildcard bind without advertisedHost still produces ip:port - advertisedHost: explicit host appears in exported DataAddr/CtrlAddr - StartReplicaReceiver_API: public API forwards advertisedHost correctly What CP13-2 does NOT change: - No reconnect handshake changes (CP13-5) - No retention policy changes (CP13-6) - No rebuild behavior changes (CP13-7) - No barrier protocol changes (CP13-3) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -525,11 +525,15 @@ func (bs *BlockService) setupPrimaryReplicationMulti(path string, addrs []blockv
|
||||
|
||||
// setupReplicaReceiver starts the replica WAL receiver.
|
||||
func (bs *BlockService) setupReplicaReceiver(path, dataAddr, ctrlAddr string) {
|
||||
// Store canonical addresses from the receiver (not raw assignment addresses).
|
||||
// The receiver canonicalizes wildcard ":port" to "ip:port" via CP13-2.
|
||||
// CP13-2: Pass the canonical host from localServerID so wildcard-bind
|
||||
// listeners resolve to the authoritative routable IP, not outbound-IP fallback.
|
||||
advertisedHost := bs.localServerID
|
||||
if idx := strings.LastIndex(advertisedHost, ":"); idx >= 0 {
|
||||
advertisedHost = advertisedHost[:idx]
|
||||
}
|
||||
var canonDataAddr, canonCtrlAddr string
|
||||
if err := bs.blockStore.WithVolume(path, func(vol *blockvol.BlockVol) error {
|
||||
if err := vol.StartReplicaReceiver(dataAddr, ctrlAddr); err != nil {
|
||||
if err := vol.StartReplicaReceiver(dataAddr, ctrlAddr, advertisedHost); err != nil {
|
||||
return err
|
||||
}
|
||||
// Read back canonical addresses from the receiver.
|
||||
|
||||
@@ -1265,8 +1265,12 @@ func (v *BlockVol) ReplicaReceiverAddr() *ReplicaReceiverAddrInfo {
|
||||
}
|
||||
|
||||
// StartReplicaReceiver starts listening for replicated WAL entries from a primary.
|
||||
func (v *BlockVol) StartReplicaReceiver(dataAddr, ctrlAddr string) error {
|
||||
recv, err := NewReplicaReceiver(v, dataAddr, ctrlAddr)
|
||||
// advertisedHost is the canonical routable IP for this server (e.g., from VS
|
||||
// server ID). If provided, wildcard-bind listener addresses are resolved to
|
||||
// advertisedHost:port instead of relying on outbound-IP fallback. On multi-NIC
|
||||
// hosts, always provide advertisedHost to ensure cross-machine reachability.
|
||||
func (v *BlockVol) StartReplicaReceiver(dataAddr, ctrlAddr string, advertisedHost ...string) error {
|
||||
recv, err := NewReplicaReceiver(v, dataAddr, ctrlAddr, advertisedHost...)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
@@ -10,6 +10,7 @@ package blockvol
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"net"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"testing"
|
||||
@@ -18,47 +19,91 @@ import (
|
||||
|
||||
// --- Bug 3: Address resolution ---
|
||||
|
||||
// TestBug3_ReplicaAddr_MustBeIPPort verifies that ReplicaReceiver.DataAddr()
|
||||
// and CtrlAddr() return ip:port, not :port.
|
||||
// Current code returns ":port" from listener.Addr().String() which fails cross-machine.
|
||||
// TestBug3_ReplicaAddr_MustBeIPPort_WildcardBind verifies that when
|
||||
// ReplicaReceiver binds to ":0" (wildcard — the production default),
|
||||
// DataAddr()/CtrlAddr() still return ip:port, not ":port".
|
||||
// In production, the VS uses ":0" to let the OS pick a port.
|
||||
// The address is then sent to the primary via heartbeat/assignment.
|
||||
// If it's ":port", the primary dials localhost — fails cross-machine.
|
||||
// DataAddr()/CtrlAddr() return canonical ip:port, not ":port" or wildcard.
|
||||
//
|
||||
// CP13-2 proof: this is a real proof, not witness coverage.
|
||||
// The test verifies both the fallback path (no advertisedHost) and the
|
||||
// explicit advertisedHost path (production: host extracted from serverID).
|
||||
func TestBug3_ReplicaAddr_MustBeIPPort_WildcardBind(t *testing.T) {
|
||||
vol := createTestVol(t)
|
||||
defer vol.Close()
|
||||
|
||||
// Bind to ":0" — this is what production code does.
|
||||
recv, err := NewReplicaReceiver(vol, ":0", ":0")
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
defer recv.Stop()
|
||||
|
||||
dataAddr := recv.DataAddr()
|
||||
ctrlAddr := recv.CtrlAddr()
|
||||
|
||||
// Addresses MUST be dialable from a remote machine:
|
||||
// - not ":port" (missing IP entirely)
|
||||
// - not "0.0.0.0:port" (wildcard, not routable)
|
||||
// - not "[::]:port" (IPv6 wildcard, not routable)
|
||||
for _, addr := range []struct{ name, val string }{
|
||||
{"DataAddr", dataAddr},
|
||||
{"CtrlAddr", ctrlAddr},
|
||||
} {
|
||||
if strings.HasPrefix(addr.val, ":") {
|
||||
t.Fatalf("%s() returned %q — missing IP", addr.name, addr.val)
|
||||
assertCanonical := func(t *testing.T, name, addr string) {
|
||||
t.Helper()
|
||||
if strings.HasPrefix(addr, ":") {
|
||||
t.Fatalf("%s returned %q — missing IP", name, addr)
|
||||
}
|
||||
if strings.HasPrefix(addr.val, "0.0.0.0:") {
|
||||
t.Fatalf("%s() returned %q — wildcard, not routable cross-machine", addr.name, addr.val)
|
||||
if strings.HasPrefix(addr, "0.0.0.0:") {
|
||||
t.Fatalf("%s returned %q — wildcard, not routable cross-machine", name, addr)
|
||||
}
|
||||
if strings.HasPrefix(addr.val, "[::]:") {
|
||||
t.Fatalf("%s() returned %q — IPv6 wildcard, not routable cross-machine", addr.name, addr.val)
|
||||
if strings.HasPrefix(addr, "[::]:") {
|
||||
t.Fatalf("%s returned %q — IPv6 wildcard, not routable cross-machine", name, addr)
|
||||
}
|
||||
host, port, err := net.SplitHostPort(addr)
|
||||
if err != nil {
|
||||
t.Fatalf("%s returned %q — not valid host:port: %v", name, addr, err)
|
||||
}
|
||||
if host == "" {
|
||||
t.Fatalf("%s returned %q — empty host", name, addr)
|
||||
}
|
||||
if port == "" || port == "0" {
|
||||
t.Fatalf("%s returned %q — invalid port", name, addr)
|
||||
}
|
||||
}
|
||||
|
||||
// Case 1: wildcard bind with no advertisedHost (fallback to outbound IP).
|
||||
t.Run("fallback", func(t *testing.T) {
|
||||
recv, err := NewReplicaReceiver(vol, ":0", ":0")
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
defer recv.Stop()
|
||||
assertCanonical(t, "DataAddr", recv.DataAddr())
|
||||
assertCanonical(t, "CtrlAddr", recv.CtrlAddr())
|
||||
})
|
||||
|
||||
// Case 2: wildcard bind with explicit advertisedHost (production path).
|
||||
// CP13-2: this is the authoritative path — serverID host is forwarded
|
||||
// through StartReplicaReceiver → NewReplicaReceiver → canonicalizeListenerAddr.
|
||||
t.Run("advertisedHost", func(t *testing.T) {
|
||||
recv, err := NewReplicaReceiver(vol, ":0", ":0", "10.0.0.42")
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
defer recv.Stop()
|
||||
assertCanonical(t, "DataAddr", recv.DataAddr())
|
||||
assertCanonical(t, "CtrlAddr", recv.CtrlAddr())
|
||||
// The advertised host must appear in the exported address.
|
||||
if !strings.HasPrefix(recv.DataAddr(), "10.0.0.42:") {
|
||||
t.Fatalf("DataAddr %q does not use advertisedHost 10.0.0.42", recv.DataAddr())
|
||||
}
|
||||
if !strings.HasPrefix(recv.CtrlAddr(), "10.0.0.42:") {
|
||||
t.Fatalf("CtrlAddr %q does not use advertisedHost 10.0.0.42", recv.CtrlAddr())
|
||||
}
|
||||
})
|
||||
|
||||
// Case 3: StartReplicaReceiver (the public API) with advertisedHost.
|
||||
t.Run("StartReplicaReceiver_API", func(t *testing.T) {
|
||||
vol2 := createTestVol(t)
|
||||
defer vol2.Close()
|
||||
if err := vol2.StartReplicaReceiver(":0", ":0", "10.0.0.99"); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
info := vol2.ReplicaReceiverAddr()
|
||||
if info == nil {
|
||||
t.Fatal("ReplicaReceiverAddr() returned nil")
|
||||
}
|
||||
assertCanonical(t, "DataAddr", info.DataAddr)
|
||||
assertCanonical(t, "CtrlAddr", info.CtrlAddr)
|
||||
if !strings.HasPrefix(info.DataAddr, "10.0.0.99:") {
|
||||
t.Fatalf("DataAddr %q does not use advertisedHost 10.0.0.99", info.DataAddr)
|
||||
}
|
||||
if !strings.HasPrefix(info.CtrlAddr, "10.0.0.99:") {
|
||||
t.Fatalf("CtrlAddr %q does not use advertisedHost 10.0.0.99", info.CtrlAddr)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
// --- Bug 2: LSN gap after shipper degradation ---
|
||||
|
||||
Reference in New Issue
Block a user