Revert #9440 — volume admin fail-closed gate breaks multi-host clusters (#9472)

* Revert "volume: fail closed in admin gRPC gate when no whitelist is configured (#9440)"

This reverts commit 21054b6c18.

The fail-closed gate broke any multi-host cluster: in compose / k8s /
remote-host deployments the master's IP isn't loopback, so every
master->volume admin RPC (AllocateVolume, BatchDelete, EC reroute,
vacuum, scrub, ...) is rejected with PermissionDenied unless the
operator manually configures -whiteList. The e2e workflow has been
failing since 10cc06333 with `not authorized: 172.18.0.2` on
AllocateVolume; downstream symptom is fio fsync EIO because zero
volumes can be grown.

The gate's intent was to lock down destructive admin tooling, but the
same RPCs are the master's normal mechanism for growing and managing
volumes. Reverting to restore cluster-internal operation; a narrower
re-do should distinguish operator/admin callers from the master peer
(e.g. trust IPs resolved from -master) before going back in.

* security: skip invalid CIDR in UpdateWhiteList so IsWhiteListed can't panic

The revert in the previous commit also rolled back an unrelated bug fix
that lived inside #9440: UpdateWhiteList logged on net.ParseCIDR error
but did not continue, so the nil *net.IPNet was stored in whiteListCIDR
and IsWhiteListed would panic dereferencing cidrnet.Contains(remote) on
the next gRPC admin check.

Restore the continue. Orthogonal to the fail-closed semantics this PR
is reverting.
This commit is contained in:
Chris Lu
2026-05-12 16:00:44 -07:00
committed by GitHub
parent f28c7ce6df
commit 43a8c4fdca
5 changed files with 27 additions and 263 deletions

View File

@@ -198,11 +198,5 @@ kek = ""
key = ""
# white list. It's checking request ip address.
# Destructive volume admin gRPCs (DeleteCollection, VolumeDelete,
# VolumeMarkReadonly, VolumeMarkWritable, VolumeUnmount, VolumeMount,
# VolumeServerLeave, AllocateVolume, VolumeConfigure, EC/vacuum) require a
# populated white_list. If left empty, the volume server rejects those RPCs
# from off-host peers (loopback callers are still trusted); either list the
# admin IPs/CIDRs here or pass them via the -whiteList CLI flag.
[guard]
white_list = ""

View File

@@ -36,13 +36,6 @@ Generating JWT:
2. optionally set "exp", "nbf" fields, in Unix time,
the number of seconds elapsed since January 1, 1970 UTC.
There are two whitelist check entry points with different defaults:
- IsWhiteListed: allow-all when the whitelist is empty. Kept for HTTP
compatibility (e.g. Guard.WhiteList middleware on read-mostly routes).
- IsAdminAuthorized: fail-closed when the whitelist is empty. Used to gate
destructive admin endpoints so an unconfigured deployment does not
accept them from arbitrary peers.
Referenced:
https://github.com/pkieltyka/jwtauth/blob/master/jwtauth.go
*/
@@ -115,10 +108,6 @@ func (g *Guard) checkWhiteList(w http.ResponseWriter, r *http.Request) error {
// IsWhiteListed returns true if the given host IP is allowed by the guard.
// When no whitelist is configured (security inactive), all hosts are allowed.
// This allow-all-when-empty behaviour is kept for HTTP compatibility: many
// non-write HTTP routes wrap themselves with Guard.WhiteList and must remain
// reachable on default deployments. Callers gating destructive endpoints must
// use IsAdminAuthorized instead.
func (g *Guard) IsWhiteListed(host string) bool {
if !g.isWriteActive {
return true
@@ -140,25 +129,6 @@ func (g *Guard) IsWhiteListed(host string) bool {
return false
}
// IsAdminAuthorized is the explicit, fail-closed equivalent of IsWhiteListed
// for destructive admin endpoints. Unlike IsWhiteListed, it returns false
// when no whitelist is configured, so callers must opt in by populating the
// whitelist (CLI -whiteList or guard.white_list in security.toml).
//
// Loopback peers are trusted even when the whitelist is empty: a volume
// server commonly cohabits with the master/filer on a single host (and on
// every integration-test cluster), and an in-process loopback caller is
// equivalent to local-host trust. Off-host callers remain denied.
func (g *Guard) IsAdminAuthorized(host string) bool {
if g.isEmptyWhiteList {
if ip := net.ParseIP(host); ip != nil && ip.IsLoopback() {
return true
}
return false
}
return g.IsWhiteListed(host)
}
func (g *Guard) UpdateWhiteList(whiteList []string) {
whiteListIp := make(map[string]struct{})
whiteListCIDR := make(map[string]*net.IPNet)

View File

@@ -2,11 +2,12 @@ package security
import "testing"
// TestIsWhiteListedEmptyConfigAllowsEverything pins the HTTP-side contract:
// when no whitelist and no signing key are configured, IsWhiteListed accepts
// any host. This is the allow-all-when-empty path that Guard.WhiteList
// middleware on read-mostly HTTP routes relies on; the destructive gRPC
// admin gate uses IsAdminAuthorized instead, which is fail-closed.
// TestIsWhiteListedEmptyConfigAllowsEverything pins the contract that the
// gRPC admin auth helper relies on: when no whitelist and no signing key
// are configured, IsWhiteListed accepts any host (including the empty
// string and unparseable peer addresses like the gRPC "@" passthrough
// form). Otherwise insecure default deployments would lock themselves out
// of every destructive admin RPC.
func TestIsWhiteListedEmptyConfigAllowsEverything(t *testing.T) {
g := NewGuard(nil, "", 0, "", 0)
for _, host := range []string{"", "@", "127.0.0.1", "::1", "garbage:value"} {
@@ -37,9 +38,9 @@ func TestIsWhiteListedWithListRejectsUnknown(t *testing.T) {
}
func TestIsWhiteListedWithSigningKeyButNoWhitelistAllowsAll(t *testing.T) {
// JWT-only mode: signing key set, whitelist empty. IsWhiteListed must
// still accept every host because read-side HTTP middleware piggybacks
// on it; admin gating uses IsAdminAuthorized, which is fail-closed.
// JWT-only mode: signing key set, whitelist empty. Without this branch
// the gRPC admin auth helper would falsely deny in-process callers when
// the operator wires up signing keys but hasn't enumerated IPs.
g := NewGuard(nil, "deadbeef", 0, "", 0)
for _, host := range []string{"", "@", "127.0.0.1"} {
if !g.IsWhiteListed(host) {
@@ -47,81 +48,3 @@ func TestIsWhiteListedWithSigningKeyButNoWhitelistAllowsAll(t *testing.T) {
}
}
}
// TestIsAdminAuthorizedEmptyWhitelistDeniesNonLoopback pins the fail-closed
// contract the volume gRPC admin gate depends on: with no whitelist
// configured every off-host peer is rejected, even when a signing key is
// set. Loopback peers are still trusted (see
// TestIsAdminAuthorizedEmptyWhitelistAllowsLoopback) because a volume server
// commonly cohabits with master/filer on a single host.
func TestIsAdminAuthorizedEmptyWhitelistDeniesNonLoopback(t *testing.T) {
cases := []*Guard{
NewGuard(nil, "", 0, "", 0),
NewGuard(nil, "deadbeef", 0, "", 0),
NewGuard([]string{}, "", 0, "", 0),
}
for i, g := range cases {
for _, host := range []string{"", "@", "10.0.0.1", "192.168.0.5", "8.8.8.8"} {
if g.IsAdminAuthorized(host) {
t.Errorf("case %d: empty whitelist should deny host=%q", i, host)
}
}
}
}
// TestIsAdminAuthorizedEmptyWhitelistAllowsLoopback pins the loopback
// exception: when no whitelist is configured, in-process callers from
// 127.0.0.0/8 or ::1 are still authorized. Without this, single-host
// clusters and integration tests (which never set -whiteList) would lose
// every admin RPC the moment fail-closed semantics kicked in.
func TestIsAdminAuthorizedEmptyWhitelistAllowsLoopback(t *testing.T) {
cases := []*Guard{
NewGuard(nil, "", 0, "", 0),
NewGuard(nil, "deadbeef", 0, "", 0),
NewGuard([]string{}, "", 0, "", 0),
}
for i, g := range cases {
for _, host := range []string{"127.0.0.1", "127.0.0.5", "::1"} {
if !g.IsAdminAuthorized(host) {
t.Errorf("case %d: empty whitelist should still allow loopback host=%q", i, host)
}
}
}
}
func TestIsAdminAuthorizedWithIPList(t *testing.T) {
g := NewGuard([]string{"10.0.0.1"}, "", 0, "", 0)
cases := []struct {
host string
want bool
}{
{"10.0.0.1", true},
{"10.0.0.2", false},
{"", false},
{"@", false},
}
for _, tc := range cases {
if got := g.IsAdminAuthorized(tc.host); got != tc.want {
t.Errorf("IsAdminAuthorized(%q) = %v want %v", tc.host, got, tc.want)
}
}
}
func TestIsAdminAuthorizedWithCIDR(t *testing.T) {
g := NewGuard([]string{"10.0.0.0/24"}, "", 0, "", 0)
cases := []struct {
host string
want bool
}{
{"10.0.0.5", true},
{"10.0.0.255", true},
{"192.168.0.1", false},
{"10.0.1.1", false},
{"", false},
}
for _, tc := range cases {
if got := g.IsAdminAuthorized(tc.host); got != tc.want {
t.Errorf("IsAdminAuthorized(%q) = %v want %v", tc.host, got, tc.want)
}
}
}

View File

@@ -30,27 +30,17 @@ import (
// checkGrpcAdminAuth verifies the gRPC caller is authorized for destructive
// admin operations by checking the peer address against the guard's whitelist.
//
// Only TCP peers go through the host check. Any non-TCP peer (in-process
// bufconn, unix socket, etc.) is treated as trusted because such callers
// share the same OS process or host filesystem as the volume server and
// cannot be spoofed by a remote attacker. This matters when `weed server`
// runs master+volume+filer in a single process and the master invokes
// AllocateVolume through an in-process gRPC dial: the peer address surfaces
// as "@" rather than 127.0.0.1, which has no parseable IP.
// IP extraction prefers a typed *net.TCPAddr where available, falling back to
// SplitHostPort on the string form, then to the raw string. The fallback
// chain matters because in-process/passthrough connections used in tests
// surface as unparseable strings like "@"; with an empty whitelist the
// allow-all branch in IsWhiteListed accepts them, with a whitelist they're
// denied as expected.
//
// The whitelist check uses Guard.IsAdminAuthorized, which is fail-closed: a
// volume server with a configured guard but empty whitelist still rejects
// destructive admin RPCs from off-host TCP peers (loopback TCP callers are
// trusted, see IsAdminAuthorized). Operators must set -whiteList on the CLI
// or guard.white_list in security.toml to allow remote admin peers. Failed
// attempts are logged so misconfigured callers and probe attempts are
// visible.
// Failed authorization attempts are logged so an operator running with a
// configured whitelist can spot misconfigured callers and probe attempts.
func (vs *VolumeServer) checkGrpcAdminAuth(ctx context.Context) error {
if vs.guard == nil {
// No guard wired up at all — this is a developmental / embedded path
// (e.g. unit tests). Skip the gate; the real protection lives in
// IsAdminAuthorized, which only takes effect once a guard exists.
glog.V(1).Infof("gRPC admin auth: no guard configured; set -whiteList or guard.white_list in security.toml to enforce")
return nil
}
pr, ok := peer.FromContext(ctx)
@@ -60,15 +50,17 @@ func (vs *VolumeServer) checkGrpcAdminAuth(ctx context.Context) error {
glog.V(0).Infof("gRPC admin auth failed: no peer info")
return status.Error(codes.PermissionDenied, "no peer info")
}
tcpAddr, isTCP := pr.Addr.(*net.TCPAddr)
if !isTCP {
// In-process / bufconn / unix-socket peer: same OS process as us, so
// trusted by construction. A remote attacker cannot reach this path.
return nil
addr := pr.Addr.String()
var host string
if tcpAddr, ok := pr.Addr.(*net.TCPAddr); ok {
host = tcpAddr.IP.String()
} else if h, _, splitErr := net.SplitHostPort(addr); splitErr == nil {
host = h
} else {
host = addr
}
host := tcpAddr.IP.String()
if !vs.guard.IsAdminAuthorized(host) {
glog.V(0).Infof("gRPC admin auth failed: %s is not authorized (remote: %s)", host, pr.Addr.String())
if !vs.guard.IsWhiteListed(host) {
glog.V(0).Infof("gRPC admin auth failed: %s is not whitelisted (remote: %s)", host, addr)
return status.Errorf(codes.PermissionDenied, "not authorized: %s", host)
}
return nil

View File

@@ -1,115 +0,0 @@
package weed_server
import (
"context"
"net"
"testing"
"google.golang.org/grpc/codes"
"google.golang.org/grpc/peer"
"google.golang.org/grpc/status"
"github.com/seaweedfs/seaweedfs/weed/security"
)
// fakeAddr is a net.Addr that is not *net.TCPAddr. We use it to fake a
// gRPC peer for the in-process / bufconn / unix-socket case.
type fakeAddr struct{ s string }
func (a fakeAddr) Network() string { return "fake" }
func (a fakeAddr) String() string { return a.s }
// TestCheckGrpcAdminAuthNonTCPPeerTrusted pins the in-process trust contract.
// When `weed server` runs master+volume+filer in the same process, the master
// dials the volume server via an in-process / bufconn / unix-socket transport.
// The peer Addr is not a *net.TCPAddr (often surfaces as "@"), so we cannot
// extract an IP — but the caller is in the same OS process, so we trust it
// unconditionally. Without this, an empty-whitelist fail-closed guard would
// break every embedded-cluster deployment.
func TestCheckGrpcAdminAuthNonTCPPeerTrusted(t *testing.T) {
vs := &VolumeServer{guard: security.NewGuard(nil, "", 0, "", 0)}
cases := []net.Addr{
fakeAddr{s: "@"},
fakeAddr{s: ""},
fakeAddr{s: "bufconn"},
&net.UnixAddr{Name: "/tmp/weed.sock", Net: "unix"},
}
for _, addr := range cases {
ctx := peer.NewContext(context.Background(), &peer.Peer{Addr: addr})
if err := vs.checkGrpcAdminAuth(ctx); err != nil {
t.Errorf("non-TCP peer %T(%q) should be trusted, got %v", addr, addr.String(), err)
}
}
}
// TestCheckGrpcAdminAuthTCPLoopbackTrusted covers the common single-host
// deployment where a separate `weed` process on the same host dials over
// real TCP. With no whitelist configured, loopback peers are still allowed
// (delegated to Guard.IsAdminAuthorized).
func TestCheckGrpcAdminAuthTCPLoopbackTrusted(t *testing.T) {
vs := &VolumeServer{guard: security.NewGuard(nil, "", 0, "", 0)}
for _, ip := range []string{"127.0.0.1", "127.0.0.5", "::1"} {
ctx := peer.NewContext(context.Background(), &peer.Peer{
Addr: &net.TCPAddr{IP: net.ParseIP(ip), Port: 50000},
})
if err := vs.checkGrpcAdminAuth(ctx); err != nil {
t.Errorf("loopback TCP peer %s should be trusted, got %v", ip, err)
}
}
}
// TestCheckGrpcAdminAuthTCPOffHostDenied is the fail-closed half of the
// contract: an off-host TCP peer with no matching whitelist entry is denied
// with PermissionDenied.
func TestCheckGrpcAdminAuthTCPOffHostDenied(t *testing.T) {
vs := &VolumeServer{guard: security.NewGuard(nil, "", 0, "", 0)}
ctx := peer.NewContext(context.Background(), &peer.Peer{
Addr: &net.TCPAddr{IP: net.ParseIP("10.0.0.5"), Port: 50000},
})
err := vs.checkGrpcAdminAuth(ctx)
if err == nil {
t.Fatalf("off-host TCP peer with empty whitelist should be denied")
}
if got, want := status.Code(err), codes.PermissionDenied; got != want {
t.Errorf("status code = %v want %v", got, want)
}
}
// TestCheckGrpcAdminAuthTCPWhitelistedAllowed verifies that an off-host TCP
// peer present in the configured whitelist is allowed through.
func TestCheckGrpcAdminAuthTCPWhitelistedAllowed(t *testing.T) {
vs := &VolumeServer{guard: security.NewGuard([]string{"10.0.0.5"}, "", 0, "", 0)}
ctx := peer.NewContext(context.Background(), &peer.Peer{
Addr: &net.TCPAddr{IP: net.ParseIP("10.0.0.5"), Port: 50000},
})
if err := vs.checkGrpcAdminAuth(ctx); err != nil {
t.Errorf("whitelisted TCP peer should be allowed, got %v", err)
}
}
// TestCheckGrpcAdminAuthNoPeerInfoDenied confirms we still deny when gRPC
// gives us no peer info at all (should not happen with real transports but
// we cannot prove a positive identity, so we refuse).
func TestCheckGrpcAdminAuthNoPeerInfoDenied(t *testing.T) {
vs := &VolumeServer{guard: security.NewGuard(nil, "", 0, "", 0)}
err := vs.checkGrpcAdminAuth(context.Background())
if err == nil {
t.Fatalf("missing peer info should be denied")
}
if got, want := status.Code(err), codes.PermissionDenied; got != want {
t.Errorf("status code = %v want %v", got, want)
}
}
// TestCheckGrpcAdminAuthNoGuardIsNoop preserves the developmental / embedded
// path: with no Guard wired up at all, the gate is a no-op regardless of
// peer information.
func TestCheckGrpcAdminAuthNoGuardIsNoop(t *testing.T) {
vs := &VolumeServer{guard: nil}
ctx := peer.NewContext(context.Background(), &peer.Peer{
Addr: &net.TCPAddr{IP: net.ParseIP("10.0.0.5"), Port: 50000},
})
if err := vs.checkGrpcAdminAuth(ctx); err != nil {
t.Errorf("no guard should be a no-op, got %v", err)
}
}