Merge 'auth: fix crash on ghost rows in role_permissions' from Marcin Maliszkiewicz

The auth cache crashes when it encounters rows in role_permissions that have a live row marker but no permissions column. These “ghost rows” were created by the now-removed auth v2 migration, which used INSERT (creating row markers) instead of UPDATE.

When permissions were later revoked, the row marker remained while the permissions column became null. An empty collection appears as null, since its lifetime is based only on its element's cells.

As a result, when the cache reloads and expects the permissions column to exist, it hits a missing_column exception.

The series removes dead code that was the primary crash site, adds has() guards to the remaining access paths, and includes a test reproducer.

Fixes https://scylladb.atlassian.net/browse/SCYLLADB-1816

Backport: all supported versions 2026.1, 2025.4, 2025.1

Closes scylladb/scylladb#29757

* github.com:scylladb/scylladb:
  test: add reproducer for auth cache crash on missing permissions column
  auth: tolerate missing permissions column in authorize()
  auth: add defensive has() guard for role_attributes value column
  auth: remove unused permissions field from cache role_record
This commit is contained in:
Piotr Dulikowski
2026-05-06 12:00:17 +02:00
4 changed files with 73 additions and 15 deletions

View File

@@ -185,24 +185,14 @@ future<lw_shared_ptr<cache::role_record>> cache::fetch_role(const role_name_t& r
static const sstring q = format("SELECT role, name, value FROM {}.{} WHERE role = ?", db::system_keyspace::NAME, ROLE_ATTRIBUTES_CF);
auto rs = co_await fetch(q);
for (const auto& r : *rs) {
if (!r.has("value")) {
continue;
}
rec->attributes[r.get_as<sstring>("name")] =
r.get_as<sstring>("value");
co_await coroutine::maybe_yield();
}
}
// permissions
{
static const sstring q = format("SELECT role, resource, permissions FROM {}.{} WHERE role = ?", db::system_keyspace::NAME, PERMISSIONS_CF);
auto rs = co_await fetch(q);
for (const auto& r : *rs) {
auto resource = r.get_as<sstring>("resource");
auto perms_strings = r.get_set<sstring>("permissions");
std::unordered_set<sstring> perms_set(perms_strings.begin(), perms_strings.end());
auto pset = permissions::from_strings(perms_set);
rec->permissions[std::move(resource)] = std::move(pset);
co_await coroutine::maybe_yield();
}
}
co_return rec;
}

View File

@@ -44,7 +44,6 @@ public:
std::unordered_set<role_name_t> members;
sstring salted_hash;
std::unordered_map<sstring, sstring, sstring_hash, sstring_eq> attributes;
std::unordered_map<sstring, permission_set, sstring_hash, sstring_eq> permissions;
private:
friend cache;
// cached permissions include effects of role's inheritance

View File

@@ -76,7 +76,11 @@ default_authorizer::authorize(const role_or_anonymous& maybe_role, const resourc
if (results->empty()) {
co_return permissions::NONE;
}
co_return permissions::from_strings(results->one().get_set<sstring>(PERMISSIONS_NAME));
const auto& row = results->one();
if (!row.has(PERMISSIONS_NAME)) {
co_return permissions::NONE;
}
co_return permissions::from_strings(row.get_set<sstring>(PERMISSIONS_NAME));
}
future<>

View File

@@ -0,0 +1,65 @@
#
# Copyright (C) 2026-present ScyllaDB
#
# SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.1
#
import pytest
import logging
from test.pylib.manager_client import ManagerClient
from test.cluster.auth_cluster import extra_scylla_config_options as auth_config
logger = logging.getLogger(__name__)
@pytest.mark.asyncio
async def test_permissions_removal_and_restart(manager: ManagerClient) -> None:
"""Test that a node boots successfully when role_permissions contains a
ghost row with role and resource set but the permissions column missing.
The auth v2 migration (now removed) used INSERT to copy permission rows
from the legacy table, which created CQL row markers. Normal GRANT uses
UPDATE, which only writes collection cells without row markers. When
permissions were later revoked, the collection cells were tombstoned but
the row marker from the migration INSERT persisted. That leaves a row
with role and resource but no permissions column.
This test simulates that scenario:
1. INSERT permissions with row marker (simulating auth v2 migration)
2. REVOKE ALL permissions (tombstones the cells, marker survives)
3. Restart and verify the node boots successfully
"""
servers = await manager.servers_add(1, config=auth_config)
cql, _ = await manager.get_ready_cql(servers)
server = servers[0]
await cql.run_async("CREATE ROLE scylla_admin WITH PASSWORD = 'x' AND LOGIN = true")
await cql.run_async("CREATE ROLE scylla_manager WITH PASSWORD = 'x' AND LOGIN = true")
# Simulate auth v2 migration: INSERT creates a row marker alongside the
# permission cells, unlike GRANT which uses UPDATE (no row marker).
await cql.run_async(
"INSERT INTO system.role_permissions (role, resource, permissions) "
"VALUES ('scylla_admin', 'roles/scylla_manager', {'ALTER', 'AUTHORIZE', 'DROP'})")
# Revoke all permissions — tombstones the collection cells, but the
# row marker from the INSERT survives, creating a ghost row.
await cql.run_async("REVOKE ALL ON ROLE scylla_manager FROM scylla_admin")
# Additional check: a row with an explicitly empty permissions set.
await cql.run_async("CREATE ROLE test_empty_perms WITH PASSWORD = 'x' AND LOGIN = true")
await cql.run_async(
"INSERT INTO system.role_permissions (role, resource) "
"VALUES ('test_empty_perms', 'roles/scylla_manager')")
# Restart — the auth cache loads the ghost row and must not crash
logger.info("Restarting node")
await manager.server_stop_gracefully(server.server_id)
await manager.server_start(server.server_id)
await manager.driver_connect()
cql, _ = await manager.get_ready_cql(servers)
rows = await cql.run_async("SELECT * FROM system.local")
assert len(rows) == 1, "Node should be functional after restart"
logger.info("Node restarted successfully")