From fb6d5368bb6dbbfa95cef397510d6097ffb34f49 Mon Sep 17 00:00:00 2001 From: Marcin Maliszkiewicz Date: Tue, 28 Apr 2026 11:16:07 +0200 Subject: [PATCH] Merge 'auth: fix shutdown and startup races in LDAP cache pruner' from Andrzej Jackowski MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The LDAP role manager's `_cache_pruner` background fiber periodically calls cache::reload_all_permissions(). Two races cause it to hit SCYLLA_ASSERT(_permission_loader): - Cross-shard race: The pruner `used _cache.container().invoke_on_all()` to reload permissions on every shard. Since both `service::start()` and `sharded::stop()` execute per-shard in parallel, the pruner on one shard could call reload_all_permissions() on another shard before that shard set its loader (startup) or after it cleared its loader (shutdown). Each shard runs its own pruner instance, so reloading locally is sufficient — this also removes redundant N² reload calls. - Intra-shard race: `service::stop()` cleared the permission loader and stopped the role manager concurrently (via when_all_succeed). A mid-reload pruner could yield and then call the now-null loader. Fixed by stopping the role manager first so the pruner is fully drained before the loader is cleared. Fixes SCYLLADB-1679 Backport to 2026.2, introduced in 7eedf50c12 Closes scylladb/scylladb#29605 * github.com:scylladb/scylladb: auth: make shutdown the exact reverse of startup test: ldap: add test for pruner crash during shutdown auth: start authorizer and set permission loader before role manager auth: stop role manager before clearing permission loader auth: reload LDAP permission cache on local shard only (cherry picked from commit b0f988afc4775505703008543383fba66d6c536f) Closes scylladb/scylladb#29681 --- auth/ldap_role_manager.cc | 12 +++++------ auth/service.cc | 32 +++++++++++++++++++--------- test/ldap/role_manager_test.cc | 39 ++++++++++++++++++++++++++++++++++ 3 files changed, 66 insertions(+), 17 deletions(-) diff --git a/auth/ldap_role_manager.cc b/auth/ldap_role_manager.cc index 75b992c6a8..edef3d84b7 100644 --- a/auth/ldap_role_manager.cc +++ b/auth/ldap_role_manager.cc @@ -258,13 +258,11 @@ future<> ldap_role_manager::start() { } catch (const seastar::sleep_aborted&) { co_return; // ignore } - co_await _cache.container().invoke_on_all([] (cache& c) -> future<> { - try { - co_await c.reload_all_permissions(); - } catch (...) { - mylog.warn("Cache reload all permissions failed: {}", std::current_exception()); - } - }); + try { + co_await _cache.reload_all_permissions(); + } catch (...) { + mylog.warn("Cache reload all permissions failed: {}", std::current_exception()); + } } }); return _std_mgr.start(); diff --git a/auth/service.cc b/auth/service.cc index 861342bd6e..94194892ba 100644 --- a/auth/service.cc +++ b/auth/service.cc @@ -157,15 +157,12 @@ future<> service::start(::service::migration_manager& mm, db::system_keyspace& s return create_legacy_keyspace_if_missing(mm); }); } - co_await _role_manager->start(); - if (this_shard_id() == 0) { - // Role manager and password authenticator have this odd startup - // mechanism where they asynchronously create the superuser role - // in the background. Correct password creation depends on role - // creation therefore we need to wait here. - co_await _role_manager->ensure_superuser_is_created(); - } - co_await when_all_succeed(_authorizer->start(), _authenticator->start()).discard_result(); + // Authorizer must be started before the permission loader is set, + // because the loader calls _authorizer->authorize(). + // The loader must be set before starting the role manager, because + // LDAP role manager starts a pruner fiber that calls + // reload_all_permissions() which asserts _permission_loader is set. + co_await _authorizer->start(); if (!_used_by_maintenance_socket) { // Maintenance socket mode can't cache permissions because it has // different authorizer. We can't mix cached permissions, they could be @@ -174,12 +171,27 @@ future<> service::start(::service::migration_manager& mm, db::system_keyspace& s &service::get_uncached_permissions, this, std::placeholders::_1, std::placeholders::_2)); } + co_await _role_manager->start(); + if (this_shard_id() == 0) { + // Role manager and password authenticator have this odd startup + // mechanism where they asynchronously create the superuser role + // in the background. Correct password creation depends on role + // creation therefore we need to wait here. + co_await _role_manager->ensure_superuser_is_created(); + } + // Authenticator must be started after ensure_superuser_is_created() + // because password_authenticator queries system.roles for the + // superuser entry created by the role manager. + co_await _authenticator->start(); } future<> service::stop() { _as.request_abort(); + // Reverse of start() order. + co_await _authenticator->stop(); + co_await _role_manager->stop(); _cache.set_permission_loader(nullptr); - return when_all_succeed(_role_manager->stop(), _authorizer->stop(), _authenticator->stop()).discard_result(); + co_await _authorizer->stop(); } future<> service::ensure_superuser_is_created() { diff --git a/test/ldap/role_manager_test.cc b/test/ldap/role_manager_test.cc index 1d8d57c572..1dd135a4ff 100644 --- a/test/ldap/role_manager_test.cc +++ b/test/ldap/role_manager_test.cc @@ -18,6 +18,7 @@ #include #include "test/lib/exception_utils.hh" +#include "test/lib/log.hh" #include "test/lib/test_utils.hh" #include "ldap_common.hh" #include "service/migration_manager.hh" @@ -681,3 +682,41 @@ SEASTAR_TEST_CASE(ldap_config) { }, make_ldap_config()); } + +// Reproduces the race between the cache pruner and the permission +// loader lifecycle during shutdown. Refs SCYLLADB-1679. +SEASTAR_TEST_CASE(ldap_pruner_no_crash_after_loader_cleared) { + auto cfg = make_ldap_config(); + cfg->permissions_update_interval_in_ms.set(1); + + auto call_count = seastar::make_lw_shared(0); + + co_await do_with_cql_env_thread([call_count](cql_test_env& env) { + auto& cache = env.auth_cache().local(); + + testlog.info("Populating 50 cache entries"); + for (int i = 0; i < 50; i++) { + auto r = auth::make_data_resource("system", fmt::format("t{}", i)); + cache.get_permissions(auth::role_or_anonymous(), r).get(); + } + + testlog.info("Installing slow permission loader (10ms per call)"); + cache.set_permission_loader( + [call_count] (const auth::role_or_anonymous&, const auth::resource&) + -> seastar::future { + ++(*call_count); + co_await seastar::sleep(std::chrono::milliseconds(10)); + co_return auth::permission_set(); + }); + + testlog.info("Waiting for pruner to start reloading"); + while (*call_count == 0) { + seastar::sleep(std::chrono::milliseconds(1)).get(); + } + + testlog.info("Pruner started, letting teardown run"); + }, cfg); + + testlog.info("Loader called {} times", *call_count); +} +