From c0f7622d12c13b34b24186c05abdb916e70872ce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dawid=20M=C4=99drek?= Date: Mon, 10 Nov 2025 18:41:35 +0100 Subject: [PATCH] service/qos: Do not crash Scylla if auth_integration absent If the user connects to Scylla via the maintenance socket, it may happen that `auth_integration` has not been registered in the service level controller yet. One example is maintenance mode when that will never happen; another when the connection occurs before Scylla is fully initialized. To avoid unnecessary crashes, we add new branches if the passed user is absent or if it corresponds to the anonymous role. Since the role corresponding to a connection via the maintenance socket is the anonymous role, that solves the problem. In those cases, we completely circumvent any calls to `auth_integration` and handle them separately. The modified methods are: * `get_user_scheduling_group`, * `with_user_service_level`, * `describe_service_levels`. For the first two, the new behavior is in line with the previous implementation of those functions. The last behaves differently now, but since it's a soft error, crashing the node is not necessary anyway. We throw an exception instead, whose error message should give the user a hint of what might be wrong. The other uses of `auth_integration` within the service level controller are not problematic: * `find_effective_service_level`, * `find_cached_effective_service_level`. They take the name of a role as their argument. Since the anonymous role doesn't have a name, it's not possible to call them with it. Fixes scylladb/scylladb#26816 --- docs/dev/service_levels.md | 15 +++++++++++++++ service/qos/service_level_controller.cc | 16 +++++++++++++++- service/qos/service_level_controller.hh | 12 ++++++++++++ 3 files changed, 42 insertions(+), 1 deletion(-) diff --git a/docs/dev/service_levels.md b/docs/dev/service_levels.md index 2840310095..9dc660d888 100644 --- a/docs/dev/service_levels.md +++ b/docs/dev/service_levels.md @@ -189,3 +189,18 @@ The command displays a table with: option name, effective service level the valu workload_type | sl2 | batch timeout | sl1 | 2s ``` + +## Implementation +### Integration with auth + +Service levels ultimately depend on the state of `auth`. Since `auth::service` is initialized long after +`service_level_controller`, we register it separately once it's started, and unregister it right before +it's stopped. For that, we wrap it in a struct called `auth_integration` that manages access to it. +That ensures that `service_level_controller` will not try to reference it beyond its lifetime. + +It's important to note that there may still be attempts to fetch an effective service level for a role +or indirectly access `auth::service` in some other way when `auth_integration` is absent. One important +situation to have in mind is when the user connects to Scylla via the maintenance socket. It's possible +early on, way before Scylla is fully initialized. Since we don't have access to `auth` yet, we need to +ensure that the semantics of the operations performed on `service_level_controller` still make sense +in that context. diff --git a/service/qos/service_level_controller.cc b/service/qos/service_level_controller.cc index 0e20735307..c89157aea3 100644 --- a/service/qos/service_level_controller.cc +++ b/service/qos/service_level_controller.cc @@ -641,6 +641,17 @@ future service_level_controller::auth_integration::get_user_sc } future service_level_controller::get_user_scheduling_group(const std::optional& usr) { + // Special case: + // ------------- + // The maintenance socket can communicate with Scylla before `auth_integration` + // is registered, and we need to prepare for it. + // For the discussion, see: scylladb/scylladb#26816. + // + // TODO: Get rid of this. + if (!usr.has_value() || auth::is_anonymous(usr.value())) { + return make_ready_future(get_default_scheduling_group()); + } + SCYLLA_ASSERT(_auth_integration != nullptr); return _auth_integration->get_user_scheduling_group(usr); } @@ -1180,7 +1191,10 @@ future> service_level_controller::auth_integratio } future> service_level_controller::describe_service_levels() { - SCYLLA_ASSERT(_auth_integration != nullptr); + if (_auth_integration == nullptr) { + throw std::runtime_error("Describing service levels requires that `auth_integration` has been registered, " + "but it has not. One of the potential reasons is using the maintenance socket."); + } std::vector created_service_levels_descs = co_await describe_created_service_levels(); std::vector attached_service_levels_descs = co_await _auth_integration->describe_attached_service_levels(); diff --git a/service/qos/service_level_controller.hh b/service/qos/service_level_controller.hh index 05b8352c30..3a10335b01 100644 --- a/service/qos/service_level_controller.hh +++ b/service/qos/service_level_controller.hh @@ -17,6 +17,7 @@ #include #include +#include "auth/authenticated_user.hh" #include "seastarx.hh" #include "auth/service.hh" #include "cql3/description.hh" @@ -288,6 +289,17 @@ public: template > requires std::invocable futurize_t with_user_service_level(const std::optional& usr, Func&& func) { + // Special case: + // ------------- + // The maintenance socket can communicate with Scylla before `auth_integration` + // is registered, and we need to prepare for it. + // For the discussion, see: scylladb/scylladb#26816. + // + // TODO: Get rid of this. + if (!usr.has_value() || auth::is_anonymous(usr.value())) { + return with_scheduling_group(get_default_scheduling_group(), std::forward(func)); + } + SCYLLA_ASSERT(_auth_integration != nullptr); return _auth_integration->with_user_service_level(usr, std::forward(func)); }