diff --git a/service/qos/service_level_controller.cc b/service/qos/service_level_controller.cc index a78e0cae8d..10150a1c43 100644 --- a/service/qos/service_level_controller.cc +++ b/service/qos/service_level_controller.cc @@ -29,7 +29,6 @@ #include "cql3/query_processor.hh" #include "service/storage_service.hh" #include "service/topology_state_machine.hh" -#include "utils/sorting.hh" #include #include "utils/managed_string.hh" @@ -329,15 +328,40 @@ future<> service_level_controller::auth_integration::reload_cache(qos::query_con // includes only roles with attached service level const auto attributes = co_await role_manager.query_attribute_for_all("service_level", qs); - std::map effective_sl_map; + struct effective_sl_map_entry { + std::optional effective_sl; + bool fully_computed = false; + }; + std::map effective_sl_map; - auto sorted = co_await utils::topological_sort(all_roles, hierarchy); - // Roles are sorted from the top of the hierarchy to the bottom. - /// `GRANT role1 TO role2` means role2 is higher in the hierarchy than role1, so role2 will be before - // role1 in `sorted` vector. - // That's why if we iterate over the vector in reversed order, we will visit the roles from the bottom - // and we can use already calculated effective service levels for all of the subroles. - for (auto& role: sorted | std::views::reverse) { + // The effective service level of a role depends on the service level directly attached to it, + // as well as the effective service levels of the roles granted to it. Go over all roles + // and compute an effective service level for each one of them, recursively computing + // effective service levels of the granted roles, if needed. The role grants graph should not + // have cycles, but if we detect one - print it and gracefully handle it. + std::vector roles_stack; + noncopyable_function(sstring)> compute_effective_sl; + compute_effective_sl = [&] (sstring role) -> std::optional { + auto [entry_it, inserted] = effective_sl_map.try_emplace(role); + auto& entry = entry_it->second; + if (!inserted) { + if (entry.fully_computed) { + return entry.effective_sl; + } else { + auto cycle_printer = seastar::value_of([&] { + auto cycle_begin = std::ranges::find(roles_stack, role); + return fmt::join(std::ranges::subrange(cycle_begin, roles_stack.end()), " -> "); + }); + static logger::rate_limit cycle_detected_rate_limit{std::chrono::minutes(5)}; + sl_logger.log(log_level::warn, cycle_detected_rate_limit, + "Cycle detected in the system.role_members table: {} -> {}. " + "Some service level parameters might be computed incorrectly.", + cycle_printer, role); + return std::nullopt; + } + } + + roles_stack.push_back(role); std::optional sl_options; if (auto sl_name_it = attributes.find(role); sl_name_it != attributes.end()) { @@ -356,28 +380,40 @@ future<> service_level_controller::auth_integration::reload_cache(qos::query_con auto [it, it_end] = hierarchy.equal_range(role); while (it != it_end) { auto& subrole = it->second; - if (auto sub_sl_it = effective_sl_map.find(subrole); sub_sl_it != effective_sl_map.end()) { + std::optional sub_sl_options = compute_effective_sl(subrole); + if (sub_sl_options) { if (sl_options) { - sl_options = sl_options->merge_with(sub_sl_it->second); + sl_options = sl_options->merge_with(*sub_sl_options); } else { - sl_options = sub_sl_it->second; + sl_options = sub_sl_options; } } ++it; } - if (sl_options) { - effective_sl_map.insert({role, *sl_options}); - } + entry.effective_sl = sl_options; + entry.fully_computed = true; + + roles_stack.pop_back(); + return sl_options; + }; + + for (const auto& role: all_roles) { + compute_effective_sl(role); co_await coroutine::maybe_yield(); } - co_await _sl_controller.container().invoke_on_all([effective_sl_map] (service_level_controller& sl_controller) -> future<> { + const auto new_effective_sl_cache = effective_sl_map + | std::views::filter([] (const auto& p) { return p.second.effective_sl.has_value(); }) + | std::views::transform([] (const auto& p) { return std::pair{p.first, *p.second.effective_sl}; }) + | std::ranges::to>(); + + co_await _sl_controller.container().invoke_on_all([&new_effective_sl_cache] (service_level_controller& sl_controller) -> future<> { // We probably cannot predict if `auth_integration` is still in place on another shard, // so let's play it safe here. if (sl_controller._auth_integration) { - sl_controller._auth_integration->_cache = std::move(effective_sl_map); + sl_controller._auth_integration->_cache = new_effective_sl_cache; } co_await sl_controller.notify_effective_service_levels_cache_reloaded(); }); diff --git a/test/cluster/auth_cluster/test_service_levels_tolerate_invalid_role_grants_graph.py b/test/cluster/auth_cluster/test_service_levels_tolerate_invalid_role_grants_graph.py new file mode 100644 index 0000000000..343ceab644 --- /dev/null +++ b/test/cluster/auth_cluster/test_service_levels_tolerate_invalid_role_grants_graph.py @@ -0,0 +1,68 @@ +# +# Copyright (C) 2026-present ScyllaDB +# +# SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.1 +# + +from test.cluster.auth_cluster import extra_scylla_config_options as auth_config +from test.pylib.manager_client import ManagerClient + + +async def test_tolerating_cycles_in_auth(manager: ManagerClient): + """ + Verify that the service levels logic in group0 gracefully handles cycles + in the roles graph (i.e. an edge is created by GRANT role1 TO role2). + Although auth has an explicit check which should prevent creation of such + cycles, it might (theoretically, not observed yet in practice) happen that + some inconsistent data is migrated from the system_auth distributed keyspace + to group0 tables. For each role, the final service level parameters depend + on the service levels attached to all roles reachable in the role grants graph. + This logic needs to be resilient to cycles. + """ + server = await manager.server_add(config=auth_config) + cql = manager.get_cql() + + await cql.run_async("CREATE ROLE a") + await cql.run_async("CREATE ROLE b") + + # Create an invalid state by simulating two cyclic GRANTs. + await cql.run_async("INSERT INTO system.role_members (role, member) VALUES ('a', 'b')") + await cql.run_async("INSERT INTO system.role_members (role, member) VALUES ('b', 'a')") + await cql.run_async("UPDATE system.roles SET member_of = {'b'} where role = 'a'") + await cql.run_async("UPDATE system.roles SET member_of = {'a'} where role = 'b'") + + # Now that a cycle has been created in group0, the next operation that modifies + # data in group0 will cause it to be reloaded. Create some service levels + # to trigger that. + await cql.run_async("CREATE SERVICE LEVEL sl_a WITH SHARES = 100") + await cql.run_async("ATTACH SERVICE LEVEL sl_a TO a") + await cql.run_async("CREATE SERVICE LEVEL sl_b WITH SHARES = 100") + await cql.run_async("ATTACH SERVICE LEVEL sl_b TO b") + + # There should be a warning about the cycle + log = await manager.server_open_log(server.server_id) + await log.wait_for("Cycle detected in the system.role_members table: (a -> b -> a|b -> a -> b)") + + +async def test_invalid_graph_with_edges_to_non_existing_members(manager: ManagerClient): + """ + Verify that the service levels logic gracefully handles invalid graphs of roles + with roles being assigned to roles that no longer exist (e.g. GRANT role1 + TO role2, and role2 no longer exists). Such graphs were observed in the wild, + and can occur if inconsistent data is migrated from the auth tables + in system_auth distributed keyspace to group0 tables. + """ + await manager.server_add(config=auth_config) + cql = manager.get_cql() + + await cql.run_async("CREATE ROLE a") + + # Create a connection to a role that does not exist. + await cql.run_async("INSERT INTO system.role_members (role, member) VALUES ('a', 'b')") + await cql.run_async("UPDATE system.roles SET member_of = {'b'} where role = 'a'") + + # Now that group0 has invalid roles graph, the next operation that modifies + # data in group0 will cause it to be reloaded. Create some service levels + # to trigger that. + await cql.run_async("CREATE SERVICE LEVEL sl_a WITH SHARES = 100") + await cql.run_async("ATTACH SERVICE LEVEL sl_a TO a")