From c2014f7e50fdcb48f06dea4bdb889ac856d30dfa Mon Sep 17 00:00:00 2001
From: Alex
Date: Tue, 5 May 2026 11:48:07 +0300
Subject: [PATCH] qos: self-heal stale service levels version on startup

Add self_heal_service_levels_version() and use it during startup when
the node is already on raft topology but service levels are still marked
as v1. In that stale state, migrate service levels to v2 through group0
instead of failing startup.

The retry loop gives up immediately when shutdown has been requested and
logs the actual failure on every retry.
---
 main.cc                                 | 57 ++++++++++++++++++++++---
 service/qos/service_level_controller.cc |  9 ++--
 2 files changed, 56 insertions(+), 10 deletions(-)

diff --git a/main.cc b/main.cc
index 7b5124a04d..4555b3db46 100644
--- a/main.cc
+++ b/main.cc
@@ -68,6 +68,7 @@
 #include "vector_search/vector_store_client.hh"
 #include
 #include
+#include
 #include
 #include
 #include
@@ -223,6 +224,42 @@ read_config(bpo::variables_map& opts, db::config& cfg) {
     }
 }
 
+// Repairs a stale service levels version marker.
+// Re-reads the marker under a group0 guard; unless it already says v2, runs
+// qos::service_level_controller::migrate_to_v2() and then sends a group0 read
+// barrier to live members. Retries up to max_attempts times, but stops
+// immediately once `as` has been aborted. Blocks on futures via .get().
+static void
+self_heal_service_levels_version(db::system_keyspace& sys_ks, cql3::query_processor& qp, service::raft_group0_client& group0_client, abort_source& as) {
+    static constexpr unsigned max_attempts = 10;
+    for (unsigned attempt = 1; attempt <= max_attempts; ++attempt) {
+        try {
+            auto guard = group0_client.start_operation(as).get();
+            auto service_levels_version = sys_ks.get_service_levels_version().get();
+            service::release_guard(std::move(guard));
+            if (service_levels_version && *service_levels_version == 2) {
+                startlog.info("Service levels version marker was already self-healed to v2.");
+                return;
+            }
+
+            auto nodes_count = qp.db().real_database().get_token_metadata().get_normal_token_owners().size();
+            qos::service_level_controller::migrate_to_v2(nodes_count, sys_ks, qp, group0_client, as).get();
+            group0_client.send_group0_read_barrier_to_live_members().get();
+            startlog.info("Self-healed service levels version marker to v2.");
+            return;
+        } catch (...) {
+            // A requested shutdown must propagate as-is instead of being retried.
+            if (as.abort_requested()) {
+                throw;
+            }
+            if (attempt == max_attempts) {
+                std::throw_with_nested(std::runtime_error(format("Failed to self-heal service levels version marker after {} attempts", max_attempts)));
+            }
+            startlog.info("Self-healing service levels version marker failed, retrying ({}/{}): {}", attempt, max_attempts, std::current_exception());
+        }
+    }
+}
+
 #ifdef SCYLLA_ENABLE_ERROR_INJECTION
 static future<>
 enable_initial_error_injections(const db::config& cfg) {
@@ -1525,6 +1562,7 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
 
             sys_ks.local().build_bootstrap_info().get();
 
+            bool should_self_heal_service_levels_version = false;
             if (sys_ks.local().bootstrap_complete()) {
                 // Check as early as possible if the cluster is fully upgraded to use Raft, since if it's not, then this node cannot be started with the current version.
                 if (sys_ks.local().load_group0_upgrade_state().get() != "use_post_raft_procedures") {
@@ -1532,7 +1570,8 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
                         " a node of a cluster that is not using Raft yet. This is no longer supported. Please first complete the upgrade of the cluster to use Raft");
                 }
 
-                if (sys_ks.local().load_topology_upgrade_state().get() != "done") {
+                const bool raft_topology_done = sys_ks.local().load_topology_upgrade_state().get() == "done";
+                if (!raft_topology_done) {
                     throw std::runtime_error(
                         "Cannot start - cluster is not yet upgraded to use raft topology and this version does not support legacy topology operations. "
                         "If you are trying to upgrade the node then first upgrade the cluster to use raft topology.");
@@ -1543,10 +1582,13 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
                         "Cannot start - cluster is not yet upgraded to use auth v2 and this version does not support legacy auth. "
                         "If you are trying to upgrade the node then first upgrade the cluster to use auth v2.");
                 }
-                if (sys_ks.local().get_service_levels_version().get() != 2) {
-                    throw std::runtime_error(
-                        "Cannot start - cluster is not yet upgraded to use service levels v2 and this version does not support legacy service levels. "
-                        "If you are trying to upgrade the node then first upgrade the cluster to use service levels v2.");
+                auto service_levels_version = sys_ks.local().get_service_levels_version().get();
+                if (raft_topology_done && (!service_levels_version || *service_levels_version != 2)) {
+                    should_self_heal_service_levels_version = true;
+                    startlog.warn(
+                        "Cluster is using raft topology but service levels are still marked as version {}. "
+                        "Startup will continue and the service levels version marker will be self-healed after group0 starts.",
+                        service_levels_version ? format("{}", *service_levels_version) : "unset");
                 }
                 if (sys_ks.local().get_view_builder_version().get() != db::system_keyspace::view_builder_version_t::v2) {
                     throw std::runtime_error(
@@ -2368,6 +2410,11 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
             }).get();
             stop_signal.ready(false);
 
+            if (should_self_heal_service_levels_version) {
+                checkpoint(stop_signal, "self-healing service levels version");
+                self_heal_service_levels_version(sys_ks.local(), qp.local(), group0_client, stop_signal.as_local_abort_source());
+            }
+
             // At this point, `locator::topology` should be stable, i.e. we should have complete information
             // about the layout of the cluster (= list of nodes along with the racks/DCs).
             startlog.info("Verifying that all of the keyspaces are RF-rack-valid");
diff --git a/service/qos/service_level_controller.cc b/service/qos/service_level_controller.cc
index c79364ced8..a78e0cae8d 100644
--- a/service/qos/service_level_controller.cc
+++ b/service/qos/service_level_controller.cc
@@ -740,12 +740,11 @@ future<> service_level_controller::do_add_service_level(sstring name, service_le
 }
 
 future<> service_level_controller::migrate_to_v2(size_t nodes_count, db::system_keyspace& sys_ks, cql3::query_processor& qp, service::raft_group0_client& group0_client, abort_source& as) {
-    //TODO:
-    //Now we trust the administrator to not make changes to service levels during the migration.
-    //Ideally, during the migration we should set migration data accessor(on all nodes, on all shards) that allows to read but forbids writes
+    static constexpr auto SERVICE_LEVELS = "service_levels";
+    static constexpr auto SYSTEM_DISTRIBUTED_KEYSPACE = "system_distributed";
     using namespace std::chrono_literals;
 
-    auto schema = qp.db().find_schema(db::system_distributed_keyspace::NAME, db::system_distributed_keyspace::SERVICE_LEVELS);
+    auto schema = qp.db().find_schema(SYSTEM_DISTRIBUTED_KEYSPACE, SERVICE_LEVELS);
 
     const auto t = 5min;
     const timeout_config tc{t, t, t, t, t, t, t};
@@ -762,7 +761,7 @@ future<> service_level_controller::migrate_to_v2(size_t nodes_count, db::system_
     }
 
     auto rows = co_await qp.execute_internal(
-        format("SELECT * FROM {}.{}", db::system_distributed_keyspace::NAME, db::system_distributed_keyspace::SERVICE_LEVELS),
+        format("SELECT * FROM {}.{}", SYSTEM_DISTRIBUTED_KEYSPACE, SERVICE_LEVELS),
         cl,
         qs,
         {},