From cb4a4f2caffa6b82cd2e4cafd26bdde4c4a699a0 Mon Sep 17 00:00:00 2001 From: Piotr Dulikowski Date: Tue, 16 Apr 2024 16:18:41 +0200 Subject: [PATCH] topology_coordinator: compute cluster size correctly during upgrade During upgrade to raft topology, information about service levels is copied from the legacy tables in system_distributed to the raft-managed tables of group 0. system_distributed has RF=3, so if the cluster has only one or two nodes we should use lower consistency level than ALL - and the current procedure does exactly that, it selects QUORUM in case of two nodes and ONE in case of only one node. The cluster size is determined based on the call to _gossiper.num_endpoints(). Despite its name, gossiper::num_endpoints() does not necessarily return the number of nodes in the cluster but rather the number of endpoint states in gossiper (this behavior is documented in a comment near the declaration of this function). In some cases, e.g. after gossiper-based nodetool remove, the state might be kept for some time after removal (3 days in this case). The consequence of this is that gossiper::num_endpoints() might return more than the current number of nodes during upgrade, and that in turn might cause migration of data from one table to another to fail - causing the upgrade procedure to get stuck if there is only 1 or two nodes in the cluster. In order to fix this, use token_metadata::get_all_endpoints() as a measure of the cluster size. Fixes: scylladb/scylladb#18198 --- service/topology_coordinator.cc | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/service/topology_coordinator.cc b/service/topology_coordinator.cc index 3b360c0fba..7ad31ed768 100644 --- a/service/topology_coordinator.cc +++ b/service/topology_coordinator.cc @@ -2446,10 +2446,12 @@ future<> topology_coordinator::build_coordinator_state(group0_guard guard) { [this] (abort_source*) { return start_operation();}, _as); } + auto tmptr = get_token_metadata_ptr(); + auto sl_version = co_await _sys_ks.get_service_levels_version(); if (!sl_version || *sl_version < 2) { rtlogger.info("migrating service levels data"); - co_await qos::service_level_controller::migrate_to_v2(_gossiper.num_endpoints(), _sys_ks, _sys_ks.query_processor(), _group0.client(), _as); + co_await qos::service_level_controller::migrate_to_v2(tmptr->get_all_endpoints().size(), _sys_ks, _sys_ks.query_processor(), _group0.client(), _as); } rtlogger.info("building initial raft topology state and CDC generation"); @@ -2466,7 +2468,6 @@ future<> topology_coordinator::build_coordinator_state(group0_guard guard) { }; // Create a new CDC generation - auto tmptr = get_token_metadata_ptr(); auto get_sharding_info_for_host_id = [&] (locator::host_id host_id) -> std::pair { const auto ep = tmptr->get_endpoint_for_host_id_if_known(host_id); if (!ep) {