From f6bb5cb7a02e4a51dc2b1df97223ed3da6bf8992 Mon Sep 17 00:00:00 2001 From: Emil Maskovsky Date: Wed, 25 Jun 2025 11:53:46 +0200 Subject: [PATCH] raft: fix voter assignment of transitioning nodes Previously, nodes would become voters immediately after joining, ensuring voter status was established before bootstrap completion. With the limited voters feature, voter assignment became deferred, creating a timing gap where nodes could finish bootstrapping without becoming voters. This timing issue could lead to quorum loss scenarios, particularly observed in tests but theoretically possible in production environments. This commit reorders voter assignment to occur before the `update_topology_state()` call, ensuring nodes achieve voter status before bootstrap operations are marked complete. This prevents the problematic timing gap while maintaining compatibility with limited voters functionality. If voter assignment succeeds but topology state update fails, the operation will raise an exception and be retried by the topology coordinator, maintaining system consistency. This commit also fixes issue where the `update_nodes` ignored leaving voters potentially exceeding the voter limit and having voters unaccounted for. Fixes: scylladb/scylladb#24420 --- service/raft/group0_voter_handler.cc | 35 ++++++++++++++++++++-------- service/topology_coordinator.cc | 15 ++++++------ 2 files changed, 32 insertions(+), 18 deletions(-) diff --git a/service/raft/group0_voter_handler.cc b/service/raft/group0_voter_handler.cc index 611ad830d9..73ed792690 100644 --- a/service/raft/group0_voter_handler.cc +++ b/service/raft/group0_voter_handler.cc @@ -496,6 +496,20 @@ future<> group0_voter_handler::update_nodes( group0_voter_calculator::nodes_list_t nodes; + auto get_replica_state = [this](const raft::server_id& id, bool allow_transitioning) -> const replica_state* { + if (const auto it = _topology.normal_nodes.find(id); it != _topology.normal_nodes.end()) { + return &it->second; + } + rvlogger.debug("Node {} is not present in the normal nodes", id); + if (allow_transitioning) { + if (const auto it = _topology.transition_nodes.find(id); it != _topology.transition_nodes.end()) { + return &it->second; + } + rvlogger.debug("Node {} is not present in the transitioning nodes", id); + } + return nullptr; + }; + // Helper for adding a single node to the nodes list auto add_node = [&nodes, &group0_config, &leader_id](const raft::server_id& id, const replica_state& rs, bool is_alive) { const auto is_voter = group0_config.can_vote(id); @@ -510,17 +524,16 @@ future<> group0_voter_handler::update_nodes( }; // Helper function to add nodes from a gossiper node set - auto add_nodes_from_list = [this, &add_node](const std::set& nodes_set, bool is_alive) { + auto add_nodes_from_list = [&add_node, &get_replica_state, &group0_config](const std::set& nodes_set, bool is_alive) { for (const auto& host_id : nodes_set) { const raft::server_id id{host_id.uuid()}; - const auto it = _topology.normal_nodes.find(id); - if (it == _topology.normal_nodes.end()) { - // This is expected, as nodes present in the gossiper may not be present - // in the topology yet or anymore. - rvlogger.debug("Node {} not found in the topology", id); + // We only allow transitioning nodes to be added if they are already voters. + const bool allow_transitioning = group0_config.can_vote(id); + const auto* node = get_replica_state(id, allow_transitioning); + if (!node) { continue; } - add_node(id, it->second, is_alive); + add_node(id, *node, is_alive); } }; @@ -535,12 +548,14 @@ future<> group0_voter_handler::update_nodes( rvlogger.debug("Node {} to be added is already a member", id); continue; } - const auto it = _topology.normal_nodes.find(id); - if (it == _topology.normal_nodes.end()) { + // When adding a node, it may not be present in normal nodes yet, but might be joining + // and be in the transitioning state. + const auto* node = get_replica_state(id, true); + if (!node) { rvlogger.warn("Node {} was not found in the topology, can't add as a voter candidate", id); continue; } - add_node(id, it->second, _gossiper.is_alive(locator::host_id{id.uuid()})); + add_node(id, *node, _gossiper.is_alive(locator::host_id{id.uuid()})); } std::unordered_set voters_add; diff --git a/service/topology_coordinator.cc b/service/topology_coordinator.cc index 98b18aa34e..e16cf4f295 100644 --- a/service/topology_coordinator.cc +++ b/service/topology_coordinator.cc @@ -2137,8 +2137,8 @@ class topology_coordinator : public endpoint_lifecycle_subscriber { .set("node_state", node_state::normal); rtbuilder.done(); auto reason = ::format("bootstrap: joined a zero-token node {}", node.id); - co_await update_topology_state(std::move(guard), {builder.build(), rtbuilder.build()}, reason); co_await _voter_handler.on_node_added(node.id, _as); + co_await update_topology_state(std::move(guard), {builder.build(), rtbuilder.build()}, reason); break; } @@ -2191,9 +2191,9 @@ class topology_coordinator : public endpoint_lifecycle_subscriber { builder2.with_node(replaced_id) .set("node_state", node_state::left); rtbuilder.done(); + co_await _voter_handler.on_node_added(node.id, _as); co_await update_topology_state(take_guard(std::move(node)), {builder.build(), builder2.build(), rtbuilder.build()}, fmt::format("replace: replaced node {} with the new zero-token node {}", replaced_id, node.id)); - co_await _voter_handler.on_node_added(node.id, _as); break; } @@ -2455,14 +2455,13 @@ class topology_coordinator : public endpoint_lifecycle_subscriber { .set("node_state", node_state::normal); muts.emplace_back(builder.build()); muts.emplace_back(rtbuilder.build()); - co_await update_topology_state(take_guard(std::move(node)), std::move(muts), - "bootstrap: read fence completed"); + co_await _voter_handler.on_node_added(node.id, _as); + co_await update_topology_state(take_guard(std::move(node)), std::move(muts), "bootstrap: read fence completed"); // Make sure the load balancer knows the capacity for the new node immediately. (void)_tablet_load_stats_refresh.trigger().handle_exception([] (auto ep) { rtlogger.warn("Error during tablet load stats refresh: {}", ep); }); } - co_await _voter_handler.on_node_added(node.id, _as); break; case node_state::removing: { co_await utils::get_local_injector().inject("delay_node_removal", utils::wait_for_message(std::chrono::minutes(5))); @@ -2517,10 +2516,10 @@ class topology_coordinator : public endpoint_lifecycle_subscriber { muts.push_back(rtbuilder.build()); co_await db::view::view_builder::generate_mutations_on_node_left(_db, _sys_ks, node.guard.write_timestamp(), locator::host_id(replaced_node_id.uuid()), muts); + co_await _voter_handler.on_node_added(node.id, _as); co_await update_topology_state(take_guard(std::move(node)), std::move(muts), "replace: read fence completed"); } - co_await _voter_handler.on_node_added(node.id, _as); break; default: on_fatal_internal_error(rtlogger, ::format( @@ -2669,10 +2668,10 @@ class topology_coordinator : public endpoint_lifecycle_subscriber { auto str = fmt::format("complete rollback of {} to state normal after {} failure", node.id, node.rs->state); + co_await _voter_handler.on_node_added(node.id, _as); + rtlogger.info("{}", str); co_await update_topology_state(std::move(node.guard), {builder.build(), rtbuilder.build()}, str); - - co_await _voter_handler.on_node_added(node.id, _as); } break; case topology::transition_state::truncate_table: