From f6bb5cb7a02e4a51dc2b1df97223ed3da6bf8992 Mon Sep 17 00:00:00 2001
From: Emil Maskovsky <emil.maskovsky@scylladb.com>
Date: Wed, 25 Jun 2025 11:53:46 +0200
Subject: [PATCH] raft: fix voter assignment of transitioning nodes

Previously, nodes would become voters immediately after joining, ensuring
voter status was established before bootstrap completion. With the limited
voters feature, voter assignment became deferred, creating a timing gap
where nodes could finish bootstrapping without becoming voters.

This timing issue could lead to quorum loss scenarios, particularly
observed in tests but theoretically possible in production environments.

This commit reorders voter assignment to occur before the
`update_topology_state()` call, ensuring nodes achieve voter status
before bootstrap operations are marked complete. This prevents the
problematic timing gap while maintaining compatibility with limited
voters functionality.

If voter assignment succeeds but topology state update fails, the
operation will raise an exception and be retried by the topology
coordinator, maintaining system consistency.

This commit also fixes issue where the `update_nodes` ignored leaving
voters potentially exceeding the voter limit and having voters
unaccounted for.

Fixes: scylladb/scylladb#24420
---
 service/raft/group0_voter_handler.cc | 35 ++++++++++++++++++++--------
 service/topology_coordinator.cc      | 15 ++++++------
 2 files changed, 32 insertions(+), 18 deletions(-)

diff --git a/service/raft/group0_voter_handler.cc b/service/raft/group0_voter_handler.cc
index 611ad830d9..73ed792690 100644
--- a/service/raft/group0_voter_handler.cc
+++ b/service/raft/group0_voter_handler.cc
@@ -496,6 +496,20 @@ future<> group0_voter_handler::update_nodes(
 
     group0_voter_calculator::nodes_list_t nodes;
 
+    auto get_replica_state = [this](const raft::server_id& id, bool allow_transitioning) -> const replica_state* {
+        if (const auto it = _topology.normal_nodes.find(id); it != _topology.normal_nodes.end()) {
+            return &it->second;
+        }
+        rvlogger.debug("Node {} is not present in the normal nodes", id);
+        if (allow_transitioning) {
+            if (const auto it = _topology.transition_nodes.find(id); it != _topology.transition_nodes.end()) {
+                return &it->second;
+            }
+            rvlogger.debug("Node {} is not present in the transitioning nodes", id);
+        }
+        return nullptr;
+    };
+
     // Helper for adding a single node to the nodes list
     auto add_node = [&nodes, &group0_config, &leader_id](const raft::server_id& id, const replica_state& rs, bool is_alive) {
         const auto is_voter = group0_config.can_vote(id);
@@ -510,17 +524,16 @@ future<> group0_voter_handler::update_nodes(
     };
 
     // Helper function to add nodes from a gossiper node set
-    auto add_nodes_from_list = [this, &add_node](const std::set<locator::host_id>& nodes_set, bool is_alive) {
+    auto add_nodes_from_list = [&add_node, &get_replica_state, &group0_config](const std::set<locator::host_id>& nodes_set, bool is_alive) {
         for (const auto& host_id : nodes_set) {
             const raft::server_id id{host_id.uuid()};
-            const auto it = _topology.normal_nodes.find(id);
-            if (it == _topology.normal_nodes.end()) {
-                // This is expected, as nodes present in the gossiper may not be present
-                // in the topology yet or anymore.
-                rvlogger.debug("Node {} not found in the topology", id);
+            // We only allow transitioning nodes to be added if they are already voters.
+            const bool allow_transitioning = group0_config.can_vote(id);
+            const auto* node = get_replica_state(id, allow_transitioning);
+            if (!node) {
                 continue;
             }
-            add_node(id, it->second, is_alive);
+            add_node(id, *node, is_alive);
         }
     };
 
@@ -535,12 +548,14 @@ future<> group0_voter_handler::update_nodes(
             rvlogger.debug("Node {} to be added is already a member", id);
             continue;
         }
-        const auto it = _topology.normal_nodes.find(id);
-        if (it == _topology.normal_nodes.end()) {
+        // When adding a node, it may not be present in normal nodes yet, but might be joining
+        // and be in the transitioning state.
+        const auto* node = get_replica_state(id, true);
+        if (!node) {
             rvlogger.warn("Node {} was not found in the topology, can't add as a voter candidate", id);
             continue;
         }
-        add_node(id, it->second, _gossiper.is_alive(locator::host_id{id.uuid()}));
+        add_node(id, *node, _gossiper.is_alive(locator::host_id{id.uuid()}));
     }
 
     std::unordered_set<raft::server_id> voters_add;
diff --git a/service/topology_coordinator.cc b/service/topology_coordinator.cc
index 98b18aa34e..e16cf4f295 100644
--- a/service/topology_coordinator.cc
+++ b/service/topology_coordinator.cc
@@ -2137,8 +2137,8 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
                                    .set("node_state", node_state::normal);
                             rtbuilder.done();
                             auto reason = ::format("bootstrap: joined a zero-token node {}", node.id);
-                            co_await update_topology_state(std::move(guard), {builder.build(), rtbuilder.build()}, reason);
                             co_await _voter_handler.on_node_added(node.id, _as);
+                            co_await update_topology_state(std::move(guard), {builder.build(), rtbuilder.build()}, reason);
                             break;
                         }
 
@@ -2191,9 +2191,9 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
                             builder2.with_node(replaced_id)
                                     .set("node_state", node_state::left);
                             rtbuilder.done();
+                            co_await _voter_handler.on_node_added(node.id, _as);
                             co_await update_topology_state(take_guard(std::move(node)), {builder.build(), builder2.build(), rtbuilder.build()},
                                     fmt::format("replace: replaced node {} with the new zero-token node {}", replaced_id, node.id));
-                            co_await _voter_handler.on_node_added(node.id, _as);
                             break;
                         }
 
@@ -2455,14 +2455,13 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
                            .set("node_state", node_state::normal);
                     muts.emplace_back(builder.build());
                     muts.emplace_back(rtbuilder.build());
-                    co_await update_topology_state(take_guard(std::move(node)), std::move(muts),
-                                                   "bootstrap: read fence completed");
+                    co_await _voter_handler.on_node_added(node.id, _as);
+                    co_await update_topology_state(take_guard(std::move(node)), std::move(muts), "bootstrap: read fence completed");
                     // Make sure the load balancer knows the capacity for the new node immediately.
                     (void)_tablet_load_stats_refresh.trigger().handle_exception([] (auto ep) {
                         rtlogger.warn("Error during tablet load stats refresh: {}", ep);
                     });
                     }
-                    co_await _voter_handler.on_node_added(node.id, _as);
                     break;
                 case node_state::removing: {
                     co_await utils::get_local_injector().inject("delay_node_removal", utils::wait_for_message(std::chrono::minutes(5)));
@@ -2517,10 +2516,10 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
 
                     muts.push_back(rtbuilder.build());
                     co_await db::view::view_builder::generate_mutations_on_node_left(_db, _sys_ks, node.guard.write_timestamp(), locator::host_id(replaced_node_id.uuid()), muts);
+                    co_await _voter_handler.on_node_added(node.id, _as);
                     co_await update_topology_state(take_guard(std::move(node)), std::move(muts),
                                                   "replace: read fence completed");
                     }
-                    co_await _voter_handler.on_node_added(node.id, _as);
                     break;
                 default:
                     on_fatal_internal_error(rtlogger, ::format(
@@ -2669,10 +2668,10 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
 
                 auto str = fmt::format("complete rollback of {} to state normal after {} failure", node.id, node.rs->state);
 
+                co_await _voter_handler.on_node_added(node.id, _as);
+
                 rtlogger.info("{}", str);
                 co_await update_topology_state(std::move(node.guard), {builder.build(), rtbuilder.build()}, str);
-
-                co_await _voter_handler.on_node_added(node.id, _as);
             }
                 break;
             case topology::transition_state::truncate_table: