From 24dfd2034bb5957cc2f1fbbd5bf5b73845c53634 Mon Sep 17 00:00:00 2001
From: Emil Maskovsky
Date: Tue, 22 Apr 2025 18:28:50 +0200
Subject: [PATCH] raft: ensure topology coordinator retains votership

The limited voters feature did not account for the existing topology
coordinator (the Raft leader) when selecting voters to be removed. As a
result, the limited voters calculator could inadvertently remove the
votership of the current topology coordinator, triggering an
unnecessary Raft leader re-election.

This change ensures that the existing topology coordinator's votership
is preserved unless absolutely necessary. When choosing between
otherwise equivalent voters, the node other than the topology
coordinator is prioritized for removal. This helps maintain cluster
stability by avoiding unnecessary leader re-elections.

Additionally, only an alive leader node is considered relevant for this
logic. A dead existing leader (topology coordinator) is excluded from
consideration, as it is already in the process of losing its
leadership.

Fixes: scylladb/scylladb#23588
Fixes: scylladb/scylladb#23786
---
 service/raft/group0_voter_calculator.hh    |   1 +
 service/raft/group0_voter_handler.cc       |  75 ++++++++++++---
 test/boost/group0_voter_calculator_test.cc | 103 +++++++++++++++++++++
 test/cluster/test_raft_voters.py           |  59 +++++++++++-
 4 files changed, 224 insertions(+), 14 deletions(-)

diff --git a/service/raft/group0_voter_calculator.hh b/service/raft/group0_voter_calculator.hh
index 066128838b..3f8055b5f9 100644
--- a/service/raft/group0_voter_calculator.hh
+++ b/service/raft/group0_voter_calculator.hh
@@ -41,6 +41,7 @@ public:
         sstring rack;
         bool is_voter;
         bool is_alive;
+        bool is_leader = false;
     };

     using nodes_list_t = std::unordered_map<raft::server_id, node_descriptor>;

diff --git a/service/raft/group0_voter_handler.cc b/service/raft/group0_voter_handler.cc
index aadd82e2ac..33155f2956 100644
--- a/service/raft/group0_voter_handler.cc
+++ b/service/raft/group0_voter_handler.cc
@@ -41,6 +41,11 @@ struct node_priority {
     // It should be smaller than the alive node priority (to still prefer alive nodes to dead voters).
     static constexpr value_t voter = 1;

+    // The priority of the current leader node.
+    //
+    // It should be smaller than the alive node priority (to still prefer alive nodes).
+    static constexpr value_t leader = 2;
+
     static constexpr value_t get_value(const group0_voter_calculator::node_descriptor& node) {
         value_t priority = 0;
         if (node.is_alive) {
@@ -49,6 +54,9 @@ struct node_priority {
         if (node.is_voter) {
             priority += node_priority::voter;
         }
+        if (node.is_leader) {
+            priority += node_priority::leader;
+        }
         return priority;
     }
 };
@@ -72,6 +80,8 @@ class rack_info {
     size_t _existing_alive_voters_remaining = 0;
     size_t _existing_dead_voters_remaining = 0;

+    bool _owns_alive_leader = false;
+
     using node_info_t = std::tuple<...>;

[...]

             SCYLLA_ASSERT(_alive_nodes_remaining > 0);
             --_alive_nodes_remaining;
+            if (node.get().is_leader) {
+                SCYLLA_ASSERT(_owns_alive_leader);
+                _owns_alive_leader = false;
+            }
         }
         if (node.get().is_voter) {
             if (node.get().is_alive) {
@@ -174,18 +192,25 @@ public:
             return rack1._assigned_voters_count > rack2._assigned_voters_count;
         }

-        // Second criteria: The number of existing alive voters remaining (higher has more priority)
+        // Second criteria: The ownership of the existing alive leader node
+        // - note that we don't take an existing dead leader into account here, as it is about to lose its
+        //   leadership and is about to be replaced by a new leader
+        if (rack1._owns_alive_leader != rack2._owns_alive_leader) {
+            return rack2._owns_alive_leader;
+        }
+
+        // Third criteria: The number of existing alive voters remaining (higher has more priority)
         if (rack1._existing_alive_voters_remaining != rack2._existing_alive_voters_remaining) {
             return rack1._existing_alive_voters_remaining < rack2._existing_alive_voters_remaining;
         }

-        // Third criteria: The number of alive nodes (voters and non-voters) remaining (higher has more priority)
+        // Fourth criteria: The number of alive nodes (voters and non-voters) remaining (higher has more priority)
         if (rack1._alive_nodes_remaining != rack2._alive_nodes_remaining) {
             return rack1._alive_nodes_remaining < rack2._alive_nodes_remaining;
         }

-        // Fourth criteria: The number of existing dead voters remaining (higher has more priority)
-        // We want to keep the existing dead voters in case we can't find enough alive voters.
+        // Fifth criteria: The number of existing dead voters remaining (higher has more priority)
+        // We want to keep the dead voters in case we can't find enough alive voters.
         return rack1._existing_dead_voters_remaining < rack2._existing_dead_voters_remaining;
     }
 };
@@ -207,11 +232,14 @@ class datacenter_info {

     size_t _existing_alive_voters_remaining = 0;

+    bool _owns_alive_leader = false;
+
     using racks_store_t = std::priority_queue<...>;
     racks_store_t _racks;

-    static racks_store_t create_racks_list(std::ranges::input_range auto&& nodes, size_t& existing_alive_voters_remaining) {
+    static racks_store_t create_racks_list(std::ranges::input_range auto&& nodes, size_t& existing_alive_voters_remaining, bool& owns_alive_leader) {
         existing_alive_voters_remaining = 0;
+        owns_alive_leader = false;

         std::unordered_map<...> nodes_by_rack;
         for (const auto& [id, node] : nodes) {
@@ -220,6 +248,9 @@ class datacenter_info {
                 if (node.get().is_voter) {
                     ++existing_alive_voters_remaining;
                 }
+                if (node.get().is_leader) {
+                    owns_alive_leader = true;
+                }
             }
         }

@@ -229,7 +260,7 @@ public:
     explicit datacenter_info(std::ranges::sized_range auto&& nodes)
         : _nodes_remaining(nodes.size())
-        , _racks(create_racks_list(nodes, _existing_alive_voters_remaining)) {
+        , _racks(create_racks_list(nodes, _existing_alive_voters_remaining, _owns_alive_leader)) {
     }

     // Select the "best" next voter from the datacenter
@@ -262,6 +293,10 @@ public:
             SCYLLA_ASSERT(_existing_alive_voters_remaining > 0);
             --_existing_alive_voters_remaining;
         }
+        if (node.is_leader) {
+            SCYLLA_ASSERT(_owns_alive_leader);
+            _owns_alive_leader = false;
+        }
     }

     SCYLLA_ASSERT(_nodes_remaining > 0);
@@ -292,12 +327,19 @@ public:
             return dc1._assigned_voters_count > dc2._assigned_voters_count;
         }

-        // Second criteria: The number of existing alive voters remaining (higher has more priority)
+        // Second criteria: The ownership of the existing alive leader node
+        // - note that we don't take an existing dead leader into account here, as it is about to lose its
+        //   leadership and is about to be replaced by a new leader
+        if (dc1._owns_alive_leader != dc2._owns_alive_leader) {
+            return dc2._owns_alive_leader;
+        }
+
+        // Third criteria: The number of existing alive voters remaining (higher has more priority)
         if (dc1._existing_alive_voters_remaining != dc2._existing_alive_voters_remaining) {
             return dc1._existing_alive_voters_remaining < dc2._existing_alive_voters_remaining;
         }

-        // Third criteria: The number of racks (higher has more priority)
+        // Fourth criteria: The number of racks (higher has more priority)
         return dc1._racks.size() < dc2._racks.size();
     }
 };
@@ -317,6 +359,7 @@ class calculator_impl {
             uint64_t voters_max, const group0_voter_calculator::nodes_list_t& nodes, const datacenters_store_t& datacenters, size_t dc_largest_size) {
         auto num_voters = std::min(voters_max, nodes.size());
         // Any number of voters under 3 is allowed
+        // TODO (scylladb/scylladb#23266): Enforce an odd number of voters in this case (remove this condition)
         if (num_voters <= 3) {
             return num_voters;
         }
@@ -454,8 +497,12 @@ future<> group0_voter_handler::update_nodes(
         co_return co_await _group0.modify_voters({}, nodes_removed, as);
     }

+    auto& raft_server = _group0.group0_server();
+
+    const auto& leader_id = raft_server.current_leader();
+
     // Load the current cluster members
-    const auto& group0_config = _group0.group0_server().get_configuration();
+    const auto& group0_config = raft_server.get_configuration();

     const auto& nodes_alive = _gossiper.get_live_members();
     const auto& nodes_dead = _gossiper.get_unreachable_members();
@@ -463,13 +510,15 @@ future<> group0_voter_handler::update_nodes(
     group0_voter_calculator::nodes_list_t nodes;

     // Helper for adding a single node
-    auto add_node = [&nodes, &group0_config](const raft::server_id& id, const replica_state& rs, bool is_alive) {
+    auto add_node = [&nodes, &group0_config, &leader_id](const raft::server_id& id, const replica_state& rs, bool is_alive) {
         const auto is_voter = group0_config.can_vote(id);
+        const auto is_leader = (id == leader_id);
         nodes.emplace(id, group0_voter_calculator::node_descriptor{
                 .datacenter = rs.datacenter,
                 .rack = rs.rack,
                 .is_voter = is_voter,
                 .is_alive = is_alive,
+                .is_leader = is_leader,
         });
     };

diff --git a/test/boost/group0_voter_calculator_test.cc b/test/boost/group0_voter_calculator_test.cc
index 09eebf4cdd..4db13d6420 100644
--- a/test/boost/group0_voter_calculator_test.cc
+++ b/test/boost/group0_voter_calculator_test.cc
@@ -6,6 +6,7 @@
  * SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
  */

+#include <boost/test/data/test_case.hpp>
 #include <boost/test/unit_test.hpp>

 #include "service/raft/group0_voter_calculator.hh"
@@ -939,4 +940,106 @@ BOOST_AUTO_TEST_CASE(existing_voters_are_kept_across_racks) {
 }

+BOOST_DATA_TEST_CASE(leader_is_retained_as_voter, boost::unit_test::data::make({0, 1, 2}), leader_node_idx) {
+
+    // Arrange: Set the voters limit and create the voter calculator.
+
+    constexpr size_t max_voters = 3;
+
+    const service::group0_voter_calculator voter_calc{max_voters};
+
+    // Act: Add a third DC to a 2 DC cluster.
+
+    const std::array ids = {raft::server_id::create_random_id(), raft::server_id::create_random_id(), raft::server_id::create_random_id(),
+            raft::server_id::create_random_id(), raft::server_id::create_random_id(), raft::server_id::create_random_id()};
+
+    const service::group0_voter_calculator::nodes_list_t nodes = {
+            // The initial nodes (just 2 DCs)
+            {ids[0], {.datacenter = "dc-1", .rack = "rack", .is_voter = true, .is_alive = true, .is_leader = leader_node_idx == 0}},
+            {ids[1], {.datacenter = "dc-1", .rack = "rack", .is_voter = true, .is_alive = true, .is_leader = leader_node_idx == 1}},
+            {ids[2], {.datacenter = "dc-2", .rack = "rack", .is_voter = true, .is_alive = true, .is_leader = leader_node_idx == 2}},
+            {ids[3], {.datacenter = "dc-2", .rack = "rack", .is_voter = false, .is_alive = true}},
+            // The new nodes (3rd DC)
+            // - this will lead to 1 voter being removed from DC-1
+            {ids[4], {.datacenter = "dc-3", .rack = "rack", .is_voter = false, .is_alive = true}},
+            {ids[5], {.datacenter = "dc-3", .rack = "rack", .is_voter = false, .is_alive = true}},
+    };
+
+    const auto& voters = voter_calc.distribute_voters(nodes);
+
+    // Assert: The current leader is retained as a voter, even after adding
+    // the third datacenter and redistributing voters.
+
+    BOOST_CHECK_EQUAL(voters.size(), max_voters);
+    BOOST_CHECK(voters.contains(ids[leader_node_idx]));
+}
+
+
+BOOST_DATA_TEST_CASE(leader_is_retained_as_voter_in_racks, boost::unit_test::data::make({0, 1, 3}), leader_node_idx) {
+
+    // Arrange: Set the voters limit and create the voter calculator.
+
+    constexpr size_t max_voters = 3;
+
+    const service::group0_voter_calculator voter_calc{max_voters};
+
+    // Act: Add a third DC to a 2 DC cluster.
+
+    const std::array ids = {raft::server_id::create_random_id(), raft::server_id::create_random_id(), raft::server_id::create_random_id(),
+            raft::server_id::create_random_id(), raft::server_id::create_random_id(), raft::server_id::create_random_id()};
+
+    const service::group0_voter_calculator::nodes_list_t nodes = {
+            // The initial nodes (just 2 DCs)
+            {ids[0], {.datacenter = "dc-1", .rack = "rack-1", .is_voter = true, .is_alive = true, .is_leader = leader_node_idx == 0}},
+            {ids[1], {.datacenter = "dc-1", .rack = "rack-2", .is_voter = true, .is_alive = true, .is_leader = leader_node_idx == 1}},
+            {ids[2], {.datacenter = "dc-2", .rack = "rack-3", .is_voter = false, .is_alive = true}},
+            {ids[3], {.datacenter = "dc-2", .rack = "rack-4", .is_voter = true, .is_alive = true, .is_leader = leader_node_idx == 3}},
+            // The new nodes (3rd DC)
+            // - this will lead to 1 voter being removed from DC-1
+            {ids[4], {.datacenter = "dc-3", .rack = "rack-5", .is_voter = false, .is_alive = true}},
+            {ids[5], {.datacenter = "dc-3", .rack = "rack-6", .is_voter = false, .is_alive = true}},
+    };
+
+    const auto& voters = voter_calc.distribute_voters(nodes);
+
+    // Assert: The current leader is retained as a voter, even after adding
+    // the third datacenter and redistributing voters.
+
+    BOOST_CHECK_EQUAL(voters.size(), max_voters);
+    BOOST_CHECK(voters.contains(ids[leader_node_idx]));
+}
+
+
+BOOST_DATA_TEST_CASE(leader_is_retained_as_voter_in_two_dc_asymmetric_setup, boost::unit_test::data::make({0, 1}), leader_node_idx) {
+
+    // Arrange: Set the voters limit and create the voter calculator.
+
+    constexpr size_t max_voters = 3;
+
+    const service::group0_voter_calculator voter_calc{max_voters};
+
+    // Act: Remove nodes from a 2 DC cluster so that each DC has 1 node left.
+    // - this will lead to 1 voter being removed from one DC
+    // - the leader should be retained as a voter
+
+    const std::array ids = {raft::server_id::create_random_id(), raft::server_id::create_random_id()};
+
+    const service::group0_voter_calculator::nodes_list_t nodes = {
+            // The resulting nodes (just one per DC)
+            {ids[0], {.datacenter = "dc-1", .rack = "rack", .is_voter = true, .is_alive = true, .is_leader = leader_node_idx == 0}},
+            {ids[1], {.datacenter = "dc-2", .rack = "rack", .is_voter = true, .is_alive = true, .is_leader = leader_node_idx == 1}},
+            // no other nodes - all removed
+    };
+
+    const auto& voters = voter_calc.distribute_voters(nodes);
+
+    // Assert: The current leader is retained as a voter.
+
+    // TODO (scylladb/scylladb#23266): This should be equal to 1 once we enforce an odd number of voters
+    BOOST_CHECK_GE(voters.size(), 1);
+    BOOST_CHECK(voters.contains(ids[leader_node_idx]));
+}
+
+

 BOOST_AUTO_TEST_SUITE_END()

diff --git a/test/cluster/test_raft_voters.py b/test/cluster/test_raft_voters.py
index ef4d192e5d..939a21cd00 100644
--- a/test/cluster/test_raft_voters.py
+++ b/test/cluster/test_raft_voters.py
@@ -15,7 +15,7 @@ from test.pylib.internal_types import ServerInfo
 from test.pylib.manager_client import ManagerClient
 from test.pylib.rest_client import read_barrier
 from test.cluster.conftest import cluster_con, skip_mode
-from test.cluster.util import get_current_group0_config
+from test.cluster.util import get_coordinator_host_ids, get_current_group0_config

 GROUP0_VOTERS_LIMIT = 5

@@ -176,3 +176,60 @@ async def test_raft_limited_voters_upgrade(manager: ManagerClient):

     num_voters = await get_number_of_voters(manager, servers[0])
     assert num_voters == GROUP0_VOTERS_LIMIT, f"The number of voters should be limited to {GROUP0_VOTERS_LIMIT} (but is {num_voters})"
+
+
+@pytest.mark.asyncio
+async def test_raft_limited_voters_retain_coordinator(manager: ManagerClient):
+    """
+    Test that the topology coordinator is retained as a voter when possible.
+
+    This test ensures that if a voter needs to be removed and the topology coordinator
+    is among the candidates for removal, the coordinator is prioritized to remain a voter.
+
+    Arrange:
+        - Create 2 DCs with 3 nodes each.
+        - Retrieve the current topology coordinator.
+    Act:
+        - Add a third DC with 1 node.
+        - This will require removing one voter from either DC1 or DC2.
+    Assert:
+        - Verify that the topology coordinator is retained as a voter.
+    """
+
+    # Arrange: Create 2 DCs with 3 nodes each and find the topology coordinator
+
+    dc_setup = [
+        {
+            'property_file': {'dc': 'dc1', 'rack': 'rack1'},
+            'num_nodes': 3,
+        },
+        {
+            'property_file': {'dc': 'dc2', 'rack': 'rack2'},
+            'num_nodes': 3,
+        },
+    ]
+
+    dc_servers = []
+    for dc in dc_setup:
+        logging.info(
+            f"Creating {dc['property_file']['dc']} with {dc['num_nodes']} nodes")
+        dc_servers.append(await manager.servers_add(dc['num_nodes'],
+                                                    property_file=dc['property_file']))
+
+    assert len(dc_servers) == len(dc_setup)
+
+    coordinator_ids = await get_coordinator_host_ids(manager)
+    assert coordinator_ids, "At least 1 coordinator id should be found"
+
+    coordinator_id = coordinator_ids[0]
+
+    # Act: Add a new DC with one node
+
+    logging.info('Adding a new DC with one node')
+    await manager.servers_add(1, property_file={'dc': 'dc3', 'rack': 'rack3'})
+
+    # Assert: Verify that the topology coordinator is still a voter
+
+    group0_members = await get_current_group0_config(manager, dc_servers[0][0])
+    assert any(m[0] == coordinator_id and m[1] for m in group0_members), \
+        f"The coordinator {coordinator_id} should be a voter (but is not)"
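---

A quick way to sanity-check the priority scheme this patch extends is the
standalone sketch below (plain C++20; a minimal sketch, not ScyllaDB code).
The `node` struct stands in for `group0_voter_calculator::node_descriptor`,
and the alive priority value of 4 is an assumption; its real definition is
outside the visible hunks, and per the comments in the patch it only has to
exceed `voter + leader` (1 + 2) so that alive nodes always outrank dead ones.

// Standalone sketch (not ScyllaDB code): mirrors the additive scheme of
// node_priority::get_value from this patch, under the stated assumptions.
#include <cstdint>

struct node {
    bool is_alive = false;
    bool is_voter = false;
    bool is_leader = false;
};

using value_t = uint8_t;

constexpr value_t alive_priority = 4;  // assumed value; must exceed voter + leader
constexpr value_t voter_priority = 1;
constexpr value_t leader_priority = 2;

constexpr value_t get_value(const node& n) {
    value_t priority = 0;
    if (n.is_alive) {
        priority += alive_priority;
    }
    if (n.is_voter) {
        priority += voter_priority;
    }
    if (n.is_leader) {
        priority += leader_priority;
    }
    return priority;
}

int main() {
    // An alive leader-voter outranks an otherwise identical non-leader voter,
    // so when one of two equivalent voters must be demoted, the non-leader
    // loses its votership and no Raft leader re-election is triggered.
    constexpr node leader_voter{.is_alive = true, .is_voter = true, .is_leader = true};
    constexpr node plain_voter{.is_alive = true, .is_voter = true};
    static_assert(get_value(leader_voter) > get_value(plain_voter));

    // Liveness still dominates: even a dead leader-voter ranks below an alive
    // non-voter, matching the commit's rule that only an alive leader matters.
    constexpr node dead_leader_voter{.is_alive = false, .is_voter = true, .is_leader = true};
    constexpr node alive_non_voter{.is_alive = true};
    static_assert(get_value(dead_leader_voter) < get_value(alive_non_voter));
    return 0;
}

The two static_asserts encode exactly the properties the commit message
claims: the leader bonus breaks ties between otherwise equivalent voters,
while the alive bonus keeps liveness as the stronger criterion.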