diff --git a/service/raft/group0_voter_calculator.hh b/service/raft/group0_voter_calculator.hh
index 066128838b..3f8055b5f9 100644
--- a/service/raft/group0_voter_calculator.hh
+++ b/service/raft/group0_voter_calculator.hh
@@ -41,6 +41,7 @@ public:
         sstring rack;
         bool is_voter;
         bool is_alive;
+        bool is_leader = false;
     };

     using nodes_list_t = std::unordered_map<raft::server_id, node_descriptor>;
diff --git a/service/raft/group0_voter_handler.cc b/service/raft/group0_voter_handler.cc
index aadd82e2ac..33155f2956 100644
--- a/service/raft/group0_voter_handler.cc
+++ b/service/raft/group0_voter_handler.cc
@@ -41,6 +41,11 @@ struct node_priority {
     // It should be smaller than the alive node priority (to still prefer alive nodes to dead voters).
     static constexpr value_t voter = 1;

+    // The priority of the current leader node.
+    //
+    // It should be smaller than the alive node priority (to still prefer alive nodes).
+    static constexpr value_t leader = 2;
+
     static constexpr value_t get_value(const group0_voter_calculator::node_descriptor& node) {
         value_t priority = 0;
         if (node.is_alive) {
@@ -49,6 +54,9 @@ struct node_priority {
         if (node.is_voter) {
             priority += node_priority::voter;
         }
+        if (node.is_leader) {
+            priority += node_priority::leader;
+        }
         return priority;
     }
 };
@@ -72,6 +80,8 @@ class rack_info {
     size_t _existing_alive_voters_remaining = 0;
     size_t _existing_dead_voters_remaining = 0;

+    bool _owns_alive_leader = false;
+
     using node_info_t = std::tuple<...>;
@@ ... @@
             SCYLLA_ASSERT(_alive_nodes_remaining > 0);
             --_alive_nodes_remaining;
+            if (node.get().is_leader) {
+                SCYLLA_ASSERT(_owns_alive_leader);
+                _owns_alive_leader = false;
+            }
         }
         if (node.get().is_voter) {
             if (node.get().is_alive) {
@@ -174,18 +192,25 @@ public:
             return rack1._assigned_voters_count > rack2._assigned_voters_count;
         }

-        // Second criteria: The number of existing alive voters remaining (higher has more priority)
+        // Second criteria: The ownership of the existing alive leader node
+        // - note that we don't take an existing dead leader into account here, as it is about to lose its
+        //   leadership and be replaced by a new leader
+        if (rack1._owns_alive_leader != rack2._owns_alive_leader) {
+            return rack2._owns_alive_leader;
+        }
+
+        // Third criteria: The number of existing alive voters remaining (higher has more priority)
         if (rack1._existing_alive_voters_remaining != rack2._existing_alive_voters_remaining) {
             return rack1._existing_alive_voters_remaining < rack2._existing_alive_voters_remaining;
         }

-        // Third criteria: The number of alive nodes (voters and non-voters) remaining (higher has more priority)
+        // Fourth criteria: The number of alive nodes (voters and non-voters) remaining (higher has more priority)
         if (rack1._alive_nodes_remaining != rack2._alive_nodes_remaining) {
             return rack1._alive_nodes_remaining < rack2._alive_nodes_remaining;
         }

-        // Fourth criteria: The number of existing dead voters remaining (higher has more priority)
-        // We want to keep the existing dead voters in case we can't find enough alive voters.
+        // Fifth criteria: The number of existing dead voters remaining (higher has more priority)
+        // We want to keep the dead voters in case we can't find enough alive voters.
         return rack1._existing_dead_voters_remaining < rack2._existing_dead_voters_remaining;
     }
 };
@@ -207,11 +232,14 @@ class datacenter_info {

     size_t _existing_alive_voters_remaining = 0;

+    bool _owns_alive_leader = false;
+
     using racks_store_t = std::priority_queue<...>;
     racks_store_t _racks;

-    static racks_store_t create_racks_list(std::ranges::input_range auto&& nodes, size_t& existing_alive_voters_remaining) {
+    static racks_store_t create_racks_list(std::ranges::input_range auto&& nodes, size_t& existing_alive_voters_remaining, bool& owns_alive_leader) {
         existing_alive_voters_remaining = 0;
+        owns_alive_leader = false;

         std::unordered_map<...> nodes_by_rack;
         for (const auto& [id, node] : nodes) {
@@ -220,6 +248,9 @@ class datacenter_info {
             if (node.get().is_voter) {
                 ++existing_alive_voters_remaining;
             }
+            if (node.get().is_leader) {
+                owns_alive_leader = true;
+            }
         }
     }

@@ -229,7 +260,7 @@ class datacenter_info {
 public:
     explicit datacenter_info(std::ranges::sized_range auto&& nodes)
         : _nodes_remaining(nodes.size())
-        , _racks(create_racks_list(nodes, _existing_alive_voters_remaining)) {
+        , _racks(create_racks_list(nodes, _existing_alive_voters_remaining, _owns_alive_leader)) {
     }

     // Select the "best" next voter from the datacenter
@@ -262,6 +293,10 @@ public:
             SCYLLA_ASSERT(_existing_alive_voters_remaining > 0);
             --_existing_alive_voters_remaining;
         }
+        if (node.is_leader) {
+            SCYLLA_ASSERT(_owns_alive_leader);
+            _owns_alive_leader = false;
+        }
     }

     SCYLLA_ASSERT(_nodes_remaining > 0);
@@ -292,12 +327,19 @@ public:
             return dc1._assigned_voters_count > dc2._assigned_voters_count;
         }

-        // Second criteria: The number of existing alive voters remaining (higher has more priority)
+        // Second criteria: The ownership of the existing alive leader node
+        // - note that we don't take an existing dead leader into account here, as it is about to lose its
+        //   leadership and be replaced by a new leader
+        if (dc1._owns_alive_leader != dc2._owns_alive_leader) {
+            return dc2._owns_alive_leader;
+        }
+
+        // Third criteria: The number of existing alive voters remaining (higher has more priority)
         if (dc1._existing_alive_voters_remaining != dc2._existing_alive_voters_remaining) {
             return dc1._existing_alive_voters_remaining < dc2._existing_alive_voters_remaining;
         }

-        // Third criteria: The number of racks (higher has more priority)
+        // Fourth criteria: The number of racks (higher has more priority)
         return dc1._racks.size() < dc2._racks.size();
     }
 };
@@ -317,6 +359,7 @@ class calculator_impl {
             uint64_t voters_max, const group0_voter_calculator::nodes_list_t& nodes, const datacenters_store_t& datacenters, size_t dc_largest_size) {
         auto num_voters = std::min(voters_max, nodes.size());
         // Any number of voters under 3 is allowed
+        // TODO (scylladb/scylladb#23266): Enforce an odd number of voters in this case (remove this condition)
         if (num_voters <= 3) {
             return num_voters;
         }
@@ -454,8 +497,12 @@ future<> group0_voter_handler::update_nodes(
         co_return co_await _group0.modify_voters({}, nodes_removed, as);
     }

+    auto& raft_server = _group0.group0_server();
+
+    const auto& leader_id = raft_server.current_leader();
+
     // Load the current cluster members
-    const auto& group0_config = _group0.group0_server().get_configuration();
+    const auto& group0_config = raft_server.get_configuration();

     const auto& nodes_alive = _gossiper.get_live_members();
     const auto& nodes_dead = _gossiper.get_unreachable_members();
@@ -463,13 +510,15 @@ future<> group0_voter_handler::update_nodes(
     group0_voter_calculator::nodes_list_t nodes;

     // Helper for adding a single node
-    auto add_node = [&nodes, &group0_config](const raft::server_id& id, const replica_state& rs, bool is_alive) {
+    auto add_node = [&nodes, &group0_config, &leader_id](const raft::server_id& id, const replica_state& rs, bool is_alive) {
         const auto is_voter = group0_config.can_vote(id);
+        const auto is_leader = (id == leader_id);
         nodes.emplace(id, group0_voter_calculator::node_descriptor{
                 .datacenter = rs.datacenter,
                 .rack = rs.rack,
                 .is_voter = is_voter,
                 .is_alive = is_alive,
+                .is_leader = is_leader,
         });
     };

diff --git a/test/boost/group0_voter_calculator_test.cc b/test/boost/group0_voter_calculator_test.cc
index 09eebf4cdd..4db13d6420 100644
--- a/test/boost/group0_voter_calculator_test.cc
+++ b/test/boost/group0_voter_calculator_test.cc
@@ -6,6 +6,7 @@
  * SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
  */

+#include <boost/test/data/test_case.hpp>
 #include <boost/test/unit_test.hpp>

 #include "service/raft/group0_voter_calculator.hh"
@@ -939,4 +940,106 @@ BOOST_AUTO_TEST_CASE(existing_voters_are_kept_across_racks) {
 }

+
+BOOST_DATA_TEST_CASE(leader_is_retained_as_voter, boost::unit_test::data::make({0, 1, 2}), leader_node_idx) {
+
+    // Arrange: Set the voters limit and create the voter calculator.
+
+    constexpr size_t max_voters = 3;
+
+    const service::group0_voter_calculator voter_calc{max_voters};
+
+    // Act: Add a third DC to a 2 DC cluster.
+
+    const std::array ids = {raft::server_id::create_random_id(), raft::server_id::create_random_id(), raft::server_id::create_random_id(),
+            raft::server_id::create_random_id(), raft::server_id::create_random_id(), raft::server_id::create_random_id()};
+
+    const service::group0_voter_calculator::nodes_list_t nodes = {
+            // The initial nodes (just 2 DCs)
+            {ids[0], {.datacenter = "dc-1", .rack = "rack", .is_voter = true, .is_alive = true, .is_leader = leader_node_idx == 0}},
+            {ids[1], {.datacenter = "dc-1", .rack = "rack", .is_voter = true, .is_alive = true, .is_leader = leader_node_idx == 1}},
+            {ids[2], {.datacenter = "dc-2", .rack = "rack", .is_voter = true, .is_alive = true, .is_leader = leader_node_idx == 2}},
+            {ids[3], {.datacenter = "dc-2", .rack = "rack", .is_voter = false, .is_alive = true}},
+            // The new nodes (3rd DC)
+            // - this will lead to 1 voter being removed from the DC-1
+            {ids[4], {.datacenter = "dc-3", .rack = "rack", .is_voter = false, .is_alive = true}},
+            {ids[5], {.datacenter = "dc-3", .rack = "rack", .is_voter = false, .is_alive = true}},
+    };
+
+    const auto& voters = voter_calc.distribute_voters(nodes);
+
+    // Assert: The current leader is retained as a voter, even after adding
+    // the third datacenter and redistributing voters.
+
+    BOOST_CHECK_EQUAL(voters.size(), max_voters);
+    BOOST_CHECK(voters.contains(ids[leader_node_idx]));
+}
+
+
+BOOST_DATA_TEST_CASE(leader_is_retained_as_voter_in_racks, boost::unit_test::data::make({0, 1, 3}), leader_node_idx) {
+
+    // Arrange: Set the voters limit and create the voter calculator.
+
+    constexpr size_t max_voters = 3;
+
+    const service::group0_voter_calculator voter_calc{max_voters};
+
+    // Act: Add a third DC to a 2 DC cluster.
+
+    const std::array ids = {raft::server_id::create_random_id(), raft::server_id::create_random_id(), raft::server_id::create_random_id(),
+            raft::server_id::create_random_id(), raft::server_id::create_random_id(), raft::server_id::create_random_id()};
+
+    const service::group0_voter_calculator::nodes_list_t nodes = {
+            // The initial nodes (just 2 DCs)
+            {ids[0], {.datacenter = "dc-1", .rack = "rack-1", .is_voter = true, .is_alive = true, .is_leader = leader_node_idx == 0}},
+            {ids[1], {.datacenter = "dc-1", .rack = "rack-2", .is_voter = true, .is_alive = true, .is_leader = leader_node_idx == 1}},
+            {ids[2], {.datacenter = "dc-2", .rack = "rack-3", .is_voter = false, .is_alive = true}},
+            {ids[3], {.datacenter = "dc-2", .rack = "rack-4", .is_voter = true, .is_alive = true, .is_leader = leader_node_idx == 3}},
+            // The new nodes (3rd DC)
+            // - this will lead to 1 voter being removed from the DC-1
+            {ids[4], {.datacenter = "dc-3", .rack = "rack-5", .is_voter = false, .is_alive = true}},
+            {ids[5], {.datacenter = "dc-3", .rack = "rack-6", .is_voter = false, .is_alive = true}},
+    };
+
+    const auto& voters = voter_calc.distribute_voters(nodes);
+
+    // Assert: The current leader is retained as a voter, even after adding
+    // the third datacenter and redistributing voters.
+
+    BOOST_CHECK_EQUAL(voters.size(), max_voters);
+    BOOST_CHECK(voters.contains(ids[leader_node_idx]));
+}
+
+
+BOOST_DATA_TEST_CASE(leader_is_retained_as_voter_in_two_dc_asymmetric_setup, boost::unit_test::data::make({0, 1}), leader_node_idx) {
+
+    // Arrange: Set the voters limit and create the voter calculator.
+
+    constexpr size_t max_voters = 3;
+
+    const service::group0_voter_calculator voter_calc{max_voters};
+
+    // Act: Remove nodes from a 2 DC cluster so that each DC has 1 node left.
+
+    // - this will lead to 1 voter being removed from one DC
+    // - the leader should be retained as a voter
+
+    const std::array ids = {raft::server_id::create_random_id(), raft::server_id::create_random_id()};
+
+    const service::group0_voter_calculator::nodes_list_t nodes = {
+            // The result nodes (just one per DC)
+            {ids[0], {.datacenter = "dc-1", .rack = "rack", .is_voter = true, .is_alive = true, .is_leader = leader_node_idx == 0}},
+            {ids[1], {.datacenter = "dc-2", .rack = "rack", .is_voter = true, .is_alive = true, .is_leader = leader_node_idx == 1}},
+            // no other nodes - all removed
+    };
+
+    const auto& voters = voter_calc.distribute_voters(nodes);
+
+    // Assert: The current leader is retained as a voter.
+
+    // TODO (scylladb/scylladb#23266): This should be equal to 1, after we enforce the odd number of voters
+    BOOST_CHECK_GE(voters.size(), 1);
+    BOOST_CHECK(voters.contains(ids[leader_node_idx]));
+}
+
+
 BOOST_AUTO_TEST_SUITE_END()
diff --git a/test/cluster/test_raft_voters.py b/test/cluster/test_raft_voters.py
index ef4d192e5d..939a21cd00 100644
--- a/test/cluster/test_raft_voters.py
+++ b/test/cluster/test_raft_voters.py
@@ -15,7 +15,7 @@ from test.pylib.internal_types import ServerInfo
 from test.pylib.manager_client import ManagerClient
 from test.pylib.rest_client import read_barrier
 from test.cluster.conftest import cluster_con, skip_mode
-from test.cluster.util import get_current_group0_config
+from test.cluster.util import get_coordinator_host_ids, get_current_group0_config


 GROUP0_VOTERS_LIMIT = 5
@@ -176,3 +176,60 @@ async def test_raft_limited_voters_upgrade(manager: ManagerClient):
     num_voters = await get_number_of_voters(manager, servers[0])

     assert num_voters == GROUP0_VOTERS_LIMIT, f"The number of voters should be limited to {GROUP0_VOTERS_LIMIT} (but is {num_voters})"
+
+
+@pytest.mark.asyncio
+async def test_raft_limited_voters_retain_coordinator(manager: ManagerClient):
+    """
+    Test that the topology coordinator is retained as a voter when possible.
+
+    This test ensures that if a voter needs to be removed and the topology coordinator
+    is among the candidates for removal, the coordinator is prioritized to remain a voter.
+
+    Arrange:
+        - Create 2 DCs with 3 nodes each.
+        - Retrieve the current topology coordinator.
+    Act:
+        - Add a third DC with 1 node.
+        - This will require removing one voter from either DC1 or DC2.
+    Assert:
+        - Verify that the topology coordinator is retained as a voter.
+    """
+
+    # Arrange: Create two DCs with 3 nodes each and find the current topology coordinator
+
+    dc_setup = [
+        {
+            'property_file': {'dc': 'dc1', 'rack': 'rack1'},
+            'num_nodes': 3,
+        },
+        {
+            'property_file': {'dc': 'dc2', 'rack': 'rack2'},
+            'num_nodes': 3,
+        },
+    ]
+
+    dc_servers = []
+    for dc in dc_setup:
+        logging.info(
+            f"Creating {dc['property_file']['dc']} with {dc['num_nodes']} nodes")
+        dc_servers.append(await manager.servers_add(dc['num_nodes'],
+                                                    property_file=dc['property_file']))
+
+    assert len(dc_servers) == len(dc_setup)
+
+    coordinator_ids = await get_coordinator_host_ids(manager)
+    assert coordinator_ids, "At least 1 coordinator id should be found"
+
+    coordinator_id = coordinator_ids[0]
+
+    # Act: Add a new DC with one node
+
+    logging.info('Adding a new DC with one node')
+    await manager.servers_add(1, property_file={'dc': 'dc3', 'rack': 'rack3'})
+
+    # Assert: Verify that the topology coordinator is still a voter
+
+    group0_members = await get_current_group0_config(manager, dc_servers[0][0])
+    assert any(m[0] == coordinator_id and m[1] for m in group0_members), \
+        f"The coordinator {coordinator_id} should be a voter (but is not)"
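
Note (illustrative only, not part of the patch): the sketch below shows how the additive node priorities introduced above are meant to interact. The constant names are hypothetical stand-ins; the value of the pre-existing alive priority is not visible in this diff, so the sketch assumes it is 4, i.e. strictly larger than voter + leader, which is what the patch comments imply.

#include <cstdint>

// Hypothetical stand-ins for the constants in node_priority; `voter_priority`
// and `leader_priority` mirror the patch, `alive_priority` is an assumed value.
constexpr uint64_t alive_priority = 4;   // assumption: larger than voter + leader
constexpr uint64_t voter_priority = 1;   // existing constant in the patch
constexpr uint64_t leader_priority = 2;  // added by the patch

struct node_state {
    bool is_alive;
    bool is_voter;
    bool is_leader;
};

// Mirrors the shape of node_priority::get_value(): the priorities are simply summed.
constexpr uint64_t get_priority(node_state n) {
    uint64_t priority = 0;
    if (n.is_alive) {
        priority += alive_priority;
    }
    if (n.is_voter) {
        priority += voter_priority;
    }
    if (n.is_leader) {
        priority += leader_priority;
    }
    return priority;
}

// An alive leader that is not yet a voter (4 + 2 = 6) outranks an alive
// non-leader voter (4 + 1 = 5), so the leader wins a contested voter slot.
static_assert(get_priority({.is_alive = true, .is_voter = false, .is_leader = true}) >
              get_priority({.is_alive = true, .is_voter = true, .is_leader = false}));

// Any alive node (4) still outranks a dead node that is both voter and leader (1 + 2 = 3),
// matching the "still prefer alive nodes" comments in the patch.
static_assert(get_priority({.is_alive = true, .is_voter = false, .is_leader = false}) >
              get_priority({.is_alive = false, .is_voter = true, .is_leader = true}));

int main() {}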