diff --git a/service/raft/group0_voter_handler.cc b/service/raft/group0_voter_handler.cc index eba1ff4e31..aadd82e2ac 100644 --- a/service/raft/group0_voter_handler.cc +++ b/service/raft/group0_voter_handler.cc @@ -69,6 +69,7 @@ using nodes_ref_list_t = std::vector(); @@ -105,7 +115,7 @@ class rack_info { public: explicit rack_info(std::ranges::input_range auto&& nodes) - : _nodes(create_nodes_list(nodes, _alive_nodes_remaining, _existing_dead_voters_remaining)) { + : _nodes(create_nodes_list(nodes, _alive_nodes_remaining, _existing_alive_voters_remaining, _existing_dead_voters_remaining)) { } // Select the "best" next voter from the rack @@ -115,6 +125,8 @@ public: // If a node is selected, it is removed from the list of candidates, and the number of voters assigned // to the rack is incremented. [[nodiscard]] std::optional select_next_voter() { + _selected_voter = nullptr; + if (_nodes.empty()) { return std::nullopt; } @@ -125,16 +137,29 @@ public: if (node.get().is_alive) { SCYLLA_ASSERT(_alive_nodes_remaining > 0); --_alive_nodes_remaining; - } else if (node.get().is_voter) { - SCYLLA_ASSERT(_existing_dead_voters_remaining > 0); - --_existing_dead_voters_remaining; + } + if (node.get().is_voter) { + if (node.get().is_alive) { + SCYLLA_ASSERT(_existing_alive_voters_remaining > 0); + --_existing_alive_voters_remaining; + } else { + SCYLLA_ASSERT(_existing_dead_voters_remaining > 0); + --_existing_dead_voters_remaining; + } } ++_assigned_voters_count; + _selected_voter = &node.get(); return voter_id; } + // Get the node descriptor for the currently selected voter + [[nodiscard]] const group0_voter_calculator::node_descriptor& get_selected_voter_info() const { + SCYLLA_ASSERT(_selected_voter != nullptr); + return *_selected_voter; + } + // Check if there are more candidates available for voter selection // // Returns `true` if there are more candidates available, `false` otherwise. @@ -149,12 +174,17 @@ public: return rack1._assigned_voters_count > rack2._assigned_voters_count; } - // Second criteria: The number of alive nodes (voters and non-voters) remaining (higher has more priority) + // Second criteria: The number of existing alive voters remaining (higher has more priority) + if (rack1._existing_alive_voters_remaining != rack2._existing_alive_voters_remaining) { + return rack1._existing_alive_voters_remaining < rack2._existing_alive_voters_remaining; + } + + // Third criteria: The number of alive nodes (voters and non-voters) remaining (higher has more priority) if (rack1._alive_nodes_remaining != rack2._alive_nodes_remaining) { return rack1._alive_nodes_remaining < rack2._alive_nodes_remaining; } - // Third criteria: The number of existing dead voters remaining (higher has more priority) + // Fourth criteria: The number of existing dead voters remaining (higher has more priority) // We want to keep the existing dead voters in case we can't find enough alive voters. return rack1._existing_dead_voters_remaining < rack2._existing_dead_voters_remaining; } @@ -175,13 +205,22 @@ class datacenter_info { size_t _nodes_remaining = 0; size_t _assigned_voters_count = 0; + size_t _existing_alive_voters_remaining = 0; + using racks_store_t = std::priority_queue; racks_store_t _racks; - static racks_store_t create_racks_list(std::ranges::input_range auto&& nodes) { + static racks_store_t create_racks_list(std::ranges::input_range auto&& nodes, size_t& existing_alive_voters_remaining) { + existing_alive_voters_remaining = 0; + std::unordered_map nodes_by_rack; for (const auto& [id, node] : nodes) { nodes_by_rack[node.get().rack].emplace_back(id, node); + if (node.get().is_alive) { + if (node.get().is_voter) { + ++existing_alive_voters_remaining; + } + } } return nodes_by_rack | std::views::values | std::ranges::to(); @@ -190,7 +229,7 @@ class datacenter_info { public: explicit datacenter_info(std::ranges::sized_range auto&& nodes) : _nodes_remaining(nodes.size()) - , _racks(create_racks_list(nodes)) { + , _racks(create_racks_list(nodes, _existing_alive_voters_remaining)) { } // Select the "best" next voter from the datacenter @@ -216,6 +255,15 @@ public: _racks.push(rack); } + const auto& node = rack.get_selected_voter_info(); + + if (node.is_alive) { + if (node.is_voter) { + SCYLLA_ASSERT(_existing_alive_voters_remaining > 0); + --_existing_alive_voters_remaining; + } + } + SCYLLA_ASSERT(_nodes_remaining > 0); --_nodes_remaining; @@ -244,7 +292,12 @@ public: return dc1._assigned_voters_count > dc2._assigned_voters_count; } - // Second criteria: The number of racks (higher has more priority) + // Second criteria: The number of existing alive voters remaining (higher has more priority) + if (dc1._existing_alive_voters_remaining != dc2._existing_alive_voters_remaining) { + return dc1._existing_alive_voters_remaining < dc2._existing_alive_voters_remaining; + } + + // Third criteria: The number of racks (higher has more priority) return dc1._racks.size() < dc2._racks.size(); } }; diff --git a/test/boost/group0_voter_calculator_test.cc b/test/boost/group0_voter_calculator_test.cc index f912f55fe9..09eebf4cdd 100644 --- a/test/boost/group0_voter_calculator_test.cc +++ b/test/boost/group0_voter_calculator_test.cc @@ -869,4 +869,74 @@ BOOST_AUTO_TEST_CASE(dcs_preferred_over_racks) { } +BOOST_AUTO_TEST_CASE(existing_voters_are_retained_across_dcs) { + + // Arrange: Set the voters limit and create the voter calculator. + + constexpr size_t max_voters = 3; + + const service::group0_voter_calculator voter_calc{max_voters}; + + // Act: Add the nodes (3 of them are voters). + + const std::array ids = {raft::server_id::create_random_id(), raft::server_id::create_random_id(), raft::server_id::create_random_id(), + raft::server_id::create_random_id(), raft::server_id::create_random_id()}; + + const service::group0_voter_calculator::nodes_list_t nodes = { + {ids[0], {.datacenter = "dc-1", .rack = "rack", .is_voter = true, .is_alive = true}}, + {ids[1], {.datacenter = "dc-2", .rack = "rack", .is_voter = false, .is_alive = true}}, + {ids[2], {.datacenter = "dc-3", .rack = "rack", .is_voter = true, .is_alive = true}}, + {ids[3], {.datacenter = "dc-4", .rack = "rack", .is_voter = false, .is_alive = true}}, + {ids[4], {.datacenter = "dc-5", .rack = "rack", .is_voter = true, .is_alive = true}}, + }; + + const auto& voters = voter_calc.distribute_voters(nodes); + + // Assert: Existing voters are retained in the voter set. + + BOOST_CHECK_EQUAL(voters.size(), max_voters); + + for (const auto& [id, node] : nodes | std::views::filter([](const auto& node) { + return node.second.is_voter; + })) { + BOOST_CHECK(voters.contains(id)); + } +} + + +BOOST_AUTO_TEST_CASE(existing_voters_are_kept_across_racks) { + + // Arrange: Set the voters limit and create the voter calculator. + + constexpr size_t max_voters = 3; + + const service::group0_voter_calculator voter_calc{max_voters}; + + // Act: Add the nodes (3 of them are voters). + + const std::array ids = {raft::server_id::create_random_id(), raft::server_id::create_random_id(), raft::server_id::create_random_id(), + raft::server_id::create_random_id(), raft::server_id::create_random_id()}; + + const service::group0_voter_calculator::nodes_list_t nodes = { + {ids[0], {.datacenter = "dc", .rack = "rack-1", .is_voter = true, .is_alive = true}}, + {ids[1], {.datacenter = "dc", .rack = "rack-2", .is_voter = false, .is_alive = true}}, + {ids[2], {.datacenter = "dc", .rack = "rack-3", .is_voter = true, .is_alive = true}}, + {ids[3], {.datacenter = "dc", .rack = "rack-4", .is_voter = false, .is_alive = true}}, + {ids[4], {.datacenter = "dc", .rack = "rack-5", .is_voter = true, .is_alive = true}}, + }; + + const auto& voters = voter_calc.distribute_voters(nodes); + + // Assert: Existing voters are retained in the voter set. + + BOOST_CHECK_EQUAL(voters.size(), max_voters); + + for (const auto& [id, node] : nodes | std::views::filter([](const auto& node) { + return node.second.is_voter; + })) { + BOOST_CHECK(voters.contains(id)); + } +} + + BOOST_AUTO_TEST_SUITE_END()