diff --git a/service/raft/group0_voter_handler.cc b/service/raft/group0_voter_handler.cc index 3ce8015772..58b419eebc 100644 --- a/service/raft/group0_voter_handler.cc +++ b/service/raft/group0_voter_handler.cc @@ -36,16 +36,16 @@ bool operator<(const is_voter_t& lhs, const is_voter_t& rhs) { } -// Represents information about a datacenter and provides functionality to manage voter selection. +// Represents information about a rack and provides functionality to manage voter selection. // -// This class is responsible for storing information about nodes within a datacenter and selecting voters based +// This class is responsible for storing information about nodes within a rack and selecting voters based // on specific criteria. // It maintains a map of nodes categorized by their status (alive/dead) and role (voter/non-voter) and provides // methods to select the next voter and check if more candidates are available. // -// The class also defines a priority comparator to determine the priority of datacenters based on the number of +// The class also defines a priority comparator to determine the priority of racks based on the number of // assigned voters, alive nodes, and dead voters. -class datacenter_info { +class rack_info { using nodes_map_key_t = std::tuple; using nodes_map_t = std::multimap; @@ -61,16 +61,16 @@ class datacenter_info { } public: - explicit datacenter_info(std::ranges::input_range auto&& nodes) + explicit rack_info(std::ranges::input_range auto&& nodes) : _nodes(create_nodes_map(nodes)) { } - // Select the "best" next voter from the datacenter + // Select the "best" next voter from the rack // // Returns the ID of the selected voter or `std::nullopt` if no more candidates are available. // // If a node is selected, it is removed from the list of candidates, and the number of voters assigned - // to the datacenter is incremented. + // to the rack is incremented. [[nodiscard]] std::optional select_next_voter() { const auto node = std::invoke([this]() { // Process alive nodes first, then dead nodes @@ -99,13 +99,133 @@ public: return voter_id; } + // Check if there are more candidates available for voter selection + // + // Returns `true` if there are more candidates available, `false` otherwise. + [[nodiscard]] bool has_more_candidates() const { + return !_nodes.empty(); + } + + // The priority comparator for the rack_info + friend bool operator<(const rack_info& rack1, const rack_info& rack2) { + // First criteria: The number of already assigned voters (lower has more priority) + if (rack1._voters_count != rack2._voters_count) { + return rack1._voters_count > rack2._voters_count; + } + + const auto& rack1_alive_nodes_boundary = rack1._nodes.lower_bound(std::make_tuple(is_alive_t::yes, is_voter_t::no)); + const auto& rack2_alive_nodes_boundary = rack2._nodes.lower_bound(std::make_tuple(is_alive_t::yes, is_voter_t::no)); + + // Second criteria: The number of alive nodes (voters and non-voters) remaining (higher has more priority) + + const auto rack1_alive_nodes_remaining = std::distance(rack1_alive_nodes_boundary, rack1._nodes.end()); + const auto rack2_alive_nodes_remaining = std::distance(rack2_alive_nodes_boundary, rack2._nodes.end()); + if (rack1_alive_nodes_remaining != rack2_alive_nodes_remaining) { + return rack1_alive_nodes_remaining < rack2_alive_nodes_remaining; + } + + // Third criteria: The number of dead voters remaining (higher has more priority) + // We want to keep the dead voters in case we can't find enough alive voters. + + // Note that the nodes don't contain dead non-voters (we filter them out in `group0_voter_calculator::distribute_voters`), + // so we can use the whole subrange from the beginning (and can avoid another boundary search). + // We check for this condition in the non-release builds. + if constexpr (!tools::build_info::is_release_build()) { + SEASTAR_ASSERT(!rack1._nodes.contains(std::make_tuple(is_alive_t::no, is_voter_t::no))); + SEASTAR_ASSERT(!rack2._nodes.contains(std::make_tuple(is_alive_t::no, is_voter_t::no))); + } + + const auto rack1_dead_nodes_remaining = std::distance(rack1._nodes.begin(), rack1_alive_nodes_boundary); + const auto rack2_dead_nodes_remaining = std::distance(rack2._nodes.begin(), rack2_alive_nodes_boundary); + return rack1_dead_nodes_remaining < rack2_dead_nodes_remaining; + } + +}; + + +// Represents information about a datacenter and provides functionality to manage voter selection. +// +// This class is responsible for storing information about nodes within a datacenter and selecting voters based +// on specific criteria. +// It maintains a map of nodes categorized by their status (alive/dead) and role (voter/non-voter) and provides +// methods to select the next voter and check if more candidates are available. +// +// The class also defines a priority comparator to determine the priority of datacenters based on the number of +// assigned voters, alive nodes, and dead voters. +class datacenter_info { + + size_t _nodes_remaining = 0; + size_t _voters_count = 0; + + using racks_store_t = std::priority_queue; + racks_store_t _racks; + + static racks_store_t create_racks_list(std::ranges::input_range auto&& nodes, size_t nodes_remaining) { + const auto nodes_by_rack = nodes | std::views::transform([](const auto& node_entry) { + const auto& [id, node] = node_entry; + return std::make_pair(std::string_view{node.rack}, std::make_pair(id, std::cref(node))); + }) | std::ranges::to(); + + racks_store_t::container_type racks; + + for (const auto rack : nodes_by_rack | std::views::keys | std::ranges::to()) { + const auto [first, last] = nodes_by_rack.equal_range(rack); + + racks.emplace_back(std::ranges::subrange(first, last) | std::views::transform([](const auto& node_entry) { + return node_entry.second; + })); + + nodes_remaining += std::distance(first, last); + } + + return racks | std::ranges::to(); + } + +public: + explicit datacenter_info(std::ranges::input_range auto&& nodes) + : _racks(create_racks_list(nodes, _nodes_remaining)) { + } + + // Select the "best" next voter from the datacenter + // + // Returns the ID of the selected voter or `std::nullopt` if no more candidates are available. + // + // If a node is selected, it is removed from the list of candidates, and the number of voters assigned + // to the datacenter is incremented. + [[nodiscard]] std::optional select_next_voter() { + while (!_racks.empty()) { + + // Select the datacenter with the highest priority (according to the comparator) + auto rack = _racks.top(); + _racks.pop(); + + const auto voter_id = rack.select_next_voter(); + + if (!voter_id) { + continue; + } + + if (rack.has_more_candidates()) { + _racks.push(rack); + } + + --_nodes_remaining; + ++_voters_count; + + return voter_id; + } + + // No more nodes to select + return std::nullopt; + } + // Check if there are more candidates available for voter selection // // Returns `true` if there are more candidates available, `false` otherwise. // // The selection is limited by the maximum number of voters per datacenter. [[nodiscard]] bool has_more_candidates(size_t voters_max_per_dc) const { - return !_nodes.empty() && _voters_count < voters_max_per_dc; + return _nodes_remaining > 0 && _voters_count < voters_max_per_dc; } // The priority comparator for the datacenter_info @@ -116,31 +236,8 @@ public: return dc1._voters_count > dc2._voters_count; } - const auto& dc1_alive_nodes_boundary = dc1._nodes.lower_bound(std::make_tuple(is_alive_t::yes, is_voter_t::no)); - const auto& dc2_alive_nodes_boundary = dc2._nodes.lower_bound(std::make_tuple(is_alive_t::yes, is_voter_t::no)); - - // Second criteria: The number of alive nodes (voters and non-voters) remaining (higher has more priority) - - const auto dc1_alive_nodes_remaining = std::distance(dc1_alive_nodes_boundary, dc1._nodes.end()); - const auto dc2_alive_nodes_remaining = std::distance(dc2_alive_nodes_boundary, dc2._nodes.end()); - if (dc1_alive_nodes_remaining != dc2_alive_nodes_remaining) { - return dc1_alive_nodes_remaining < dc2_alive_nodes_remaining; - } - - // Third criteria: The number of dead voters remaining (higher has more priority) - // We want to keep the dead voters in case we can't find enough alive voters. - - // Note that the nodes do not contain dead non-voters (they are filtered out in `group0_voter_calculator::distribute_voters`), - // so we can use the entire subrange from the beginning and avoid another boundary search. - // We check for this condition in the non-release builds. - if constexpr (!tools::build_info::is_release_build()) { - SEASTAR_ASSERT(!dc1._nodes.contains(std::make_tuple(is_alive_t::no, is_voter_t::no))); - SEASTAR_ASSERT(!dc2._nodes.contains(std::make_tuple(is_alive_t::no, is_voter_t::no))); - } - - const auto dc1_dead_nodes = std::distance(dc1._nodes.begin(), dc1_alive_nodes_boundary); - const auto dc2_dead_nodes = std::distance(dc2._nodes.begin(), dc2_alive_nodes_boundary); - return dc1_dead_nodes < dc2_dead_nodes; + // Second criteria: The number of racks (higher has more priority) + return dc1._racks.size() < dc2._racks.size(); } }; diff --git a/test/boost/group0_voter_calculator_test.cc b/test/boost/group0_voter_calculator_test.cc index 0491f6bb2e..f912f55fe9 100644 --- a/test/boost/group0_voter_calculator_test.cc +++ b/test/boost/group0_voter_calculator_test.cc @@ -467,9 +467,9 @@ BOOST_AUTO_TEST_CASE(dc_cannot_have_half_or_more_of_voters) { raft::server_id::create_random_id(), raft::server_id::create_random_id()}; const service::group0_voter_calculator::nodes_list_t nodes = { - {ids[0], {.datacenter = "dc-1", .rack = "rack", .is_voter = false, .is_alive = true}}, - {ids[1], {.datacenter = "dc-1", .rack = "rack", .is_voter = false, .is_alive = true}}, - {ids[2], {.datacenter = "dc-1", .rack = "rack", .is_voter = false, .is_alive = true}}, + {ids[0], {.datacenter = "dc-1", .rack = "rack-1", .is_voter = false, .is_alive = true}}, + {ids[1], {.datacenter = "dc-1", .rack = "rack-2", .is_voter = false, .is_alive = true}}, + {ids[2], {.datacenter = "dc-1", .rack = "rack-2", .is_voter = false, .is_alive = true}}, {ids[3], {.datacenter = "dc-2", .rack = "rack", .is_voter = false, .is_alive = true}}, {ids[4], {.datacenter = "dc-3", .rack = "rack", .is_voter = false, .is_alive = true}}, }; @@ -747,4 +747,126 @@ BOOST_AUTO_TEST_CASE(dead_voters_moved_to_available_alive_nodes) { } +BOOST_AUTO_TEST_CASE(voters_are_distributed_across_racks) { + + // Arrange: Set the voters limit and create the voter calculator. + + constexpr size_t max_voters = 3; + + const service::group0_voter_calculator voter_calc{max_voters}; + + // Act: Add the nodes. + + const std::array ids = {raft::server_id::create_random_id(), raft::server_id::create_random_id(), raft::server_id::create_random_id(), + raft::server_id::create_random_id(), raft::server_id::create_random_id(), raft::server_id::create_random_id(), raft::server_id::create_random_id()}; + + const service::group0_voter_calculator::nodes_list_t nodes = { + {ids[0], {.datacenter = "dc", .rack = "rack-1", .is_voter = false, .is_alive = true}}, + {ids[1], {.datacenter = "dc", .rack = "rack-1", .is_voter = false, .is_alive = true}}, + {ids[2], {.datacenter = "dc", .rack = "rack-1", .is_voter = false, .is_alive = true}}, + {ids[3], {.datacenter = "dc", .rack = "rack-2", .is_voter = false, .is_alive = true}}, + {ids[4], {.datacenter = "dc", .rack = "rack-2", .is_voter = false, .is_alive = true}}, + {ids[5], {.datacenter = "dc", .rack = "rack-3", .is_voter = false, .is_alive = true}}, + {ids[6], {.datacenter = "dc", .rack = "rack-3", .is_voter = false, .is_alive = true}}, + }; + + const auto& voters = voter_calc.distribute_voters(nodes); + + // Assert: There is no duplicate rack across the voters (having 3 voters and 3 racks). + + BOOST_CHECK_EQUAL(voters.size(), max_voters); + + std::unordered_set voters_racks; + for (const auto id : voters) { + const auto& node = nodes.at(id); + BOOST_CHECK_MESSAGE(voters_racks.emplace(node.rack).second, fmt::format("Duplicate voter in the same rack: {}", node.rack)); + } +} + + +BOOST_AUTO_TEST_CASE(more_racks_preferred_over_more_nodes_in_two_dc) { + + // Arrange: Set the voters limit and create the voter calculator. + + constexpr size_t max_voters = 3; + + const service::group0_voter_calculator voter_calc{max_voters}; + + // Act: Add the nodes. + + const std::array ids = {raft::server_id::create_random_id(), raft::server_id::create_random_id(), raft::server_id::create_random_id(), + raft::server_id::create_random_id(), raft::server_id::create_random_id(), raft::server_id::create_random_id(), raft::server_id::create_random_id()}; + + const service::group0_voter_calculator::nodes_list_t nodes = { + {ids[0], {.datacenter = "dc-2", .rack = "rack-3", .is_voter = false, .is_alive = true}}, + {ids[1], {.datacenter = "dc-1", .rack = "rack-1", .is_voter = false, .is_alive = true}}, + {ids[2], {.datacenter = "dc-1", .rack = "rack-2", .is_voter = false, .is_alive = true}}, + {ids[3], {.datacenter = "dc-2", .rack = "rack-3", .is_voter = false, .is_alive = true}}, + {ids[4], {.datacenter = "dc-2", .rack = "rack-3", .is_voter = false, .is_alive = true}}, + {ids[5], {.datacenter = "dc-1", .rack = "rack-2", .is_voter = false, .is_alive = true}}, + {ids[6], {.datacenter = "dc-2", .rack = "rack-3", .is_voter = false, .is_alive = true}}, + }; + + const auto& voters = voter_calc.distribute_voters(nodes); + + // Assert: The racks are preferred over the nodes (DC-1 has more racks so it should have more voters, despite DC-2 having more nodes). + + BOOST_CHECK_EQUAL(voters.size(), max_voters); + + std::unordered_map voters_dcs; + for (const auto id : voters) { + const auto& node = nodes.at(id); + ++voters_dcs[node.datacenter]; + } + + BOOST_CHECK_GT(voters_dcs["dc-1"], voters_dcs["dc-2"]); + + // Check that the DC-1 has voters in different racks. + std::unordered_set voters_racks; + for (const auto id : voters | std::views::filter([&nodes](const auto id) { + return nodes.at(id).datacenter == "dc-1"; + })) { + const auto& node = nodes.at(id); + BOOST_CHECK_MESSAGE(voters_racks.emplace(node.rack).second, fmt::format("Duplicate voter in the same rack: {}", node.rack)); + } +} + + +BOOST_AUTO_TEST_CASE(dcs_preferred_over_racks) { + + // Arrange: Set the voters limit and create the voter calculator. + + constexpr size_t max_voters = 3; + + const service::group0_voter_calculator voter_calc{max_voters}; + + // Act: Add the nodes. + + const std::array ids = {raft::server_id::create_random_id(), raft::server_id::create_random_id(), raft::server_id::create_random_id(), + raft::server_id::create_random_id(), raft::server_id::create_random_id(), raft::server_id::create_random_id(), raft::server_id::create_random_id()}; + + const service::group0_voter_calculator::nodes_list_t nodes = { + {ids[0], {.datacenter = "dc-1", .rack = "rack-1", .is_voter = false, .is_alive = true}}, + {ids[1], {.datacenter = "dc-2", .rack = "rack-4", .is_voter = false, .is_alive = true}}, + {ids[2], {.datacenter = "dc-1", .rack = "rack-2", .is_voter = false, .is_alive = true}}, + {ids[3], {.datacenter = "dc-4", .rack = "rack-6", .is_voter = false, .is_alive = true}}, + {ids[4], {.datacenter = "dc-3", .rack = "rack-5", .is_voter = false, .is_alive = true}}, + {ids[5], {.datacenter = "dc-1", .rack = "rack-3", .is_voter = false, .is_alive = true}}, + {ids[6], {.datacenter = "dc-5", .rack = "rack-7", .is_voter = false, .is_alive = true}}, + }; + + const auto& voters = voter_calc.distribute_voters(nodes); + + // Assert: The DCs are preferred over racks when distributing voters (each DC should have a voter). + + BOOST_CHECK_EQUAL(voters.size(), max_voters); + + std::unordered_set voters_dcs; + for (const auto id : voters) { + const auto& node = nodes.at(id); + BOOST_CHECK_MESSAGE(voters_dcs.emplace(node.datacenter).second, fmt::format("Duplicate voter in the same DC: {}", node.datacenter)); + } +} + + BOOST_AUTO_TEST_SUITE_END()