From 76ceaf129bb46a8fe297faeb85fd47c65337b78a Mon Sep 17 00:00:00 2001
From: Emil Maskovsky
Date: Fri, 10 Jan 2025 16:44:30 +0100
Subject: [PATCH] raft: distribute voters by rack inside DC

Distribute the voters evenly across racks in the datacenters.

When distributing the voters across datacenters, the datacenters with
more racks will be preferred in case of a tie. Also, in case of
asymmetric voter distribution (2 DCs), the DC with more racks will have
more voters (if the node counts allow it).

In case of a single datacenter, the voters will be distributed across
racks evenly (in a similar manner as done for the whole datacenters).

The intention is that, similar to losing a datacenter, we want to avoid
losing the majority if a rack goes down - so if there are multiple
racks, we want to distribute the voters across them in such a way that
losing a whole rack will not cause the loss of the majority (if
possible).
---
 service/raft/group0_voter_handler.cc       | 163 ++++++++++++++++-----
 test/boost/group0_voter_calculator_test.cc | 128 +++++++++++++++-
 2 files changed, 255 insertions(+), 36 deletions(-)

diff --git a/service/raft/group0_voter_handler.cc b/service/raft/group0_voter_handler.cc
index 3ce8015772..58b419eebc 100644
--- a/service/raft/group0_voter_handler.cc
+++ b/service/raft/group0_voter_handler.cc
@@ -36,16 +36,16 @@ bool operator<(const is_voter_t& lhs, const is_voter_t& rhs) {
 }
 
-// Represents information about a datacenter and provides functionality to manage voter selection.
+// Represents information about a rack and provides functionality to manage voter selection.
 //
-// This class is responsible for storing information about nodes within a datacenter and selecting voters based
+// This class is responsible for storing information about nodes within a rack and selecting voters based
 // on specific criteria.
// It maintains a map of nodes categorized by their status (alive/dead) and role (voter/non-voter) and provides // methods to select the next voter and check if more candidates are available. // -// The class also defines a priority comparator to determine the priority of datacenters based on the number of +// The class also defines a priority comparator to determine the priority of racks based on the number of // assigned voters, alive nodes, and dead voters. -class datacenter_info { +class rack_info { using nodes_map_key_t = std::tuple; using nodes_map_t = std::multimap; @@ -61,16 +61,16 @@ class datacenter_info { } public: - explicit datacenter_info(std::ranges::input_range auto&& nodes) + explicit rack_info(std::ranges::input_range auto&& nodes) : _nodes(create_nodes_map(nodes)) { } - // Select the "best" next voter from the datacenter + // Select the "best" next voter from the rack // // Returns the ID of the selected voter or `std::nullopt` if no more candidates are available. // // If a node is selected, it is removed from the list of candidates, and the number of voters assigned - // to the datacenter is incremented. + // to the rack is incremented. [[nodiscard]] std::optional select_next_voter() { const auto node = std::invoke([this]() { // Process alive nodes first, then dead nodes @@ -99,13 +99,133 @@ public: return voter_id; } + // Check if there are more candidates available for voter selection + // + // Returns `true` if there are more candidates available, `false` otherwise. 
+ [[nodiscard]] bool has_more_candidates() const { + return !_nodes.empty(); + } + + // The priority comparator for the rack_info + friend bool operator<(const rack_info& rack1, const rack_info& rack2) { + // First criteria: The number of already assigned voters (lower has more priority) + if (rack1._voters_count != rack2._voters_count) { + return rack1._voters_count > rack2._voters_count; + } + + const auto& rack1_alive_nodes_boundary = rack1._nodes.lower_bound(std::make_tuple(is_alive_t::yes, is_voter_t::no)); + const auto& rack2_alive_nodes_boundary = rack2._nodes.lower_bound(std::make_tuple(is_alive_t::yes, is_voter_t::no)); + + // Second criteria: The number of alive nodes (voters and non-voters) remaining (higher has more priority) + + const auto rack1_alive_nodes_remaining = std::distance(rack1_alive_nodes_boundary, rack1._nodes.end()); + const auto rack2_alive_nodes_remaining = std::distance(rack2_alive_nodes_boundary, rack2._nodes.end()); + if (rack1_alive_nodes_remaining != rack2_alive_nodes_remaining) { + return rack1_alive_nodes_remaining < rack2_alive_nodes_remaining; + } + + // Third criteria: The number of dead voters remaining (higher has more priority) + // We want to keep the dead voters in case we can't find enough alive voters. + + // Note that the nodes don't contain dead non-voters (we filter them out in `group0_voter_calculator::distribute_voters`), + // so we can use the whole subrange from the beginning (and can avoid another boundary search). + // We check for this condition in the non-release builds. 
+ if constexpr (!tools::build_info::is_release_build()) { + SEASTAR_ASSERT(!rack1._nodes.contains(std::make_tuple(is_alive_t::no, is_voter_t::no))); + SEASTAR_ASSERT(!rack2._nodes.contains(std::make_tuple(is_alive_t::no, is_voter_t::no))); + } + + const auto rack1_dead_nodes_remaining = std::distance(rack1._nodes.begin(), rack1_alive_nodes_boundary); + const auto rack2_dead_nodes_remaining = std::distance(rack2._nodes.begin(), rack2_alive_nodes_boundary); + return rack1_dead_nodes_remaining < rack2_dead_nodes_remaining; + } + +}; + + +// Represents information about a datacenter and provides functionality to manage voter selection. +// +// This class is responsible for storing information about nodes within a datacenter and selecting voters based +// on specific criteria. +// It maintains a map of nodes categorized by their status (alive/dead) and role (voter/non-voter) and provides +// methods to select the next voter and check if more candidates are available. +// +// The class also defines a priority comparator to determine the priority of datacenters based on the number of +// assigned voters, alive nodes, and dead voters. 
+class datacenter_info {
+
+    size_t _nodes_remaining = 0;
+    size_t _voters_count = 0;
+
+    using racks_store_t = std::priority_queue;
+    racks_store_t _racks;
+
+    // Note: `nodes_remaining` is an out-parameter (taken by reference) - it accumulates the
+    // total node count across all racks so the constructor can populate `_nodes_remaining`.
+    static racks_store_t create_racks_list(std::ranges::input_range auto&& nodes, size_t& nodes_remaining) {
+        const auto nodes_by_rack = nodes | std::views::transform([](const auto& node_entry) {
+            const auto& [id, node] = node_entry;
+            return std::make_pair(std::string_view{node.rack}, std::make_pair(id, std::cref(node)));
+        }) | std::ranges::to();
+
+        racks_store_t::container_type racks;
+
+        for (const auto rack : nodes_by_rack | std::views::keys | std::ranges::to()) {
+            const auto [first, last] = nodes_by_rack.equal_range(rack);
+
+            racks.emplace_back(std::ranges::subrange(first, last) | std::views::transform([](const auto& node_entry) {
+                return node_entry.second;
+            }));
+
+            nodes_remaining += std::distance(first, last);
+        }
+
+        return racks | std::ranges::to();
+    }
+
+public:
+    explicit datacenter_info(std::ranges::input_range auto&& nodes)
+        : _racks(create_racks_list(nodes, _nodes_remaining)) {
+    }
+
+    // Select the "best" next voter from the datacenter
+    //
+    // Returns the ID of the selected voter or `std::nullopt` if no more candidates are available.
+    //
+    // If a node is selected, it is removed from the list of candidates, and the number of voters assigned
+    // to the datacenter is incremented. 
+ [[nodiscard]] std::optional select_next_voter() { + while (!_racks.empty()) { + + // Select the datacenter with the highest priority (according to the comparator) + auto rack = _racks.top(); + _racks.pop(); + + const auto voter_id = rack.select_next_voter(); + + if (!voter_id) { + continue; + } + + if (rack.has_more_candidates()) { + _racks.push(rack); + } + + --_nodes_remaining; + ++_voters_count; + + return voter_id; + } + + // No more nodes to select + return std::nullopt; + } + // Check if there are more candidates available for voter selection // // Returns `true` if there are more candidates available, `false` otherwise. // // The selection is limited by the maximum number of voters per datacenter. [[nodiscard]] bool has_more_candidates(size_t voters_max_per_dc) const { - return !_nodes.empty() && _voters_count < voters_max_per_dc; + return _nodes_remaining > 0 && _voters_count < voters_max_per_dc; } // The priority comparator for the datacenter_info @@ -116,31 +236,8 @@ public: return dc1._voters_count > dc2._voters_count; } - const auto& dc1_alive_nodes_boundary = dc1._nodes.lower_bound(std::make_tuple(is_alive_t::yes, is_voter_t::no)); - const auto& dc2_alive_nodes_boundary = dc2._nodes.lower_bound(std::make_tuple(is_alive_t::yes, is_voter_t::no)); - - // Second criteria: The number of alive nodes (voters and non-voters) remaining (higher has more priority) - - const auto dc1_alive_nodes_remaining = std::distance(dc1_alive_nodes_boundary, dc1._nodes.end()); - const auto dc2_alive_nodes_remaining = std::distance(dc2_alive_nodes_boundary, dc2._nodes.end()); - if (dc1_alive_nodes_remaining != dc2_alive_nodes_remaining) { - return dc1_alive_nodes_remaining < dc2_alive_nodes_remaining; - } - - // Third criteria: The number of dead voters remaining (higher has more priority) - // We want to keep the dead voters in case we can't find enough alive voters. 
- - // Note that the nodes do not contain dead non-voters (they are filtered out in `group0_voter_calculator::distribute_voters`), - // so we can use the entire subrange from the beginning and avoid another boundary search. - // We check for this condition in the non-release builds. - if constexpr (!tools::build_info::is_release_build()) { - SEASTAR_ASSERT(!dc1._nodes.contains(std::make_tuple(is_alive_t::no, is_voter_t::no))); - SEASTAR_ASSERT(!dc2._nodes.contains(std::make_tuple(is_alive_t::no, is_voter_t::no))); - } - - const auto dc1_dead_nodes = std::distance(dc1._nodes.begin(), dc1_alive_nodes_boundary); - const auto dc2_dead_nodes = std::distance(dc2._nodes.begin(), dc2_alive_nodes_boundary); - return dc1_dead_nodes < dc2_dead_nodes; + // Second criteria: The number of racks (higher has more priority) + return dc1._racks.size() < dc2._racks.size(); } }; diff --git a/test/boost/group0_voter_calculator_test.cc b/test/boost/group0_voter_calculator_test.cc index 0491f6bb2e..f912f55fe9 100644 --- a/test/boost/group0_voter_calculator_test.cc +++ b/test/boost/group0_voter_calculator_test.cc @@ -467,9 +467,9 @@ BOOST_AUTO_TEST_CASE(dc_cannot_have_half_or_more_of_voters) { raft::server_id::create_random_id(), raft::server_id::create_random_id()}; const service::group0_voter_calculator::nodes_list_t nodes = { - {ids[0], {.datacenter = "dc-1", .rack = "rack", .is_voter = false, .is_alive = true}}, - {ids[1], {.datacenter = "dc-1", .rack = "rack", .is_voter = false, .is_alive = true}}, - {ids[2], {.datacenter = "dc-1", .rack = "rack", .is_voter = false, .is_alive = true}}, + {ids[0], {.datacenter = "dc-1", .rack = "rack-1", .is_voter = false, .is_alive = true}}, + {ids[1], {.datacenter = "dc-1", .rack = "rack-2", .is_voter = false, .is_alive = true}}, + {ids[2], {.datacenter = "dc-1", .rack = "rack-2", .is_voter = false, .is_alive = true}}, {ids[3], {.datacenter = "dc-2", .rack = "rack", .is_voter = false, .is_alive = true}}, {ids[4], {.datacenter = "dc-3", .rack = 
"rack", .is_voter = false, .is_alive = true}}, }; @@ -747,4 +747,126 @@ BOOST_AUTO_TEST_CASE(dead_voters_moved_to_available_alive_nodes) { } +BOOST_AUTO_TEST_CASE(voters_are_distributed_across_racks) { + + // Arrange: Set the voters limit and create the voter calculator. + + constexpr size_t max_voters = 3; + + const service::group0_voter_calculator voter_calc{max_voters}; + + // Act: Add the nodes. + + const std::array ids = {raft::server_id::create_random_id(), raft::server_id::create_random_id(), raft::server_id::create_random_id(), + raft::server_id::create_random_id(), raft::server_id::create_random_id(), raft::server_id::create_random_id(), raft::server_id::create_random_id()}; + + const service::group0_voter_calculator::nodes_list_t nodes = { + {ids[0], {.datacenter = "dc", .rack = "rack-1", .is_voter = false, .is_alive = true}}, + {ids[1], {.datacenter = "dc", .rack = "rack-1", .is_voter = false, .is_alive = true}}, + {ids[2], {.datacenter = "dc", .rack = "rack-1", .is_voter = false, .is_alive = true}}, + {ids[3], {.datacenter = "dc", .rack = "rack-2", .is_voter = false, .is_alive = true}}, + {ids[4], {.datacenter = "dc", .rack = "rack-2", .is_voter = false, .is_alive = true}}, + {ids[5], {.datacenter = "dc", .rack = "rack-3", .is_voter = false, .is_alive = true}}, + {ids[6], {.datacenter = "dc", .rack = "rack-3", .is_voter = false, .is_alive = true}}, + }; + + const auto& voters = voter_calc.distribute_voters(nodes); + + // Assert: There is no duplicate rack across the voters (having 3 voters and 3 racks). + + BOOST_CHECK_EQUAL(voters.size(), max_voters); + + std::unordered_set voters_racks; + for (const auto id : voters) { + const auto& node = nodes.at(id); + BOOST_CHECK_MESSAGE(voters_racks.emplace(node.rack).second, fmt::format("Duplicate voter in the same rack: {}", node.rack)); + } +} + + +BOOST_AUTO_TEST_CASE(more_racks_preferred_over_more_nodes_in_two_dc) { + + // Arrange: Set the voters limit and create the voter calculator. 
+ + constexpr size_t max_voters = 3; + + const service::group0_voter_calculator voter_calc{max_voters}; + + // Act: Add the nodes. + + const std::array ids = {raft::server_id::create_random_id(), raft::server_id::create_random_id(), raft::server_id::create_random_id(), + raft::server_id::create_random_id(), raft::server_id::create_random_id(), raft::server_id::create_random_id(), raft::server_id::create_random_id()}; + + const service::group0_voter_calculator::nodes_list_t nodes = { + {ids[0], {.datacenter = "dc-2", .rack = "rack-3", .is_voter = false, .is_alive = true}}, + {ids[1], {.datacenter = "dc-1", .rack = "rack-1", .is_voter = false, .is_alive = true}}, + {ids[2], {.datacenter = "dc-1", .rack = "rack-2", .is_voter = false, .is_alive = true}}, + {ids[3], {.datacenter = "dc-2", .rack = "rack-3", .is_voter = false, .is_alive = true}}, + {ids[4], {.datacenter = "dc-2", .rack = "rack-3", .is_voter = false, .is_alive = true}}, + {ids[5], {.datacenter = "dc-1", .rack = "rack-2", .is_voter = false, .is_alive = true}}, + {ids[6], {.datacenter = "dc-2", .rack = "rack-3", .is_voter = false, .is_alive = true}}, + }; + + const auto& voters = voter_calc.distribute_voters(nodes); + + // Assert: The racks are preferred over the nodes (DC-1 has more racks so it should have more voters, despite DC-2 having more nodes). + + BOOST_CHECK_EQUAL(voters.size(), max_voters); + + std::unordered_map voters_dcs; + for (const auto id : voters) { + const auto& node = nodes.at(id); + ++voters_dcs[node.datacenter]; + } + + BOOST_CHECK_GT(voters_dcs["dc-1"], voters_dcs["dc-2"]); + + // Check that the DC-1 has voters in different racks. 
+ std::unordered_set voters_racks; + for (const auto id : voters | std::views::filter([&nodes](const auto id) { + return nodes.at(id).datacenter == "dc-1"; + })) { + const auto& node = nodes.at(id); + BOOST_CHECK_MESSAGE(voters_racks.emplace(node.rack).second, fmt::format("Duplicate voter in the same rack: {}", node.rack)); + } +} + + +BOOST_AUTO_TEST_CASE(dcs_preferred_over_racks) { + + // Arrange: Set the voters limit and create the voter calculator. + + constexpr size_t max_voters = 3; + + const service::group0_voter_calculator voter_calc{max_voters}; + + // Act: Add the nodes. + + const std::array ids = {raft::server_id::create_random_id(), raft::server_id::create_random_id(), raft::server_id::create_random_id(), + raft::server_id::create_random_id(), raft::server_id::create_random_id(), raft::server_id::create_random_id(), raft::server_id::create_random_id()}; + + const service::group0_voter_calculator::nodes_list_t nodes = { + {ids[0], {.datacenter = "dc-1", .rack = "rack-1", .is_voter = false, .is_alive = true}}, + {ids[1], {.datacenter = "dc-2", .rack = "rack-4", .is_voter = false, .is_alive = true}}, + {ids[2], {.datacenter = "dc-1", .rack = "rack-2", .is_voter = false, .is_alive = true}}, + {ids[3], {.datacenter = "dc-4", .rack = "rack-6", .is_voter = false, .is_alive = true}}, + {ids[4], {.datacenter = "dc-3", .rack = "rack-5", .is_voter = false, .is_alive = true}}, + {ids[5], {.datacenter = "dc-1", .rack = "rack-3", .is_voter = false, .is_alive = true}}, + {ids[6], {.datacenter = "dc-5", .rack = "rack-7", .is_voter = false, .is_alive = true}}, + }; + + const auto& voters = voter_calc.distribute_voters(nodes); + + // Assert: The DCs are preferred over racks when distributing voters (each DC should have a voter). 
+ + BOOST_CHECK_EQUAL(voters.size(), max_voters); + + std::unordered_set voters_dcs; + for (const auto id : voters) { + const auto& node = nodes.at(id); + BOOST_CHECK_MESSAGE(voters_dcs.emplace(node.datacenter).second, fmt::format("Duplicate voter in the same DC: {}", node.datacenter)); + } +} + + BOOST_AUTO_TEST_SUITE_END()