From 2ae59e8a871acdf08baf440496b88da245225935 Mon Sep 17 00:00:00 2001 From: Emil Maskovsky Date: Mon, 28 Apr 2025 19:19:11 +0200 Subject: [PATCH] raft: retain existing voters across data centers and racks Fix an issue in the voter calculator where existing voters were not retained across data centers and racks in certain scenarios. This occurred when voters were distributed across more data centers and racks than the maximum allowed number of voters. Previously, the prioritization logic for data centers and racks did not consider the number of existing assigned voters. It only prioritized nodes within a single data center or rack, which could result in unnecessary reassignment of voters. Improved the prioritization logic to account for the number of existing voters in each data center and rack. This change ensures a more stable voter distribution and reduces unnecessary voter reassignments. Fixes: scylladb/scylladb#23950 --- service/raft/group0_voter_handler.cc | 79 ++++++++++++++++++---- test/boost/group0_voter_calculator_test.cc | 70 +++++++++++++++++++ 2 files changed, 136 insertions(+), 13 deletions(-) diff --git a/service/raft/group0_voter_handler.cc b/service/raft/group0_voter_handler.cc index eba1ff4e31..aadd82e2ac 100644 --- a/service/raft/group0_voter_handler.cc +++ b/service/raft/group0_voter_handler.cc @@ -69,6 +69,7 @@ using nodes_ref_list_t = std::vector(); @@ -105,7 +115,7 @@ class rack_info { public: explicit rack_info(std::ranges::input_range auto&& nodes) - : _nodes(create_nodes_list(nodes, _alive_nodes_remaining, _existing_dead_voters_remaining)) { + : _nodes(create_nodes_list(nodes, _alive_nodes_remaining, _existing_alive_voters_remaining, _existing_dead_voters_remaining)) { } // Select the "best" next voter from the rack @@ -115,6 +125,8 @@ public: // If a node is selected, it is removed from the list of candidates, and the number of voters assigned // to the rack is incremented. [[nodiscard]] std::optional select_next_voter() { + _selected_voter = nullptr; + if (_nodes.empty()) { return std::nullopt; } @@ -125,16 +137,29 @@ public: if (node.get().is_alive) { SCYLLA_ASSERT(_alive_nodes_remaining > 0); --_alive_nodes_remaining; - } else if (node.get().is_voter) { - SCYLLA_ASSERT(_existing_dead_voters_remaining > 0); - --_existing_dead_voters_remaining; + } + if (node.get().is_voter) { + if (node.get().is_alive) { + SCYLLA_ASSERT(_existing_alive_voters_remaining > 0); + --_existing_alive_voters_remaining; + } else { + SCYLLA_ASSERT(_existing_dead_voters_remaining > 0); + --_existing_dead_voters_remaining; + } } ++_assigned_voters_count; + _selected_voter = &node.get(); return voter_id; } + // Get the node descriptor for the currently selected voter + [[nodiscard]] const group0_voter_calculator::node_descriptor& get_selected_voter_info() const { + SCYLLA_ASSERT(_selected_voter != nullptr); + return *_selected_voter; + } + // Check if there are more candidates available for voter selection // // Returns `true` if there are more candidates available, `false` otherwise. @@ -149,12 +174,17 @@ public: return rack1._assigned_voters_count > rack2._assigned_voters_count; } - // Second criteria: The number of alive nodes (voters and non-voters) remaining (higher has more priority) + // Second criteria: The number of existing alive voters remaining (higher has more priority) + if (rack1._existing_alive_voters_remaining != rack2._existing_alive_voters_remaining) { + return rack1._existing_alive_voters_remaining < rack2._existing_alive_voters_remaining; + } + + // Third criteria: The number of alive nodes (voters and non-voters) remaining (higher has more priority) if (rack1._alive_nodes_remaining != rack2._alive_nodes_remaining) { return rack1._alive_nodes_remaining < rack2._alive_nodes_remaining; } - // Third criteria: The number of existing dead voters remaining (higher has more priority) + // Fourth criteria: The number of existing dead voters remaining (higher has more priority) // We want to keep the existing dead voters in case we can't find enough alive voters. return rack1._existing_dead_voters_remaining < rack2._existing_dead_voters_remaining; } @@ -175,13 +205,22 @@ class datacenter_info { size_t _nodes_remaining = 0; size_t _assigned_voters_count = 0; + size_t _existing_alive_voters_remaining = 0; + using racks_store_t = std::priority_queue; racks_store_t _racks; - static racks_store_t create_racks_list(std::ranges::input_range auto&& nodes) { + static racks_store_t create_racks_list(std::ranges::input_range auto&& nodes, size_t& existing_alive_voters_remaining) { + existing_alive_voters_remaining = 0; + std::unordered_map nodes_by_rack; for (const auto& [id, node] : nodes) { nodes_by_rack[node.get().rack].emplace_back(id, node); + if (node.get().is_alive) { + if (node.get().is_voter) { + ++existing_alive_voters_remaining; + } + } } return nodes_by_rack | std::views::values | std::ranges::to(); @@ -190,7 +229,7 @@ class datacenter_info { public: explicit datacenter_info(std::ranges::sized_range auto&& nodes) : _nodes_remaining(nodes.size()) - , _racks(create_racks_list(nodes)) { + , _racks(create_racks_list(nodes, _existing_alive_voters_remaining)) { } // Select the "best" next voter from the datacenter @@ -216,6 +255,15 @@ public: _racks.push(rack); } + const auto& node = rack.get_selected_voter_info(); + + if (node.is_alive) { + if (node.is_voter) { + SCYLLA_ASSERT(_existing_alive_voters_remaining > 0); + --_existing_alive_voters_remaining; + } + } + SCYLLA_ASSERT(_nodes_remaining > 0); --_nodes_remaining; @@ -244,7 +292,12 @@ public: return dc1._assigned_voters_count > dc2._assigned_voters_count; } - // Second criteria: The number of racks (higher has more priority) + // Second criteria: The number of existing alive voters remaining (higher has more priority) + if (dc1._existing_alive_voters_remaining != dc2._existing_alive_voters_remaining) { + return dc1._existing_alive_voters_remaining < dc2._existing_alive_voters_remaining; + } + + // Third criteria: The number of racks (higher has more priority) return dc1._racks.size() < dc2._racks.size(); } }; diff --git a/test/boost/group0_voter_calculator_test.cc b/test/boost/group0_voter_calculator_test.cc index f912f55fe9..09eebf4cdd 100644 --- a/test/boost/group0_voter_calculator_test.cc +++ b/test/boost/group0_voter_calculator_test.cc @@ -869,4 +869,74 @@ BOOST_AUTO_TEST_CASE(dcs_preferred_over_racks) { } +BOOST_AUTO_TEST_CASE(existing_voters_are_retained_across_dcs) { + + // Arrange: Set the voters limit and create the voter calculator. + + constexpr size_t max_voters = 3; + + const service::group0_voter_calculator voter_calc{max_voters}; + + // Act: Add the nodes (3 of them are voters). + + const std::array ids = {raft::server_id::create_random_id(), raft::server_id::create_random_id(), raft::server_id::create_random_id(), + raft::server_id::create_random_id(), raft::server_id::create_random_id()}; + + const service::group0_voter_calculator::nodes_list_t nodes = { + {ids[0], {.datacenter = "dc-1", .rack = "rack", .is_voter = true, .is_alive = true}}, + {ids[1], {.datacenter = "dc-2", .rack = "rack", .is_voter = false, .is_alive = true}}, + {ids[2], {.datacenter = "dc-3", .rack = "rack", .is_voter = true, .is_alive = true}}, + {ids[3], {.datacenter = "dc-4", .rack = "rack", .is_voter = false, .is_alive = true}}, + {ids[4], {.datacenter = "dc-5", .rack = "rack", .is_voter = true, .is_alive = true}}, + }; + + const auto& voters = voter_calc.distribute_voters(nodes); + + // Assert: Existing voters are retained in the voter set. + + BOOST_CHECK_EQUAL(voters.size(), max_voters); + + for (const auto& [id, node] : nodes | std::views::filter([](const auto& node) { + return node.second.is_voter; + })) { + BOOST_CHECK(voters.contains(id)); + } +} + + +BOOST_AUTO_TEST_CASE(existing_voters_are_kept_across_racks) { + + // Arrange: Set the voters limit and create the voter calculator. + + constexpr size_t max_voters = 3; + + const service::group0_voter_calculator voter_calc{max_voters}; + + // Act: Add the nodes (3 of them are voters). + + const std::array ids = {raft::server_id::create_random_id(), raft::server_id::create_random_id(), raft::server_id::create_random_id(), + raft::server_id::create_random_id(), raft::server_id::create_random_id()}; + + const service::group0_voter_calculator::nodes_list_t nodes = { + {ids[0], {.datacenter = "dc", .rack = "rack-1", .is_voter = true, .is_alive = true}}, + {ids[1], {.datacenter = "dc", .rack = "rack-2", .is_voter = false, .is_alive = true}}, + {ids[2], {.datacenter = "dc", .rack = "rack-3", .is_voter = true, .is_alive = true}}, + {ids[3], {.datacenter = "dc", .rack = "rack-4", .is_voter = false, .is_alive = true}}, + {ids[4], {.datacenter = "dc", .rack = "rack-5", .is_voter = true, .is_alive = true}}, + }; + + const auto& voters = voter_calc.distribute_voters(nodes); + + // Assert: Existing voters are retained in the voter set. + + BOOST_CHECK_EQUAL(voters.size(), max_voters); + + for (const auto& [id, node] : nodes | std::views::filter([](const auto& node) { + return node.second.is_voter; + })) { + BOOST_CHECK(voters.contains(id)); + } +} + + BOOST_AUTO_TEST_SUITE_END()