From e93bf3f05a84b0fe41d016c4d59b22eea4296063 Mon Sep 17 00:00:00 2001 From: Emil Maskovsky Date: Mon, 9 Jun 2025 18:16:50 +0200 Subject: [PATCH 1/4] raft: clean up voter handler by removing redundant comments Remove comments from the group0 voter handler that simply restate the code or do not provide meaningful clarification. This improves code readability and maintainability by reducing noise and focusing on essential documentation. --- service/raft/group0_voter_handler.cc | 30 +++++----------------------- 1 file changed, 5 insertions(+), 25 deletions(-) diff --git a/service/raft/group0_voter_handler.cc b/service/raft/group0_voter_handler.cc index 33155f2956..f72aa8c01d 100644 --- a/service/raft/group0_voter_handler.cc +++ b/service/raft/group0_voter_handler.cc @@ -187,30 +187,21 @@ public: // The priority comparator for the rack_info friend bool operator<(const rack_info& rack1, const rack_info& rack2) { - // First criteria: The number of already newly assigned voters (lower has more priority) if (rack1._assigned_voters_count != rack2._assigned_voters_count) { return rack1._assigned_voters_count > rack2._assigned_voters_count; } - - // Second criteria: The ownership of existing alive leader node - // - note that we don't take an existing dead leader into account here, as it is about to lose its - // leadership and is about to be replaced by a new leader + // We don't take an existing dead leader into account here (only alive leader), as a dead leader is about to lose + // its leadership and is about to be replaced by a new leader if (rack1._owns_alive_leader != rack2._owns_alive_leader) { return rack2._owns_alive_leader; } - - // Third criteria: The number of existing alive voters remaining (higher has more priority) if (rack1._existing_alive_voters_remaining != rack2._existing_alive_voters_remaining) { return rack1._existing_alive_voters_remaining < rack2._existing_alive_voters_remaining; } - - // Fourth criteria: The number of alive nodes (voters and non-voters) remaining (higher has more priority) if (rack1._alive_nodes_remaining != rack2._alive_nodes_remaining) { return rack1._alive_nodes_remaining < rack2._alive_nodes_remaining; } - - // Fifth criteria: The number of existing dead voters remaining (higher has more priority) - // We want to keep the dead voters in case we can't find enough alive voters. + // We retain the dead voters in case we can't find enough alive voters. return rack1._existing_dead_voters_remaining < rack2._existing_dead_voters_remaining; } }; @@ -271,8 +262,6 @@ public: // to the datacenter is incremented. [[nodiscard]] std::optional select_next_voter() { while (!_racks.empty()) { - - // Select the datacenter with the highest priority (according to the comparator) auto rack = _racks.top(); _racks.pop(); @@ -322,24 +311,17 @@ public: // The priority comparator for the datacenter_info friend bool operator<(const datacenter_info& dc1, const datacenter_info& dc2) { - // First criteria: The number of already newly assigned voters (lower has more priority) if (dc1._assigned_voters_count != dc2._assigned_voters_count) { return dc1._assigned_voters_count > dc2._assigned_voters_count; } - - // Second criteria: The ownership of existing alive leader node - // - note that we don't take an existing dead leader into account here, as it is about to lose its - // leadership and is about to be replaced by a new leader + // We don't take an existing dead leader into account here (only alive leader), as a dead leader is about to lose + // its leadership and is about to be replaced by a new leader if (dc1._owns_alive_leader != dc2._owns_alive_leader) { return dc2._owns_alive_leader; } - - // Third criteria: The number of existing alive voters remaining (higher has more priority) if (dc1._existing_alive_voters_remaining != dc2._existing_alive_voters_remaining) { return dc1._existing_alive_voters_remaining < dc2._existing_alive_voters_remaining; } - - // Fourth criteria: The number of racks (higher has more priority) return dc1._racks.size() < dc2._racks.size(); } }; @@ -430,8 +412,6 @@ class calculator_impl { // from that datacenter. [[nodiscard]] std::optional select_next_voter() { while (!_datacenters.empty()) { - - // Select the datacenter with the highest priority (based on the comparator) auto dc = _datacenters.top(); _datacenters.pop(); From 05392e6ef3e046c002b4befebf7bcb980ee21d37 Mon Sep 17 00:00:00 2001 From: Emil Maskovsky Date: Mon, 9 Jun 2025 18:54:27 +0200 Subject: [PATCH 2/4] raft: use explicit priority comparators for datacenters and racks Refactor the voter handler to use explicit priority comparator classes for datacenter and rack selection. This makes the prioritization logic more transparent and robust, and reduces the risk of subtle bugs that could arise from relying on implicit comparison operators. --- service/raft/group0_voter_handler.cc | 68 +++++++++++++++------------- 1 file changed, 36 insertions(+), 32 deletions(-) diff --git a/service/raft/group0_voter_handler.cc b/service/raft/group0_voter_handler.cc index f72aa8c01d..b9af6aee82 100644 --- a/service/raft/group0_voter_handler.cc +++ b/service/raft/group0_voter_handler.cc @@ -186,24 +186,26 @@ public: } // The priority comparator for the rack_info - friend bool operator<(const rack_info& rack1, const rack_info& rack2) { - if (rack1._assigned_voters_count != rack2._assigned_voters_count) { - return rack1._assigned_voters_count > rack2._assigned_voters_count; + struct priority_compare { + bool operator()(const rack_info& lhs, const rack_info& rhs) const { + if (lhs._assigned_voters_count != rhs._assigned_voters_count) { + return lhs._assigned_voters_count > rhs._assigned_voters_count; + } + // We don't take an existing dead leader into account here (only alive leader), as a dead leader is about to lose + // its leadership and is about to be replaced by a new leader + if (lhs._owns_alive_leader != rhs._owns_alive_leader) { + return rhs._owns_alive_leader; + } + if (lhs._existing_alive_voters_remaining != rhs._existing_alive_voters_remaining) { + return lhs._existing_alive_voters_remaining < rhs._existing_alive_voters_remaining; + } + if (lhs._alive_nodes_remaining != rhs._alive_nodes_remaining) { + return lhs._alive_nodes_remaining < rhs._alive_nodes_remaining; + } + // We retain the dead voters in case we can't find enough alive voters. + return lhs._existing_dead_voters_remaining < rhs._existing_dead_voters_remaining; } - // We don't take an existing dead leader into account here (only alive leader), as a dead leader is about to lose - // its leadership and is about to be replaced by a new leader - if (rack1._owns_alive_leader != rack2._owns_alive_leader) { - return rack2._owns_alive_leader; - } - if (rack1._existing_alive_voters_remaining != rack2._existing_alive_voters_remaining) { - return rack1._existing_alive_voters_remaining < rack2._existing_alive_voters_remaining; - } - if (rack1._alive_nodes_remaining != rack2._alive_nodes_remaining) { - return rack1._alive_nodes_remaining < rack2._alive_nodes_remaining; - } - // We retain the dead voters in case we can't find enough alive voters. - return rack1._existing_dead_voters_remaining < rack2._existing_dead_voters_remaining; - } + }; }; @@ -225,7 +227,7 @@ class datacenter_info { bool _owns_alive_leader = false; - using racks_store_t = std::priority_queue; + using racks_store_t = std::priority_queue, rack_info::priority_compare>; racks_store_t _racks; static racks_store_t create_racks_list(std::ranges::input_range auto&& nodes, size_t& existing_alive_voters_remaining, bool& owns_alive_leader) { @@ -310,26 +312,28 @@ public: } // The priority comparator for the datacenter_info - friend bool operator<(const datacenter_info& dc1, const datacenter_info& dc2) { - if (dc1._assigned_voters_count != dc2._assigned_voters_count) { - return dc1._assigned_voters_count > dc2._assigned_voters_count; + struct priority_compare { + bool operator()(const datacenter_info& lhs, const datacenter_info& rhs) const { + if (lhs._assigned_voters_count != rhs._assigned_voters_count) { + return lhs._assigned_voters_count > rhs._assigned_voters_count; + } + // We don't take an existing dead leader into account here (only alive leader), as a dead leader is about to lose + // its leadership and is about to be replaced by a new leader + if (lhs._owns_alive_leader != rhs._owns_alive_leader) { + return rhs._owns_alive_leader; + } + if (lhs._existing_alive_voters_remaining != rhs._existing_alive_voters_remaining) { + return lhs._existing_alive_voters_remaining < rhs._existing_alive_voters_remaining; + } + return lhs._racks.size() < rhs._racks.size(); } - // We don't take an existing dead leader into account here (only alive leader), as a dead leader is about to lose - // its leadership and is about to be replaced by a new leader - if (dc1._owns_alive_leader != dc2._owns_alive_leader) { - return dc2._owns_alive_leader; - } - if (dc1._existing_alive_voters_remaining != dc2._existing_alive_voters_remaining) { - return dc1._existing_alive_voters_remaining < dc2._existing_alive_voters_remaining; - } - return dc1._racks.size() < dc2._racks.size(); - } + }; }; class calculator_impl { - using datacenters_store_t = std::priority_queue; + using datacenters_store_t = std::priority_queue, datacenter_info::priority_compare>; size_t _largest_dc_size = 0; datacenters_store_t _datacenters; From 839e0bf40d4ec7785b78f1c0a1e668aaa9568a4a Mon Sep 17 00:00:00 2001 From: Emil Maskovsky Date: Tue, 10 Jun 2025 09:55:37 +0200 Subject: [PATCH 3/4] raft: reformat voter handler for consistent indentation Reformatted the voter handler implementation to comply with clang-format automatic formatting rules. No functional changes. --- service/raft/group0_voter_handler.cc | 37 +++++++++++++++------------- 1 file changed, 20 insertions(+), 17 deletions(-) diff --git a/service/raft/group0_voter_handler.cc b/service/raft/group0_voter_handler.cc index b9af6aee82..15c2089c6f 100644 --- a/service/raft/group0_voter_handler.cc +++ b/service/raft/group0_voter_handler.cc @@ -108,23 +108,26 @@ class rack_info { existing_dead_voters_remaining = 0; owns_alive_leader = false; - return nodes | std::views::transform([&alive_nodes_remaining, &existing_alive_voters_remaining, &existing_dead_voters_remaining, &owns_alive_leader](const auto& node_entry) { - const auto& [id, node] = node_entry; - if (node.get().is_alive) { - ++alive_nodes_remaining; - if (node.get().is_leader) { - owns_alive_leader = true; - } - } - if (node.get().is_voter) { - if (node.get().is_alive) { - ++existing_alive_voters_remaining; - } else { - ++existing_dead_voters_remaining; - } - } - return std::make_tuple(node_priority::get_value(node), id, node); - }) | std::ranges::to(); + return nodes | + std::views::transform([&alive_nodes_remaining, &existing_alive_voters_remaining, &existing_dead_voters_remaining, + &owns_alive_leader](const auto& node_entry) { + const auto& [id, node] = node_entry; + if (node.get().is_alive) { + ++alive_nodes_remaining; + if (node.get().is_leader) { + owns_alive_leader = true; + } + } + if (node.get().is_voter) { + if (node.get().is_alive) { + ++existing_alive_voters_remaining; + } else { + ++existing_dead_voters_remaining; + } + } + return std::make_tuple(node_priority::get_value(node), id); + }) | + std::ranges::to(); } public: From b7e0a01fcc35531bc8c570ff442d1ff6452a865f Mon Sep 17 00:00:00 2001 From: Emil Maskovsky Date: Mon, 5 May 2025 18:48:58 +0200 Subject: [PATCH 4/4] raft: simplify voter handler code to not pass node references around Refactor the voter handler logic to only pass around node IDs (`raft::server_id`), instead of pairs of IDs and node descriptor references. Node descriptors can always be efficiently retrieved from the original nodes map, which remains valid throughout the calculation. This change reduces unnecessary reference passing and simplifies the code. All node detail lookups are now performed via the central nodes map as needed. Fixes: scylladb/scylladb#24035 --- service/raft/group0_voter_handler.cc | 108 +++++++++++++-------------- 1 file changed, 53 insertions(+), 55 deletions(-) diff --git a/service/raft/group0_voter_handler.cc b/service/raft/group0_voter_handler.cc index 15c2089c6f..5e6e155548 100644 --- a/service/raft/group0_voter_handler.cc +++ b/service/raft/group0_voter_handler.cc @@ -62,7 +62,7 @@ struct node_priority { }; -using nodes_ref_list_t = std::vector>>; +using node_ids_t = std::vector; // Represents information about a rack and provides functionality to manage voter selection. @@ -82,10 +82,7 @@ class rack_info { bool _owns_alive_leader = false; - using node_info_t = std::tuple>; + using node_info_t = std::tuple; struct node_info_priority_compare { bool operator()(const node_info_t& lhs, const node_info_t& rhs) const { @@ -99,27 +96,25 @@ class rack_info { size_t _assigned_voters_count = 0; - const group0_voter_calculator::node_descriptor* _selected_voter = nullptr; - - static nodes_store_t create_nodes_list(std::ranges::input_range auto&& nodes, size_t& alive_nodes_remaining, size_t& existing_alive_voters_remaining, - size_t& existing_dead_voters_remaining, bool& owns_alive_leader) { + static nodes_store_t create_nodes_list(const group0_voter_calculator::nodes_list_t& all_nodes, const node_ids_t& rack_nodes, size_t& alive_nodes_remaining, + size_t& existing_alive_voters_remaining, size_t& existing_dead_voters_remaining, bool& owns_alive_leader) { alive_nodes_remaining = 0; existing_alive_voters_remaining = 0; existing_dead_voters_remaining = 0; owns_alive_leader = false; - return nodes | - std::views::transform([&alive_nodes_remaining, &existing_alive_voters_remaining, &existing_dead_voters_remaining, - &owns_alive_leader](const auto& node_entry) { - const auto& [id, node] = node_entry; - if (node.get().is_alive) { + return rack_nodes | + std::views::transform([&all_nodes, &alive_nodes_remaining, &existing_alive_voters_remaining, &existing_dead_voters_remaining, + &owns_alive_leader](const auto& id) { + const auto& node = all_nodes.at(id); + if (node.is_alive) { ++alive_nodes_remaining; - if (node.get().is_leader) { + if (node.is_leader) { owns_alive_leader = true; } } - if (node.get().is_voter) { - if (node.get().is_alive) { + if (node.is_voter) { + if (node.is_alive) { ++existing_alive_voters_remaining; } else { ++existing_dead_voters_remaining; @@ -131,8 +126,9 @@ class rack_info { } public: - explicit rack_info(std::ranges::input_range auto&& nodes) - : _nodes(create_nodes_list(nodes, _alive_nodes_remaining, _existing_alive_voters_remaining, _existing_dead_voters_remaining, _owns_alive_leader)) { + explicit rack_info(const group0_voter_calculator::nodes_list_t& all_nodes, const node_ids_t& rack_nodes) + : _nodes(create_nodes_list( + all_nodes, rack_nodes, _alive_nodes_remaining, _existing_alive_voters_remaining, _existing_dead_voters_remaining, _owns_alive_leader)) { } // Select the "best" next voter from the rack @@ -141,26 +137,26 @@ public: // // If a node is selected, it is removed from the list of candidates, and the number of voters assigned // to the rack is incremented. - [[nodiscard]] std::optional select_next_voter() { - _selected_voter = nullptr; - + [[nodiscard]] std::optional select_next_voter(const group0_voter_calculator::nodes_list_t& nodes_info) { if (_nodes.empty()) { return std::nullopt; } - const auto [priority, voter_id, node] = _nodes.top(); + const auto [priority, voter_id] = _nodes.top(); _nodes.pop(); - if (node.get().is_alive) { + const auto& node = nodes_info.at(voter_id); + + if (node.is_alive) { SCYLLA_ASSERT(_alive_nodes_remaining > 0); --_alive_nodes_remaining; - if (node.get().is_leader) { + if (node.is_leader) { SCYLLA_ASSERT(_owns_alive_leader); _owns_alive_leader = false; } } - if (node.get().is_voter) { - if (node.get().is_alive) { + if (node.is_voter) { + if (node.is_alive) { SCYLLA_ASSERT(_existing_alive_voters_remaining > 0); --_existing_alive_voters_remaining; } else { @@ -170,17 +166,10 @@ public: } ++_assigned_voters_count; - _selected_voter = &node.get(); return voter_id; } - // Get the node descriptor for the currently selected voter - [[nodiscard]] const group0_voter_calculator::node_descriptor& get_selected_voter_info() const { - SCYLLA_ASSERT(_selected_voter != nullptr); - return *_selected_voter; - } - // Check if there are more candidates available for voter selection // // Returns `true` if there are more candidates available, `false` otherwise. @@ -233,30 +222,34 @@ class datacenter_info { using racks_store_t = std::priority_queue, rack_info::priority_compare>; racks_store_t _racks; - static racks_store_t create_racks_list(std::ranges::input_range auto&& nodes, size_t& existing_alive_voters_remaining, bool& owns_alive_leader) { + static racks_store_t create_racks_list(const group0_voter_calculator::nodes_list_t& all_nodes, const node_ids_t& dc_nodes, + size_t& existing_alive_voters_remaining, bool& owns_alive_leader) { existing_alive_voters_remaining = 0; owns_alive_leader = false; - std::unordered_map nodes_by_rack; - for (const auto& [id, node] : nodes) { - nodes_by_rack[node.get().rack].emplace_back(id, node); - if (node.get().is_alive) { - if (node.get().is_voter) { + std::unordered_map nodes_by_rack; + for (const auto& id : dc_nodes) { + const auto& node = all_nodes.at(id); + nodes_by_rack[node.rack].emplace_back(id); + if (node.is_alive) { + if (node.is_voter) { ++existing_alive_voters_remaining; } - if (node.get().is_leader) { + if (node.is_leader) { owns_alive_leader = true; } } } - return nodes_by_rack | std::views::values | std::ranges::to(); + return nodes_by_rack | std::views::values | std::views::transform([&all_nodes](const auto& rack_nodes) { + return rack_info(all_nodes, rack_nodes); + }) | std::ranges::to(); } public: - explicit datacenter_info(std::ranges::sized_range auto&& nodes) - : _nodes_remaining(nodes.size()) - , _racks(create_racks_list(nodes, _existing_alive_voters_remaining, _owns_alive_leader)) { + explicit datacenter_info(const group0_voter_calculator::nodes_list_t& all_nodes, const node_ids_t& dc_nodes) + : _nodes_remaining(dc_nodes.size()) + , _racks(create_racks_list(all_nodes, dc_nodes, _existing_alive_voters_remaining, _owns_alive_leader)) { } // Select the "best" next voter from the datacenter @@ -265,12 +258,12 @@ public: // // If a node is selected, it is removed from the list of candidates, and the number of voters assigned // to the datacenter is incremented. - [[nodiscard]] std::optional select_next_voter() { + [[nodiscard]] std::optional select_next_voter(const group0_voter_calculator::nodes_list_t& nodes_info) { while (!_racks.empty()) { auto rack = _racks.top(); _racks.pop(); - const auto voter_id = rack.select_next_voter(); + const auto voter_id = rack.select_next_voter(nodes_info); if (!voter_id) { continue; @@ -280,7 +273,7 @@ public: _racks.push(rack); } - const auto& node = rack.get_selected_voter_info(); + const auto& node = nodes_info.at(voter_id.value()); if (node.is_alive) { if (node.is_voter) { @@ -336,6 +329,8 @@ public: class calculator_impl { + const group0_voter_calculator::nodes_list_t& _nodes_info; + using datacenters_store_t = std::priority_queue, datacenter_info::priority_compare>; size_t _largest_dc_size = 0; @@ -397,9 +392,9 @@ class calculator_impl { static datacenters_store_t create_datacenters_list(const group0_voter_calculator::nodes_list_t& nodes, size_t& largest_dc_size) { largest_dc_size = 0; - std::unordered_map nodes_by_dc; + std::unordered_map nodes_by_dc; for (const auto& [id, node] : nodes) { - nodes_by_dc[node.datacenter].emplace_back(id, node); + nodes_by_dc[node.datacenter].emplace_back(id); } if (!nodes_by_dc.empty()) { @@ -408,7 +403,9 @@ class calculator_impl { })->second.size(); } - return nodes_by_dc | std::views::values | std::ranges::to(); + return nodes_by_dc | std::views::values | std::views::transform([&nodes](const auto& dc_nodes) { + return datacenter_info(nodes, dc_nodes); + }) | std::ranges::to(); } // Select the next voter from the datacenters @@ -417,12 +414,12 @@ class calculator_impl { // // The method selects the datacenter with the highest priority (based on the comparator) and selects the next voter // from that datacenter. - [[nodiscard]] std::optional select_next_voter() { + [[nodiscard]] std::optional select_next_voter(const group0_voter_calculator::nodes_list_t& nodes_info) { while (!_datacenters.empty()) { auto dc = _datacenters.top(); _datacenters.pop(); - const auto voter_id = dc.select_next_voter(); + const auto voter_id = dc.select_next_voter(nodes_info); if (!voter_id) { // no more available voter candidates in this DC @@ -445,7 +442,8 @@ public: constexpr static size_t VOTERS_MAX_DEFAULT = 5; calculator_impl(uint64_t voters_max, const group0_voter_calculator::nodes_list_t& nodes) - : _datacenters(create_datacenters_list(nodes, _largest_dc_size)) + : _nodes_info(nodes) + , _datacenters(create_datacenters_list(nodes, _largest_dc_size)) , _voters_max(calc_voters_max(voters_max, nodes, _datacenters, _largest_dc_size)) , _voters_max_per_dc(calc_voters_max_per_dc(_voters_max, nodes, _datacenters, _largest_dc_size)) { @@ -457,7 +455,7 @@ public: voters.reserve(_voters_max); while (voters.size() < _voters_max) { - const auto& voter = select_next_voter(); + const auto& voter = select_next_voter(_nodes_info); if (!voter) { break; }