From 9e1f78d162ba09363f3c209eec2debec416e1dfd Mon Sep 17 00:00:00 2001 From: Michael Litvak Date: Wed, 8 Oct 2025 13:17:24 +0200 Subject: [PATCH] locator: extend rf-rack validation functions Extend the locator function assert_rf_rack_valid_keyspace to accept arbitrary topology dc-rack maps and nodes instead of using the current token metadata. This allows us to add a new variant of the function that checks rf-rack validity given a topology change that we want to apply. we will use it to check that rf-rack validity will be maintained before applying the topology change. The possible topology changes for the check are node add and node remove / decommission. These operations can change the number of normal racks - if a new node is added to a new rack, or the last node is removed from a rack. --- locator/tablets.cc | 56 ++++++++++++++++++++++++++++++++++++++++----- locator/tablets.hh | 17 ++++++++++++++ replica/database.cc | 47 +++++++++++++++++++++++++++++++++++++ replica/database.hh | 12 ++++++++++ 4 files changed, 126 insertions(+), 6 deletions(-) diff --git a/locator/tablets.cc b/locator/tablets.cc index 9301c4d4e0..7764ab792b 100644 --- a/locator/tablets.cc +++ b/locator/tablets.cc @@ -1423,7 +1423,9 @@ const effective_replication_map_ptr& token_metadata_guard::get_erm() const { return g ? (**g).get_erm() : get(_guard); } -void assert_rf_rack_valid_keyspace(std::string_view ks, const token_metadata_ptr tmptr, const abstract_replication_strategy& ars) { +static void assert_rf_rack_valid_keyspace(std::string_view ks, const token_metadata_ptr tmptr, const abstract_replication_strategy& ars, + const std::unordered_map>>& dc_rack_map, + std::function is_normal_token_owner) { tablet_logger.debug("[assert_rf_rack_valid_keyspace]: Starting verifying that keyspace '{}' is RF-rack-valid", ks); // Any keyspace that does NOT use tablets is RF-rack-valid. @@ -1436,8 +1438,6 @@ void assert_rf_rack_valid_keyspace(std::string_view ks, const token_metadata_ptr SCYLLA_ASSERT(ars.get_type() == replication_strategy_type::network_topology); const auto& nts = *static_cast(std::addressof(ars)); - const auto& dc_rack_map = tmptr->get_topology().get_datacenter_racks(); - for (const auto& dc : nts.get_datacenters()) { if (!dc_rack_map.contains(dc)) { on_internal_error(tablet_logger, seastar::format( @@ -1453,9 +1453,7 @@ void assert_rf_rack_valid_keyspace(std::string_view ks, const token_metadata_ptr for (const auto& [_, rack_nodes] : rack_map) { // We must ignore zero-token nodes because they don't take part in replication. // Verify that this rack has at least one normal node. - const bool normal_rack = std::ranges::any_of(rack_nodes, [tmptr] (host_id host_id) { - return tmptr->is_normal_token_owner(host_id); - }); + const bool normal_rack = std::ranges::any_of(rack_nodes, is_normal_token_owner); if (normal_rack) { ++normal_rack_count; } @@ -1486,6 +1484,52 @@ void assert_rf_rack_valid_keyspace(std::string_view ks, const token_metadata_ptr tablet_logger.debug("[assert_rf_rack_valid_keyspace]: Keyspace '{}' has been verified to be RF-rack-valid", ks); } +void assert_rf_rack_valid_keyspace(std::string_view ks, const token_metadata_ptr tmptr, const abstract_replication_strategy& ars) { + assert_rf_rack_valid_keyspace(ks, tmptr, ars, + tmptr->get_topology().get_datacenter_racks(), + [&tmptr] (host_id host) { + return tmptr->is_normal_token_owner(host); + } + ); +} + +void assert_rf_rack_valid_keyspace(std::string_view ks, const token_metadata_ptr tmptr, const abstract_replication_strategy& ars, rf_rack_topology_operation op) { + auto dc_rack_map = tmptr->get_topology().get_datacenter_racks(); + + switch (op.tag) { + case rf_rack_topology_operation::type::add: + // include the new node only if it's added to an existing DC. + if (dc_rack_map.contains(op.dc)) { + tablet_logger.debug("[assert_rf_rack_valid_keyspace]: Including new node {} in DC '{}' and rack '{}'", op.node_id, op.dc, op.rack); + dc_rack_map[op.dc][op.rack].insert(op.node_id); + } + break; + case rf_rack_topology_operation::type::remove: + // remove the node from the map, if it exists. + tablet_logger.debug("[assert_rf_rack_valid_keyspace]: Excluding removed node {} from DC '{}' and rack '{}'", op.node_id, op.dc, op.rack); + if (dc_rack_map.contains(op.dc) && dc_rack_map[op.dc].contains(op.rack)) { + dc_rack_map[op.dc][op.rack].erase(op.node_id); + if (dc_rack_map[op.dc][op.rack].empty()) { + dc_rack_map[op.dc].erase(op.rack); + if (dc_rack_map[op.dc].empty()) { + dc_rack_map.erase(op.dc); + } + } + } + break; + } + + assert_rf_rack_valid_keyspace(ks, tmptr, ars, + dc_rack_map, + [&tmptr, &op] (host_id host) { + if (op.tag == rf_rack_topology_operation::type::add && host == op.node_id) { + return true; + } + return tmptr->is_normal_token_owner(host); + } + ); +} + rack_list get_allowed_racks(const locator::token_metadata& tm, const sstring& dc) { auto& topo = tm.get_topology(); auto normal_nodes = [&] (const sstring& rack) { diff --git a/locator/tablets.hh b/locator/tablets.hh index 1d0107181e..c2560b57c9 100644 --- a/locator/tablets.hh +++ b/locator/tablets.hh @@ -880,6 +880,23 @@ class abstract_replication_strategy; /// * The keyspace need not exist. We use its name purely for informational reasons (in error messages). void assert_rf_rack_valid_keyspace(std::string_view ks, const token_metadata_ptr, const abstract_replication_strategy&); +struct rf_rack_topology_operation { + enum class type { + add, + remove // node remove or decommission + }; + type tag; + host_id node_id; + sstring dc; + sstring rack; +}; + +// Verify that the provided keyspace corresponding to the provided replication strategy will be RF-rack-valid +// after the provided topology change operation is applied. +// The operation is either adding a node, or removing / decommissioning a node. +// The added/removed node should be a normal token owning node. Nodes that don't own tokens don't affect RF-rack-validity. +void assert_rf_rack_valid_keyspace(std::string_view ks, const token_metadata_ptr tmptr, const abstract_replication_strategy& ars, rf_rack_topology_operation op); + /// Returns the list of racks that can be used for placing replicas in a given DC. rack_list get_allowed_racks(const locator::token_metadata&, const sstring& dc); diff --git a/replica/database.cc b/replica/database.cc index 94f4823bbd..ba26ddd052 100644 --- a/replica/database.cc +++ b/replica/database.cc @@ -3599,6 +3599,53 @@ void database::check_rf_rack_validity(const locator::token_metadata_ptr tmptr) c } } +bool database::check_rf_rack_validity_with_topology_change(locator::token_metadata_ptr tmptr, locator::rf_rack_topology_operation change) const { + // if it's already invalid before the topology change, it's allowed to remain invalid + try { + check_rf_rack_validity(tmptr); + } catch (...) { + return true; + } + + const auto& keyspaces = get_keyspaces(); + std::vector invalid_keyspaces{}; + bool valid = true; + + for (const auto& [name, info] : keyspaces) { + try { + locator::assert_rf_rack_valid_keyspace(name, tmptr, info.get_replication_strategy(), change); + } catch (...) { + if (enforce_rf_rack_validity_for_keyspace(info)) { + valid = false; + } + + invalid_keyspaces.push_back(std::string_view(name)); + } + } + + if (invalid_keyspaces.size() != 0) { + const auto ks_list = invalid_keyspaces + | std::views::join_with(std::string_view(", ")) + | std::ranges::to(); + + auto op_str = [&] { + switch (change.tag) { + case locator::rf_rack_topology_operation::type::add: return "joining"; + case locator::rf_rack_topology_operation::type::remove: return "removed"; + } + }(); + + dblog.warn("The {} node with DC='{}' and rack='{}' makes some existing keyspaces RF-rack-invalid, i.e. the replication factor " + "does not match the number of racks in one of the datacenters. That may reduce " + "availability in case of a failure (cf. " + "https://docs.scylladb.com/manual/stable/reference/glossary.html#term-RF-rack-valid-keyspace). " + "Those keyspaces are: {}", + op_str, change.dc, change.rack, ks_list); + } + + return valid; +} + utils::chunked_vector compute_random_sorted_ints(uint64_t max_value, uint64_t n_values) { static thread_local std::minstd_rand rng{std::random_device{}()}; std::uniform_int_distribution dist(0, max_value); diff --git a/replica/database.hh b/replica/database.hh index de0af6c187..83b313fc56 100644 --- a/replica/database.hh +++ b/replica/database.hh @@ -2125,6 +2125,18 @@ public: // must contain a complete list of racks and data centers in the cluster. void check_rf_rack_validity(const locator::token_metadata_ptr) const; + // Verifies whether all keyspaces that require RF-rack-validity (as determined by enforce_rf_rack_validity_for_keyspace) + // would remain RF-rack-valid after applying a topology change. + // + // Returns true if all such keyspaces would remain RF-rack-valid after the change, and false otherwise. + // For keyspaces that would become RF-rack-invalid but do not require enforcement, + // a warning is printed, but this does not affect the return value. + // + // Preconditions: + // * The provided locator::topology instance (from the passed locator::token_metadata_ptr) + // must contain a complete list of racks and data centers in the cluster. + bool check_rf_rack_validity_with_topology_change(locator::token_metadata_ptr, locator::rf_rack_topology_operation) const; + private: // SSTable sampling might require considerable amounts of memory, // so we want to limit the number of concurrent sampling operations.