locator: extend rf-rack validation functions

Extend the locator function assert_rf_rack_valid_keyspace to accept
arbitrary topology dc-rack maps and nodes instead of using the current
token metadata.

This allows us to add a new variant of the function that checks rf-rack
validity given a topology change that we want to apply. we will use it
to check that rf-rack validity will be maintained before applying the
topology change.

The possible topology changes for the check are node add and node remove
/ decommission. These operations can change the number of normal racks -
if a new node is added to a new rack, or the last node is removed from a
rack.
This commit is contained in:
Michael Litvak
2025-10-08 13:17:24 +02:00
parent 8df61f6d99
commit 9e1f78d162
4 changed files with 126 additions and 6 deletions

View File

@@ -1423,7 +1423,9 @@ const effective_replication_map_ptr& token_metadata_guard::get_erm() const {
return g ? (**g).get_erm() : get<effective_replication_map_ptr>(_guard);
}
void assert_rf_rack_valid_keyspace(std::string_view ks, const token_metadata_ptr tmptr, const abstract_replication_strategy& ars) {
static void assert_rf_rack_valid_keyspace(std::string_view ks, const token_metadata_ptr tmptr, const abstract_replication_strategy& ars,
const std::unordered_map<sstring, std::unordered_map<sstring, std::unordered_set<host_id>>>& dc_rack_map,
std::function<bool(host_id)> is_normal_token_owner) {
tablet_logger.debug("[assert_rf_rack_valid_keyspace]: Starting verifying that keyspace '{}' is RF-rack-valid", ks);
// Any keyspace that does NOT use tablets is RF-rack-valid.
@@ -1436,8 +1438,6 @@ void assert_rf_rack_valid_keyspace(std::string_view ks, const token_metadata_ptr
SCYLLA_ASSERT(ars.get_type() == replication_strategy_type::network_topology);
const auto& nts = *static_cast<const network_topology_strategy*>(std::addressof(ars));
const auto& dc_rack_map = tmptr->get_topology().get_datacenter_racks();
for (const auto& dc : nts.get_datacenters()) {
if (!dc_rack_map.contains(dc)) {
on_internal_error(tablet_logger, seastar::format(
@@ -1453,9 +1453,7 @@ void assert_rf_rack_valid_keyspace(std::string_view ks, const token_metadata_ptr
for (const auto& [_, rack_nodes] : rack_map) {
// We must ignore zero-token nodes because they don't take part in replication.
// Verify that this rack has at least one normal node.
const bool normal_rack = std::ranges::any_of(rack_nodes, [tmptr] (host_id host_id) {
return tmptr->is_normal_token_owner(host_id);
});
const bool normal_rack = std::ranges::any_of(rack_nodes, is_normal_token_owner);
if (normal_rack) {
++normal_rack_count;
}
@@ -1486,6 +1484,52 @@ void assert_rf_rack_valid_keyspace(std::string_view ks, const token_metadata_ptr
tablet_logger.debug("[assert_rf_rack_valid_keyspace]: Keyspace '{}' has been verified to be RF-rack-valid", ks);
}
void assert_rf_rack_valid_keyspace(std::string_view ks, const token_metadata_ptr tmptr, const abstract_replication_strategy& ars) {
assert_rf_rack_valid_keyspace(ks, tmptr, ars,
tmptr->get_topology().get_datacenter_racks(),
[&tmptr] (host_id host) {
return tmptr->is_normal_token_owner(host);
}
);
}
void assert_rf_rack_valid_keyspace(std::string_view ks, const token_metadata_ptr tmptr, const abstract_replication_strategy& ars, rf_rack_topology_operation op) {
auto dc_rack_map = tmptr->get_topology().get_datacenter_racks();
switch (op.tag) {
case rf_rack_topology_operation::type::add:
// include the new node only if it's added to an existing DC.
if (dc_rack_map.contains(op.dc)) {
tablet_logger.debug("[assert_rf_rack_valid_keyspace]: Including new node {} in DC '{}' and rack '{}'", op.node_id, op.dc, op.rack);
dc_rack_map[op.dc][op.rack].insert(op.node_id);
}
break;
case rf_rack_topology_operation::type::remove:
// remove the node from the map, if it exists.
tablet_logger.debug("[assert_rf_rack_valid_keyspace]: Excluding removed node {} from DC '{}' and rack '{}'", op.node_id, op.dc, op.rack);
if (dc_rack_map.contains(op.dc) && dc_rack_map[op.dc].contains(op.rack)) {
dc_rack_map[op.dc][op.rack].erase(op.node_id);
if (dc_rack_map[op.dc][op.rack].empty()) {
dc_rack_map[op.dc].erase(op.rack);
if (dc_rack_map[op.dc].empty()) {
dc_rack_map.erase(op.dc);
}
}
}
break;
}
assert_rf_rack_valid_keyspace(ks, tmptr, ars,
dc_rack_map,
[&tmptr, &op] (host_id host) {
if (op.tag == rf_rack_topology_operation::type::add && host == op.node_id) {
return true;
}
return tmptr->is_normal_token_owner(host);
}
);
}
rack_list get_allowed_racks(const locator::token_metadata& tm, const sstring& dc) {
auto& topo = tm.get_topology();
auto normal_nodes = [&] (const sstring& rack) {

View File

@@ -880,6 +880,23 @@ class abstract_replication_strategy;
/// * The keyspace need not exist. We use its name purely for informational reasons (in error messages).
void assert_rf_rack_valid_keyspace(std::string_view ks, const token_metadata_ptr, const abstract_replication_strategy&);
struct rf_rack_topology_operation {
enum class type {
add,
remove // node remove or decommission
};
type tag;
host_id node_id;
sstring dc;
sstring rack;
};
// Verify that the provided keyspace corresponding to the provided replication strategy will be RF-rack-valid
// after the provided topology change operation is applied.
// The operation is either adding a node, or removing / decommissioning a node.
// The added/removed node should be a normal token owning node. Nodes that don't own tokens don't affect RF-rack-validity.
void assert_rf_rack_valid_keyspace(std::string_view ks, const token_metadata_ptr tmptr, const abstract_replication_strategy& ars, rf_rack_topology_operation op);
/// Returns the list of racks that can be used for placing replicas in a given DC.
rack_list get_allowed_racks(const locator::token_metadata&, const sstring& dc);

View File

@@ -3599,6 +3599,53 @@ void database::check_rf_rack_validity(const locator::token_metadata_ptr tmptr) c
}
}
bool database::check_rf_rack_validity_with_topology_change(locator::token_metadata_ptr tmptr, locator::rf_rack_topology_operation change) const {
// if it's already invalid before the topology change, it's allowed to remain invalid
try {
check_rf_rack_validity(tmptr);
} catch (...) {
return true;
}
const auto& keyspaces = get_keyspaces();
std::vector<std::string_view> invalid_keyspaces{};
bool valid = true;
for (const auto& [name, info] : keyspaces) {
try {
locator::assert_rf_rack_valid_keyspace(name, tmptr, info.get_replication_strategy(), change);
} catch (...) {
if (enforce_rf_rack_validity_for_keyspace(info)) {
valid = false;
}
invalid_keyspaces.push_back(std::string_view(name));
}
}
if (invalid_keyspaces.size() != 0) {
const auto ks_list = invalid_keyspaces
| std::views::join_with(std::string_view(", "))
| std::ranges::to<std::string>();
auto op_str = [&] {
switch (change.tag) {
case locator::rf_rack_topology_operation::type::add: return "joining";
case locator::rf_rack_topology_operation::type::remove: return "removed";
}
}();
dblog.warn("The {} node with DC='{}' and rack='{}' makes some existing keyspaces RF-rack-invalid, i.e. the replication factor "
"does not match the number of racks in one of the datacenters. That may reduce "
"availability in case of a failure (cf. "
"https://docs.scylladb.com/manual/stable/reference/glossary.html#term-RF-rack-valid-keyspace). "
"Those keyspaces are: {}",
op_str, change.dc, change.rack, ks_list);
}
return valid;
}
utils::chunked_vector<uint64_t> compute_random_sorted_ints(uint64_t max_value, uint64_t n_values) {
static thread_local std::minstd_rand rng{std::random_device{}()};
std::uniform_int_distribution<uint64_t> dist(0, max_value);

View File

@@ -2125,6 +2125,18 @@ public:
// must contain a complete list of racks and data centers in the cluster.
void check_rf_rack_validity(const locator::token_metadata_ptr) const;
// Verifies whether all keyspaces that require RF-rack-validity (as determined by enforce_rf_rack_validity_for_keyspace)
// would remain RF-rack-valid after applying a topology change.
//
// Returns true if all such keyspaces would remain RF-rack-valid after the change, and false otherwise.
// For keyspaces that would become RF-rack-invalid but do not require enforcement,
// a warning is printed, but this does not affect the return value.
//
// Preconditions:
// * The provided locator::topology instance (from the passed locator::token_metadata_ptr)
// must contain a complete list of racks and data centers in the cluster.
bool check_rf_rack_validity_with_topology_change(locator::token_metadata_ptr, locator::rf_rack_topology_operation) const;
private:
// SSTable sampling might require considerable amounts of memory,
// so we want to limit the number of concurrent sampling operations.