scylladb/locator/network_topology_strategy.cc

/*
 *
 * Modified by ScyllaDB
 * Copyright (C) 2015-present ScyllaDB
 */

/*
 * SPDX-License-Identifier: (LicenseRef-ScyllaDB-Source-Available-1.1 and Apache-2.0)
 */

#include <algorithm>
#include <functional>
#include <random>

#include <fmt/ranges.h>

#include <seastar/core/coroutine.hh>
#include <seastar/coroutine/maybe_yield.hh>

#include "locator/network_topology_strategy.hh"
#include "locator/load_sketch.hh"

#include <absl/container/flat_hash_map.h>
#include <boost/algorithm/string.hpp>
#include "exceptions/exceptions.hh"
#include "utils/assert.hh"
#include "utils/class_registrator.hh"
#include "utils/hash.hh"

namespace std {

// Hashes a location by handing its (dc, rack) pair to utils::tuple_hash.
size_t hash<locator::endpoint_dc_rack>::operator()(const locator::endpoint_dc_rack& v) const {
    const auto dc_and_rack = std::tie(v.dc, v.rack);
    return utils::tuple_hash()(dc_and_rack);
}

}

namespace locator {

// File-local logger; the strategy constructor uses it to dump the raw replication options.
static logging::logger logger("network_topology_strategy");

/**
 * Builds the strategy from the keyspace replication options.
 *
 * Works on a copy of the configured options: process_tablet_options() runs on
 * that copy first, then every entry other than "class" is treated as a
 * datacenter name whose value is parsed as that datacenter's replication factor.
 * The sum of all per-datacenter counts becomes the total replication factor.
 *
 * Throws exceptions::configuration_exception when an option matches
 * "replication_factor" only case-insensitively; an exact match is reported as
 * an internal error.
 */
network_topology_strategy::network_topology_strategy(replication_strategy_params params, const topology* topo) :
        abstract_replication_strategy(params,
                                      replication_strategy_type::network_topology) {
    auto options = _config_options;

    logger.debug("options={}", options);

    process_tablet_options(*this, options, params);

    size_t total_rf = 0;
    for (auto& [name, value] : options) {
        // FIXME: the strategy class name is currently delivered as one of the options. Skip it.
        if (boost::iequals(name, "class")) {
            continue;
        }

        if (boost::iequals(name, "replication_factor")) {
            if (!boost::equals(name, "replication_factor")) {
                // Same word in a different case: most likely a user typo.
                throw exceptions::configuration_exception(format(
                "'{}' is not a valid option, did you mean (lowercase) 'replication_factor'?", name));
            }
            on_internal_error(rslogger, "replication_factor should have been replaced with a DC:RF mapping by now");
        }

        auto dc_rf = parse_replication_factor(value);
        total_rf += dc_rf.count();
        _dc_rep_factor.emplace(name, std::move(dc_rf));
        _datacenteres.push_back(name);
    }

    _rep_factor = total_rf;

    rslogger.debug("Configured datacenter replicas are: {}", _dc_rep_factor);
}

// Set of (dc, rack) locations; relies on the std::hash<endpoint_dc_rack> specialization.
using endpoint_dc_rack_set = std::unordered_set<endpoint_dc_rack>;

class natural_endpoints_tracker {
    /**
     * Endpoint adder applying the replication rules for a given DC.
     */
    struct data_center_endpoints {
        /** List accepted endpoints get pushed into. */
        host_id_set& _endpoints;

        /**
         * Racks encountered so far. Replicas are put into separate racks while possible.
         * For efficiency the set is shared between the instances, using the location pair (dc, rack) to make sure
         * clashing names aren't a problem.
         */
        endpoint_dc_rack_set& _racks;

        /** Number of replicas left to fill from this DC. */
        size_t _rf_left;
        ssize_t _acceptable_rack_repeats;

        data_center_endpoints(size_t rf, size_t rack_count, size_t node_count, host_id_set& endpoints, endpoint_dc_rack_set& racks)
            : _endpoints(endpoints)
            , _racks(racks)
            // If there aren't enough nodes in this DC to fill the RF, the number of nodes is the effective RF.
            , _rf_left(std::min(rf, node_count))
            // If there aren't enough racks in this DC to fill the RF, we'll still use at least one node from each rack,
            // and the difference is to be filled by the first encountered nodes.
            , _acceptable_rack_repeats(rf - rack_count)
        {}

        /**
         * Attempts to add an endpoint to the replicas for this datacenter, adding to the endpoints set if successful.
         * Returns true if the endpoint was added, and this datacenter does not require further replicas.
         */
        bool add_endpoint_and_check_if_done(const host_id& ep, const endpoint_dc_rack& location) {
            if (done()) {
                return false;
            }

            if (_racks.emplace(location).second) {
                // New rack.
                --_rf_left;
                auto added = _endpoints.insert(ep).second;
                if (!added) {
                    throw std::runtime_error(fmt::format("Topology error: found {} in more than one rack", ep));
                }
                return done();
            }

            /**
             * Ensure we don't allow too many endpoints in the same rack, i.e. we have
             * minimum current rf_left + 1 distinct racks. See above, _acceptable_rack_repeats
             * is defined as RF - rack_count, i.e. how many nodes in a single rack we are ok
             * with.
             *
             * With RF = 3 and 2 Racks in DC,
             *
             * IP1, Rack1
             * IP2, Rack1
             * IP3, Rack1,    The line _acceptable_rack_repeats <= 0 will reject IP3.
             * IP4, Rack2
             *
             */
            if (_acceptable_rack_repeats <= 0) {
                // There must be rf_left distinct racks left, do not add any more rack repeats.
                return false;
            }

            if (!_endpoints.insert(ep).second) {
                // Cannot repeat a node.
                return false;
            }

            // Added a node that is from an already met rack to match RF when there aren't enough racks.
            --_acceptable_rack_repeats;
            --_rf_left;

            return done();
        }

        bool done() const {
            return _rf_left == 0;
        }
    };

    const token_metadata& _tm;
    const topology& _tp;
    network_topology_strategy::dc_rep_factor_map _dc_rep_factor;

    //
    // We want to preserve insertion order so that the first added endpoint
    // becomes primary.
    //
    host_id_set _replicas;
    // tracks the racks we have already placed replicas in
    endpoint_dc_rack_set _seen_racks;

    //
    // all token owners in each DC, so we can check when we have exhausted all
    // the token-owning members of a DC
    //
    std::unordered_map<sstring, std::unordered_set<locator::host_id>> _token_owners;

    //
    // all racks (with non-token owners filtered out) in a DC so we can check
    // when we have exhausted all racks in a DC
    //
    std::unordered_map<sstring, std::unordered_map<sstring, std::unordered_set<locator::host_id>>> _racks;

    std::unordered_map<std::string_view, data_center_endpoints> _dcs;

    size_t _dcs_to_fill;

public:
    natural_endpoints_tracker(const token_metadata& tm, const network_topology_strategy::dc_rep_factor_map& dc_rep_factor)
        : _tm(tm)
        , _tp(_tm.get_topology())
        , _dc_rep_factor(dc_rep_factor)
        , _token_owners(_tm.get_datacenter_token_owners())
        , _racks(_tm.get_datacenter_racks_token_owners())
    {
        // not aware of any cluster members
        SCYLLA_ASSERT(!_token_owners.empty() && !_racks.empty());

        auto size_for = [](auto& map, auto& k) {
            auto i = map.find(k);
            return i != map.end() ? i->second.size() : size_t(0);
        };

        // Create a data_center_endpoints object for each non-empty DC.
        for (auto& [dc, rf_data] : _dc_rep_factor) {
            auto node_count = size_for(_token_owners, dc);
            auto rf = rf_data.count();

            if (rf == 0 || node_count == 0) {
                continue;
            }

            _dcs.emplace(dc, data_center_endpoints(rf, size_for(_racks, dc), node_count, _replicas, _seen_racks));
            _dcs_to_fill = _dcs.size();
        }
    }

    bool add_endpoint_and_check_if_done(host_id ep) {
        auto& loc = _tp.get_location(ep);
        auto i = _dcs.find(loc.dc);
        if (i != _dcs.end() && i->second.add_endpoint_and_check_if_done(ep, loc)) {
            --_dcs_to_fill;
        }
        return done();
    }

    bool done() const noexcept {
        return _dcs_to_fill == 0;
    }

    host_id_set& replicas() noexcept {
        return _replicas;
    }

    static void check_enough_endpoints(const token_metadata& tm, const network_topology_strategy::dc_rep_factor_map& dc_rf) {
        auto dc_endpoints = tm.get_datacenter_token_owners();
        auto endpoints_in = [&dc_endpoints](sstring dc) {
            auto i = dc_endpoints.find(dc);
            return i != dc_endpoints.end() ? i->second.size() : size_t(0);
        };
        for (const auto& [dc, rf_data] : dc_rf) {
            auto rf = rf_data.count();
            if (rf > endpoints_in(dc)) {
                throw exceptions::configuration_exception(seastar::format(
                        "Datacenter {} doesn't have enough token-owning nodes for replication_factor={}", dc, rf));
            }
        }
    }
};

/**
 * Walks the ring starting at search_token and feeds the owner of each visited
 * token to a natural_endpoints_tracker until the tracker reports that every
 * datacenter is satisfied or the ring range is exhausted.
 * Returns the replicas collected by the tracker.
 */
future<host_id_set>
network_topology_strategy::calculate_natural_endpoints(
    const token& search_token, const token_metadata& tm) const {

    natural_endpoints_tracker tracker(tm, _dc_rep_factor);

    for (auto& ring_token : tm.ring_range(search_token)) {
        co_await coroutine::maybe_yield();

        const host_id owner = *tm.get_endpoint(ring_token);
        const bool all_dcs_filled = tracker.add_endpoint_and_check_if_done(owner);
        if (all_dcs_filled) {
            break;
        }
    }

    co_return std::move(tracker.replicas());
}

/**
 * Validates this strategy's configured options against the given topology.
 *
 * After validate_tablet_options() has run, every option key must name a
 * datacenter present in the topology; its value is parsed as a replication
 * factor, which then validates itself against that datacenter's rack names.
 *
 * Throws exceptions::configuration_exception for a key that is not a known
 * datacenter. A literal 'replication_factor' key is reported as an internal error.
 */
void network_topology_strategy::validate_options(const gms::feature_service& fs, const locator::topology& topology) const {
    // #22688 / #20039 - we want to remove dc:s once rf=0, and we
    // also want to allow fully setting rf=0 in _all_ dc:s (hello data loss)
    // so empty options here are in fact ok. Removed check for it

    // Only lookups are needed below, so bind by reference rather than deep-copying the dc -> racks map.
    const auto& dcs = topology.get_datacenter_racks();
    validate_tablet_options(*this, fs, _config_options);
    for (auto& c : _config_options) {
        if (c.first == sstring("replication_factor")) {
            on_internal_error(rslogger, fmt::format("'replication_factor' tag should be unrolled into a list of DC:RF by now. "
                                                    "_config_options:{}", _config_options));
        }
        auto dc = dcs.find(c.first);
        if (dc == dcs.end()) {
            throw exceptions::configuration_exception(format("Unrecognized strategy option {{{}}} "
                "passed to NetworkTopologyStrategy", this->to_qualified_class_name(c.first)));
        }
        // Names of the racks known in this datacenter.
        auto racks = dc->second | std::views::keys | std::ranges::to<std::unordered_set<sstring>>();
        auto rf = parse_replication_factor(c.second);
        rf.validate(racks);
    }
}

/**
 * Creates the effective replication map for the given table.
 * Reports an internal error when the strategy is not configured to use tablets.
 */
effective_replication_map_ptr network_topology_strategy::make_replication_map(table_id table, token_metadata_ptr tm) const {
    const bool tablets_enabled = uses_tablets();
    if (!tablets_enabled) {
        on_internal_error(rslogger, format("make_replication_map() called for table {} but replication strategy not configured to use tablets", table));
    }
    return do_make_replication_map(table, shared_from_this(), std::move(tm), _rep_factor);
}

/**
 * Allocates replicas for a brand new table by running reallocate_tablets() on a
 * fresh tablet_map of tablet_count tablets. The map's second constructor
 * argument is true whenever the configured consistency is not 'eventual'.
 */
future<tablet_map> network_topology_strategy::allocate_tablets_for_new_table(schema_ptr s, token_metadata_ptr tm, size_t tablet_count) const {
    const bool non_eventual = get_consistency() != data_dictionary::consistency_config_option::eventual;
    tablet_map fresh_map(tablet_count, non_eventual);
    co_return co_await reallocate_tablets(std::move(s), std::move(tm), std::move(fresh_map));
}

/**
 * Recomputes the replica set of every tablet in `tablets` and returns the
 * updated map.
 *
 * First checks that each datacenter has enough token-owning nodes for its
 * replication factor (throws exceptions::configuration_exception otherwise),
 * then populates a load_sketch that the per-tablet overload uses to choose
 * nodes and shards. Tablets in a map that carries raft info and have no raft
 * group id yet get a freshly generated one.
 */
future<tablet_map> network_topology_strategy::reallocate_tablets(schema_ptr s, token_metadata_ptr tm, tablet_map tablets) const {
    natural_endpoints_tracker::check_enough_endpoints(*tm, _dc_rep_factor);

    load_sketch load(tm);
    co_await load.populate_with_normalized_load();
    co_await load.populate(std::nullopt, s->id());

    tablet_logger.debug("Allocating tablets for {}.{} ({}): dc_rep_factor={} tablet_count={}", s->ks_name(), s->cf_name(), s->id(), _dc_rep_factor, tablets.tablet_count());

    for (tablet_id tid : tablets.tablet_ids()) {
        // Work on a copy of the tablet's info and write it back at the end of the iteration.
        auto info = tablets.get_tablet_info(tid);
        auto new_replicas = co_await reallocate_tablets(s, tm, load, tablets, tid);
        info.replicas = std::move(new_replicas);

        if (tablets.has_raft_info() && !tablets.get_tablet_raft_info(tid).group_id) {
            tablets.set_tablet_raft_info(tid, tablet_raft_info {
                .group_id = raft::group_id{utils::UUID_gen::get_time_UUID()}
            });
        }

        tablets.set_tablet(tid, std::move(info));
    }

    tablet_logger.debug("Allocated tablets for {}.{} ({}): dc_rep_factor={}: {}", s->ks_name(), s->cf_name(), s->id(), _dc_rep_factor, tablets);
    co_return tablets;
}

/**
 * Computes the replica set tablet `tb` should have under this strategy's
 * replication settings, starting from its current replicas in `cur_tablets`.
 *
 * For every datacenter in the topology:
 *  - with a rack-based replication factor, replicas are added to or dropped
 *    from the racks that differ between the current and the configured rack
 *    list (adding and removing racks at the same time throws std::runtime_error);
 *  - otherwise replicas are added or dropped until the datacenter's replica
 *    count equals the configured count (0 when the datacenter has no setting).
 */
future<tablet_replica_set> network_topology_strategy::reallocate_tablets(schema_ptr s, token_metadata_ptr tm, load_sketch& load, const tablet_map& cur_tablets, tablet_id tb) const {
    // Start from the tablet's current replicas.
    tablet_replica_set replicas = cur_tablets.get_tablet_info(tb).replicas;

    // Current number of replicas per dc
    std::unordered_map<sstring, size_t> nodes_per_dc;
    // Current replicas per dc/rack
    std::unordered_map<sstring, std::map<sstring, std::unordered_set<locator::host_id>>> replicas_per_dc_rack;
    // Rack of each current replica, grouped by dc
    std::unordered_map<sstring, rack_list> old_racks_per_dc;

    for (const auto& tr : replicas) {
        const auto& node = tm->get_topology().get_node(tr.host);
        const auto& loc = node.dc_rack();
        replicas_per_dc_rack[loc.dc][loc.rack].insert(tr.host);
        ++nodes_per_dc[loc.dc];
        old_racks_per_dc[loc.dc].push_back(loc.rack);
    }

    // #22688 - take all dcs in topology into account when determining migration.
    // Any change should still have been pre-checked to never exceed rf factor one.
    for (const auto& dc : tm->get_topology().get_datacenters()) {
        auto new_rf = get_replication_factor_data(dc);

        if (!new_rf || !new_rf->is_rack_based()) {
            // Count-based replication factor; a dc without any setting counts as 0.
            auto dc_rf = new_rf ? new_rf->count() : 0;
            auto dc_node_count = nodes_per_dc[dc];
            if (dc_rf > dc_node_count) {
                replicas = co_await add_tablets_in_dc(s, tm, load, tb, replicas_per_dc_rack[dc], replicas, dc, dc_node_count, dc_rf);
            } else if (dc_rf < dc_node_count) {
                replicas = drop_tablets_in_dc(s, tm->get_topology(), load, tb, replicas, dc, dc_node_count, dc_rf);
            }
            continue;
        }

        // Rack-based replication factor: compare the current racks with the configured rack list.
        auto diff = diff_racks(old_racks_per_dc[dc], new_rf->get_rack_list());

        tablet_logger.debug("reallocate_tablets {}.{} tablet_id={} dc={} old_racks={} add_racks={} del_racks={}",
                s->ks_name(), s->cf_name(), tb, dc, old_racks_per_dc[dc], diff.added, diff.removed);

        if (!diff) {
            continue;
        }

        const bool adds_racks = !diff.added.empty();
        const bool removes_racks = !diff.removed.empty();
        if (adds_racks && removes_racks) {
            throw std::runtime_error("replacing racks unsupported");
        }
        if (adds_racks) {
            replicas = add_tablets_in_racks(s, tm, load, tb, replicas, dc, diff.added);
        } else { // diff.removed
            replicas = drop_tablets_in_racks(s, tm, load, tb, replicas, dc, diff.removed);
        }
    }

    co_return replicas;
}

/**
 * Returns cur_replicas without the replicas that live in datacenter `dc` on one
 * of the racks in racks_to_drop. Each removed replica is also unloaded from
 * the load sketch.
 */
tablet_replica_set network_topology_strategy::drop_tablets_in_racks(schema_ptr s,
                                                                    token_metadata_ptr tm,
                                                                    load_sketch& load,
                                                                    tablet_id tb,
                                                                    const tablet_replica_set& cur_replicas,
                                                                    const sstring& dc,
                                                                    const rack_list& racks_to_drop) const {
    auto& topo = tm->get_topology();
    tablet_replica_set kept;
    for (const auto& tr : cur_replicas) {
        auto& node = topo.get_node(tr.host);
        const auto& loc = node.dc_rack();
        const bool must_drop = loc.dc == dc && std::ranges::contains(racks_to_drop, loc.rack);
        if (!must_drop) {
            kept.emplace_back(tr);
            continue;
        }
        tablet_logger.debug("drop_tablets_in_rack {}.{} tablet_id={} dc={} rack={} removing replica: {}",
                        s->ks_name(), s->cf_name(), tb, loc.dc, loc.rack, tr);
        load.unload(tr.host, tr.shard, 1, service::default_target_tablet_size);
    }
    return kept;
}

/**
 * Returns cur_replicas extended with one new replica per rack in racks_to_add,
 * all within datacenter `dc`.
 *
 * In each rack the normal node with the lowest average shard load (as reported
 * by the load sketch) is chosen, and the shard comes from load.next_shard().
 * Throws std::runtime_error when a rack has no normal node. The .at() lookups
 * throw std::out_of_range if the dc or a rack is missing from the token-owner map.
 */
tablet_replica_set network_topology_strategy::add_tablets_in_racks(schema_ptr s,
                                                                   token_metadata_ptr tm,
                                                                   load_sketch& load,
                                                                   tablet_id tb,
                                                                   const tablet_replica_set& cur_replicas,
                                                                   const sstring& dc,
                                                                   const rack_list& racks_to_add) const {
    auto owners_by_dc = tm->get_datacenter_racks_token_owners_nodes();
    auto& racks_in_dc = owners_by_dc.at(dc);
    auto result = cur_replicas;

    for (auto&& rack: racks_to_add) {
        host_id best_node;
        double best_load = std::numeric_limits<double>::max();

        for (auto&& node_ref: racks_in_dc.at(rack)) {
            const auto& candidate = node_ref.get();
            if (candidate.is_normal()) {
                // Assume that if there was a diff to add a rack, we don't already have a replica
                // in the target rack so all nodes in the rack are eligible.
                // FIXME: pick based on storage utilization: https://github.com/scylladb/scylladb/issues/26366
                auto candidate_load = load.get_real_avg_shard_load(candidate.host_id());
                if (candidate_load < best_load) {
                    best_load = candidate_load;
                    best_node = candidate.host_id();
                }
            }
        }

        if (!best_node) {
            throw std::runtime_error(
                    fmt::format("No candidate node in rack {}.{} to allocate tablet replica", dc, rack));
        }

        auto added = tablet_replica{best_node, load.next_shard(best_node, 1, service::default_target_tablet_size)};
        result.push_back(added);

        tablet_logger.trace("add_tablet_in_rack {}.{} tablet_id={} dc={} rack={} load={} new_replica={}",
                            s->ks_name(), s->cf_name(), tb.id, dc, rack, best_load, added);
    }
    return result;
}

/**
 * Returns cur_replicas extended with (dc_rf - dc_node_count) new replicas in
 * datacenter `dc` for tablet `tb`.
 *
 * Candidate nodes are the token owners of the dc that are normal, not
 * draining, and not already listed in replicas_per_rack. Racks holding no
 * replica of this tablet are visited before racks that already hold one, and
 * within each of the two groups the starting rack is rotated by the tablet id.
 * Allocation goes round-robin over the candidate racks, taking the least
 * loaded remaining node of each visited rack.
 *
 * replicas_per_rack is updated with every newly chosen host (and may gain
 * empty entries for racks looked up through operator[]).
 *
 * Reports an internal error if there are no candidate racks at all, if the
 * candidates run out before enough replicas were allocated, or if a chosen
 * node turns out to be in use already.
 */
future<tablet_replica_set> network_topology_strategy::add_tablets_in_dc(schema_ptr s, token_metadata_ptr tm, load_sketch& load, tablet_id tb,
        std::map<sstring, std::unordered_set<locator::host_id>>& replicas_per_rack,
        const tablet_replica_set& cur_replicas,
        sstring dc, size_t dc_node_count, size_t dc_rf) const {
    // Per-thread random engine, used only to shuffle equally loaded nodes.
    static thread_local std::default_random_engine rnd_engine{std::random_device{}()};

    auto replicas = cur_replicas;
    // all_dc_racks is ordered lexicographically on purpose
    auto all_dc_racks = tm->get_datacenter_racks_token_owners_nodes().at(dc)
        | std::ranges::to<std::map>();

    // Track all nodes with no replicas on them for this tablet, per rack.
    struct node_load {
        locator::host_id host;
        double load;
    };
    // Comparator for sorting nodes in descending load order,
    // which leaves the least loaded node at the back.
    auto node_load_cmp = [] (const node_load& a, const node_load& b) {
        return a.load > b.load;
    };

    struct rack_candidates {
        sstring rack;
        utils::small_vector<node_load, 3> nodes;
    };
    using candidates_list = std::vector<rack_candidates>;
    // Racks that already hold at least one replica of this tablet.
    candidates_list existing_racks;

    // We use this list to start allocating from an
    // unpopulated rack.
    candidates_list new_racks;

    for (const auto& [rack, nodes] : all_dc_racks) {
        co_await coroutine::maybe_yield();
        if (nodes.empty()) {
            continue;
        }
        const auto& existing = replicas_per_rack[rack];
        candidates_list& rack_list = existing.empty() ? new_racks : existing_racks;
        auto& candidate = rack_list.emplace_back(rack);
        for (const auto& node : nodes) {
            if (!node.get().is_normal() || node.get().is_draining()) {
                continue;
            }
            const auto& host_id = node.get().host_id();
            if (!existing.contains(host_id)) {
                // FIXME: https://github.com/scylladb/scylladb/issues/26366
                candidate.nodes.emplace_back(host_id, load.get_real_avg_shard_load(host_id));
            }
        }
        if (candidate.nodes.empty()) {
            // No eligible node in this rack: undo the emplace_back above.
            rack_list.pop_back();
            tablet_logger.trace("allocate_replica {}.{}: no candidate nodes left on rack={}", s->ks_name(), s->cf_name(), rack);
            // Note that this rack can't be in new_racks since
            // those had no existing replicas and if current rack has no nodes
            // we skip it in the beginning of the loop body
            continue;
        }
        // Sort candidate nodes in each rack in descending load order,
        // so that the least loaded node is at the back, where allocation takes it from.
        // Do shuffle + stable_sort to shuffle nodes with equal load.
        std::shuffle(candidate.nodes.begin(), candidate.nodes.end(), rnd_engine);
        std::stable_sort(candidate.nodes.begin(), candidate.nodes.end(), node_load_cmp);
    }

    // Final visiting order: rotated new_racks followed by rotated existing_racks.
    candidates_list candidate_racks;

    // ensure fairness across racks (in particular if rf < number_of_racks)
    // by rotating the racks order
    auto append_candidate_racks = [&] (candidates_list& racks) {
        if (auto size = racks.size()) {
            auto it = racks.begin() + tb.id % size;
            std::move(it, racks.end(), std::back_inserter(candidate_racks));
            std::move(racks.begin(), it, std::back_inserter(candidate_racks));
        }
    };

    append_candidate_racks(new_racks);
    append_candidate_racks(existing_racks);

    if (candidate_racks.empty()) {
        on_internal_error(tablet_logger,
                seastar::format("allocate_replica {}.{}: no candidate racks found for dc={} allocated={} rf={}: existing={}",
                        s->ks_name(), s->cf_name(), dc, dc_node_count, dc_rf, replicas_per_rack));
    }

    auto candidate_rack = candidate_racks.begin();

    // Takes the node at the back of the given rack's candidates, records it in
    // replicas_per_rack and advances `candidate` to the next rack (wrapping around).
    // A rack whose candidates are exhausted is erased from candidate_racks.
    auto allocate_replica = [&] (candidates_list::iterator& candidate) {
        const auto& rack = candidate->rack;
        auto& nodes = candidate->nodes;
        if (nodes.empty()) {
            on_internal_error(tablet_logger,
                    seastar::format("allocate_replica {}.{} tablet_id={}: candidates vector for rack={} is empty for allocating tablet replicas in dc={} allocated={} rf={}",
                            s->ks_name(), s->cf_name(), tb.id, rack, dc, dc_node_count, dc_rf));
        }
        auto host_id = nodes.back().host;
        auto replica = tablet_replica{host_id, load.next_shard(host_id, 1, service::default_target_tablet_size)};
        const auto& node = tm->get_topology().get_node(host_id);
        auto inserted = replicas_per_rack[node.dc_rack().rack].insert(host_id).second;
        // Sanity check that a node is not used more than once
        if (!inserted) {
            on_internal_error(tablet_logger,
                    seastar::format("allocate_replica {}.{} tablet_id={}: allocated replica={} node already used when allocating tablet replicas in dc={} allocated={} rf={}: replicas={}",
                            s->ks_name(), s->cf_name(), tb.id, replica, dc, dc_node_count, dc_rf, replicas));
        }
        nodes.pop_back();
        tablet_logger.trace("allocate_replica {}.{} tablet_id={}: allocated tablet replica={} dc={} rack={}: nodes remaining in rack={}",
                s->ks_name(), s->cf_name(), tb.id, replica, node.dc_rack().dc, node.dc_rack().rack, nodes.size());
        if (nodes.empty()) {
            // erase() returns the iterator to the following rack, so no increment is needed.
            candidate = candidate_racks.erase(candidate);
        } else {
            ++candidate;
        }
        // Wrap around; if candidate_racks became empty, begin() == end() and the caller detects it.
        if (candidate == candidate_racks.end()) {
            candidate = candidate_racks.begin();
        }
        if (tablet_logger.is_enabled(log_level::trace)) {
            if (candidate != candidate_racks.end()) {
                tablet_logger.trace("allocate_replica {}.{} tablet_id={}: next rack={} nodes={}", s->ks_name(), s->cf_name(), tb.id, candidate->rack, candidate->nodes.size());
            } else {
                tablet_logger.trace("allocate_replica {}.{} tablet_id={}: no candidate racks left", s->ks_name(), s->cf_name(), tb.id);
            }
        }
        return replica;
    };

    tablet_logger.debug("allocate_replica {}.{} tablet_id={}: allocating tablet replicas in dc={} allocated={} rf={}",
            s->ks_name(), s->cf_name(), tb.id, dc, dc_node_count, dc_rf);

    // Allocate the missing replicas one at a time, rotating over the candidate racks.
    for (size_t remaining = dc_rf - dc_node_count; remaining; --remaining) {
        co_await coroutine::maybe_yield();
        if (candidate_rack == candidate_racks.end()) {
            on_internal_error(tablet_logger,
                    format("allocate_replica {}.{} tablet_id={}: ran out of candidates for allocating tablet replicas in dc={} allocated={} rf={}: remaining={}",
                            s->ks_name(), s->cf_name(), tb.id, dc, dc_node_count, dc_rf, remaining));
        }
        replicas.emplace_back(allocate_replica(candidate_rack));
    }

    co_return replicas;
}

/**
 * Returns cur_replicas with the replicas of datacenter `dc` trimmed down to
 * the first dc_rf of them (in cur_replicas order). Replicas in other
 * datacenters are kept as they are; every dropped replica is unloaded from
 * the load sketch.
 */
tablet_replica_set network_topology_strategy::drop_tablets_in_dc(schema_ptr s, const locator::topology& topo, load_sketch& load, tablet_id tb,
        const tablet_replica_set& cur_replicas,
        sstring dc, size_t dc_node_count, size_t dc_rf) const {
    tablet_logger.debug("drop_tablets_in_dc {}.{} tablet_id={}: deallocating tablet replicas in dc={} allocated={} rf={}", s->ks_name(), s->cf_name(), tb.id, dc, dc_node_count, dc_rf);

    // Leave dc_rf replicas in dc, effectively deallocating in reverse order,
    // to maintain replica pairing between the base table and its materialized views.
    // This may leave racks unbalanced, but that's ok since the tablet load balancer
    // can fix this later.
    const size_t to_drop = dc_node_count - dc_rf;
    tablet_replica_set kept;
    kept.reserve(cur_replicas.size() - to_drop);

    // Number of replicas of `dc` seen so far, including the current one.
    size_t seen_in_dc = 0;
    for (const auto& tr : cur_replicas) {
        const bool in_dc = topo.get_node(tr.host).dc_rack().dc == dc;
        if (in_dc && ++seen_in_dc > dc_rf) {
            load.unload(tr.host, tr.shard, 1, service::default_target_tablet_size);
            continue;
        }
        kept.emplace_back(tr);
    }
    return kept;
}

/**
 * Checks that no datacenter contributes more read replicas than its
 * replication factor. Returns a description of the first violation found, or
 * an empty string when there is none.
 */
sstring network_topology_strategy::sanity_check_read_replicas(const effective_replication_map& erm,
                                                              const host_id_vector_replica_set& read_replicas) const {
    const auto& topology = erm.get_topology();

    struct dc_tally {
        size_t replication_factor{0};
        size_t node_count{0};
    };

    // Per-datacenter replication factor and number of read replicas located there.
    absl::flat_hash_map<sstring, dc_tally> tally_by_dc;
    for (const auto& replica : read_replicas) {
        auto [entry, is_new] = tally_by_dc.emplace(topology.get_datacenter(replica), dc_tally{0, 0});
        if (is_new) {
            // First replica seen in this datacenter: look up its replication factor once.
            entry->second.replication_factor = get_replication_factor(entry->first);
        }
        ++entry->second.node_count;
    }

    for (const auto& [dc_name, tally] : tally_by_dc) {
        if (tally.replication_factor < tally.node_count) {
            return seastar::format("network_topology_strategy: ERM inconsistency, Datacenter [{}] has higher count of read replicas (accounting for "
                                   "current consistency level): [{}] than its replication factor [{}]",
                    dc_name, tally.node_count, tally.replication_factor);
        }
    }

    return {};
}

// Note: signature must match the class_registry signature defined and used by abstract_replication_strategy::to_qualified_class_name
using registry = class_registrator<abstract_replication_strategy, network_topology_strategy, replication_strategy_params, const topology*>;
// Register the strategy under two names: the fully qualified Cassandra class name and the short name.
static registry registrator("org.apache.cassandra.locator.NetworkTopologyStrategy");
static registry registrator_short_name("NetworkTopologyStrategy");
}