Files
scylladb/locator/network_topology_strategy.cc
Tomasz Grabiec b6a7023f68 tablets: Prepare for non-power-of-two tablet count
This is a step towards more flexibility in managing tablets. It is a
prerequisite for splitting individual tablets, isolating hot
partitions, and evening out tablet sizes by shifting boundaries.

After this patch, the system can handle tables with an arbitrary tablet
count. The tablet allocator still rounds the desired tablet count up to
the nearest power of two when allocating tablets for a new table, so
unless the tablet map is allocated in some other way, the counts will
still be a power of two.

We plan to utilize arbitrary count when migrating from vnodes to
tablets, by creating a tablet map which matches vnode boundaries.

One of the reasons we don't give up on power-of-two by default yet is
that it creates an issue with merges. If tablet count is odd, one of
the tablets doesn't have a sibling and will not be merged. That can
obviously cause imbalance of token space and tablet sizes between
tablets. To limit the impact, this patch dynamically chooses which
tablet to isolate when initiating a merge. The largest tablet is
chosen, as that will minimize imbalance. Otherwise, if we always chose
the last tablet to isolate, its size would remain the same while other
tablets double in size with each odd-count merge, leading to
imbalance. The imbalance will still be there, but the difference in
tablet sizes is limited to 2x.

Example (3 tablets):
  [0] owns 1/3 of tokens
  [1] owns 1/3 of tokens
  [2] owns 1/3 of tokens

After merge:
  [0] owns 2/3 of tokens
  [1] owns 1/3 of tokens

What we would like instead:

Step 1 (split [1]):
  [0] owns 1/3 of tokens
  [1] old 1.left, owns 1/6 of tokens
  [2] old 1.right, owns 1/6 of tokens
  [3] owns 1/3 of tokens

Step 2 (merge):
  [0] owns 1/2 of tokens
  [1] owns 1/2 of tokens

To do that, we need to be able to split individual tablets, but we're
not there yet.
2026-04-15 10:40:55 +02:00

661 lines
28 KiB
C++

/*
*
* Modified by ScyllaDB
* Copyright (C) 2015-present ScyllaDB
*/
/*
* SPDX-License-Identifier: (LicenseRef-ScyllaDB-Source-Available-1.1 and Apache-2.0)
*/
#include <algorithm>
#include <functional>
#include <random>
#include <fmt/ranges.h>
#include <seastar/core/coroutine.hh>
#include <seastar/coroutine/maybe_yield.hh>
#include "locator/network_topology_strategy.hh"
#include "locator/load_sketch.hh"
#include <absl/container/flat_hash_map.h>
#include <boost/algorithm/string.hpp>
#include "exceptions/exceptions.hh"
#include "utils/assert.hh"
#include "utils/class_registrator.hh"
#include "utils/hash.hh"
namespace std {

/// Hashes a location by feeding its (dc, rack) pair, bound by reference, to utils::tuple_hash.
size_t hash<locator::endpoint_dc_rack>::operator()(const locator::endpoint_dc_rack& v) const {
    const auto dc_and_rack = std::tie(v.dc, v.rack);
    return utils::tuple_hash()(dc_and_rack);
}

}
namespace locator {
// File-local logger registered under the name "network_topology_strategy".
static logging::logger logger("network_topology_strategy");
/// Builds the strategy from the replication options held in _config_options.
///
/// A private copy of the options is first handed to process_tablet_options()
/// (presumably so tablet-related entries can be consumed -- confirm against its
/// definition). Every remaining entry other than "class" is treated as a
/// datacenter name mapped to a replication factor; the overall replication
/// factor is the sum of the per-datacenter counts.
///
/// Throws exceptions::configuration_exception when "replication_factor" appears
/// with non-lowercase spelling; the exact lowercase spelling is reported through
/// on_internal_error(), since it is expected to have been expanded earlier.
network_topology_strategy::network_topology_strategy(replication_strategy_params params, const topology* topo) :
        abstract_replication_strategy(params,
                                      replication_strategy_type::network_topology) {
    auto opts = _config_options;

    logger.debug("options={}", opts);
    process_tablet_options(*this, opts, params);

    size_t total_rf = 0;
    for (auto& [key, val] : opts) {
        // FIXME: the option list still carries the strategy class name.
        // It is not a datacenter, so skip it.
        if (boost::iequals(key, "class")) {
            continue;
        }

        if (boost::iequals(key, "replication_factor")) {
            const bool exact_spelling = boost::equals(key, "replication_factor");
            if (!exact_spelling) {
                throw exceptions::configuration_exception(format(
                        "'{}' is not a valid option, did you mean (lowercase) 'replication_factor'?", key));
            }
            on_internal_error(rslogger, "replication_factor should have been replaced with a DC:RF mapping by now");
        }

        auto dc_rf = parse_replication_factor(val);
        total_rf += dc_rf.count();
        _dc_rep_factor.emplace(key, std::move(dc_rf));
        _datacenteres.push_back(key);
    }

    _rep_factor = total_rf;

    rslogger.debug("Configured datacenter replicas are: {}", _dc_rep_factor);
}
// Unordered set of (dc, rack) locations; uses the default std::hash<endpoint_dc_rack>.
using endpoint_dc_rack_set = std::unordered_set<endpoint_dc_rack>;
class natural_endpoints_tracker {
/**
* Endpoint adder applying the replication rules for a given DC.
*/
struct data_center_endpoints {
/** List accepted endpoints get pushed into. */
host_id_set& _endpoints;
/**
* Racks encountered so far. Replicas are put into separate racks while possible.
* For efficiency the set is shared between the instances, using the location pair (dc, rack) to make sure
* clashing names aren't a problem.
*/
endpoint_dc_rack_set& _racks;
/** Number of replicas left to fill from this DC. */
size_t _rf_left;
ssize_t _acceptable_rack_repeats;
data_center_endpoints(size_t rf, size_t rack_count, size_t node_count, host_id_set& endpoints, endpoint_dc_rack_set& racks)
: _endpoints(endpoints)
, _racks(racks)
// If there aren't enough nodes in this DC to fill the RF, the number of nodes is the effective RF.
, _rf_left(std::min(rf, node_count))
// If there aren't enough racks in this DC to fill the RF, we'll still use at least one node from each rack,
// and the difference is to be filled by the first encountered nodes.
, _acceptable_rack_repeats(rf - rack_count)
{}
/**
* Attempts to add an endpoint to the replicas for this datacenter, adding to the endpoints set if successful.
* Returns true if the endpoint was added, and this datacenter does not require further replicas.
*/
bool add_endpoint_and_check_if_done(const host_id& ep, const endpoint_dc_rack& location) {
if (done()) {
return false;
}
if (_racks.emplace(location).second) {
// New rack.
--_rf_left;
auto added = _endpoints.insert(ep).second;
if (!added) {
throw std::runtime_error(fmt::format("Topology error: found {} in more than one rack", ep));
}
return done();
}
/**
* Ensure we don't allow too many endpoints in the same rack, i.e. we have
* minimum current rf_left + 1 distinct racks. See above, _acceptable_rack_repeats
* is defined as RF - rack_count, i.e. how many nodes in a single rack we are ok
* with.
*
* With RF = 3 and 2 Racks in DC,
*
* IP1, Rack1
* IP2, Rack1
* IP3, Rack1, The line _acceptable_rack_repeats <= 0 will reject IP3.
* IP4, Rack2
*
*/
if (_acceptable_rack_repeats <= 0) {
// There must be rf_left distinct racks left, do not add any more rack repeats.
return false;
}
if (!_endpoints.insert(ep).second) {
// Cannot repeat a node.
return false;
}
// Added a node that is from an already met rack to match RF when there aren't enough racks.
--_acceptable_rack_repeats;
--_rf_left;
return done();
}
bool done() const {
return _rf_left == 0;
}
};
const token_metadata& _tm;
const topology& _tp;
network_topology_strategy::dc_rep_factor_map _dc_rep_factor;
//
// We want to preserve insertion order so that the first added endpoint
// becomes primary.
//
host_id_set _replicas;
// tracks the racks we have already placed replicas in
endpoint_dc_rack_set _seen_racks;
//
// all token owners in each DC, so we can check when we have exhausted all
// the token-owning members of a DC
//
std::unordered_map<sstring, std::unordered_set<locator::host_id>> _token_owners;
//
// all racks (with non-token owners filtered out) in a DC so we can check
// when we have exhausted all racks in a DC
//
std::unordered_map<sstring, std::unordered_map<sstring, std::unordered_set<locator::host_id>>> _racks;
std::unordered_map<std::string_view, data_center_endpoints> _dcs;
size_t _dcs_to_fill;
public:
natural_endpoints_tracker(const token_metadata& tm, const network_topology_strategy::dc_rep_factor_map& dc_rep_factor)
: _tm(tm)
, _tp(_tm.get_topology())
, _dc_rep_factor(dc_rep_factor)
, _token_owners(_tm.get_datacenter_token_owners())
, _racks(_tm.get_datacenter_racks_token_owners())
{
// not aware of any cluster members
SCYLLA_ASSERT(!_token_owners.empty() && !_racks.empty());
auto size_for = [](auto& map, auto& k) {
auto i = map.find(k);
return i != map.end() ? i->second.size() : size_t(0);
};
// Create a data_center_endpoints object for each non-empty DC.
for (auto& [dc, rf_data] : _dc_rep_factor) {
auto node_count = size_for(_token_owners, dc);
auto rf = rf_data.count();
if (rf == 0 || node_count == 0) {
continue;
}
_dcs.emplace(dc, data_center_endpoints(rf, size_for(_racks, dc), node_count, _replicas, _seen_racks));
_dcs_to_fill = _dcs.size();
}
}
bool add_endpoint_and_check_if_done(host_id ep) {
auto& loc = _tp.get_location(ep);
auto i = _dcs.find(loc.dc);
if (i != _dcs.end() && i->second.add_endpoint_and_check_if_done(ep, loc)) {
--_dcs_to_fill;
}
return done();
}
bool done() const noexcept {
return _dcs_to_fill == 0;
}
host_id_set& replicas() noexcept {
return _replicas;
}
static void check_enough_endpoints(const token_metadata& tm, const network_topology_strategy::dc_rep_factor_map& dc_rf) {
auto dc_endpoints = tm.get_datacenter_token_owners();
auto endpoints_in = [&dc_endpoints](sstring dc) {
auto i = dc_endpoints.find(dc);
return i != dc_endpoints.end() ? i->second.size() : size_t(0);
};
for (const auto& [dc, rf_data] : dc_rf) {
auto rf = rf_data.count();
if (rf > endpoints_in(dc)) {
throw exceptions::configuration_exception(seastar::format(
"Datacenter {} doesn't have enough token-owning nodes for replication_factor={}", dc, rf));
}
}
}
};
/// Walks tm.ring_range(search_token) and offers the endpoint of each token to a
/// natural_endpoints_tracker until the tracker reports that it is done (or the range
/// ends), then returns the replicas the tracker accepted.
/// Yields to the reactor before processing each token.
future<host_id_set>
network_topology_strategy::calculate_natural_endpoints(
        const token& search_token, const token_metadata& tm) const {
    natural_endpoints_tracker tracker(tm, _dc_rep_factor);

    for (auto& ring_token : tm.ring_range(search_token)) {
        co_await coroutine::maybe_yield();

        host_id owner = *tm.get_endpoint(ring_token);
        const bool all_filled = tracker.add_endpoint_and_check_if_done(owner);
        if (all_filled) {
            break;
        }
    }

    co_return std::move(tracker.replicas());
}
/// Validates the configured options against the given topology.
///
/// After validate_tablet_options() has run, every entry of _config_options must name a
/// datacenter known to `topology`; otherwise exceptions::configuration_exception is
/// thrown. The entry's replication factor is parsed and validated against the racks of
/// that datacenter. A literal 'replication_factor' key is reported via on_internal_error().
void network_topology_strategy::validate_options(const gms::feature_service& fs, const locator::topology& topology) const {
    // #22688 / #20039 - DCs are removed once their rf drops to 0, and setting rf=0
    // in _all_ DCs is permitted as well, so an empty option set is acceptable here
    // and is deliberately not rejected.
    auto known_dcs = topology.get_datacenter_racks();

    validate_tablet_options(*this, fs, _config_options);

    for (auto& [option_name, option_value] : _config_options) {
        if (option_name == sstring("replication_factor")) {
            on_internal_error(rslogger, fmt::format("'replication_factor' tag should be unrolled into a list of DC:RF by now."
                                                    "_config_options:{}", _config_options));
        }

        auto dc_entry = known_dcs.find(option_name);
        if (dc_entry == known_dcs.end()) {
            throw exceptions::configuration_exception(format("Unrecognized strategy option {{{}}} "
                "passed to NetworkTopologyStrategy", this->to_qualified_class_name(option_name)));
        }

        auto rack_names = dc_entry->second | std::views::keys | std::ranges::to<std::unordered_set<sstring>>();
        auto rf = parse_replication_factor(option_value);
        rf.validate(rack_names);
    }
}
/// Creates the replication map for `table` via do_make_replication_map().
/// Only valid when the strategy uses tablets; any other use is reported through
/// on_internal_error().
effective_replication_map_ptr network_topology_strategy::make_replication_map(table_id table, token_metadata_ptr tm) const {
    if (uses_tablets()) {
        return do_make_replication_map(table, shared_from_this(), std::move(tm), _rep_factor);
    }
    on_internal_error(rslogger, format("make_replication_map() called for table {} but replication strategy not configured to use tablets", table));
}
/// Builds a fresh tablet_map with `tablet_count` tablets and lets reallocate_tablets()
/// fill in its replicas. The second tablet_map constructor argument is true whenever the
/// configured consistency is anything other than `eventual`.
future<tablet_map> network_topology_strategy::allocate_tablets_for_new_table(schema_ptr s, token_metadata_ptr tm, size_t tablet_count) const {
    const bool non_eventual = get_consistency() != data_dictionary::consistency_config_option::eventual;
    tablet_map initial_map(tablet_count, non_eventual);
    co_return co_await reallocate_tablets(std::move(s), std::move(tm), std::move(initial_map));
}
/// Recomputes the replica set of every tablet in `tablets` and returns the updated map.
///
/// First checks that each datacenter has enough token owners for its replication factor
/// (natural_endpoints_tracker::check_enough_endpoints throws otherwise). A load_sketch
/// populated up front is shared by all per-tablet allocations. When the map carries raft
/// info, a tablet that has no group id yet is given a new time-UUID based one.
future<tablet_map> network_topology_strategy::reallocate_tablets(schema_ptr s, token_metadata_ptr tm, tablet_map tablets) const {
    natural_endpoints_tracker::check_enough_endpoints(*tm, _dc_rep_factor);

    load_sketch load(tm);
    co_await load.populate_with_normalized_load();
    co_await load.populate(std::nullopt, s->id());

    tablet_logger.debug("Allocating tablets for {}.{} ({}): dc_rep_factor={} tablet_count={}", s->ks_name(), s->cf_name(), s->id(), _dc_rep_factor, tablets.tablet_count());

    for (tablet_id tb : tablets.tablet_ids()) {
        auto updated_info = tablets.get_tablet_info(tb);
        updated_info.replicas = co_await reallocate_tablets(s, tm, load, tablets, tb);

        const bool missing_group_id = tablets.has_raft_info() && !tablets.get_tablet_raft_info(tb).group_id;
        if (missing_group_id) {
            tablets.set_tablet_raft_info(tb, tablet_raft_info {
                .group_id = raft::group_id{utils::UUID_gen::get_time_UUID()}
            });
        }

        tablets.set_tablet(tb, std::move(updated_info));
    }

    tablet_logger.debug("Allocated tablets for {}.{} ({}): dc_rep_factor={}: {}", s->ks_name(), s->cf_name(), s->id(), _dc_rep_factor, tablets);
    co_return tablets;
}
/// Computes the new replica set for tablet `tb`, starting from its current replicas in
/// `cur_tablets` and adjusting them, datacenter by datacenter, to the configured
/// replication factors.
///
/// For a datacenter whose replication factor is rack-based, replicas are added to or
/// dropped from whole racks; adding and removing racks in the same step throws
/// std::runtime_error. For any other datacenter the replica count is grown or shrunk to
/// the configured count (0 when the datacenter has no configured factor).
future<tablet_replica_set> network_topology_strategy::reallocate_tablets(schema_ptr s, token_metadata_ptr tm, load_sketch& load, const tablet_map& cur_tablets, tablet_id tb) const {
    // Current number of replicas per dc
    std::unordered_map<sstring, size_t> replica_count_per_dc;
    // Current replicas per dc/rack
    std::unordered_map<sstring, std::map<sstring, std::unordered_set<locator::host_id>>> replicas_per_dc_rack;
    // Racks currently holding a replica, per dc
    std::unordered_map<sstring, rack_list> old_racks_per_dc;

    tablet_replica_set replicas = cur_tablets.get_tablet_info(tb).replicas;

    for (const auto& tr : replicas) {
        const auto& node = tm->get_topology().get_node(tr.host);
        const auto& where = node.dc_rack();
        replicas_per_dc_rack[where.dc][where.rack].insert(tr.host);
        ++replica_count_per_dc[where.dc];
        old_racks_per_dc[where.dc].push_back(where.rack);
    }

    // #22688 - every dc in the topology is considered, not only the configured ones.
    // Any change should still have been pre-checked to never exceed rf factor one.
    for (const auto& dc : tm->get_topology().get_datacenters()) {
        auto new_rf = get_replication_factor_data(dc);

        if (new_rf && new_rf->is_rack_based()) {
            auto& old_racks = old_racks_per_dc[dc];
            auto diff = diff_racks(old_racks, new_rf->get_rack_list());

            tablet_logger.debug("reallocate_tablets {}.{} tablet_id={} dc={} old_racks={} add_racks={} del_racks={}",
                    s->ks_name(), s->cf_name(), tb, dc, old_racks, diff.added, diff.removed);

            if (!diff) {
                continue;
            }

            const bool adding = !diff.added.empty();
            const bool removing = !diff.removed.empty();
            if (adding && removing) {
                throw std::runtime_error("replacing racks unsupported");
            }
            if (adding) {
                replicas = add_tablets_in_racks(s, tm, load, tb, replicas, dc, diff.added);
            } else {
                replicas = drop_tablets_in_racks(s, tm, load, tb, replicas, dc, diff.removed);
            }
            continue;
        }

        auto dc_rf = new_rf ? new_rf->count() : 0;
        auto dc_node_count = replica_count_per_dc[dc];

        if (dc_rf > dc_node_count) {
            replicas = co_await add_tablets_in_dc(s, tm, load, tb, replicas_per_dc_rack[dc], replicas, dc, dc_node_count, dc_rf);
        } else if (dc_rf < dc_node_count) {
            replicas = drop_tablets_in_dc(s, tm->get_topology(), load, tb, replicas, dc, dc_node_count, dc_rf);
        }
    }

    co_return replicas;
}
/// Returns cur_replicas without the replicas located in `dc` on any rack listed in
/// racks_to_drop. Each removed replica is also unloaded from `load`.
tablet_replica_set network_topology_strategy::drop_tablets_in_racks(schema_ptr s,
        token_metadata_ptr tm,
        load_sketch& load,
        tablet_id tb,
        const tablet_replica_set& cur_replicas,
        const sstring& dc,
        const rack_list& racks_to_drop) const {
    auto& topo = tm->get_topology();
    tablet_replica_set kept;

    for (const auto& tr : cur_replicas) {
        auto& node = topo.get_node(tr.host);
        const auto& where = node.dc_rack();

        const bool dropped = where.dc == dc && std::ranges::contains(racks_to_drop, where.rack);
        if (!dropped) {
            kept.emplace_back(tr);
            continue;
        }

        tablet_logger.debug("drop_tablets_in_rack {}.{} tablet_id={} dc={} rack={} removing replica: {}",
                s->ks_name(), s->cf_name(), tb, where.dc, where.rack, tr);
        load.unload(tr.host, tr.shard, 1, service::default_target_tablet_size);
    }

    return kept;
}
/// Returns cur_replicas extended with one new replica per rack in racks_to_add.
///
/// In each rack the normal node with the lowest load.get_real_avg_shard_load() is chosen
/// (the first one encountered wins ties) and its shard is picked by load.next_shard().
/// Throws std::runtime_error when a rack has no eligible node.
tablet_replica_set network_topology_strategy::add_tablets_in_racks(schema_ptr s,
        token_metadata_ptr tm,
        load_sketch& load,
        tablet_id tb,
        const tablet_replica_set& cur_replicas,
        const sstring& dc,
        const rack_list& racks_to_add) const {
    auto nodes_by_dc = tm->get_datacenter_racks_token_owners_nodes();
    auto& racks_in_dc = nodes_by_dc.at(dc);
    auto result = cur_replicas;

    for (auto&& rack: racks_to_add) {
        host_id best_node;
        double best_load = std::numeric_limits<double>::max();

        for (auto&& node_ref: racks_in_dc.at(rack)) {
            const auto& candidate = node_ref.get();
            if (candidate.is_normal()) {
                // A rack is only added when it holds no replica of this tablet yet,
                // so every normal node in it is eligible.
                // FIXME: pick based on storage utilization: https://github.com/scylladb/scylladb/issues/26366
                auto candidate_load = load.get_real_avg_shard_load(candidate.host_id());
                if (candidate_load < best_load) {
                    best_load = candidate_load;
                    best_node = candidate.host_id();
                }
            }
        }

        if (!best_node) {
            throw std::runtime_error(
                    fmt::format("No candidate node in rack {}.{} to allocate tablet replica", dc, rack));
        }

        auto added = tablet_replica{best_node, load.next_shard(best_node, 1, service::default_target_tablet_size)};
        result.push_back(added);

        tablet_logger.trace("add_tablet_in_rack {}.{} tablet_id={} dc={} rack={} load={} new_replica={}",
                s->ks_name(), s->cf_name(), tb.id, dc, rack, best_load, added);
    }

    return result;
}
// Chooses the (dc_rf - dc_node_count) additional replicas this tablet needs in `dc`
// and returns cur_replicas extended with them.
//
// Selection order:
//  - racks holding no replica of the tablet yet are tried before racks that already hold one;
//  - within each of the two groups the starting rack is rotated by tb.id;
//  - within a rack the least loaded candidate (per load.get_real_avg_shard_load()) goes first,
//    with nodes of equal load in random order.
// Only nodes that are normal, not draining, and not already listed in replicas_per_rack
// are candidates.
//
// Side effects: replicas_per_rack gains every chosen node and `load` is advanced through
// load.next_shard(). Running out of candidates is reported via on_internal_error().
future<tablet_replica_set> network_topology_strategy::add_tablets_in_dc(schema_ptr s, token_metadata_ptr tm, load_sketch& load, tablet_id tb,
std::map<sstring, std::unordered_set<locator::host_id>>& replicas_per_rack,
const tablet_replica_set& cur_replicas,
sstring dc, size_t dc_node_count, size_t dc_rf) const {
// Per-thread random engine; used below only to shuffle candidate nodes before the stable sort.
static thread_local std::default_random_engine rnd_engine{std::random_device{}()};
auto replicas = cur_replicas;
// all_dc_racks is ordered lexicographically on purpose
auto all_dc_racks = tm->get_datacenter_racks_token_owners_nodes().at(dc)
| std::ranges::to<std::map>();
// Track all nodes with no replicas on them for this tablet, per rack.
struct node_load {
locator::host_id host;
double load;
};
// for sorting in descending load order
// (in terms of load)
auto node_load_cmp = [] (const node_load& a, const node_load& b) {
return a.load > b.load;
};
// Candidate nodes of one rack, kept sorted so that the least loaded node is at the back.
struct rack_candidates {
sstring rack;
utils::small_vector<node_load, 3> nodes;
};
using candidates_list = std::vector<rack_candidates>;
// Racks that already hold at least one replica of this tablet.
candidates_list existing_racks;
// We use this list to start allocating from an
// unpopulated rack.
candidates_list new_racks;
for (const auto& [rack, nodes] : all_dc_racks) {
co_await coroutine::maybe_yield();
if (nodes.empty()) {
continue;
}
// operator[] default-inserts an empty set for a rack that has no replica yet.
const auto& existing = replicas_per_rack[rack];
candidates_list& rack_list = existing.empty() ? new_racks : existing_racks;
auto& candidate = rack_list.emplace_back(rack);
for (const auto& node : nodes) {
if (!node.get().is_normal() || node.get().is_draining()) {
continue;
}
const auto& host_id = node.get().host_id();
if (!existing.contains(host_id)) {
// FIXME: https://github.com/scylladb/scylladb/issues/26366
candidate.nodes.emplace_back(host_id, load.get_real_avg_shard_load(host_id));
}
}
if (candidate.nodes.empty()) {
// Undo the emplace_back above: this rack contributes no candidates.
rack_list.pop_back();
tablet_logger.trace("allocate_replica {}.{}: no candidate nodes left on rack={}", s->ks_name(), s->cf_name(), rack);
// Note that this rack can't be in new_racks since
// those had no existing replicas and if current rack has no nodes
// we skip it in the beginning of the loop body
continue;
}
// Sort candidate nodes in each rack in descending load order
// so we want to allocate first from the least loaded nodes.
// Do shuffle + stable_sort to shuffle nodes with equal load.
std::shuffle(candidate.nodes.begin(), candidate.nodes.end(), rnd_engine);
std::stable_sort(candidate.nodes.begin(), candidate.nodes.end(), node_load_cmp);
}
candidates_list candidate_racks;
// ensure fairness across racks (in particular if rf < number_of_racks)
// by rotating the racks order
auto append_candidate_racks = [&] (candidates_list& racks) {
if (auto size = racks.size()) {
// Start at index tb.id % size, then wrap around to the racks before it.
auto it = racks.begin() + tb.id % size;
std::move(it, racks.end(), std::back_inserter(candidate_racks));
std::move(racks.begin(), it, std::back_inserter(candidate_racks));
}
};
// Unpopulated racks go first so that new replicas land on racks without one.
append_candidate_racks(new_racks);
append_candidate_racks(existing_racks);
if (candidate_racks.empty()) {
on_internal_error(tablet_logger,
seastar::format("allocate_replica {}.{}: no candidate racks found for dc={} allocated={} rf={}: existing={}",
s->ks_name(), s->cf_name(), dc, dc_node_count, dc_rf, replicas_per_rack));
}
auto candidate_rack = candidate_racks.begin();
// Takes the last (least loaded) node of the rack `candidate` points at, records it in
// replicas_per_rack, and moves `candidate` on to the next rack, wrapping around at the end.
// A rack left without nodes is erased from candidate_racks.
auto allocate_replica = [&] (candidates_list::iterator& candidate) {
const auto& rack = candidate->rack;
auto& nodes = candidate->nodes;
if (nodes.empty()) {
on_internal_error(tablet_logger,
seastar::format("allocate_replica {}.{} tablet_id={}: candidates vector for rack={} is empty for allocating tablet replicas in dc={} allocated={} rf={}",
s->ks_name(), s->cf_name(), tb.id, rack, dc, dc_node_count, dc_rf));
}
auto host_id = nodes.back().host;
auto replica = tablet_replica{host_id, load.next_shard(host_id, 1, service::default_target_tablet_size)};
const auto& node = tm->get_topology().get_node(host_id);
auto inserted = replicas_per_rack[node.dc_rack().rack].insert(host_id).second;
// Sanity check that a node is not used more than once
if (!inserted) {
on_internal_error(tablet_logger,
seastar::format("allocate_replica {}.{} tablet_id={}: allocated replica={} node already used when allocating tablet replicas in dc={} allocated={} rf={}: replicas={}",
s->ks_name(), s->cf_name(), tb.id, replica, dc, dc_node_count, dc_rf, replicas));
}
nodes.pop_back();
tablet_logger.trace("allocate_replica {}.{} tablet_id={}: allocated tablet replica={} dc={} rack={}: nodes remaining in rack={}",
s->ks_name(), s->cf_name(), tb.id, replica, node.dc_rack().dc, node.dc_rack().rack, nodes.size());
// NOTE: `rack` and `nodes` refer into *candidate and must not be used after the erase below.
if (nodes.empty()) {
candidate = candidate_racks.erase(candidate);
} else {
++candidate;
}
// Wrap around. After this, candidate == end() only when candidate_racks has become empty.
if (candidate == candidate_racks.end()) {
candidate = candidate_racks.begin();
}
if (tablet_logger.is_enabled(log_level::trace)) {
if (candidate != candidate_racks.end()) {
tablet_logger.trace("allocate_replica {}.{} tablet_id={}: next rack={} nodes={}", s->ks_name(), s->cf_name(), tb.id, candidate->rack, candidate->nodes.size());
} else {
tablet_logger.trace("allocate_replica {}.{} tablet_id={}: no candidate racks left", s->ks_name(), s->cf_name(), tb.id);
}
}
return replica;
};
tablet_logger.debug("allocate_replica {}.{} tablet_id={}: allocating tablet replicas in dc={} allocated={} rf={}",
s->ks_name(), s->cf_name(), tb.id, dc, dc_node_count, dc_rf);
// Unsigned subtraction: relies on dc_rf >= dc_node_count.
// NOTE(review): the caller visible in this file only calls this when dc_rf > dc_node_count — confirm for other callers.
for (size_t remaining = dc_rf - dc_node_count; remaining; --remaining) {
co_await coroutine::maybe_yield();
if (candidate_rack == candidate_racks.end()) {
on_internal_error(tablet_logger,
format("allocate_replica {}.{} tablet_id={}: ran out of candidates for allocating tablet replicas in dc={} allocated={} rf={}: remaining={}",
s->ks_name(), s->cf_name(), tb.id, dc, dc_node_count, dc_rf, remaining));
}
replicas.emplace_back(allocate_replica(candidate_rack));
}
co_return replicas;
}
/// Returns cur_replicas reduced so that only the first dc_rf replicas located in `dc`
/// remain; replicas in other datacenters are kept untouched. Every dropped replica is
/// unloaded from `load`.
tablet_replica_set network_topology_strategy::drop_tablets_in_dc(schema_ptr s, const locator::topology& topo, load_sketch& load, tablet_id tb,
        const tablet_replica_set& cur_replicas,
        sstring dc, size_t dc_node_count, size_t dc_rf) const {
    tablet_logger.debug("drop_tablets_in_dc {}.{} tablet_id={}: deallocating tablet replicas in dc={} allocated={} rf={}", s->ks_name(), s->cf_name(), tb.id, dc, dc_node_count, dc_rf);

    // Keeping the first dc_rf replicas of the dc drops the later ones, i.e. replicas are
    // effectively deallocated in reverse order. This maintains replica pairing between
    // the base table and its materialized views. Racks may end up unbalanced, which is
    // acceptable because the tablet load balancer can fix that later.
    const size_t drop_count = dc_node_count - dc_rf;
    tablet_replica_set kept;
    kept.reserve(cur_replicas.size() - drop_count);

    size_t seen_in_dc = 0;
    for (const auto& tr : cur_replicas) {
        const bool in_target_dc = topo.get_node(tr.host).dc_rack().dc == dc;
        // The counter advances only for replicas located in the target dc.
        if (in_target_dc && ++seen_in_dc > dc_rf) {
            load.unload(tr.host, tr.shard, 1, service::default_target_tablet_size);
            continue;
        }
        kept.emplace_back(tr);
    }

    return kept;
}
/// Checks that no datacenter contributes more read replicas than its replication factor.
/// Returns a description of the first violation found, or an empty string when there is none.
sstring network_topology_strategy::sanity_check_read_replicas(const effective_replication_map& erm,
                                                              const host_id_vector_replica_set& read_replicas) const {
    const auto& topology = erm.get_topology();

    struct rf_node_count {
        size_t replication_factor{0};
        size_t node_count{0};
    };

    // Per datacenter: its replication factor and the number of read replicas seen in it.
    absl::flat_hash_map<sstring, rf_node_count> per_dc;

    for (const auto& node : read_replicas) {
        auto [slot, first_seen] = per_dc.emplace(topology.get_datacenter(node), rf_node_count{0, 0});
        if (first_seen) {
            // The replication factor is looked up once, when the datacenter is first met.
            slot->second.replication_factor = get_replication_factor(slot->first);
        }
        ++slot->second.node_count;
    }

    for (const auto& [dc_name, tally] : per_dc) {
        if (tally.node_count > tally.replication_factor) {
            return seastar::format("network_topology_strategy: ERM inconsistency, Datacenter [{}] has higher count of read replicas (accounting for "
                                   "current consistency level): [{}] than its replication factor [{}]",
                                   dc_name, tally.node_count, tally.replication_factor);
        }
    }

    return sstring();
}
// Registers network_topology_strategy under both its fully qualified and its short class name.
// Note: signature must match the class_registry signature defined and used by abstract_replication_strategy::to_qualified_class_name
using registry = class_registrator<abstract_replication_strategy, network_topology_strategy, replication_strategy_params, const topology*>;
static registry registrator("org.apache.cassandra.locator.NetworkTopologyStrategy");
static registry registrator_short_name("NetworkTopologyStrategy");
}