Files
scylladb/test/lib/topology_builder.hh
Tomasz Grabiec b6a7023f68 tablets: Prepare for non-power-of-two tablet count
This is a step towards more flexibility in managing tablets. It is a
prerequisite for splitting individual tablets, isolating hot
partitions, and evening out tablet sizes by shifting boundaries.

After this patch, the system can handle tables with an arbitrary tablet
count. The tablet allocator still rounds the desired tablet count up to
the nearest power of two when allocating tablets for a new table, so
unless the tablet map is allocated in some other way, the counts will
still be a power of two.

We plan to utilize arbitrary count when migrating from vnodes to
tablets, by creating a tablet map which matches vnode boundaries.

One of the reasons we don't give up on power-of-two by default yet is
that it creates an issue with merges. If tablet count is odd, one of
the tablets doesn't have a sibling and will not be merged. That can
obviously cause imbalance of token space and tablet sizes between
tablets. To limit the impact, this patch dynamically chooses which
tablet to isolate when initiating a merge. The largest tablet is
chosen, as that will minimize imbalance. Otherwise, if we always chose
the last tablet to isolate, its size would remain the same while other
tablets double in size with each odd-count merge, leading to
imbalance. The imbalance will still be there, but the difference in
tablet sizes is limited to 2x.

Example (3 tablets):
  [0] owns 1/3 of tokens
  [1] owns 1/3 of tokens
  [2] owns 1/3 of tokens

After merge:
  [0] owns 2/3 of tokens
  [1] owns 1/3 of tokens

What we would like instead:

Step 1 (split [1]):
  [0] owns 1/3 of tokens
  [1] old 1.left, owns 1/6 of tokens
  [2] old 1.right, owns 1/6 of tokens
  [3] owns 1/3 of tokens

Step 2 (merge):
  [0] owns 1/2 of tokens
  [1] owns 1/2 of tokens

To do that, we need to be able to split individual tablets, but we're
not there yet.
2026-04-15 10:40:55 +02:00

395 lines
16 KiB
C++

/*
* Copyright (C) 2025-present ScyllaDB
*
*/
/*
* SPDX-License-Identifier: (LicenseRef-ScyllaDB-Source-Available-1.1 and Apache-2.0)
*/
#pragma once
#include "cql_test_env.hh"
#include "locator/topology.hh"
#include "gms/inet_address.hh"
#include "service/topology_mutation.hh"
#include "service/topology_state_machine.hh"
#include "service/raft/raft_group0_client.hh"
#include "service/tablet_allocator_fwd.hh"
#include "locator/host_id.hh"
#include "locator/tablets.hh"
#include "test/lib/log.hh"
#include "version.hh"
#include <atomic>
struct shared_load_stats {
locator::load_stats stats;
locator::load_stats_ptr get() const {
return make_lw_shared(stats);
}
void set_size(table_id table, size_t size_in_bytes) {
stats.tables[table].size_in_bytes = size_in_bytes;
}
void set_split_ready_seq_number(table_id table, size_t seq_number) {
stats.tables[table].split_ready_seq_number = seq_number;
}
void set_capacity(locator::host_id host, size_t capacity) {
stats.capacity[host] = capacity;
stats.tablet_stats[host].effective_capacity = capacity;
}
void set_tablet_size(locator::host_id host, const locator::range_based_tablet_id& rb_tid, uint64_t tablet_size) {
stats.tablet_stats[host].tablet_sizes[rb_tid.table][rb_tid.range] = tablet_size;
}
void set_default_tablet_sizes(locator::token_metadata_ptr tmptr) {
for (auto&& [table, tmap_ptr] : tmptr->tablets().all_tables_ungrouped()) {
tmap_ptr->for_each_tablet([&] (locator::tablet_id tid, const locator::tablet_info& tinfo) -> future<> {
locator::range_based_tablet_id rb_tid {table, tmap_ptr->get_token_range(tid)};
for (auto& replica : tinfo.replicas) {
if (!stats.get_tablet_size(replica.host, rb_tid)) {
stats.tablet_stats[replica.host].tablet_sizes[table][rb_tid.range] = service::default_target_tablet_size;
}
}
return make_ready_future<>();
}).get();
}
}
void set_tablet_sizes(locator::token_metadata_ptr tmptr, table_id table, uint64_t tablet_size) {
auto& tmap = tmptr->tablets().get_tablet_map(table);
tmap.for_each_tablet([&] (locator::tablet_id tid, const locator::tablet_info& tinfo) -> future<> {
locator::range_based_tablet_id rb_tid {table, tmap.get_token_range(tid)};
for (auto& replica : tinfo.replicas) {
stats.tablet_stats[replica.host].tablet_sizes[table][rb_tid.range] = tablet_size;
}
return make_ready_future<>();
}).get();
set_size(table, tablet_size * tmap.tablet_count());
}
};
/// Modifies topology inside a given cql_test_env.
/// Local node's membership is not affected, but it belongs to a different DC than those produced by this builder.
///
/// Creating the builder locks the topology state machine so there are no concurrent topology operations
/// and load balancing.
/// The built topology is not removed when the builder is destroyed and the state machine is left locked.
///
/// All methods expect to be run in a seastar thread.
///
/// Example usage (the DC/rack names in the comments are illustrative; generated names have the
/// form "dc<id>" and "rack<id><letter>", where <id> comes from a process-wide counter):
///
///   topology_builder topo(e);
///   auto host1 = topo.add_node(); // dc1 rack1
///   auto host2 = topo.add_node(); // dc1 rack1
///   topo.start_new_dc();
///   auto host3 = topo.add_node(); // dc2 rack1
///   auto host4 = topo.add_node(); // dc2 rack1
///   topo.start_new_rack();
///   auto host5 = topo.add_node(); // dc2 rack2
///   auto host6 = topo.add_node(); // dc2 rack2
///
class topology_builder {
public:
    using inet_address = locator::inet_address;
    using endpoint_dc_rack = locator::endpoint_dc_rack;
private:
    cql_test_env& _env;
    int _nr_nodes = 0;
    // Both are assigned by start_new_dc(), which the constructor calls.
    // Zero-initialized so that they never hold an indeterminate value.
    int _dc_id = 0;
    int _rack_id = 0;
    sstring _dc;
    sstring _rack;
    shared_load_stats _load_stats;
    std::vector<locator::host_id> _hosts;
    std::unordered_map<locator::host_id, gms::inet_address> _host_addresses;
private:
    // Maps node ordinal n (must be > 0) to the address "10.x.y.z", where z, y and x are
    // the base-256 digits of n, least significant first. Asserts that n fits in three digits.
    inet_address make_node_address(int n) {
        assert(n > 0);
        int a = n % 256;
        n /= 256;
        int b = n % 256;
        n /= 256;
        assert(n < 256);
        return inet_address(fmt::format("10.{}.{}.{}", n, b, a));
    }

    // Locks the topology to prevent concurrent topology operations and load balancing.
    // Setting transition_state to "lock" blocks background load-balancing which could interfere with the test
    // and prevents errors from load_topology_state() complaining about nodes in transition with no transition state.
    void lock_topology() {
        abort_source as;
        auto& client = _env.get_raft_group0_client();
        service::topology& topo = _env.get_topology_state_machine().local()._topology;
        while (true) {
            if (topo.tstate && *topo.tstate == service::topology::transition_state::lock) {
                testlog.info("Topology is locked");
                return;
            }
            auto guard = client.start_operation(as).get();
            if (topo.tstate) {
                // Some other transition is in progress. Give the guard back and wait before trying again.
                testlog.info("Waiting for topology state machine to be idle");
                release_guard(std::move(guard));
                _env.get_topology_state_machine().local().await_not_busy().get();
                testlog.info("Woken up");
                continue;
            }
            service::topology_change change({service::topology_mutation_builder(guard.write_timestamp())
                    .set_transition_state(service::topology::transition_state::lock)
                    .build()});
            service::group0_command g0_cmd = client.prepare_command(std::move(change), guard, "locking topology");
            try {
                // No break on success: the loop goes around and returns from the check at the top
                // once the "lock" state is visible in topo.tstate.
                client.add_entry(std::move(g0_cmd), std::move(guard), as).get();
            } catch (service::group0_concurrent_modification&) {
                testlog.info("Concurrent modification detected while locking topology, retrying");
            }
        }
    }
public:
    // Starts the first DC (with its first rack) and locks the topology.
    topology_builder(cql_test_env& e)
        : _env(e)
    {
        start_new_dc();
        lock_topology();
    }

    // Returns a new token from some sequence of unique tokens.
    // Uniqueness is in the scope of the process, not just this object.
    dht::token new_token() {
        static std::atomic<int64_t> next_token = 1;
        return dht::token(next_token.fetch_add(1));
    }

    // Returns the name of the currently built DC.
    const sstring& dc() const {
        return _dc;
    }

    // Returns location of the currently built rack.
    endpoint_dc_rack rack() const {
        return {_dc, _rack};
    }

    // Starts building a new, automatically named rack in the current DC.
    // The name is "rack" followed by the current DC id and a letter which starts at 'a'
    // and advances with every call since the last start_new_dc().
    // Returns location of the new rack.
    endpoint_dc_rack start_new_rack() {
        _rack = fmt::format("rack{}{:c}", _dc_id, 'a' + _rack_id++);
        return rack();
    }

    // Starts building a rack with the given name in the current DC.
    // Does not advance the counter used for automatically named racks.
    // Returns location of the new rack.
    endpoint_dc_rack start_new_rack(sstring rack_name) {
        _rack = std::move(rack_name);
        return rack();
    }

    // Starts building a new DC, together with its first automatically named rack.
    // DC is named uniquely in the scope of the process, not just this object.
    // Returns location of the new rack.
    endpoint_dc_rack start_new_dc() {
        static std::atomic<int> next_id = 1;
        _dc_id = next_id.fetch_add(1);
        _dc = fmt::format("dc{}", _dc_id);
        _rack_id = 0;
        return start_new_rack();
    }

    // Starts building in the given DC and rack.
    // The DC id and rack counter used for automatic naming are left as they were,
    // so a later start_new_rack() continues the previous automatic numbering.
    endpoint_dc_rack start_new_dc(endpoint_dc_rack dc_and_rack) {
        _dc = dc_and_rack.dc;
        _rack = dc_and_rack.rack;
        return rack();
    }

    // Returns a copy of the load stats accumulated so far.
    locator::load_stats_ptr get_load_stats() const {
        return _load_stats.get();
    }

    // Gives direct, mutable access to the load stats holder.
    shared_load_stats& get_shared_load_stats() {
        return _load_stats;
    }

    /// Returns total cluster's storage capacity in bytes.
    /// Sums the capacities of all hosts added through this builder.
    uint64_t get_capacity() const {
        uint64_t cap = 0;
        // Read the capacities in place. Going through get_load_stats() would copy
        // the whole load_stats object just to compute this sum.
        const auto& capacity = _load_stats.stats.capacity;
        for (const auto& h : _hosts) {
            cap += capacity.at(h);
        }
        return cap;
    }

    // Adds a node to the topology in the current rack, or in rack_override if given.
    //
    // The node gets a fresh time-UUID host id, a single token from new_token(), and an initial
    // capacity of service::default_target_tablet_size * shard_count in the load stats.
    // The address produced for the node is recorded in host_addresses() and logged;
    // it is not part of the written topology row.
    //
    // Returns the host id of the new node.
    locator::host_id add_node(service::node_state state = service::node_state::normal,
                              unsigned shard_count = 1,
                              std::optional<endpoint_dc_rack> rack_override = {})
    {
        ++_nr_nodes;
        auto ip = make_node_address(_nr_nodes);
        auto id = locator::host_id(utils::UUID_gen::get_time_UUID());
        auto dc_rack = rack_override.value_or(rack());
        dht::token token = new_token();
        std::unordered_set<dht::token> tokens({token});
        _load_stats.set_capacity(id, service::default_target_tablet_size * shard_count);
        // The callback runs once per attempt, so the row is rebuilt with a fresh
        // write timestamp whenever the commit has to be retried.
        modify_topology([&] (service::topology_mutation_builder& builder) {
            builder.with_node(raft::server_id(id.uuid()))
                   .set("datacenter", dc_rack.dc)
                   .set("rack", dc_rack.rack)
                   .set("node_state", state)
                   .set("shard_count", (uint32_t) shard_count)
                   .set("cleanup_status", service::cleanup_status::clean)
                   .set("release_version", version::release())
                   .set("num_tokens", (uint32_t) 1)
                   .set("tokens_string", fmt::format("{}", token))
                   .set("tokens", tokens)
                   .set("supported_features", std::set<sstring>())
                   .set("request_id", utils::UUID())
                   .set("ignore_msb", (uint32_t) 0);
            testlog.info("Adding node {}/{} dc={} rack={} to topology", id, ip, dc_rack.dc, dc_rack.rack);
        }, format("adding node {} to topology", id));
        _hosts.push_back(id);
        _host_addresses.emplace(id, ip);
        return id;
    }

    // Adds a normal node with 7 shards to the given rack and sets its capacity to 1'875'000'000'000.
    locator::host_id add_i4i_2xlarge(endpoint_dc_rack rack) {
        auto h = add_node(service::node_state::normal, 7, rack);
        get_shared_load_stats().set_capacity(h, 1'875'000'000'000);
        return h;
    }

    // Adds a normal node with 2 shards to the given rack and sets its capacity to 468'000'000'000.
    locator::host_id add_i4i_large(endpoint_dc_rack rack) {
        auto h = add_node(service::node_state::normal, 2, rack);
        get_shared_load_stats().set_capacity(h, 468'000'000'000);
        return h;
    }

    // Commits a topology_change to group0 made of the mutations which func appends to its second argument.
    //
    // func is invoked with a freshly started group0 guard. If adding the entry fails with
    // group0_concurrent_modification, the whole sequence (new guard, func, commit) is repeated,
    // so func may run more than once.
    //
    // description is the text passed to prepare_command() for the command.
    void modify_group0(std::function<void(service::group0_guard&, utils::chunked_vector<canonical_mutation>&)> func,
                       const sstring& description = "modify_topology()") {
        abort_source as;
        auto& client = _env.get_raft_group0_client();
        while (true) {
            auto guard = client.start_operation(as).get();
            utils::chunked_vector<canonical_mutation> muts;
            func(guard, muts);
            service::topology_change change({std::move(muts)});
            service::group0_command g0_cmd = client.prepare_command(std::move(change), guard, description);
            try {
                client.add_entry(std::move(g0_cmd), std::move(guard), as).get();
                break;
            } catch (service::group0_concurrent_modification&) {
                testlog.warn("Concurrent modification detected, retrying");
            }
        }
    }

    // Commits a single topology mutation to group0.
    //
    // func receives a topology_mutation_builder created with the guard's write timestamp and
    // fills it in; the built mutation is then committed via modify_group0(), with the same
    // retry behavior (func may run more than once).
    //
    // description is the text passed to prepare_command() for the command.
    void modify_topology(std::function<void(service::topology_mutation_builder&)> func,
                         const sstring& description = "modify_topology()") {
        modify_group0([&] (service::group0_guard& guard, utils::chunked_vector<canonical_mutation>& muts) {
            service::topology_mutation_builder builder(guard.write_timestamp());
            func(builder);
            muts.emplace_back(builder.build());
        }, description);
    }

    // Commits the mutation produced by topology_mutation_builder::pause_rf_change_request(new_elem).
    void pause_rf_change_request(utils::UUID new_elem) {
        modify_topology([&] (service::topology_mutation_builder& builder) {
            builder.pause_rf_change_request(new_elem);
        }, "setting ongoing RF change data");
    }

    // Commits the mutation produced by
    // topology_mutation_builder::resume_rf_change_request(current_queue, elem_to_remove).
    void resume_rf_change_request(const std::unordered_set<utils::UUID>& current_queue, utils::UUID elem_to_remove) {
        modify_topology([&] (service::topology_mutation_builder& builder) {
            builder.resume_rf_change_request(current_queue, elem_to_remove);
        }, "setting ongoing RF change data");
    }

    // Sets the "node_state" of node `id` to `state`.
    void set_node_state(locator::host_id id, service::node_state state) {
        modify_topology([&](service::topology_mutation_builder& builder) {
            testlog.info("Changing node {} state={}", id, state);
            builder.with_node(raft::server_id(id.uuid()))
                   .set("node_state", state);
        });
    }

    // Adds a topology request which drains node `id`: "remove" if the node is reported as excluded
    // by the locator topology, "leave" otherwise.
    //
    // Two mutations are committed together: one setting "topology_request" and "request_id" on the
    // node, and one creating the request-tracking entry. Both use guard.new_group0_state_id() as the id.
    void add_draining_request(locator::host_id id) {
        modify_group0([&](service::group0_guard& guard, utils::chunked_vector<canonical_mutation>& muts) {
            // Hold the pointer so that the object `topo` refers into stays alive while it is used.
            auto tmptr = _env.local_db().get_shared_token_metadata().get();
            auto& topo = tmptr->get_topology();
            auto req = topo.get_node(id).is_excluded() ? service::topology_request::remove : service::topology_request::leave;
            service::topology_mutation_builder builder(guard.write_timestamp());
            builder.with_node(raft::server_id(id.uuid()))
                   .set("topology_request", req)
                   .set("request_id", guard.new_group0_state_id());
            muts.emplace_back(builder.build());
            service::topology_request_tracking_mutation_builder rtbuilder(guard.new_group0_state_id(),
                    _env.local_db().features().topology_requests_type_column);
            rtbuilder.set("initiating_host", raft::server_id(topo.my_host_id().uuid()))
                     .set("done", false);
            rtbuilder.set("request_type", req);
            muts.emplace_back(rtbuilder.build());
            testlog.info("Adding {} request for node {}", req, id);
        });
    }

    // Returns the ids of all nodes added through this builder, in the order they were added.
    const std::vector<locator::host_id>& hosts() const {
        return _hosts;
    }

    // Returns the address generated for each node added through this builder.
    const std::unordered_map<locator::host_id, gms::inet_address>& host_addresses() const {
        return _host_addresses;
    }
};