/*
 * Copyright (C) 2024-present ScyllaDB
 */

/*
 * SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
 */

#include <fmt/ranges.h>
#include <bit>

#include <seastar/core/sharded.hh>
#include <seastar/core/app-template.hh>
#include <seastar/core/sstring.hh>
#include <seastar/core/thread.hh>
#include <seastar/core/reactor.hh>

#include "locator/tablets.hh"
#include "service/tablet_allocator.hh"
#include "locator/tablet_replication_strategy.hh"
#include "locator/network_topology_strategy.hh"
#include "locator/load_sketch.hh"
#include "test/lib/topology_builder.hh"
#include "replica/tablets.hh"
#include "db/config.hh"
#include "schema/schema_builder.hh"
#include "service/storage_proxy.hh"
#include "db/system_keyspace.hh"
#include "tools/utils.hh"

#include "test/perf/perf.hh"
#include "test/lib/log.hh"
#include "test/lib/cql_test_env.hh"
#include "test/lib/random_utils.hh"
#include "test/lib/key_utils.hh"

using namespace locator;
using namespace replica;
using namespace service;
using namespace tools::utils;

namespace bpo = boost::program_options;

static seastar::abort_source aborted;

static const sstring dc = "dc1";

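// Returns the cql_test_config shared by all scenarios below.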
static
cql_test_config tablet_cql_test_config() {
    cql_test_config c;
    return c;
}

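// Creates a table named after a fresh time-UUID in the given keyspace (or in the
// environment's default keyspace when test_ks_name is empty) and returns its table_id.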
static
future<table_id> add_table(cql_test_env& e, sstring test_ks_name = "") {
    auto id = table_id(utils::UUID_gen::get_time_UUID());
    co_await e.create_table([&] (std::string_view ks_name) {
        if (!test_ks_name.empty()) {
            ks_name = test_ks_name;
        }
        return *schema_builder(ks_name, id.to_sstring(), id)
                .with_column("p1", utf8_type, column_kind::partition_key)
                .with_column("r1", int32_type)
                .build();
    });
    co_return id;
}

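// Creates a tablets-enabled keyspace with NetworkTopologyStrategy, the given per-DC
// replication factors and initial tablet count, and returns the generated keyspace name.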
// Must be run in a seastar thread.
static
sstring add_keyspace(cql_test_env& e, std::unordered_map<sstring, int> dc_rf, int initial_tablets = 0) {
    static std::atomic<int> ks_id = 0;
    auto ks_name = fmt::format("keyspace{}", ks_id.fetch_add(1));
    sstring rf_options;
    for (auto& [dc, rf] : dc_rf) {
        rf_options += format(", '{}': {}", dc, rf);
    }
    e.execute_cql(fmt::format("create keyspace {} with replication = {{'class': 'NetworkTopologyStrategy'{}}}"
                              " and tablets = {{'enabled': true, 'initial': {}}}",
                              ks_name, rf_options, initial_tablets)).get();
    return ks_name;
}

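// Returns the total number of tablet replicas across all tables in the given tablet metadata.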
static
size_t get_tablet_count(const tablet_metadata& tm) {
    size_t count = 0;
    for (const auto& [table, tmap] : tm.all_tables_ungrouped()) {
        count += std::accumulate(tmap->tablets().begin(), tmap->tablets().end(), size_t(0),
                                 [] (size_t accumulator, const locator::tablet_info& info) {
                                     return accumulator + info.replicas.size();
                                 });
    }
    return count;
}

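// Reflects the plan's resize decisions in the given token metadata: stamps a new
// sequence number on each pending decision, and simulates finalized splits by
// installing a fresh tablet map with twice as many tablets.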
static
future<> apply_resize_plan(token_metadata& tm, const migration_plan& plan) {
    for (auto [table_id, resize_decision] : plan.resize_plan().resize) {
        co_await tm.tablets().mutate_tablet_map_async(table_id, [&resize_decision] (tablet_map& tmap) {
            resize_decision.sequence_number = tmap.resize_decision().sequence_number + 1;
            tmap.set_resize_decision(resize_decision);
            return make_ready_future();
        });
    }
    for (auto table_id : plan.resize_plan().finalize_resize) {
        auto& old_tmap = tm.tablets().get_tablet_map(table_id);
        testlog.info("Setting new tablet map of size {}", old_tmap.tablet_count() * 2);
        tablet_map tmap(old_tmap.tablet_count() * 2);
        tm.tablets().set_tablet_map(table_id, std::move(tmap));
    }
}

// Reflects the plan in a given token metadata as if the migrations were fully executed.
static
future<> apply_plan(token_metadata& tm, const migration_plan& plan) {
    for (auto&& mig : plan.migrations()) {
        co_await tm.tablets().mutate_tablet_map_async(mig.tablet.table, [&mig] (tablet_map& tmap) {
            auto tinfo = tmap.get_tablet_info(mig.tablet.tablet);
            tinfo.replicas = replace_replica(tinfo.replicas, mig.src, mig.dst);
            tmap.set_tablet(mig.tablet.tablet, tinfo);
            return make_ready_future();
        });
    }
    co_await apply_resize_plan(tm, plan);
}

using seconds_double = std::chrono::duration<double>;

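// Accumulated timing of load-balancer invocations; operator+= folds per-iteration
// measurements into a running total.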
struct rebalance_stats {
    seconds_double elapsed_time = seconds_double(0);
    seconds_double max_rebalance_time = seconds_double(0);
    uint64_t rebalance_count = 0;

    rebalance_stats& operator+=(const rebalance_stats& other) {
        elapsed_time += other.elapsed_time;
        max_rebalance_time = std::max(max_rebalance_time, other.max_rebalance_time);
        rebalance_count += other.rebalance_count;
        return *this;
    }
};

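// Must be run in a seastar thread.
// Repeatedly asks the allocator for a balancing plan and applies it to the in-memory
// token metadata until the plan comes back empty, then persists the tablet metadata
// so that on-disk and in-memory state stay consistent. Throws if the balancer fails
// to converge within a sanity limit.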
static
rebalance_stats rebalance_tablets(cql_test_env& e, locator::load_stats_ptr load_stats = {}, std::unordered_set<host_id> skiplist = {}) {
    rebalance_stats stats;
    abort_source as;

    auto guard = e.get_raft_group0_client().start_operation(as).get();
    auto& talloc = e.get_tablet_allocator().local();
    auto& stm = e.shared_token_metadata().local();

    // Sanity limit to avoid infinite loops.
    // The x10 factor is arbitrary; it accounts for schedules more complex than direct migration.
    auto max_iterations = 1 + get_tablet_count(stm.get()->tablets()) * 10;

    for (size_t i = 0; i < max_iterations; ++i) {
        auto prev_lb_stats = *talloc.stats().for_dc(dc);
        auto start_time = std::chrono::steady_clock::now();

        auto plan = talloc.balance_tablets(stm.get(), nullptr, nullptr, load_stats, skiplist).get();

        auto end_time = std::chrono::steady_clock::now();
        auto lb_stats = *talloc.stats().for_dc(dc) - prev_lb_stats;

        auto elapsed = std::chrono::duration_cast<seconds_double>(end_time - start_time);
        rebalance_stats iteration_stats = {
            .elapsed_time = elapsed,
            .max_rebalance_time = elapsed,
            .rebalance_count = 1,
        };
        stats += iteration_stats;
        testlog.debug("Rebalance iteration {} took {:.3f} [s]: mig={}, bad={}, first_bad={}, eval={}, skiplist={}, skip: (load={}, rack={}, node={})",
                      i + 1, elapsed.count(),
                      lb_stats.migrations_produced,
                      lb_stats.bad_migrations,
                      lb_stats.bad_first_candidates,
                      lb_stats.candidates_evaluated,
                      lb_stats.migrations_from_skiplist,
                      lb_stats.migrations_skipped,
                      lb_stats.tablets_skipped_rack,
                      lb_stats.tablets_skipped_node);

        if (plan.empty()) {
            // We should not introduce inconsistency between the on-disk state and the in-memory
            // state, as that may violate invariants and cause failures in later operations,
            // making tests flaky.
            save_tablet_metadata(e.local_db(), stm.get()->tablets(), guard.write_timestamp()).get();
            e.get_storage_service().local().update_tablet_metadata({}).get();

            testlog.info("Rebalance took {:.3f} [s] after {} iteration(s)", stats.elapsed_time.count(), i + 1);
            return stats;
        }
        stm.mutate_token_metadata([&] (token_metadata& tm) {
            return apply_plan(tm, plan);
        }).get();
    }
    throw std::runtime_error("rebalance_tablets(): convergence not reached within limit");
}

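// Parameters of a single simulation run. tablets1/tablets2 are the initial tablet
// counts of the two tables; scale1/scale2 are the factors used to derive those
// counts in randomized runs (see run_simulations()).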
struct params {
    int iterations;
    int nodes;
    std::optional<int> tablets1;
    std::optional<int> tablets2;
    int rf1;
    int rf2;
    int shards;
    int scale1 = 1;
    int scale2 = 1;
};

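// Balance metrics for a single table. Overcommit is the ratio of the most loaded
// shard (or node) to the average load; 1.0 means perfectly balanced.
// best_shard_overcommit is the lowest shard overcommit achievable with the current
// tablet count.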
struct table_balance {
    double shard_overcommit;
    double best_shard_overcommit;
    double node_overcommit;
};

constexpr auto nr_tables = 2;

struct cluster_balance {
    table_balance tables[nr_tables];
};

struct results {
    cluster_balance init;
    cluster_balance worst;
    cluster_balance last;
    rebalance_stats stats;
};

template<>
struct fmt::formatter<table_balance> : fmt::formatter<string_view> {
    template <typename FormatContext>
    auto format(const table_balance& b, FormatContext& ctx) const {
        return fmt::format_to(ctx.out(), "{{shard={:.2f} (best={:.2f}), node={:.2f}}}",
                              b.shard_overcommit, b.best_shard_overcommit, b.node_overcommit);
    }
};

template<>
struct fmt::formatter<cluster_balance> : fmt::formatter<string_view> {
    template <typename FormatContext>
    auto format(const cluster_balance& r, FormatContext& ctx) const {
        return fmt::format_to(ctx.out(), "{{table1={}, table2={}}}", r.tables[0], r.tables[1]);
    }
};

template<>
struct fmt::formatter<params> : fmt::formatter<string_view> {
    template <typename FormatContext>
    auto format(const params& p, FormatContext& ctx) const {
        auto tablets1_per_shard = double(p.tablets1.value_or(0)) * p.rf1 / (p.nodes * p.shards);
        auto tablets2_per_shard = double(p.tablets2.value_or(0)) * p.rf2 / (p.nodes * p.shards);
        return fmt::format_to(ctx.out(), "{{iterations={}, nodes={}, tablets1={} ({:0.1f}/sh), tablets2={} ({:0.1f}/sh), rf1={}, rf2={}, shards={}}}",
                              p.iterations, p.nodes,
                              p.tablets1.value_or(0), tablets1_per_shard,
                              p.tablets2.value_or(0), tablets2_per_shard,
                              p.rf1, p.rf2, p.shards);
    }
};

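// Runs a rolling bootstrap/decommission simulation with two tables and collects
// balance metrics. When tablet_aware is false, table-aware balancing is disabled
// in the allocator so results can be compared against the older algorithm.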
future<results> test_load_balancing_with_many_tables(params p, bool tablet_aware) {
    auto cfg = tablet_cql_test_config();
    results global_res;
    co_await do_with_cql_env_thread([&] (auto& e) {
        SCYLLA_ASSERT(p.nodes > 0);
        SCYLLA_ASSERT(p.rf1 > 0);
        SCYLLA_ASSERT(p.rf2 > 0);

        const size_t n_hosts = p.nodes;
        const size_t rf1 = p.rf1;
        const size_t rf2 = p.rf2;
        const shard_id shard_count = p.shards;
        const int cycles = p.iterations;

        struct host_info {
            host_id id;
            endpoint_dc_rack dc_rack;
        };

        topology_builder topo(e);
        std::vector<endpoint_dc_rack> racks;
        std::vector<host_info> hosts;
        locator::load_stats stats;

        auto populate_racks = [&] (const size_t count) {
            SCYLLA_ASSERT(count > 0);
            racks.push_back(topo.rack());
            for (size_t i = 0; i < count - 1; ++i) {
                racks.push_back(topo.start_new_rack());
            }
        };

        const sstring dc1 = topo.dc();
        populate_racks(rf1);

        const size_t rack_count = racks.size();

        auto add_host = [&] (endpoint_dc_rack dc_rack) {
            auto host = topo.add_node(service::node_state::normal, shard_count, dc_rack);
            hosts.emplace_back(host, dc_rack);
            stats.capacity[host] = default_target_tablet_size * shard_count;
            testlog.info("Added new node: {} / {}:{}", host, dc_rack.dc, dc_rack.rack);
        };

        auto make_stats = [&] {
            return make_lw_shared<locator::load_stats>(stats);
        };

        for (size_t i = 0; i < n_hosts; ++i) {
            add_host(racks[i % rack_count]);
        }

        auto& stm = e.shared_token_metadata().local();

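        // bootstrap() adds a node and rebalances. decommission() drains a node via the
        // balancer, verifies that it holds no replicas, and marks it as having left.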
        auto bootstrap = [&] (endpoint_dc_rack dc_rack) {
            add_host(std::move(dc_rack));
            global_res.stats += rebalance_tablets(e, make_stats());
        };

        auto decommission = [&] (host_id host) {
            const auto it = std::ranges::find_if(hosts, [&] (const host_info& info) {
                return info.id == host;
            });
            if (it == hosts.end()) {
                throw std::runtime_error(format("No such host: {}", host));
            }
            topo.set_node_state(host, service::node_state::decommissioning);
            global_res.stats += rebalance_tablets(e, make_stats());
            if (stm.get()->tablets().has_replica_on(host)) {
                throw std::runtime_error(format("Host {} still has replicas!", host));
            }
            topo.set_node_state(host, service::node_state::left);
            testlog.info("Node decommissioned: {}", host);
            hosts.erase(it);
        };

        auto ks1 = add_keyspace(e, {{dc1, rf1}}, p.tablets1.value_or(1));
        auto ks2 = add_keyspace(e, {{dc1, rf2}}, p.tablets2.value_or(1));
        auto id1 = add_table(e, ks1).get();
        auto id2 = add_table(e, ks2).get();
        schema_ptr s1 = e.local_db().find_schema(id1);
        schema_ptr s2 = e.local_db().find_schema(id2);

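        // Computes shard- and node-level overcommit for both tables from a load sketch,
        // folds the worst observed values into global_res, and returns the snapshot.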
        auto check_balance = [&] () -> cluster_balance {
            cluster_balance res;

            testlog.debug("tablet metadata: {}", stm.get()->tablets());

            int table_index = 0;
            for (auto s : {s1, s2}) {
                load_sketch load(stm.get());
                load.populate(std::nullopt, s->id()).get();

                min_max_tracker<uint64_t> shard_load_minmax;
                min_max_tracker<uint64_t> node_load_minmax;
                uint64_t sum_node_load = 0;
                uint64_t shard_count = 0;
                for (auto [h, _] : hosts) {
                    auto minmax = load.get_shard_minmax(h);
                    auto node_load = load.get_load(h);
                    auto avg_shard_load = load.get_real_avg_tablet_count(h);
                    auto overcommit = double(minmax.max()) / avg_shard_load;
                    shard_load_minmax.update(minmax.max());
                    shard_count += load.get_shard_count(h);
                    testlog.info("Load on host {} for table {}: total={}, min={}, max={}, spread={}, avg={:.2f}, overcommit={:.2f}",
                                 h, s->cf_name(), node_load, minmax.min(), minmax.max(), minmax.max() - minmax.min(), avg_shard_load, overcommit);
                    node_load_minmax.update(node_load);
                    sum_node_load += node_load;
                }

                auto avg_shard_load = double(sum_node_load) / shard_count;
                auto shard_overcommit = shard_load_minmax.max() / avg_shard_load;
                // Overcommit of the best possible tablet distribution for the current tablet count.
                auto best_shard_overcommit = div_ceil(sum_node_load, shard_count) / avg_shard_load;
                testlog.info("Shard overcommit: {:.2f}, best={:.2f}", shard_overcommit, best_shard_overcommit);

                auto node_imbalance = node_load_minmax.max() - node_load_minmax.min();
                auto avg_node_load = double(sum_node_load) / hosts.size();
                auto node_overcommit = node_load_minmax.max() / avg_node_load;
                testlog.info("Node imbalance: min={}, max={}, spread={}, avg={:.2f}, overcommit={:.2f}",
                             node_load_minmax.min(), node_load_minmax.max(), node_imbalance, avg_node_load, node_overcommit);

                res.tables[table_index++] = {
                    .shard_overcommit = shard_overcommit,
                    .best_shard_overcommit = best_shard_overcommit,
                    .node_overcommit = node_overcommit
                };
            }

            for (int i = 0; i < nr_tables; i++) {
                auto t = res.tables[i];
                global_res.worst.tables[i].shard_overcommit = std::max(global_res.worst.tables[i].shard_overcommit, t.shard_overcommit);
                global_res.worst.tables[i].node_overcommit = std::max(global_res.worst.tables[i].node_overcommit, t.node_overcommit);
            }

            testlog.info("Overcommit: {}", res);
            return res;
        };

        testlog.debug("tablet metadata: {}", stm.get()->tablets());

        e.get_tablet_allocator().local().set_use_table_aware_balancing(tablet_aware);

        check_balance();

        rebalance_tablets(e, make_stats());

        global_res.init = global_res.worst = check_balance();

        for (int i = 0; i < cycles; i++) {
            const auto [id, dc_rack] = hosts[0];

            bootstrap(dc_rack);
            check_balance();

            decommission(id);
            global_res.last = check_balance();
        }
    }, cfg);
    co_return global_res;
}

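// Simulates a single scale-out step: builds a multi-rack cluster, creates many
// tables in one keyspace, rebalances, then adds extra nodes to every rack at once
// and rebalances again.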
void test_parallel_scaleout(const bpo::variables_map& opts) {
    const shard_id shard_count = opts["shards"].as<int>();
    const int nr_tables = opts["tables"].as<int>();
    const int tablets_per_table = opts["tablets_per_table"].as<int>();
    const int nr_racks = opts["racks"].as<int>();
    const int initial_nodes = nr_racks * opts["nodes-per-rack"].as<int>();
    const int extra_nodes = nr_racks * opts["extra-nodes-per-rack"].as<int>();

    auto cfg = tablet_cql_test_config();
    cfg.db_config->rf_rack_valid_keyspaces(true);
    results global_res;
    do_with_cql_env_thread([&] (auto& e) {
        topology_builder topo(e);
        locator::load_stats stats;

        auto make_stats = [&] {
            return make_lw_shared<locator::load_stats>(stats);
        };

        std::vector<endpoint_dc_rack> racks;
        racks.push_back(topo.rack());
        for (int i = 1; i < nr_racks; ++i) {
            racks.push_back(topo.start_new_rack());
        }

        auto add_host = [&] (endpoint_dc_rack rack) {
            auto host = topo.add_node(service::node_state::normal, shard_count, rack);
            stats.capacity[host] = default_target_tablet_size * shard_count;
            testlog.info("Added new node: {}", host);
        };

        auto add_hosts = [&] (int n) {
            for (int i = 0; i < n; ++i) {
                add_host(racks[i % racks.size()]);
            }
        };

        add_hosts(initial_nodes);

        testlog.info("Creating schema");
        auto ks1 = add_keyspace(e, {{topo.dc(), nr_racks}}, tablets_per_table);
        seastar::parallel_for_each(std::views::iota(0, nr_tables), [&] (int) -> future<> {
            return add_table(e, ks1).discard_result();
        }).get();

        testlog.info("Initial rebalancing");
        rebalance_tablets(e, make_stats());

        testlog.info("Scaleout");
        add_hosts(extra_nodes);
        global_res.stats += rebalance_tablets(e, make_stats());
    }, cfg).get();
}

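// Runs the scenario twice, with table-aware balancing enabled and disabled, and
// logs the overcommit of both runs. Warns when scheduling takes longer than 1s,
// when the table-aware balancer is worse than the old one, or when the worst
// shard overcommit exceeds 1.2.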
future<> run_simulation(const params& p, const sstring& name = "") {
    testlog.info("[run {}] params: {}", name, p);

    auto total_tablet_count = p.tablets1.value_or(0) * p.rf1 + p.tablets2.value_or(0) * p.rf2;
    testlog.info("[run {}] tablet count: {}", name, total_tablet_count);
    testlog.info("[run {}] tablet count / shard: {:.3f}", name, double(total_tablet_count) / (p.nodes * p.shards));

    auto res = co_await test_load_balancing_with_many_tables(p, true);
    testlog.info("[run {}] Overcommit : init : {}", name, res.init);
    testlog.info("[run {}] Overcommit : worst: {}", name, res.worst);
    testlog.info("[run {}] Overcommit : last : {}", name, res.last);
    testlog.info("[run {}] Overcommit : time : {:.3f} [s], max={:.3f} [s], count={}", name,
                 res.stats.elapsed_time.count(), res.stats.max_rebalance_time.count(), res.stats.rebalance_count);

    if (res.stats.elapsed_time > seconds_double(1)) {
        testlog.warn("[run {}] Scheduling took longer than 1s!", name);
    }

    auto old_res = co_await test_load_balancing_with_many_tables(p, false);
    testlog.info("[run {}] Overcommit (old) : init : {}", name, old_res.init);
    testlog.info("[run {}] Overcommit (old) : worst: {}", name, old_res.worst);
    testlog.info("[run {}] Overcommit (old) : last : {}", name, old_res.last);
    testlog.info("[run {}] Overcommit (old) : time : {:.3f} [s], max={:.3f} [s], count={}", name,
                 old_res.stats.elapsed_time.count(), old_res.stats.max_rebalance_time.count(), old_res.stats.rebalance_count);

    for (int i = 0; i < nr_tables; ++i) {
        if (res.worst.tables[i].shard_overcommit > old_res.worst.tables[i].shard_overcommit) {
            testlog.warn("[run {}] table{} shard overcommit worse!", name, i + 1);
        }
        auto overcommit = res.worst.tables[i].shard_overcommit;
        if (overcommit > 1.2) {
            testlog.warn("[run {}] table{} shard overcommit {:.2f} > 1.2!", name, i + 1, overcommit);
        }
    }
}

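// Runs a batch of simulations with randomized shard counts, tablet counts and
// replication factors. Both tables get the same RF until RF <= #racks is
// supported (see the FIXME below).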
future<> run_simulations(const boost::program_options::variables_map& app_cfg) {
    for (auto i = 0; i < app_cfg["runs"].as<int>(); i++) {
        constexpr int MIN_RF = 1;
        constexpr int MAX_RF = 3;

        auto shards = 1 << tests::random::get_int(0, 8);
        auto rf1 = tests::random::get_int(MIN_RF, MAX_RF);
        // FIXME: Once we allow for RF <= #racks (and not just RF == #racks), we can randomize this RF too.
        // For now, the values must be equal.
        auto rf2 = rf1;
        auto scale1 = 1 << tests::random::get_int(0, 5);
        auto scale2 = 1 << tests::random::get_int(0, 5);
        auto nodes = tests::random::get_int(rf1 + rf2, 2 * MAX_RF);

        params p {
            .iterations = app_cfg["iterations"].as<int>(),
            .nodes = nodes,
            .tablets1 = std::bit_ceil<size_t>(div_ceil(shards * nodes, rf1) * scale1),
            .tablets2 = std::bit_ceil<size_t>(div_ceil(shards * nodes, rf2) * scale2),
            .rf1 = rf1,
            .rf2 = rf2,
            .shards = shards,
            .scale1 = scale1,
            .scale2 = scale2,
        };

        auto name = format("#{}", i);
        co_await run_simulation(p, name);
    }
}

namespace perf {

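// Entry point of the rolling-add-dec operation: runs a batch of randomized
// simulations when --runs is given, otherwise a single run with the provided
// parameters.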
void run_add_dec(const bpo::variables_map& opts) {
    if (opts.contains("runs")) {
        run_simulations(opts).get();
    } else {
        params p {
            .iterations = opts["iterations"].as<int>(),
            .nodes = opts["nodes"].as<int>(),
            .tablets1 = opts["tablets1"].as<int>(),
            .tablets2 = opts["tablets2"].as<int>(),
            .rf1 = opts["rf1"].as<int>(),
            .rf2 = opts["rf2"].as<int>(),
            .shards = opts["shards"].as<int>(),
        };
        run_simulation(p).get();
    }
}

using operation_func = std::function<void(const bpo::variables_map&)>;

const std::vector<operation_option> global_options{};
const std::vector<operation_option> global_positional_options{};

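// Registry of the tool's operations together with their CLI options and handlers.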
const std::map<operation, operation_func> operations_with_func{
    {{"rolling-add-dec",
      "Sequence of bootstraps and decommissions with two tables",
      "",
      {
          typed_option<int>("runs", "Number of simulation runs."),
          typed_option<int>("iterations", 8, "Number of topology-changing cycles in each run."),
          typed_option<int>("tablets1", 512, "Number of tablets for the first table."),
          typed_option<int>("tablets2", 128, "Number of tablets for the second table."),
          typed_option<int>("rf1", 1, "Replication factor for the first table."),
          typed_option<int>("rf2", 1, "Replication factor for the second table."),
          typed_option<int>("nodes", 3, "Number of nodes in the cluster."),
          typed_option<int>("shards", 30, "Number of shards per node.")
      }
     }, &run_add_dec},

    {{"parallel-scaleout",
      "Simulates a single scale-out involving simultaneous addition of multiple nodes per rack",
      "",
      {
          typed_option<int>("tablets_per_table", 256, "Number of tablets per table."),
          typed_option<int>("tables", 70, "Table count."),
          typed_option<int>("nodes-per-rack", 5, "Number of initial nodes per rack."),
          typed_option<int>("extra-nodes-per-rack", 3, "Number of nodes to add per rack."),
          typed_option<int>("racks", 2, "Number of racks."),
          typed_option<int>("shards", 88, "Number of shards per node.")
      }
     }, &test_parallel_scaleout},
};

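// Tool entry point: parses the command line, dispatches to the selected operation,
// and requests abort on exit so that background work stops cleanly.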
int scylla_tablet_load_balancing_main(int argc, char** argv) {
    const auto operations = operations_with_func | std::views::keys | std::ranges::to<std::vector>();
    tool_app_template::config app_cfg{
        .name = "perf-load-balancing",
        .description = "Tests tablet load balancer in various scenarios",
        .logger_name = testlog.name(),
        .lsa_segment_pool_backend_size_mb = 100,
        .operations = std::move(operations),
        .global_options = &global_options,
        .global_positional_options = &global_positional_options,
        .db_cfg_ext = db_config_and_extensions()
    };
    tool_app_template app(std::move(app_cfg));

    return app.run_async(argc, argv, [] (const operation& operation, const bpo::variables_map& app_config) {
        auto stop_test = defer([] {
            aborted.request_abort();
        });
        try {
            operations_with_func.at(operation)(app_config);
            return 0;
        } catch (seastar::abort_requested_exception&) {
            // Ignore.
        }
        return 1;
    });
}

} // namespace perf