/* * Copyright (C) 2024-present ScyllaDB */ /* * SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.1 */ #include #include #include #include #include #include #include #include "locator/tablets.hh" #include "service/tablet_allocator.hh" #include "locator/tablet_replication_strategy.hh" #include "locator/network_topology_strategy.hh" #include "locator/load_sketch.hh" #include "test/lib/topology_builder.hh" #include "replica/tablets.hh" #include "locator/tablet_replication_strategy.hh" #include "db/config.hh" #include "schema/schema_builder.hh" #include "service/storage_proxy.hh" #include "db/system_keyspace.hh" #include "tools/utils.hh" #include "test/perf/perf.hh" #include "test/lib/log.hh" #include "test/lib/cql_test_env.hh" #include "test/lib/random_utils.hh" #include "test/lib/key_utils.hh" using namespace locator; using namespace replica; using namespace service; using namespace tools::utils; namespace bpo = boost::program_options; static seastar::abort_source aborted; static const sstring dc = "dc1"; static cql_test_config tablet_cql_test_config() { cql_test_config c; return c; } static future add_table(cql_test_env& e, sstring test_ks_name = "") { auto id = table_id(utils::UUID_gen::get_time_UUID()); co_await e.create_table([&] (std::string_view ks_name) { if (!test_ks_name.empty()) { ks_name = test_ks_name; } return *schema_builder(ks_name, id.to_sstring(), id) .with_column("p1", utf8_type, column_kind::partition_key) .with_column("r1", int32_type) .build(); }); co_return id; } // Run in a seastar thread static sstring add_keyspace(cql_test_env& e, std::unordered_map dc_rf, int initial_tablets = 0) { static std::atomic ks_id = 0; auto ks_name = fmt::format("keyspace{}", ks_id.fetch_add(1)); sstring rf_options; for (auto& [dc, rf] : dc_rf) { rf_options += format(", '{}': {}", dc, rf); } e.execute_cql(fmt::format("create keyspace {} with replication = {{'class': 'NetworkTopologyStrategy'{}}}" " and tablets = {{'enabled': true, 'initial': {}}}", ks_name, rf_options, initial_tablets)).get(); return ks_name; } static size_t get_tablet_count(const tablet_metadata& tm) { size_t count = 0; for (const auto& [table, tmap] : tm.all_tables_ungrouped()) { count += std::accumulate(tmap->tablets().begin(), tmap->tablets().end(), size_t(0), [] (size_t accumulator, const locator::tablet_info& info) { return accumulator + info.replicas.size(); }); } return count; } static future<> apply_resize_plan(token_metadata& tm, const migration_plan& plan) { for (auto [table_id, resize_decision] : plan.resize_plan().resize) { co_await tm.tablets().mutate_tablet_map_async(table_id, [&resize_decision] (tablet_map& tmap) { resize_decision.sequence_number = tmap.resize_decision().sequence_number + 1; tmap.set_resize_decision(resize_decision); return make_ready_future(); }); } for (auto table_id : plan.resize_plan().finalize_resize) { auto& old_tmap = tm.tablets().get_tablet_map(table_id); testlog.info("Setting new tablet map of size {}", old_tmap.tablet_count() * 2); tablet_map tmap(old_tmap.tablet_count() * 2); tm.tablets().set_tablet_map(table_id, std::move(tmap)); } } // Reflects the plan in a given token metadata as if the migrations were fully executed. static future<> apply_plan(token_metadata& tm, const migration_plan& plan, locator::load_stats& load_stats) { for (auto&& mig : plan.migrations()) { co_await tm.tablets().mutate_tablet_map_async(mig.tablet.table, [&mig] (tablet_map& tmap) { auto tinfo = tmap.get_tablet_info(mig.tablet.tablet); tinfo.replicas = replace_replica(tinfo.replicas, mig.src, mig.dst); tmap.set_tablet(mig.tablet.tablet, tinfo); return make_ready_future(); }); // Move tablet size in load_stats to account for the migration if (mig.src && mig.dst && mig.src->host != mig.dst->host) { auto& tmap = tm.tablets().get_tablet_map(mig.tablet.table); const dht::token_range trange = tmap.get_token_range(mig.tablet.tablet); lw_shared_ptr new_stats = load_stats.migrate_tablet_size(mig.src->host, mig.dst->host, mig.tablet, trange); if (new_stats) { load_stats = std::move(*new_stats); } else { throw std::runtime_error(format("Unable to migrate tablet size in load_stats for migration: {}", mig)); } } } co_await apply_resize_plan(tm, plan); } using seconds_double = std::chrono::duration; struct rebalance_stats { seconds_double elapsed_time = seconds_double(0); seconds_double max_rebalance_time = seconds_double(0); uint64_t rebalance_count = 0; rebalance_stats& operator+=(const rebalance_stats& other) { elapsed_time += other.elapsed_time; max_rebalance_time = std::max(max_rebalance_time, other.max_rebalance_time); rebalance_count += other.rebalance_count; return *this; } }; static rebalance_stats rebalance_tablets(cql_test_env& e, locator::load_stats& load_stats, std::unordered_set skiplist = {}) { rebalance_stats stats; abort_source as; auto guard = e.get_raft_group0_client().start_operation(as).get(); auto& talloc = e.get_tablet_allocator().local(); auto& stm = e.shared_token_metadata().local(); // Sanity limit to avoid infinite loops. // The x10 factor is arbitrary, it's there to account for more complex schedules than direct migration. auto max_iterations = 1 + get_tablet_count(stm.get()->tablets()) * 10; for (size_t i = 0; i < max_iterations; ++i) { auto prev_lb_stats = *talloc.stats().for_dc(dc); auto load_stats_p = make_lw_shared(load_stats); auto start_time = std::chrono::steady_clock::now(); auto plan = talloc.balance_tablets(stm.get(), nullptr, nullptr, load_stats_p, skiplist).get(); auto end_time = std::chrono::steady_clock::now(); auto lb_stats = *talloc.stats().for_dc(dc) - prev_lb_stats; auto elapsed = std::chrono::duration_cast(end_time - start_time); rebalance_stats iteration_stats = { .elapsed_time = elapsed, .max_rebalance_time = elapsed, .rebalance_count = 1, }; stats += iteration_stats; testlog.debug("Rebalance iteration {} took {:.3f} [s]: mig={}, bad={}, first_bad={}, eval={}, skiplist={}, skip: (load={}, rack={}, node={})", i + 1, elapsed.count(), lb_stats.migrations_produced, lb_stats.bad_migrations, lb_stats.bad_first_candidates, lb_stats.candidates_evaluated, lb_stats.migrations_from_skiplist, lb_stats.migrations_skipped, lb_stats.tablets_skipped_rack, lb_stats.tablets_skipped_node); if (plan.empty()) { // We should not introduce inconsistency between on-disk state and in-memory state // as that may violate invariants and cause failures in later operations // causing test flakiness. save_tablet_metadata(e.local_db(), stm.get()->tablets(), guard.write_timestamp()).get(); e.get_storage_service().local().update_tablet_metadata({}).get(); testlog.info("Rebalance took {:.3f} [s] after {} iteration(s)", stats.elapsed_time.count(), i + 1); return stats; } stm.mutate_token_metadata([&] (token_metadata& tm) { return apply_plan(tm, plan, load_stats); }).get(); } throw std::runtime_error("rebalance_tablets(): convergence not reached within limit"); } struct params { int iterations; int nodes; std::optional tablets1; std::optional tablets2; int rf1; int rf2; int shards; int scale1 = 1; int scale2 = 1; double tablet_size_deviation_factor = 0.5; }; struct table_balance { double shard_overcommit; double best_shard_overcommit; double node_overcommit; }; constexpr auto nr_tables = 2; struct cluster_balance { table_balance tables[nr_tables]; }; struct results { cluster_balance init; cluster_balance worst; cluster_balance last; rebalance_stats stats; }; template<> struct fmt::formatter : fmt::formatter { template auto format(const table_balance& b, FormatContext& ctx) const { return fmt::format_to(ctx.out(), "{{shard={} (best={}), node={}}}", b.shard_overcommit, b.best_shard_overcommit, b.node_overcommit); } }; template<> struct fmt::formatter : fmt::formatter { template auto format(const cluster_balance& r, FormatContext& ctx) const { return fmt::format_to(ctx.out(), "{{table1={}, table2={}}}", r.tables[0], r.tables[1]); } }; template<> struct fmt::formatter : fmt::formatter { template auto format(const params& p, FormatContext& ctx) const { auto tablets1_per_shard = double(p.tablets1.value_or(0)) * p.rf1 / (p.nodes * p.shards); auto tablets2_per_shard = double(p.tablets2.value_or(0)) * p.rf2 / (p.nodes * p.shards); return fmt::format_to(ctx.out(), "{{iterations={}, nodes={}, tablets1={} ({:0.1f}/sh), tablets2={} ({:0.1f}/sh), rf1={}, rf2={}, shards={}, tablet_size_deviation_factor={}}}", p.iterations, p.nodes, p.tablets1.value_or(0), tablets1_per_shard, p.tablets2.value_or(0), tablets2_per_shard, p.rf1, p.rf2, p.shards, p.tablet_size_deviation_factor); } }; class tablet_size_generator { std::default_random_engine _rnd_engine{std::random_device{}()}; std::normal_distribution<> _dist; public: explicit tablet_size_generator(double deviation_factor) : _dist(default_target_tablet_size, default_target_tablet_size * deviation_factor) { } uint64_t generate() { // We can't have a negative tablet size, which is why we need to minimize it to 0 (with std::max()). // One consequence of this is that the average generated tablet size will actually // be larger than default_target_tablet_size. // This will be especially pronounced as deviation_factor gets larger. For instance: // // deviation_factor | avg tablet size // -----------------+---------------------------------------- // 1 | default_target_tablet_size * 1.08 // 1.5 | default_target_tablet_size * 1.22 // 2 | default_target_tablet_size * 1.39 // 3 | default_target_tablet_size * 1.76 return std::max(0.0, _dist(_rnd_engine)); } }; void generate_tablet_sizes(double tablet_size_deviation_factor, locator::load_stats& stats, locator::shared_token_metadata& stm) { tablet_size_generator tsg(tablet_size_deviation_factor); for (auto&& [table, tmap] : stm.get()->tablets().all_tables_ungrouped()) { tmap->for_each_tablet([&] (tablet_id tid, const tablet_info& ti) -> future<> { for (const auto& replica : ti.replicas) { const uint64_t tablet_size = tsg.generate(); locator::range_based_tablet_id rb_tid {table, tmap->get_token_range(tid)}; stats.tablet_stats[replica.host].tablet_sizes[rb_tid.table][rb_tid.range] = tablet_size; testlog.trace("Generated tablet size {} for {}:{}", tablet_size, table, tid); } return make_ready_future<>(); }).get(); } } future test_load_balancing_with_many_tables(params p, bool tablet_aware) { auto cfg = tablet_cql_test_config(); results global_res; co_await do_with_cql_env_thread([&] (auto& e) { SCYLLA_ASSERT(p.nodes > 0); SCYLLA_ASSERT(p.rf1 > 0); SCYLLA_ASSERT(p.rf2 > 0); const size_t n_hosts = p.nodes; const size_t rf1 = p.rf1; const size_t rf2 = p.rf2; const shard_id shard_count = p.shards; const int cycles = p.iterations; const uint64_t shard_capacity = default_target_tablet_size * 100; struct host_info { host_id id; endpoint_dc_rack dc_rack; }; topology_builder topo(e); std::vector racks; std::vector hosts; locator::load_stats stats; auto populate_racks = [&] (const size_t count) { SCYLLA_ASSERT(count > 0); racks.push_back(topo.rack()); for (size_t i = 0; i < count - 1; ++i) { racks.push_back(topo.start_new_rack()); } }; const sstring dc1 = topo.dc(); populate_racks(rf1); // The rack for which we output stats sstring test_rack = racks.front().rack; const size_t rack_count = racks.size(); std::unordered_map rack_capacity; auto add_host = [&] (endpoint_dc_rack dc_rack) { auto host = topo.add_node(service::node_state::normal, shard_count, dc_rack); hosts.emplace_back(host, dc_rack); const uint64_t capacity = shard_capacity * shard_count; stats.capacity[host] = capacity; stats.tablet_stats[host].effective_capacity = capacity; rack_capacity[dc_rack.rack] += capacity; testlog.info("Added new node: {} / {}:{}", host, dc_rack.dc, dc_rack.rack); }; for (size_t i = 0; i < n_hosts; ++i) { add_host(racks[i % rack_count]); } auto& stm = e.shared_token_metadata().local(); auto bootstrap = [&] (endpoint_dc_rack dc_rack) { add_host(std::move(dc_rack)); global_res.stats += rebalance_tablets(e, stats); }; auto decommission = [&] (host_id host) { const auto it = std::ranges::find_if(hosts, [&] (const host_info& info) { return info.id == host; }); if (it == hosts.end()) { throw std::runtime_error(format("No such host: {}", host)); } topo.set_node_state(host, service::node_state::decommissioning); global_res.stats += rebalance_tablets(e, stats); if (stm.get()->tablets().has_replica_on(host)) { throw std::runtime_error(format("Host {} still has replicas!", host)); } topo.set_node_state(host, service::node_state::left); testlog.info("Node decommissioned: {}", host); rack_capacity[it->dc_rack.rack] -= stats.capacity.at(host); hosts.erase(it); stats.tablet_stats.erase(host); }; auto ks1 = add_keyspace(e, {{dc1, rf1}}, p.tablets1.value_or(1)); auto ks2 = add_keyspace(e, {{dc1, rf2}}, p.tablets2.value_or(1)); auto id1 = add_table(e, ks1).get(); auto id2 = add_table(e, ks2).get(); schema_ptr s1 = e.local_db().find_schema(id1); schema_ptr s2 = e.local_db().find_schema(id2); generate_tablet_sizes(p.tablet_size_deviation_factor, stats, stm); // Compute table size per rack, and collect all tablets per rack std::unordered_map> table_sizes_per_rack; std::unordered_map>> tablet_sizes_in_rack; for (auto& [host, tls] : stats.tablet_stats) { auto host_i = std::ranges::find(hosts, host, &host_info::id); if (host_i == hosts.end()) { throw std::runtime_error(format("Host {} not found in hosts", host)); } auto rack = host_i->dc_rack.rack; for (auto& [table, ranges] : tls.tablet_sizes) { for (auto& [trange, tablet_size] : ranges) { table_sizes_per_rack[rack][table] += tablet_size; tablet_sizes_in_rack[rack][table].push_back(tablet_size); } } } // Sort the tablet sizes per rack in descending order for (auto& [rack, tables] : tablet_sizes_in_rack) { for (auto& [table, tablets] : tables) { std::ranges::sort(tablets, std::greater()); } } struct node_used_size { host_id host; uint64_t used = 0; }; // Compute best shard overcommit per table per rack std::unordered_map> best_shard_overcommit_per_rack; auto compute_best_overcommit = [&] () { auto node_size_compare = [] (const node_used_size& lhs, const node_used_size& rhs) { return lhs.used > rhs.used; }; for (auto& all_dc_rack : racks) { auto rack = all_dc_rack.rack; // Allocate tablet sizes to nodes for (auto& [table, tablet_sizes]: tablet_sizes_in_rack.at(rack)) { load_sketch load(e.shared_token_metadata().local().get(), make_lw_shared(stats)); // Add nodes to load_sketch and to the nodes_used heap std::vector nodes_used; for (const auto& [host_id, host_dc_rack] : hosts) { if (rack == host_dc_rack.rack) { load.ensure_node(host_id); nodes_used.push_back({host_id, 0}); } } // Allocate tablets to nodes/shards for (uint64_t tablet_size : tablet_sizes) { std::pop_heap(nodes_used.begin(), nodes_used.end(), node_size_compare); host_id add_to_host = nodes_used.back().host; nodes_used.back().used += tablet_size; std::push_heap(nodes_used.begin(), nodes_used.end(), node_size_compare); // Add to the least loaded shard on the least loaded node load.next_shard(add_to_host, 1, tablet_size); } // Get the best overcommit from all the nodes min_max_tracker load_minmax; for (const auto& n : nodes_used) { load_minmax.update(load.get_shard_minmax(n.host)); } const uint64_t table_size = table_sizes_per_rack.at(rack).at(table); const double ideal_load = double(table_size) / rack_capacity.at(rack); const double best_overcommit = load_minmax.max() / ideal_load; best_shard_overcommit_per_rack[rack][table] = best_overcommit; } } }; auto check_balance = [&] () -> cluster_balance { cluster_balance res; testlog.debug("tablet metadata: {}", stm.get()->tablets()); compute_best_overcommit(); auto load_stats_p = make_lw_shared(stats); int table_index = 0; for (auto s : {s1, s2}) { auto table = s->id(); load_sketch load(stm.get(), load_stats_p); load.populate(std::nullopt, table).get(); min_max_tracker shard_overcommit_minmax; min_max_tracker node_overcommit_minmax; auto rack = test_rack; auto table_size = table_sizes_per_rack.at(rack).at(table); auto ideal_load = double(table_size) / rack_capacity.at(rack); min_max_tracker shard_load_minmax; min_max_tracker node_load_minmax; for (auto [h, host_dc_rack] : hosts) { if (host_dc_rack.rack != rack) { continue; } auto minmax = load.get_shard_minmax(h); auto node_load = load.get_load(h); auto overcommit = double(minmax.max()) / ideal_load; testlog.info("Load on host {} for table {}: total={}, min={}, max={}, spread={}, ideal={}, overcommit={}", h, s->cf_name(), node_load, minmax.min(), minmax.max(), minmax.max() - minmax.min(), ideal_load, overcommit); node_load_minmax.update(node_load); shard_load_minmax.update(minmax.max()); } auto shard_overcommit = shard_load_minmax.max() / ideal_load; auto best_shard_overcommit = best_shard_overcommit_per_rack.at(rack).at(table); testlog.info("Shard overcommit: {} best: {}", shard_overcommit, best_shard_overcommit); auto node_imbalance = node_load_minmax.max() - node_load_minmax.min(); auto node_overcommit = node_load_minmax.max() / ideal_load; testlog.info("Node imbalance in min={}, max={}, spread={}, ideal={}, overcommit={}", node_load_minmax.min(), node_load_minmax.max(), node_imbalance, ideal_load, node_overcommit); shard_overcommit_minmax.update(shard_overcommit); node_overcommit_minmax.update(node_overcommit); res.tables[table_index++] = { .shard_overcommit = shard_overcommit_minmax.max(), .best_shard_overcommit = best_shard_overcommit, .node_overcommit = node_overcommit_minmax.max(), }; } for (int i = 0; i < nr_tables; i++) { auto t = res.tables[i]; global_res.worst.tables[i].shard_overcommit = std::max(global_res.worst.tables[i].shard_overcommit, t.shard_overcommit); global_res.worst.tables[i].node_overcommit = std::max(global_res.worst.tables[i].node_overcommit, t.node_overcommit); } testlog.info("Overcommit: {}", res); return res; }; testlog.debug("tablet metadata: {}", stm.get()->tablets()); e.get_tablet_allocator().local().set_use_table_aware_balancing(tablet_aware); check_balance(); rebalance_tablets(e, stats); global_res.init = global_res.worst = check_balance(); for (int i = 0; i < cycles; i++) { const auto [id, dc_rack] = hosts[0]; bootstrap(dc_rack); check_balance(); decommission(id); global_res.last = check_balance(); } }, cfg); co_return global_res; } void test_parallel_scaleout(const bpo::variables_map& opts) { const shard_id shard_count = opts["shards"].as(); const int nr_tables = opts["tables"].as(); const int tablets_per_table = opts["tablets_per_table"].as(); const int nr_racks = opts["racks"].as(); const int initial_nodes = nr_racks * opts["nodes-per-rack"].as(); const int extra_nodes = nr_racks * opts["extra-nodes-per-rack"].as(); const double tablet_size_deviation_factor = opts["tablet-size-deviation-factor"].as(); auto cfg = tablet_cql_test_config(); cfg.db_config->rf_rack_valid_keyspaces(true); results global_res; do_with_cql_env_thread([&] (auto& e) { topology_builder topo(e); locator::load_stats stats; std::vector racks; racks.push_back(topo.rack()); for (int i = 1; i < nr_racks; ++i) { racks.push_back(topo.start_new_rack()); } auto add_host = [&] (endpoint_dc_rack rack) { auto host = topo.add_node(service::node_state::normal, shard_count, rack); const uint64_t capacity = default_target_tablet_size * shard_count * 100; stats.capacity[host] = capacity; stats.tablet_stats[host].effective_capacity = capacity; testlog.info("Added new node: {}", host); }; auto add_hosts = [&] (int n) { for (int i = 0; i < n; ++i) { add_host(racks[i % racks.size()]); } }; add_hosts(initial_nodes); testlog.info("Creating schema"); auto ks1 = add_keyspace(e, {{topo.dc(), nr_racks}}, tablets_per_table); seastar::parallel_for_each(std::views::iota(0, nr_tables), [&] (int) -> future<> { return add_table(e, ks1).discard_result(); }).get(); generate_tablet_sizes(tablet_size_deviation_factor, stats, e.shared_token_metadata().local()); testlog.info("Initial rebalancing"); rebalance_tablets(e, stats); testlog.info("Scaleout"); add_hosts(extra_nodes); global_res.stats += rebalance_tablets(e, stats); }, cfg).get(); } future<> run_simulation(const params& p, const sstring& name = "") { testlog.info("[run {}] params: {}", name, p); auto total_tablet_count = p.tablets1.value_or(0) * p.rf1 + p.tablets2.value_or(0) * p.rf2; testlog.info("[run {}] tablet count: {}", name, total_tablet_count); testlog.info("[run {}] tablet count / shard: {:.3f}", name, double(total_tablet_count) / (p.nodes * p.shards)); auto res = co_await test_load_balancing_with_many_tables(p, true); testlog.info("[run {}] Overcommit : init : {}", name, res.init); testlog.info("[run {}] Overcommit : worst: {}", name, res.worst); testlog.info("[run {}] Overcommit : last : {}", name, res.last); testlog.info("[run {}] Overcommit : time : {:.3f} [s], max={:.3f} [s], count={}", name, res.stats.elapsed_time.count(), res.stats.max_rebalance_time.count(), res.stats.rebalance_count); if (res.stats.elapsed_time > seconds_double(1)) { testlog.warn("[run {}] Scheduling took longer than 1s!", name); } auto old_res = co_await test_load_balancing_with_many_tables(p, false); testlog.info("[run {}] Overcommit (old) : init : {}", name, old_res.init); testlog.info("[run {}] Overcommit (old) : worst: {}", name, old_res.worst); testlog.info("[run {}] Overcommit (old) : last : {}", name, old_res.last); testlog.info("[run {}] Overcommit : time : {:.3f} [s], max={:.3f} [s], count={}", name, old_res.stats.elapsed_time.count(), old_res.stats.max_rebalance_time.count(), old_res.stats.rebalance_count); for (int i = 0; i < nr_tables; ++i) { if (res.worst.tables[i].shard_overcommit > old_res.worst.tables[i].shard_overcommit) { testlog.warn("[run {}] table{} shard overcommit worse!", name, i + 1); } auto overcommit = res.worst.tables[i].shard_overcommit; if (overcommit > 1.2) { testlog.warn("[run {}] table{} shard overcommit {:.4f} > 1.2!", name, i + 1, overcommit); } } } future<> run_simulations(const boost::program_options::variables_map& app_cfg) { for (auto i = 0; i < app_cfg["runs"].as(); i++) { constexpr int MIN_RF = 1; constexpr int MAX_RF = 3; auto shards = 1 << tests::random::get_int(0, 8); auto rf1 = tests::random::get_int(MIN_RF, MAX_RF); // FIXME: Once we allow for RF <= #racks (and not just RF == #racks), we can randomize this RF too. // For now, the values must be equal. auto rf2 = rf1; auto scale1 = 1 << tests::random::get_int(0, 5); auto scale2 = 1 << tests::random::get_int(0, 5); auto nodes = tests::random::get_int(rf1 + rf2, 2 * MAX_RF); // results in a deviation factor of 0.0 - 2.0 auto tablet_size_deviation_factor = tests::random::get_int(0, 200) / 100.0; params p { .iterations = app_cfg["iterations"].as(), .nodes = nodes, .tablets1 = std::bit_ceil(div_ceil(shards * nodes, rf1) * scale1), .tablets2 = std::bit_ceil(div_ceil(shards * nodes, rf2) * scale2), .rf1 = rf1, .rf2 = rf2, .shards = shards, .scale1 = scale1, .scale2 = scale2, .tablet_size_deviation_factor = tablet_size_deviation_factor }; auto name = format("#{}", i); co_await run_simulation(p, name); } } namespace perf { void run_add_dec(const bpo::variables_map& opts) { if (opts.contains("runs")) { run_simulations(opts).get(); } else { params p { .iterations = opts["iterations"].as(), .nodes = opts["nodes"].as(), .tablets1 = opts["tablets1"].as(), .tablets2 = opts["tablets2"].as(), .rf1 = opts["rf1"].as(), .rf2 = opts["rf2"].as(), .shards = opts["shards"].as(), .tablet_size_deviation_factor = opts["tablet-size-deviation-factor"].as(), }; run_simulation(p).get(); } } using operation_func = std::function; const std::vector global_options {}; const std::vector global_positional_options{}; const std::map operations_with_func{ { {{"rolling-add-dec", "Sequence of bootstraps and decommissions with two tables", "", { typed_option("runs", "Number of simulation runs."), typed_option("iterations", 8, "Number of topology-changing cycles in each run."), typed_option("tablets1", 512, "Number of tablets for the first table."), typed_option("tablets2", 128, "Number of tablets for the second table."), typed_option("rf1", 1, "Replication factor for the first table."), typed_option("rf2", 1, "Replication factor for the second table."), typed_option("nodes", 3, "Number of nodes in the cluster."), typed_option("shards", 30, "Number of shards per node."), typed_option("tablet-size-deviation-factor", 0.5, "Deviation factor for the tablet size random generator.") } }, &run_add_dec}, {{"parallel-scaleout", "Simulates a single scale-out involving simultaneous addition of multiple nodes per rack", "", { typed_option("tablets_per_table", 256, "Number of tablets per table."), typed_option("tables", 70, "Table count."), typed_option("nodes-per-rack", 5, "Number of initial nodes per rack."), typed_option("extra-nodes-per-rack", 3, "Number of nodes to add per rack."), typed_option("racks", 2, "Number of racks."), typed_option("shards", 88, "Number of shards per node."), typed_option("tablet-size-deviation-factor", 0.5, "Deviation factor for the tablet size random generator.") } }, &test_parallel_scaleout}, } }; int scylla_tablet_load_balancing_main(int argc, char** argv) { const auto operations = operations_with_func | std::views::keys | std::ranges::to(); tool_app_template::config app_cfg{ .name = "perf-load-balancing", .description = "Tests tablet load balancer in various scenarios", .logger_name = testlog.name(), .lsa_segment_pool_backend_size_mb = 100, .operations = std::move(operations), .global_options = &global_options, .global_positional_options = &global_positional_options, .db_cfg_ext = db_config_and_extensions() }; tool_app_template app(std::move(app_cfg)); return app.run_async(argc, argv, [] (const operation& operation, const bpo::variables_map& app_config) { auto stop_test = defer([] { aborted.request_abort(); }); try { operations_with_func.at(operation)(app_config); return 0; } catch (seastar::abort_requested_exception&) { // Ignore } return 1; }); } } // namespace perf