mirror of
https://github.com/scylladb/scylladb.git
synced 2026-05-12 19:02:12 +00:00
Merge 'load_balance: fix drain with forced capacity-based balancing' from Ferenc Szili
When `force_capacity_based_balancing` is enabled and a node is being drained/excluded, the tablet allocator incorrectly aborts balancing due to incomplete tablet stats, even though capacity-based balancing doesn't depend on tablet sizes. The tablet allocator normally waits for complete load stats before balancing. An exception exists for drained+excluded nodes (they're unreachable and won't return stats). However, when forced capacity-based balancing is active, this exception was not being applied, causing the balancer to reject the drain plan. Adjust the condition in `tablet_allocator.cc` so that the "ignore missing data for drained nodes" logic applies regardless of whether capacity-based balancing is forced. Add a Boost unit test that forces capacity-based balancing and verifies that a drained/excluded node gets its tablets migrated even when tablet size stats are missing. This bug was introduced in 2026.1, so this needs to be backported to 2026.1 and 2026.2. Fixes: SCYLLADB-1803 Closes scylladb/scylladb#29791 * github.com:scylladb/scylladb: test: boost: add drain test for forced capacity-based balancing; service: allow draining with forced capacity-based balancing
This commit is contained in:
@@ -4261,10 +4261,10 @@ public:
|
||||
}
|
||||
}
|
||||
|
||||
// For size based balancing, only excluded nodes are allowed to have incomplete tablet stats
|
||||
// Only excluded nodes are allowed to have incomplete tablet stats
|
||||
for (auto& [host, node] : nodes) {
|
||||
if (!_load_sketch->has_complete_data(host)) {
|
||||
if (!_force_capacity_based_balancing && node.drained && node.node->is_excluded()) {
|
||||
if (node.drained && node.node->is_excluded()) {
|
||||
_load_sketch->ignore_incomplete_data(host);
|
||||
} else {
|
||||
lblogger.info("Cannot balance because node {} (or more) has incomplete tablet stats", host);
|
||||
|
||||
@@ -4136,6 +4136,58 @@ SEASTAR_THREAD_TEST_CASE(test_load_balancing_with_asymmetric_node_capacity) {
|
||||
}).get();
|
||||
}
|
||||
|
||||
SEASTAR_THREAD_TEST_CASE(test_drain_with_forced_capacity_based_balancing_with_incomplete_data) {
|
||||
auto cfg = tablet_cql_test_config();
|
||||
cfg.db_config->force_capacity_based_balancing.set(true);
|
||||
|
||||
do_with_cql_env_thread([] (auto& e) {
|
||||
logging::logger_registry().set_logger_level("load_balancer", logging::log_level::debug);
|
||||
topology_builder topo(e);
|
||||
|
||||
auto host1 = topo.add_node(node_state::removing, 8);
|
||||
e.get_storage_service().local().mark_excluded({host1}).get();
|
||||
auto host2 = topo.add_node(node_state::normal, 1);
|
||||
auto host3 = topo.add_node(node_state::normal, 7);
|
||||
|
||||
const uint64_t capacity_unit = 100UL * 1024UL * 1024UL * 1024UL;
|
||||
topo.get_shared_load_stats().set_capacity(host2, capacity_unit);
|
||||
topo.get_shared_load_stats().set_capacity(host3, capacity_unit * 7);
|
||||
|
||||
auto ks_name = add_keyspace(e, {{topo.dc(), 1}}, 16);
|
||||
auto table1 = add_table(e, ks_name).get();
|
||||
|
||||
mutate_tablets(e, [&] (tablet_metadata& tmeta) -> future<> {
|
||||
tablet_map tmap(16);
|
||||
for (auto tid: tmap.tablet_ids()) {
|
||||
tmap.set_tablet(tid, tablet_info {
|
||||
tablet_replica_set {
|
||||
tablet_replica {host1, 0},
|
||||
}
|
||||
});
|
||||
}
|
||||
tmeta.set_tablet_map(table1, std::move(tmap));
|
||||
co_return;
|
||||
});
|
||||
|
||||
auto until_nodes_drained = [] (const migration_plan& plan) {
|
||||
return !plan.has_nodes_to_drain();
|
||||
};
|
||||
|
||||
auto& stm = e.shared_token_metadata().local();
|
||||
|
||||
rebalance_tablets(e, &topo.get_shared_load_stats(), {}, until_nodes_drained);
|
||||
|
||||
load_sketch load(stm.get());
|
||||
load.populate().get();
|
||||
|
||||
for (auto h: {host2, host3}) {
|
||||
testlog.info("Checking host {}", h);
|
||||
BOOST_REQUIRE_EQUAL(load.get_avg_tablet_count(h), 2); // 16 tablets / 8 shards = 2 tablets / shard
|
||||
BOOST_REQUIRE_EQUAL(load.get_shard_tablet_count_imbalance(h), 0);
|
||||
}
|
||||
}, std::move(cfg)).get();
|
||||
}
|
||||
|
||||
SEASTAR_THREAD_TEST_CASE(test_load_balancer_disabling) {
|
||||
do_with_cql_env_thread([] (auto& e) {
|
||||
topology_builder topo(e);
|
||||
|
||||
Reference in New Issue
Block a user