Merge '[Backport 2026.2] load_balancer: apply balance threshold to intranode shard balancing' from Scylladb[bot]

- Fix intranode shard balancing to respect the size-based balance threshold, preventing unnecessary migrations when load difference between shards is negligible
- Add a regression test that verifies the threshold is respected for intranode balancing

The intranode shard balancing loop only stopped when the algorithm exhausted the migration candidates or when a migration would go against convergence (it would increase imbalance instead of decrease it). This caused unnecessary tablet migrations for negligible imbalances (e.g., 0.78% difference between shards).

The inter-node balancer already uses `is_balanced()` to stop when the relative load difference is within the configured `size_based_balance_threshold`, but this check was missing from the intranode path.

Apply the same `is_balanced()` threshold check that is already used for inter-node balancing to the intranode convergence loop. When the relative load difference between the most-loaded and least-loaded shards on a node is within the threshold, the balancer now stops without issuing further migrations.

The test creates a single node with 2 shards and 512 tablets:
1. **Balanced scenario** (257 vs 255 tablets, same size): relative diff = 0.78% < 1% threshold → verifies no intranode migration is emitted
2. **Unbalanced scenario** (307 vs 205 tablets, same size): relative diff = 33% >> 1% threshold → verifies intranode migration IS emitted

Fixes: SCYLLADB-2006

This is a performance improvement which reduces the number of intranode migrations issued, and needs to be backported to versions with size-based load balancing: 2026.1 and 2026.2

- (cherry picked from commit aaead10e5d)
- (cherry picked from commit 6856f51097)

Parent PR: #29756

Closes scylladb/scylladb#29895

* github.com:scylladb/scylladb:
  test: add test for intranode balance threshold in size-based mode
  tablet_allocator: apply balance threshold to intranode shard balancing
This commit is contained in:
Botond Dénes
2026-05-22 15:07:33 +03:00
2 changed files with 129 additions and 1 deletions

View File

@@ -3208,7 +3208,16 @@ public:
// Convergence check
// When in shuffle mode, exit condition is guaranteed by running out of candidates or by load limit.
if (!shuffle && src == dst) {
auto node_is_balanced = [&] {
auto min_load = node_load.shard_load(dst);
auto max_load = node_load.shard_load(src);
// We can't compute accurate load without disk capacity, so stop balancing this node
if (!min_load || !max_load) {
return true;
}
return is_balanced(*min_load, *max_load);
};
if (!shuffle && (src == dst || node_is_balanced())) {
lblogger.debug("Node {} is balanced", host);
break;
}

View File

@@ -5913,6 +5913,125 @@ SEASTAR_THREAD_TEST_CASE(test_drain_node_without_capacity) {
}).get();
}
// Verifies that the intranode shard balancer respects the size-based balance threshold
// and does not issue migrations when the load difference between shards is within the threshold.
// Without the threshold check, the balancer would keep issuing migrations of small tablets
// from the slightly-heavier shard until load difference reaches exactly 0, or it can't find
// a tablet to migrate because all migrations would go against convergence.
SEASTAR_THREAD_TEST_CASE(test_intranode_balance_threshold) {
auto cfg = tablet_cql_test_config();
// Set the balance threshold explicitly
cfg.db_config->size_based_balance_threshold_percentage.set(1.0);
do_with_cql_env_thread([] (auto& e) {
logging::logger_registry().set_logger_level("load_balancer", logging::log_level::debug);
topology_builder topo(e);
// Single node with 2 shards, RF=1. This isolates intranode balancing.
const unsigned shard_count = 2;
auto host1 = topo.add_node(node_state::normal, shard_count);
// Set up capacity for size-based balancing mode.
// Each shard has 100GB capacity, so the node has 200GB total.
const uint64_t shard_capacity = 100UL * 1024UL * 1024UL * 1024UL;
const uint64_t node_capacity = shard_capacity * shard_count;
topo.get_shared_load_stats().set_capacity(host1, node_capacity);
// Create a table with 512 tablets.
// Place 257 on shard 0 and 255 on shard 1, all with the same size.
// This gives:
// Relative load diff = (257 - 255) / 257 = 0.78% < 1% threshold
// But each individual tablet passes the per-tablet convergence check:
// 257*S > 255*S + S => 257 > 256 => true
// So without the threshold fix, the balancer issues a migration.
// With the fix, it recognizes the node is balanced and stops.
const size_t tablet_count = 512;
size_t tablets_on_shard0 = 257;
size_t tablets_on_shard1 = 255;
const uint64_t tablet_size = 100UL * 1024UL * 1024UL; // 100MB each
auto ks_name = add_keyspace(e, {{topo.dc(), 1}}, tablet_count);
auto table1 = add_table(e, ks_name).get();
auto& stm = e.shared_token_metadata().local();
auto& load_stats = topo.get_shared_load_stats();
auto mutate_tmap = [&] (tablet_metadata& tmeta) -> future<> {
tablet_map tmap(tablet_count);
auto tid = tmap.first_tablet();
for (size_t i = 0; i < tablets_on_shard0; ++i) {
tmap.set_tablet(tid, tablet_info {
tablet_replica_set { tablet_replica {host1, 0} }
});
if (i < tablets_on_shard0 - 1) {
tid = *tmap.next_tablet(tid);
}
}
for (size_t i = 0; i < tablets_on_shard1; ++i) {
tid = *tmap.next_tablet(tid);
tmap.set_tablet(tid, tablet_info {
tablet_replica_set { tablet_replica {host1, 1} }
});
}
tmeta.set_tablet_map(table1, std::move(tmap));
co_return;
};
mutate_tablets(e, mutate_tmap);
// Set uniform per-tablet sizes
auto& tmap = stm.get()->tablets().get_tablet_map(table1);
tmap.for_each_tablet([&] (tablet_id tid, const tablet_info& tinfo) {
locator::range_based_tablet_id rb_tid {table1, tmap.get_token_range(tid)};
load_stats.set_tablet_size(host1, rb_tid, tablet_size);
return make_ready_future<>();
}).get();
load_stats.set_size(table1, tablet_count * tablet_size);
// Call the balancer once and verify NO intranode migrations in the plan.
auto& talloc = e.get_tablet_allocator().local();
auto& topology = e.get_topology_state_machine().local()._topology;
auto& sys_ks = e.get_system_keyspace().local();
{
auto plan = talloc.balance_tablets(stm.get(), &topology, &sys_ks, load_stats.get()).get();
for (auto&& mig : plan.migrations()) {
BOOST_REQUIRE_MESSAGE(mig.kind != tablet_transition_kind::intranode_migration,
"Unexpected intranode migration when load difference is within threshold");
}
}
// Now create a clearly unbalanced scenario: 307 vs 205 tablets.
// Relative diff = (307 - 205) / 307 = 33% >> 1% threshold
tablets_on_shard0 = 307;
tablets_on_shard1 = 205;
mutate_tablets(e, mutate_tmap);
// Update tablet sizes for new layout
auto& tmap2 = stm.get()->tablets().get_tablet_map(table1);
tmap2.for_each_tablet([&] (tablet_id tid, const tablet_info& tinfo) {
locator::range_based_tablet_id rb_tid {table1, tmap2.get_token_range(tid)};
load_stats.set_tablet_size(host1, rb_tid, tablet_size);
return make_ready_future<>();
}).get();
// Call balancer once - this time intranode migrations SHOULD be emitted.
{
auto plan = talloc.balance_tablets(stm.get(), &topology, &sys_ks, load_stats.get()).get();
bool saw_intranode_migration = false;
for (auto&& mig : plan.migrations()) {
if (mig.kind == tablet_transition_kind::intranode_migration) {
saw_intranode_migration = true;
break;
}
}
BOOST_REQUIRE_MESSAGE(saw_intranode_migration,
"Expected intranode migration when load difference exceeds threshold");
}
}, cfg).get();
}
SEASTAR_THREAD_TEST_CASE(test_tablet_range_splitter) {
simple_schema ss;