From 8a71a5ba886feaf368c9bef493547310f6029c72 Mon Sep 17 00:00:00 2001 From: Ferenc Szili Date: Tue, 5 May 2026 13:19:50 +0200 Subject: [PATCH 1/2] tablet_allocator: apply balance threshold to intranode shard balancing The intranode shard balancing loop only stopped when the most-loaded and least-loaded shard were the same (src == dst), meaning it would keep issuing migrations until the load difference reached exactly 0. This caused unnecessary migrations for negligible imbalances. Apply the same is_balanced() threshold check that is already used for inter-node balancing, so that intranode migrations stop when the relative load difference between shards is within the configured size_based_balance_threshold (default 1%). (cherry picked from commit aaead10e5deaf5abdf7838c0e8427e64d66793e9) --- service/tablet_allocator.cc | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/service/tablet_allocator.cc b/service/tablet_allocator.cc index a0aa421885..5c822eed30 100644 --- a/service/tablet_allocator.cc +++ b/service/tablet_allocator.cc @@ -3207,7 +3207,16 @@ public: // Convergence check // When in shuffle mode, exit condition is guaranteed by running out of candidates or by load limit. - if (!shuffle && src == dst) { + auto node_is_balanced = [&] { + auto min_load = node_load.shard_load(dst); + auto max_load = node_load.shard_load(src); + // We can't compute accurate load without disk capacity, so stop balancing this node + if (!min_load || !max_load) { + return true; + } + return is_balanced(*min_load, *max_load); + }; + if (!shuffle && (src == dst || node_is_balanced())) { lblogger.debug("Node {} is balanced", host); break; } From 21f0ef209b8dd1e38ceb48a2f41fe094e8990621 Mon Sep 17 00:00:00 2001 From: Ferenc Szili Date: Tue, 5 May 2026 13:22:18 +0200 Subject: [PATCH 2/2] test: add test for intranode balance threshold in size-based mode Verify that the load balancer does not issue intranode migrations when the load difference between shards is within the size_based_balance_threshold, and that it does issue migrations when the difference exceeds the threshold. (cherry picked from commit 6856f510970582cee10dcef42c46dce621b90ae6) --- test/boost/tablets_test.cc | 119 +++++++++++++++++++++++++++++++++++++ 1 file changed, 119 insertions(+) diff --git a/test/boost/tablets_test.cc b/test/boost/tablets_test.cc index 0bd339a3a0..035f5836a4 100644 --- a/test/boost/tablets_test.cc +++ b/test/boost/tablets_test.cc @@ -5913,6 +5913,125 @@ SEASTAR_THREAD_TEST_CASE(test_drain_node_without_capacity) { }).get(); } +// Verifies that the intranode shard balancer respects the size-based balance threshold +// and does not issue migrations when the load difference between shards is within the threshold. +// Without the threshold check, the balancer would keep issuing migrations of small tablets +// from the slightly-heavier shard until load difference reaches exactly 0, or it can't find +// a tablet to migrate because all migrations would go against convergence. +SEASTAR_THREAD_TEST_CASE(test_intranode_balance_threshold) { + auto cfg = tablet_cql_test_config(); + // Set the balance threshold explicitly + cfg.db_config->size_based_balance_threshold_percentage.set(1.0); + do_with_cql_env_thread([] (auto& e) { + logging::logger_registry().set_logger_level("load_balancer", logging::log_level::debug); + + topology_builder topo(e); + + // Single node with 2 shards, RF=1. This isolates intranode balancing. + const unsigned shard_count = 2; + auto host1 = topo.add_node(node_state::normal, shard_count); + + // Set up capacity for size-based balancing mode. + // Each shard has 100GB capacity, so the node has 200GB total. + const uint64_t shard_capacity = 100UL * 1024UL * 1024UL * 1024UL; + const uint64_t node_capacity = shard_capacity * shard_count; + topo.get_shared_load_stats().set_capacity(host1, node_capacity); + + // Create a table with 512 tablets. + // Place 257 on shard 0 and 255 on shard 1, all with the same size. + // This gives: + // Relative load diff = (257 - 255) / 257 = 0.78% < 1% threshold + // But each individual tablet passes the per-tablet convergence check: + // 257*S > 255*S + S => 257 > 256 => true + // So without the threshold fix, the balancer issues a migration. + // With the fix, it recognizes the node is balanced and stops. + const size_t tablet_count = 512; + size_t tablets_on_shard0 = 257; + size_t tablets_on_shard1 = 255; + const uint64_t tablet_size = 100UL * 1024UL * 1024UL; // 100MB each + + auto ks_name = add_keyspace(e, {{topo.dc(), 1}}, tablet_count); + auto table1 = add_table(e, ks_name).get(); + + auto& stm = e.shared_token_metadata().local(); + auto& load_stats = topo.get_shared_load_stats(); + + auto mutate_tmap = [&] (tablet_metadata& tmeta) -> future<> { + tablet_map tmap(tablet_count); + auto tid = tmap.first_tablet(); + for (size_t i = 0; i < tablets_on_shard0; ++i) { + tmap.set_tablet(tid, tablet_info { + tablet_replica_set { tablet_replica {host1, 0} } + }); + if (i < tablets_on_shard0 - 1) { + tid = *tmap.next_tablet(tid); + } + } + for (size_t i = 0; i < tablets_on_shard1; ++i) { + tid = *tmap.next_tablet(tid); + tmap.set_tablet(tid, tablet_info { + tablet_replica_set { tablet_replica {host1, 1} } + }); + } + tmeta.set_tablet_map(table1, std::move(tmap)); + co_return; + }; + + mutate_tablets(e, mutate_tmap); + + // Set uniform per-tablet sizes + auto& tmap = stm.get()->tablets().get_tablet_map(table1); + tmap.for_each_tablet([&] (tablet_id tid, const tablet_info& tinfo) { + locator::range_based_tablet_id rb_tid {table1, tmap.get_token_range(tid)}; + load_stats.set_tablet_size(host1, rb_tid, tablet_size); + return make_ready_future<>(); + }).get(); + load_stats.set_size(table1, tablet_count * tablet_size); + + // Call the balancer once and verify NO intranode migrations in the plan. + auto& talloc = e.get_tablet_allocator().local(); + auto& topology = e.get_topology_state_machine().local()._topology; + auto& sys_ks = e.get_system_keyspace().local(); + + { + auto plan = talloc.balance_tablets(stm.get(), &topology, &sys_ks, load_stats.get()).get(); + for (auto&& mig : plan.migrations()) { + BOOST_REQUIRE_MESSAGE(mig.kind != tablet_transition_kind::intranode_migration, + "Unexpected intranode migration when load difference is within threshold"); + } + } + + // Now create a clearly unbalanced scenario: 307 vs 205 tablets. + // Relative diff = (307 - 205) / 307 = 33% >> 1% threshold + tablets_on_shard0 = 307; + tablets_on_shard1 = 205; + + mutate_tablets(e, mutate_tmap); + + // Update tablet sizes for new layout + auto& tmap2 = stm.get()->tablets().get_tablet_map(table1); + tmap2.for_each_tablet([&] (tablet_id tid, const tablet_info& tinfo) { + locator::range_based_tablet_id rb_tid {table1, tmap2.get_token_range(tid)}; + load_stats.set_tablet_size(host1, rb_tid, tablet_size); + return make_ready_future<>(); + }).get(); + + // Call balancer once - this time intranode migrations SHOULD be emitted. + { + auto plan = talloc.balance_tablets(stm.get(), &topology, &sys_ks, load_stats.get()).get(); + bool saw_intranode_migration = false; + for (auto&& mig : plan.migrations()) { + if (mig.kind == tablet_transition_kind::intranode_migration) { + saw_intranode_migration = true; + break; + } + } + BOOST_REQUIRE_MESSAGE(saw_intranode_migration, + "Expected intranode migration when load difference exceeds threshold"); + } + }, cfg).get(); +} + SEASTAR_THREAD_TEST_CASE(test_tablet_range_splitter) { simple_schema ss;