diff --git a/service/tablet_allocator.cc b/service/tablet_allocator.cc index bc7a43a720..2f967165a1 100644 --- a/service/tablet_allocator.cc +++ b/service/tablet_allocator.cc @@ -1655,10 +1655,14 @@ public: co_return std::move(plan); } + // Returns the schema and tablet-aware replication strategy for a given table. + // Returns {nullptr, nullptr} if the table has been dropped concurrently (race between + // the token metadata snapshot and the live schema). std::tuple get_schema_and_rs(table_id table) { auto t = _db.get_tables_metadata().get_table_if_exists(table); if (!t) { - on_internal_error(lblogger, format("Table {} does not exist", table)); + lblogger.debug("Table {} no longer exists, skipping", table); + return {nullptr, nullptr}; } auto s = t->schema(); @@ -1673,6 +1677,8 @@ public: return {s, rs}; } + // Returns the tablet-aware replication strategy for a given table, or nullptr + // if the table has been dropped concurrently. const tablet_aware_replication_strategy* get_rs(table_id id) { auto [s, rs] = get_schema_and_rs(id); return rs; @@ -1870,7 +1876,9 @@ public: for (const auto& [table, tables] : _tm->tablets().all_table_groups()) { const auto& tmap = _tm->tablets().get_tablet_map(table); auto [s, rs] = get_schema_and_rs(table); - + if (s == nullptr || rs == nullptr) { + continue; + } auto tablet_options = combine_tablet_options( tables | std::views::transform([&] (table_id table) { return _db.get_tables_metadata().get_table_if_exists(table); }) | std::views::filter([] (auto t) { return t != nullptr; }) @@ -2699,6 +2707,10 @@ public: std::unordered_map rack_load; auto rs = get_rs(tablet.table); + if (rs == nullptr) { + // Table was dropped concurrently. Skip this tablet. + return skip_info{}; + } auto get_viable_targets = [&] () { std::unordered_set viable_targets; diff --git a/test/boost/tablets_test.cc b/test/boost/tablets_test.cc index a7ea41f444..2fcc064b24 100644 --- a/test/boost/tablets_test.cc +++ b/test/boost/tablets_test.cc @@ -6207,4 +6207,61 @@ SEASTAR_THREAD_TEST_CASE(test_get_secondary_replica) { topo.clear_gently().get(); } +SEASTAR_THREAD_TEST_CASE(test_load_balancing_with_dropped_table) { + // Verifies that balance_tablets() gracefully handles a table that exists + // in the token metadata snapshot but has been dropped from the live schema. + // This simulates the race where a DROP TABLE is applied between yield + // points during load balancer planning. + do_with_cql_env_thread([] (auto& e) { + topology_builder topo(e); + + unsigned shard_count = 2; + auto host1 = topo.add_node(node_state::normal, shard_count); + auto host2 = topo.add_node(node_state::normal, shard_count); + auto host3 = topo.add_node(node_state::normal, shard_count); + + auto ks_name = add_keyspace(e, {{topo.dc(), 1}}, 4); + auto table1 = add_table(e, ks_name).get(); + + mutate_tablets(e, [&] (tablet_metadata& tmeta) -> future<> { + tablet_map tmap(4); + auto tid = tmap.first_tablet(); + tmap.set_tablet(tid, tablet_info{tablet_replica_set{tablet_replica{host1, 0}}}); + tid = *tmap.next_tablet(tid); + tmap.set_tablet(tid, tablet_info{tablet_replica_set{tablet_replica{host1, 1}}}); + tid = *tmap.next_tablet(tid); + tmap.set_tablet(tid, tablet_info{tablet_replica_set{tablet_replica{host2, 0}}}); + tid = *tmap.next_tablet(tid); + tmap.set_tablet(tid, tablet_info{tablet_replica_set{tablet_replica{host2, 1}}}); + tmeta.set_tablet_map(table1, std::move(tmap)); + co_return; + }); + + auto& stm = e.shared_token_metadata().local(); + + shared_load_stats& load_stats = topo.get_shared_load_stats(); + load_stats.set_default_tablet_sizes(stm.get()); + + // Capture the token metadata snapshot while the table still exists. + auto stale_tm = stm.get(); + + // Drop the table from the live schema. The stale snapshot still has + // the table's tablet map, simulating the race condition. + e.execute_cql(fmt::format("DROP TABLE \"{}\".\"{}\"", ks_name, table1.to_sstring())).get(); + + // balance_tablets should handle the stale table gracefully without + // throwing or aborting. + auto& talloc = e.get_tablet_allocator().local(); + auto& topology = e.get_topology_state_machine().local()._topology; + auto& sys_ks = e.get_system_keyspace().local(); + auto plan = talloc.balance_tablets(stale_tm, &topology, &sys_ks, + load_stats.get(), {}).get(); + + // No migrations should reference the dropped table. + for (auto& mig : plan.migrations()) { + BOOST_REQUIRE_NE(mig.tablet.table, table1); + } + }).get(); +} + BOOST_AUTO_TEST_SUITE_END()