From 71be10b8d637f935aede0046805c1d59a17e983e Mon Sep 17 00:00:00 2001 From: Ferenc Szili Date: Thu, 29 Jan 2026 16:48:40 +0100 Subject: [PATCH 1/2] load_stats: handle dropped tables when refreshing load_stats When the topology coordinator refreshes load_stats, it caches load_stats for every node. In case the node becomes unresponsive and fresh load_stats cannot be read from the node, the cached version of load_stats will be used. This is to allow the load balancer to have at least some information about the table sizes and disk capacities of the host. During load_stats refresh, we aggregate the table sizes from all the nodes. This procedure calls db.find_column_family() for each table_id found in load_stats. This function will throw if the table is not found. This will cause load_stats refresh to fail. It is also possible for a table to have been dropped between the time load_stats has been prepared on the host, and the time it is processed on the topology coordinator. This would also cause an exception in the refresh procedure. This patch fixes this problem by checking if the table still exists. 
--- service/topology_coordinator.cc | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/service/topology_coordinator.cc b/service/topology_coordinator.cc index bd78e46c80..6c5ed363c6 100644 --- a/service/topology_coordinator.cc +++ b/service/topology_coordinator.cc @@ -3876,6 +3876,9 @@ future<> topology_coordinator::refresh_tablet_load_stats() { for (auto& [table_id, table_stats] : dc_stats.tables) { co_await coroutine::maybe_yield(); + if (!_db.column_family_exists(table_id)) { + continue; + } auto& t = _db.find_column_family(table_id); auto& rs = t.get_effective_replication_map()->get_replication_strategy(); if (!rs.uses_tablets()) { @@ -3899,6 +3902,9 @@ future<> topology_coordinator::refresh_tablet_load_stats() { } for (auto& [table_id, table_load_stats] : stats.tables) { + if (!total_replicas.contains(table_id)) { + continue; + } auto table_total_replicas = total_replicas.at(table_id); if (table_total_replicas == 0) { continue; From 92dbde54a592bf0fa2126b4848b7bba3c1b74c3a Mon Sep 17 00:00:00 2001 From: Ferenc Szili Date: Thu, 29 Jan 2026 17:01:09 +0100 Subject: [PATCH 2/2] test: add test and reproducer for load_stats refresh exception This patch adds a test and reproducer for the issue where the load_stats refresh procedure throws exceptions if any of the tables have been dropped since load_stats was produced. 
--- test/cluster/test_tablets2.py | 55 +++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) diff --git a/test/cluster/test_tablets2.py b/test/cluster/test_tablets2.py index 834338e4dc..ed3386027f 100644 --- a/test/cluster/test_tablets2.py +++ b/test/cluster/test_tablets2.py @@ -60,6 +60,24 @@ async def safe_rolling_restart(manager, servers, with_down): cql = await reconnect_driver(manager) return cql +async def wait_for_valid_load_stats(cql, table_id, timeout=120): + started = time.time() + # Wait until the given table has no missing tablet sizes + while True: + missing_cnt = 0 + found_cnt = 0 + for r in await cql.run_async(f"SELECT * FROM system.tablet_sizes WHERE table_id = {table_id};"): + found_cnt += 1 + if len(r.missing_replicas) > 0: + missing_cnt += 1 + + if missing_cnt == 0 and found_cnt > 0: + break + + assert time.time() - started < timeout, "Timed out while waiting for valid load_stats" + + await asyncio.sleep(0.2) + @pytest.mark.asyncio async def test_tablet_metadata_propagates_with_schema_changes_in_snapshot_mode(manager: ManagerClient): """Test that you can create a table and insert and query data""" @@ -1922,6 +1940,43 @@ async def test_update_load_stats_after_migration(manager: ManagerClient): assert leaving_replica[0] not in replica_hosts, "Leaving replica tablet size is not in load_stats any more" assert pending_replica[0] in replica_hosts, "Pending replica tablet size is in load_stats" +@pytest.mark.asyncio +@pytest.mark.skip_mode('release', 'error injections are not supported in release mode') +async def test_crash_on_missing_table_from_load_stats(manager: ManagerClient): + logger.info('Bootstrapping cluster') + cfg = { 'enable_tablets': True, + 'tablet_load_stats_refresh_interval_in_seconds': 1 + } + cmdline = [ + '--logger-log-level', 'load_balancer=debug', + '--logger-log-level', 'raft_topology=debug', + '--smp', '2', + ] + servers = await manager.servers_add(2, config=cfg, cmdline=cmdline, property_file=[ + {"dc": "dc1", 
"rack": "rack1"}, + {"dc": "dc1", "rack": "rack1"}, + ]) + + cql = manager.get_cql() + + async with new_test_keyspace(manager, f"WITH replication = {{'class': 'NetworkTopologyStrategy', 'replication_factor': 1}}") as ks: + await cql.run_async(f"CREATE TABLE {ks}.test (pk int PRIMARY KEY, c int)") + + # Make sure load_stats has been refreshed and that the coordinator has cached load_stats + table_id = await manager.get_table_or_view_id(ks, 'test') + await wait_for_valid_load_stats(cql, table_id) + + # Kill the non-coordinator node + await manager.server_stop_gracefully(servers[1].server_id) + + # Drop the table; this leaves the table size in the cached load_stats on the coordinator + await cql.run_async(f"DROP TABLE {ks}.test") + + # Wait for the next load_stats refresh + s0_log = await manager.server_open_log(servers[0].server_id) + s0_mark = await s0_log.mark() + await s0_log.wait_for('raft topology: Refreshed table load stats for all DC', from_mark=s0_mark) + @pytest.mark.asyncio @pytest.mark.skip_mode(mode='release', reason='error injections are not supported in release mode') async def test_timed_out_reader_after_cleanup(manager: ManagerClient):