Merge 'load_stats: fix problem with load_stats refresh throwing no_such_column_family' from Ferenc Szili
When the topology coordinator refreshes load_stats, it caches the load_stats for every node. If a node becomes unresponsive and fresh load_stats cannot be read from it, the cached version is used instead, so that the load balancer still has at least some information about the table sizes and disk capacities of that host.

During a load_stats refresh we aggregate the table sizes from all nodes. This procedure calls db.find_column_family() for each table_id found in load_stats, and that function throws if the table is not found, which makes the whole refresh fail. A table may also be dropped between the time load_stats is prepared on a host and the time it is processed on the topology coordinator, which would likewise cause an exception in the refresh procedure.

Fix the problem by checking whether the table still exists before looking it up.

Fixes: #28359

Closes scylladb/scylladb#28440

* github.com:scylladb/scylladb:
  test: add test and reproducer for load_stats refresh exception
  load_stats: handle dropped tables when refreshing load_stats
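The failure mode can be modeled outside of Scylla internals. The following is a minimal, hypothetical Python sketch (the function aggregate_table_sizes, the in-memory db dict, and the sample data are illustrative only, not Scylla APIs) of why aggregating cached per-table stats over tables that may since have been dropped needs an existence guard; the actual change appears in the diff below.

# Hypothetical model of the load_stats refresh aggregation, not Scylla code.
# 'db' stands in for the current schema (tables that still exist); 'cached_stats'
# stands in for load_stats cached from an unresponsive node, which may still
# reference a table that was dropped in the meantime.
def aggregate_table_sizes(db: dict[str, object], cached_stats: dict[str, int]) -> dict[str, int]:
    totals: dict[str, int] = {}
    for table_id, size in cached_stats.items():
        # Without this check, looking up a dropped table would raise KeyError,
        # the analogue of find_column_family() throwing no_such_column_family
        # and aborting the whole refresh.
        if table_id not in db:
            continue
        totals[table_id] = totals.get(table_id, 0) + size
    return totals

if __name__ == "__main__":
    db = {"t1": object()}                 # "t2" was dropped after its stats were cached
    cached_stats = {"t1": 100, "t2": 50}
    print(aggregate_table_sizes(db, cached_stats))  # {'t1': 100}; the refresh no longer fails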
@@ -3876,6 +3876,9 @@ future<> topology_coordinator::refresh_tablet_load_stats() {
         for (auto& [table_id, table_stats] : dc_stats.tables) {
             co_await coroutine::maybe_yield();
 
+            if (!_db.column_family_exists(table_id)) {
+                continue;
+            }
             auto& t = _db.find_column_family(table_id);
             auto& rs = t.get_effective_replication_map()->get_replication_strategy();
             if (!rs.uses_tablets()) {
@@ -3899,6 +3902,9 @@ future<> topology_coordinator::refresh_tablet_load_stats() {
         }
 
         for (auto& [table_id, table_load_stats] : stats.tables) {
+            if (!total_replicas.contains(table_id)) {
+                continue;
+            }
             auto table_total_replicas = total_replicas.at(table_id);
             if (table_total_replicas == 0) {
                 continue;
@@ -60,6 +60,24 @@ async def safe_rolling_restart(manager, servers, with_down):
     cql = await reconnect_driver(manager)
     return cql
 
+async def wait_for_valid_load_stats(cql, table_id, timeout=120):
+    started = time.time()
+    # Wait until the given table has no missing tablet sizes
+    while True:
+        missing_cnt = 0
+        found_cnt = 0
+        for r in await cql.run_async(f"SELECT * FROM system.tablet_sizes WHERE table_id = {table_id};"):
+            found_cnt += 1
+            if len(r.missing_replicas) > 0:
+                missing_cnt += 1
+
+        if missing_cnt == 0 and found_cnt > 0:
+            break
+
+        assert time.time() - started < timeout, "Timed out while waiting for valid load_stats"
+
+        await asyncio.sleep(0.2)
+
 @pytest.mark.asyncio
 async def test_tablet_metadata_propagates_with_schema_changes_in_snapshot_mode(manager: ManagerClient):
     """Test that you can create a table and insert and query data"""
@@ -1922,6 +1940,43 @@ async def test_update_load_stats_after_migration(manager: ManagerClient):
     assert leaving_replica[0] not in replica_hosts, "Leaving replica tablet size is not in load_stats any more"
     assert pending_replica[0] in replica_hosts, "Pending replica tablet size is in load_stats"
 
+@pytest.mark.asyncio
+@pytest.mark.skip_mode('release', 'error injections are not supported in release mode')
+async def test_crash_on_missing_table_from_load_stats(manager: ManagerClient):
+    logger.info('Bootstrapping cluster')
+    cfg = { 'enable_tablets': True,
+            'tablet_load_stats_refresh_interval_in_seconds': 1
+          }
+    cmdline = [
+        '--logger-log-level', 'load_balancer=debug',
+        '--logger-log-level', 'raft_topology=debug',
+        '--smp', '2',
+    ]
+    servers = await manager.servers_add(2, config=cfg, cmdline=cmdline, property_file=[
+        {"dc": "dc1", "rack": "rack1"},
+        {"dc": "dc1", "rack": "rack1"},
+    ])
+
+    cql = manager.get_cql()
+
+    async with new_test_keyspace(manager, f"WITH replication = {{'class': 'NetworkTopologyStrategy', 'replication_factor': 1}}") as ks:
+        await cql.run_async(f"CREATE TABLE {ks}.test (pk int PRIMARY KEY, c int)")
+
+        # Make sure load_stats has been refreshed and that the coordinator has cached load_stats
+        table_id = await manager.get_table_or_view_id(ks, 'test')
+        await wait_for_valid_load_stats(cql, table_id)
+
+        # Kill the non-coordinator node
+        await manager.server_stop_gracefully(servers[1].server_id)
+
+        # Drop the table; this leaves the table size in the cached load_stats on the coordinator
+        await cql.run_async(f"DROP TABLE {ks}.test")
+
+        # Wait for the next load_stats refresh
+        s0_log = await manager.server_open_log(servers[0].server_id)
+        s0_mark = await s0_log.mark()
+        await s0_log.wait_for('raft topology: Refreshed table load stats for all DC', from_mark=s0_mark)
+
 @pytest.mark.asyncio
 @pytest.mark.skip_mode(mode='release', reason='error injections are not supported in release mode')
 async def test_timed_out_reader_after_cleanup(manager: ManagerClient):