replica/table: improve error message when encountering orphaned sstables

On startup, if a server reads an sstable that belongs to a tablet that
doesn't have any local replica, it throws an error in the following
format and refuses to start:

```
Storage wasn't found for tablet 1 of table test.test
```

This patch updates the code path to throw a nicer error that includes
the sstable name that caused the problem.

This patch also adds a test case to verify the error being thrown.

Fixes #18038
This commit is contained in:
Lakshmi Narayanan Sreethar
2024-12-05 20:22:34 +05:30
parent 6c90a25014
commit fa10b0b390
2 changed files with 59 additions and 0 deletions

View File

@@ -1107,6 +1107,7 @@ compaction_group& tablet_storage_group_manager::compaction_group_for_sstable(con
sst->get_filename(), first_id, last_id));
}
try {
auto& sg = storage_group_for_id(first_id);
if (first_range_side != last_range_side) {
@@ -1114,6 +1115,9 @@ compaction_group& tablet_storage_group_manager::compaction_group_for_sstable(con
}
return *sg.select_compaction_group(first_range_side);
} catch (std::out_of_range& e) {
on_internal_error(tlogger, format("Unable to load SSTable {} : {}", sst->get_filename(), e.what()));
}
}
compaction_group& table::compaction_group_for_sstable(const sstables::shared_sstable& sst) const noexcept {

View File

@@ -21,6 +21,7 @@ import requests
import random
import os
import glob
import shutil
logger = logging.getLogger(__name__)
@@ -571,3 +572,57 @@ async def test_tablet_streaming_with_staged_sstables(manager: ManagerClient):
# Verify that the view has the expected number of rows
rows = await cql.run_async("SELECT c from test.mv1")
assert len(list(rows)) == expected_num_of_rows
@pytest.mark.asyncio
async def test_orphaned_sstables_on_startup(manager: ManagerClient):
    """
    Verify that a node refuses to start when it finds an sstable belonging to a
    tablet that has no local replica, and that the error names the sstable.

    Reproducer for https://github.com/scylladb/scylladb/issues/18038
    1) Start a node (node1)
    2) Create a table with 2 initial tablets and populate it
    3) Start another node (node2)
    4) Migrate one tablet from node1 to node2
    5) Stop node1
    6) Copy the sstables from node2 to node1
    7) Attempting to start node1 should fail as it now has an 'orphaned' sstable
    """
    logger.info("Starting Node 1")
    cfg = {'enable_user_defined_functions': False, 'enable_tablets': True}
    cmdline = [
        '--logger-log-level', 'storage_service=debug',
        '--logger-log-level', 'raft_topology=debug',
    ]
    servers = [await manager.server_add(cmdline=cmdline, config=cfg)]
    # Keep tablet placement deterministic: without this, the load balancer
    # could move tablets on its own and invalidate the manual migration below.
    await manager.api.disable_tablet_balancing(servers[0].ip_addr)

    logger.info("Create the test table, populate few rows and flush to disk")
    cql = manager.get_cql()
    await cql.run_async("CREATE KEYSPACE test WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} AND tablets = {'initial': 2};")
    await cql.run_async("CREATE TABLE test.test (pk int PRIMARY KEY, c int);")
    await asyncio.gather(*[cql.run_async(f"INSERT INTO test.test (pk, c) VALUES ({k}, {k%3});") for k in range(256)])
    # Flush so the data lands in sstables on disk (the files we copy later).
    await manager.api.keyspace_flush(servers[0].ip_addr, "test", "test")

    # NOTE: node0_* variables refer to servers[0], which the log messages call
    # "Node 1"; node1_* refer to servers[1] ("Node 2").
    node0_workdir = await manager.server_get_workdir(servers[0].server_id)
    node0_table_dir = glob.glob(os.path.join(node0_workdir, "data", "test", "test-*"))[0]

    logger.info("Start Node 2")
    servers.append(await manager.server_add(cmdline=cmdline, config=cfg))
    await manager.api.disable_tablet_balancing(servers[1].ip_addr)
    node1_workdir = await manager.server_get_workdir(servers[1].server_id)
    node1_table_dir = glob.glob(os.path.join(node1_workdir, "data", "test", "test-*"))[0]
    s1_host_id = await manager.get_host_id(servers[1].server_id)

    logger.info("Migrate the tablet from node1 to node2")
    # Any token works here; 0 selects whichever tablet owns that token.
    tablet_token = 0
    replica = await get_tablet_replica(manager, servers[0], 'test', 'test', tablet_token)
    await manager.api.move_tablet(servers[0].ip_addr, "test", "test", replica[0], replica[1], s1_host_id, 0, tablet_token)
    logger.info("Migration done")

    logger.info("Stop node1 and copy the sstables from node2")
    await manager.server_stop(servers[0].server_id)
    # "me-*" matches sstable component files; copying node2's files into
    # node1's table directory plants sstables for a tablet node1 no longer owns.
    for src_path in glob.glob(os.path.join(node1_table_dir, "me-*")):
        dst_path = os.path.join(node0_table_dir, os.path.basename(src_path))
        shutil.copy(src_path, dst_path)

    # try starting the server again
    logger.info("Start node1 with the orphaned sstables and expect it to fail")
    # Error thrown is of format : "Unable to load SSTable {sstable_name} : Storage wasn't found for tablet {tablet_id} of table test.test"
    await manager.server_start(servers[0].server_id, expected_error="Storage wasn't found for tablet")