Merge 'strong consistency: fix drop table blocking on stuck writes and handle timeout in update()' from Petr Gusev

- Fix table drop blocking for the full client timeout when in-flight writes can't reach quorum - Handle unhandled timeout exception in the wait-for-leader loop during group startup When a strongly consistent table is dropped, `schedule_raft_group_deletion`() calls `g->close()` which waits for all in-flight operations to release their gate holders. But other nodes may have already destroyed their raft servers for this group, so an in-flight write on the leader cannot reach quorum and hangs until the client timeout expires (~seconds), unnecessarily delaying group deletion. Additionally, the wait-for-leader loop in groups_manager::update() uses abort_on_expiry with a 60-second timeout but never catches the exception if it fires, leaving the group in an indeterminate state. SCYLLADB-2080 fix: - Reorder `schedule_raft_group_deletion`: initiate gate close (prevents new operations), then abort the raft server (unblocks stuck writes by causing `raft::stopped_error`), then await the gate future (resolves immediately since holders are released). - Handle `raft::stopped_error` in the coordinator's top-level catch blocks (both write and read paths): if the table no longer exists, return `no_such_column_family` (CQL layer converts to InvalidRequest: unconfigured table). Otherwise fall through to the default timeout handling. - Replace gate->hold() with try_hold() + on_internal_error in acquire_server, with a comment explaining why the gate can never be closed at that point (table removal in `schema_applier::commit_on_shard` precedes gate closure, with no scheduling point in between). Timeout handling fix: - Use `coroutine::as_future` in the wait-for-leader loop to catch timeout exceptions gracefully — log a warning and break out instead of propagating unhandled. Includes a cluster test reproducer (test_drop_table_unblocks_stuck_write) that: 1. Pauses a write on the leader before add_entry 2. Drops the table (follower destroys its group immediately) 3. Resumes the write — verifies it fails promptly with InvalidRequest ("unconfigured table") instead of hanging for 15 seconds backport: no need, strong consistency is not released yet Fixes: SCYLLADB-2080 Closes scylladb/scylladb#30105 * github.com:scylladb/scylladb: strong consistency/groups_manager: handle timeout in update() wait-for-leader loop strong consistency: abort raft server before gate close when dropping a table test/cluster: rewrite test_queries_while_dropping_table for SCYLLADB-2080
2026-05-30 11:36:54 +00:00 · 2026-05-28 09:59:20 +02:00
parent 76dac2fd8e f2b1cbe998
commit 8dfd455001
3 changed files with 130 additions and 69 deletions
--- a/test/cluster/test_strong_consistency.py
+++ b/test/cluster/test_strong_consistency.py
@@ -5,6 +5,7 @@
 #

 from typing import Tuple
+import re

 from test.pylib.manager_client import ManagerClient
 from test.pylib.util import gather_safely, wait_for, Host
@@ -945,75 +946,110 @@ async def test_timed_out_queries(manager: ManagerClient):

@pytest.mark.skip_mode(mode="release", reason="error injections are not supported in release mode")
 async def test_queries_while_dropping_table(manager: ManagerClient):
-    """
-    A simple test that verifies that ongoing reads and writes are not interrupted
-    by a tablet migration. We simulate the process by dropping a table.
+    """Verify that in-flight reads and writes are promptly aborted when
+    a strongly consistent table is dropped.

-    We verify that:
-    - New reads and writes are rejected with a proper error.
-    - Ongoing reads and writes are not interrupted by a tablet migration
-      (which boils down to successful executions of Raft methods).
-    - Ongoing reads can still eventually observe that the table has
-      been dropped.
-    - Ongoing writes should also finish successfully if they've reached
-      the strongly consistent coordinator.
+    Setup: 2 nodes, RF=2, 1 tablet (raft quorum = 2).
+
+    We pause a read (before read_barrier) and a write (before add_entry)
+    on the leader, then drop the table. The follower destroys its raft group
+    immediately (no in-flight ops). On the leader, the raft server is aborted
+    as part of group deletion (SCYLLADB-2080 fix), causing the paused
+    operations to fail with raft::stopped_error which the coordinator converts
+    to "no such column family" / "unconfigured table".
+
+    We also verify that new reads/writes after the drop are rejected
+    immediately.
    """

-    s1 = await manager.server_add(config=DEFAULT_CONFIG, cmdline=DEFAULT_CMDLINE)
-    cql, _ = await manager.get_ready_cql([s1])
+    servers = await manager.servers_add(2, config=DEFAULT_CONFIG, cmdline=DEFAULT_CMDLINE, auto_rack_dc='my_dc')
+    cql, hosts = await manager.get_ready_cql(servers)
+    host_ids = [await manager.get_host_id(s.server_id) for s in servers]

-    log = await manager.server_open_log(s1.server_id)
-    mark = await log.mark()
-
-    async with new_test_keyspace(manager, "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} AND tablets = {'initial': 1} AND consistency = 'global'") as ks:
-        table_name = "my_table"
+    async with new_test_keyspace(manager,
+        "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 2}"
+        " AND tablets = {'initial': 1} AND consistency = 'global'"
+    ) as ks:
+        table_name = "tbl"
        table = f"{ks}.{table_name}"
-
        await cql.run_async(f"CREATE TABLE {table} (pk int PRIMARY KEY, v int)")
        await cql.run_async(f"INSERT INTO {table} (pk, v) VALUES (0, 13)")

-        await gather_safely(*[
-            manager.api.enable_injection(s1.ip_addr, "sc_coordinator_wait_before_query_read_barrier", one_shot=True),
-            manager.api.enable_injection(s1.ip_addr, "sc_coordinator_wait_before_add_entry", one_shot=True)
-        ])
+        # Identify the leader node for this tablet's raft group.
+        group_id = await get_table_raft_group_id(manager, ks, table_name)
+        leader_host_id = await wait_for_leader(manager, servers[0], group_id)

-        read_fut = cql.run_async(f"SELECT * FROM {table} WHERE pk = 0")
-        write_fut = cql.run_async(f"INSERT INTO {table} (pk, v) VALUES (7, 17)")
+        leader_idx = next(i for i, hid in enumerate(host_ids) if str(hid) == leader_host_id)
+        leader_server = servers[leader_idx]
+        leader_host = hosts[leader_idx]
+        follower_server = servers[1 - leader_idx]

-        await asyncio.gather(*[
-            asyncio.create_task(log.wait_for("sc_coordinator_wait_before_query_read_barrier", from_mark=mark)),
-            asyncio.create_task(log.wait_for("sc_coordinator_wait_before_add_entry", from_mark=mark))
-        ])
+        leader_log = await manager.server_open_log(leader_server.server_id)
+        follower_log = await manager.server_open_log(follower_server.server_id)

-        mark = await log.mark()
+        mark_leader = await leader_log.mark()

+        # Pause both a read and a write on the leader.
+        await asyncio.gather(
+            manager.api.enable_injection(leader_server.ip_addr,
+                "sc_coordinator_wait_before_query_read_barrier", one_shot=True),
+            manager.api.enable_injection(leader_server.ip_addr,
+                "sc_coordinator_wait_before_add_entry", one_shot=True))
+
+        read_fut = asyncio.ensure_future(
+            cql.run_async(f"SELECT * FROM {table} WHERE pk = 0", host=leader_host))
+        write_fut = asyncio.ensure_future(
+            cql.run_async(f"INSERT INTO {table} (pk, v) VALUES (7, 17)", host=leader_host))
+
+        # Wait for both to hit their injection points.
+        await asyncio.gather(
+            leader_log.wait_for("sc_coordinator_wait_before_query_read_barrier: waiting",
+                from_mark=mark_leader, timeout=30),
+            leader_log.wait_for("sc_coordinator_wait_before_add_entry: waiting",
+                from_mark=mark_leader, timeout=30))
+
+        mark_leader = await leader_log.mark()
+        mark_follower = await follower_log.mark()
+
+        # Drop the table.
        await cql.run_async(f"DROP TABLE {table}")

-        # Sanity check: The table was really dropped and we can no longer read or write to it.
+        # Sanity check: new queries are rejected immediately.
        with pytest.raises(InvalidRequest, match="unconfigured table"):
            await cql.run_async(f"SELECT * FROM {table} WHERE pk = 0")
        with pytest.raises(InvalidRequest, match="unconfigured table"):
            await cql.run_async(f"INSERT INTO {table} (pk, v) VALUES (13, 11)")

-        # Wait for the Raft group to be removed. This should happen almost immediately
-        # after dropping the table, but let's avoid any potential cause of flakiness.
-        await log.wait_for(r"schedule_raft_group_deletion\(\): group id \S+: starting", from_mark=mark)
+        # Confirm the follower destroyed its raft group.
+        await follower_log.wait_for("schedule_raft_group_deletion.*raft server.*is destroyed",
+            from_mark=mark_follower, timeout=30)

-        await asyncio.gather(*[
-            asyncio.create_task(manager.api.message_injection(s1.ip_addr, "sc_coordinator_wait_before_query_read_barrier")),
-            asyncio.create_task(manager.api.message_injection(s1.ip_addr, "sc_coordinator_wait_before_add_entry"))
-        ])
+        # Resume both operations.
+        await asyncio.gather(
+            manager.api.message_injection(leader_server.ip_addr,
+                "sc_coordinator_wait_before_query_read_barrier"),
+            manager.api.message_injection(leader_server.ip_addr,
+                "sc_coordinator_wait_before_add_entry"))

-        # Make sure that the read noticed that the table had been dropped.
-        with pytest.raises(Exception, match="Can't find a column family"):
+        # Both should fail with "no such column family" / "unconfigured table".
+        # The raft server is aborted as part of group deletion, causing
+        # stopped_error which the coordinator converts to no_such_column_family.
+        # Use a timeout to detect the old buggy behavior (write stuck forever).
+        try:
+            await asyncio.wait_for(asyncio.shield(write_fut), timeout=15)
+        except asyncio.TimeoutError:
+            cql.cluster.shutdown()
+            for s in servers:
+                await manager.server_stop(s.server_id, convict=True)
+            pytest.fail("SCYLLADB-2080: write is stuck waiting for quorum that can never "
+                        "be reached because the follower's raft group was already destroyed. "
+                        "The leader should abort the raft server to unblock the write.")
+        except InvalidRequest as e:
+            assert re.search(r"[Uu]nconfigured table|[Nn]o such column family", str(e)), \
+                f"Expected 'unconfigured table' or 'no such column family', got: {e}"
+
+        with pytest.raises((InvalidRequest, Exception), match="[Uu]nconfigured table|[Cc]an't find a column family|[Nn]o such column family"):
            await read_fut
-        # The groups manager waits for all ongoing queries to finish before
-        # it aborts the Raft server. Thanks to this, the write will succeed.
-        # No matter if we get a `no_such_column_family` exception when later
-        # applying the mutation in the state machine or not, there will be
-        # no exception thrown, so the write will still look like a success.
-        # And that's the desired behavior.
-        await write_fut


@pytest.mark.skip_mode(mode="release", reason="error injections are not supported in release mode")