strong consistency: abort raft server before gate close when dropping a table

When a strongly consistent table is dropped, schedule_raft_group_deletion() used to call g->close() first, which waits for all in-flight operations to release their gate holders. But other nodes may have already destroyed their raft servers for this group, so an in-flight write on the leader cannot reach quorum and hangs until the client timeout expires, unnecessarily delaying group deletion. Fix: initiate gate close (prevents new operations from entering), then abort the raft server (causes in-flight add_entry/read_barrier to throw raft::stopped_error, releasing their gate holders), then await the gate future (resolves immediately since holders are now released). Handle raft::stopped_error in the coordinator's top-level catch blocks (both write and read paths): if the table no longer exists, return no_such_column_family (which the CQL layer converts to InvalidRequest 'unconfigured table'). Otherwise fall through to the default timeout handling. Also replace gate->hold() with try_hold() + on_internal_error in acquire_server, and handle the timeout exception in the wait-for-leader loop in update() gracefully (log + break instead of propagating). Fixes: SCYLLADB-2080
2026-06-08 16:03:20 +00:00 · 2026-05-27 11:57:41 +02:00
parent 89307064b5
commit d922c43358
3 changed files with 41 additions and 22 deletions
--- a/service/strong_consistency/coordinator.cc
+++ b/service/strong_consistency/coordinator.cc
@@ -339,14 +339,7 @@ future<value_or_redirect<>> coordinator::mutate(schema_ptr schema,
                co_return std::monostate{};
            } catch (...) {
                auto ex = std::current_exception();
-                if (try_catch<raft::stopped_error>(ex)) {
-                    // Holding raft_server.holder guarantees that the raft::server is not
-                    // aborted until the holder is released.
-
-                    on_internal_error(logger,
-                        format("mutate(): add_entry, unexpected exception {}, table {}.{}, tablet {}, term {}",
-                            ex, schema->ks_name(), schema->cf_name(), op.tablet_id, term));
-                } else if (try_catch<raft::not_a_leader>(ex) || try_catch<raft::dropped_entry>(ex)) {
+                if (try_catch<raft::not_a_leader>(ex) || try_catch<raft::dropped_entry>(ex)) {
                    logger.debug("mutate(): add_entry, got retriable error {}, table {}.{}, tablet {}, term {}",
                        ex, schema->ks_name(), schema->cf_name(), op.tablet_id, term);

@@ -379,10 +372,16 @@ future<value_or_redirect<>> coordinator::mutate(schema_ptr schema,
        // * seastar::abort_requested_exception: Can be thrown by create_operation_ctx.
        // * timed_out_error: Can be thrown by the abort_on_expiry.
        // * condition_variable_timed_out: Can be thrown by begin_mutate.
+        // * raft::stopped_error: The raft server was aborted (e.g. table being dropped).
        //
        // We handle them collectively here.
        if (try_catch<raft::request_aborted>(ex) || try_catch<seastar::abort_requested_exception>(ex)
-                || try_catch<seastar::timed_out_error>(ex) || try_catch<seastar::condition_variable_timed_out>(ex)) {
+                || try_catch<seastar::timed_out_error>(ex) || try_catch<seastar::condition_variable_timed_out>(ex)
+                || try_catch<raft::stopped_error>(ex)) {
+            if (!_db.column_family_exists(schema->id())) {
+                co_return coroutine::return_exception(
+                    replica::no_such_column_family(schema->ks_name(), schema->cf_name()));
+            }
            logger.trace("mutate(): request timed out with error {}, table {}.{}, token {}",
                ex, schema->ks_name(), schema->cf_name(), token);
            ++_stats.write_errors_timeout;
@@ -468,10 +467,16 @@ auto coordinator::query(schema_ptr schema,
        // * seastar::abort_requested_exception: Can be thrown by create_operation_ctx.
        // * timed_out_error: Can be thrown by the abort_on_expiry.
        // * seastar::condition_variable_timed_out: Can be thrown by begin_read's wait_for_leader.
+        // * raft::stopped_error: The raft server was aborted (e.g. table being dropped).
        //
        // We handle them collectively here.
        if (try_catch<raft::request_aborted>(ex) || try_catch<seastar::abort_requested_exception>(ex)
-                || try_catch<timed_out_error>(ex) || try_catch<seastar::condition_variable_timed_out>(ex)) {
+                || try_catch<timed_out_error>(ex) || try_catch<seastar::condition_variable_timed_out>(ex)
+                || try_catch<raft::stopped_error>(ex)) {
+            if (!_db.column_family_exists(schema->id())) {
+                co_return coroutine::return_exception(
+                    replica::no_such_column_family(schema->ks_name(), schema->cf_name()));
+            }
            logger.trace("query(): request timed out with error {}, table {}.{}, read cmd {}",
                ex, schema->ks_name(), schema->cf_name(), cmd);
            ++_stats.read_errors_timeout;
--- a/service/strong_consistency/groups_manager.cc
+++ b/service/strong_consistency/groups_manager.cc
@@ -184,12 +184,21 @@ void groups_manager::schedule_raft_group_deletion(raft::group_id id, raft_group_
        co_await state.server_control_op.get_future();
        logger.debug("schedule_raft_group_deletion(): group id {}: starting", id);

-        co_await g->close();
-        logger.debug("schedule_raft_group_deletion(): group id {}: gate closed", id);
+        // Initiate gate close, then abort the raft server, then wait for the
+        // gate. Gate close alone would block until all holders are released,
+        // but in-flight writes may be stuck in add_entry waiting for quorum
+        // that will never come (other nodes already destroyed their servers).
+        // Aborting the server causes those stuck operations to throw
+        // raft::stopped_error, releasing their holders and unblocking the gate.
+        auto gate_fut = g->close();
+        logger.debug("schedule_raft_group_deletion(): group id {}: gate close initiated", id);

        co_await _raft_gr.abort_server(id);
        logger.debug("schedule_raft_group_deletion(): group id {}: server aborted", id);

+        co_await std::move(gate_fut);
+        logger.debug("schedule_raft_group_deletion(): group id {}: gate closed", id);
+
        co_await std::move(state.leader_info_updater);

        _raft_gr.destroy_server(id);
@@ -429,21 +438,27 @@ future<raft_server> groups_manager::acquire_server(table_id table_id, raft::grou
    // lw_shared_ptr<table>, so that the table is dropped but the table object
    // is still alive.
    //
-    // Check that the table still exists in the database to turn the
-    // fatal on_internal_error below into a clean no_such_column_family
-    // exception.
-    //
-    // When the table does exist, we proceed to acquire state.gate->hold().
-    // This prevents schedule_raft_group_deletion (which co_awaits gate::close)
-    // from erasing the group until the DML operation completes.
-    _db.find_column_family(table_id);
+    // Check that the table still exists. The table is removed from the
+    // database (via schema_applier::commit_tables_and_views) BEFORE
+    // groups_manager::update() is called (which triggers gate closure via
+    // schedule_raft_group_deletion). Since there's no scheduling point
+    // between the column_family_exists check and try_hold below, the gate
+    // cannot be closed if the table exists.
+    if (!_db.column_family_exists(table_id)) {
+        return make_exception_future<raft_server>(
+            replica::no_such_column_family(table_id));
+    }

    const auto it = _raft_groups.find(group_id);
    if (it == _raft_groups.end()) {
        on_internal_error(logger, format("raft group {} not found", group_id));
    }
    auto& state = it->second;
-    return state.server_control_op.get_future(as).then([&state, h = state.gate->hold()] mutable {
+    auto h = state.gate->try_hold();
+    if (!h) {
+        on_internal_error(logger, format("acquire_server: gate closed for group {} while table {} exists", group_id, table_id));
+    }
+    return state.server_control_op.get_future(as).then([&state, h = std::move(*h)] mutable {
        return raft_server(state, std::move(h));
    });
 }
--- a/test/cluster/test_strong_consistency.py
+++ b/test/cluster/test_strong_consistency.py
@@ -944,7 +944,6 @@ async def test_timed_out_queries(manager: ManagerClient):
                await cql.run_async(f"INSERT INTO {table} (pk, v) VALUES (11, 13) USING TIMEOUT 100ms")


-@pytest.mark.xfail(reason="SCYLLADB-2080: write gets stuck waiting for quorum; fix not yet applied")
@pytest.mark.skip_mode(mode="release", reason="error injections are not supported in release mode")
 async def test_queries_while_dropping_table(manager: ManagerClient):
    """Verify that in-flight reads and writes are promptly aborted when