mirror of
https://github.com/scylladb/scylladb.git
synced 2026-06-08 16:03:20 +00:00
strong consistency: abort raft server before gate close when dropping a table
When a strongly consistent table is dropped, schedule_raft_group_deletion() used to call g->close() first, which waits for all in-flight operations to release their gate holders. But other nodes may have already destroyed their raft servers for this group, so an in-flight write on the leader cannot reach quorum and hangs until the client timeout expires, unnecessarily delaying group deletion. Fix: initiate gate close (prevents new operations from entering), then abort the raft server (causes in-flight add_entry/read_barrier to throw raft::stopped_error, releasing their gate holders), then await the gate future (resolves immediately since holders are now released). Handle raft::stopped_error in the coordinator's top-level catch blocks (both write and read paths): if the table no longer exists, return no_such_column_family (which the CQL layer converts to InvalidRequest 'unconfigured table'). Otherwise fall through to the default timeout handling. Also replace gate->hold() with try_hold() + on_internal_error in acquire_server, and handle the timeout exception in the wait-for-leader loop in update() gracefully (log + break instead of propagating). Fixes: SCYLLADB-2080
This commit is contained in:
@@ -339,14 +339,7 @@ future<value_or_redirect<>> coordinator::mutate(schema_ptr schema,
|
||||
co_return std::monostate{};
|
||||
} catch (...) {
|
||||
auto ex = std::current_exception();
|
||||
if (try_catch<raft::stopped_error>(ex)) {
|
||||
// Holding raft_server.holder guarantees that the raft::server is not
|
||||
// aborted until the holder is released.
|
||||
|
||||
on_internal_error(logger,
|
||||
format("mutate(): add_entry, unexpected exception {}, table {}.{}, tablet {}, term {}",
|
||||
ex, schema->ks_name(), schema->cf_name(), op.tablet_id, term));
|
||||
} else if (try_catch<raft::not_a_leader>(ex) || try_catch<raft::dropped_entry>(ex)) {
|
||||
if (try_catch<raft::not_a_leader>(ex) || try_catch<raft::dropped_entry>(ex)) {
|
||||
logger.debug("mutate(): add_entry, got retriable error {}, table {}.{}, tablet {}, term {}",
|
||||
ex, schema->ks_name(), schema->cf_name(), op.tablet_id, term);
|
||||
|
||||
@@ -379,10 +372,16 @@ future<value_or_redirect<>> coordinator::mutate(schema_ptr schema,
|
||||
// * seastar::abort_requested_exception: Can be thrown by create_operation_ctx.
|
||||
// * timed_out_error: Can be thrown by the abort_on_expiry.
|
||||
// * condition_variable_timed_out: Can be thrown by begin_mutate.
|
||||
// * raft::stopped_error: The raft server was aborted (e.g. table being dropped).
|
||||
//
|
||||
// We handle them collectively here.
|
||||
if (try_catch<raft::request_aborted>(ex) || try_catch<seastar::abort_requested_exception>(ex)
|
||||
|| try_catch<seastar::timed_out_error>(ex) || try_catch<seastar::condition_variable_timed_out>(ex)) {
|
||||
|| try_catch<seastar::timed_out_error>(ex) || try_catch<seastar::condition_variable_timed_out>(ex)
|
||||
|| try_catch<raft::stopped_error>(ex)) {
|
||||
if (!_db.column_family_exists(schema->id())) {
|
||||
co_return coroutine::return_exception(
|
||||
replica::no_such_column_family(schema->ks_name(), schema->cf_name()));
|
||||
}
|
||||
logger.trace("mutate(): request timed out with error {}, table {}.{}, token {}",
|
||||
ex, schema->ks_name(), schema->cf_name(), token);
|
||||
++_stats.write_errors_timeout;
|
||||
@@ -468,10 +467,16 @@ auto coordinator::query(schema_ptr schema,
|
||||
// * seastar::abort_requested_exception: Can be thrown by create_operation_ctx.
|
||||
// * timed_out_error: Can be thrown by the abort_on_expiry.
|
||||
// * seastar::condition_variable_timed_out: Can be thrown by begin_read's wait_for_leader.
|
||||
// * raft::stopped_error: The raft server was aborted (e.g. table being dropped).
|
||||
//
|
||||
// We handle them collectively here.
|
||||
if (try_catch<raft::request_aborted>(ex) || try_catch<seastar::abort_requested_exception>(ex)
|
||||
|| try_catch<timed_out_error>(ex) || try_catch<seastar::condition_variable_timed_out>(ex)) {
|
||||
|| try_catch<timed_out_error>(ex) || try_catch<seastar::condition_variable_timed_out>(ex)
|
||||
|| try_catch<raft::stopped_error>(ex)) {
|
||||
if (!_db.column_family_exists(schema->id())) {
|
||||
co_return coroutine::return_exception(
|
||||
replica::no_such_column_family(schema->ks_name(), schema->cf_name()));
|
||||
}
|
||||
logger.trace("query(): request timed out with error {}, table {}.{}, read cmd {}",
|
||||
ex, schema->ks_name(), schema->cf_name(), cmd);
|
||||
++_stats.read_errors_timeout;
|
||||
|
||||
@@ -184,12 +184,21 @@ void groups_manager::schedule_raft_group_deletion(raft::group_id id, raft_group_
|
||||
co_await state.server_control_op.get_future();
|
||||
logger.debug("schedule_raft_group_deletion(): group id {}: starting", id);
|
||||
|
||||
co_await g->close();
|
||||
logger.debug("schedule_raft_group_deletion(): group id {}: gate closed", id);
|
||||
// Initiate gate close, then abort the raft server, then wait for the
|
||||
// gate. Gate close alone would block until all holders are released,
|
||||
// but in-flight writes may be stuck in add_entry waiting for quorum
|
||||
// that will never come (other nodes already destroyed their servers).
|
||||
// Aborting the server causes those stuck operations to throw
|
||||
// raft::stopped_error, releasing their holders and unblocking the gate.
|
||||
auto gate_fut = g->close();
|
||||
logger.debug("schedule_raft_group_deletion(): group id {}: gate close initiated", id);
|
||||
|
||||
co_await _raft_gr.abort_server(id);
|
||||
logger.debug("schedule_raft_group_deletion(): group id {}: server aborted", id);
|
||||
|
||||
co_await std::move(gate_fut);
|
||||
logger.debug("schedule_raft_group_deletion(): group id {}: gate closed", id);
|
||||
|
||||
co_await std::move(state.leader_info_updater);
|
||||
|
||||
_raft_gr.destroy_server(id);
|
||||
@@ -429,21 +438,27 @@ future<raft_server> groups_manager::acquire_server(table_id table_id, raft::grou
|
||||
// lw_shared_ptr<table>, so that the table is dropped but the table object
|
||||
// is still alive.
|
||||
//
|
||||
// Check that the table still exists in the database to turn the
|
||||
// fatal on_internal_error below into a clean no_such_column_family
|
||||
// exception.
|
||||
//
|
||||
// When the table does exist, we proceed to acquire state.gate->hold().
|
||||
// This prevents schedule_raft_group_deletion (which co_awaits gate::close)
|
||||
// from erasing the group until the DML operation completes.
|
||||
_db.find_column_family(table_id);
|
||||
// Check that the table still exists. The table is removed from the
|
||||
// database (via schema_applier::commit_tables_and_views) BEFORE
|
||||
// groups_manager::update() is called (which triggers gate closure via
|
||||
// schedule_raft_group_deletion). Since there's no scheduling point
|
||||
// between the column_family_exists check and try_hold below, the gate
|
||||
// cannot be closed if the table exists.
|
||||
if (!_db.column_family_exists(table_id)) {
|
||||
return make_exception_future<raft_server>(
|
||||
replica::no_such_column_family(table_id));
|
||||
}
|
||||
|
||||
const auto it = _raft_groups.find(group_id);
|
||||
if (it == _raft_groups.end()) {
|
||||
on_internal_error(logger, format("raft group {} not found", group_id));
|
||||
}
|
||||
auto& state = it->second;
|
||||
return state.server_control_op.get_future(as).then([&state, h = state.gate->hold()] mutable {
|
||||
auto h = state.gate->try_hold();
|
||||
if (!h) {
|
||||
on_internal_error(logger, format("acquire_server: gate closed for group {} while table {} exists", group_id, table_id));
|
||||
}
|
||||
return state.server_control_op.get_future(as).then([&state, h = std::move(*h)] mutable {
|
||||
return raft_server(state, std::move(h));
|
||||
});
|
||||
}
|
||||
|
||||
@@ -944,7 +944,6 @@ async def test_timed_out_queries(manager: ManagerClient):
|
||||
await cql.run_async(f"INSERT INTO {table} (pk, v) VALUES (11, 13) USING TIMEOUT 100ms")
|
||||
|
||||
|
||||
@pytest.mark.xfail(reason="SCYLLADB-2080: write gets stuck waiting for quorum; fix not yet applied")
|
||||
@pytest.mark.skip_mode(mode="release", reason="error injections are not supported in release mode")
|
||||
async def test_queries_while_dropping_table(manager: ManagerClient):
|
||||
"""Verify that in-flight reads and writes are promptly aborted when
|
||||
|
||||
Reference in New Issue
Block a user