From 954426407ebdce71b2fdcfb89c1c230b4478e07c Mon Sep 17 00:00:00 2001 From: Petr Gusev Date: Fri, 22 May 2026 13:28:49 +0200 Subject: [PATCH] storage_proxy: only cancel write handlers with pending remote targets during drain MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous fix (cancel_all_write_response_handlers in do_drain) was too aggressive — it killed all handlers including ones used by group0 for raft commits. Since group0 is still running at that point (before wait_for_group0_stop), this caused group0 operations to fail (SCYLLADB-2168). The actual problem is only with handlers that have pending remote targets: after stop_transport() their MUTATION_DONE responses can never arrive via messaging. Handlers whose only pending targets are local can still complete via apply_locally and should be left alone. Add cancel_nonlocal_write_response_handlers() which checks each handler's remaining targets against the local host ID. Only handlers with at least one remote pending target are cancelled. Use it in do_drain instead of cancel_all_write_response_handlers. The latter remains unchanged for drain_on_shutdown (final proxy shutdown where all handlers must be killed). Fixes: SCYLLADB-2168 Closes scylladb/scylladb#30020 --- service/storage_proxy.cc | 49 ++++++++++++++++++++++++++------------ service/storage_proxy.hh | 5 ++-- service/storage_service.cc | 9 ++++--- 3 files changed, 43 insertions(+), 20 deletions(-) diff --git a/service/storage_proxy.cc b/service/storage_proxy.cc index bfb2e02be9..185f08fba0 100644 --- a/service/storage_proxy.cc +++ b/service/storage_proxy.cc @@ -1753,7 +1753,7 @@ public: register_cancellable(); } - attach_to(*_proxy->_write_handlers_gate); + attach_to(_proxy->_write_handlers_gate); } void attach_to(gate& g) { _holders.push_back(g.hold()); @@ -7477,7 +7477,7 @@ void storage_proxy::on_down(const gms::inet_address& endpoint, locator::host_id }; future<> storage_proxy::drain_on_shutdown() { - co_await cancel_all_write_response_handlers(false); + co_await cancel_all_write_response_handlers(); co_await _hints_resource_manager.stop(); } @@ -7501,23 +7501,42 @@ future> storage_proxy::describ return locator::describe_ring(_db.local(), _remote->gossiper(), keyspace, include_only_local_dc); } -future<> storage_proxy::cancel_all_write_response_handlers(bool allow_new) { - // Cancel all existing handlers. They are attached to the current gate. +future<> storage_proxy::cancel_all_write_response_handlers() { + auto f = _write_handlers_gate.close(); while (!_response_handlers.empty()) { _response_handlers.begin()->second->timeout_cb(); co_await coroutine::maybe_yield(); } - // Close the gate and wait for destruction of handlers still alive via - // external shared_ptr refs (e.g. background futures in send_to_live_endpoints). - auto f = _write_handlers_gate->close(); - std::unique_ptr old_gate; - if (allow_new) { - // Swap in a fresh open gate so new handlers can be created. - // Keep the old gate alive (via old_gate) until the close future resolves. - old_gate = std::exchange(_write_handlers_gate, std::make_unique()); - } - // If !allow_new, the closed gate stays in _write_handlers_gate — any - // subsequent handler creation will get gate_closed_exception. co_await std::move(f); } + +future<> storage_proxy::cancel_nonlocal_write_response_handlers() { + // Cancel handlers that have pending remote targets. After + // stop_transport(), MUTATION_DONE responses from remote nodes can + // no longer arrive, so these handlers are stuck. Handlers whose + // only pending targets are local can still complete via + // apply_locally and are left alone. + gate g; + // Collect IDs first to avoid iterator invalidation during cancellation. + // No yield here — the loop body is trivial (integer comparison + push_back). + std::vector to_cancel; + to_cancel.reserve(_response_handlers.size()); + for (auto& [id, handler] : _response_handlers) { + auto dominated_by_local = std::ranges::all_of(handler->get_targets(), [&] (locator::host_id target) { + return is_me(*handler->_effective_replication_map_ptr, target); + }); + if (!dominated_by_local) { + to_cancel.push_back(id); + } + } + for (auto id : to_cancel) { + auto it = _response_handlers.find(id); + if (it != _response_handlers.end()) { + it->second->attach_to(g); + it->second->timeout_cb(); + } + co_await coroutine::maybe_yield(); + } + co_await g.close(); +} } diff --git a/service/storage_proxy.hh b/service/storage_proxy.hh index 5b14ba03a1..9ea893d608 100644 --- a/service/storage_proxy.hh +++ b/service/storage_proxy.hh @@ -327,7 +327,7 @@ private: class cancellable_write_handlers_list; std::unique_ptr _cancellable_write_handlers_list; - std::unique_ptr _write_handlers_gate = std::make_unique(); + gate _write_handlers_gate; /* This is a pointer to the shard-local part of the sharded cdc_service: * storage_proxy needs access to cdc_service to augment mutations. @@ -577,7 +577,8 @@ public: bool is_me(gms::inet_address addr) const noexcept; bool is_me(const locator::effective_replication_map& erm, locator::host_id id) const noexcept; - future<> cancel_all_write_response_handlers(bool allow_new); + future<> cancel_all_write_response_handlers(); + future<> cancel_nonlocal_write_response_handlers(); private: bool only_me(const locator::effective_replication_map& erm, const host_id_vector_replica_set& replicas) const noexcept; diff --git a/service/storage_service.cc b/service/storage_service.cc index 1d05c6bf4f..ce665e1726 100644 --- a/service/storage_service.cc +++ b/service/storage_service.cc @@ -2970,14 +2970,17 @@ future<> storage_service::do_drain() { // Need to stop transport before group0, otherwise RPCs may fail with raft_group_not_found. co_await stop_transport(); - // Cancel write handlers on all shards so they release their ERMs - // (and the associated token_metadata versions). Without this, + // Cancel write handlers that have pending remote targets so they release + // their ERMs (and the associated token_metadata versions). Without this, // wait_for_group0_stop below may deadlock: the topology coordinator // fiber calls barrier_and_drain which blocks in stale_versions_in_use() // waiting for stale token_metadata versions held by write handlers // whose MUTATION_DONE responses can no longer arrive (transport is stopped). + // Handlers with only local pending targets are left alone — they can + // still complete via apply_locally, and group0 needs them for raft commits + // until wait_for_group0_stop below. co_await _qp.proxy().container().invoke_on_all([] (storage_proxy& sp) { - return sp.cancel_all_write_response_handlers(true); + return sp.cancel_nonlocal_write_response_handlers(); }); // Drain view builder before group0, because the view builder uses group0 to coordinate view building.