From 2d57dc32a3f985ef63fa890728a2a0472e5b8f3f Mon Sep 17 00:00:00 2001 From: Emil Maskovsky Date: Mon, 15 Sep 2025 11:39:10 +0200 Subject: [PATCH 1/2] topology_coordinator: consistently rethrow `raft::request_aborted` for direct/global commands Ensure all direct and global topology commands rethrow the `raft::request_aborted` exception when aborted, typically due to leadership changes. This makes abortion explicit to callers, enabling proper handling such as retries or workflow termination. This change completes the work started in PR scylladb/scylladb#23962, covering all remaining cases where the exception was not rethrown. Fixes: scylladb/scylladb#23589 (cherry picked from commit 943af1ef1ccb1c860e11e97cefc8250f6f1d3a05) --- service/topology_coordinator.cc | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/service/topology_coordinator.cc b/service/topology_coordinator.cc index 20717784e8..81694f6303 100644 --- a/service/topology_coordinator.cc +++ b/service/topology_coordinator.cc @@ -2552,6 +2552,8 @@ class topology_coordinator : public endpoint_lifecycle_subscriber { node.guard = co_await exec_global_command(std::move(node.guard),raft_topology_cmd::command::barrier_and_drain, get_excluded_nodes(node), drop_guard_and_retake::yes); } catch (term_changed_error&) { throw; + } catch (raft::request_aborted&) { + throw; } catch(...) { rtlogger.warn("failed to run barrier_and_drain during rollback of {} after {} failure: {}", node.id, state, std::current_exception()); @@ -2629,6 +2631,8 @@ class topology_coordinator : public endpoint_lifecycle_subscriber { } } catch (term_changed_error&) { throw; + } catch (raft::request_aborted&) { + throw; } catch(...) { wait_for_ip_error = std::current_exception(); rtlogger.warn("raft_topology_cmd::command::wait_for_ip failed, error {}", @@ -2751,6 +2755,8 @@ class topology_coordinator : public endpoint_lifecycle_subscriber { rtbuilder.done(); } catch (term_changed_error&) { throw; + } catch (raft::request_aborted&) { + throw; } catch (...) { rtlogger.error("send_raft_topology_cmd(stream_ranges) failed with exception" " (node state is rebuilding): {}", std::current_exception()); From 66765c6bd3b21f0b16be7973002db628472ad258 Mon Sep 17 00:00:00 2001 From: Emil Maskovsky Date: Thu, 27 Nov 2025 16:09:18 +0100 Subject: [PATCH 2/2] topology_coordinator: handle seastar::abort_requested_exception alongside raft::request_aborted In several exception handlers, only raft::request_aborted was being caught and rethrown, while seastar::abort_requested_exception was falling through to the generic catch(...) block. This caused the exception to be incorrectly treated as a failure that triggers rollback, instead of being recognized as an abort signal. For example, during tablet draining, the error log showed: "tablets draining failed with seastar::abort_requested_exception (abort requested). Aborting the topology operation" This change adds seastar::abort_requested_exception handling alongside raft::request_aborted in all places where it was missing. When rethrown, these exceptions propagate up to the main run() loop where handle_topology_coordinator_error() recognizes them as normal abort signals and allows the coordinator to exit gracefully without triggering unnecessary rollback operations. Fixes: scylladb/scylladb#27255 (cherry picked from commit 37e3dacf332b016012bfa7e150e35edbe42d7b68) --- service/topology_coordinator.cc | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/service/topology_coordinator.cc b/service/topology_coordinator.cc index 81694f6303..4bf3cd6303 100644 --- a/service/topology_coordinator.cc +++ b/service/topology_coordinator.cc @@ -2004,6 +2004,8 @@ class topology_coordinator : public endpoint_lifecycle_subscriber { throw; } catch (raft::request_aborted&) { throw; + } catch (seastar::abort_requested_exception&) { + throw; } catch (...) { rtlogger.error("transition_state::join_group0, " "global_token_metadata_barrier failed, error {}", @@ -2140,6 +2142,8 @@ class topology_coordinator : public endpoint_lifecycle_subscriber { throw; } catch (raft::request_aborted&) { throw; + } catch (seastar::abort_requested_exception&) { + throw; } catch (...) { rtlogger.error("transition_state::commit_cdc_generation, " "raft_topology_cmd::command::barrier failed, error {}", std::current_exception()); @@ -2220,6 +2224,8 @@ class topology_coordinator : public endpoint_lifecycle_subscriber { throw; } catch (raft::request_aborted&) { throw; + } catch (seastar::abort_requested_exception&) { + throw; } catch (...) { rtlogger.error("tablets draining failed with {}. Aborting the topology operation", std::current_exception()); _rollback = fmt::format("Failed to drain tablets: {}", std::current_exception()); @@ -2237,6 +2243,8 @@ class topology_coordinator : public endpoint_lifecycle_subscriber { throw; } catch (raft::request_aborted&) { throw; + } catch (seastar::abort_requested_exception&) { + throw; } catch (...) { rtlogger.error("transition_state::write_both_read_old, " "global_token_metadata_barrier failed, error {}", @@ -2303,6 +2311,8 @@ class topology_coordinator : public endpoint_lifecycle_subscriber { throw; } catch (raft::request_aborted&) { throw; + } catch (seastar::abort_requested_exception&) { + throw; } catch (...) { rtlogger.error("send_raft_topology_cmd(stream_ranges) failed with exception" " (node state is {}): {}", state, std::current_exception()); @@ -2335,6 +2345,8 @@ class topology_coordinator : public endpoint_lifecycle_subscriber { throw; } catch (raft::request_aborted&) { throw; + } catch (seastar::abort_requested_exception&) { + throw; } catch (...) { rtlogger.error("transition_state::write_both_read_new, " "global_token_metadata_barrier failed, error {}", @@ -2478,6 +2490,8 @@ class topology_coordinator : public endpoint_lifecycle_subscriber { throw; } catch (raft::request_aborted&) { throw; + } catch (seastar::abort_requested_exception&) { + throw; } catch (...) { rtlogger.error("transition_state::left_token_ring, " "raft_topology_cmd::command::barrier failed, error {}", @@ -2554,6 +2568,8 @@ class topology_coordinator : public endpoint_lifecycle_subscriber { throw; } catch (raft::request_aborted&) { throw; + } catch (seastar::abort_requested_exception&) { + throw; } catch(...) { rtlogger.warn("failed to run barrier_and_drain during rollback of {} after {} failure: {}", node.id, state, std::current_exception()); @@ -2633,6 +2649,8 @@ class topology_coordinator : public endpoint_lifecycle_subscriber { throw; } catch (raft::request_aborted&) { throw; + } catch (seastar::abort_requested_exception&) { + throw; } catch(...) { wait_for_ip_error = std::current_exception(); rtlogger.warn("raft_topology_cmd::command::wait_for_ip failed, error {}", @@ -2757,6 +2775,8 @@ class topology_coordinator : public endpoint_lifecycle_subscriber { throw; } catch (raft::request_aborted&) { throw; + } catch (seastar::abort_requested_exception&) { + throw; } catch (...) { rtlogger.error("send_raft_topology_cmd(stream_ranges) failed with exception" " (node state is rebuilding): {}", std::current_exception()); @@ -3344,6 +3364,10 @@ future<> topology_coordinator::fence_previous_coordinator() { // Abort was requested. Break the loop rtlogger.debug("request to fence previous coordinator was aborted"); break; + } catch (seastar::abort_requested_exception&) { + // Abort was requested. Break the loop + rtlogger.debug("request to fence previous coordinator was aborted"); + break; } catch (...) { rtlogger.error("failed to fence previous coordinator {}", std::current_exception()); }