From bfce02ce7e16f0d304f72620601ac2bc29e742f6 Mon Sep 17 00:00:00 2001 From: Emil Maskovsky Date: Thu, 27 Nov 2025 16:09:18 +0100 Subject: [PATCH] topology_coordinator: handle seastar::abort_requested_exception alongside raft::request_aborted In several exception handlers, only raft::request_aborted was being caught and rethrown, while seastar::abort_requested_exception was falling through to the generic catch(...) block. This caused the exception to be incorrectly treated as a failure that triggers rollback, instead of being recognized as an abort signal. For example, during tablet draining, the error log showed: "tablets draining failed with seastar::abort_requested_exception (abort requested). Aborting the topology operation" This change adds seastar::abort_requested_exception handling alongside raft::request_aborted in all places where it was missing. When rethrown, these exceptions propagate up to the main run() loop where handle_topology_coordinator_error() recognizes them as normal abort signals and allows the coordinator to exit gracefully without triggering unnecessary rollback operations. Fixes: scylladb/scylladb#27255 (cherry picked from commit 37e3dacf332b016012bfa7e150e35edbe42d7b68) --- service/topology_coordinator.cc | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/service/topology_coordinator.cc b/service/topology_coordinator.cc index b9a2a94e2a..0d1af5f711 100644 --- a/service/topology_coordinator.cc +++ b/service/topology_coordinator.cc @@ -2066,6 +2066,8 @@ class topology_coordinator : public endpoint_lifecycle_subscriber { throw; } catch (raft::request_aborted&) { throw; + } catch (seastar::abort_requested_exception&) { + throw; } catch (...) { rtlogger.error("transition_state::join_group0, " "global_token_metadata_barrier failed, error {}", @@ -2211,6 +2213,8 @@ class topology_coordinator : public endpoint_lifecycle_subscriber { throw; } catch (raft::request_aborted&) { throw; + } catch (seastar::abort_requested_exception&) { + throw; } catch (...) { rtlogger.error("transition_state::commit_cdc_generation, " "raft_topology_cmd::command::barrier failed, error {}", std::current_exception()); @@ -2306,6 +2310,8 @@ class topology_coordinator : public endpoint_lifecycle_subscriber { throw; } catch (raft::request_aborted&) { throw; + } catch (seastar::abort_requested_exception&) { + throw; } catch (...) { rtlogger.error("tablets draining failed with {}. Aborting the topology operation", std::current_exception()); _rollback = fmt::format("Failed to drain tablets: {}", std::current_exception()); @@ -2323,6 +2329,8 @@ class topology_coordinator : public endpoint_lifecycle_subscriber { throw; } catch (raft::request_aborted&) { throw; + } catch (seastar::abort_requested_exception&) { + throw; } catch (...) { rtlogger.error("transition_state::write_both_read_old, " "global_token_metadata_barrier failed, error {}", @@ -2371,6 +2379,8 @@ class topology_coordinator : public endpoint_lifecycle_subscriber { throw; } catch (raft::request_aborted&) { throw; + } catch (seastar::abort_requested_exception&) { + throw; } catch (...) { rtlogger.error("send_raft_topology_cmd(stream_ranges) failed with exception" " (node state is {}): {}", state, std::current_exception()); @@ -2403,6 +2413,8 @@ class topology_coordinator : public endpoint_lifecycle_subscriber { throw; } catch (raft::request_aborted&) { throw; + } catch (seastar::abort_requested_exception&) { + throw; } catch (...) { rtlogger.error("transition_state::write_both_read_new, " "global_token_metadata_barrier failed, error {}", @@ -2542,6 +2554,8 @@ class topology_coordinator : public endpoint_lifecycle_subscriber { throw; } catch (raft::request_aborted&) { throw; + } catch (seastar::abort_requested_exception&) { + throw; } catch (...) { rtlogger.error("transition_state::left_token_ring, " "raft_topology_cmd::command::barrier failed, error {}", @@ -2624,6 +2638,8 @@ class topology_coordinator : public endpoint_lifecycle_subscriber { throw; } catch (raft::request_aborted&) { throw; + } catch (seastar::abort_requested_exception&) { + throw; } catch(...) { rtlogger.warn("failed to run barrier_and_drain during rollback of {} after {} failure: {}", node.id, state, std::current_exception()); @@ -2705,6 +2721,8 @@ class topology_coordinator : public endpoint_lifecycle_subscriber { throw; } catch (raft::request_aborted&) { throw; + } catch (seastar::abort_requested_exception&) { + throw; } catch(...) { wait_for_ip_error = std::current_exception(); rtlogger.warn("raft_topology_cmd::command::wait_for_ip failed, error {}", @@ -2827,7 +2845,9 @@ class topology_coordinator : public endpoint_lifecycle_subscriber { rtbuilder.done(); } catch (term_changed_error&) { throw; - } catch (raft::request_aborted& e) { + } catch (raft::request_aborted&) { + throw; + } catch (seastar::abort_requested_exception&) { throw; } catch (...) { rtlogger.error("send_raft_topology_cmd(stream_ranges) failed with exception" @@ -3430,6 +3450,10 @@ future<> topology_coordinator::fence_previous_coordinator() { // Abort was requested. Break the loop rtlogger.debug("request to fence previous coordinator was aborted"); break; + } catch (seastar::abort_requested_exception&) { + // Abort was requested. Break the loop + rtlogger.debug("request to fence previous coordinator was aborted"); + break; } catch (...) { rtlogger.error("failed to fence previous coordinator {}", std::current_exception()); }