Merge '[Backport 2025.1] topology_coordinator: handle seastar::abort_requested_exception alongside raft::request_aborted' from Scylladb[bot]

In several exception handlers, only `raft::request_aborted` was caught and rethrown, while `seastar::abort_requested_exception` fell through to the generic `catch (...)` block. As a result, the exception was incorrectly treated as a failure that triggers rollback instead of being recognized as an abort signal.

For example, during tablet draining, the error log showed: "tablets draining failed with seastar::abort_requested_exception (abort requested). Aborting the topology operation"

This change adds `seastar::abort_requested_exception` handling alongside `raft::request_aborted` in all places where it was missing. When rethrown, these exceptions propagate up to the main `run()` loop where `handle_topology_coordinator_error()` recognizes them as normal abort signals and allows the coordinator to exit gracefully without triggering unnecessary rollback operations.

Fixes: scylladb/scylladb#27255

No backport: the problem was seen only in tests and was not reported in customer tickets, so fixing it in the main branch is sufficient.

- (cherry picked from commit 37e3dacf33)

Parent PR: #27314

Closes scylladb/scylladb#27660

* https://github.com/scylladb/scylladb:
  topology_coordinator: handle seastar::abort_requested_exception alongside raft::request_aborted
  topology_coordinator: consistently rethrow `raft::request_aborted` for direct/global commands
This commit is contained in:
Patryk Jędrzejczak
2025-12-20 19:37:17 +01:00

View File

@@ -2004,6 +2004,8 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
throw;
} catch (raft::request_aborted&) {
throw;
} catch (seastar::abort_requested_exception&) {
throw;
} catch (...) {
rtlogger.error("transition_state::join_group0, "
"global_token_metadata_barrier failed, error {}",
@@ -2140,6 +2142,8 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
throw;
} catch (raft::request_aborted&) {
throw;
} catch (seastar::abort_requested_exception&) {
throw;
} catch (...) {
rtlogger.error("transition_state::commit_cdc_generation, "
"raft_topology_cmd::command::barrier failed, error {}", std::current_exception());
@@ -2220,6 +2224,8 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
throw;
} catch (raft::request_aborted&) {
throw;
} catch (seastar::abort_requested_exception&) {
throw;
} catch (...) {
rtlogger.error("tablets draining failed with {}. Aborting the topology operation", std::current_exception());
_rollback = fmt::format("Failed to drain tablets: {}", std::current_exception());
@@ -2237,6 +2243,8 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
throw;
} catch (raft::request_aborted&) {
throw;
} catch (seastar::abort_requested_exception&) {
throw;
} catch (...) {
rtlogger.error("transition_state::write_both_read_old, "
"global_token_metadata_barrier failed, error {}",
@@ -2303,6 +2311,8 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
throw;
} catch (raft::request_aborted&) {
throw;
} catch (seastar::abort_requested_exception&) {
throw;
} catch (...) {
rtlogger.error("send_raft_topology_cmd(stream_ranges) failed with exception"
" (node state is {}): {}", state, std::current_exception());
@@ -2335,6 +2345,8 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
throw;
} catch (raft::request_aborted&) {
throw;
} catch (seastar::abort_requested_exception&) {
throw;
} catch (...) {
rtlogger.error("transition_state::write_both_read_new, "
"global_token_metadata_barrier failed, error {}",
@@ -2478,6 +2490,8 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
throw;
} catch (raft::request_aborted&) {
throw;
} catch (seastar::abort_requested_exception&) {
throw;
} catch (...) {
rtlogger.error("transition_state::left_token_ring, "
"raft_topology_cmd::command::barrier failed, error {}",
@@ -2552,6 +2566,10 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
node.guard = co_await exec_global_command(std::move(node.guard),raft_topology_cmd::command::barrier_and_drain, get_excluded_nodes(node), drop_guard_and_retake::yes);
} catch (term_changed_error&) {
throw;
} catch (raft::request_aborted&) {
throw;
} catch (seastar::abort_requested_exception&) {
throw;
} catch(...) {
rtlogger.warn("failed to run barrier_and_drain during rollback of {} after {} failure: {}",
node.id, state, std::current_exception());
@@ -2629,6 +2647,10 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
}
} catch (term_changed_error&) {
throw;
} catch (raft::request_aborted&) {
throw;
} catch (seastar::abort_requested_exception&) {
throw;
} catch(...) {
wait_for_ip_error = std::current_exception();
rtlogger.warn("raft_topology_cmd::command::wait_for_ip failed, error {}",
@@ -2751,6 +2773,10 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
rtbuilder.done();
} catch (term_changed_error&) {
throw;
} catch (raft::request_aborted&) {
throw;
} catch (seastar::abort_requested_exception&) {
throw;
} catch (...) {
rtlogger.error("send_raft_topology_cmd(stream_ranges) failed with exception"
" (node state is rebuilding): {}", std::current_exception());
@@ -3338,6 +3364,10 @@ future<> topology_coordinator::fence_previous_coordinator() {
// Abort was requested. Break the loop
rtlogger.debug("request to fence previous coordinator was aborted");
break;
} catch (seastar::abort_requested_exception&) {
// Abort was requested. Break the loop
rtlogger.debug("request to fence previous coordinator was aborted");
break;
} catch (...) {
rtlogger.error("failed to fence previous coordinator {}", std::current_exception());
}