From 65bf5877e78612006ab898dad458b02ed38d61b6 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Mon, 23 Oct 2023 13:03:17 +0300 Subject: [PATCH 1/2] storage_service: raft topology: do not throw error from fence_previous_coordinator() Throwing error kills the topology coordinator monitor fiber. Instead we retry the operation until it succeeds or the node looses its leadership. This is fine before for the operation to succeed quorum is needed and if the quorum is not available the node should relinquish its leadership. Fixes #15728 --- service/storage_service.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/service/storage_service.cc b/service/storage_service.cc index 5a033af140..7d2b41c800 100644 --- a/service/storage_service.cc +++ b/service/storage_service.cc @@ -2399,7 +2399,7 @@ future<> topology_coordinator::fence_previous_coordinator() { // Topology state machine moves to state S while RPC R is still running. // If RPC is idempotent that should not be a problem since second one executed by B will do nothing, // but better to be safe and cut off previous write attempt - while (true) { + while (!_as.abort_requested()) { try { auto guard = co_await start_operation(); topology_mutation_builder builder(guard.write_timestamp()); @@ -2410,8 +2410,8 @@ future<> topology_coordinator::fence_previous_coordinator() { continue; } catch (...) { slogger.error("raft topology: failed to fence previous coordinator {}", std::current_exception()); - throw; } + co_await seastar::sleep_abortable(std::chrono::seconds(1), _as); } } From dcaaa74cd4d72b075871cd154a81355fb04411ee Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Mon, 23 Oct 2023 13:30:37 +0300 Subject: [PATCH 2/2] storage_service: raft topology: mark topology_coordinator::run function as noexcept The function handled all exceptions internally. By making it noexcept we make sure that the caller (raft_state_monitor_fiber) does not need handle any exceptions from the topology coordinator fiber. --- service/storage_service.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/service/storage_service.cc b/service/storage_service.cc index 7d2b41c800..293b34d0ad 100644 --- a/service/storage_service.cc +++ b/service/storage_service.cc @@ -2339,7 +2339,7 @@ class topology_coordinator { co_await _topo_sm.event.when(); } - future<> fence_previous_coordinator(); + future<> fence_previous_coordinator() noexcept; public: topology_coordinator( @@ -2358,7 +2358,7 @@ public: , _ring_delay(ring_delay) {} - future<> run(); + future<> run() noexcept; }; future topology_coordinator::maybe_start_tablet_migration(group0_guard guard) { @@ -2385,7 +2385,7 @@ future topology_coordinator::maybe_start_tablet_migration(group0_guard gua co_return true; } -future<> topology_coordinator::fence_previous_coordinator() { +future<> topology_coordinator::fence_previous_coordinator() noexcept { // Write empty change to make sure that a guard taken by any previous coordinator cannot // be used to do a successful write any more. Otherwise the following can theoretically happen // while a coordinator tries to execute RPC R and move to state S. @@ -2415,7 +2415,7 @@ future<> topology_coordinator::fence_previous_coordinator() { } } -future<> topology_coordinator::run() { +future<> topology_coordinator::run() noexcept { slogger.info("raft topology: start topology coordinator fiber"); auto abort = _as.subscribe([this] () noexcept {