From 2811b1df0a8f4b2e6868ccac28fd90147c2165da Mon Sep 17 00:00:00 2001 From: Tomasz Grabiec Date: Sun, 16 Jul 2023 23:26:02 +0200 Subject: [PATCH] topology_coordinator: Fix missed notification on abort If _as is aborted while the coordinator is in the middle of handling work, and the coordinator then decides to go to sleep, it may go to sleep without noticing that it was aborted. Fix by checking for the abort before blocking on the condition variable. In general, every condition which can cause signal() to be called should be checked before calling when(). This patch doesn't fix all the cases. For example, signal() can be called when a new topology request arrives. This can happen after the coordinator has already checked for requests, because it releases the guard before calling when(). --- service/storage_service.cc | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/service/storage_service.cc b/service/storage_service.cc index 83066bc5ab..d2a86544f0 100644 --- a/service/storage_service.cc +++ b/service/storage_service.cc @@ -1744,6 +1744,11 @@ class topology_coordinator { // Returns true if the state machine was transitioned into tablet migration path. future maybe_start_tablet_migration(group0_guard); + + future<> await_event() { + _as.check(); + co_await _topo_sm.event.when(); + } public: topology_coordinator( sharded& sys_dist_ks, @@ -1817,7 +1822,7 @@ future<> topology_coordinator::run() { if (!had_work) { // Nothing to work on. Wait for topology change event. slogger.trace("raft topology: topology coordinator fiber has nothing to do. Sleeping."); - co_await _topo_sm.event.when(); + co_await await_event(); slogger.trace("raft topology: topology coordinator fiber got an event"); } } catch (raft::request_aborted&) {