From 242011c4d4bac8c5528f56fbf0ec3560c23fba10 Mon Sep 17 00:00:00 2001 From: Calle Wilund Date: Wed, 30 Oct 2024 14:52:22 +0000 Subject: [PATCH] cql_test_env/gossip: Prevent double shutdown call crash Fixes #21159 When an exception is thrown in sstable write etc such that storage_manager::isolate is initiated, we start a shutdown chain for message service, gossip etc. These are synced (properly) in storage_manager::stop, but if we somehow call gossiper::shutdown outside the normal service::stop cycle, we can end up running the method simultaneously, intertwined (missing the guard because of the state change between check and set). We then end up co_awaiting an invalid future (_failure_detector_loop_done) - a second wait. Fixed by a.) Remove superfluous gossiper::shutdown in cql_test_env. This was added in 20496ed, ages ago. However, it should not be needed nowadays. b.) Ensure _failure_detector_loop_done is always waitable. Just to be sure. (cherry picked from commit c28a5173d9885a0feb34321d5cd495ca7cdc6264) --- gms/gossiper.cc | 10 +++++++++- test/lib/cql_test_env.cc | 6 +++++- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/gms/gossiper.cc b/gms/gossiper.cc index c2908cd81e..f59cad8772 100644 --- a/gms/gossiper.cc +++ b/gms/gossiper.cc @@ -2335,7 +2335,15 @@ future<> gossiper::do_stop_gossiping() { // Take the semaphore makes sure existing gossip loop is finished auto units = co_await get_units(_callback_running, 1); co_await container().invoke_on_all([] (auto& g) { - return std::move(g._failure_detector_loop_done); + // #21159 + // gossiper::shutdown can be called from more than once place - both + // storage_service::isolate and normal gossip service stop. The former is + // waited for in storage_service::stop, but if we, as was done in cql_test_env, + // call shutdown independently, we could still end up here twite, and not hit + // the _enabled guard (because we do waiting things before setting it, and setting it + // is also waiting). However, making sure we don't leave an invalid future + // here should ensure even if we reenter this method in such as way, we don't crash. + return std::exchange(g._failure_detector_loop_done, make_ready_future<>()); }); logger.info("Gossip is now stopped"); } diff --git a/test/lib/cql_test_env.cc b/test/lib/cql_test_env.cc index eeca943469..859ae66f6d 100644 --- a/test/lib/cql_test_env.cc +++ b/test/lib/cql_test_env.cc @@ -968,7 +968,11 @@ private: }).get(); auto deinit_storage_service_server = defer([this] { - _gossiper.invoke_on_all(&gms::gossiper::shutdown).get(); + // #21159 don't shutdown gossip here - we don't in main.cc, and we should + // strive to keep the two paths aligned. Doing a gossip::shutdown here + // can, if we've provoked a storage_manager::isolate, cause parallel + // double execution of the shutdown method, which causes waiting for + // an invalid future if we're unlucky. _auth_service.stop().get(); });