diff --git a/gms/gossiper.cc b/gms/gossiper.cc index c2908cd81e..f59cad8772 100644 --- a/gms/gossiper.cc +++ b/gms/gossiper.cc @@ -2335,7 +2335,15 @@ future<> gossiper::do_stop_gossiping() { // Take the semaphore makes sure existing gossip loop is finished auto units = co_await get_units(_callback_running, 1); co_await container().invoke_on_all([] (auto& g) { - return std::move(g._failure_detector_loop_done); + // #21159 + // gossiper::shutdown can be called from more than once place - both + // storage_service::isolate and normal gossip service stop. The former is + // waited for in storage_service::stop, but if we, as was done in cql_test_env, + // call shutdown independently, we could still end up here twite, and not hit + // the _enabled guard (because we do waiting things before setting it, and setting it + // is also waiting). However, making sure we don't leave an invalid future + // here should ensure even if we reenter this method in such as way, we don't crash. + return std::exchange(g._failure_detector_loop_done, make_ready_future<>()); }); logger.info("Gossip is now stopped"); } diff --git a/test/lib/cql_test_env.cc b/test/lib/cql_test_env.cc index eeca943469..859ae66f6d 100644 --- a/test/lib/cql_test_env.cc +++ b/test/lib/cql_test_env.cc @@ -968,7 +968,11 @@ private: }).get(); auto deinit_storage_service_server = defer([this] { - _gossiper.invoke_on_all(&gms::gossiper::shutdown).get(); + // #21159 don't shutdown gossip here - we don't in main.cc, and we should + // strive to keep the two paths aligned. Doing a gossip::shutdown here + // can, if we've provoked a storage_manager::isolate, cause parallel + // double execution of the shutdown method, which causes waiting for + // an invalid future if we're unlucky. _auth_service.stop().get(); });