From b0199b005db0d6f3c61e0f53f8c03ab9375afbfc Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Fri, 25 Jun 2021 12:56:25 +0300 Subject: [PATCH 1/3] storage_service: Stop transport on decommission The stop_transport sequence is: - stop client services (cql, thrift, alternator) - stop gossiping - stop messaging - stop stream manager The decommissioning goes very similarly - stop client services - stop batchlog manager - stop gossiping - stop messaging So this change makes decommission stop all networking _before_ batchlog, like it's already done on drain, and additionally stop the streaming manager. This change is prerequisite for fixing race between transport stop and isolation (on disk error). Signed-off-by: Pavel Emelyanov --- service/storage_service.cc | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/service/storage_service.cc b/service/storage_service.cc index 9dd5b86a55..5f78a1e4da 100644 --- a/service/storage_service.cc +++ b/service/storage_service.cc @@ -2009,18 +2009,14 @@ future<> storage_service::decommission() { throw; } - ss.shutdown_client_servers(); - slogger.info("DECOMMISSIONING: shutdown rpc and cql server done"); + ss.stop_transport().get(); + slogger.info("DECOMMISSIONING: stopped transport"); db::get_batchlog_manager().invoke_on_all([] (auto& bm) { return bm.stop(); }).get(); slogger.info("DECOMMISSIONING: stop batchlog_manager done"); - gms::stop_gossiping().get(); - slogger.info("DECOMMISSIONING: stop_gossiping done"); - ss.do_stop_ms().get(); - slogger.info("DECOMMISSIONING: stop messaging_service done"); // StageManager.shutdownNow(); db::system_keyspace::set_bootstrap_state(db::system_keyspace::bootstrap_state::DECOMMISSIONED).get(); slogger.info("DECOMMISSIONING: set_bootstrap_state done"); From bd2a58060ef554c5f3cfe51e5d1902b5e62b0156 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Fri, 25 Jun 2021 12:59:39 +0300 Subject: [PATCH 2/3] storage_service: Make stop_transport re-entrable It may happen that disk error opccurs and subsequent isolation runs in parallel with drain or decommission or shutdown. In this case the stop_transport method would be running two times in parallel. Also the drain after decommission is not disabled, so it may happen that stop_transport will be called two times in a row (however -- not in parallel). Using shared_promise solves all the possible reentrances. (the indentation is deliberately left broken) Signed-off-by: Pavel Emelyanov --- service/storage_service.cc | 14 +++++++++++++- service/storage_service.hh | 1 + 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/service/storage_service.cc b/service/storage_service.cc index 5f78a1e4da..1936ca57db 100644 --- a/service/storage_service.cc +++ b/service/storage_service.cc @@ -1348,7 +1348,10 @@ future<> storage_service::unregister_subscriber(endpoint_lifecycle_subscriber* s } future<> storage_service::stop_transport() { - return seastar::async([this] { + if (!_transport_stopped.has_value()) { + _transport_stopped.emplace(); + + (void) seastar::async([this] { slogger.info("Stop transport: starts"); shutdown_client_servers(); @@ -1362,9 +1365,18 @@ future<> storage_service::stop_transport() { do_stop_stream_manager().get(); slogger.info("Stop transport: shutdown stream_manager done"); + }).then_wrapped([this] (future<> f) { + if (f.failed()) { + _transport_stopped->set_exception(f.get_exception()); + } else { + _transport_stopped->set_value(); + } slogger.info("Stop transport: done"); }); + } + + return _transport_stopped->get_shared_future(); } future<> storage_service::drain_on_shutdown() { diff --git a/service/storage_service.hh b/service/storage_service.hh index 44d6c2f1e0..d47037e595 100644 --- a/service/storage_service.hh +++ b/service/storage_service.hh @@ -847,6 +847,7 @@ public: private: promise<> _drain_finished; + std::optional> _transport_stopped; future<> do_drain(bool on_shutdown); /** * Seed data to the endpoints that will be responsible for it at the future From a89ae9a8e74cab04b4c7ab25b4aa522a5815d152 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Fri, 25 Jun 2021 13:00:29 +0300 Subject: [PATCH 3/3] storage_service: Indentation fix Signed-off-by: Pavel Emelyanov --- service/storage_service.cc | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/service/storage_service.cc b/service/storage_service.cc index 1936ca57db..ee9a420945 100644 --- a/service/storage_service.cc +++ b/service/storage_service.cc @@ -1349,31 +1349,31 @@ future<> storage_service::unregister_subscriber(endpoint_lifecycle_subscriber* s future<> storage_service::stop_transport() { if (!_transport_stopped.has_value()) { - _transport_stopped.emplace(); + _transport_stopped.emplace(); - (void) seastar::async([this] { - slogger.info("Stop transport: starts"); + (void) seastar::async([this] { + slogger.info("Stop transport: starts"); - shutdown_client_servers(); - slogger.info("Stop transport: shutdown rpc and cql server done"); + shutdown_client_servers(); + slogger.info("Stop transport: shutdown rpc and cql server done"); - gms::stop_gossiping().get(); - slogger.info("Stop transport: stop_gossiping done"); + gms::stop_gossiping().get(); + slogger.info("Stop transport: stop_gossiping done"); - do_stop_ms().get(); - slogger.info("Stop transport: shutdown messaging_service done"); + do_stop_ms().get(); + slogger.info("Stop transport: shutdown messaging_service done"); - do_stop_stream_manager().get(); - slogger.info("Stop transport: shutdown stream_manager done"); - }).then_wrapped([this] (future<> f) { - if (f.failed()) { - _transport_stopped->set_exception(f.get_exception()); - } else { - _transport_stopped->set_value(); - } + do_stop_stream_manager().get(); + slogger.info("Stop transport: shutdown stream_manager done"); + }).then_wrapped([this] (future<> f) { + if (f.failed()) { + _transport_stopped->set_exception(f.get_exception()); + } else { + _transport_stopped->set_value(); + } - slogger.info("Stop transport: done"); - }); + slogger.info("Stop transport: done"); + }); } return _transport_stopped->get_shared_future();