From 2fa89d86966e35b1f4eb3e6edc3f585a8316610b Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Mon, 3 Jun 2024 09:21:34 +0300 Subject: [PATCH 01/10] main: Move some trailing startup earlier The set_abort_on_ebadf() call and some api endpoints registration come after protocol servers. The latter is going to be shuffled, so move the former earlier not to hang around. Signed-off-by: Pavel Emelyanov --- main.cc | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/main.cc b/main.cc index 073a8cf727..eb010d9f69 100644 --- a/main.cc +++ b/main.cc @@ -1960,8 +1960,9 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl }); db.invoke_on_all(&replica::database::revert_initial_system_read_concurrency_boost).get(); - notify_set.notify_all(configurable::system_state::started).get(); + seastar::set_abort_on_ebadf(cfg->abort_on_ebadf()); + api::set_server_done(ctx).get(); cql_transport::controller cql_server_ctl(auth_service, mm_notifier, gossiper, qp, service_memory_limiter, sl_controller, lifecycle_notifier, *cfg, cql_sg_stats_key, maintenance_socket_enabled::no, dbcfg.statement_scheduling_group); @@ -2025,8 +2026,6 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl } ss.local().register_protocol_server(redis_ctl); - seastar::set_abort_on_ebadf(cfg->abort_on_ebadf()); - api::set_server_done(ctx).get(); supervisor::notify("serving"); // Register at_exit last, so that storage_service::drain_on_shutdown will be called first From 315ef4c48476e94748867052100893693d435a3b Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Mon, 3 Jun 2024 09:22:33 +0300 Subject: [PATCH 02/10] main: Schedule deferred drain_on_shutdown() prior to protocol servers Nex patches will remove protocol servers' deferred stops and will rely on drain_on_shutdown -> stop_transport to do it, so the drain deferred action some come before protocol servers' registration. This also fixes a bug. Currently alternator and redis both rely on protocol servers to stop them on shutdown. However, when startup is aborted prior to drain_on_shutdown() registration, protocol servers are not stopped and alternator and redis can remain stopped. Signed-off-by: Pavel Emelyanov --- main.cc | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/main.cc b/main.cc index eb010d9f69..4d018e66d7 100644 --- a/main.cc +++ b/main.cc @@ -1964,6 +1964,11 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl seastar::set_abort_on_ebadf(cfg->abort_on_ebadf()); api::set_server_done(ctx).get(); + // Register at_exit last, so that storage_service::drain_on_shutdown will be called first + auto do_drain = defer_verbose_shutdown("local storage", [&ss] { + ss.local().drain_on_shutdown().get(); + }); + cql_transport::controller cql_server_ctl(auth_service, mm_notifier, gossiper, qp, service_memory_limiter, sl_controller, lifecycle_notifier, *cfg, cql_sg_stats_key, maintenance_socket_enabled::no, dbcfg.statement_scheduling_group); ss.local().register_protocol_server(cql_server_ctl); @@ -2027,11 +2032,6 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl ss.local().register_protocol_server(redis_ctl); supervisor::notify("serving"); - // Register at_exit last, so that storage_service::drain_on_shutdown will be called first - - auto do_drain = defer_verbose_shutdown("local storage", [&ss] { - ss.local().drain_on_shutdown().get(); - }); startlog.info("Scylla version {} initialization completed.", scylla_version()); if (after_init_func) { From eb033e3c5fb09331879ba3676d659dda5f4b19e4 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Mon, 3 Jun 2024 09:26:48 +0300 Subject: [PATCH 03/10] storage_service: Outline register_protocol_server() To make next patch shorter Signed-off-by: Pavel Emelyanov --- service/storage_service.cc | 4 ++++ service/storage_service.hh | 4 +--- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/service/storage_service.cc b/service/storage_service.cc index a451187fe5..e8fae1e2fe 100644 --- a/service/storage_service.cc +++ b/service/storage_service.cc @@ -7204,5 +7204,9 @@ void storage_service::set_topology_change_kind(topology_change_kind kind) { _gossiper.set_topology_state_machine(kind == topology_change_kind::raft ? & _topology_state_machine : nullptr); } +void storage_service::register_protocol_server(protocol_server& server) { + _protocol_servers.push_back(&server); +} + } // namespace service diff --git a/service/storage_service.hh b/service/storage_service.hh index a38fe4b395..c73421e755 100644 --- a/service/storage_service.hh +++ b/service/storage_service.hh @@ -329,9 +329,7 @@ public: // should only be called via JMX future is_gossip_running(); - void register_protocol_server(protocol_server& server) { - _protocol_servers.push_back(&server); - } + void register_protocol_server(protocol_server& server); // All pointers are valid. const std::vector& protocol_servers() const { From 2aab9f634041b223d2478e94f23157e43fcea8df Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Mon, 3 Jun 2024 09:27:20 +0300 Subject: [PATCH 04/10] storage_service: Turn register_protocol_server() async method To make the next patch shorter Signed-off-by: Pavel Emelyanov --- main.cc | 8 ++++---- service/storage_service.cc | 3 ++- service/storage_service.hh | 2 +- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/main.cc b/main.cc index 4d018e66d7..b5e66f53b5 100644 --- a/main.cc +++ b/main.cc @@ -1971,7 +1971,7 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl cql_transport::controller cql_server_ctl(auth_service, mm_notifier, gossiper, qp, service_memory_limiter, sl_controller, lifecycle_notifier, *cfg, cql_sg_stats_key, maintenance_socket_enabled::no, dbcfg.statement_scheduling_group); - ss.local().register_protocol_server(cql_server_ctl); + ss.local().register_protocol_server(cql_server_ctl).get(); std::any stop_cql; @@ -1986,7 +1986,7 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl ::thrift_controller thrift_ctl(db, auth_service, qp, service_memory_limiter, ss, proxy, dbcfg.statement_scheduling_group); - ss.local().register_protocol_server(thrift_ctl); + ss.local().register_protocol_server(thrift_ctl).get(); std::any stop_rpc; if (cfg->start_rpc()) { @@ -2023,13 +2023,13 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl return es.invoke_on_all(&alternator::expiration_service::start); }).get(); } - ss.local().register_protocol_server(alternator_ctl); + ss.local().register_protocol_server(alternator_ctl).get(); redis::controller redis_ctl(proxy, auth_service, mm, *cfg, gossiper, dbcfg.statement_scheduling_group); if (cfg->redis_port() || cfg->redis_ssl_port()) { redis_ctl.start_server().get(); } - ss.local().register_protocol_server(redis_ctl); + ss.local().register_protocol_server(redis_ctl).get(); supervisor::notify("serving"); diff --git a/service/storage_service.cc b/service/storage_service.cc index e8fae1e2fe..5af7d5d732 100644 --- a/service/storage_service.cc +++ b/service/storage_service.cc @@ -7204,8 +7204,9 @@ void storage_service::set_topology_change_kind(topology_change_kind kind) { _gossiper.set_topology_state_machine(kind == topology_change_kind::raft ? & _topology_state_machine : nullptr); } -void storage_service::register_protocol_server(protocol_server& server) { +future<> storage_service::register_protocol_server(protocol_server& server) { _protocol_servers.push_back(&server); + co_return; } } // namespace service diff --git a/service/storage_service.hh b/service/storage_service.hh index c73421e755..b6597a0400 100644 --- a/service/storage_service.hh +++ b/service/storage_service.hh @@ -329,7 +329,7 @@ public: // should only be called via JMX future is_gossip_running(); - void register_protocol_server(protocol_server& server); + future<> register_protocol_server(protocol_server& server); // All pointers are valid. const std::vector& protocol_servers() const { From 9292d326b7d1ed0f780f161a6b6af0f67fb1db0d Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Mon, 3 Jun 2024 09:28:51 +0300 Subject: [PATCH 05/10] storage_service: Make register_protocol_server() start the server After a protocol server is registered, it can be instantly started by the main code. It makes sense to generalize this sequence by teaching register_protocol_server() start it. For now it's a no-op change, as "start_instantly" is false by default, but next patches will make use of it. Signed-off-by: Pavel Emelyanov --- service/storage_service.cc | 6 ++++-- service/storage_service.hh | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/service/storage_service.cc b/service/storage_service.cc index 5af7d5d732..f644e38476 100644 --- a/service/storage_service.cc +++ b/service/storage_service.cc @@ -7204,9 +7204,11 @@ void storage_service::set_topology_change_kind(topology_change_kind kind) { _gossiper.set_topology_state_machine(kind == topology_change_kind::raft ? & _topology_state_machine : nullptr); } -future<> storage_service::register_protocol_server(protocol_server& server) { +future<> storage_service::register_protocol_server(protocol_server& server, bool start_instantly) { _protocol_servers.push_back(&server); - co_return; + if (start_instantly) { + co_await server.start_server(); + } } } // namespace service diff --git a/service/storage_service.hh b/service/storage_service.hh index b6597a0400..d44265623b 100644 --- a/service/storage_service.hh +++ b/service/storage_service.hh @@ -329,7 +329,7 @@ public: // should only be called via JMX future is_gossip_running(); - future<> register_protocol_server(protocol_server& server); + future<> register_protocol_server(protocol_server& server, bool start_instantly = false); // All pointers are valid. const std::vector& protocol_servers() const { From 830a87e86205f07469277f5b6912b82643eedeb0 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Mon, 3 Jun 2024 09:29:14 +0300 Subject: [PATCH 06/10] main: Start native transport transparently It's now possible to start protocol server when registered. It will also be stopped automatically on shutdown / aborted shutdown. Signed-off-by: Pavel Emelyanov --- main.cc | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/main.cc b/main.cc index b5e66f53b5..7e2b839745 100644 --- a/main.cc +++ b/main.cc @@ -1964,21 +1964,18 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl seastar::set_abort_on_ebadf(cfg->abort_on_ebadf()); api::set_server_done(ctx).get(); + // Create controllers before drain_on_shutdown() below, so that it destructs + // after drain stops them in stop_transport() + // Register controllers after drain_on_shutdown() below, so that even on start + // failure drain is called and stops controllers + cql_transport::controller cql_server_ctl(auth_service, mm_notifier, gossiper, qp, service_memory_limiter, sl_controller, lifecycle_notifier, *cfg, cql_sg_stats_key, maintenance_socket_enabled::no, dbcfg.statement_scheduling_group); + // Register at_exit last, so that storage_service::drain_on_shutdown will be called first auto do_drain = defer_verbose_shutdown("local storage", [&ss] { ss.local().drain_on_shutdown().get(); }); - cql_transport::controller cql_server_ctl(auth_service, mm_notifier, gossiper, qp, service_memory_limiter, sl_controller, lifecycle_notifier, *cfg, cql_sg_stats_key, maintenance_socket_enabled::no, dbcfg.statement_scheduling_group); - - ss.local().register_protocol_server(cql_server_ctl).get(); - - std::any stop_cql; - - if (cfg->start_native_transport()) { - start_cql(cql_server_ctl, stop_cql, "native transport"); - } - + ss.local().register_protocol_server(cql_server_ctl, cfg->start_native_transport()).get(); api::set_transport_controller(ctx, cql_server_ctl).get(); auto stop_transport_controller = defer_verbose_shutdown("transport controller API", [&ctx] { api::unset_transport_controller(ctx).get(); From d3e112179363bb16680158be5ea49834d3eff5df Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Mon, 3 Jun 2024 09:29:33 +0300 Subject: [PATCH 07/10] main: Start thrift transparently It's now possible to start protocol server when registered. It will also be stopped automatically on shutdown / aborted shutdown. It also fixes a rare bug. If thrifst is not asked to be started on boot, its deferred shutdown action isn't created, so it it's later started via the API, it won't be stopped on shutdown. Signed-off-by: Pavel Emelyanov --- main.cc | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/main.cc b/main.cc index 7e2b839745..9ecf7ae810 100644 --- a/main.cc +++ b/main.cc @@ -1969,6 +1969,7 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl // Register controllers after drain_on_shutdown() below, so that even on start // failure drain is called and stops controllers cql_transport::controller cql_server_ctl(auth_service, mm_notifier, gossiper, qp, service_memory_limiter, sl_controller, lifecycle_notifier, *cfg, cql_sg_stats_key, maintenance_socket_enabled::no, dbcfg.statement_scheduling_group); + ::thrift_controller thrift_ctl(db, auth_service, qp, service_memory_limiter, ss, proxy, dbcfg.statement_scheduling_group); // Register at_exit last, so that storage_service::drain_on_shutdown will be called first auto do_drain = defer_verbose_shutdown("local storage", [&ss] { @@ -1981,19 +1982,7 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl api::unset_transport_controller(ctx).get(); }); - ::thrift_controller thrift_ctl(db, auth_service, qp, service_memory_limiter, ss, proxy, dbcfg.statement_scheduling_group); - - ss.local().register_protocol_server(thrift_ctl).get(); - - std::any stop_rpc; - if (cfg->start_rpc()) { - thrift_ctl.start_server().get(); - // FIXME -- this should be done via client hooks instead - stop_rpc = defer_verbose_shutdown("rpc server", [&thrift_ctl] { - thrift_ctl.stop_server().get(); - }); - } - + ss.local().register_protocol_server(thrift_ctl, cfg->start_rpc()).get(); api::set_rpc_controller(ctx, thrift_ctl).get(); auto stop_rpc_controller = defer_verbose_shutdown("rpc controller API", [&ctx] { api::unset_rpc_controller(ctx).get(); From 4204d7f4f9af15b6503a8c467042e9399328906a Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Mon, 3 Jun 2024 09:30:03 +0300 Subject: [PATCH 08/10] main: Start alternator transparently It's now possible to start protocol server when registered. It will also be stopped automatically on shutdown / aborted shutdown. Also move the controller variable lower to keep it all next to each other. Signed-off-by: Pavel Emelyanov --- main.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/main.cc b/main.cc index 9ecf7ae810..455e13d36e 100644 --- a/main.cc +++ b/main.cc @@ -1970,6 +1970,7 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl // failure drain is called and stops controllers cql_transport::controller cql_server_ctl(auth_service, mm_notifier, gossiper, qp, service_memory_limiter, sl_controller, lifecycle_notifier, *cfg, cql_sg_stats_key, maintenance_socket_enabled::no, dbcfg.statement_scheduling_group); ::thrift_controller thrift_ctl(db, auth_service, qp, service_memory_limiter, ss, proxy, dbcfg.statement_scheduling_group); + alternator::controller alternator_ctl(gossiper, proxy, mm, sys_dist_ks, cdc_generation_service, service_memory_limiter, auth_service, sl_controller, *cfg, dbcfg.statement_scheduling_group); // Register at_exit last, so that storage_service::drain_on_shutdown will be called first auto do_drain = defer_verbose_shutdown("local storage", [&ss] { @@ -1988,12 +1989,10 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl api::unset_rpc_controller(ctx).get(); }); - alternator::controller alternator_ctl(gossiper, proxy, mm, sys_dist_ks, cdc_generation_service, service_memory_limiter, auth_service, sl_controller, *cfg, dbcfg.statement_scheduling_group); sharded es; std::any stop_expiration_service; if (cfg->alternator_port() || cfg->alternator_https_port()) { - alternator_ctl.start_server().get(); // Start the expiration service on all shards. // Currently we only run it if Alternator is enabled, because // only Alternator uses it for its TTL feature. But in the @@ -2009,7 +2008,8 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl return es.invoke_on_all(&alternator::expiration_service::start); }).get(); } - ss.local().register_protocol_server(alternator_ctl).get(); + + ss.local().register_protocol_server(alternator_ctl, cfg->alternator_port() || cfg->alternator_https_port()).get(); redis::controller redis_ctl(proxy, auth_service, mm, *cfg, gossiper, dbcfg.statement_scheduling_group); if (cfg->redis_port() || cfg->redis_ssl_port()) { From d7c231ede9b6cda0feac9ee5722baeacc54df644 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Mon, 3 Jun 2024 09:30:31 +0300 Subject: [PATCH 09/10] main: Start redis transparently It's now possible to start protocol server when registered. It will also be stopped automatically on shutdown / aborted shutdown. Signed-off-by: Pavel Emelyanov --- main.cc | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/main.cc b/main.cc index 455e13d36e..d913abdaf7 100644 --- a/main.cc +++ b/main.cc @@ -1971,6 +1971,7 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl cql_transport::controller cql_server_ctl(auth_service, mm_notifier, gossiper, qp, service_memory_limiter, sl_controller, lifecycle_notifier, *cfg, cql_sg_stats_key, maintenance_socket_enabled::no, dbcfg.statement_scheduling_group); ::thrift_controller thrift_ctl(db, auth_service, qp, service_memory_limiter, ss, proxy, dbcfg.statement_scheduling_group); alternator::controller alternator_ctl(gossiper, proxy, mm, sys_dist_ks, cdc_generation_service, service_memory_limiter, auth_service, sl_controller, *cfg, dbcfg.statement_scheduling_group); + redis::controller redis_ctl(proxy, auth_service, mm, *cfg, gossiper, dbcfg.statement_scheduling_group); // Register at_exit last, so that storage_service::drain_on_shutdown will be called first auto do_drain = defer_verbose_shutdown("local storage", [&ss] { @@ -2011,11 +2012,7 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl ss.local().register_protocol_server(alternator_ctl, cfg->alternator_port() || cfg->alternator_https_port()).get(); - redis::controller redis_ctl(proxy, auth_service, mm, *cfg, gossiper, dbcfg.statement_scheduling_group); - if (cfg->redis_port() || cfg->redis_ssl_port()) { - redis_ctl.start_server().get(); - } - ss.local().register_protocol_server(redis_ctl).get(); + ss.local().register_protocol_server(redis_ctl, cfg->redis_port() || cfg->redis_ssl_port()).get(); supervisor::notify("serving"); From 9e654346925d868e67b3ecdbb711e2d7944c8102 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Mon, 3 Jun 2024 22:58:06 +0300 Subject: [PATCH 10/10] main: Start alternator expiration service earlier Prior to registering drain_on_shutdown and all the protorocl servers. To keep the natural sequence - start core - register drain-on-shutdown - start transport(s) Signed-off-by: Pavel Emelyanov --- main.cc | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/main.cc b/main.cc index d913abdaf7..3ba6c0c1b5 100644 --- a/main.cc +++ b/main.cc @@ -1959,6 +1959,26 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl api::unset_server_view_builder(ctx).get(); }); + sharded es; + std::any stop_expiration_service; + + if (cfg->alternator_port() || cfg->alternator_https_port()) { + // Start the expiration service on all shards. + // Currently we only run it if Alternator is enabled, because + // only Alternator uses it for its TTL feature. But in the + // future if we add a CQL interface to it, we may want to + // start this outside the Alternator if(). + supervisor::notify("starting the expiration service"); + es.start(seastar::sharded_parameter([] (const replica::database& db) { return db.as_data_dictionary(); }, std::ref(db)), + std::ref(proxy), std::ref(gossiper)).get(); + stop_expiration_service = defer_verbose_shutdown("expiration service", [&es] { + es.stop().get(); + }); + with_scheduling_group(maintenance_scheduling_group, [&es] { + return es.invoke_on_all(&alternator::expiration_service::start); + }).get(); + } + db.invoke_on_all(&replica::database::revert_initial_system_read_concurrency_boost).get(); notify_set.notify_all(configurable::system_state::started).get(); seastar::set_abort_on_ebadf(cfg->abort_on_ebadf()); @@ -1990,26 +2010,6 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl api::unset_rpc_controller(ctx).get(); }); - sharded es; - std::any stop_expiration_service; - - if (cfg->alternator_port() || cfg->alternator_https_port()) { - // Start the expiration service on all shards. - // Currently we only run it if Alternator is enabled, because - // only Alternator uses it for its TTL feature. But in the - // future if we add a CQL interface to it, we may want to - // start this outside the Alternator if(). - supervisor::notify("starting the expiration service"); - es.start(seastar::sharded_parameter([] (const replica::database& db) { return db.as_data_dictionary(); }, std::ref(db)), - std::ref(proxy), std::ref(gossiper)).get(); - stop_expiration_service = defer_verbose_shutdown("expiration service", [&es] { - es.stop().get(); - }); - with_scheduling_group(maintenance_scheduling_group, [&es] { - return es.invoke_on_all(&alternator::expiration_service::start); - }).get(); - } - ss.local().register_protocol_server(alternator_ctl, cfg->alternator_port() || cfg->alternator_https_port()).get(); ss.local().register_protocol_server(redis_ctl, cfg->redis_port() || cfg->redis_ssl_port()).get();