From b8e4e040967f81f6fcc7aab3f2db932a75ecdc5b Mon Sep 17 00:00:00 2001 From: Piotr Dulikowski Date: Tue, 6 Feb 2024 15:46:36 +0100 Subject: [PATCH] storage_service: disallow topology ops when upgrade is in progress Forbid starting new topology changes while upgrade to topology on raft is in progress. While this does not take into account any ongoing topology operations, it makes sure that at the end of the upgrade no node will try to perform any legacy topology operations. --- service/storage_service.cc | 26 ++++++++++++++++++++++++++ service/storage_service.hh | 6 ++++++ 2 files changed, 32 insertions(+) diff --git a/service/storage_service.cc b/service/storage_service.cc index abf18ebaab..fb33bea9fa 100644 --- a/service/storage_service.cc +++ b/service/storage_service.cc @@ -3028,6 +3028,7 @@ future<> storage_service::raft_decommission() { future<> storage_service::decommission() { return run_with_api_lock(sstring("decommission"), [] (storage_service& ss) { return seastar::async([&ss] { + ss.check_ability_to_perform_topology_operation("decommission"); std::exception_ptr leave_group0_ex; if (ss.raft_topology_change_enabled()) { ss.raft_decommission().get(); @@ -3381,6 +3382,7 @@ future<> storage_service::raft_removenode(locator::host_id host_id, std::list storage_service::removenode(locator::host_id host_id, std::list ignore_nodes_params) { return run_with_api_lock(sstring("removenode"), [host_id, ignore_nodes_params = std::move(ignore_nodes_params)] (storage_service& ss) mutable { return seastar::async([&ss, host_id, ignore_nodes_params = std::move(ignore_nodes_params)] () mutable { + ss.check_ability_to_perform_topology_operation("removenode"); if (ss.raft_topology_change_enabled()) { ss.raft_removenode(host_id, std::move(ignore_nodes_params)).get(); return; @@ -3514,6 +3516,8 @@ future<> storage_service::check_and_repair_cdc_streams() { return make_exception_future<>(std::runtime_error("CDC generation service not initialized yet")); } + check_ability_to_perform_topology_operation("checkAndRepairCdcStreams"); + if (raft_topology_change_enabled()) { return raft_check_and_repair_cdc_streams(); } @@ -4078,6 +4082,7 @@ future<> storage_service::raft_check_and_repair_cdc_streams() { future<> storage_service::rebuild(sstring source_dc) { return run_with_api_lock(sstring("rebuild"), [source_dc] (storage_service& ss) -> future<> { + ss.check_ability_to_perform_topology_operation("rebuild"); if (ss.raft_topology_change_enabled()) { co_await ss.raft_rebuild(source_dc); } else { @@ -4110,6 +4115,20 @@ future<> storage_service::rebuild(sstring source_dc) { }); } +void storage_service::check_ability_to_perform_topology_operation(std::string_view operation_name) const { + switch (_topology_change_kind_enabled) { + case topology_change_kind::unknown: + throw std::runtime_error(format("{} is not allowed at this time - the node is still starting", operation_name)); + case topology_change_kind::upgrading_to_raft: + throw std::runtime_error(format("{} is not allowed at this time - the node is still in the process" + " of upgrading to raft topology", operation_name)); + case topology_change_kind::legacy: + return; + case topology_change_kind::raft: + return; + } +} + int32_t storage_service::get_exception_count() { // FIXME // We return 0 for no exceptions, it should probably be @@ -5382,6 +5401,13 @@ future storage_service::join_node_request_handler(join join_node_request_result result; rtlogger.info("received request to join from host_id: {}", params.host_id); + // Sanity check. We should already be using raft topology changes because + // the node asked us via join_node_query about which node to use and + // we responded that they should use raft. We cannot go back from raft + // to legacy (unless we switch to recovery between handling join_node_query + // and join_node_request, which is extremely unlikely). + check_ability_to_perform_topology_operation("join"); + if (params.cluster_name != _db.local().get_config().cluster_name()) { result.result = join_node_request_result::rejected{ .reason = ::format("Cluster name check failed. This node cannot join the cluster " diff --git a/service/storage_service.hh b/service/storage_service.hh index 9dcdc62c1c..695276dd76 100644 --- a/service/storage_service.hh +++ b/service/storage_service.hh @@ -763,6 +763,12 @@ private: }; topology_change_kind _topology_change_kind_enabled = topology_change_kind::unknown; + // Throws an exception if the node is either starting and didn't determine which + // topology operations to use, or if it is in the process of upgrade to topology + // on raft. The name only serves for display purposes (i.e. it will be included + // in the exception, if one is thrown). + void check_ability_to_perform_topology_operation(std::string_view operation_name) const; + public: bool raft_topology_change_enabled() const { return _topology_change_kind_enabled == topology_change_kind::raft;