From b03fa87551173dd530a8a660d6e2891bfb44816b Mon Sep 17 00:00:00 2001 From: Kamil Braun Date: Fri, 3 Nov 2023 15:26:49 +0100 Subject: [PATCH] storage_service: make shadow round mandatory during bootstrap/replace It is unsafe to bootstrap or perform replace without performing the shadow round, which is used to obtain features from the existing cluster and verify that we support all enabled features. Before this patch, I could easily produce the following scenario: 1. bootstrap first node in the cluster 2. shut it down 3. start bootstrapping second node, pointing to the first as seed 4. the second node skips shadow round because it gets `rpc::closed_error` when trying to connect to first node. 5. the node then passes the feature check (!) and proceeds to the next step, where it waits for nodes to show up in gossiper 6. we now restart the first node, and the second node finishes bootstrap The shadow round must be mandatory during bootstrap/replace, which is what this patch does. On restart it can remain optional as it was until now. In fact it should be completely unnecessary during restart, but since we did it until now (as best-effort), we can keep doing it. --- gms/gossiper.cc | 6 +++--- gms/gossiper.hh | 3 ++- service/storage_service.cc | 6 +++--- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/gms/gossiper.cc b/gms/gossiper.cc index 6aa3a40e5a..18b4642dbf 100644 --- a/gms/gossiper.cc +++ b/gms/gossiper.cc @@ -2005,8 +2005,8 @@ future<> gossiper::advertise_to_nodes(generation_for_nodes advertise_to_nodes) { }); } -future<> gossiper::do_shadow_round(std::unordered_set nodes) { - return seastar::async([this, g = this->shared_from_this(), nodes = std::move(nodes)] () mutable { +future<> gossiper::do_shadow_round(std::unordered_set nodes, mandatory is_mandatory) { + return seastar::async([this, g = this->shared_from_this(), nodes = std::move(nodes), is_mandatory] () mutable { nodes.erase(get_broadcast_address()); gossip_get_endpoint_states_request request{{ gms::application_state::STATUS, @@ -2046,7 +2046,7 @@ future<> gossiper::do_shadow_round(std::unordered_set nodes) if (!nodes_talked.empty()) { break; } - if (nodes_down == nodes.size()) { + if (nodes_down == nodes.size() && !is_mandatory) { logger.warn("All nodes={} are down for get_endpoint_states verb. Skip ShadowRound.", nodes); break; } diff --git a/gms/gossiper.hh b/gms/gossiper.hh index f2e7a9dd37..2ce4d129a0 100644 --- a/gms/gossiper.hh +++ b/gms/gossiper.hh @@ -589,10 +589,11 @@ public: gms::advertise_myself advertise = gms::advertise_myself::yes); public: + using mandatory = bool_class; /** * Do a single 'shadow' round of gossip, where we do not modify any state */ - future<> do_shadow_round(std::unordered_set nodes); + future<> do_shadow_round(std::unordered_set nodes, mandatory is_mandatory); private: void build_seeds_list(); diff --git a/service/storage_service.cc b/service/storage_service.cc index 42cc479ba2..cb708ff229 100644 --- a/service/storage_service.cc +++ b/service/storage_service.cc @@ -2846,7 +2846,7 @@ future<> storage_service::join_token_ring(sharded storage_service::check_for_endpoint_collision(std::unordered_set // make magic happen slogger.info("Performing gossip shadow round"); - co_await _gossiper.do_shadow_round(initial_contact_nodes); + co_await _gossiper.do_shadow_round(initial_contact_nodes, gms::gossiper::mandatory::yes); if (!_raft_topology_change_enabled) { auto local_features = _feature_service.supported_feature_set(); _gossiper.check_knows_remote_features(local_features, loaded_peer_features);