storage_service: make shadow round mandatory during bootstrap/replace

It is unsafe to bootstrap or perform replace without performing the
shadow round, which is used to obtain features from the existing cluster
and verify that we support all enabled features.

Before this patch, I could easily produce the following scenario:
1. bootstrap first node in the cluster
2. shut it down
3. start bootstrapping second node, pointing to the first as seed
4. the second node skips shadow round because it gets
   `rpc::closed_error` when trying to connect to first node.
5. the node then passes the feature check (!) and proceeds to the next
   step, where it waits for nodes to show up in gossiper
6. we now restart the first node, and the second node finishes bootstrap

The shadow round must be mandatory during bootstrap/replace, which is
what this patch does.

On restart it can remain optional as it was until now. In fact it should
be completely unnecessary during restart, but since we did it until now
(as best-effort), we can keep doing it.
This commit is contained in:
Kamil Braun
2023-11-03 15:26:49 +01:00
parent 7e9e84200c
commit b03fa87551
3 changed files with 8 additions and 7 deletions

View File

@@ -2005,8 +2005,8 @@ future<> gossiper::advertise_to_nodes(generation_for_nodes advertise_to_nodes) {
});
}
future<> gossiper::do_shadow_round(std::unordered_set<gms::inet_address> nodes) {
return seastar::async([this, g = this->shared_from_this(), nodes = std::move(nodes)] () mutable {
future<> gossiper::do_shadow_round(std::unordered_set<gms::inet_address> nodes, mandatory is_mandatory) {
return seastar::async([this, g = this->shared_from_this(), nodes = std::move(nodes), is_mandatory] () mutable {
nodes.erase(get_broadcast_address());
gossip_get_endpoint_states_request request{{
gms::application_state::STATUS,
@@ -2046,7 +2046,7 @@ future<> gossiper::do_shadow_round(std::unordered_set<gms::inet_address> nodes)
if (!nodes_talked.empty()) {
break;
}
if (nodes_down == nodes.size()) {
if (nodes_down == nodes.size() && !is_mandatory) {
logger.warn("All nodes={} are down for get_endpoint_states verb. Skip ShadowRound.", nodes);
break;
}

View File

@@ -589,10 +589,11 @@ public:
gms::advertise_myself advertise = gms::advertise_myself::yes);
public:
using mandatory = bool_class<class mandatory_tag>;
/**
* Do a single 'shadow' round of gossip, where we do not modify any state
*/
future<> do_shadow_round(std::unordered_set<gms::inet_address> nodes);
future<> do_shadow_round(std::unordered_set<gms::inet_address> nodes, mandatory is_mandatory);
private:
void build_seeds_list();

View File

@@ -2846,7 +2846,7 @@ future<> storage_service::join_token_ring(sharded<db::system_distributed_keyspac
} else {
auto local_features = _feature_service.supported_feature_set();
slogger.info("Performing gossip shadow round, initial_contact_nodes={}", initial_contact_nodes);
co_await _gossiper.do_shadow_round(initial_contact_nodes);
co_await _gossiper.do_shadow_round(initial_contact_nodes, gms::gossiper::mandatory::no);
if (!_raft_topology_change_enabled) {
_gossiper.check_knows_remote_features(local_features, loaded_peer_features);
}
@@ -4146,7 +4146,7 @@ future<> storage_service::check_for_endpoint_collision(std::unordered_set<gms::i
auto local_features = _feature_service.supported_feature_set();
do {
slogger.info("Performing gossip shadow round");
_gossiper.do_shadow_round(initial_contact_nodes).get();
_gossiper.do_shadow_round(initial_contact_nodes, gms::gossiper::mandatory::yes).get();
if (!_raft_topology_change_enabled) {
_gossiper.check_knows_remote_features(local_features, loaded_peer_features);
}
@@ -4225,7 +4225,7 @@ storage_service::prepare_replacement_info(std::unordered_set<gms::inet_address>
// make magic happen
slogger.info("Performing gossip shadow round");
co_await _gossiper.do_shadow_round(initial_contact_nodes);
co_await _gossiper.do_shadow_round(initial_contact_nodes, gms::gossiper::mandatory::yes);
if (!_raft_topology_change_enabled) {
auto local_features = _feature_service.supported_feature_set();
_gossiper.check_knows_remote_features(local_features, loaded_peer_features);