From db734cd74fc4c8cc60cef1e7c7d6832d7e4a378b Mon Sep 17 00:00:00 2001 From: Kamil Braun Date: Wed, 11 Jan 2023 18:44:41 +0100 Subject: [PATCH] service: storage_service: make leaving node a non-voter before removing it from group 0 in decommission/removenode removenode currently works roughly like this: 1. stream/repair data so it ends up on new replica sets (calculated without the node we want to remove) 2. remove the node from the token ring 3. remove the node from group 0 configuration. If the procedure fails before after step 2 but before step 3 finishes, we're in trouble: the cluster is left with an additional voting group 0 member, which reduces group 0's availability, and there is no way to remove this member because `removenode` no longer considers it to be part of the cluster (it consults the token ring to decide). Improve this failure scenario by including a new step at the beginning: make the node a non-voter in group 0 configuration. Then, even if we fail after removing the node from the token ring but before removing it from group 0, we'll only be left with a non-voter which doesn't reduce availability. We make a similar change for `decommission`: between `unbootstrap()` (which streams data) and `leave_ring()` (which removes our tokens from the ring), become a non-voter. The difference here is that we don't become a non-voter at the beginning, but only after streaming/repair. In `removenode` it's desirable to make the node a non-voter as soon as possible because it's already dead. In decommission it may be desirable for us to remain a voter if we fail during streaming because we're still alive and functional in that case. In a later commit we'll also make it possible to retry `removenode` to remove a node that is only a group 0 member and not a token ring member. --- service/raft/raft_group0.cc | 44 ++++++++++++++++++++++++++++++++ service/raft/raft_group0.hh | 16 ++++++++++++ service/storage_service.cc | 51 +++++++++++++++++++++++++++---------- 3 files changed, 98 insertions(+), 13 deletions(-) diff --git a/service/raft/raft_group0.cc b/service/raft/raft_group0.cc index ada234cc9e..71bf04ae9e 100644 --- a/service/raft/raft_group0.cc +++ b/service/raft/raft_group0.cc @@ -671,6 +671,30 @@ future<> raft_group0::finish_setup_after_join() { }); } +future<> raft_group0::become_nonvoter() { + if (!(co_await raft_upgrade_complete())) { + on_internal_error(group0_log, "called become_nonvoter before Raft upgrade finished"); + } + + auto my_id = load_my_id(); + group0_log.info("becoming a non-voter (my id = {})...", my_id); + + co_await make_raft_config_nonvoter(my_id); + group0_log.info("became a non-voter.", my_id); +} + +future<> raft_group0::make_nonvoter(raft::server_id node) { + if (!(co_await raft_upgrade_complete())) { + on_internal_error(group0_log, "called make_nonvoter before Raft upgrade finished"); + } + + group0_log.info("making server {} a non-voter...", node); + + co_await make_raft_config_nonvoter(node); + + group0_log.info("server {} is now a non-voter.", node); +} + future<> raft_group0::leave_group0() { if (!(co_await raft_upgrade_complete())) { on_internal_error(group0_log, "called leave_group0 before Raft upgrade finished"); @@ -755,6 +779,26 @@ future raft_group0::wait_for_raft() { co_return true; } +future<> raft_group0::make_raft_config_nonvoter(raft::server_id id) { + static constexpr auto max_retry_period = std::chrono::seconds{1}; + auto retry_period = std::chrono::milliseconds{10}; + + // FIXME: cancellability/timeout + while (true) { + try { + co_await _raft_gr.group0().modify_config({{{id, {}}, false}}, {}, &_abort_source); + co_return; + } catch (const raft::commit_status_unknown& e) { + group0_log.info("make_raft_config_nonvoter({}): modify_config returned \"{}\", retrying", id, e); + } + retry_period *= 2; + if (retry_period > max_retry_period) { + retry_period = max_retry_period; + } + co_await sleep_abortable(retry_period, _abort_source); + } +} + future<> raft_group0::remove_from_raft_config(raft::server_id id) { static constexpr auto max_retry_period = std::chrono::seconds{1}; auto retry_period = std::chrono::milliseconds{10}; diff --git a/service/raft/raft_group0.hh b/service/raft/raft_group0.hh index 10d4e98406..375a659967 100644 --- a/service/raft/raft_group0.hh +++ b/service/raft/raft_group0.hh @@ -158,6 +158,18 @@ public: // This is a prerequisite for performing group 0 configuration operations. future wait_for_raft(); + // Become a non-voter in group 0. + // + // Assumes we've finished the startup procedure (`setup_group0()` finished earlier). + // `wait_for_raft` must've also been called earlier and returned `true`. + future<> become_nonvoter(); + + // Make the given server, other than us, a non-voter in group 0. + // + // Assumes we've finished the startup procedure (`setup_group0()` finished earlier). + // `wait_for_raft` must've also been called earlier and returned `true`. + future<> make_nonvoter(raft::server_id); + // Remove ourselves from group 0. // // Assumes we've finished the startup procedure (`setup_group0()` finished earlier). @@ -260,6 +272,10 @@ private: // (we could then try restarting the server internally). future<> start_server_for_group0(raft::group_id group0_id); + // Make the given server a non-voter in Raft group 0 configuration. + // Retries on raft::commit_status_unknown. + future<> make_raft_config_nonvoter(raft::server_id); + // Remove the node from raft config, retries raft::commit_status_unknown. future<> remove_from_raft_config(raft::server_id id); diff --git a/service/storage_service.cc b/service/storage_service.cc index 93ba60aede..2171ea2163 100644 --- a/service/storage_service.cc +++ b/service/storage_service.cc @@ -2003,6 +2003,9 @@ future<> storage_service::decommission() { throw std::runtime_error(msg); } + assert(ss._group0); + bool raft_available = ss._group0->wait_for_raft().get(); + // Step 2: Prepare to sync data std::unordered_set nodes_unknown_verb; std::unordered_set nodes_down; @@ -2038,13 +2041,27 @@ future<> storage_service::decommission() { heartbeat_updater.get(); }); - // Step 5: Start to sync data + // Step 4: Start to sync data slogger.info("DECOMMISSIONING: unbootstrap starts"); ss.unbootstrap().get(); - ss.leave_ring().get(); slogger.info("DECOMMISSIONING: unbootstrap done"); - // Step 6: Finish + // Step 5: Become a group 0 non-voter before leaving the token ring. + // + // Thanks to this, even if we fail after leaving the token ring but before leaving group 0, + // group 0's availability won't be reduced. + if (raft_available) { + slogger.info("decommission[{}]: becoming a group 0 non-voter", uuid); + ss._group0->become_nonvoter().get(); + slogger.info("decommission[{}]: became a group 0 non-voter", uuid); + } + + // Step 6: Leave the token ring + slogger.info("decommission[{}]: leaving token ring", uuid); + ss.leave_ring().get(); + slogger.info("decommission[{}]: left token ring", uuid); + + // Step 7: Finish req.cmd = node_ops_cmd::decommission_done; parallel_for_each(nodes, [&ss, &req, uuid] (const gms::inet_address& node) { return ss._messaging.local().send_node_ops_cmd(netw::msg_addr(node), req).then([uuid, node] (node_ops_cmd_response resp) { @@ -2070,8 +2087,6 @@ future<> storage_service::decommission() { throw; } - assert(ss._group0); - bool raft_available = ss._group0->wait_for_raft().get(); if (raft_available) { slogger.info("DECOMMISSIONING: leaving Raft group 0"); ss._group0->leave_group0().get(); @@ -2391,7 +2406,20 @@ future<> storage_service::removenode(locator::host_id host_id, std::listwait_for_raft().get(); + if (raft_available) { + slogger.info("removenode[{}]: making node {} a non-voter in group 0", uuid, raft_id); + ss._group0->make_nonvoter(raft_id).get(); + slogger.info("removenode[{}]: made node {} a non-voter in group 0", uuid, raft_id); + } + + // Step 3: Prepare to sync data std::unordered_set nodes_unknown_verb; std::unordered_set nodes_down; auto req = node_ops_cmd_request{node_ops_cmd::removenode_prepare, uuid, ignore_nodes, leaving_nodes, {}}; @@ -2418,7 +2446,7 @@ future<> storage_service::removenode(locator::host_id host_id, std::list(false); auto heartbeat_updater = ss.node_ops_cmd_heartbeat_updater(node_ops_cmd::removenode_heartbeat, uuid, nodes, heartbeat_updater_done); auto stop_heartbeat_updater = defer([&] { @@ -2426,7 +2454,7 @@ future<> storage_service::removenode(locator::host_id host_id, std::list storage_service::removenode(locator::host_id host_id, std::list tmp(tokens.begin(), tokens.end()); ss.excise(std::move(tmp), endpoint).get(); - // Step 6: Finish + // Step 7: Finish req.cmd = node_ops_cmd::removenode_done; parallel_for_each(nodes, [&ss, &req, uuid] (const gms::inet_address& node) { return ss._messaging.local().send_node_ops_cmd(netw::msg_addr(node), req).then([uuid, node] (node_ops_cmd_response resp) { @@ -2450,9 +2478,6 @@ future<> storage_service::removenode(locator::host_id host_id, std::listwait_for_raft().get(); if (raft_available) { slogger.info("removenode[{}]: removing node {} from group 0", uuid, raft_id); ss._group0->remove_from_group0(raft_id).get();