diff --git a/service/raft/raft_group0.cc b/service/raft/raft_group0.cc index ada234cc9e..71bf04ae9e 100644 --- a/service/raft/raft_group0.cc +++ b/service/raft/raft_group0.cc @@ -671,6 +671,30 @@ future<> raft_group0::finish_setup_after_join() { }); } +future<> raft_group0::become_nonvoter() { + if (!(co_await raft_upgrade_complete())) { + on_internal_error(group0_log, "called become_nonvoter before Raft upgrade finished"); + } + + auto my_id = load_my_id(); + group0_log.info("becoming a non-voter (my id = {})...", my_id); + + co_await make_raft_config_nonvoter(my_id); + group0_log.info("became a non-voter.", my_id); +} + +future<> raft_group0::make_nonvoter(raft::server_id node) { + if (!(co_await raft_upgrade_complete())) { + on_internal_error(group0_log, "called make_nonvoter before Raft upgrade finished"); + } + + group0_log.info("making server {} a non-voter...", node); + + co_await make_raft_config_nonvoter(node); + + group0_log.info("server {} is now a non-voter.", node); +} + future<> raft_group0::leave_group0() { if (!(co_await raft_upgrade_complete())) { on_internal_error(group0_log, "called leave_group0 before Raft upgrade finished"); @@ -755,6 +779,26 @@ future raft_group0::wait_for_raft() { co_return true; } +future<> raft_group0::make_raft_config_nonvoter(raft::server_id id) { + static constexpr auto max_retry_period = std::chrono::seconds{1}; + auto retry_period = std::chrono::milliseconds{10}; + + // FIXME: cancellability/timeout + while (true) { + try { + co_await _raft_gr.group0().modify_config({{{id, {}}, false}}, {}, &_abort_source); + co_return; + } catch (const raft::commit_status_unknown& e) { + group0_log.info("make_raft_config_nonvoter({}): modify_config returned \"{}\", retrying", id, e); + } + retry_period *= 2; + if (retry_period > max_retry_period) { + retry_period = max_retry_period; + } + co_await sleep_abortable(retry_period, _abort_source); + } +} + future<> raft_group0::remove_from_raft_config(raft::server_id id) { static constexpr auto max_retry_period = std::chrono::seconds{1}; auto retry_period = std::chrono::milliseconds{10}; diff --git a/service/raft/raft_group0.hh b/service/raft/raft_group0.hh index 10d4e98406..375a659967 100644 --- a/service/raft/raft_group0.hh +++ b/service/raft/raft_group0.hh @@ -158,6 +158,18 @@ public: // This is a prerequisite for performing group 0 configuration operations. future wait_for_raft(); + // Become a non-voter in group 0. + // + // Assumes we've finished the startup procedure (`setup_group0()` finished earlier). + // `wait_for_raft` must've also been called earlier and returned `true`. + future<> become_nonvoter(); + + // Make the given server, other than us, a non-voter in group 0. + // + // Assumes we've finished the startup procedure (`setup_group0()` finished earlier). + // `wait_for_raft` must've also been called earlier and returned `true`. + future<> make_nonvoter(raft::server_id); + // Remove ourselves from group 0. // // Assumes we've finished the startup procedure (`setup_group0()` finished earlier). @@ -260,6 +272,10 @@ private: // (we could then try restarting the server internally). future<> start_server_for_group0(raft::group_id group0_id); + // Make the given server a non-voter in Raft group 0 configuration. + // Retries on raft::commit_status_unknown. + future<> make_raft_config_nonvoter(raft::server_id); + // Remove the node from raft config, retries raft::commit_status_unknown. future<> remove_from_raft_config(raft::server_id id); diff --git a/service/storage_service.cc b/service/storage_service.cc index 93ba60aede..2171ea2163 100644 --- a/service/storage_service.cc +++ b/service/storage_service.cc @@ -2003,6 +2003,9 @@ future<> storage_service::decommission() { throw std::runtime_error(msg); } + assert(ss._group0); + bool raft_available = ss._group0->wait_for_raft().get(); + // Step 2: Prepare to sync data std::unordered_set nodes_unknown_verb; std::unordered_set nodes_down; @@ -2038,13 +2041,27 @@ future<> storage_service::decommission() { heartbeat_updater.get(); }); - // Step 5: Start to sync data + // Step 4: Start to sync data slogger.info("DECOMMISSIONING: unbootstrap starts"); ss.unbootstrap().get(); - ss.leave_ring().get(); slogger.info("DECOMMISSIONING: unbootstrap done"); - // Step 6: Finish + // Step 5: Become a group 0 non-voter before leaving the token ring. + // + // Thanks to this, even if we fail after leaving the token ring but before leaving group 0, + // group 0's availability won't be reduced. + if (raft_available) { + slogger.info("decommission[{}]: becoming a group 0 non-voter", uuid); + ss._group0->become_nonvoter().get(); + slogger.info("decommission[{}]: became a group 0 non-voter", uuid); + } + + // Step 6: Leave the token ring + slogger.info("decommission[{}]: leaving token ring", uuid); + ss.leave_ring().get(); + slogger.info("decommission[{}]: left token ring", uuid); + + // Step 7: Finish req.cmd = node_ops_cmd::decommission_done; parallel_for_each(nodes, [&ss, &req, uuid] (const gms::inet_address& node) { return ss._messaging.local().send_node_ops_cmd(netw::msg_addr(node), req).then([uuid, node] (node_ops_cmd_response resp) { @@ -2070,8 +2087,6 @@ future<> storage_service::decommission() { throw; } - assert(ss._group0); - bool raft_available = ss._group0->wait_for_raft().get(); if (raft_available) { slogger.info("DECOMMISSIONING: leaving Raft group 0"); ss._group0->leave_group0().get(); @@ -2391,7 +2406,20 @@ future<> storage_service::removenode(locator::host_id host_id, std::listwait_for_raft().get(); + if (raft_available) { + slogger.info("removenode[{}]: making node {} a non-voter in group 0", uuid, raft_id); + ss._group0->make_nonvoter(raft_id).get(); + slogger.info("removenode[{}]: made node {} a non-voter in group 0", uuid, raft_id); + } + + // Step 3: Prepare to sync data std::unordered_set nodes_unknown_verb; std::unordered_set nodes_down; auto req = node_ops_cmd_request{node_ops_cmd::removenode_prepare, uuid, ignore_nodes, leaving_nodes, {}}; @@ -2418,7 +2446,7 @@ future<> storage_service::removenode(locator::host_id host_id, std::list(false); auto heartbeat_updater = ss.node_ops_cmd_heartbeat_updater(node_ops_cmd::removenode_heartbeat, uuid, nodes, heartbeat_updater_done); auto stop_heartbeat_updater = defer([&] { @@ -2426,7 +2454,7 @@ future<> storage_service::removenode(locator::host_id host_id, std::list storage_service::removenode(locator::host_id host_id, std::list tmp(tokens.begin(), tokens.end()); ss.excise(std::move(tmp), endpoint).get(); - // Step 6: Finish + // Step 7: Finish req.cmd = node_ops_cmd::removenode_done; parallel_for_each(nodes, [&ss, &req, uuid] (const gms::inet_address& node) { return ss._messaging.local().send_node_ops_cmd(netw::msg_addr(node), req).then([uuid, node] (node_ops_cmd_response resp) { @@ -2450,9 +2478,6 @@ future<> storage_service::removenode(locator::host_id host_id, std::listwait_for_raft().get(); if (raft_available) { slogger.info("removenode[{}]: removing node {} from group 0", uuid, raft_id); ss._group0->remove_from_group0(raft_id).get();