service: storage_service: make leaving node a non-voter before removing it from group 0 in decommission/removenode

removenode currently works roughly like this:
1. stream/repair data so it ends up on new replica sets (calculated
   without the node we want to remove)
2. remove the node from the token ring
3. remove the node from group 0 configuration.

If the procedure fails before after step 2 but before step 3 finishes,
we're in trouble: the cluster is left with an additional voting group 0
member, which reduces group 0's availability, and there is no way to
remove this member because `removenode` no longer considers it to be
part of the cluster (it consults the token ring to decide).

Improve this failure scenario by including a new step at the beginning:
make the node a non-voter in group 0 configuration. Then, even if we
fail after removing the node from the token ring but before removing it
from group 0, we'll only be left with a non-voter which doesn't reduce
availability.

We make a similar change for `decommission`: between `unbootstrap()` (which
streams data) and `leave_ring()` (which removes our tokens from the
ring), become a non-voter. The difference here is that we don't become a
non-voter at the beginning, but only after streaming/repair. In
`removenode` it's desirable to make the node a non-voter as soon as
possible because it's already dead. In decommission it may be desirable
for us to remain a voter if we fail during streaming because we're still
alive and functional in that case.

In a later commit we'll also make it possible to retry `removenode` to
remove a node that is only a group 0 member and not a token ring member.
This commit is contained in:
Kamil Braun
2023-01-11 18:44:41 +01:00
parent 1eee349a17
commit db734cd74f
3 changed files with 98 additions and 13 deletions

View File

@@ -671,6 +671,30 @@ future<> raft_group0::finish_setup_after_join() {
});
}
future<> raft_group0::become_nonvoter() {
if (!(co_await raft_upgrade_complete())) {
on_internal_error(group0_log, "called become_nonvoter before Raft upgrade finished");
}
auto my_id = load_my_id();
group0_log.info("becoming a non-voter (my id = {})...", my_id);
co_await make_raft_config_nonvoter(my_id);
group0_log.info("became a non-voter.", my_id);
}
future<> raft_group0::make_nonvoter(raft::server_id node) {
if (!(co_await raft_upgrade_complete())) {
on_internal_error(group0_log, "called make_nonvoter before Raft upgrade finished");
}
group0_log.info("making server {} a non-voter...", node);
co_await make_raft_config_nonvoter(node);
group0_log.info("server {} is now a non-voter.", node);
}
future<> raft_group0::leave_group0() {
if (!(co_await raft_upgrade_complete())) {
on_internal_error(group0_log, "called leave_group0 before Raft upgrade finished");
@@ -755,6 +779,26 @@ future<bool> raft_group0::wait_for_raft() {
co_return true;
}
future<> raft_group0::make_raft_config_nonvoter(raft::server_id id) {
static constexpr auto max_retry_period = std::chrono::seconds{1};
auto retry_period = std::chrono::milliseconds{10};
// FIXME: cancellability/timeout
while (true) {
try {
co_await _raft_gr.group0().modify_config({{{id, {}}, false}}, {}, &_abort_source);
co_return;
} catch (const raft::commit_status_unknown& e) {
group0_log.info("make_raft_config_nonvoter({}): modify_config returned \"{}\", retrying", id, e);
}
retry_period *= 2;
if (retry_period > max_retry_period) {
retry_period = max_retry_period;
}
co_await sleep_abortable(retry_period, _abort_source);
}
}
future<> raft_group0::remove_from_raft_config(raft::server_id id) {
static constexpr auto max_retry_period = std::chrono::seconds{1};
auto retry_period = std::chrono::milliseconds{10};

View File

@@ -158,6 +158,18 @@ public:
// This is a prerequisite for performing group 0 configuration operations.
future<bool> wait_for_raft();
// Become a non-voter in group 0.
//
// Assumes we've finished the startup procedure (`setup_group0()` finished earlier).
// `wait_for_raft` must've also been called earlier and returned `true`.
future<> become_nonvoter();
// Make the given server, other than us, a non-voter in group 0.
//
// Assumes we've finished the startup procedure (`setup_group0()` finished earlier).
// `wait_for_raft` must've also been called earlier and returned `true`.
future<> make_nonvoter(raft::server_id);
// Remove ourselves from group 0.
//
// Assumes we've finished the startup procedure (`setup_group0()` finished earlier).
@@ -260,6 +272,10 @@ private:
// (we could then try restarting the server internally).
future<> start_server_for_group0(raft::group_id group0_id);
// Make the given server a non-voter in Raft group 0 configuration.
// Retries on raft::commit_status_unknown.
future<> make_raft_config_nonvoter(raft::server_id);
// Remove the node from raft config, retries raft::commit_status_unknown.
future<> remove_from_raft_config(raft::server_id id);

View File

@@ -2003,6 +2003,9 @@ future<> storage_service::decommission() {
throw std::runtime_error(msg);
}
assert(ss._group0);
bool raft_available = ss._group0->wait_for_raft().get();
// Step 2: Prepare to sync data
std::unordered_set<gms::inet_address> nodes_unknown_verb;
std::unordered_set<gms::inet_address> nodes_down;
@@ -2038,13 +2041,27 @@ future<> storage_service::decommission() {
heartbeat_updater.get();
});
// Step 5: Start to sync data
// Step 4: Start to sync data
slogger.info("DECOMMISSIONING: unbootstrap starts");
ss.unbootstrap().get();
ss.leave_ring().get();
slogger.info("DECOMMISSIONING: unbootstrap done");
// Step 6: Finish
// Step 5: Become a group 0 non-voter before leaving the token ring.
//
// Thanks to this, even if we fail after leaving the token ring but before leaving group 0,
// group 0's availability won't be reduced.
if (raft_available) {
slogger.info("decommission[{}]: becoming a group 0 non-voter", uuid);
ss._group0->become_nonvoter().get();
slogger.info("decommission[{}]: became a group 0 non-voter", uuid);
}
// Step 6: Leave the token ring
slogger.info("decommission[{}]: leaving token ring", uuid);
ss.leave_ring().get();
slogger.info("decommission[{}]: left token ring", uuid);
// Step 7: Finish
req.cmd = node_ops_cmd::decommission_done;
parallel_for_each(nodes, [&ss, &req, uuid] (const gms::inet_address& node) {
return ss._messaging.local().send_node_ops_cmd(netw::msg_addr(node), req).then([uuid, node] (node_ops_cmd_response resp) {
@@ -2070,8 +2087,6 @@ future<> storage_service::decommission() {
throw;
}
assert(ss._group0);
bool raft_available = ss._group0->wait_for_raft().get();
if (raft_available) {
slogger.info("DECOMMISSIONING: leaving Raft group 0");
ss._group0->leave_group0().get();
@@ -2391,7 +2406,20 @@ future<> storage_service::removenode(locator::host_id host_id, std::list<locator
throw std::runtime_error(message);
}
// Step 2: Prepare to sync data
// Step 2: Make the node a group 0 non-voter before removing it from the token ring.
//
// Thanks to this, even if we fail after removing the node from the token ring
// but before removing it group 0, group 0's availability won't be reduced.
assert(ss._group0);
auto raft_id = raft::server_id{host_id.uuid()};
bool raft_available = ss._group0->wait_for_raft().get();
if (raft_available) {
slogger.info("removenode[{}]: making node {} a non-voter in group 0", uuid, raft_id);
ss._group0->make_nonvoter(raft_id).get();
slogger.info("removenode[{}]: made node {} a non-voter in group 0", uuid, raft_id);
}
// Step 3: Prepare to sync data
std::unordered_set<gms::inet_address> nodes_unknown_verb;
std::unordered_set<gms::inet_address> nodes_down;
auto req = node_ops_cmd_request{node_ops_cmd::removenode_prepare, uuid, ignore_nodes, leaving_nodes, {}};
@@ -2418,7 +2446,7 @@ future<> storage_service::removenode(locator::host_id host_id, std::list<locator
throw std::runtime_error(msg);
}
// Step 3: Start heartbeat updater
// Step 4: Start heartbeat updater
auto heartbeat_updater_done = make_lw_shared<bool>(false);
auto heartbeat_updater = ss.node_ops_cmd_heartbeat_updater(node_ops_cmd::removenode_heartbeat, uuid, nodes, heartbeat_updater_done);
auto stop_heartbeat_updater = defer([&] {
@@ -2426,7 +2454,7 @@ future<> storage_service::removenode(locator::host_id host_id, std::list<locator
heartbeat_updater.get();
});
// Step 4: Start to sync data
// Step 5: Start to sync data
req.cmd = node_ops_cmd::removenode_sync_data;
parallel_for_each(nodes, [&ss, &req, uuid] (const gms::inet_address& node) {
return ss._messaging.local().send_node_ops_cmd(netw::msg_addr(node), req).then([uuid, node] (node_ops_cmd_response resp) {
@@ -2436,12 +2464,12 @@ future<> storage_service::removenode(locator::host_id host_id, std::list<locator
}).get();
// Step 5: Announce the node has left
// Step 6: Announce the node has left
ss._gossiper.advertise_token_removed(endpoint, host_id).get();
std::unordered_set<token> tmp(tokens.begin(), tokens.end());
ss.excise(std::move(tmp), endpoint).get();
// Step 6: Finish
// Step 7: Finish
req.cmd = node_ops_cmd::removenode_done;
parallel_for_each(nodes, [&ss, &req, uuid] (const gms::inet_address& node) {
return ss._messaging.local().send_node_ops_cmd(netw::msg_addr(node), req).then([uuid, node] (node_ops_cmd_response resp) {
@@ -2450,9 +2478,6 @@ future<> storage_service::removenode(locator::host_id host_id, std::list<locator
});
}).get();
auto raft_id = raft::server_id{host_id.uuid()};
assert(ss._group0);
bool raft_available = ss._group0->wait_for_raft().get();
if (raft_available) {
slogger.info("removenode[{}]: removing node {} from group 0", uuid, raft_id);
ss._group0->remove_from_group0(raft_id).get();