mirror of
https://github.com/scylladb/scylladb.git
synced 2026-06-01 04:26:48 +00:00
storage_service: remove optimization in cleanup_group0_config_if_needed
The `topology_coordinator::cleanup_group0_config_if_needed` function first checks whether the number of group 0 members is larger than the number of non-left entries in the topology table, then attempts to remove nodes in left state from group 0 and prints a warning if no such nodes are found. There are some problems with this check: - Currently, a node is added to group 0 before it inserts its entry to the topology table. Such a node may cause the check to succeed but no nodes will be removed, which will cause the warning to be printed needlessly. - Cluster features on raft will reverse the situation and it will be possible for an entry in system.topology to exist without the corresponding node being a part of group 0. This, in turn, may cause the check not to pass when it should and nodes could be removed later than necessary. This commit gets rid of the optimization and the warning, and the topology coordinator will always compute the set of nodes that should be removed. Additionally, the set of nodes to remove is now computed differently: instead of iterating over left nodes and including only those that are in group 0, we now iterate over group 0 members and include those that are in `left` state. As the number of left nodes can potentially grow unbounded and the number of group 0 members is more likely to be bounded, this should give better performance in long-running clusters.
This commit is contained in:
@@ -714,27 +714,22 @@ class topology_coordinator {
|
||||
future<> cleanup_group0_config_if_needed() {
|
||||
auto& topo = _topo_sm._topology;
|
||||
auto rconf = _group0.group0_server().get_configuration();
|
||||
if (rconf.current.size() > topo.normal_nodes.size() + topo.new_nodes.size() + topo.transition_nodes.size()) {
|
||||
// Raft config is larger than the sum of all nodes not in 'left' state.
|
||||
if (!rconf.is_joint()) {
|
||||
// Find nodes that 'left' but still in the config and remove them
|
||||
size_t found = 0;
|
||||
co_await coroutine::parallel_for_each(topo.left_nodes | boost::adaptors::filtered(
|
||||
[&rconf] (raft::server_id id) { return rconf.contains(id); }),
|
||||
[&] (raft::server_id id) -> future<> {
|
||||
found++;
|
||||
auto to_remove = boost::copy_range<std::vector<raft::server_id>>(
|
||||
rconf.current
|
||||
| boost::adaptors::transformed([&] (const raft::config_member& m) { return m.addr.id; })
|
||||
| boost::adaptors::filtered([&] (const raft::server_id& id) { return topo.left_nodes.contains(id); }));
|
||||
if (!to_remove.empty()) {
|
||||
// Remove from group 0 nodes that left. They may failed to do so by themselves
|
||||
try {
|
||||
slogger.trace("raft topology: topology coordinator fiber removing {}"
|
||||
" from the raft since it is in `left` state", id);
|
||||
co_await _group0.group0_server().modify_config({}, {id}, &_as);
|
||||
" from raft since they are in `left` state", to_remove);
|
||||
co_await _group0.group0_server().modify_config({}, to_remove, &_as);
|
||||
} catch (const raft::commit_status_unknown&) {
|
||||
slogger.trace("raft topology: topology coordinator fiber got unknown status"
|
||||
" while removing {} from the raft", id);
|
||||
" while removing {} from raft", to_remove);
|
||||
}
|
||||
});
|
||||
if (!found) {
|
||||
slogger.warn("raft topology: raft config is larger then sum of all nodes"
|
||||
" in non left state but no nodes in left state were found");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user