From 9aebd6dd961b69864f01664818161c19a4d3528a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Patryk=20J=C4=99drzejczak?= Date: Tue, 16 Jan 2024 09:46:03 +0100 Subject: [PATCH] raft topology: send barrier_and_drain to a decommissioning node Before this patch, we didn't send the `barrier_and_drain` command to a decommissioning node that could still be coordinating requests. It could happen that a decommissioning node sent a request with an old topology version after normal nodes received the new fence version. Then, the request would fail on replicas with the stale topology exception. We fix this problem by modifying `exec_global_command`. From now on, it sends `barrier_and_drain` to a decommissioning node, which can also be in the `left_token_ring` state. --- service/topology_coordinator.cc | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/service/topology_coordinator.cc b/service/topology_coordinator.cc index 573d74fb11..c96f9a8331 100644 --- a/service/topology_coordinator.cc +++ b/service/topology_coordinator.cc @@ -385,9 +385,12 @@ class topology_coordinator : public endpoint_lifecycle_subscriber { const std::unordered_set& exclude_nodes, drop_guard_and_retake drop_and_retake = drop_guard_and_retake::yes) { rtlogger.info("executing global topology command {}, excluded nodes: {}", cmd.cmd, exclude_nodes); - auto nodes = _topo_sm._topology.normal_nodes - | boost::adaptors::filtered([&exclude_nodes] (const std::pair& n) { - return !exclude_nodes.contains(n.first); + auto nodes = boost::range::join(_topo_sm._topology.normal_nodes, _topo_sm._topology.transition_nodes) + | boost::adaptors::filtered([&cmd, &exclude_nodes] (const std::pair& n) { + // We must send barrier_and_drain to the decommissioning node as it might be coordinating requests. + bool drain_decommissioning_node = cmd.cmd == raft_topology_cmd::command::barrier_and_drain + && (n.second.state == node_state::decommissioning || n.second.state == node_state::left_token_ring); + return !exclude_nodes.contains(n.first) && (n.second.state == node_state::normal || drain_decommissioning_node); }) | boost::adaptors::map_keys; if (drop_and_retake) {