From 8510568edacdf3fda2e31c1e084c063b9b11f736 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Mon, 26 Aug 2024 10:18:09 +0300 Subject: [PATCH] topology coordinator: do not add replacing node without a ring to topology When only inter-dc encryption is enabled, a non-encrypted connection between two nodes is allowed only if both nodes are in the same dc. If a node that initiates the connection knows that the dst is in the same dc and hence uses a non-encrypted connection, but the dst does not yet know the topology of the src, such a connection will not be allowed, since the dst cannot guarantee that the src is in the same dc. Currently, when the topology coordinator is used, a replacing node will appear in the coordinator's topology immediately after it is added to group0. The coordinator will try to send a raft message to the new node and (assuming only inter-dc encryption is enabled and the replacing node and the coordinator are in the same dc) it will try to open a regular, non-encrypted, connection to it. But the replacing node will not have the coordinator in its topology yet (it needs to sync the raft state for that), so it will reject such a connection. To solve the problem, the patch does not add a replacing node that was just added to group0 to the topology. It will be added later, when tokens are assigned to it. At that point the replacing node will already have made sure that its topology state is up-to-date (since it will execute a raft barrier in the join_node_response_params handler) and it knows the coordinator's topology. This aligns replace behaviour with bootstrap, since bootstrap also does not add a node without a ring to the topology. 
The patch effectively reverts b8ee8911ca7c1ad5d1aea9ba5124e5a84499fe9a Fixes: scylladb/scylladb#19025 (cherry picked from commit 17f4a151ce27114a66a83874705c223e0dba9ceb) --- service/storage_service.cc | 21 +++++++++---------- .../test_replace_with_encryption.py | 1 - 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/service/storage_service.cc b/service/storage_service.cc index 53c392d1f3..2b2abbdaec 100644 --- a/service/storage_service.cc +++ b/service/storage_service.cc @@ -559,22 +559,21 @@ future storage_service::sync_raft_t on_fatal_internal_error(rtlogger, ::format("Cannot map id of a node being replaced {} to its ip", replaced_id)); } assert(existing_ip); + if (rs.ring.has_value()) { const auto replaced_host_id = locator::host_id(replaced_id.uuid()); tmptr->update_topology(replaced_host_id, std::nullopt, locator::node::state::being_replaced); update_topology(host_id, ip, rs); tmptr->add_replacing_endpoint(replaced_host_id, host_id); - if (t.tstate != service::topology::transition_state::join_group0) { - co_await update_topology_change_info(tmptr, ::format("replacing {}/{} by {}/{}", replaced_id, *existing_ip, id, ip)); + co_await update_topology_change_info(tmptr, ::format("replacing {}/{} by {}/{}", replaced_id, *existing_ip, id, ip)); } else { - // Do not update pending ranges in join_group0 state yet. After adding replacing endpoint above the - // node will no longer be reported for reads and writes, but it needs to be marked as alive - // before it is reported as pending. Otherwise increased CL during bootstrap will not be satisfied - // (replaced node cannot be contacted and replacing is reported as dead). So instead mark the node - // as UP (the node started report itself as alive already by this point). If it is down it will be - // marked as such eventually. 
- if (existing_ip == ip && !_gossiper.is_alive(*existing_ip)) { - co_await _gossiper.real_mark_alive(*existing_ip); - } + // After adding replacing endpoint above the node will no longer be reported for reads and writes, + // but it needs to be marked as alive before it is reported as pending. Otherwise increased CL + // during bootstrap will not be satisfied (replaced node cannot be contacted and replacing is reported + // as dead). So instead mark the node as UP (the node started report itself as alive already by this + // point). If it is down it will be marked as such eventually. + if (existing_ip == ip && !_gossiper.is_alive(*existing_ip)) { + co_await _gossiper.real_mark_alive(*existing_ip); + } } } break; diff --git a/test/topology_custom/test_replace_with_encryption.py b/test/topology_custom/test_replace_with_encryption.py index bc64e48d42..ea9e8db01c 100644 --- a/test/topology_custom/test_replace_with_encryption.py +++ b/test/topology_custom/test_replace_with_encryption.py @@ -8,7 +8,6 @@ from test.pylib.manager_client import ManagerClient from test.pylib.scylla_cluster import ReplaceConfig @pytest.mark.asyncio -@pytest.mark.xfail(reason="issue #19025") async def test_replace_with_encryption(manager: ManagerClient): """Test that a node can be replaced if inter-dc encryption is enabled. The test creates 6 node cluster with two DCs and replaces one node in