From a2178b7c31e947e1b0e850fd05bac1a5af930bd6 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Sun, 4 May 2025 17:56:20 +0300 Subject: [PATCH 1/3] storage_service: use id to check for local node IP may change and an old gossiper message with previous IP may be processed when it shouldn't. Fixes: #22777 --- service/storage_service.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/service/storage_service.cc b/service/storage_service.cc index 51bc2ab7f0..35b0e6c09d 100644 --- a/service/storage_service.cc +++ b/service/storage_service.cc @@ -457,7 +457,7 @@ future<> storage_service::raft_topology_update_ip(locator::host_id id, gms::inet switch (rs.state) { case node_state::normal: { - if (is_me(ip)) { + if (is_me(id)) { co_return; } // In replace-with-same-ip scenario the replaced node IP will be the same From ecd14753c0029da90291db526fbb44934f7894c0 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Sun, 4 May 2025 17:56:54 +0300 Subject: [PATCH 2/3] storage_service: Do not remove gossiper entry on address change When gossiper indexed entries by ip an old entry had to be removed on an address change, but the index is id based, so even if ip was change the entry should stay. Gossiper simply updates an ip address there. --- service/storage_service.cc | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/service/storage_service.cc b/service/storage_service.cc index 35b0e6c09d..5ab1a635a5 100644 --- a/service/storage_service.cc +++ b/service/storage_service.cc @@ -490,8 +490,6 @@ future<> storage_service::raft_topology_update_ip(locator::host_id id, gms::inet auto old_ip = it->second; sys_ks_futures.push_back(_sys_ks.local().remove_endpoint(old_ip)); - - co_await _gossiper.force_remove_endpoint(id, gms::null_permit_id); } } break; @@ -945,22 +943,13 @@ class storage_service::ip_address_updater: public gms::i_endpoint_state_change_s if (prev_ip == endpoint) { co_return; } - if (_address_map.find(id) != endpoint) { // Address map refused to update IP for the host_id, // this means prev_ip has higher generation than endpoint. - // We can immediately remove endpoint from gossiper - // since it represents an old IP (before an IP change) - // for the given host_id. This is not strictly - // necessary, but it reduces the noise circulated - // in gossiper messages and allows for clearer - // expectations of the gossiper state in tests. - - co_await _ss._gossiper.force_remove_endpoint(id, permit_id); + // Do not update address. co_return; } - // If the host_id <-> IP mapping has changed, we need to update system tables, token_metadat and erm. if (_ss.raft_topology_change_enabled()) { rslog.debug("ip_address_updater::on_endpoint_change({}), host_id {}, " From 7403de241cbe2715a48f869ef4b73b361b8799bd Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Sun, 4 May 2025 17:52:13 +0300 Subject: [PATCH 3/3] test: add reproducer for #22777 Add sleep before starting gossiper to increase a chance of getting old gossiper entry about yourself before updating local gossiper info with new IP address. --- service/storage_service.cc | 2 ++ test/cluster/test_change_ip.py | 3 +++ 2 files changed, 5 insertions(+) diff --git a/service/storage_service.cc b/service/storage_service.cc index 5ab1a635a5..3cd662b733 100644 --- a/service/storage_service.cc +++ b/service/storage_service.cc @@ -1680,6 +1680,8 @@ future<> storage_service::join_topology(sharded& proxy, slogger.info("Starting up server gossip"); + co_await utils::get_local_injector().inject("sleep_before_start_gossiping", std::chrono::milliseconds{500}); + co_await _gossiper.start_gossiping(new_generation, app_states); utils::get_local_injector().inject("stop_after_starting_gossiping", diff --git a/test/cluster/test_change_ip.py b/test/cluster/test_change_ip.py index aa600ad069..aa927b346c 100644 --- a/test/cluster/test_change_ip.py +++ b/test/cluster/test_change_ip.py @@ -117,6 +117,9 @@ async def test_change_two(manager, random_tables, build_mode): # IP-s before they are send back to servers[1] and servers[2], # and the mentioned above code is not exercised by this test. await manager.api.enable_injection(servers[0].ip_addr, 'ip-change-raft-sync-delay', one_shot=False) + # sleep_before_start_gossiping injections are needed to reproduce #22777 + await manager.server_update_config(servers[1].server_id, "error_injections_at_startup", ['sleep_before_start_gossiping']) + await manager.server_update_config(servers[2].server_id, "error_injections_at_startup", ['sleep_before_start_gossiping']) await manager.server_start(servers[1].server_id) servers[1] = ServerInfo(servers[1].server_id, s1_new_ip, s1_new_ip, servers[1].datacenter, servers[1].rack) if build_mode != 'release':