Merge 'Correctly skip updating node's own ip address due to oudated gossiper data ' from Gleb Natapov

Used host id to check if the update is for the node itself. Using IP is unreliable since if a node is restarted with different IP a gossiper message with previous IP can be misinterpreted as belonging to a different node.

Fixes: #22777

Backport to 2025.1 since this fixes a crash. Older version do not have the code.

Closes scylladb/scylladb#24000

* https://github.com/scylladb/scylladb:
  test: add reproducer for #22777
  storage_service: Do not remove gossiper entry on address change
  storage_service: use id to check for local node
This commit is contained in:
Patryk Jędrzejczak
2025-05-09 11:28:20 +02:00
2 changed files with 7 additions and 13 deletions

View File

@@ -457,7 +457,7 @@ future<> storage_service::raft_topology_update_ip(locator::host_id id, gms::inet
switch (rs.state) {
case node_state::normal: {
if (is_me(ip)) {
if (is_me(id)) {
co_return;
}
// In replace-with-same-ip scenario the replaced node IP will be the same
@@ -490,8 +490,6 @@ future<> storage_service::raft_topology_update_ip(locator::host_id id, gms::inet
auto old_ip = it->second;
sys_ks_futures.push_back(_sys_ks.local().remove_endpoint(old_ip));
co_await _gossiper.force_remove_endpoint(id, gms::null_permit_id);
}
}
break;
@@ -945,22 +943,13 @@ class storage_service::ip_address_updater: public gms::i_endpoint_state_change_s
if (prev_ip == endpoint) {
co_return;
}
if (_address_map.find(id) != endpoint) {
// Address map refused to update IP for the host_id,
// this means prev_ip has higher generation than endpoint.
// We can immediately remove endpoint from gossiper
// since it represents an old IP (before an IP change)
// for the given host_id. This is not strictly
// necessary, but it reduces the noise circulated
// in gossiper messages and allows for clearer
// expectations of the gossiper state in tests.
co_await _ss._gossiper.force_remove_endpoint(id, permit_id);
// Do not update address.
co_return;
}
// If the host_id <-> IP mapping has changed, we need to update system tables, token_metadat and erm.
if (_ss.raft_topology_change_enabled()) {
rslog.debug("ip_address_updater::on_endpoint_change({}), host_id {}, "
@@ -1691,6 +1680,8 @@ future<> storage_service::join_topology(sharded<service::storage_proxy>& proxy,
slogger.info("Starting up server gossip");
co_await utils::get_local_injector().inject("sleep_before_start_gossiping", std::chrono::milliseconds{500});
co_await _gossiper.start_gossiping(new_generation, app_states);
utils::get_local_injector().inject("stop_after_starting_gossiping",

View File

@@ -117,6 +117,9 @@ async def test_change_two(manager, random_tables, build_mode):
# IP-s before they are send back to servers[1] and servers[2],
# and the mentioned above code is not exercised by this test.
await manager.api.enable_injection(servers[0].ip_addr, 'ip-change-raft-sync-delay', one_shot=False)
# sleep_before_start_gossiping injections are needed to reproduce #22777
await manager.server_update_config(servers[1].server_id, "error_injections_at_startup", ['sleep_before_start_gossiping'])
await manager.server_update_config(servers[2].server_id, "error_injections_at_startup", ['sleep_before_start_gossiping'])
await manager.server_start(servers[1].server_id)
servers[1] = ServerInfo(servers[1].server_id, s1_new_ip, s1_new_ip, servers[1].datacenter, servers[1].rack)
if build_mode != 'release':