mirror of
https://github.com/scylladb/scylladb.git
synced 2026-05-25 09:11:10 +00:00
Merge 'Correctly skip updating node's own ip address due to oudated gossiper data ' from Gleb Natapov
Used host id to check if the update is for the node itself. Using IP is unreliable since if a node is restarted with different IP a gossiper message with previous IP can be misinterpreted as belonging to a different node. Fixes: #22777 Backport to 2025.1 since this fixes a crash. Older version do not have the code. Closes scylladb/scylladb#24000 * https://github.com/scylladb/scylladb: test: add reproducer for #22777 storage_service: Do not remove gossiper entry on address change storage_service: use id to check for local node
This commit is contained in:
@@ -457,7 +457,7 @@ future<> storage_service::raft_topology_update_ip(locator::host_id id, gms::inet
|
||||
|
||||
switch (rs.state) {
|
||||
case node_state::normal: {
|
||||
if (is_me(ip)) {
|
||||
if (is_me(id)) {
|
||||
co_return;
|
||||
}
|
||||
// In replace-with-same-ip scenario the replaced node IP will be the same
|
||||
@@ -490,8 +490,6 @@ future<> storage_service::raft_topology_update_ip(locator::host_id id, gms::inet
|
||||
|
||||
auto old_ip = it->second;
|
||||
sys_ks_futures.push_back(_sys_ks.local().remove_endpoint(old_ip));
|
||||
|
||||
co_await _gossiper.force_remove_endpoint(id, gms::null_permit_id);
|
||||
}
|
||||
}
|
||||
break;
|
||||
@@ -945,22 +943,13 @@ class storage_service::ip_address_updater: public gms::i_endpoint_state_change_s
|
||||
if (prev_ip == endpoint) {
|
||||
co_return;
|
||||
}
|
||||
|
||||
if (_address_map.find(id) != endpoint) {
|
||||
// Address map refused to update IP for the host_id,
|
||||
// this means prev_ip has higher generation than endpoint.
|
||||
// We can immediately remove endpoint from gossiper
|
||||
// since it represents an old IP (before an IP change)
|
||||
// for the given host_id. This is not strictly
|
||||
// necessary, but it reduces the noise circulated
|
||||
// in gossiper messages and allows for clearer
|
||||
// expectations of the gossiper state in tests.
|
||||
|
||||
co_await _ss._gossiper.force_remove_endpoint(id, permit_id);
|
||||
// Do not update address.
|
||||
co_return;
|
||||
}
|
||||
|
||||
|
||||
// If the host_id <-> IP mapping has changed, we need to update system tables, token_metadat and erm.
|
||||
if (_ss.raft_topology_change_enabled()) {
|
||||
rslog.debug("ip_address_updater::on_endpoint_change({}), host_id {}, "
|
||||
@@ -1691,6 +1680,8 @@ future<> storage_service::join_topology(sharded<service::storage_proxy>& proxy,
|
||||
|
||||
slogger.info("Starting up server gossip");
|
||||
|
||||
co_await utils::get_local_injector().inject("sleep_before_start_gossiping", std::chrono::milliseconds{500});
|
||||
|
||||
co_await _gossiper.start_gossiping(new_generation, app_states);
|
||||
|
||||
utils::get_local_injector().inject("stop_after_starting_gossiping",
|
||||
|
||||
@@ -117,6 +117,9 @@ async def test_change_two(manager, random_tables, build_mode):
|
||||
# IP-s before they are send back to servers[1] and servers[2],
|
||||
# and the mentioned above code is not exercised by this test.
|
||||
await manager.api.enable_injection(servers[0].ip_addr, 'ip-change-raft-sync-delay', one_shot=False)
|
||||
# sleep_before_start_gossiping injections are needed to reproduce #22777
|
||||
await manager.server_update_config(servers[1].server_id, "error_injections_at_startup", ['sleep_before_start_gossiping'])
|
||||
await manager.server_update_config(servers[2].server_id, "error_injections_at_startup", ['sleep_before_start_gossiping'])
|
||||
await manager.server_start(servers[1].server_id)
|
||||
servers[1] = ServerInfo(servers[1].server_id, s1_new_ip, s1_new_ip, servers[1].datacenter, servers[1].rack)
|
||||
if build_mode != 'release':
|
||||
|
||||
Reference in New Issue
Block a user