From e157e8577eb444f4d7ade38bcbb2a493cc4e842d Mon Sep 17 00:00:00 2001 From: Sergey Zolotukhin Date: Thu, 28 Aug 2025 14:30:30 +0200 Subject: [PATCH] gossiper: check for a race condition in `do_apply_state_locally` MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In do_apply_state_locally, a race condition can occur if a task is suspended at a preemption point while the node entry is not locked. During this time, the host may be removed from _endpoint_state_map. When the task resumes, this can lead to inserting an entry with an empty host ID into the map, causing various errors, including a node crash. This change 1. adds a check after locking the map entry: if a gossip ACK update does not contain a host ID, we verify that an entry with that host ID still exists in the gossiper’s _endpoint_state_map. 2. Removes xfail from the test_gossiper_race test since the issue is now fixed. 3. Adds exception handling in `do_shadow_round` to skip responses from nodes that sent an empty host ID. This re-applies the commit 13392a40d4d7dc60cd07b9de5cfe4d5b1651fe5d that was reverted in 46aa59fe4995820ae14ec4818bc73b704cfbd663, after fixing the issues that caused the CI to fail. Fixes: scylladb/scylladb#25702 Fixes: scylladb/scylladb#25621 Ref: scylladb/scylla-enterprise#5613 (cherry picked from commit f08df7c9d782eb1f70a464a873fdfa93ba3b8544) --- gms/gossiper.cc | 13 ++++++++++++- test/cluster/test_gossiper_race.py | 1 - 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/gms/gossiper.cc b/gms/gossiper.cc index f7a58b6b0d..002362a7c9 100644 --- a/gms/gossiper.cc +++ b/gms/gossiper.cc @@ -621,6 +621,13 @@ future<> gossiper::do_apply_state_locally(locator::host_id node, endpoint_state // If there is a generation tie, attempt to break it by heartbeat version. auto permit = co_await lock_endpoint(node, null_permit_id); auto es = get_endpoint_state_ptr(node); + + // If remote state update does not contain a host id, check whether the endpoint still + // exists in the `_endpoint_state_map` since after a preemption point it could have been deleted. + if (!remote_state.get_host_id() && !es) { + throw std::runtime_error(format("Entry for host id {} does not exist in the endpoint state map.", node)); + } + if (es) { endpoint_state local_state = *es; auto local_generation = local_state.get_heart_beat_state().get_generation(); @@ -2170,7 +2177,11 @@ future<> gossiper::do_shadow_round(std::unordered_set nodes, }); for (auto& response : responses) { - co_await apply_state_locally_in_shadow_round(std::move(response.endpoint_state_map)); + try { + co_await apply_state_locally_in_shadow_round(std::move(response.endpoint_state_map)); + } catch (const std::exception& exception) { + logger.warn("Error while applying node state {}", exception.what()); + } } if (!nodes_talked.empty()) { break; diff --git a/test/cluster/test_gossiper_race.py b/test/cluster/test_gossiper_race.py index 10d0259075..d42bc39937 100644 --- a/test/cluster/test_gossiper_race.py +++ b/test/cluster/test_gossiper_race.py @@ -15,7 +15,6 @@ from test.pylib.manager_client import ManagerClient @pytest.mark.asyncio @skip_mode('release', 'error injections are not supported in release mode') -@pytest.mark.xfail(reason="https://github.com/scylladb/scylladb/issues/25621") async def test_gossiper_race_on_decommission(manager: ManagerClient): """ Test for gossiper race scenario (https://github.com/scylladb/scylladb/issues/25621):