mirror of
https://github.com/scylladb/scylladb.git
synced 2026-04-23 18:10:39 +00:00
In do_apply_state_locally, a race condition can occur if a task is suspended at a preemption point while the node entry is not locked. During this time, the host may be removed from _endpoint_state_map. When the task resumes, this can lead to inserting an entry with an empty host ID into the map, causing various errors, including a node crash. This change 1. adds a check after locking the map entry: if a gossip ACK update does not contain a host ID, we verify that an entry with that host ID still exists in the gossiper’s _endpoint_state_map. 2. Removes xfail from the test_gossiper_race test since the issue is now fixed. 3. Adds exception handling in `do_shadow_round` to skip responses from nodes that sent an empty host ID. This re-applies the commit 13392a40d4 that was reverted in 46aa59fe49, after fixing the issues that caused the CI to fail. Fixes: scylladb/scylladb#25702 Fixes: scylladb/scylladb#25621 Ref: scylladb/scylla-enterprise#5613
98 lines
3.5 KiB
Python
98 lines
3.5 KiB
Python
#
|
|
# Copyright (C) 2025-present ScyllaDB
|
|
#
|
|
# SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
|
|
#
|
|
|
|
|
|
from aiohttp import ServerDisconnectedError
|
|
import pytest
|
|
|
|
from test.cluster.conftest import skip_mode
|
|
from test.cluster.util import get_coordinator_host
|
|
from test.pylib.manager_client import ManagerClient
|
|
|
|
|
|
@pytest.mark.asyncio
@skip_mode('release', 'error injections are not supported in release mode')
async def test_gossiper_race_on_decommission(manager: ManagerClient):
    """
    Test for gossiper race scenario (https://github.com/scylladb/scylladb/issues/25621):
    - Create a cluster with multiple nodes
    - Decommission one node while injecting delays in gossip processing
    - Check for the race condition where get_host_id() is called on a removed endpoint
    """
    cmdline = [
        '--logger-log-level=gossip=debug',
        '--logger-log-level=raft_topology=debug'
    ]

    # Create cluster with more nodes to increase gossip traffic
    servers = await manager.servers_add(3, cmdline=cmdline)

    coordinator = await get_coordinator_host(manager=manager)
    coordinator_log = await manager.server_open_log(server_id=coordinator.server_id)

    # Pick any node other than the coordinator to decommission, so the
    # coordinator is the one processing the (delayed) gossip updates.
    decom_node = next(s for s in servers if s.server_id != coordinator.server_id)

    # enable the delay_gossiper_apply injection - it suspends gossip state
    # application on the coordinator for updates coming from decom_node
    await manager.api.enable_injection(
        node_ip=coordinator.ip_addr,
        injection="delay_gossiper_apply",
        one_shot=False,
        parameters={"delay_node": decom_node.ip_addr},
    )

    # wait for the "delay_gossiper_apply" error injection to take effect
    # - wait for multiple occurrences to be batched, so that there is a higher chance of one of them
    #   failing down in the `gossiper::do_on_change_notifications()`
    for _ in range(5):
        log_mark = await coordinator_log.mark()
        await coordinator_log.wait_for(
            "delay_gossiper_apply: suspend for node",
            from_mark=log_mark,
        )

    coordinator_log_mark = await coordinator_log.mark()

    # start the decommission task while the gossip apply is still suspended
    await manager.decommission_node(decom_node.server_id)

    # wait for the node to finish the removal
    await coordinator_log.wait_for(
        "Finished to force remove node",
        from_mark=coordinator_log_mark,
    )

    coordinator_log_mark = await coordinator_log.mark()

    try:
        # unblock the delay_gossiper_apply injection - the suspended gossip
        # task now resumes after decom_node has already been removed, which
        # is exactly the racy window the fix addresses
        await manager.api.message_injection(
            node_ip=coordinator.ip_addr,
            injection="delay_gossiper_apply",
        )
    except ServerDisconnectedError:
        # the server might get disconnected in the failure case because of abort
        # - we detect that later (with more informative error handling), so we ignore this here
        pass

    # wait for the "delay_gossiper_apply" error injection to be unblocked
    await coordinator_log.wait_for(
        "delay_gossiper_apply: resume for node",
        from_mark=coordinator_log_mark,
    )

    # test that the coordinator node didn't hit the case where it would try to add a state with empty host id
    empty_host_found = await coordinator_log.grep(
        "gossip - attempting to add a state with empty host id",
        from_mark=coordinator_log_mark,
    )

    assert not empty_host_found, "Empty host ID has been found in gossiper::replicate()"

    # secondary test - ensure the coordinator node is still running
    # (in the failure case the race crashes the coordinator)
    running_servers = await manager.running_servers()
    assert coordinator.server_id in [s.server_id for s in running_servers]