mirror of
https://github.com/scylladb/scylladb.git
synced 2026-05-02 14:15:46 +00:00
test/gossiper: add reproducible test for race condition during node decommission
This change introduces a targeted test that simulates the gossiper race condition observed during node decommissioning. The test delays gossip state application and host ID lookup to reliably reproduce the scenario where `gossiper::get_host_id()` is called on a removed endpoint, potentially triggering an abort in `apply_new_states`. A dedicated error injection is added to widen the race window and thereby increase the likelihood of hitting the race condition. The injection delays the application of gossip state updates for the specific node that is being decommissioned, which should then cause the server to abort in the gossiper. Refs: scylladb/scylladb#25621 Fixes: scylladb/scylladb#25721 Backport: The test is primarily for an issue found in 2025.1, so it needs to be backported to all the 2025.x branches. Closes scylladb/scylladb#25685
This commit is contained in:
committed by
Piotr Dulikowski
parent
2e757d6de4
commit
5dac4b38fb
98
test/cluster/test_gossiper_race.py
Normal file
98
test/cluster/test_gossiper_race.py
Normal file
@@ -0,0 +1,98 @@
|
||||
#
|
||||
# Copyright (C) 2025-present ScyllaDB
|
||||
#
|
||||
# SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
|
||||
#
|
||||
|
||||
|
||||
from aiohttp import ServerDisconnectedError
|
||||
import pytest
|
||||
|
||||
from test.cluster.conftest import skip_mode
|
||||
from test.cluster.util import get_coordinator_host
|
||||
from test.pylib.manager_client import ManagerClient
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@skip_mode('release', 'error injections are not supported in release mode')
@pytest.mark.xfail(reason="https://github.com/scylladb/scylladb/issues/25621")
async def test_gossiper_race_on_decommission(manager: ManagerClient):
    """Reproduce the gossiper race seen while a node is being decommissioned.

    Tracking issue: https://github.com/scylladb/scylladb/issues/25621

    Outline:
      1. Start a three-node cluster with debug logging for gossip and
         raft_topology.
      2. On the host returned by ``get_coordinator_host``, enable the
         ``delay_gossiper_apply`` injection, parameterized with the address
         of another node, and then decommission that other node.
      3. Once the removal has been logged, message the injection to release
         it, then verify that the "empty host id" message was not logged and
         that the coordinator is still among the running servers.
    """
    scylla_args = [
        '--logger-log-level=gossip=debug',
        '--logger-log-level=raft_topology=debug',
    ]

    # Three nodes, so that there is more gossip traffic.
    nodes = await manager.servers_add(3, cmdline=scylla_args)

    coord = await get_coordinator_host(manager=manager)
    coord_log = await manager.server_open_log(server_id=coord.server_id)
    mark = await coord_log.mark()

    # Decommission the first node that is not the coordinator.
    victim = next(filter(lambda s: s.server_id != coord.server_id, nodes))

    # Turn on the delay_gossiper_apply injection on the coordinator, targeting
    # the node that is about to be decommissioned.
    injection_name = "delay_gossiper_apply"
    await manager.api.enable_injection(
        node_ip=coord.ip_addr,
        injection=injection_name,
        one_shot=False,
        parameters={"delay_node": victim.ip_addr},
    )

    # Let the injection take effect several times before moving on: with
    # multiple suspended occurrences batched up, there is a higher chance that
    # one of them fails further down in
    # `gossiper::do_on_change_notifications()`.
    for _ in range(5):
        suspend_mark = await coord_log.mark()
        await coord_log.wait_for(
            "delay_gossiper_apply: suspend for node",
            from_mark=suspend_mark,
        )

    mark = await coord_log.mark()

    # Run the decommission.
    await manager.decommission_node(victim.server_id)

    # The coordinator logs this once the node removal has finished.
    await coord_log.wait_for(
        "Finished to force remove node",
        from_mark=mark,
    )

    mark = await coord_log.mark()

    try:
        # Release the suspended delay_gossiper_apply injection.
        await manager.api.message_injection(
            node_ip=coord.ip_addr,
            injection=injection_name,
        )
    except ServerDisconnectedError:
        # In the failure case the server may abort and drop the connection.
        # That is detected by the checks below, which give a more informative
        # error, so the disconnect itself is ignored here.
        pass

    # Wait until the injection reports that it has resumed.
    await coord_log.wait_for(
        "delay_gossiper_apply: resume for node",
        from_mark=mark,
    )

    # Primary check: the coordinator did not hit the empty-host-id path.
    empty_host_found = await coord_log.grep(
        "gossip - adding a state with empty host id",
        from_mark=mark,
    )
    assert not empty_host_found, "Empty host ID has been found in gossiper::replicate()"

    # Secondary check: the coordinator is still listed as a running server.
    still_running = await manager.running_servers()
    assert any(s.server_id == coord.server_id for s in still_running)
|
||||
Reference in New Issue
Block a user