Raft topology iterates over all nodes in the 'left' state and triggers a 'remove node' notification whenever an id/ip mapping is available (meaning the node left recently). The problem is that the mapping is not removed immediately, so when multiple nodes are removed in succession, a notification for the same node can be sent several times. Fix that by sending the notification only if the node still exists in the peers table: the first notification removes it from the table, so subsequent passes do not send it again.

Closes scylladb/scylladb#27743
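For illustration only, here is a minimal Python sketch of the guard described above. It is not the actual ScyllaDB implementation (the real fix lives in the C++ topology code); the names `peers`, `node_id`, and `notify` are hypothetical stand-ins.

# Illustrative sketch only -- not ScyllaDB code.
# Shows the "notify only if the node is still in the peers table" guard.
def maybe_notify_left(node_id, peers: dict, notify) -> None:
    if node_id in peers:
        # First pass: the node is still known, so send the notification
        # and drop the mapping so later passes stay silent.
        notify(node_id)
        del peers[node_id]
    # On subsequent passes the node is gone from peers, so nothing is sent.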
#
# Copyright (C) 2025-present ScyllaDB
#
# SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
#
import logging
import pytest
import asyncio

from test.pylib.manager_client import ManagerClient
from test.cluster.util import check_token_ring_and_group0_consistency

logger = logging.getLogger(__name__)


@pytest.mark.asyncio
async def test_left_node_notification(manager: ManagerClient) -> None:
    """
    Create a 3-node multi-DC cluster with 2 nodes in dc1 and 1 node in dc2.
    Then decommission both dc1 nodes, ensuring that the topology remains
    consistent, that the remaining node belongs to dc2, and that exactly two
    'left the cluster' notifications were issued.
    """
    # Bootstrap 2 nodes in dc1
    logger.info("Bootstrapping dc1 nodes")
    dc1_node_a = await manager.server_add(property_file={"dc": "dc1", "rack": "r1"})
    dc1_node_b = await manager.server_add(property_file={"dc": "dc1", "rack": "r2"})

    # Bootstrap 1 node in dc2 with storage_service debug logging
    logger.info("Bootstrapping dc2 node with storage_service=debug")
    dc2_node = await manager.server_add(cmdline=["--logger-log-level", "storage_service=debug"],
                                        property_file={"dc": "dc2", "rack": "r1"})

    # Ensure ring and group0 are consistent before operations
    await check_token_ring_and_group0_consistency(manager)

    # Decommission both dc1 nodes
    logger.info(f"Decommissioning dc1 node {dc1_node_b}")
    await manager.decommission_node(dc1_node_b.server_id)
    await check_token_ring_and_group0_consistency(manager)

    logger.info(f"Decommissioning dc1 node {dc1_node_a}")
    await manager.decommission_node(dc1_node_a.server_id)
    await check_token_ring_and_group0_consistency(manager)

    # Verify only the dc2 node remains running
    running = await manager.running_servers()
    assert len(running) == 1, f"Expected 1 running server, found {len(running)}: {running}"
    assert running[0].datacenter == "dc2", f"Remaining node should be in dc2, got {running[0].datacenter}"
    logger.info("Successfully decommissioned both dc1 nodes; dc2 node remains running")

    # Check that the remaining node's log contains exactly two 'Notify node … has left the cluster' messages
    log = await manager.server_open_log(dc2_node.server_id)
    left_msgs = await log.grep(r"Notify node .* has left the cluster")
    assert len(left_msgs) == 2, f"Expected exactly 2 'left the cluster' notifications, got {len(left_msgs)}"