Files
scylladb/test/cluster/test_left_node_notification.py
Piotr Smaron 2e12b83366 test/cluster: adjust audit in tests involving decommissioning its ks
When table audit is enabled, Scylla creates the "audit" ks with
NetworkTopologyStrategy and RF=3. During node decommission, streaming can fail
for the audit ks with "zero replica after the removal" when all nodes from a DC
are removed, and so we have to ALTER the audit ks to either set its replica
count to zero, allowing a clean decommission, or place its replicas in the 2nd DC.

Note: https://github.com/scylladb/scylladb/issues/27395 tracks the same change
in the dtests repository.
2026-02-18 15:14:55 +01:00

66 lines
3.1 KiB
Python

#
# Copyright (C) 2025-present ScyllaDB
#
# SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
#
import logging
import pytest
import asyncio
from test.pylib.manager_client import ManagerClient
from test.cluster.util import check_token_ring_and_group0_consistency
logger = logging.getLogger(__name__)
@pytest.mark.asyncio
async def test_left_node_notification(manager: ManagerClient) -> None:
    """
    Create a 3-node multi-DC cluster with 2 nodes in dc1 and 1 node in dc2.
    Then decommission both dc1 nodes, ensuring the topology remains consistent,
    and the remaining node belongs to dc2, and only two 'left the cluster'
    notifications were issued.
    """
    # Bootstrap 2 nodes in dc1
    logger.info("Bootstrapping dc1 nodes")
    dc1_node_a = await manager.server_add(property_file={"dc": "dc1", "rack": "r1"})
    dc1_node_b = await manager.server_add(property_file={"dc": "dc1", "rack": "r2"})

    # Bootstrap 1 node in dc2 with storage_service debug logging, so that the
    # 'left the cluster' notifications show up in its log for the final check.
    logger.info("Bootstrapping dc2 node with storage_service=debug")
    dc2_node = await manager.server_add(cmdline=["--logger-log-level", "storage_service=debug"],
                                        property_file={"dc": "dc2", "rack": "r1"})

    # When table audit is enabled, Scylla creates the "audit" keyspace with
    # NetworkTopologyStrategy and RF=3 in dc1 only. To avoid decommission failures due to
    # "zero replica after the removal" or "can not find new node in local dc" errors when
    # removing dc1 nodes, we alter the audit keyspace to have replicas only in dc2.
    # Only alter if the audit keyspace exists (it might not exist if audit is disabled).
    cql = manager.get_cql()
    result = await cql.run_async("SELECT * FROM system_schema.keyspaces WHERE keyspace_name = 'audit'")
    if result:
        await cql.run_async("ALTER KEYSPACE audit WITH REPLICATION = {'class': 'NetworkTopologyStrategy', 'dc2': 1}")

    # Ensure ring and group0 are consistent before operations
    await check_token_ring_and_group0_consistency(manager)

    # Decommission both dc1 nodes, re-checking topology consistency after each one.
    for node in (dc1_node_b, dc1_node_a):
        logger.info("Decommissioning dc1 node %s", node)
        await manager.decommission_node(node.server_id)
        await check_token_ring_and_group0_consistency(manager)

    # Verify only dc2 node remains running
    running = await manager.running_servers()
    assert len(running) == 1, f"Expected 1 running server, found {len(running)}: {running}"
    assert running[0].datacenter == "dc2", f"Remaining node should be in dc2, got {running[0].datacenter}"
    logger.info("Successfully decommissioned both dc1 nodes; dc2 node remains running")

    # Check the remaining node's log contains exactly two 'Notify node … has left the cluster'
    log = await manager.server_open_log(dc2_node.server_id)
    left_msgs = await log.grep(r"Notify node .* has left the cluster")
    assert len(left_msgs) == 2, f"Expected exactly 2 'left the cluster' notifications, got {len(left_msgs)}"