Files
scylladb/test/cluster/test_left_node_notification.py
Piotr Smaron 2e12b83366 test/cluster: adjust audit in tests involving decommissioning its ks
When table audit is enabled, Scylla creates the "audit" ks with
NetworkTopologyStrategy and RF=3. During node decommission, streaming can fail
for the audit ks with "zero replica after the removal" when all nodes from a DC
are removed, and so we have to ALTER the audit ks to either set its replica
count to zero, allowing a clean decommission, or place its replicas in the 2nd DC.

Note: https://github.com/scylladb/scylladb/issues/27395 tracks the same change
in the dtests repository.
2026-02-18 15:14:55 +01:00

66 lines
3.1 KiB
Python

#
# Copyright (C) 2025-present ScyllaDB
#
# SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
#
import logging
import pytest
import asyncio
from test.pylib.manager_client import ManagerClient
from test.cluster.util import check_token_ring_and_group0_consistency
logger = logging.getLogger(__name__)
@pytest.mark.asyncio
async def test_left_node_notification(manager: ManagerClient) -> None:
    """
    Create a 3-node multi-DC cluster with 2 nodes in dc1 and 1 node in dc2.
    Then decommission both dc1 nodes, ensuring the topology remains consistent,
    and the remaining node belongs to dc2, and only two 'left the cluster'
    notifications were issued.
    """
    # Bootstrap 2 nodes in dc1
    logger.info("Bootstrapping dc1 nodes")
    dc1_node_a = await manager.server_add(property_file={"dc": "dc1", "rack": "r1"})
    dc1_node_b = await manager.server_add(property_file={"dc": "dc1", "rack": "r2"})

    # Bootstrap 1 node in dc2 with storage_service debug logging, so that the
    # 'left the cluster' notifications show up in its log for the final check.
    logger.info("Bootstrapping dc2 node with storage_service=debug")
    dc2_node = await manager.server_add(cmdline=["--logger-log-level", "storage_service=debug"],
                                        property_file={"dc": "dc2", "rack": "r1"})

    # When table audit is enabled, Scylla creates the "audit" keyspace with
    # NetworkTopologyStrategy and RF=3 in dc1 only. To avoid decommission failures due to
    # "zero replica after the removal" or "can not find new node in local dc" errors when
    # removing dc1 nodes, we alter the audit keyspace to have replicas only in dc2.
    # Only alter if the audit keyspace exists (it might not exist if audit is disabled).
    cql = manager.get_cql()
    result = await cql.run_async("SELECT * FROM system_schema.keyspaces WHERE keyspace_name = 'audit'")
    if result:
        await cql.run_async("ALTER KEYSPACE audit WITH REPLICATION = {'class': 'NetworkTopologyStrategy', 'dc2': 1}")

    # Ensure ring and group0 are consistent before operations
    await check_token_ring_and_group0_consistency(manager)

    # Decommission both dc1 nodes, re-checking topology consistency after each one.
    for node in (dc1_node_b, dc1_node_a):
        logger.info("Decommissioning dc1 node %s", node)
        await manager.decommission_node(node.server_id)
        await check_token_ring_and_group0_consistency(manager)

    # Verify only dc2 node remains running
    running = await manager.running_servers()
    assert len(running) == 1, f"Expected 1 running server, found {len(running)}: {running}"
    assert running[0].datacenter == "dc2", f"Remaining node should be in dc2, got {running[0].datacenter}"
    logger.info("Successfully decommissioned both dc1 nodes; dc2 node remains running")

    # Check the remaining node's log contains exactly two 'Notify node … has left the cluster'
    log = await manager.server_open_log(dc2_node.server_id)
    left_msgs = await log.grep(r"Notify node .* has left the cluster")
    assert len(left_msgs) == 2, f"Expected exactly 2 'left the cluster' notifications, got {len(left_msgs)}"