When table audit is enabled, Scylla creates the "audit" keyspace with NetworkTopologyStrategy and RF=3. During node decommission, streaming for the audit keyspace can fail with "zero replica after the removal" when all nodes of a DC are removed, so we have to ALTER the audit keyspace to either zero out its replicas (allowing a clean decommission) or place them in the second DC. For reference, https://github.com/scylladb/scylladb/issues/27395 is the same change in the dtest repository.
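As an illustration of the second option (moving the audit replicas to the surviving DC), here is a minimal sketch against the ManagerClient CQL session used by the test framework; the helper name and the RF of 1 are assumptions for the example, not part of the fix itself:

    # Hypothetical helper: move the audit keyspace replicas to dc2 so that
    # decommissioning the dc1 nodes no longer leaves the keyspace with
    # "zero replica after the removal".
    from test.pylib.manager_client import ManagerClient


    async def move_audit_replicas_to_dc2(manager: ManagerClient) -> None:
        cql = manager.get_cql()
        # Replicate the audit keyspace in dc2 only; RF=1 is an assumed value.
        await cql.run_async(
            "ALTER KEYSPACE audit WITH REPLICATION = "
            "{'class': 'NetworkTopologyStrategy', 'dc2': 1}")

The test below uses exactly this kind of ALTER before decommissioning the dc1 nodes.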
#
# Copyright (C) 2025-present ScyllaDB
#
# SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
#
import logging
import pytest
import asyncio

from test.pylib.manager_client import ManagerClient
from test.cluster.util import check_token_ring_and_group0_consistency

logger = logging.getLogger(__name__)


@pytest.mark.asyncio
async def test_left_node_notification(manager: ManagerClient) -> None:
    """
    Create a 3-node multi-DC cluster with 2 nodes in dc1 and 1 node in dc2.
    Then decommission both dc1 nodes, verifying that the topology stays
    consistent, that the remaining node belongs to dc2, and that exactly
    two 'left the cluster' notifications were issued.
    """
    # Bootstrap 2 nodes in dc1
    logger.info("Bootstrapping dc1 nodes")
    dc1_node_a = await manager.server_add(property_file={"dc": "dc1", "rack": "r1"})
    dc1_node_b = await manager.server_add(property_file={"dc": "dc1", "rack": "r2"})

    # Bootstrap 1 node in dc2 with storage_service debug logging
    logger.info("Bootstrapping dc2 node with storage_service=debug")
    dc2_node = await manager.server_add(cmdline=["--logger-log-level", "storage_service=debug"],
                                        property_file={"dc": "dc2", "rack": "r1"})

    # When table audit is enabled, Scylla creates the "audit" keyspace with
    # NetworkTopologyStrategy and RF=3 in dc1 only. To avoid decommission failures due to
    # "zero replica after the removal" or "can not find new node in local dc" errors when
    # removing dc1 nodes, we alter the audit keyspace to have replicas only in dc2.
    # Only alter if the audit keyspace exists (it might not exist if audit is disabled).
    cql = manager.get_cql()
    result = await cql.run_async("SELECT * FROM system_schema.keyspaces WHERE keyspace_name = 'audit'")
    if result:
        await cql.run_async("ALTER KEYSPACE audit WITH REPLICATION = {'class': 'NetworkTopologyStrategy', 'dc2': 1}")

    # Ensure ring and group0 are consistent before operations
    await check_token_ring_and_group0_consistency(manager)

    # Decommission both dc1 nodes
    logger.info(f"Decommissioning dc1 node {dc1_node_b}")
    await manager.decommission_node(dc1_node_b.server_id)
    await check_token_ring_and_group0_consistency(manager)

    logger.info(f"Decommissioning dc1 node {dc1_node_a}")
    await manager.decommission_node(dc1_node_a.server_id)
    await check_token_ring_and_group0_consistency(manager)

    # Verify only the dc2 node remains running
    running = await manager.running_servers()
    assert len(running) == 1, f"Expected 1 running server, found {len(running)}: {running}"
    assert running[0].datacenter == "dc2", f"Remaining node should be in dc2, got {running[0].datacenter}"
    logger.info("Successfully decommissioned both dc1 nodes; dc2 node remains running")

    # Check that the remaining node's log contains exactly two
    # 'Notify node … has left the cluster' messages
    log = await manager.server_open_log(dc2_node.server_id)
    left_msgs = await log.grep(r"Notify node .* has left the cluster")
    assert len(left_msgs) == 2, f"Expected exactly 2 'left the cluster' notifications, got {len(left_msgs)}"