scylladb/test/cluster/test_zero_token_nodes_multidc.py
Patryk Jędrzejczak 1f0f694c9e test: test_zero_token_nodes_multidc: properly handle reads with CL=LOCAL_ONE
The test is currently flaky. It incorrectly assumes that a read with
CL=LOCAL_ONE will see the data inserted by a preceding write with
CL=LOCAL_ONE in the same datacenter with RF=2.

The same issue has already been fixed for CL=ONE in
21edec1ace. The difference is that
for CL=LOCAL_ONE, only dc1 is problematic, as dc2 has RF=1.

We fix the issue for CL=LOCAL_ONE by skipping the check for dc1.

Fixes #28253

The fix addresses CI flakiness and only changes the test, so it
should be backported.

Closes scylladb/scylladb#28274
2026-01-21 15:17:42 +01:00


#
# Copyright (C) 2024-present ScyllaDB
#
# SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
#
import logging
import pytest
from cassandra import ConsistencyLevel
from cassandra.policies import WhiteListRoundRobinPolicy
from cassandra.query import SimpleStatement
from test.pylib.manager_client import ManagerClient
from test.pylib.util import unique_name
from test.cluster.conftest import cluster_con
from test.cluster.util import create_new_test_keyspace


@pytest.mark.asyncio
@pytest.mark.parametrize('zero_token_nodes', [1, 2])
@pytest.mark.parametrize('rf_rack_valid_keyspaces', [False, True])
async def test_zero_token_nodes_multidc_basic(manager: ManagerClient, zero_token_nodes: int, rf_rack_valid_keyspaces: bool):
"""
Test the basic functionality of a DC with zero-token nodes:
- adding zero-token nodes to a new DC succeeds
- with tablets, ensuring enough replicas for tables depends on the number of token-owners in a DC, not all nodes
- client requests in the presence of zero-token nodes succeed (also when zero-token nodes coordinate)
"""
    normal_cfg = {'endpoint_snitch': 'GossipingPropertyFileSnitch', 'rf_rack_valid_keyspaces': rf_rack_valid_keyspaces}
    zero_token_cfg = normal_cfg | {'join_ring': False}
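    # Note: 'join_ring': False is what makes a node zero-token - it joins the cluster
    # and can coordinate requests, but owns no token ranges.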

    logging.info('Creating dc1 with 2 token-owning nodes')
    servers = await manager.servers_add(2, config=normal_cfg, auto_rack_dc='dc1')

    normal_nodes_in_dc2 = 2 - zero_token_nodes
    logging.info(f'Creating dc2 with {normal_nodes_in_dc2} token-owning and {zero_token_nodes} zero-token nodes')
    servers += await manager.servers_add(zero_token_nodes, config=zero_token_cfg, property_file={'dc': 'dc2', 'rack': 'rack1'})
    if zero_token_nodes == 1:
        servers.append(await manager.server_add(config=normal_cfg, property_file={'dc': 'dc2', 'rack': 'rack2'}))
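    # Either way dc2 ends up with 2 nodes: the zero-token ones plus, when
    # zero_token_nodes == 1, a token-owning node placed on a separate rack.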

    logging.info('Creating connections to dc1 and dc2')
    dc1_cql = cluster_con([servers[0].ip_addr],
                          load_balancing_policy=WhiteListRoundRobinPolicy([servers[0].ip_addr])).connect()
    dc2_cql = cluster_con([servers[2].ip_addr],
                          load_balancing_policy=WhiteListRoundRobinPolicy([servers[2].ip_addr])).connect()
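    # Each session is pinned to a single coordinator by the whitelist policy.
    # servers[2] is the first dc2 node, which is always zero-token here, so the dc2
    # session exercises requests coordinated by a zero-token node.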

    ks_names = list[str]()

    logging.info('Trying to create tables for different replication factors')
    # With `rf_rack_valid_keyspaces` set to true, we cannot create a keyspace with RF > #racks.
    # Because of that, the test would fail when the KEYSPACE is created rather than when the
    # TABLE is. We want to avoid that, hence this cap on the RF range.
    #
    # rf_rack_valid_keyspaces == False: We'll attempt to create tables in keyspaces with too few normal token owners.
    # rf_rack_valid_keyspaces == True: We'll only create RF-rack-valid keyspaces, so all of the created tables
    #                                  will have enough normal token owners.
    rf_range = (normal_nodes_in_dc2 + 1) if rf_rack_valid_keyspaces else 3
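    # With tablets, table creation is expected to fail exactly when the requested dc2 RF
    # exceeds the number of token-owning nodes there - zero-token nodes don't count.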
    for rf in range(rf_range):
        failed = False
        try:
            ks_name = await create_new_test_keyspace(dc2_cql, f"""WITH replication =
                    {{'class': 'NetworkTopologyStrategy', 'replication_factor': 2, 'dc2': {rf}}}
                    AND tablets = {{ 'enabled': true }}""")
            ks_names.append(ks_name)
            await dc2_cql.run_async(
                f'CREATE TABLE {ks_names[rf]}.tbl (cl int, zero_token boolean, v int, PRIMARY KEY (cl, zero_token))')
        except Exception:
            failed = True
        assert failed == (rf > normal_nodes_in_dc2)

    logging.info('Sending requests with different consistency levels')
    for rf in range(normal_nodes_in_dc2 + 1):
        # FIXME: we may add LOCAL_QUORUM to the list below once scylladb/scylladb#20028 is fixed.
        cls = [
            ConsistencyLevel.ONE,
            ConsistencyLevel.TWO,
            ConsistencyLevel.QUORUM,
            ConsistencyLevel.EACH_QUORUM,
            ConsistencyLevel.ALL
        ]
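        # LOCAL_ONE requires a replica in the coordinator's DC, so it is only exercised
        # once dc2 has a replica of its own (rf == 1).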
        if rf == 1:
            cls.append(ConsistencyLevel.LOCAL_ONE)
        for cl in cls:
            logging.info('Testing with rf=%s, consistency_level=%s', rf, cl)
            insert_queries = [
                SimpleStatement(
                    f'INSERT INTO {ks_names[rf]}.tbl (cl, zero_token, v) VALUES ({cl}, {zero_token_coordinator}, {cl})',
                    consistency_level=cl
                ) for zero_token_coordinator in [False, True]
            ]
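            # The first insert goes through the dc1 session (token-owning coordinator),
            # the second through the dc2 session (zero-token coordinator).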
            await dc1_cql.run_async(insert_queries[0])
            await dc2_cql.run_async(insert_queries[1])

            if cl == ConsistencyLevel.EACH_QUORUM:
                continue  # EACH_QUORUM is supported only for writes

            select_queries = [
                SimpleStatement(
                    f'SELECT * FROM {ks_names[rf]}.tbl WHERE cl = {cl} AND zero_token = {zero_token_coordinator}',
                    consistency_level=cl
                ) for zero_token_coordinator in [False, True]
            ]
            dc1_result_set = await dc1_cql.run_async(select_queries[0])
            dc2_result_set = await dc2_cql.run_async(select_queries[1])

            # With CL=ONE we don't have a guarantee that the replicas written to and read from have a non-empty
            # intersection. Hence, reads could miss the written rows.
            if cl == ConsistencyLevel.ONE:
                continue

            # The comment above also applies to CL=LOCAL_ONE, but only for dc1, which has RF=2;
            # dc2 has RF=1 here, so its single replica must serve both the write and the read.
            if cl != ConsistencyLevel.LOCAL_ONE:
                assert len(dc1_result_set) == 1
                assert list(dc1_result_set[0]) == [cl, False, cl]
            assert len(dc2_result_set) == 1
            assert list(dc2_result_set[0]) == [cl, True, cl]