Files
scylladb/test/cluster/test_shutdown_hang.py
Łukasz Paszkowski d18eb9479f cql/statement: Create keyspace_metadata with correct initial_tablets count
In `ks_prop_defs::as_ks_metadata(...)` the default initial tablets count
is set to 0 when tablets are enabled and the replication strategy
is NetworkTopologyStrategy.

This effectively sets _uses_tablets = false in abstract_replication_strategy
for the remaining strategies when no `tablets = {...}` options are specified.
As a consequence, it is possible to create vnode-based keyspaces even
when tablets are enforced with `tablets_mode_for_new_keyspaces`.

The patch sets the default initial tablets count to zero regardless of
the chosen replication strategy. Each replication strategy then
validates the options and raises a configuration exception when tablets
are not supported.

All tests are altered in the following way:
+ whenever it was correct, SimpleStrategy was replaced with NetworkTopologyStrategy
+ otherwise, tablets were explicitly disabled with ` AND tablets = {'enabled': false}`

Fixes https://github.com/scylladb/scylladb/issues/25340

Closes scylladb/scylladb#25342
2026-04-20 17:57:38 +03:00

75 lines
3.1 KiB
Python

#
# Copyright (C) 2023-present ScyllaDB
#
# SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.1
#
import asyncio
import logging
import time
import pytest
from cassandra.query import SimpleStatement # type: ignore
from cassandra.cluster import ConsistencyLevel # type: ignore
from cassandra.protocol import WriteTimeout # type: ignore
from test.pylib.manager_client import ManagerClient
from test.cluster.util import wait_for_token_ring_and_group0_consistency, new_test_keyspace, reconnect_driver
# Module-level logger named after this module; the test below reports its progress through it.
logger = logging.getLogger(__name__)
@pytest.mark.asyncio
@pytest.mark.skip_mode(mode='release', reason='error injections are not supported in release mode')
async def test_hints_manager_shutdown_hang(manager: ManagerClient) -> None:
    """Reproducer for #8079.

    Scenario:
      1. Start two nodes in dc1 (rack1 and rack2); the first one boots with the
         'decrease_hints_flush_period' error injection.
      2. Create a keyspace with replication_factor 2 and a single-column table.
      3. Stop the second node, then insert one row at consistency level ONE
         with a 500ms timeout, retrying on WriteTimeout up to 10 times.
      4. Sleep for 2 seconds, then stop the first node gracefully.
      5. Start both nodes again and reconnect the driver so the keyspace
         can be dropped when the keyspace context exits.

    Args:
        manager: cluster manager client used to add, stop and start servers.
    """
    hinting_node_config = {
        'error_injections_at_startup': ['decrease_hints_flush_period'],
    }
    s1 = await manager.server_add(config=hinting_node_config,
                                  property_file={"dc": "dc1", "rack": "rack1"})
    s2 = await manager.server_add(property_file={"dc": "dc1", "rack": "rack2"})

    ring_deadline = time.time() + 30
    await wait_for_token_ring_and_group0_consistency(manager, ring_deadline)

    cql = manager.get_cql()

    logger.info("Create keyspace and table")
    replication_opts = "with replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 2}"
    async with new_test_keyspace(manager, replication_opts) as ks:
        await cql.run_async(f"create table {ks}.t (pk int primary key)")

        logger.info(f"Stop {s2}")
        await manager.server_stop(s2.server_id)

        logger.info("Write data with small timeout")
        # The insert deliberately uses a short timeout, so an occasional failure on a
        # slow CI machine is expected; the write is therefore attempted up to 10 times.
        # The test needs an error injection, hence the release-mode skip above.
        # NOTE(review): the original comment also says the test is disabled in debug
        # mode, but only the release-mode skip is visible here - confirm where (or
        # whether) debug mode is excluded.
        for _ in range(10):
            insert_stmt = SimpleStatement(f"insert into {ks}.t (pk) values (0) using timeout 500ms",
                                          consistency_level=ConsistencyLevel.ONE)
            try:
                await cql.run_async(insert_stmt)
            except WriteTimeout:
                logger.info("write timeout, retrying")
                continue
            # The write went through; no further attempts are needed.
            break
        else:
            # Reached only when none of the attempts succeeded.
            pytest.fail("Write timed out on each attempt")

        # At this point the write has succeeded, yet a background task is still trying
        # to complete it on the other node (which is down, but the first node has not
        # marked it as dead yet). Because of 'using timeout' in the statement that
        # background task times out shortly, which causes a hint to be created.
        # With the error injection hints are flushed roughly every second, so the
        # hints manager starts sending the hint soon afterwards.
        logger.info("Sleep")
        await asyncio.sleep(2)

        logger.info(f"Stop {s1} gracefully")
        await manager.server_stop_gracefully(s1.server_id)

        # Bring both nodes back and reconnect so that the keyspace can be dropped.
        await asyncio.gather(*(manager.server_start(srv.server_id) for srv in (s1, s2)))
        await reconnect_driver(manager)