Files
scylladb/test/cluster/test_ip_mappings.py
Łukasz Paszkowski d18eb9479f cql/statement: Create keyspace_metadata with correct initial_tablets count
In `ks_prop_defs::as_ks_metadata(...)` the default initial tablets count
is set to 0 only when tablets are enabled and the replication strategy
is NetworkTopologyStrategy.

This effectively sets _uses_tablets = false in abstract_replication_strategy
for the remaining strategies when no `tablets = {...}` options are specified.
As a consequence, it is possible to create vnode-based keyspaces even
when tablets are enforced with `tablets_mode_for_new_keyspaces`.

The patch sets the default initial tablets count to zero regardless of
the chosen replication strategy. Each replication strategy then
validates the options and raises a configuration exception when tablets
are not supported.

All tests are altered in the following way:
+ whenever it was correct, SimpleStrategy was replaced with NetworkTopologyStrategy
+ otherwise, tablets were explicitly disabled with ` AND tablets = {'enabled': false}`

Fixes https://github.com/scylladb/scylladb/issues/25340

Closes scylladb/scylladb#25342
2026-04-20 17:57:38 +03:00

157 lines
8.3 KiB
Python

# Copyright (C) 2024-present ScyllaDB
#
# SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.1
#
import asyncio
from test.pylib.manager_client import ManagerClient
import pytest
import logging
from uuid import UUID
from test.pylib.rest_client import inject_error_one_shot, read_barrier
from test.pylib.scylla_cluster import ReplaceConfig
from test.pylib.util import gather_safely
from test.cluster.util import disable_schema_agreement_wait, new_test_keyspace, reconnect_driver
from cassandra.cluster import ConsistencyLevel, SimpleStatement
logger = logging.getLogger(__name__)
@pytest.mark.asyncio
async def test_broken_bootstrap(manager: ManagerClient):
    """
    Check that the first node remains usable after a bootstrap of a second node is cut short.

    Scenario:
      1. Add `server_a`, and add `server_b` without starting it.
      2. Create a keyspace with a table and write 100 rows.
      3. Arm the one-shot `crash-before-bootstrapping-node-added` injection on `server_a`.
      4. Start `server_b` with a short timeout; the start is expected to fail.
      5. Stop both servers, start only `server_a` again and reconnect the driver.
      6. Write the 100 rows again, reading each one back to verify it.
    """
    server_a = await manager.server_add()
    server_b = await manager.server_add(start=False)

    async with new_test_keyspace(manager, "WITH REPLICATION = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1}") as ks:
        table = f"{ks}.test"
        await manager.cql.run_async(f"CREATE TABLE {table} (a int PRIMARY KEY, b int)")
        for i in range(100):
            await manager.cql.run_async(f"INSERT INTO {table} (a, b) VALUES ({i}, {i})")

        await inject_error_one_shot(manager.api, server_a.ip_addr, "crash-before-bootstrapping-node-added")
        try:
            # Timeout fast since we do not expect the operation to complete
            # because the coordinator is dead by now due to the error injection
            # above
            await manager.server_start(server_b.server_id, timeout=5)
        except Exception as exc:
            # The failure is the expected outcome; record it instead of discarding it silently
            # so that the test log shows why the start did not succeed.
            logger.info(f"Starting {server_b} failed as expected: {exc}")
        else:
            # Kept out of the `try` body so that the failure can never be caught by the handler above.
            pytest.fail("Expected server_start to fail")

        await gather_safely(*(manager.server_stop(srv.server_id) for srv in [server_a, server_b]))
        await manager.server_start(server_a.server_id)
        await manager.driver_connect()

        # Only server_a is running at this point; it must accept writes and serve reads.
        for i in range(100):
            await manager.cql.run_async(f"INSERT INTO {table} (a, b) VALUES ({i}, {i})")
            response = await manager.cql.run_async(f"SELECT * FROM {table} WHERE a = {i}")
            assert response[0].b == i
@pytest.mark.asyncio
@pytest.mark.skip_mode(mode='release', reason='error injections are not supported in release mode')
@pytest.mark.parametrize('reuse_ip', [False, True])
async def test_full_shutdown_during_replace(manager: ManagerClient, reuse_ip: bool):
    """
    Regression test for scylla-enterprise#5686.

    A replace operation is paused midway, then every running node (the replacing one included) is shut down and
    the previously live nodes are started again. The IP address of the replacing node must not be missing after
    the restart: the test sends requests for which the replacing node is a pending replica, and such requests
    would crash their coordinators if the address was missing.

    In the reported issue the crash was caused by `nodetool status` executed during the stuck replace, but the
    problem turned out to be more general.
    """
    # Name of the injection used throughout the test to hold the replace in place and to release it later.
    barrier_injection = 'topology_coordinator/write_both_read_old/before_global_token_metadata_barrier'
    keyspace_opts = (
        "WITH REPLICATION = {'class': 'NetworkTopologyStrategy',\n"
        "'replication_factor': 3} AND tablets = {'enabled': false}"
    )

    logger.info('Adding servers')
    leader = await manager.server_add(property_file={'dc': 'dc1', 'rack': 'rack0'})
    follower, dead_server = await manager.servers_add(2, auto_rack_dc='dc1')
    live_servers = [leader, follower]

    logger.info('Waiting for driver')
    cql, [host] = await manager.get_ready_cql([follower])

    with disable_schema_agreement_wait(cql):  # This makes the test a bit faster.
        async with new_test_keyspace(manager, keyspace_opts, host) as ks:
            table = f'{ks}.test'
            await cql.run_async(f'CREATE TABLE {table} (a int PRIMARY KEY, b int)', host=host)

            logger.info(f'Stopping {dead_server}')
            await manager.server_stop_gracefully(dead_server.server_id)

            await manager.api.enable_injection(leader.ip_addr, barrier_injection, True)

            replace_cfg = ReplaceConfig(
                replaced_id=dead_server.server_id,
                reuse_ip_addr=reuse_ip,
                use_host_id=False,
            )
            logger.info(f'Adding the server that will replace {dead_server}')
            replacing_server = await manager.server_add(
                replace_cfg,
                property_file=dead_server.property_file(),
                start=False,
            )
            logger.info(f'Trying to replace {dead_server} with {replacing_server}')
            replacing_task = asyncio.create_task(manager.server_start(replacing_server.server_id))

            logger.info('Waiting for the topology coordinator to pause in write_both_read_old')
            leader_log = await manager.server_open_log(leader.server_id)
            await leader_log.wait_for(f'{barrier_injection}: waiting for message')

            # Remembered now because it is matched against the gossip removal message further down.
            replacing_host_id = await manager.get_host_id(replacing_server.server_id)

            servers_to_stop = live_servers + [replacing_server]
            logger.info(f'Stopping {servers_to_stop}')
            await gather_safely(*[manager.server_stop(server.server_id) for server in servers_to_stop])
            replacing_task.cancel()

            # Arm the same injection at startup on the nodes that are about to be restarted.
            for server in live_servers:
                await manager.server_update_config(
                    server.server_id, 'error_injections_at_startup', [barrier_injection])

            logger.info(f'Starting {live_servers}')
            await gather_safely(*[manager.server_start(server.server_id) for server in live_servers])

            # This is needed to ensure that the replacing node will be a pending replica for the requests below.
            logger.info(f'Waiting for {live_servers} to apply the latest topology state')
            await gather_safely(*[read_barrier(manager.api, server.ip_addr) for server in live_servers])

            await reconnect_driver(manager)
            logger.info('Waiting for driver')
            cql, hosts = await manager.get_ready_cql(live_servers)

            logger.info(f'Sending requests to {table} with {replacing_server} as a pending replica')
            for key in range(100):
                insert = SimpleStatement(
                    f'INSERT INTO {table} (a, b) VALUES ({key}, {key})',
                    consistency_level=ConsistencyLevel.ONE,
                )
                await cql.run_async(insert)
                rows = await manager.cql.run_async(f"SELECT * FROM {table} WHERE a = {key}")
                assert rows[0].b == key

            # `/storage_service/tokens/{endpoint}` must return no tokens for the IP of the node being replaced
            # iff reuse_ip is True. The point is to check that the API endpoint resolves the given IP address to
            # the host ID of the replacing node when reuse_ip is True. The endpoint itself is not special; any
            # endpoint that calls `gossiper::get_host_id(inet_address endpoint)` would do. With reuse_ip False,
            # non-empty tokens are expected as a sanity check.
            logger.info(f'Checking tokens of {dead_server.ip_addr}')
            for server in live_servers:
                tokens = await manager.api.get_tokens(server.ip_addr, dead_server.ip_addr)
                no_tokens = len(tokens) == 0
                assert no_tokens == reuse_ip

            # Send the message the paused injection is waiting for on each restarted node.
            for server in live_servers:
                await manager.api.message_injection(server.ip_addr, barrier_injection)

            live_logs = []
            for server in live_servers:
                live_logs.append(await manager.server_open_log(server.server_id))
            logger.info(f'Waiting for {replacing_server} to be removed from gossip after replace rollback')
            removal_message = f'gossip - Finished to force remove node {replacing_host_id}'
            await gather_safely(*[node_log.wait_for(removal_message) for node_log in live_logs])

            logger.info(f'Retrying replace of {dead_server}')
            new_server = await manager.server_add(replace_cfg, property_file=dead_server.property_file())
            live_servers.append(new_server)

            logger.info(f'Checking peers on {live_servers}')
            host_ids = []
            for server in live_servers:
                host_ids.append(await manager.get_host_id(server.server_id))
            # NOTE(review): `hosts` was obtained before `new_server` was appended to `live_servers`, so `zip`
            # presumably pairs only the servers that were live at that time - confirm this is intended.
            for server, server_host in zip(live_servers, hosts):
                result = await cql.run_async('SELECT peer, host_id FROM system.peers', host=server_host)
                peers = {(row.peer, row.host_id) for row in result}
                expected = {
                    (other.ip_addr, UUID(other_id))
                    for other, other_id in zip(live_servers, host_ids)
                    if other != server
                }
                assert peers == expected