mirror of
https://github.com/scylladb/scylladb.git
synced 2026-04-24 10:30:38 +00:00
In the test test_tablet_mv_replica_pairing_during_replace we stop 2 out of 4 servers while using RF=2. Even though the test uses exactly 4 tablets (1 for each replica of the base table and the view), initially the tablets may not be split evenly between all nodes. Because of this, even when we choose a server that hosts the view and a different server that hosts the base table, we sometimes stopped all replicas of the base or the view table, because the node with the base table replica may also be a view replica. After some time, the tablets should be distributed across all nodes. When that happens, there will be no common nodes holding both a base and a view replica, so the test scenario will continue as planned. In this patch, we add this waiting period after creating the base table and the view, and continue the test only when all 4 tablets are on distinct nodes. Fixes https://github.com/scylladb/scylladb/issues/23982 Fixes https://github.com/scylladb/scylladb/issues/23997 Closes scylladb/scylladb#24111
109 lines
5.3 KiB
Python
109 lines
5.3 KiB
Python
#
|
|
# Copyright (C) 2024-present ScyllaDB
|
|
#
|
|
# SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
|
|
#
|
|
|
|
from cassandra import ConsistencyLevel
|
|
from cassandra.query import SimpleStatement
|
|
|
|
from test.pylib.manager_client import ManagerClient
|
|
from test.pylib.scylla_cluster import ReplaceConfig
|
|
from test.pylib.internal_types import HostID
|
|
|
|
import pytest
|
|
import asyncio
|
|
import logging
|
|
import time
|
|
|
|
from test.cluster.conftest import skip_mode
|
|
from test.cluster.util import get_topology_coordinator, find_server_by_host_id
|
|
from test.cluster.mv.tablets.test_mv_tablets import get_tablet_replicas
|
|
from test.cluster.util import new_test_keyspace, wait_for
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
@pytest.mark.asyncio
@skip_mode('release', 'error injections are not supported in release mode')
async def test_tablet_mv_replica_pairing_during_replace(manager: ManagerClient):
    """
    Verifies that view replica pairing is stable in the case of node replace.

    After replace, the node is in left state, but still present in the replica set.
    If view pairing code would use get_natural_endpoints(), which excludes left nodes,
    the pairing would be shifted during replace.
    """
    # 4 nodes, RF=2, so that the base table and the view each get 2 replicas.
    servers = await manager.servers_add(4, property_file=[
        {"dc": "dc1", "rack": "r1"},
        {"dc": "dc1", "rack": "r1"},
        {"dc": "dc1", "rack": "r2"},
        {"dc": "dc1", "rack": "r2"}
    ])
    cql = manager.get_cql()
    async with new_test_keyspace(manager, "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 2} AND tablets = {'initial': 1}") as ks:
        await cql.run_async(f"CREATE TABLE {ks}.test (pk int PRIMARY KEY, c int)")
        await cql.run_async(f"CREATE MATERIALIZED VIEW {ks}.tv AS SELECT * FROM {ks}.test WHERE c IS NOT NULL AND pk IS NOT NULL PRIMARY KEY (c, pk) WITH SYNCHRONOUS_UPDATES = TRUE")

        async def replicas_balanced():
            # Succeeds once no node hosts both a base replica and a view replica.
            # wait_for() retries while we return None; any truthy value stops the wait.
            base_replicas = [replica[0] for replica in await get_tablet_replicas(manager, servers[0], ks, "test", 0)]
            view_replicas = [replica[0] for replica in await get_tablet_replicas(manager, servers[0], ks, "tv", 0)]
            return len(set(base_replicas) & set(view_replicas)) == 0 or None

        # There's 4 nodes and 4 tablets, so even if the initial placement is not balanced,
        # each node should get 1 replica after some time.
        # BUG FIX: wait_for() is a coroutine function; the original call was not awaited,
        # so the balancing wait never actually ran.
        await wait_for(replicas_balanced, time.time() + 60)

        # Disable migrations concurrent with replace since we don't handle nodes going down during migration yet.
        # See https://github.com/scylladb/scylladb/issues/16527
        await manager.api.disable_tablet_balancing(servers[0].ip_addr)

        base_replicas = await get_tablet_replicas(manager, servers[0], ks, "test", 0)
        logger.info(f'{ks}.test replicas: {base_replicas}')
        view_replicas = await get_tablet_replicas(manager, servers[0], ks, "tv", 0)
        logger.info(f'{ks}.tv replicas: {view_replicas}')
        # Replace a node holding a view replica; down a node holding a base replica.
        # After the balancing wait above, these are expected to be distinct nodes.
        server_to_replace = await find_server_by_host_id(manager, servers, HostID(str(view_replicas[0][0])))
        server_to_down = await find_server_by_host_id(manager, servers, HostID(str(base_replicas[0][0])))

        logger.info('Downing a node to be replaced')
        await manager.server_stop(server_to_replace.server_id)

        # Hold the replace in the tablet-transition stage via error injection on the
        # topology coordinator, so queries run while the replaced node is in left state.
        logger.info('Blocking tablet rebuild')
        coord = await get_topology_coordinator(manager)
        coord_serv = await find_server_by_host_id(manager, servers, coord)
        await manager.api.enable_injection(coord_serv.ip_addr, "tablet_transition_updates", one_shot=True)
        coord_log = await manager.server_open_log(coord_serv.server_id)
        coord_mark = await coord_log.mark()

        logger.info('Replacing the node')
        replace_cfg = ReplaceConfig(replaced_id = server_to_replace.server_id, reuse_ip_addr = False, use_host_id = True)
        replace_task = asyncio.create_task(manager.server_add(replace_cfg, property_file={
            "dc": server_to_replace.datacenter,
            "rack": server_to_replace.rack
        }))

        # Wait until the coordinator is actually parked on the injection point.
        await coord_log.wait_for('tablet_transition_updates: waiting', from_mark=coord_mark)

        if server_to_down.server_id != server_to_replace.server_id:
            await manager.server_stop(server_to_down.server_id)

        # The update is supposed to go to the second replica only, since the other one is downed.
        # If pairing would shift, the update to the view would be lost because the first replica
        # is the one which is in the left state.
        logger.info('Updating base table')
        await cql.run_async(SimpleStatement(f"INSERT INTO {ks}.test (pk, c) VALUES (3, 4)", consistency_level=ConsistencyLevel.ONE))
        logger.info('Querying the view')
        assert [(4,3)] == list(await cql.run_async(SimpleStatement(f"SELECT * FROM {ks}.tv WHERE c=4", consistency_level=ConsistencyLevel.ONE)))

        if server_to_down.server_id != server_to_replace.server_id:
            await manager.server_start(server_to_down.server_id)

        logger.info('Unblocking tablet rebuild')
        if coord_serv.server_id != server_to_down.server_id:
            await manager.api.message_injection(coord_serv.ip_addr, "tablet_transition_updates")

        logger.info('Waiting for replace')
        await replace_task

        # After replace completes, the view row must still be readable.
        logger.info('Querying')
        assert [(4,3)] == list(await cql.run_async(f"SELECT * FROM {ks}.tv WHERE c=4"))
|