Files
scylladb/test/topology/util.py
Konstantin Osipov e7c9ca560b test: issue a read barrier before checking ring consistency
Raft replication doesn't guarantee that all replicas see
identical Raft state at all times; it only guarantees the
same order of events on all replicas.

When comparing raft state with gossip state on a node, first
issue a read barrier to ensure the node has the latest raft state.

To issue a read barrier it is sufficient to alter a non-existent
table: in order to validate the DDL the node needs to sync with the
leader and fetch its latest group0 state.

Fixes #13518 (flaky topology test).

Closes #13756
2023-05-04 12:22:07 +02:00

85 lines
3.9 KiB
Python

#
# Copyright (C) 2022-present ScyllaDB
#
# SPDX-License-Identifier: AGPL-3.0-or-later
#
"""
Test consistency of schema changes with topology changes.
"""
import logging
import pytest
import time
from cassandra.protocol import InvalidRequest, ConfigurationException
from test.pylib.internal_types import ServerInfo
from test.pylib.manager_client import ManagerClient
from test.pylib.util import wait_for, wait_for_cql_and_get_hosts
logger = logging.getLogger(__name__)
async def get_token_ring_host_ids(manager: ManagerClient, srv: ServerInfo) -> set[str]:
    """Return the host IDs of the normal token owners as reported by `srv`.

    Two REST endpoints are queried on `srv`:

    * ``/storage_service/tokens_endpoint`` - the ``value`` of every entry is
      collected into the set of normal endpoints;
    * ``/storage_service/host_id`` - entries whose ``key`` is one of those
      endpoints contribute their ``value`` (the host ID) to the result.

    The intermediate sets are logged at INFO level to ease debugging.
    """
    client = manager.api.client

    ring_entries = await client.get_json("/storage_service/tokens_endpoint", srv.ip_addr)
    owner_endpoints = set()
    for entry in ring_entries:
        owner_endpoints.add(entry["value"])
    logger.info(f"Normal endpoints' IPs by {srv}: {owner_endpoints}")

    id_entries = await client.get_json('/storage_service/host_id', srv.ip_addr)
    # Every host ID `srv` knows about; computed only for the log line below.
    known_host_ids = set()
    for entry in id_entries:
        known_host_ids.add(entry["value"])
    logger.info(f"All host IDs by {srv}: {known_host_ids}")

    # Keep only the host IDs whose endpoint owns tokens.
    owner_host_ids = set()
    for entry in id_entries:
        if entry["key"] in owner_endpoints:
            owner_host_ids.add(entry["value"])
    logger.info(f"Normal endpoints' host IDs by {srv}: {owner_host_ids}")
    return owner_host_ids
async def get_current_group0_config(manager: ManagerClient, srv: ServerInfo) -> set[tuple[str, bool]]:
    """Return the current Raft group 0 configuration as seen by `srv`.

    Each element of the returned set is a pair: the Raft ID of a member
    (which is equal to its Host ID) as a string, and a flag telling whether
    that member is a voter.
    """
    assert manager.cql
    cql = manager.cql
    hosts = await wait_for_cql_and_get_hosts(cql, [srv], time.time() + 60)
    target = hosts[0]

    # Issue a read barrier on that host. Altering a table that does not exist
    # is expected to fail with InvalidRequest, but to validate the statement
    # the node has to sync with the leader and fetch the latest group 0 state.
    with pytest.raises(InvalidRequest, match="nosuch"):
        await cql.run_async("alter table nosuchkeyspace.nosuchtable with comment=''", host=target)

    id_rows = await cql.run_async(
        "select value from system.scylla_local where key = 'raft_group0_id'",
        host=target)
    group0_id = id_rows[0].value

    member_rows = await cql.run_async(
        f"select server_id, can_vote from system.raft_state where group_id = {group0_id} and disposition = 'CURRENT'",
        host=target)

    result = set()
    for row in member_rows:
        result.add((str(row.server_id), bool(row.can_vote)))
    logger.info(f"Group 0 members by {srv}: {result}")
    return result
async def check_token_ring_and_group0_consistency(manager: ManagerClient) -> None:
    """Ensure that the normal token owners and group 0 members match
    according to each currently running server.

    For every server returned by `manager.running_servers()` the set of
    group 0 member IDs is compared with the set of token ring host IDs, both
    as reported by that server.

    Raises:
        AssertionError: if the two sets differ on any server. The message
            names the server and shows both sets and their symmetric
            difference.
    """
    servers = await manager.running_servers()
    for srv in servers:
        group0_members = await get_current_group0_config(manager, srv)
        # Only the member IDs matter here; voter status is ignored.
        group0_ids = {m[0] for m in group0_members}
        token_ring_ids = await get_token_ring_host_ids(manager, srv)
        # Explicit message: this is a helper module, so pytest's assertion
        # rewriting presumably does not apply here and a bare assert would
        # fail without saying which server disagreed or how.
        assert token_ring_ids == group0_ids, (
            f"Token ring and group 0 members don't match according to {srv}:"
            f" token ring: {token_ring_ids}, group 0: {group0_ids},"
            f" symmetric difference: {token_ring_ids ^ group0_ids}")
async def wait_for_token_ring_and_group0_consistency(manager: ManagerClient, deadline: float) -> None:
    """Wait until token ring and group 0 membership agree on every running server.

    This is a weaker variant of `check_token_ring_and_group0_consistency`: the
    token ring is not immediately updated after bootstrap/replace/decommission,
    because the normal tokens of the new node propagate through gossip. So
    instead of asserting equality once, the token ring is polled on each
    server until it matches the group 0 members, bounded by `deadline`.
    """
    for srv in await manager.running_servers():
        # Group 0 membership is read once per server; only the token ring
        # is re-queried while waiting.
        config = await get_current_group0_config(manager, srv)
        group0_ids = {member[0] for member in config}

        async def token_ring_matches():
            ring_ids = await get_token_ring_host_ids(manager, srv)
            mismatch = ring_ids ^ group0_ids
            if not mismatch:
                return True
            logger.warning(
                f"Group 0 members and token ring members don't yet match"
                f" according to {srv}, symmetric difference: {mismatch}")
            # None tells `wait_for` to keep polling.
            return None

        await wait_for(token_ring_matches, deadline, period=.5)