scylladb/test/cluster/test_fencing.py

#
# Copyright (C) 2023-present ScyllaDB
#
# SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
#
from test.pylib.manager_client import ManagerClient
from test.pylib.random_tables import RandomTables, Column, IntType, CounterType
from test.pylib.util import unique_name, wait_for_cql_and_get_hosts, wait_for
from cassandra import WriteFailure, ConsistencyLevel
from test.pylib.internal_types import ServerInfo
from test.pylib.rest_client import ScyllaMetrics
from cassandra.pool import Host # type: ignore # pylint: disable=no-name-in-module
from cassandra.query import SimpleStatement
from test.cluster.conftest import skip_mode
import pytest
import logging
import time


logger = logging.getLogger(__name__)


def host_by_server(hosts: list[Host], srv: ServerInfo):
    """Return the first entry of ``hosts`` whose ``address`` equals ``srv.ip_addr``.

    Raises:
        ValueError: if no entry of ``hosts`` matches ``srv``.
    """
    # The generator stops at the first match, like an early-return loop.
    found = next((candidate for candidate in hosts if candidate.address == srv.ip_addr), None)
    if found is None:
        raise ValueError(f"can't find host for server {srv}")
    return found


async def set_version(manager: ManagerClient, host: Host, new_version: int):
    """Write ``new_version`` into the ``version`` column of ``system.topology``.

    The update targets the row with key 'topology' and is executed through
    ``manager.cql`` against the given ``host``.
    """
    statement = "update system.topology set version=%s where key = 'topology'"
    await manager.cql.run_async(statement, parameters=[new_version], host=host)


async def set_fence_version(manager: ManagerClient, host: Host, new_version: int):
    """Write ``new_version`` into the ``fence_version`` column of ``system.topology``.

    The update targets the row with key 'topology' and is executed through
    ``manager.cql`` against the given ``host``.
    """
    statement = "update system.topology set fence_version=%s where key = 'topology'"
    await manager.cql.run_async(statement, parameters=[new_version], host=host)


async def get_version(manager: ManagerClient, host: Host):
    """Return ``version`` from the 'topology' row of ``system.topology`` as read via ``host``."""
    query = "select version from system.topology where key = 'topology'"
    result = await manager.cql.run_async(query, host=host)
    # Only the first returned row is consulted.
    first_row = result[0]
    return first_row.version


def send_errors_metric(metrics: ScyllaMetrics):
    """Look up 'scylla_hints_manager_send_errors' in ``metrics`` and return the result."""
    send_errors = metrics.get('scylla_hints_manager_send_errors')
    return send_errors


def sent_total_metric(metrics: ScyllaMetrics):
    """Look up 'scylla_hints_manager_sent_total' in ``metrics`` and return the result."""
    sent_total = metrics.get('scylla_hints_manager_sent_total')
    return sent_total


def all_hints_metrics(metrics: ScyllaMetrics) -> list[str]:
    """Return the metric lines whose names start with 'scylla_hints_manager_'."""
    hints_lines = metrics.lines_by_prefix('scylla_hints_manager_')
    return hints_lines


@pytest.mark.asyncio
@pytest.mark.parametrize("tablets_enabled", [True, False])
async def test_fence_writes(request, manager: ManagerClient, tablets_enabled: bool):
    """Check that writes sent through a node with a decremented topology version fail.

    Three nodes are started, then ``version`` and ``fence_version`` in
    ``system.topology`` are decremented on the last node and that node is
    restarted. Writes sent through it are expected to raise ``WriteFailure``
    matching "stale topology exception".

    The counter-table part only runs when tablets are disabled (issue #18180).
    """
    cfg = {'tablets_mode_for_new_keyspaces' : 'enabled' if tablets_enabled else 'disabled'}

    logger.info("Bootstrapping first two nodes")
    servers = await manager.servers_add(2, config=cfg, property_file=[
        {"dc": "dc1", "rack": "r1"},
        {"dc": "dc1", "rack": "r2"}
    ])

    # The third node is started as the last one, so we can be sure that it has
    # the latest topology version
    logger.info("Bootstrapping the last node")
    servers += [await manager.server_add(config=cfg, property_file={"dc": "dc1", "rack": "r3"})]

    # Disable load balancer as it might bump topology version, undoing the decrement below.
    # This should be done before adding the last two servers,
    # otherwise it can break the version == fence_version condition
    # which the test relies on.
    # NOTE(review): here the call runs after all three servers have been added,
    # which does not match the sentence above -- confirm the intended ordering.
    await manager.api.disable_tablet_balancing(servers[2].ip_addr)

    logger.info('Creating new tables')
    random_tables = RandomTables(request.node.name, manager, unique_name(), 3)
    # The returned table objects are not needed; the tables are addressed by name below.
    await random_tables.add_table(name='t1', pks=1, columns=[
        Column("pk", IntType),
        Column('int_c', IntType)
    ])
    if not tablets_enabled:  # issue #18180
        await random_tables.add_table(name='t2', pks=1, columns=[
            Column("pk", IntType),
            Column('counter_c', CounterType)
        ])
    cql = manager.get_cql()
    await cql.run_async(f"USE {random_tables.keyspace}")

    logger.info('Waiting for cql and hosts')
    host2 = (await wait_for_cql_and_get_hosts(cql, [servers[2]], time.time() + 60))[0]

    version = await get_version(manager, host2)
    logger.info(f"version on host2 {version}")

    await set_version(manager, host2, version - 1)
    logger.info("decremented version on host2")
    await set_fence_version(manager, host2, version - 1)
    logger.info("decremented fence version on host2")

    await manager.server_restart(servers[2].server_id, wait_others=2)
    logger.info("host2 restarted")

    # Re-resolve the host after the restart before sending queries through it.
    host2 = (await wait_for_cql_and_get_hosts(cql, [servers[2]], time.time() + 60))[0]

    logger.info(f"trying to write through host2 to regular column [{host2}]")
    with pytest.raises(WriteFailure, match="stale topology exception"):
        await cql.run_async("insert into t1(pk, int_c) values (1, 1)", host=host2)

    if not tablets_enabled:  # issue #18180
        logger.info(f"trying to write through host2 to counter column [{host2}]")
        with pytest.raises(WriteFailure, match="stale topology exception"):
            await cql.run_async("update t2 set counter_c=counter_c+1 where pk=1", host=host2)

    random_tables.drop_all()


@pytest.mark.asyncio
@skip_mode('release', 'error injections are not supported in release mode')
async def test_fence_hints(request, manager: ManagerClient):
    """Check that hints are not applied until the sender's topology version catches up.

    Steps performed by the test:
      1. Start three nodes; the first one runs with the
         'decrease_hints_flush_period' error injection and trace-level
         ``hints_manager`` logging.
      2. Bump ``version`` and ``fence_version`` by one on the last node.
      3. Stop the last node, write one row through the first node, start the
         last node again.
      4. Wait until the first node reports at least one hint send error and
         no sent hints, and verify the last node still has no rows.
      5. Set the same version on the first node, reload its raft topology,
         wait until exactly one hint is reported as sent, and verify the row
         is visible on the last node.
    """
    logger.info("Bootstrapping cluster with three nodes")
    s0 = await manager.server_add(
        config={'error_injections_at_startup': ['decrease_hints_flush_period']},
        cmdline=['--logger-log-level', 'hints_manager=trace'],
        property_file={"dc": "dc1", "rack": "r1"})

    # Disable load balancer as it might bump topology version, potentially creating a race condition
    # with read modify write below.
    # This should be done before adding the last two servers,
    # otherwise it can break the version == fence_version condition
    # which the test relies on.
    await manager.api.disable_tablet_balancing(s0.ip_addr)

    [s1, s2] = await manager.servers_add(2, property_file=[
        {"dc": "dc1", "rack": "r2"},
        {"dc": "dc1", "rack": "r3"}
    ])

    logger.info('Creating test table')
    random_tables = RandomTables(request.node.name, manager, unique_name(), 3)
    # The returned table object is not needed; the table is addressed by name below.
    await random_tables.add_table(name='t1', pks=1, columns=[
        Column("pk", IntType),
        Column('int_c', IntType)
    ])
    cql = manager.get_cql()
    await cql.run_async(f"USE {random_tables.keyspace}")

    logger.info('Waiting for cql and hosts')
    hosts = await wait_for_cql_and_get_hosts(cql, [s0, s2], time.time() + 60)

    host2 = host_by_server(hosts, s2)
    new_version = (await get_version(manager, host2)) + 1
    logger.info(f"Set version and fence_version to {new_version} on node {host2}")
    await set_version(manager, host2, new_version)
    await set_fence_version(manager, host2, new_version)

    # Baseline: the table is empty on host2 before any write is issued.
    select_all_stmt = SimpleStatement("select * from t1", consistency_level=ConsistencyLevel.ONE)
    rows = await cql.run_async(select_all_stmt, host=host2)
    assert len(list(rows)) == 0

    logger.info(f"Stopping node {host2}")
    await manager.server_stop_gracefully(s2.server_id)

    host0 = host_by_server(hosts, s0)
    logger.info(f"Writing through {host0} to regular column")
    await cql.run_async("insert into t1(pk, int_c) values (1, 1)", host=host0)

    logger.info(f"Starting last node {host2}")
    await manager.server_start(s2.server_id)

    logger.info(f"Waiting for failed hints on {host0}")
    async def at_least_one_hint_failed():
        # Polling predicate for wait_for: True once a send error is reported,
        # None to keep polling; any successfully sent hint fails the test.
        metrics_data = await manager.metrics.query(s0.ip_addr)
        if sent_total_metric(metrics_data) > 0:
            pytest.fail(f"Unexpected successful hints; metrics on {s0}: {all_hints_metrics(metrics_data)}")
        if send_errors_metric(metrics_data) >= 1:
            return True
        logger.info(f"Metrics on {s0}: {all_hints_metrics(metrics_data)}")
        return None
    await wait_for(at_least_one_hint_failed, time.time() + 60)

    host2 = (await wait_for_cql_and_get_hosts(cql, [s2], time.time() + 60))[0]

    # Check there is no new data on host2.
    rows = await cql.run_async(select_all_stmt, host=host2)
    assert len(list(rows)) == 0

    logger.info("Updating version on first node")
    await set_version(manager, host0, new_version)
    await set_fence_version(manager, host0, new_version)
    await manager.api.client.post("/storage_service/raft_topology/reload", s0.ip_addr)

    logger.info(f"Waiting for sent hints on {host0}")
    async def exactly_one_hint_sent():
        # Polling predicate for wait_for: True once exactly one hint is
        # reported as sent, None to keep polling; more than one fails the test.
        metrics_data = await manager.metrics.query(s0.ip_addr)
        sent_total = sent_total_metric(metrics_data)
        if sent_total > 1:
            pytest.fail(f"Unexpected more than 1 successful hints; metrics on {s0}: {all_hints_metrics(metrics_data)}")
        if sent_total == 1:
            return True
        logger.info(f"Metrics on {s0}: {all_hints_metrics(metrics_data)}")
        return None
    await wait_for(exactly_one_hint_sent, time.time() + 60)

    # Check the hint is delivered, and we see the new data on host2
    rows = await cql.run_async(select_all_stmt, host=host2)
    assert len(list(rows)) == 1

    random_tables.drop_all()