scylladb/test/cluster/test_strong_consistency.py

#
# Copyright (C) 2025-present ScyllaDB
#
# SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.1
#

from typing import Tuple

from test.pylib.manager_client import ManagerClient
from test.pylib.util import gather_safely, wait_for, Host
from test.cluster.util import new_test_keyspace, new_test_table, reconnect_driver
from test.pylib.internal_types import HostID, ServerInfo
from cassandra import InvalidRequest, ReadTimeout, WriteTimeout
from cassandra.cluster import ConsistencyLevel
from cassandra.policies import FallthroughRetryPolicy
from cassandra.protocol import InvalidRequest
from cassandra.query import SimpleStatement, BoundStatement
from test.pylib.tablets import get_all_tablet_replicas, get_tablet_replicas

import asyncio
import pytest
import logging
import time
import uuid
import random
import asyncio


logger = logging.getLogger(__name__)


# Node configuration used by the tests in this file: lists
# 'strongly-consistent-tables' under experimental_features.
DEFAULT_CONFIG = dict(experimental_features=['strongly-consistent-tables'])

# Extra command-line options used by the tests in this file: debug-level
# logging for the sc_groups_manager and sc_coordinator loggers.
DEFAULT_CMDLINE = [
    '--logger-log-level', 'sc_groups_manager=debug',
    '--logger-log-level', 'sc_coordinator=debug',
]


async def wait_for_leader(manager: ManagerClient, s: ServerInfo, group_id: str):
    """
    Poll server `s` until it reports a leader for raft group `group_id`.

    The value returned by the REST API counts as "no leader yet" while it
    parses to the all-zero UUID. Polling is delegated to wait_for() with a
    deadline 60 seconds from now; whatever wait_for() returns is passed back
    to the caller.
    """
    deadline = time.time() + 60

    async def poll_leader():
        reported = await manager.api.get_raft_leader(s.ip_addr, group_id)
        if uuid.UUID(reported).int == 0:
            return None
        return reported

    return await wait_for(poll_leader, deadline)

async def collect_all_raft_state(cql, host):
    """
    Read the per-group raft state stored in system.raft_groups on `host`.

    Returns a dict mapping str(group_id) to a (shard, vote_term, commit_idx)
    tuple. Asserts that each group_id shows up only once in the result.
    Every collected entry is logged, so a failing continuity check can be
    diagnosed from the test log alone.
    """
    rows = await cql.run_async("SELECT DISTINCT shard, group_id, vote_term, commit_idx FROM system.raft_groups", host=host)
    logger.info(f"Collected raft state from host {host}:")
    state = {}
    for row in rows:
        group_id = str(row.group_id)
        assert group_id not in state, f"Duplicate raft state for group {row.group_id} shard {row.shard}"
        state[group_id] = (row.shard, row.vote_term, row.commit_idx)
        logger.info(f"  group {group_id}: shard={row.shard}, vote_term={row.vote_term}, commit_idx={row.commit_idx}")
    return state


def assert_raft_state_continuity(state_before: dict, state_after: dict, context: str):
    """
    Check that the raft state captured after an event is a valid continuation
    of the state captured before it.

    Both arguments map a group id to a (shard, vote_term, commit_idx) tuple;
    `context` names the event and is only used in failure messages.
    For every group in `state_before` the function asserts that:
    - the group is still present in `state_after`,
    - its shard is unchanged,
    - vote_term, if it was set before, is still set and did not decrease,
    - commit_idx, if it was set before, is still set and did not decrease.
    Groups that appear only in `state_after` are not checked.
    """
    for group_id in state_before:
        old_shard, old_term, old_commit = state_before[group_id]

        assert group_id in state_after, (
            f"Raft state for group {group_id} shard {old_shard} missing after {context}."
        )
        new_shard, new_term, new_commit = state_after[group_id]

        assert old_shard == new_shard, (
            f"Raft state for group {group_id} changed shard from {old_shard} to {new_shard} after {context}."
        )

        # A value that was unset before the event imposes no constraint.
        term_preserved = old_term is None or (new_term is not None and new_term >= old_term)
        assert term_preserved, (
            f"vote_term decreased for group {group_id} shard {old_shard} after {context}: "
            f"{old_term} -> {new_term}."
        )

        commit_preserved = old_commit is None or (new_commit is not None and new_commit >= old_commit)
        assert commit_preserved, (
            f"commit_idx decreased for group {group_id} shard {old_shard} after {context}: "
            f"{old_commit} -> {new_commit}."
        )

async def assert_no_cross_shard_routing(manager: ManagerClient, server: ServerInfo):
    """
    Cross-check the fixed_shard partitioner and sharder log lines of `server`.

    The partitioner lines ("get_token: shard=S, token=T") yield a token -> shard
    map. Every sharder line ("shard_of(T) = S") whose token appears in that map
    must name the same shard, otherwise the assertion fails. This is how the
    tests verify that reads and writes to the raft tables of strongly
    consistent tablets are routed to the shard the tablet is assigned to.
    """
    server_log = await manager.server_open_log(server.server_id)

    # Tokens computed by the partitioner, keyed by token; a later log line for
    # the same token replaces an earlier one.
    partitioner_hits = await server_log.grep(r"fixed_shard.*get_token: shard=(\d+), token=(\d+)")
    shard_by_token = {int(hit.group(2)): int(hit.group(1)) for _, hit in partitioner_hits}

    # Routing decisions taken by the sharder
    sharder_hits = await server_log.grep(r"fixed_shard.*shard_of\((\d+)\) = (\d+)")
    for _, hit in sharder_hits:
        token, shard = int(hit.group(1)), int(hit.group(2))
        if token in shard_by_token:
            shard_from_partitioner = shard_by_token[token]
            assert shard == shard_from_partitioner, (
                f"Sharder routed token {token} to shard {shard}, "
                f"but partitioner computed shard {shard_from_partitioner}."
            )
        # Otherwise the token is unknown to the partitioner. That happens when
        # the raft tables are read by a client through CQL: the routed token
        # may not correspond to any existing raft group, so it is ignored.

async def get_table_raft_group_id(manager: ManagerClient, ks: str, table: str):
    """
    Return, as a string, the raft_group_id found in the first system.tablets
    row of table `ks`.`table`.
    """
    table_id = await manager.get_table_id(ks, table)
    query = f"SELECT raft_group_id FROM system.tablets where table_id = {table_id}"
    tablet_rows = await manager.get_cql().run_async(query)
    first_tablet = tablet_rows[0]
    return str(first_tablet.raft_group_id)

@pytest.mark.asyncio
async def test_basic_write_read(manager: ManagerClient):
    """
    Write and read a strongly consistent table through every kind of coordinator.

    On a 3-node cluster with an RF=2, single-tablet table, the same partition is
    written and read back through the raft leader, the non-leader replica and the
    node holding no replica, first with plain statements and then with prepared
    ones. The test then checks that the strong-consistency coordinator metrics
    sampled on each node grew, restarts one node while the tablet raft group is
    active, and finally stops all nodes gracefully.
    """

    logger.info("Bootstrapping cluster")
    servers = await manager.servers_add(3, config=DEFAULT_CONFIG, cmdline=DEFAULT_CMDLINE, auto_rack_dc='my_dc')
    (cql, hosts) = await manager.get_ready_cql(servers)

    logger.info("Load host_id-s for servers")
    host_ids = await gather_safely(*[manager.get_host_id(s.server_id) for s in servers])

    def host_by_host_id(host_id):
        # Host ids reach this function from several sources (REST API, tablet
        # metadata, manager), so compare their string forms.
        for hid, host in zip(host_ids, hosts):
            if str(hid) == str(host_id):
                return host
        raise RuntimeError(f"Can't find host for host_id {host_id}")

    def assert_single_row(rows, expected_c):
        # The test only ever touches the partition pk=10.
        assert len(rows) == 1, f"Expected exactly one row, got {len(rows)}"
        row = rows[0]
        assert row.pk == 10
        assert row.c == expected_c, f"Expected c={expected_c}, got c={row.c}"

    logger.info("Creating a strongly-consistent keyspace")
    async with new_test_keyspace(manager, "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 2} AND tablets = {'initial': 1} AND consistency = 'global'") as ks:
        logger.info("Creating a table")
        await cql.run_async(f"CREATE TABLE {ks}.test (pk int PRIMARY KEY, c int);")

        logger.info("Select raft group id for the tablet")
        group_id = await get_table_raft_group_id(manager, ks, 'test')

        logger.info(f"Get current leader for the group {group_id}")
        try:
            leader_host_id = await wait_for_leader(manager, servers[0], group_id)
        except Exception as e:
            # We need to wait for leader on a replica, and first server might not be one.
            # `Exception` (not a bare `except:`) keeps task cancellation and
            # KeyboardInterrupt propagating instead of triggering the fallback.
            logger.info(f"Waiting for leader on {servers[0]} failed ({e!r}), retrying on {servers[1]}")
            leader_host_id = await wait_for_leader(manager, servers[1], group_id)
        leader_host = host_by_host_id(leader_host_id)

        tablet_replicas = await get_tablet_replicas(manager, servers[0], ks, "test", 0)
        assert len(tablet_replicas) == 2
        # Normalize to strings once so that all membership tests below agree.
        replica_host_ids = [str(replica[0]) for replica in tablet_replicas]

        logger.info(f"Get the non-leader replica for the group {group_id}")
        non_leader_replica_host_ids = [host_id for host_id in replica_host_ids if host_id != str(leader_host_id)]
        assert non_leader_replica_host_ids, f"No non-leader replica found among {replica_host_ids}"
        non_leader_replica_host = host_by_host_id(non_leader_replica_host_ids[0])

        logger.info(f"Get the non-replica for the group {group_id}")
        non_replica_host_ids = [host_id for host_id in host_ids if str(host_id) not in replica_host_ids]
        assert non_replica_host_ids, f"No non-replica node found among {host_ids}"
        non_replica_host = host_by_host_id(non_replica_host_ids[0])

        async def collect_metrics():
            async def sample(host, metric_names):
                # A metric that is not reported (or reported as 0) counts as 0.
                scraped = await manager.metrics.query(host.address)
                return {name: scraped.get(name) or 0 for name in metric_names}

            return {
                'leader': await sample(leader_host, [
                    'scylla_strong_consistency_coordinator_write_latency_count',
                    'scylla_strong_consistency_coordinator_read_latency_count',
                ]),
                'replica': await sample(non_leader_replica_host, [
                    'scylla_strong_consistency_coordinator_write_node_bounces',
                    'scylla_strong_consistency_coordinator_read_latency_count',
                ]),
                'non_replica': await sample(non_replica_host, [
                    'scylla_strong_consistency_coordinator_write_node_bounces',
                    'scylla_strong_consistency_coordinator_read_node_bounces',
                ]),
            }
        metrics_before = await collect_metrics()

        # Each coordinator role is exercised in turn; every write uses a new
        # value of `c` so that the following read proves this write was applied.
        targets = [
            ("leader", leader_host),
            ("non-leader replica", non_leader_replica_host),
            ("non-replica", non_replica_host),
        ]

        for (role, host), value in zip(targets, (20, 30, 40)):
            logger.info(f"Run INSERT statement on {role} {host}")
            await cql.run_async(f"INSERT INTO {ks}.test (pk, c) VALUES (10, {value})", host=host)

            logger.info(f"Run SELECT statement on {role} {host}")
            rows = await cql.run_async(f"SELECT * FROM {ks}.test WHERE pk = 10;", host=host)
            assert_single_row(rows, value)

        # Test with prepared statements as well
        insert_stmt = cql.prepare(f"INSERT INTO {ks}.test (pk, c) VALUES (?, ?)")
        bound_insert_stmt = BoundStatement(insert_stmt)
        select_stmt = cql.prepare(f"SELECT * FROM {ks}.test WHERE pk = ?")
        bound_select_stmt = BoundStatement(select_stmt, consistency_level=ConsistencyLevel.ONE)
        bound_select_stmt.bind([10])

        for (role, host), value in zip(targets, (50, 60, 70)):
            logger.info(f"Run prepared INSERT statement on {role} {host}")
            bound_insert_stmt.bind([10, value])
            await cql.run_async(bound_insert_stmt, host=host)

            logger.info(f"Run prepared SELECT statement on {role} {host}")
            rows = await cql.run_async(bound_select_stmt, host=host)
            assert_single_row(rows, value)

        metrics_after = await collect_metrics()
        # Validate that all metrics were increased after read/write operations
        for role, role_metrics in metrics_after.items():
            for metric_name, value_after in role_metrics.items():
                value_before = metrics_before[role][metric_name]
                assert value_after > value_before, (
                    f"Metric {metric_name} on {role} did not increase: {value_before} -> {value_after}"
                )

        # Check that we can restart a server with an active tablets raft group
        await manager.server_restart(servers[2].server_id)

    # To check that the servers can be stopped gracefully. By default the test runner just kills them.
    await gather_safely(*[manager.server_stop_gracefully(s.server_id) for s in servers])

@pytest.mark.asyncio
async def test_multi_shard_write_read(manager: ManagerClient):
    """
    Check that strongly consistent tables also work on shards other than shard 0.

    The cluster has 3 nodes with 4 shards each and the table has 4 tablets
    with RF=3. Assuming tablets are spread uniformly over shards, each tablet
    lands on a different shard; as there are only 3 "shard zeros", at least
    one tablet should live exclusively on non-zero shards, leader included.

    50 rows are written and then read back one by one, so some of the
    requests should be served by leaders running on non-zero shards.
    All writes must succeed and all data must be read back intact.
    """
    row_count = 50

    logger.info("Bootstrapping cluster with 4 shards per node")
    servers = await manager.servers_add(
        3,
        config=DEFAULT_CONFIG,
        cmdline=DEFAULT_CMDLINE + ['--smp=4'],
        auto_rack_dc='my_dc',
    )
    cql, hosts = await manager.get_ready_cql(servers)

    logger.info("Creating a strongly-consistent keyspace with 4 tablets")
    ks_opts = "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 3} AND tablets = {'initial': 4} AND consistency = 'global'"
    async with new_test_keyspace(manager, ks_opts) as ks:
        async with new_test_table(manager, ks, "pk int PRIMARY KEY, c int") as table:
            # Populate the table: row pk carries c == pk
            for pk in range(row_count):
                await cql.run_async(f"INSERT INTO {table} (pk, c) VALUES ({pk}, {pk})")

            # Read every row back and compare it with what was written
            for pk in range(row_count):
                found = await cql.run_async(f"SELECT * FROM {table} WHERE pk = {pk};")
                assert len(found) == 1, f"Expected 1 row for pk={pk}, got {len(found)}"
                only_row = found[0]
                assert only_row.c == pk, f"Data integrity check failed for pk={pk}: expected c={pk}, got c={only_row.c}"

    await gather_safely(*[manager.server_stop_gracefully(srv.server_id) for srv in servers])

@pytest.mark.asyncio
async def test_sc_multishard_metadata_reads(manager: ManagerClient):
    """
    Verify that multi-shard reads of raft metadata for strongly-consistent tables work correctly.

    A single node with 4 shards hosts a table with 8 tablets and RF=1. After
    some writes, system.raft_groups is read in four ways that must agree:
    - one partition at a time, by (shard, group_id),
    - by group_id only, with ALLOW FILTERING,
    - several partitions at once, using IN on both partition key columns,
    - by token ranges, using TOKEN(shard, group_id).
    """
    cmdline = DEFAULT_CMDLINE + [
        '--smp=4',
        '--logger-log-level', 'fixed_shard=trace',
    ]
    server = await manager.server_add(config=DEFAULT_CONFIG, cmdline=cmdline)
    (cql, hosts) = await manager.get_ready_cql([server])

    async with new_test_keyspace(manager, "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} AND tablets = {'initial': 8} AND consistency = 'global'") as ks:
        async with new_test_table(manager, ks, "pk int PRIMARY KEY, c int") as table:
            table_name = table.split('.')[-1]
            table_id = await manager.get_table_id(ks, table_name)

            # Write some rows to trigger raft table updates
            for pk in range(10):
                await cql.run_async(f"INSERT INTO {table} (pk, c) VALUES ({pk}, {pk})")

            # Verify that tablets are allocated also on non-0 shards
            tablets = await get_all_tablet_replicas(manager, server, ks, table_name)
            logger.info(f"Tablet distribution: {tablets}")
            assert {shard for tablet in tablets for _, shard in tablet.replicas} != {0}, "Strongly consistent tables shoud be allocated also on non-0 shards"

            # Verify we have non-empty state.
            # collect_all_raft_state() returns (shard, vote_term, commit_idx)
            # tuples, so commit_idx is the LAST element of each value.
            state = await collect_all_raft_state(cql, hosts[0])
            has_nonempty_state = any(
                commit_idx is not None and commit_idx > 0
                for _, _, commit_idx in state.values()
            )
            assert has_nonempty_state, "Expected at least one group to have commit_idx > 0 after the writes"

            # Prepare the list of (shard, group_id) pairs for all tablets of the table.
            raft_partition_keys = []
            for row in await cql.run_async(f"SELECT raft_group_id, replicas FROM system.tablets where table_id = {table_id}"):
                assert len(row.replicas) == 1, "Expected RF=1 for the test table"
                (_, shard) = row.replicas[0]
                raft_partition_keys.append((shard, row.raft_group_id))
            assert len(raft_partition_keys) == 8, f"Expected 8 tablets, got {len(raft_partition_keys)}"

            # Collect raft metadata for all tablets using single-shard queries
            raft_data = {}
            for (shard, group_id) in raft_partition_keys:
                raft_data[(shard, group_id)] = set(await cql.run_async(f"SELECT * FROM system.raft_groups WHERE shard = {shard} AND group_id = {group_id}"))
                # Verify that we can also obtain the same data without knowing the shard using ALLOW FILTERING
                filtered_data = set(await cql.run_async(f"SELECT * FROM system.raft_groups WHERE group_id = {group_id} ALLOW FILTERING"))
                assert raft_data[(shard, group_id)] == filtered_data, f"Data mismatch for group {group_id} when read by shard vs ALLOW FILTERING: {raft_data[(shard, group_id)]} vs {filtered_data}"

            # Now read raft metadata using multi-shard queries and verify correctness
            for k in range(2, 9):
                sample_keys = random.sample(raft_partition_keys, k)
                # k >= 2, so the tuple always renders as a valid CQL list "(a, b, ...)".
                shards = tuple(shard for (shard, _) in sample_keys)
                group_ids = ", ".join(f"{group_id}" for group_id in set(group_id for (_, group_id) in sample_keys))
                logger.info(f"Testing multi-shard read with {k} keys: {sample_keys}")
                # We can't specify (shard, group_id) partition key pairs in the IN clause because multi-column restriction are only allowed for clustering keys.
                # Instead, we read using a product of all involved shards and group_ids. This should return the same rows because every group_id
                # is only assigned to one shard.
                rows = await cql.run_async(f"SELECT * FROM system.raft_groups WHERE shard IN {shards} AND group_id IN ({group_ids})")
                for key in sample_keys:
                    for row_data in raft_data[key]:
                        assert row_data in rows, f"Missing data for raft group {key} in multi-shard read: {row_data}"
                for row in rows:
                    # Use .get() so that a row for an unknown (shard, group_id) pair
                    # fails with the assertion message below rather than a bare KeyError.
                    assert row in raft_data.get((row.shard, row.group_id), set()), f"Unexpected data for raft group {(row.shard, row.group_id)} in multi-shard read: {row}"

            # Read ranges of partitions using TOKEN()
            for (boundary_shard, boundary_group_id) in raft_partition_keys:
                ith_token = await cql.run_async(f"SELECT TOKEN(shard, group_id) as t FROM system.raft_groups WHERE shard = {boundary_shard} AND group_id = {boundary_group_id} LIMIT 1")
                token_value = ith_token[0].t
                range_below = await cql.run_async(f"SELECT * FROM system.raft_groups WHERE TOKEN(shard, group_id) <= {token_value}")
                range_above = await cql.run_async(f"SELECT * FROM system.raft_groups WHERE TOKEN(shard, group_id) > {token_value}")
                assert set(range_below) & set(range_above) == set(), f"Overlapping data in range reads up to and above token {token_value}"
                for (shard, group_id) in raft_partition_keys:
                    for row_data in raft_data[(shard, group_id)]:
                        if shard < boundary_shard:
                            assert row_data in range_below, f"Missing data for raft group {(shard, group_id)} in range read up to token {token_value}: {row_data}"
                        elif shard > boundary_shard:
                            assert row_data in range_above, f"Missing data for raft group {(shard, group_id)} in range read above token {token_value}: {row_data}"
                        elif group_id == boundary_group_id:
                            # The boundary partition itself is covered by the inclusive "<=" range.
                            assert row_data in range_below, f"Missing data for raft group {(shard, group_id)} in range read up to token {token_value}: {row_data}"
                        else:
                            # Same shard as the boundary but a different group: the test only
                            # requires the data to show up on one side or the other.
                            assert row_data in range_below or row_data in range_above, f"Missing data for raft group {(shard, group_id)} in range read up to token {token_value}: {row_data}"

    await manager.server_stop_gracefully(server.server_id)

@pytest.mark.asyncio
async def test_sc_persistence_restart_with_smp_increase(manager: ManagerClient):
    """
    Check that raft metadata of strongly-consistent tables survives a restart
    with a larger shard count (--smp).

    The raft tables of strongly consistent tables rely on a custom partitioner
    and sharder whose shard assignments should not depend on the number of
    shards. The test restarts a single-node cluster with --smp raised from 2
    to 4 and verifies that raft metadata is not read from, or written to,
    the wrong shards afterwards.
    """
    startup_cmdline = DEFAULT_CMDLINE + [
        '--smp=2',
        '--logger-log-level', 'fixed_shard=trace',
    ]
    server = await manager.server_add(config=DEFAULT_CONFIG, cmdline=startup_cmdline)
    cql, hosts = await manager.get_ready_cql([server])

    ks_opts = "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} AND tablets = {'initial': 2} AND consistency = 'global'"
    async with new_test_keyspace(manager, ks_opts) as ks:
        async with new_test_table(manager, ks, "pk int PRIMARY KEY, c int") as table:

            async def insert_rows(session, pks):
                # Row pk carries c == pk
                for pk in pks:
                    await session.run_async(f"INSERT INTO {table} (pk, c) VALUES ({pk}, {pk})")

            # Initial writes, so that the raft tables get updated
            await insert_rows(cql, range(10))

            state_before = await collect_all_raft_state(cql, hosts[0])

            # Restart the node with twice as many shards
            await manager.server_update_cmdline(server.server_id, ['--smp=4'])
            await manager.server_restart(server.server_id)
            await reconnect_driver(manager)
            cql = manager.get_cql()

            # The in-memory raft state is not observable directly. Extra writes
            # make the server update the raft tables starting from whatever
            # state it loaded after the restart.
            await insert_rows(cql, range(10, 20))

            state_after = await collect_all_raft_state(cql, hosts[0])
            assert_raft_state_continuity(state_before, state_after, "SMP increase")

            await assert_no_cross_shard_routing(manager, server)

            # All rows, written before and after the restart, must be readable
            for pk in range(20):
                found = await cql.run_async(f"SELECT * FROM {table} WHERE pk = {pk};")
                assert len(found) == 1, f"Expected 1 row for pk={pk}, got {len(found)}"
                only_row = found[0]
                assert only_row.c == pk, f"Incorrect row read for pk={pk}: expected c={pk}, got c={only_row.c}"

    await manager.server_stop_gracefully(server.server_id)


@pytest.mark.asyncio
async def test_sc_persistence_with_compaction(manager: ManagerClient):
    """
    Check that system.raft_groups stays readable and correct after compaction.

    Regular (non-reshard) compaction does not use the custom
    partitioner/sharder directly, but it does compact SSTables that were
    written with them. The test therefore compacts the table, compares the
    raft metadata before and after, and repeats the comparison after a restart.
    """
    server = await manager.server_add(
        config=DEFAULT_CONFIG,
        cmdline=DEFAULT_CMDLINE + ['--logger-log-level', 'fixed_shard=trace'],
    )
    cql, hosts = await manager.get_ready_cql([server])

    ks_opts = "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} AND tablets = {'initial': 4} AND consistency = 'global'"
    async with new_test_keyspace(manager, ks_opts) as ks:
        async with new_test_table(manager, ks, "pk int PRIMARY KEY, c int") as table:

            async def insert_rows(session, pks):
                # Row pk carries c == pk
                for pk in pks:
                    await session.run_async(f"INSERT INTO {table} (pk, c) VALUES ({pk}, {pk})")

            # Three batches of 10 writes, each followed by a flush of
            # system.raft_groups, to end up with multiple SSTables
            for first_pk in (0, 10, 20):
                await insert_rows(cql, range(first_pk, first_pk + 10))
                await manager.api.keyspace_flush(server.ip_addr, "system", "raft_groups")

            state_before_compaction = await collect_all_raft_state(cql, hosts[0])

            await manager.api.keyspace_compaction(server.ip_addr, "system", "raft_groups")

            state_after_compaction = await collect_all_raft_state(cql, hosts[0])
            assert_raft_state_continuity(state_before_compaction, state_after_compaction, "compaction")

            # A restart makes the raft server load its state from the compacted SSTables
            await manager.server_restart(server.server_id)
            await reconnect_driver(manager)
            cql = manager.get_cql()

            # The in-memory raft state is not observable directly. Extra writes
            # make the server update the raft tables starting from whatever
            # state it loaded after the restart.
            await insert_rows(cql, range(30, 40))

            state_after_restart = await collect_all_raft_state(cql, hosts[0])
            assert_raft_state_continuity(state_after_compaction, state_after_restart, "compaction + restart")

            await assert_no_cross_shard_routing(manager, server)

    await manager.server_stop_gracefully(server.server_id)


@pytest.mark.asyncio
async def test_sc_persistence_after_crash(manager: ManagerClient):
    """
    Check that raft metadata of strongly-consistent tables is recovered
    after the node is stopped non-gracefully (a simulated crash).
    """
    server = await manager.server_add(
        config=DEFAULT_CONFIG,
        cmdline=DEFAULT_CMDLINE + ['--logger-log-level', 'fixed_shard=trace'],
    )
    cql, hosts = await manager.get_ready_cql([server])

    ks_opts = "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} AND tablets = {'initial': 4} AND consistency = 'global'"
    async with new_test_keyspace(manager, ks_opts) as ks:
        async with new_test_table(manager, ks, "pk int PRIMARY KEY, c int") as table:

            async def insert_rows(session, pks):
                # Row pk carries c == pk
                for pk in pks:
                    await session.run_async(f"INSERT INTO {table} (pk, c) VALUES ({pk}, {pk})")

            # Initial writes, so that the raft tables get updated
            await insert_rows(cql, range(20))

            # Snapshot of the raft state taken right before the crash
            state_before_crash = await collect_all_raft_state(cql, hosts[0])

            # server_stop() (as opposed to server_stop_gracefully()) plays the role of the crash
            await manager.server_stop(server.server_id)

            await manager.server_start(server.server_id)
            await reconnect_driver(manager)
            cql = manager.get_cql()

            # The in-memory raft state is not observable directly. Extra writes
            # make the server update the raft tables starting from whatever
            # state it loaded after the restart.
            await insert_rows(cql, range(20, 30))

            state_after_crash = await collect_all_raft_state(cql, hosts[0])
            assert_raft_state_continuity(state_before_crash, state_after_crash, "crash recovery")

            await assert_no_cross_shard_routing(manager, server)

    await manager.server_stop_gracefully(server.server_id)

@pytest.mark.asyncio
@pytest.mark.skip_mode(mode='release', reason='error injections are not supported in release mode')
async def test_no_schema_when_apply_write(manager: ManagerClient):
    """
    Apply a write on a replica that has not yet seen the schema the write uses.

    The third node is kept from learning about an ALTER TABLE by dropping its
    incoming group0 append entries. A row using the newly added column is then
    written through the leader, the drop is lifted, and the test waits for the
    third node to log the column change before reading the row back from it.
    """
    servers = await manager.servers_add(2, config=DEFAULT_CONFIG, cmdline=DEFAULT_CMDLINE, auto_rack_dc='my_dc')
    # `servers[2]` must receive Raft commands from the other nodes, so it is
    # started with an injection keeping it from becoming a Raft leader
    # (for both group0 and strong consistency groups).
    follower_config = DEFAULT_CONFIG | {'error_injections_at_startup': ['avoid_being_raft_leader']}
    servers.append(await manager.server_add(config=follower_config, cmdline=DEFAULT_CMDLINE, property_file={'dc':'my_dc', 'rack': 'rack3'}))
    cql, hosts = await manager.get_ready_cql(servers)
    host_ids = await gather_safely(*[manager.get_host_id(srv.server_id) for srv in servers])

    def host_by_host_id(host_id):
        # host_ids and hosts are parallel lists
        for known_id, known_host in zip(host_ids, hosts):
            if known_id == host_id:
                return known_host
        raise RuntimeError(f"Can't find host for host_id {host_id}")

    ks_opts = "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 3} AND tablets = {'initial': 1} AND consistency = 'global'"
    async with new_test_keyspace(manager, ks_opts) as ks:
        await cql.run_async(f"CREATE TABLE {ks}.test (pk int PRIMARY KEY, c int);")

        # Start dropping group0 append entries (schema changes) on `servers[2]`
        # only now, after the table exists: the strong consistency raft groups
        # get created on that node, but the ALTER TABLE below does not reach it.
        group0_rows = await cql.run_async("SELECT value FROM system.scylla_local WHERE key = 'raft_group0_id'")
        group0_id = group0_rows[0].value
        await manager.api.enable_injection(servers[2].ip_addr, "raft_drop_incoming_append_entries_for_specified_group", one_shot=False, parameters={'value': group0_id})
        await cql.run_async(f"ALTER TABLE {ks}.test ADD new_col int;", host=hosts[0])

        group_id = await get_table_raft_group_id(manager, ks, 'test')
        leader_host_id = await wait_for_leader(manager, servers[0], group_id)
        assert leader_host_id != host_ids[2]
        leader_host = host_by_host_id(leader_host_id)

        # Remember the current end of the log of `servers[2]` for the wait below
        follower_log = await manager.server_open_log(servers[2].server_id)
        follower_mark = await follower_log.mark()

        await manager.api.enable_injection(servers[2].ip_addr, "disable_raft_drop_append_entries_for_specified_group", one_shot=True)
        await cql.run_async(f"INSERT INTO {ks}.test (pk, c, new_col) VALUES (10, 20, 30)", host=leader_host)

        await follower_log.wait_for(f"Column definitions for {ks}.test changed", timeout=60, from_mark=follower_mark)
        found = await cql.run_async(f"SELECT * FROM {ks}.test WHERE pk = 10;", host=hosts[2])
        assert len(found) == 1
        only_row = found[0]
        assert only_row.pk == 10
        assert only_row.c == 20
        assert only_row.new_col == 30

@pytest.mark.asyncio
@pytest.mark.skip_mode(mode='release', reason='error injections are not supported in release mode')
async def test_old_schema_when_apply_write(manager: ManagerClient):
    """
    Check that a row written before an ALTER TABLE can still be read from a
    replica that applies the write only after the schema has changed.

    Steps:
    - Start three nodes; the third one is told to avoid being a Raft leader.
    - Pause the state machine apply step on `servers[2]` with an injection.
    - INSERT a row through the leader, then ALTER the table to add a column.
    - Enable an injection which simulates that the old schema version is no
      longer kept in memory on `servers[2]`, then resume the apply step.
    - Read the row from `servers[2]` and check it has the inserted values
      and a null `new_col`.
    """
    wait_before_apply = "strong_consistency_state_machine_wait_before_apply"
    ks_opts = "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 3} AND tablets = {'initial': 1} AND consistency = 'global'"

    servers = await manager.servers_add(2, config=DEFAULT_CONFIG, cmdline=DEFAULT_CMDLINE, auto_rack_dc='my_dc')
    # `servers[2]` must not become a Raft leader (neither for group0 nor for
    # strong consistency groups): it has to receive Raft commands from others.
    non_leader_config = DEFAULT_CONFIG | {'error_injections_at_startup': ['avoid_being_raft_leader']}
    extra_server = await manager.server_add(config=non_leader_config, cmdline=DEFAULT_CMDLINE, property_file={'dc':'my_dc', 'rack': 'rack3'})
    servers.append(extra_server)
    cql, hosts = await manager.get_ready_cql(servers)
    host_ids = await gather_safely(*[manager.get_host_id(srv.server_id) for srv in servers])

    def host_by_host_id(host_id):
        # `host_ids` and `hosts` are both ordered like `servers`.
        matching = [host for hid, host in zip(host_ids, hosts) if hid == host_id]
        if not matching:
            raise RuntimeError(f"Can't find host for host_id {host_id}")
        return matching[0]

    async with new_test_keyspace(manager, ks_opts) as ks:
        await cql.run_async(f"CREATE TABLE {ks}.test (pk int PRIMARY KEY, c int);")

        raft_group_id = await get_table_raft_group_id(manager, ks, 'test')
        leader_id = await wait_for_leader(manager, servers[0], raft_group_id)
        assert leader_id != host_ids[2]
        leader = host_by_host_id(leader_id)

        # Remember the schema version the INSERT below is going to use.
        version_rows = await cql.run_async(f"SELECT version FROM system_schema.scylla_tables WHERE keyspace_name = '{ks}' AND table_name = 'test'")
        old_schema_version = version_rows[0].version

        delayed_ip = servers[2].ip_addr
        await manager.api.enable_injection(delayed_ip, wait_before_apply, one_shot=False)
        await cql.run_async(f"INSERT INTO {ks}.test (pk, c) VALUES (10, 20)", host=leader)

        await cql.run_async(f"ALTER TABLE {ks}.test ADD new_col int;", host=leader)
        # Following injection simulates that old schema version was already removed from the memory
        await manager.api.enable_injection(delayed_ip, "schema_registry_ignore_version", one_shot=False, parameters={'value': old_schema_version})
        # Let the paused apply continue, then stop pausing further applies.
        await manager.api.message_injection(delayed_ip, wait_before_apply)
        await manager.api.disable_injection(delayed_ip, wait_before_apply)

        result = await cql.run_async(f"SELECT * FROM {ks}.test WHERE pk = 10;", host=hosts[2])
        assert len(result) == 1
        stored = result[0]
        assert stored.pk == 10
        assert stored.c == 20
        assert stored.new_col is None

@pytest.mark.asyncio
async def test_reject_user_provided_timestamps(manager: ManagerClient):
    """
    Validate that INSERT, UPDATE and DELETE statements carrying
    `USING TIMESTAMP` are rejected with `InvalidRequest` when they
    target a strongly consistent table.
    """

    node = await manager.server_add(config=DEFAULT_CONFIG, cmdline=DEFAULT_CMDLINE)
    cql, _ = await manager.get_ready_cql([node])

    ks_opts = "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} AND tablets = {'initial': 1} AND consistency = 'global'"
    expected_error = "Strongly consistent queries don't support user-provided timestamps"

    async with new_test_keyspace(manager, ks_opts) as ks:
        async with new_test_table(manager, ks, "pk int PRIMARY KEY, v int") as table:
            # One statement per modification kind, each with an explicit timestamp.
            timestamped_statements = [
                f"INSERT INTO {table} (pk, v) VALUES (0, 13) USING TIMESTAMP 23",
                f"UPDATE {table} USING TIMESTAMP 23 SET v = 13 WHERE pk = 0",
                f"DELETE FROM {table} USING TIMESTAMP 23 WHERE pk = 0",
            ]
            for statement in timestamped_statements:
                with pytest.raises(InvalidRequest, match=expected_error):
                    await cql.run_async(statement)
            # FIXME(SCYLLADB-977):
            # Add test cases for batches with timestamps. Remember to
            # handle both whole-batch timestamps, e.g.
            #   BEGIN BATCH USING TIMESTAMP ts
            #     ...
            #   APPLY BATCH
            # as well as timestamps for individual items, e.g.
            #   BEGIN BATCH
            #     INSERT INTO ... USING TIMESTAMP ts;
            #     ...
            #   APPLY BATCH

@pytest.mark.asyncio
async def test_forward_cql_prepared_with_bound_values(manager: ManagerClient):
    """
    When we prepare a statement not on the leader, we should
    still be able to forward it to the leader with bound values.
    In this test we prepare a statement that should be forwarded to
    the leader (so an INSERT statement) and then execute it.
    It should correctly insert the value.
    Then, we execute it again with a different bound value and check
    that we used the cache on the leader side and that it updated the
    value correctly again.
    """
    cmdline = [
        '--logger-log-level', 'cql_server=trace',
        '--experimental-features', 'strongly-consistent-tables',
    ]
    servers = await manager.servers_add(2, cmdline=cmdline, auto_rack_dc='dc1')
    (cql, hosts) = await manager.get_ready_cql(servers)

    ks_opts = ("WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 2} AND consistency = 'global'")
    async with new_test_keyspace(manager, ks_opts) as ks:
        async with new_test_table(manager, ks, "pk int PRIMARY KEY, value int") as table:
            table_name = table.split('.')[-1]
            group_id = await get_table_raft_group_id(manager, ks, table_name)
            leader_host_id = await wait_for_leader(manager, servers[0], group_id)
            # The statement is sent to a node that is not the group leader.
            non_leader_host = [host for host in hosts if str(host.host_id) != str(leader_host_id)][0]

            # Prepare statement
            stmt = cql.prepare(f"INSERT INTO {table} (pk, value) VALUES (0, ?)")

            # Execute prepared INSERT once to verify results and populate the cache on the leader
            await cql.run_async(stmt, [0], host=non_leader_host)
            res = await cql.run_async(f"SELECT value FROM {table} WHERE pk = 0")
            assert len(res) == 1
            assert res[0].value == 0

            # Execute prepared INSERT again to verify that we use the cache on the leader.
            # A different value is bound this time: writing the same value twice would
            # let the check below pass even if the second write were silently dropped.
            # The synchronous `execute` is used because its result exposes the query trace.
            traced_execute = cql.execute(stmt, (1,), host=non_leader_host, trace=True)
            res = await cql.run_async(f"SELECT value FROM {table} WHERE pk = 0")
            assert len(res) == 1
            assert res[0].value == 1
            trace = traced_execute.get_query_trace()
            for event in trace.events:
                logger.info(f"Trace event: {event.description}")
                assert "Prepared statement not found on target" not in event.description

@pytest.mark.asyncio
async def test_forward_cql_cache_invalidation(manager: ManagerClient):
    """
    Test that cql forwarding works after invalidation of prepared statement cache on schema changes.

    Prepared INSERT and SELECT statements are executed once, the table is
    altered, and the same prepared statements are executed again through a
    non-leader node. The data must be correct and the
    `scylla_transport_requests_forwarded_prepared_not_found` metric on that
    node must have grown.
    """
    not_found_metric = 'scylla_transport_requests_forwarded_prepared_not_found'
    node_cmdline = [
        '--logger-log-level', 'cql_server=trace',
        '--experimental-features', 'strongly-consistent-tables',
    ]
    servers = await manager.servers_add(2, cmdline=node_cmdline, auto_rack_dc='dc1')
    cql, hosts = await manager.get_ready_cql(servers)

    async def prepared_not_found_count(address):
        # A metric that is absent (or zero) is treated as 0.
        metrics = await manager.metrics.query(address)
        return metrics.get(not_found_metric) or 0

    ks_opts = ("WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 2} AND tablets = {'initial': 1} AND consistency = 'global'")
    async with new_test_keyspace(manager, ks_opts) as ks:
        async with new_test_table(manager, ks, "pk int PRIMARY KEY, value int") as table:
            short_table_name = table.split('.')[-1]
            raft_group_id = await get_table_raft_group_id(manager, ks, short_table_name)
            leader_id = await wait_for_leader(manager, servers[0], raft_group_id)
            forwarding_host = [host for host in hosts if str(host.host_id) != str(leader_id)][0]

            prepared_insert = cql.prepare(f"INSERT INTO {table} (pk, value) VALUES (?, ?)")
            prepared_select = cql.prepare(f"SELECT pk, value FROM {table} WHERE pk = ?")

            # Baseline: both prepared statements work before the schema change.
            await cql.run_async(prepared_insert, [1, 100])
            first_read = await cql.run_async(prepared_select, [1])
            assert len(first_read) == 1
            assert first_read[0].value == 100

            count_before = await prepared_not_found_count(forwarding_host.address)

            # Altering schema invalidates prepared statement cache
            await cql.run_async(f"ALTER TABLE {table} ADD extra_column text")
            await cql.run_async(prepared_insert, [2, 200], host=forwarding_host)
            second_read = await cql.run_async(prepared_select, [2], host=forwarding_host)
            assert len(second_read) == 1
            assert second_read[0].value == 200

            count_after = await prepared_not_found_count(forwarding_host.address)
            assert count_after > count_before

@pytest.mark.asyncio
@pytest.mark.skip_mode('release', "error injections aren't enabled in release mode")
async def test_forward_cql_exception_passthrough(manager: ManagerClient):
    """
    Verify that coordinator exception returned on the target replica is correctly returned to the client.

    Three scenarios are exercised on a 3-node cluster with a 2-replica table:
    - `sc_modification_statement_timeout` is enabled on the leader; an INSERT
      sent to the non-leader replica must raise `WriteTimeout` and increase
      `scylla_transport_requests_forwarded_failed` on that replica.
    - `wait_before_handling_forwarded_request` is enabled on the leader; an
      INSERT with `USING TIMEOUT 500ms` sent to the non-leader replica must
      raise `WriteTimeout`.
    - The same injection is also enabled on the non-leader replica; a SELECT
      with `USING TIMEOUT 500ms` sent to the non-replica node must raise
      `ReadTimeout`.
    """
    cmdline = [
        '--logger-log-level', 'cql_server=trace',
        '--experimental-features', 'strongly-consistent-tables',
    ]
    servers = await manager.servers_add(3, cmdline=cmdline, auto_rack_dc='dc1')
    (cql, hosts) = await manager.get_ready_cql(servers)
    host_ids = await gather_safely(*[manager.get_host_id(s.server_id) for s in servers])

    def host_for(host_id):
        # Host ids coming from different sources are compared by their string form.
        return [host for host in hosts if str(host.host_id) == str(host_id)][0]

    ks_opts = ("WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 2} AND tablets = {'initial': 1} AND consistency = 'global'")
    async with new_test_keyspace(manager, ks_opts) as ks:
        async with new_test_table(manager, ks, "pk int PRIMARY KEY, value int") as table:
            table_name = table.split('.')[-1]
            group_id = await get_table_raft_group_id(manager, ks, table_name)

            logger.info(f"Get current leader for the group {group_id}")
            try:
                leader_host_id = await wait_for_leader(manager, servers[0], group_id)
            except Exception as e:
                # We need to wait for leader on a replica, and first server might not be one.
                # Only regular errors trigger the fallback: cancellation and
                # interpreter-exit signals must keep propagating.
                logger.info(f"Waiting for leader on the first server failed ({e}), retrying on the second one")
                leader_host_id = await wait_for_leader(manager, servers[1], group_id)
            leader_host = host_for(leader_host_id)

            tablet_replicas = await get_tablet_replicas(manager, servers[0], ks, table_name, 0)
            assert len(tablet_replicas) == 2
            # Each replica entry starts with the host id; keep ids as strings
            # so every comparison below is done on the same type.
            replica_host_ids = [str(replica[0]) for replica in tablet_replicas]

            logger.info(f"Get the non-leader replica for the group {group_id}")
            non_leader_replica_host_id = [host_id for host_id in replica_host_ids if host_id != str(leader_host_id)][0]
            non_leader_replica_host = host_for(non_leader_replica_host_id)

            logger.info(f"Get the non-replica for the group {group_id}")
            non_replica_host_id = [host_id for host_id in host_ids if str(host_id) not in replica_host_ids][0]
            non_replica_host = host_for(non_replica_host_id)


            logger.info("Verify that timeout on the target node is returned to the client and fail metric is incremented")
            await manager.api.enable_injection(leader_host.address, "sc_modification_statement_timeout", one_shot=False)
            metrics = await manager.metrics.query(non_leader_replica_host.address)
            errors_before = metrics.get('scylla_transport_requests_forwarded_failed') or 0

            # FallthroughRetryPolicy is set on the statement so the error reaches the test.
            with pytest.raises(WriteTimeout):
                await cql.run_async(SimpleStatement(f"INSERT INTO {table} (pk, value) VALUES (1, 1)", retry_policy=FallthroughRetryPolicy()), host=non_leader_replica_host)

            metrics = await manager.metrics.query(non_leader_replica_host.address)
            errors_after = metrics.get('scylla_transport_requests_forwarded_failed') or 0
            assert errors_after > errors_before
            await manager.api.disable_injection(leader_host.address, "sc_modification_statement_timeout")

            # Now test that we get correct exception if the cross-node forwarding RPC times out.
            logger.info("Verify that timeout of the forwarding RPC is returned as the correct exception to the client")
            await manager.api.enable_injection(leader_host.address, "wait_before_handling_forwarded_request", one_shot=False)
            with pytest.raises(WriteTimeout):
                await cql.run_async(SimpleStatement(f"INSERT INTO {table} (pk, value) VALUES (1, 1) USING TIMEOUT 500ms", retry_policy=FallthroughRetryPolicy()), host=non_leader_replica_host)

            await manager.api.enable_injection(non_leader_replica_host.address, "wait_before_handling_forwarded_request", one_shot=False)
            with pytest.raises(ReadTimeout):
                await cql.run_async(SimpleStatement(f"SELECT * FROM {table} WHERE pk = 1 USING TIMEOUT 500ms", retry_policy=FallthroughRetryPolicy()), host=non_replica_host)

            # Release whatever is still waiting on the injection on both nodes.
            await manager.api.message_injection(leader_host.address, "wait_before_handling_forwarded_request")
            await manager.api.message_injection(non_leader_replica_host.address, "wait_before_handling_forwarded_request")


@pytest.mark.asyncio
@pytest.mark.skip_mode("release", "error injections aren't enabled in release mode")
async def test_drop_table_during_insert(manager: ManagerClient):
    """Regression test for SCYLLADB-1450: node crashes when DROP TABLE races with
    an in-flight DML on a strongly-consistent table.

    The INSERT is paused by an error injection inside create_operation_ctx
    (the ERM is already obtained, acquire_server has not run yet).  The table
    is dropped in the meantime, which erases the raft group.  Before the fix,
    resuming the INSERT hit on_internal_error in acquire_server and aborted
    the node."""

    pause_injection = "sc_coordinator_wait_before_acquire_server"
    ks_opts = "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} AND tablets = {'initial': 1} AND consistency = 'global'"

    node = await manager.server_add(
        config={"experimental_features": ["strongly-consistent-tables"]},
        cmdline=["--logger-log-level", "sc_groups_manager=debug"],
    )
    cql, _ = await manager.get_ready_cql([node])

    async with new_test_keyspace(manager, ks_opts) as ks:
        table = f"{ks}.tbl"
        await cql.run_async(f"CREATE TABLE {table} (pk int PRIMARY KEY, v int)")

        node_log = await manager.server_open_log(node.server_id)
        start_mark = await node_log.mark()

        await manager.api.enable_injection(node.ip_addr, pause_injection, one_shot=False)

        # Start the INSERT without awaiting it; it stops at the injection point.
        pending_insert = cql.run_async(f"INSERT INTO {table} (pk, v) VALUES (0, 0)")

        # Proceed only once the log shows the injection was reached.
        await node_log.wait_for(f"{pause_injection}: waiting for message", from_mark=start_mark, timeout=30)

        # The table goes away while the INSERT is still paused.
        await cql.run_async(f"DROP TABLE {table}")

        # The raft group has to be destroyed before the INSERT is resumed.
        await node_log.wait_for("schedule_raft_group_deletion.*raft server.*is destroyed", from_mark=start_mark, timeout=30)

        # Let the INSERT continue – this is where the node aborted with the bug present.
        await manager.api.message_injection(node.ip_addr, pause_injection)

        # A failure of the INSERT is acceptable; a crash of the node is not.
        try:
            await pending_insert
        except Exception as err:
            logger.info(f"INSERT failed with expected error: {err}")

        # The REST API still answering means the node survived.
        await manager.api.get_host_id(node.ip_addr)


@pytest.mark.asyncio
@pytest.mark.skip_mode(mode="release", reason="error injections are not supported in release mode")
async def test_timed_out_queries(manager: ManagerClient):
    """
    A simple test verifying that we don't get stuck for an indefinite amount
    of time while reading from or writing to a strongly consistent table.
    As soon as the deadline for a query ends, the operation should be canceled
    and a time-out exception should be returned.

    This test focuses on a Raft operation being the potential reason for
    getting stuck. It should be aborted when we reach the deadline.
    """

    s1 = await manager.server_add(config=DEFAULT_CONFIG, cmdline=DEFAULT_CMDLINE)
    cql, _ = await manager.get_ready_cql([s1])

    log = await manager.server_open_log(s1.server_id)

    async def try_query_with_timeout(exception_type, stmt: str, error_injection_name: str, timeout_sec: float) -> bool:
        """
        Run `stmt` with a 100ms request timeout while `error_injection_name`
        holds it back, and report whether it timed out.

        Returns True if the statement raised `exception_type` (its message must
        mention the table), False if the statement succeeded. Any other
        exception fails the test.

        Note: `table` is not a parameter; it is looked up in the enclosing
        test function at call time.
        """
        await manager.api.enable_injection(s1.ip_addr, error_injection_name, one_shot=True)

        mark = await log.mark()
        request_timeout_ms = 100

        # Not awaited yet: the statement is expected to block on the injection.
        stmt_fut = cql.run_async(f"{stmt} USING TIMEOUT {request_timeout_ms}ms")
        await log.wait_for(error_injection_name, from_mark=mark)

        # Keep the statement blocked for the request timeout plus the extra margin.
        sleep_length = timeout_sec + (request_timeout_ms / 1000)
        await asyncio.sleep(sleep_length)

        # Release the injection so the statement can finish one way or another.
        await manager.api.message_injection(s1.ip_addr, error_injection_name)

        try:
            await stmt_fut
            return False
        except Exception as e:
            if isinstance(e, exception_type):
                assert f"Query timed out for {table}" in str(e)
                return True
            pytest.fail(f"Unexpected exception: {e}")

    async def try_query(exception_type, stmt: str, error_injection_name: str) -> None:
        """
        Repeat `try_query_with_timeout` with a doubling margin until a timeout
        is observed; fail the test once the margin has grown past 60 seconds.
        """
        # We cannot predict if the relevant timer will be triggered in time.
        # Even in not-really-extreme situations, it can take more time
        # than we expect. To avoid flakiness, we're going to attempt to
        # observe a timeout with an ever increasing margin of error.
        #
        # Most of the time, this will succeed during the first try,
        # so it shouldn't have a relevant impact on the length of the test.
        timeout_sec = 0.1
        while True:
            result = await try_query_with_timeout(exception_type, stmt, error_injection_name, timeout_sec)
            if result:
                break
            if timeout_sec > 60:
                pytest.fail("Reached the maximum number of attempts")
            timeout_sec *= 2

    async with new_test_keyspace(manager, "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} AND tablets = {'initial': 1} AND consistency = 'global'") as ks:
        async with new_test_table(manager, ks, "pk int PRIMARY KEY, v int") as table:
            # Seed a row so the reads below have something to return.
            await cql.run_async(f"INSERT INTO {table} (pk, v) VALUES (0, 13)")


            async def try_read(error_injection_name: str) -> None:
                # A blocked SELECT is expected to end with ReadTimeout.
                await try_query(ReadTimeout, f"SELECT * FROM {table} WHERE pk = 0", error_injection_name)

            async def try_write(error_injection_name: str) -> None:
                # A blocked INSERT is expected to end with WriteTimeout.
                await try_query(WriteTimeout, f"INSERT INTO {table} (pk, v) VALUES (7, 23)", error_injection_name)

            # Case 1: Reads.
            read_error_injecitons = [
                "sc_coordinator_wait_before_acquire_server",
                "sc_coordinator_wait_before_query_read_barrier"
            ]
            for error_injection_name in read_error_injecitons:
                await try_read(error_injection_name)

            # Sanity check: Nothing broke and we can still read from the table.
            res = await cql.run_async(f"SELECT * FROM {table} WHERE pk = 0")
            assert res[0].v == 13

            # Case 2: Writes.
            write_error_injections = [
                "sc_coordinator_wait_before_acquire_server",
                "sc_coordinator_wait_before_begin_mutate",
                "sc_coordinator_wait_before_add_entry"
            ]
            for error_injection_name in write_error_injections:
                await try_write(error_injection_name)

            # Sanity check: Nothing broke and we can still write to the table.
            await cql.run_async(f"INSERT INTO {table} (pk, v) VALUES (17, 7)")

    # Case 3: Waiting for the leader during a write.

    # To trigger the timeout we want, we need to make sure that
    # groups_manager::begin_mutate will want to return need_wait_for_leader,
    # and that it will never succeed. This will do the job.
    # The injection is enabled before the new table exists and is never
    # messaged or disabled in this test.
    await manager.api.enable_injection(s1.ip_addr, "sc_leader_info_updater_wait_before_setting_leader_info", one_shot=True)

    async with new_test_keyspace(manager, "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} AND tablets = {'initial': 1} AND consistency = 'global'") as ks:
        async with new_test_table(manager, ks, "pk int PRIMARY KEY, v int") as table:
            with pytest.raises(WriteTimeout, match=f"Query timed out for {table}"):
                await cql.run_async(f"INSERT INTO {table} (pk, v) VALUES (11, 13) USING TIMEOUT 100ms")


@pytest.mark.asyncio
@pytest.mark.skip_mode(mode="release", reason="error injections are not supported in release mode")
async def test_queries_while_dropping_table(manager: ManagerClient):
    """
    Verify that reads and writes which are already in progress are not
    interrupted by a tablet migration. Dropping the table is used to
    simulate that process.

    Expectations:
    - Reads and writes issued after the drop are rejected with a proper error.
    - In-flight reads and writes are not interrupted by the migration
      (i.e. the Raft methods they call execute successfully).
    - An in-flight read may still eventually notice that the table is gone.
    - An in-flight write that has reached the strongly consistent
      coordinator finishes successfully.
    """

    read_injection = "sc_coordinator_wait_before_query_read_barrier"
    write_injection = "sc_coordinator_wait_before_add_entry"
    both_injections = (read_injection, write_injection)
    ks_opts = "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} AND tablets = {'initial': 1} AND consistency = 'global'"

    s1 = await manager.server_add(config=DEFAULT_CONFIG, cmdline=DEFAULT_CMDLINE)
    cql, _ = await manager.get_ready_cql([s1])

    log = await manager.server_open_log(s1.server_id)
    start_mark = await log.mark()

    async with new_test_keyspace(manager, ks_opts) as ks:
        table_name = "my_table"
        table = f"{ks}.{table_name}"

        await cql.run_async(f"CREATE TABLE {table} (pk int PRIMARY KEY, v int)")
        await cql.run_async(f"INSERT INTO {table} (pk, v) VALUES (0, 13)")

        # Arm both one-shot injections before starting the queries.
        await gather_safely(*[
            manager.api.enable_injection(s1.ip_addr, name, one_shot=True)
            for name in both_injections
        ])

        pending_read = cql.run_async(f"SELECT * FROM {table} WHERE pk = 0")
        pending_write = cql.run_async(f"INSERT INTO {table} (pk, v) VALUES (7, 17)")

        # Both queries must have reached their injection points.
        await asyncio.gather(*[
            asyncio.create_task(log.wait_for(name, from_mark=start_mark))
            for name in both_injections
        ])

        drop_mark = await log.mark()

        await cql.run_async(f"DROP TABLE {table}")

        # Sanity check: The table was really dropped and we can no longer read or write to it.
        with pytest.raises(InvalidRequest, match="unconfigured table"):
            await cql.run_async(f"SELECT * FROM {table} WHERE pk = 0")
        with pytest.raises(InvalidRequest, match="unconfigured table"):
            await cql.run_async(f"INSERT INTO {table} (pk, v) VALUES (13, 11)")

        # Wait for the Raft group to be removed. This should happen almost immediately
        # after dropping the table, but let's avoid any potential cause of flakiness.
        await log.wait_for(r"schedule_raft_group_deletion\(\): group id \S+: starting", from_mark=drop_mark)

        # Resume the paused read and write.
        await asyncio.gather(*[
            asyncio.create_task(manager.api.message_injection(s1.ip_addr, name))
            for name in both_injections
        ])

        # Make sure that the read noticed that the table had been dropped.
        with pytest.raises(Exception, match="Can't find a column family"):
            await pending_read
        # The groups manager waits for all ongoing queries to finish before
        # it aborts the Raft server. Thanks to this, the write will succeed.
        # No matter if we get a `no_such_column_family` exception when later
        # applying the mutation in the state machine or not, there will be
        # no exception thrown, so the write will still look like a success.
        # And that's the desired behavior.
        await pending_write


@pytest.mark.asyncio
@pytest.mark.skip_mode(mode="release", reason="error injections are not supported in release mode")
async def test_queries_when_shutting_down(manager: ManagerClient):
    """
    A simple test verifying that pending reads and writes are canceled
    when the node starts shutting down.

    The potential cause of the hanging we're testing here comes from being
    stuck at a Raft operation when reading from strongly consistent table.

    Outline: a read and a write are held on the group leader by error
    injections, a graceful stop of the leader is started, and once every
    shard has logged "generic_server::shutdown completed" the injections are
    released. The test then waits for the "request timed out" log lines of
    both operations and for the stop to finish.
    """

    smp = 2
    config = DEFAULT_CONFIG | {"request_timeout_on_shutdown_in_seconds": 1}
    cmdline = DEFAULT_CMDLINE + [
        f"--smp={smp}",
        "--logger-log-level", "cql_server=debug",
        "--logger-log-level", "sc_coordinator=trace"
    ]

    servers = await manager.servers_add(3, config=config, cmdline=cmdline, auto_rack_dc="dc1")
    cql, hosts = await manager.get_ready_cql(servers)

    async def pick_leader_info(host_id: HostID) -> Tuple[Host, ServerInfo]:
        # Find the (driver host, server info) pair whose host id equals `host_id`.
        # `hosts` and `servers` are iterated in lockstep.
        for host, server in zip(hosts, servers):
            srv_host_id = await manager.get_host_id(server.server_id)
            if srv_host_id == host_id:
                return (host, server)
        raise RuntimeError(f"Can't find host for host_id {host_id}")

    async def get_leader(keyspace: str, table: str) -> Tuple[Host, ServerInfo]:
        # Resolve the Raft group of the table, wait until it has a leader and
        # return that leader as a (driver host, server info) pair.
        logger.info("Select raft group id for the tablet")
        group_id = await get_table_raft_group_id(manager, keyspace, table)

        logger.info(f"Get current leader for the group {group_id}")
        leader_host_id = await wait_for_leader(manager, servers[0], group_id)

        logger.info(f"Leader of group {group_id} is {leader_host_id}")

        leader_host, leader_info = await pick_leader_info(leader_host_id)
        logger.info(f"Further information on leader of group {group_id}: server_id={leader_info.server_id}, ip={leader_info.ip_addr}")

        return (leader_host, leader_info)

    # Injections used to hold the read and the write in flight on the leader.
    prevent_read_injection = "sc_coordinator_wait_before_query_read_barrier"
    prevent_write_injection = "sc_coordinator_wait_before_add_entry"

    async with new_test_keyspace(manager, "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 3} AND tablets = {'initial': 1} AND consistency = 'global'") as ks:
        async with new_test_table(manager, ks, "pk int PRIMARY KEY, v int") as table:
            # `table` is "<keyspace>.<table>"; only the table part is needed here.
            _, table_name = table.split(".")
            leader_host, leader_info = await get_leader(ks, table_name)

            log = await manager.server_open_log(leader_info.server_id)
            mark = await log.mark()

            await asyncio.gather(*[
                asyncio.create_task(manager.api.enable_injection(leader_info.ip_addr, prevent_read_injection, one_shot=True)),
                asyncio.create_task(manager.api.enable_injection(leader_info.ip_addr, prevent_write_injection, one_shot=True))
            ])

            # Both queries are sent straight to the leader and are not awaited yet.
            read_fut = cql.run_async(f"SELECT * FROM {table} WHERE pk = 0", host=leader_host)
            write_fut = cql.run_async(f"INSERT INTO {table} (pk, v) VALUES (0, 13)", host=leader_host)

            # Wait until the log shows that both injections were reached.
            await asyncio.gather(*[
                asyncio.create_task(log.wait_for(prevent_read_injection, from_mark=mark)),
                asyncio.create_task(log.wait_for(prevent_write_injection, from_mark=mark))
            ])

            mark = await log.mark()
            # Start the graceful stop in the background; it is awaited at the very end.
            stop_fut = asyncio.create_task(manager.server_stop_gracefully(leader_info.server_id))

            delta = 0.1
            timeout = 60

            # We don't know which shard coordinates the request,
            # so we need to wait for all of them
            # NOTE: `timeout` bounds the sleep interval `delta` (which doubles on
            # every iteration), not the total time spent in this loop.
            while delta < timeout:
                matches = await log.grep(r"generic_server::shutdown completed", from_mark=mark)
                if len(matches) >= smp:
                    break
                logger.debug(f"Grepped matches={matches}")
                await asyncio.sleep(delta)
                delta *= 2
            if delta >= timeout:
                pytest.fail("Exceeded timeout")

            mark = await log.mark()

            # Release both held operations now that shutdown has progressed.
            await asyncio.gather(*[
                asyncio.create_task(manager.api.message_injection(leader_info.ip_addr, prevent_read_injection)),
                asyncio.create_task(manager.api.message_injection(leader_info.ip_addr, prevent_write_injection))
            ])

            # Both the write path (mutate) and the read path (query) must log a timeout.
            await asyncio.gather(*[
                asyncio.create_task(log.wait_for(rf"mutate\(\): request timed out with error .*, table {table}", from_mark=mark)),
                asyncio.create_task(log.wait_for(rf"query\(\): request timed out with error .*, table {table}", from_mark=mark))
            ])

            # We cannot predict what the result of the query is going to be.
            # Technically, we could ensure either outcome, but it requires
            # playing with more error injections. We don't really care about
            # the result (the test just wants to make sure we stop the node
            # fast enough), so let's just log it and call it a day.
            try:
                await read_fut
                logger.debug("The read ended successfully")
            except Exception as e:
                logger.debug(f"The read ended in an exception: {e}")
                # NOTE(review): this `pass` is redundant after the log call.
                pass
            try:
                await write_fut
                logger.debug("The write ended successfully")
            except Exception as e:
                logger.debug(f"The write ended in an exception: {e}")
                # NOTE(review): this `pass` is redundant after the log call.
                pass

            await stop_fut


@pytest.mark.asyncio
@pytest.mark.skip_mode(mode="release", reason="error injections are not supported in release mode")
async def test_abort_forwarded_write_upon_shutdown(manager: ManagerClient):
    """
    Test verifying that forwarded writes to strongly consistent tables are
    aborted upon the shutdown of the target replica.

    Outline:
    1. Start three nodes and create a single-tablet table in a keyspace
       with consistency = 'global'.
    2. Enable a one-shot error injection on the leader of the table's Raft
       group and send an INSERT through a node that is NOT the leader.
    3. Once the injection name shows up in the leader's log, start stopping
       the leader gracefully in the background.
    4. Wait until the leader has logged "generic_server::shutdown completed"
       at least `smp` times.
    5. Message the injection and wait for the leader to log that mutate()
       timed out for the table.
    6. Await the write (its outcome is only logged) and the stop task.
    """

    smp = 2
    config = DEFAULT_CONFIG | {"request_timeout_on_shutdown_in_seconds": 1}
    cmdline = DEFAULT_CMDLINE + [
        f"--smp={smp}",
        "--logger-log-level", "cql_server=debug",
        "--logger-log-level", "sc_coordinator=trace"
    ]

    servers = await manager.servers_add(3, config=config, cmdline=cmdline, auto_rack_dc="dc1")
    cql, hosts = await manager.get_ready_cql(servers)

    async def pick_leader_info(host_id: HostID) -> Tuple[Host, ServerInfo]:
        """Return the (driver host, server info) pair whose host ID equals `host_id`."""
        for host, server in zip(hosts, servers):
            srv_host_id = await manager.get_host_id(server.server_id)
            if srv_host_id == host_id:
                return (host, server)
        raise RuntimeError(f"Can't find host for host_id {host_id}")

    async def get_leader(keyspace: str, table: str) -> Tuple[Host, ServerInfo]:
        """Return the (driver host, server info) pair of the leader of the table's Raft group."""
        logger.info("Select raft group id for the tablet")
        group_id = await get_table_raft_group_id(manager, keyspace, table)

        logger.info(f"Get current leader for the group {group_id}")
        leader_host_id = await wait_for_leader(manager, servers[0], group_id)

        logger.info(f"Leader of group {group_id} is {leader_host_id}")

        leader_host, leader_info = await pick_leader_info(leader_host_id)
        logger.info(f"Further information on leader of group {group_id}: server_id={leader_info.server_id}, ip={leader_info.ip_addr}")

        return (leader_host, leader_info)

    prevent_write_injection = "sc_coordinator_wait_before_add_entry"

    async with new_test_keyspace(manager, "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 3} AND tablets = {'initial': 1} AND consistency = 'global'") as ks:
        async with new_test_table(manager, ks, "pk int PRIMARY KEY, v int") as table:
            _, table_name = table.split(".")
            leader_host, leader_info = await get_leader(ks, table_name)

            nonleader_host, nonleader_info = next((host, server) for host, server in zip(hosts, servers) if host != leader_host)
            logger.debug(f"Targetting host={nonleader_host}: server_id={nonleader_info.server_id}, ip={nonleader_info.ip_addr}")

            log = await manager.server_open_log(leader_info.server_id)
            mark = await log.mark()

            await manager.api.enable_injection(leader_info.ip_addr, prevent_write_injection, one_shot=True)

            # Send the mutation to a NON-leader.
            write_fut = cql.run_async(f"INSERT INTO {table} (pk, v) VALUES (0, 13)", host=nonleader_host)

            # The injection name appearing in the leader's log means the
            # injection point has been reached on the leader.
            await log.wait_for(prevent_write_injection, from_mark=mark)
            mark = await log.mark()

            # Stop the leader in the background: the test still has to
            # message the injection while the shutdown is in progress.
            stop_fut = asyncio.create_task(manager.server_stop_gracefully(leader_info.server_id))

            delta = 0.1
            timeout = 60

            # We don't know which shard coordinates the request,
            # so we need to wait for all of them.
            #
            # Note that `timeout` bounds the (exponentially growing) sleep
            # interval rather than the total elapsed time.
            while delta < timeout:
                matches = await log.grep(r"generic_server::shutdown completed", from_mark=mark)
                if len(matches) >= smp:
                    break
                logger.debug(f"Grepped matches={matches}")
                await asyncio.sleep(delta)
                delta *= 2
            else:
                # The loop ran out of attempts without hitting `break`.
                pytest.fail("Exceeded timeout")

            mark = await log.mark()

            await manager.api.message_injection(leader_info.ip_addr, prevent_write_injection)
            # Only consider log output produced after the injection was
            # released, so that no earlier message can satisfy the wait.
            await log.wait_for(rf"mutate\(\): request timed out with error .*, table {table}", from_mark=mark)

            # It doesn't matter what the outcome is, but let's await the future
            # (as we should) and log the result.
            try:
                await write_fut
                logger.debug("The write ended successfully")
            except Exception as e:
                logger.debug(f"The write ended in an exception: {e}")

            await stop_fut


@pytest.mark.asyncio
@pytest.mark.skip_bug(reason="SCYLLADB-1056")
@pytest.mark.skip_mode(mode="release", reason="error injections are not supported in release mode")
async def test_abort_state_machine_apply_after_dropping_table(manager: ManagerClient):
    """
    This test verifies that ongoing executions of state_machine::apply are
    aborted when their corresponding Raft group is being removed. We test
    that by dropping the table, but it should also correspond to other cases
    like tablet migration.

    For a similar scenario during a node shutdown, see test_abort_state_machine_apply_during_shutdown.

    Outline:
    1. Start two nodes; the second one (the target) is started with the
       "avoid_being_raft_leader" injection.
    2. Create a single-tablet strongly consistent table and assert that the
       leader of its Raft group is not the target.
    3. Enable two one-shot injections on the target, write a row through
       the other node and wait for the target to log the apply injection.
    4. Drop the table, wait for the target to log that it started aborting
       the group's Raft server, then message the apply injection.
    5. Wait for the target to log that apply() was aborted for the group.
    """

    cmdline = DEFAULT_CMDLINE + ["--logger-log-level", "sc_state_machine=debug:raft=debug"]
    leader_server = await manager.server_add(config=DEFAULT_CONFIG, cmdline=cmdline,
                                             property_file={"dc": "dc1", "rack": "rack1"})

    # We want to prevent target_server from becoming the leader of either group0
    # or the strongly consistent Raft group so it might not have the latest
    # schema version and is forced to perform a read barrier.
    config = DEFAULT_CONFIG | {"error_injections_at_startup": ["avoid_being_raft_leader"]}
    target_server = await manager.server_add(config=config, cmdline=cmdline, property_file={"dc": "dc1", "rack": "rack2"})

    cql, [leader_host, _] = await manager.get_ready_cql([leader_server, target_server])

    # Only the target's host ID is needed: the actual leader of the table's
    # Raft group is obtained from wait_for_leader() below.
    target_host_id = await manager.get_host_id(target_server.server_id)

    wait_before_apply_injection = "strong_consistency_state_machine_wait_before_apply"

    async with new_test_keyspace(manager, "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 2} AND tablets = {'initial': 1} AND consistency = 'global'") as ks:
        table_name = "my_table"
        table = f"{ks}.{table_name}"

        await cql.run_async(f"CREATE TABLE {table} (pk int PRIMARY KEY, v int)")

        group_id = await get_table_raft_group_id(manager, ks, table_name)
        leader_host_id = await wait_for_leader(manager, leader_server, group_id)
        # Sanity check: the rest of the test relies on the target not being
        # the leader of the table's Raft group.
        assert leader_host_id != target_host_id

        await gather_safely(*[
            manager.api.enable_injection(target_server.ip_addr, wait_before_apply_injection, one_shot=True),
            manager.api.enable_injection(target_server.ip_addr, "sc_state_machine_return_empty_schema", one_shot=True)
        ])

        log = await manager.server_open_log(target_server.server_id)
        mark = await log.mark()

        # We won't wait for the follower to apply the state, so we can await this right away.
        await cql.run_async(f"INSERT INTO {table} (pk, v) VALUES (0, 13)", host=leader_host)

        # The injection name appearing in the target's log means the
        # injection point has been reached there.
        await log.wait_for(wait_before_apply_injection, from_mark=mark)
        mark = await log.mark()

        await cql.run_async(f"DROP TABLE {table}")
        # Wait until the Raft group has started being removed.
        await log.wait_for(rf"schedule_raft_group_deletion\(\): starting aborting raft server for group id {group_id}", from_mark=mark)
        mark = await log.mark()

        # At this point, the Raft server should already be getting aborted,
        # so we can resume state_machine::apply.
        await manager.api.message_injection(target_server.ip_addr, wait_before_apply_injection)
        # Verify that state_machine::apply was really aborted.
        await log.wait_for(rf"apply\(\): execution for tablet \S+, group_id={group_id} aborted", from_mark=mark)


@pytest.mark.asyncio
@pytest.mark.skip_bug(reason="SCYLLADB-1056")
@pytest.mark.skip_mode(mode="release", reason="error injections are not supported in release mode")
async def test_abort_state_machine_apply_during_shutdown(manager: ManagerClient):
    """
    This test verifies that ongoing executions of state_machine::apply are
    aborted when a node is shutting down.

    For a similar scenario after dropping a table, see test_abort_state_machine_apply_after_dropping_table.

    Outline:
    1. Start two nodes; the second one (the target) is started with the
       "avoid_being_raft_leader" injection.
    2. Create a single-tablet strongly consistent table and assert that the
       leader of its Raft group is not the target.
    3. Enable two one-shot injections on the target, write a row through
       the other node and wait for the target to log the apply injection.
    4. Start stopping the target gracefully in the background, wait for it
       to log that it started aborting the group's Raft server, then
       message the apply injection.
    5. Wait for the target to log that apply() was aborted for the group
       and for the stop to finish.
    6. Restart the target and check that the written row can be read
       through it.
    """

    cmdline = DEFAULT_CMDLINE + ["--logger-log-level", "sc_state_machine=debug:raft=debug"]
    leader_server = await manager.server_add(config=DEFAULT_CONFIG, cmdline=cmdline,
                                             property_file={"dc": "dc1", "rack": "rack1"})

    # We want to prevent target_server from becoming the leader of either group0
    # or the strongly consistent Raft group so it might not have the latest
    # schema version and is forced to perform a read barrier.
    config = DEFAULT_CONFIG | {"error_injections_at_startup": ["avoid_being_raft_leader"]}
    target_server = await manager.server_add(config=config, cmdline=cmdline, property_file={"dc": "dc1", "rack": "rack2"})

    # The target's driver host is not needed until after the restart, where
    # it is obtained afresh.
    cql, [leader_host, _] = await manager.get_ready_cql([leader_server, target_server])

    # Only the target's host ID is needed: the actual leader of the table's
    # Raft group is obtained from wait_for_leader() below.
    target_host_id = await manager.get_host_id(target_server.server_id)

    wait_before_apply_injection = "strong_consistency_state_machine_wait_before_apply"

    async with new_test_keyspace(manager, "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 2} AND tablets = {'initial': 1} AND consistency = 'global'") as ks:
        table_name = "my_table"
        table = f"{ks}.{table_name}"

        await cql.run_async(f"CREATE TABLE {table} (pk int PRIMARY KEY, v int)")

        group_id = await get_table_raft_group_id(manager, ks, table_name)
        leader_host_id = await wait_for_leader(manager, leader_server, group_id)
        # Sanity check: the rest of the test relies on the target not being
        # the leader of the table's Raft group.
        assert leader_host_id != target_host_id

        await gather_safely(*[
            manager.api.enable_injection(target_server.ip_addr, wait_before_apply_injection, one_shot=True),
            manager.api.enable_injection(target_server.ip_addr, "sc_state_machine_return_empty_schema", one_shot=True)
        ])

        log = await manager.server_open_log(target_server.server_id)
        mark = await log.mark()

        # We won't wait for the follower to apply the state, so we can await this right away.
        await cql.run_async(f"INSERT INTO {table} (pk, v) VALUES (0, 13)", host=leader_host)

        # The injection name appearing in the target's log means the
        # injection point has been reached there.
        await log.wait_for(wait_before_apply_injection, from_mark=mark)
        mark = await log.mark()

        # Stop the target in the background: the test still has to message
        # the injection while the shutdown is in progress.
        stop_task = asyncio.create_task(manager.server_stop_gracefully(target_server.server_id))
        # Wait until the Raft group has started being removed.
        await log.wait_for(rf"schedule_raft_group_deletion\(\): starting aborting raft server for group id {group_id}", from_mark=mark)
        mark = await log.mark()

        # At this point, the Raft server should already be getting aborted,
        # so we can resume state_machine::apply.
        await manager.api.message_injection(target_server.ip_addr, wait_before_apply_injection)
        # Verify that state_machine::apply was really aborted.
        await log.wait_for(rf"apply\(\): execution for tablet \S+, group_id={group_id} aborted", from_mark=mark)

        # The test framework should verify that we haven't observed any errors
        # during the stopping procedure.
        await stop_task

        await manager.server_start(target_server.server_id)
        cql, [target_host] = await manager.get_ready_cql([target_server])

        # The row written before the shutdown must be readable through the
        # restarted node.
        rows = await cql.run_async(f"SELECT * FROM {table} WHERE pk = 0", host=target_host)
        assert len(rows) == 1
        assert rows[0].v == 13