# Copyright (C) 2025-present ScyllaDB
#
# SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0

from test.cluster.conftest import skip_mode
from test.pylib.manager_client import ManagerClient
from test.pylib.random_tables import RandomTables, Column, IntType
from test.pylib.rest_client import inject_error_one_shot
import asyncio
from datetime import datetime, timedelta
import pytest
import logging

logger = logging.getLogger(__name__)


@skip_mode('release', 'error injections are not supported in release mode')
@pytest.mark.asyncio
@pytest.mark.parametrize("query_type,should_wait_for_timeout,shutdown_nodes", [
    ("SELECT", True, False),
    ("SELECT", True, True),
    ("SELECT_COUNT", False, False),
    ("SELECT_WHERE", True, False),
    ("SELECT_COUNT_WHERE", True, False),
])
async def test_long_query_timeout_erm(request, manager: ManagerClient, query_type, should_wait_for_timeout, shutdown_nodes):
    """
    Verify that a query with a long timeout doesn't block the ERM (effective
    replication map) on failure.

    The test is parametrized ("SELECT * ..." and "SELECT COUNT(*) ...") to cover
    both regular query execution and an aggregate query that goes through the
    mapreduce service. After the initial preparation, the test kills a node
    during the execution of a query with a long timeout. The expectation is that:
    1) Due to the scylladb#21831 fix, the ERM will not be blocked for the
       timeout period.
    2) For non-mapreduce queries, due to the scylladb#3699 fix, the query will
       not finish before the timeout passes.
    3) A mapreduce query will fail promptly with an "Operation failed for ..."
       error, because waiting in the hope of a driver-side speculative retry
       (the scylladb#3699 fix) is not needed for an aggregate query.
    Note that "SELECT COUNT(*) FROM {} WHERE key = 0" is not a mapreduce query.

    One of the test scenarios sets shutdown_nodes=True to verify that nodes can
    be shut down quickly even if the query timeout is extremely long.
    """
    if query_type == "SELECT":
        query = "SELECT * FROM {}"
    elif query_type == "SELECT_COUNT":
        query = "SELECT COUNT(*) FROM {}"
    elif query_type == "SELECT_WHERE":
        query = "SELECT * FROM {} WHERE key = 0"
    elif query_type == "SELECT_COUNT_WHERE":
        query = "SELECT COUNT(*) FROM {} WHERE key = 0"
    else:
        assert False, f"Invalid query type: {query_type}"

    logger.info("Start a four-node cluster")
    # FIXME: Adjust this test to run with `rf_rack_valid_keyspaces` set to `True`.
    servers = await manager.servers_add(4, config={"rf_rack_valid_keyspaces": False})
    selected_server = servers[0]

    logger.info(f"Creating a client with selected_server: {selected_server}")
    cql = await manager.get_cql_exclusive(selected_server)

    logger.info("Create a table")
    random_tables = RandomTables(request.node.name, manager, "ks", replication_factor=3, enable_tablets=True)
    table = await random_tables.add_table(pks=1, columns=[
        Column(name="key", ctype=IntType),
        Column(name="value", ctype=IntType)
    ])

    if "WHERE" in query:
        # For non-ranged queries, ScyllaDB-side speculative retries are disabled
        # to enforce the conditions of the scylladb#3699 fix.
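        # (speculative_retry is a per-table option: with anything other than
        # 'NONE' the coordinator may send a duplicate read to another replica
        # when the first one is slow, which would let the blocked read succeed
        # server-side and bypass the code path this scenario exercises.)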
logger.info("Disabling speculative retries") cql.execute(f"ALTER TABLE {table} WITH speculative_retry = 'NONE'") logger.info(f"Created {table}, write some rows") await asyncio.gather(*[cql.run_async(f"INSERT INTO {table} (key, value) VALUES ({i}, {i})") for i in range(256)]) logger.info("Enable injected errors that stop reads") injected_handlers = {} for server in servers: injected_handlers[server.ip_addr] = await inject_error_one_shot( manager.api, server.ip_addr, 'storage_proxy::handle_read', parameters={'cf_name': table.name}) query_timeout = 60 if shutdown_nodes: query_timeout *= 100 # Will end test by shutting down nodes instead of finishing the query before_query = datetime.now() final_query = query.format(table) logger.info("Starting to execute query={final_query} with query_timeout={query_timeout}") query_future = cql.run_async(f"{final_query} USING TIMEOUT {query_timeout}s", timeout=2*query_timeout) logger.info("Confirm reads are waiting on the injected error") server_to_kill = None for server in servers: if server != selected_server: server_log = await manager.server_open_log(server.server_id) try: await server_log.wait_for("storage_proxy::handle_read injection hit", timeout=5) server_to_kill = server except TimeoutError: # Some nodes might not receive any reads logger.info(f"Node not handling any reads: {server.ip_addr}") logger.info(f"Kill a node: {server_to_kill.ip_addr}") await manager.server_stop(server_to_kill.server_id) logger.info("Unblock reads") for server in servers: if server != server_to_kill: await injected_handlers[server.ip_addr].message() logger.info("Remove the killed node, add a new one - ERM should not be blocked") await manager.server_not_sees_other_server(selected_server.ip_addr, server_to_kill.ip_addr) await manager.remove_node(selected_server.server_id, server_to_kill.server_id) expected_timeout_time = before_query + timedelta(seconds=query_timeout) if datetime.now() > expected_timeout_time: logger.warning("Too much time passed and query already might be timed-out") if shutdown_nodes: for server in servers: await manager.server_stop_gracefully(server.server_id) else: if should_wait_for_timeout: with pytest.raises(Exception, match="Operation time"): await query_future assert datetime.now() > expected_timeout_time else: with pytest.raises(Exception, match="Operation failed for"): await query_future