From 7b60752e47292fa244548a7709fcf937d85d18cf Mon Sep 17 00:00:00 2001 From: Marcin Maliszkiewicz Date: Wed, 13 Mar 2024 08:39:03 +0100 Subject: [PATCH] test: fix cql connection problem in test_auth_raft_command_split This is a speculative fix as the problem is observed only on CI. When run_async is called right after driver_connect and get_cql it fails with ConnectionException('Host has been marked down or removed'). If the approach proves to be succesfull we can start to deprecate base get_cql in favor of get_ready_cql. It's better to have robust testing helper libraries than try to take care of it in every test case separately. Fixes #17713 Closes scylladb/scylladb#17772 --- test/auth_cluster/test_auth_raft_command_split.py | 10 ++++------ test/pylib/manager_client.py | 13 +++++++++++-- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/test/auth_cluster/test_auth_raft_command_split.py b/test/auth_cluster/test_auth_raft_command_split.py index 25ceaa38e4..bc96ad7a5d 100644 --- a/test/auth_cluster/test_auth_raft_command_split.py +++ b/test/auth_cluster/test_auth_raft_command_split.py @@ -5,7 +5,6 @@ # import asyncio -import time from test.pylib.manager_client import ManagerClient import pytest from test.pylib.rest_client import inject_error, inject_error_one_shot @@ -18,10 +17,7 @@ Tests case when bigger auth operation is split into multiple raft commands. @pytest.mark.asyncio async def test_auth_raft_command_split(manager: ManagerClient) -> None: servers = await manager.servers_add(3) - - cql = manager.get_cql() - hosts = await wait_for_cql_and_get_hosts(cql, servers, time.time() + 60) - await manager.servers_see_each_other(servers) + cql, hosts = await manager.get_ready_cql(servers) initial_perms = await cql.run_async("SELECT * FROM system_auth_v2.role_permissions") @@ -37,11 +33,13 @@ async def test_auth_raft_command_split(manager: ManagerClient) -> None: # this will trigger cascade of deletes which should be packed # into raft commands in a way that none exceeds max_command_size await manager.driver_connect(server=servers[0]) - cql = manager.get_cql() + cql, _ = await manager.get_ready_cql([servers[0]]) async with inject_error(manager.api, servers[0].ip_addr, 'auth_announce_mutations_command_max_size'): await cql.run_async(f"DROP ROLE IF EXISTS {shared_role}", execution_profile='whitelist') + cql, hosts = await manager.get_ready_cql(servers) + # auth reads are eventually consistent so we need to sync all nodes await asyncio.gather(*(read_barrier(cql, host) for host in hosts)) diff --git a/test/pylib/manager_client.py b/test/pylib/manager_client.py index dfd1a9be61..9401999c34 100644 --- a/test/pylib/manager_client.py +++ b/test/pylib/manager_client.py @@ -15,8 +15,7 @@ from time import time import logging from test.pylib.log_browsing import ScyllaLogFile from test.pylib.rest_client import UnixRESTClient, ScyllaRESTAPIClient, ScyllaMetricsClient -from test.pylib.util import wait_for -from test.pylib.util import wait_for_cql_and_get_hosts +from test.pylib.util import wait_for, wait_for_cql_and_get_hosts, Host from test.pylib.internal_types import ServerNum, IPAddress, HostID, ServerInfo from test.pylib.scylla_cluster import ReplaceConfig, ScyllaServer from cassandra.cluster import Session as CassandraSession # type: ignore # pylint: disable=no-name-in-module @@ -81,6 +80,16 @@ class ManagerClient(): assert self.cql return self.cql + # More robust version of get_cql, when topology changes + # or cql statement is executed immediately after driver_connect + # it may fail unless we perform additional readiness checks + async def get_ready_cql(self, servers: List[ServerInfo]) -> tuple[CassandraSession, list[Host]]: + """Precondition: driver is connected""" + cql = self.get_cql() + await self.servers_see_each_other(servers) + hosts = await wait_for_cql_and_get_hosts(cql, servers, time() + 60) + return cql, hosts + # Make driver update endpoints from remote connection def _driver_update(self) -> None: if self.ccluster is not None: