test: fix cql connection problem in test_auth_raft_command_split

This is a speculative fix as the problem is observed only on CI.
When run_async is called right after driver_connect and get_cql
it fails with ConnectionException('Host has been marked down or
removed').

If the approach proves to be successful, we can start to deprecate
the base get_cql in favor of get_ready_cql. It's better to have robust
testing helper libraries than try to take care of it in every test
case separately.

Fixes #17713

Closes scylladb/scylladb#17772
This commit is contained in:
Marcin Maliszkiewicz
2024-03-13 08:39:03 +01:00
committed by Kamil Braun
parent 4d83a8c12c
commit 7b60752e47
2 changed files with 15 additions and 8 deletions

View File

@@ -5,7 +5,6 @@
#
import asyncio
import time
from test.pylib.manager_client import ManagerClient
import pytest
from test.pylib.rest_client import inject_error, inject_error_one_shot
@@ -18,10 +17,7 @@ Tests case when bigger auth operation is split into multiple raft commands.
@pytest.mark.asyncio
async def test_auth_raft_command_split(manager: ManagerClient) -> None:
servers = await manager.servers_add(3)
cql = manager.get_cql()
hosts = await wait_for_cql_and_get_hosts(cql, servers, time.time() + 60)
await manager.servers_see_each_other(servers)
cql, hosts = await manager.get_ready_cql(servers)
initial_perms = await cql.run_async("SELECT * FROM system_auth_v2.role_permissions")
@@ -37,11 +33,13 @@ async def test_auth_raft_command_split(manager: ManagerClient) -> None:
# this will trigger cascade of deletes which should be packed
# into raft commands in a way that none exceeds max_command_size
await manager.driver_connect(server=servers[0])
cql = manager.get_cql()
cql, _ = await manager.get_ready_cql([servers[0]])
async with inject_error(manager.api, servers[0].ip_addr,
'auth_announce_mutations_command_max_size'):
await cql.run_async(f"DROP ROLE IF EXISTS {shared_role}", execution_profile='whitelist')
cql, hosts = await manager.get_ready_cql(servers)
# auth reads are eventually consistent so we need to sync all nodes
await asyncio.gather(*(read_barrier(cql, host) for host in hosts))

View File

@@ -15,8 +15,7 @@ from time import time
import logging
from test.pylib.log_browsing import ScyllaLogFile
from test.pylib.rest_client import UnixRESTClient, ScyllaRESTAPIClient, ScyllaMetricsClient
from test.pylib.util import wait_for
from test.pylib.util import wait_for_cql_and_get_hosts
from test.pylib.util import wait_for, wait_for_cql_and_get_hosts, Host
from test.pylib.internal_types import ServerNum, IPAddress, HostID, ServerInfo
from test.pylib.scylla_cluster import ReplaceConfig, ScyllaServer
from cassandra.cluster import Session as CassandraSession # type: ignore # pylint: disable=no-name-in-module
@@ -81,6 +80,16 @@ class ManagerClient():
assert self.cql
return self.cql
# More robust version of get_cql, when topology changes
# or cql statement is executed immediately after driver_connect
# it may fail unless we perform additional readiness checks
async def get_ready_cql(self, servers: List[ServerInfo]) -> tuple[CassandraSession, list[Host]]:
    """Return a CQL session that is ready to execute statements on *servers*.

    Precondition: driver is connected (e.g. driver_connect was awaited).

    Unlike plain get_cql, this first waits until the given servers see
    each other and until the driver has live Host objects for each of
    them (up to a 60-second deadline), avoiding sporadic
    "Host has been marked down or removed" failures right after
    (re)connecting.

    Returns a tuple of (session, hosts) where hosts are the driver's
    Host objects corresponding to *servers*.
    """
    cql = self.get_cql()
    # Ensure cluster membership has converged before issuing statements.
    await self.servers_see_each_other(servers)
    # Wait (deadline: now + 60s) for the driver to mark all servers up.
    hosts = await wait_for_cql_and_get_hosts(cql, servers, time() + 60)
    return cql, hosts
# Make driver update endpoints from remote connection
def _driver_update(self) -> None:
if self.ccluster is not None: