Merge 'test: test_topology_ops: fix flakiness and reenable bg writes' from Patryk Jędrzejczak

We decrease the server's request timeouts in topology tests so that they are lower than the driver's timeout. Previously, the driver could time out its request before the server handled it successfully. This problem caused scylladb/scylladb#15924.

Since scylladb/scylladb#15924 is the last issue mentioned in scylladb/scylladb#15962, this PR also reenables background writes in `test_topology_ops` with tablets disabled. The test doesn't pass with tablets and background writes because of scylladb/scylladb#17025. We will reenable background writes with tablets after fixing that issue.

Fixes scylladb/scylladb#15924
Fixes scylladb/scylladb#15962

Closes scylladb/scylladb#17585

* github.com:scylladb/scylladb:
  test: test_topology_ops: reenable background writes without tablets
  test: test_topology_ops: run with and without tablets
  test: topology: decrease the server's request timeouts
2026-05-13 11:22:01 +00:00 · 2024-03-04 20:57:24 +01:00
parent f1d9248df9 e7d4e080e9
commit 0a7854ea4d
3 changed files with 36 additions and 25 deletions
--- a/test.py
+++ b/test.py
@@ -429,6 +429,7 @@ class PythonTestSuite(TestSuite):
                             create_cfg.config_from_test

            server = ScyllaServer(
+                mode=self.mode,
                exe=self.scylla_exe,
                vardir=os.path.join(self.options.tmpdir, self.mode),
                logger=create_cfg.logger,
--- a/test/pylib/scylla_cluster.py
+++ b/test/pylib/scylla_cluster.py
@@ -53,7 +53,16 @@ class ReplaceConfig(NamedTuple):
    wait_replaced_dead: bool = True


-def make_scylla_conf(workdir: pathlib.Path, host_addr: str, seed_addrs: List[str], cluster_name: str) -> dict[str, object]:
+def make_scylla_conf(mode: str, workdir: pathlib.Path, host_addr: str, seed_addrs: List[str], cluster_name: str) -> dict[str, object]:
+    # We significantly increase default timeouts to allow running tests on a very slow
+    # setup (but without network losses). These timeouts can impact the running time of
+    # topology tests. For example, the barrier_and_drain topology command waits until
+    # background writes' handlers time out. We don't want to slow down tests for no
+    # reason, so we increase the timeouts according to each mode's needs. The client
+    # should avoid timing out its requests before the server times out - for this reason
+    # we increase the CQL driver's client-side timeout in conftest.py.
+    request_timeout_in_ms = 180000 if mode in {'debug', 'sanitize'} else 30000
+
    return {
        'cluster_name': cluster_name,
        'workdir': str(workdir.resolve()),
@@ -86,19 +95,13 @@ def make_scylla_conf(workdir: pathlib.Path, host_addr: str, seed_addrs: List[str
        'flush_schema_tables_after_modification': False,
        'auto_snapshot': False,

-        # Significantly increase default timeouts to allow running tests
-        # on a very slow setup (but without network losses). Note that these
-        # are server-side timeouts: The client should also avoid timing out
-        # its own requests - for this reason we increase the CQL driver's
-        # client-side timeout in conftest.py.
-
-        'range_request_timeout_in_ms': 300000,
-        'read_request_timeout_in_ms': 300000,
-        'counter_write_request_timeout_in_ms': 300000,
-        'cas_contention_timeout_in_ms': 300000,
-        'truncate_request_timeout_in_ms': 300000,
-        'write_request_timeout_in_ms': 300000,
-        'request_timeout_in_ms': 300000,
+        'range_request_timeout_in_ms': request_timeout_in_ms,
+        'read_request_timeout_in_ms': request_timeout_in_ms,
+        'counter_write_request_timeout_in_ms': request_timeout_in_ms,
+        'cas_contention_timeout_in_ms': request_timeout_in_ms,
+        'truncate_request_timeout_in_ms': request_timeout_in_ms,
+        'write_request_timeout_in_ms': request_timeout_in_ms,
+        'request_timeout_in_ms': request_timeout_in_ms,
        'user_defined_function_time_limit_ms': 1000,

        'strict_allow_filtering': True,
@@ -209,7 +212,7 @@ class ScyllaServer:
    host_id: HostID                             # Host id (UUID)
    newid = itertools.count(start=1).__next__   # Sequential unique id

-    def __init__(self, exe: str, vardir: str,
+    def __init__(self, mode: str, exe: str, vardir: str,
                 logger: Union[logging.Logger, logging.LoggerAdapter],
                 cluster_name: str, ip_addr: str, seeds: List[str],
                 cmdline_options: List[str],
@@ -236,6 +239,7 @@ class ScyllaServer:
        self.property_filename = self.workdir / "conf/cassandra-rackdc.properties"
        # Sum of basic server configuration and the user-provided config options.
        self.config = make_scylla_conf(
+                mode = mode,
                workdir = self.workdir,
                host_addr = self.ip_addr,
                seed_addrs = self.seeds,
--- a/test/topology_experimental_raft/test_topology_ops.py
+++ b/test/topology_experimental_raft/test_topology_ops.py
@@ -21,23 +21,28 @@ logger = logging.getLogger(__name__)


@pytest.mark.asyncio
-async def test_topology_ops(request, manager: ManagerClient):
+@pytest.mark.parametrize("tablets_enabled", ["true", "false"])
+async def test_topology_ops(request, manager: ManagerClient, tablets_enabled: bool):
    """Test basic topology operations using the topology coordinator."""
+    cfg = {'experimental_features': ['consistent-topology-changes']}
+    if tablets_enabled:
+        cfg['experimental_features'].append('tablets')
+
    logger.info("Bootstrapping first node")
-    servers = [await manager.server_add()]
+    servers = [await manager.server_add(config=cfg)]

    logger.info(f"Restarting node {servers[0]}")
    await manager.server_stop_gracefully(servers[0].server_id)
    await manager.server_start(servers[0].server_id)

-    await wait_for_cql_and_get_hosts(manager.cql, await manager.running_servers(), time.time() + 60)
-    cql = await reconnect_driver(manager)
-    # FIXME: disabled as a workaround for #15935, #15924
-    # We need to re-enable once these issues are fixed.
-    #finish_writes = await start_writes(cql)
-
    logger.info("Bootstrapping other nodes")
-    servers += await manager.servers_add(3)
+    servers += await manager.servers_add(3, config=cfg)
+
+    await wait_for_cql_and_get_hosts(manager.cql, servers, time.time() + 60)
+    cql = await reconnect_driver(manager)
+    # FIXME: we disable background writes with tablets enabled because the test fails due to #17025.
+    # We need to re-enable them once this issue is fixed (by removing `if` here and above `await finish_writes()`).
+    finish_writes = await start_writes(cql) if not tablets_enabled else None

    logger.info(f"Decommissioning node {servers[0]}")
    await manager.decommission_node(servers[0].server_id)
@@ -67,7 +72,8 @@ async def test_topology_ops(request, manager: ManagerClient):
    servers = servers[1:]

    logger.info("Checking results of the background writes")
-    #await finish_writes()
+    if not tablets_enabled:
+        await finish_writes()

    for server in servers:
        await check_node_log_for_failed_mutations(manager, server)