From abfa4d0272e68d1f0fd8a5592fc2c071c05ba5f6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Botond=20D=C3=A9nes?= Date: Tue, 24 Mar 2026 21:09:19 +0200 Subject: [PATCH] =?UTF-8?q?Merge=20'test:=20cluster:=20Deflake=20test=5Fst?= =?UTF-8?q?artup=5Fwith=5Fkeyspaces=5Fviolating=5Frf=5Frack=5Fvalid=5Fkeys?= =?UTF-8?q?paces'=20from=20Dawid=20M=C4=99drek?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The test was flaky. The scenario looked like this: 1. Stop server 1. 2. Set its rf_rack_valid_keyspaces configuration option to true. 3. Create an RF-rack-invalid keyspace. 4. Start server 1 and expect a failure during start-up. It was wrong. We cannot predict when the Raft mutation corresponding to the newly created keyspace will arrive at the node or when it will be processed. If the check of the RF-rack-valid keyspaces we perform at start-up is done before that, it won't include the keyspace. This will lead to a test failure. Unfortunately, it's not feasible to perform a read barrier during start-up. What's more, although it would help the test, it wouldn't be useful otherwise. Because of that, we simply fix the test, at least for now. The new scenario looks like this: 1. Disable the rf_rack_valid_keyspaces configuration option on server 1. 2. Start the server. 3. Create an RF-rack-invalid keyspace. 4. Perform a read barrier on server 1. This will ensure that it has observed all Raft mutations, and we won't run into the same problem. 5. Stop the node. 6. Set its rf_rack_valid_keyspaces configuration option to true. 7. Try to start the node and observe a failure. This will make the test behave consistently. --- I ran the test (in dev mode, on my local machine) three times before these changes, and three times with them. I include the time results below. 
Before: ``` real 0m47.570s user 0m41.631s sys 0m8.634s real 0m50.495s user 0m42.499s sys 0m8.607s real 0m50.375s user 0m41.832s sys 0m8.789s ``` After: ``` real 0m50.509s user 0m43.535s sys 0m9.715s real 0m50.857s user 0m44.185s sys 0m9.811s real 0m50.873s user 0m44.289s sys 0m9.737s ``` Fixes SCYLLADB-1137 Backport: The test is present on all supported branches, and so we should backport these changes to them. Closes scylladb/scylladb#29218 * github.com:scylladb/scylladb: test: cluster: Deflake test_startup_with_keyspaces_violating_rf_rack_valid_keyspaces test: cluster: Mark test with @pytest.mark.asyncio in test_multidc.py (cherry picked from commit d52fbf7adafc2fb6cfb13acbe729ac4557a0ecc7) Closes scylladb/scylladb#29247 --- test/cluster/test_multidc.py | 40 +++++++++++++++++++++++++++++++----- 1 file changed, 35 insertions(+), 5 deletions(-) diff --git a/test/cluster/test_multidc.py b/test/cluster/test_multidc.py index 5b57c345cd..c361cbce2c 100644 --- a/test/cluster/test_multidc.py +++ b/test/cluster/test_multidc.py @@ -20,6 +20,7 @@ from cassandra.query import SimpleStatement from test.pylib.async_cql import _wrap_future from test.pylib.manager_client import ManagerClient from test.pylib.random_tables import RandomTables, TextType, Column +from test.pylib.rest_client import read_barrier from test.pylib.util import unique_name from test.cluster.conftest import cluster_con @@ -403,6 +404,7 @@ async def test_arbiter_dc_rf_rack_valid_keyspaces(manager: ManagerClient): for task in [*valid_keyspaces, *invalid_keyspaces]: _ = tg.create_task(task) +@pytest.mark.asyncio async def test_startup_with_keyspaces_violating_rf_rack_valid_keyspaces(manager: ManagerClient): """ This test verifies that starting a Scylla node fails when there's an RF-rack-invalid keyspace. 
@@ -464,22 +466,50 @@ async def test_startup_with_keyspaces_violating_rf_rack_valid_keyspaces(manager: for rfs, tablets in valid_keyspaces: _ = tg.create_task(create_keyspace(rfs, tablets)) - await manager.server_stop_gracefully(s1.server_id) - await manager.server_update_config(s1.server_id, "rf_rack_valid_keyspaces", "true") - + # Precondition: s1 has rf_rack_valid_keyspaces set to false. + # Postcondition: s1 still has rf_rack_valid_keyspaces set to false. async def try_fail(rfs: List[int], dc: str, rf: int, rack_count: int): + running_servers = await manager.running_servers() + should_start = s1.server_id not in [server.server_id for server in running_servers] + if should_start: + await manager.server_start(s1.server_id) + ks = await create_keyspace(rfs, True) + # We need to wait for the new schema to propagate. + # Otherwise, it's not clear when the mutation + # corresponding to the created keyspace will + # arrive at server 1. + # It could happen only after the node performs + # the check upon start-up, effectively leading + # to a successful start-up, which we don't want. + # For more context, see issue: SCYLLADB-1137. + await read_barrier(manager.api, s1.ip_addr) + + await manager.server_stop_gracefully(s1.server_id) + await manager.server_update_config(s1.server_id, "rf_rack_valid_keyspaces", "true") + err = f"The keyspace '{ks}' is required to be RF-rack-valid. " \ f"That condition is violated for DC '{dc}': RF={rf} vs. rack count={rack_count}." - _ = await manager.server_start(s1.server_id, expected_error=err) + await manager.server_start(s1.server_id, expected_error=err) await cql.run_async(f"DROP KEYSPACE {ks}") + await manager.server_update_config(s1.server_id, "rf_rack_valid_keyspaces", "false") + # Test RF-rack-invalid keyspaces. 
await try_fail([2, 0], "dc1", 2, 3) await try_fail([3, 2], "dc2", 2, 1) await try_fail([4, 1], "dc1", 4, 3) - _ = await manager.server_start(s1.server_id) + # We need to perform a read barrier on the node to make + # sure that it processes the last DROP KEYSPACE. + # Otherwise, the node could think the RF-rack-invalid + # keyspace still exists. + await manager.server_start(s1.server_id) + await read_barrier(manager.api, s1.ip_addr) + await manager.server_stop_gracefully(s1.server_id) + + await manager.server_update_config(s1.server_id, "rf_rack_valid_keyspaces", "true") + await manager.server_start(s1.server_id) @pytest.mark.asyncio async def test_startup_with_keyspaces_violating_rf_rack_valid_keyspaces_but_not_enforced(manager: ManagerClient):