From abfa4d0272e68d1f0fd8a5592fc2c071c05ba5f6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Botond=20D=C3=A9nes?= Date: Tue, 24 Mar 2026 21:09:19 +0200 Subject: [PATCH] =?UTF-8?q?Merge=20'test:=20cluster:=20Deflake=20test=5Fst?= =?UTF-8?q?artup=5Fwith=5Fkeyspaces=5Fviolating=5Frf=5Frack=5Fvalid=5Fkeys?= =?UTF-8?q?paces'=20from=20Dawid=20M=C4=99drek?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The test was flaky. The scenario looked like this: 1. Stop server 1. 2. Set its rf_rack_valid_keyspaces configuration option to true. 3. Create an RF-rack-invalid keyspace. 4. Start server 1 and expect a failure during start-up. It was wrong. We cannot predict when the Raft mutation corresponding to the newly created keyspace will arrive at the node or when it will be processed. If the check of the RF-rack-valid keyspaces we perform at start-up is done before that, it won't include the keyspace. This will lead to a test failure. Unfortunately, it's not feasible to perform a read barrier during start-up. What's more, although it would help the test, it wouldn't be useful otherwise. Because of that, we simply fix the test, at least for now. The new scenario looks like this: 1. Disable the rf_rack_valid_keyspaces configuration option on server 1. 2. Start the server. 3. Create an RF-rack-invalid keyspace. 4. Perform a read barrier on server 1. This will ensure that it has observed all Raft mutations, and we won't run into the same problem. 5. Stop the node. 6. Set its rf_rack_valid_keyspaces configuration option to true. 7. Try to start the node and observe a failure. This will make the test behave consistently. --- I ran the test (in dev mode, on my local machine) three times before these changes, and three times with them. I include the time results below. 
Before: ``` real 0m47.570s user 0m41.631s sys 0m8.634s real 0m50.495s user 0m42.499s sys 0m8.607s real 0m50.375s user 0m41.832s sys 0m8.789s ``` After: ``` real 0m50.509s user 0m43.535s sys 0m9.715s real 0m50.857s user 0m44.185s sys 0m9.811s real 0m50.873s user 0m44.289s sys 0m9.737s ``` Fixes SCYLLADB-1137 Backport: The test is present on all supported branches, and so we should backport these changes to them. Closes scylladb/scylladb#29218 * github.com:scylladb/scylladb: test: cluster: Deflake test_startup_with_keyspaces_violating_rf_rack_valid_keyspaces test: cluster: Mark test with @pytest.mark.asyncio in test_multidc.py (cherry picked from commit d52fbf7adafc2fb6cfb13acbe729ac4557a0ecc7) Closes scylladb/scylladb#29247 --- test/cluster/test_multidc.py | 40 +++++++++++++++++++++++++++++++----- 1 file changed, 35 insertions(+), 5 deletions(-) diff --git a/test/cluster/test_multidc.py b/test/cluster/test_multidc.py index 5b57c345cd..c361cbce2c 100644 --- a/test/cluster/test_multidc.py +++ b/test/cluster/test_multidc.py @@ -20,6 +20,7 @@ from cassandra.query import SimpleStatement from test.pylib.async_cql import _wrap_future from test.pylib.manager_client import ManagerClient from test.pylib.random_tables import RandomTables, TextType, Column +from test.pylib.rest_client import read_barrier from test.pylib.util import unique_name from test.cluster.conftest import cluster_con @@ -403,6 +404,7 @@ async def test_arbiter_dc_rf_rack_valid_keyspaces(manager: ManagerClient): for task in [*valid_keyspaces, *invalid_keyspaces]: _ = tg.create_task(task) +@pytest.mark.asyncio async def test_startup_with_keyspaces_violating_rf_rack_valid_keyspaces(manager: ManagerClient): """ This test verifies that starting a Scylla node fails when there's an RF-rack-invalid keyspace. 
@@ -464,22 +466,50 @@ async def test_startup_with_keyspaces_violating_rf_rack_valid_keyspaces(manager: for rfs, tablets in valid_keyspaces: _ = tg.create_task(create_keyspace(rfs, tablets)) - await manager.server_stop_gracefully(s1.server_id) - await manager.server_update_config(s1.server_id, "rf_rack_valid_keyspaces", "true") - + # Precondition: s1 has rf_rack_valid_keyspaces set to false. + # Postcondition: s1 still has rf_rack_valid_keyspaces set to false. async def try_fail(rfs: List[int], dc: str, rf: int, rack_count: int): + running_servers = await manager.running_servers() + should_start = s1.server_id not in [server.server_id for server in running_servers] + if should_start: + await manager.server_start(s1.server_id) + ks = await create_keyspace(rfs, True) + # We need to wait for the new schema to propagate. + # Otherwise, it's not clear when the mutation + # corresponding to the created keyspace will + # arrive at server 1. + # It could happen only after the node performs + # the check upon start-up, effectively leading + # to a successful start-up, which we don't want. + # For more context, see issue: SCYLLADB-1137. + await read_barrier(manager.api, s1.ip_addr) + + await manager.server_stop_gracefully(s1.server_id) + await manager.server_update_config(s1.server_id, "rf_rack_valid_keyspaces", "true") + err = f"The keyspace '{ks}' is required to be RF-rack-valid. " \ f"That condition is violated for DC '{dc}': RF={rf} vs. rack count={rack_count}." - _ = await manager.server_start(s1.server_id, expected_error=err) + await manager.server_start(s1.server_id, expected_error=err) await cql.run_async(f"DROP KEYSPACE {ks}") + await manager.server_update_config(s1.server_id, "rf_rack_valid_keyspaces", "false") + # Test RF-rack-invalid keyspaces. 
await try_fail([2, 0], "dc1", 2, 3) await try_fail([3, 2], "dc2", 2, 1) await try_fail([4, 1], "dc1", 4, 3) - _ = await manager.server_start(s1.server_id) + # We need to perform a read barrier on the node to make + # sure that it processes the last DROP KEYSPACE. + # Otherwise, the node could think the RF-rack-invalid + # keyspace still exists. + await manager.server_start(s1.server_id) + await read_barrier(manager.api, s1.ip_addr) + await manager.server_stop_gracefully(s1.server_id) + + await manager.server_update_config(s1.server_id, "rf_rack_valid_keyspaces", "true") + await manager.server_start(s1.server_id) @pytest.mark.asyncio async def test_startup_with_keyspaces_violating_rf_rack_valid_keyspaces_but_not_enforced(manager: ManagerClient):