diff --git a/test/topology_experimental_raft/test_tablets_removenode.py b/test/topology_experimental_raft/test_tablets_removenode.py
index 6da03d6d9f..ab42042e0a 100644
--- a/test/topology_experimental_raft/test_tablets_removenode.py
+++ b/test/topology_experimental_raft/test_tablets_removenode.py
@@ -30,20 +30,24 @@ async def run_async_cl_all(cql, query: str):
 
 @pytest.mark.asyncio
 async def test_replace(manager: ManagerClient):
     logger.info("Bootstrapping cluster")
-    cmdline = ['--logger-log-level', 'storage_service=trace']
+    cmdline = [
+        '--logger-log-level', 'storage_service=trace',
+        '--logger-log-level', 'raft_topology=trace',
+    ]
 
-    # 4 nodes so that we can find new tablet replica for the RF=3 table on removenode
-    servers = await manager.servers_add(4, cmdline=cmdline)
+    servers = await manager.servers_add(3, cmdline=cmdline)
 
     cql = manager.get_cql()
     await create_keyspace(cql, "test", 32, rf=1)
     await cql.run_async("CREATE TABLE test.test (pk int PRIMARY KEY, c int);")
 
+    # We want an RF=2 table to validate that quorum reads work after the replacing node
+    # finishes bootstrap, which indicates that bootstrap waits for the tablet rebuild.
+    # Otherwise, some reads would fail to find a quorum.
     await create_keyspace(cql, "test2", 32, rf=2)
     await cql.run_async("CREATE TABLE test2.test (pk int PRIMARY KEY, c int);")
 
-    # RF=3
     await create_keyspace(cql, "test3", 32, rf=3)
     await cql.run_async("CREATE TABLE test3.test (pk int PRIMARY KEY, c int);")
 
@@ -54,23 +58,19 @@ async def test_replace(manager: ManagerClient):
     await asyncio.gather(*[run_async_cl_all(cql, f"INSERT INTO test2.test (pk, c) VALUES ({k}, {k});") for k in keys])
     await asyncio.gather(*[run_async_cl_all(cql, f"INSERT INTO test3.test (pk, c) VALUES ({k}, {k});") for k in keys])
 
+    async def check_ks(ks):
+        logger.info(f"Checking {ks}")
+        query = SimpleStatement(f"SELECT * FROM {ks}.test;", consistency_level=ConsistencyLevel.QUORUM)
+        rows = await cql.run_async(query)
+        assert len(rows) == len(keys)
+        for r in rows:
+            assert r.c == r.pk
+
     async def check():
-        # RF=1 table "test" will experience data loss so don't check it.
-        # We include it to check that the system doesn't crash.
-
-        logger.info("Checking table test2")
-        query = SimpleStatement("SELECT * FROM test2.test;", consistency_level=ConsistencyLevel.ONE)
-        rows = await cql.run_async(query)
-        assert len(rows) == len(keys)
-        for r in rows:
-            assert r.c == r.pk
-
-        logger.info("Checking table test3")
-        query = SimpleStatement("SELECT * FROM test3.test;", consistency_level=ConsistencyLevel.ONE)
-        rows = await cql.run_async(query)
-        assert len(rows) == len(keys)
-        for r in rows:
-            assert r.c == r.pk
+        # The RF=1 keyspace will experience data loss, so don't check it.
+        # We include it in the test only to check that the system doesn't crash.
+        await check_ks("test2")
+        await check_ks("test3")
 
     await check()
 
@@ -81,11 +81,21 @@ async def test_replace(manager: ManagerClient):
     logger.info('Replacing a node')
     await manager.server_stop(servers[0].server_id)
     replace_cfg = ReplaceConfig(replaced_id = servers[0].server_id, reuse_ip_addr = False, use_host_id = True)
-    await manager.server_add(replace_cfg)
+    servers.append(await manager.server_add(replace_cfg))
     servers = servers[1:]
 
     await check()
 
+    # Verify that QUORUM reads from the RF=3 table work after replace finishes and a single node is down.
+    # This validates that replace waits for the tablet rebuild before finishing bootstrap; otherwise some
+    # reads would fail to find a quorum.
+    logger.info('Downing a node')
+    await manager.server_stop_gracefully(servers[0].server_id)
+    await manager.server_not_sees_other_server(servers[1].ip_addr, servers[0].ip_addr)
+    await manager.server_not_sees_other_server(servers[2].ip_addr, servers[0].ip_addr)
+
+    await check_ks("test3")
+
 
 @pytest.mark.asyncio
 async def test_removenode(manager: ManagerClient):
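For context on the reasoning in the patch comments: QUORUM requires floor(RF/2) + 1 replicas, so an RF=2 quorum read must reach both replicas (immediately exercising the replica the replacing node rebuilt), while an RF=3 quorum read tolerates one down node. A minimal sketch of that arithmetic, with a hypothetical quorum helper that is not part of the patch:

def quorum(rf: int) -> int:
    # Quorum is a majority of the RF replicas: floor(RF/2) + 1.
    return rf // 2 + 1

# RF=2: quorum reads touch both replicas, including the one the
# replacing node had to rebuild before finishing bootstrap.
assert quorum(2) == 2
# RF=3: quorum reads still succeed with one node down, which is what
# the final check_ks("test3") after 'Downing a node' validates.
assert quorum(3) == 2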