From 78d7c953b01e242479d91c3b4dfbc0ec378ae1ea Mon Sep 17 00:00:00 2001
From: Nadav Har'El
Date: Mon, 29 Jul 2024 01:24:54 +0300
Subject: [PATCH] test: increase timeouts for /localnodes test

In commit bac7c3331377ad7bb85524d295e438230d61001e we introduced a new
test for the Alternator "/localnodes" request, checking that a node that
is still joining does not get returned. The tests used what I thought
were "very high" timeouts - we had a timeout of 10 seconds for starting
a single node, and an injected 20-second sleep, leaving us 10 seconds
after the first wait during which the node is still joining. But the
test failed in one extremely slow run (a debug build on aarch64), where
starting just a single node took more than 15 seconds!

So in this patch I increase the timeouts significantly: we increase the
wait for the node to 60 seconds, and the injected sleep to 120 seconds.
These should definitely be enough for anyone (famous last words...).
The test doesn't actually wait for these timeouts to elapse, so the
ridiculously high timeouts shouldn't affect the normal runtime of this
test.

Signed-off-by: Nadav Har'El

(cherry picked from commit ca8b91f641ac9a506a7f18604417507c06a10787)

Closes scylladb/scylladb#19940
---
 service/storage_service.cc                         |  6 +++---
 test/topology_experimental_raft/test_alternator.py | 10 +++++-----
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/service/storage_service.cc b/service/storage_service.cc
index 82fb5f9b2b..84c20a1cdd 100644
--- a/service/storage_service.cc
+++ b/service/storage_service.cc
@@ -1782,7 +1782,7 @@ future<> storage_service::join_token_ring(sharded& bootstrap_tok
         // Step 3: Prepare to sync data
         ctl.prepare(node_ops_cmd::bootstrap_prepare).get();
 
-        utils::get_local_injector().inject("delay_bootstrap_20s", std::chrono::seconds(20)).get();
+        utils::get_local_injector().inject("delay_bootstrap_120s", std::chrono::seconds(120)).get();
 
         // Step 5: Sync data for bootstrap
         _repair.local().bootstrap_with_repair(get_token_metadata_ptr(), bootstrap_tokens).get();
@@ -5501,7 +5501,7 @@ future storage_service::raft_topology_cmd_handler(raft
         if (!_topology_state_machine._topology.normal_nodes.empty()) { // stream only if there is a node in normal state
             co_await retrier(_bootstrap_result, coroutine::lambda([&] () -> future<> {
                 if (is_repair_based_node_ops_enabled(streaming::stream_reason::bootstrap)) {
-                    co_await utils::get_local_injector().inject("delay_bootstrap_20s", std::chrono::seconds(20));
+                    co_await utils::get_local_injector().inject("delay_bootstrap_120s", std::chrono::seconds(120));
                     co_await _repair.local().bootstrap_with_repair(get_token_metadata_ptr(), rs.ring.value().tokens);
                 } else {
diff --git a/test/topology_experimental_raft/test_alternator.py b/test/topology_experimental_raft/test_alternator.py
index 2a01736ac3..c20bff6d1e 100644
--- a/test/topology_experimental_raft/test_alternator.py
+++ b/test/topology_experimental_raft/test_alternator.py
@@ -214,7 +214,7 @@ async def test_localnodes_broadcast_rpc_address(manager: ManagerClient):
     # bit of time to bootstrap after coming up, and only then will it
     # appear on /localnodes (see #19694).
     url = f"http://{server.ip_addr}:{config['alternator_port']}/localnodes"
-    timeout = time.time() + 10
+    timeout = time.time() + 60
     while True:
         assert time.time() < timeout
         response = requests.get(url, verify=False)
@@ -246,7 +246,7 @@ async def test_localnodes_drained_node(manager: ManagerClient):
             return None # try again
         else:
             return False
-    assert await wait_for(check_localnodes_two, time.time() + 10)
+    assert await wait_for(check_localnodes_two, time.time() + 60)
     # Now "nodetool" drain on the second node, leaving the second node
     # in DRAINED state.
     await manager.api.client.post("/storage_service/drain", host=servers[1].ip_addr)
@@ -262,7 +262,7 @@ async def test_localnodes_drained_node(manager: ManagerClient):
             return True
         else:
             return False
-    assert await wait_for(check_localnodes_one, time.time() + 10)
+    assert await wait_for(check_localnodes_one, time.time() + 60)
 
 @pytest.mark.asyncio
 @skip_mode('release', 'error injections are not supported in release mode')
@@ -276,7 +276,7 @@ async def test_localnodes_joining_nodes(manager: ManagerClient):
     # We need to start the second node in the background, because server_add()
     # will wait for the bootstrap to complete - which we don't want to do.
     server = await manager.server_add(config=alternator_config)
-    task = asyncio.create_task(manager.server_add(config=alternator_config | {'error_injections_at_startup': ['delay_bootstrap_20s']}))
+    task = asyncio.create_task(manager.server_add(config=alternator_config | {'error_injections_at_startup': ['delay_bootstrap_120s']}))
     # Sleep until the first node knows of the second one as a "live node"
     # (we check this with the REST API's /gossiper/endpoint/live.
     async def check_two_live_nodes():
@@ -287,7 +287,7 @@ async def test_localnodes_joining_nodes(manager: ManagerClient):
             return True
         else:
             return False
-    assert await wait_for(check_two_live_nodes, time.time() + 10)
+    assert await wait_for(check_two_live_nodes, time.time() + 60)
     # At this point the second node is live, but hasn't finished bootstrapping
     # (we delayed that with the injection). So the "/localnodes" should still
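
Note: the wait_for() helper used in the assertions above polls its
predicate and returns as soon as the predicate yields a non-None value,
so raising the deadline from 10 to 60 seconds does not slow down a
passing run; it only gives very slow runs (such as the debug build on
aarch64 that motivated this patch) more headroom. For illustration, here
is a minimal sketch of such a helper, assuming semantics matching its
use in these tests - the actual helper in Scylla's test library may have
a different signature:

    import asyncio
    import time

    async def wait_for(pred, deadline, period=0.1):
        # Poll an async predicate until it returns a non-None value, or
        # fail when the absolute deadline (a time.time() value) passes.
        # None means "not yet, try again"; any other value - including
        # False, which the callers above assert on - ends the wait.
        while True:
            assert time.time() < deadline, "wait_for timed out"
            result = await pred()
            if result is not None:
                return result
            await asyncio.sleep(period)

With these semantics, the injected 120-second bootstrap delay bounds how
long the second node stays in the "joining" state, while the 60-second
deadline only caps how long the test keeps polling before giving up.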