test: increase timeouts for /localnodes test

In commit bac7c33313 we introduced a new
test for the Alternator "/localnodes" request, checking that a node
that is still joining does not get returned. The test used what I
thought were "very high" timeouts: a 10-second timeout for starting a
single node, plus an injected 20-second sleep, which left us a
10-second window after that wait to run the check.

But the test failed in one extremely slow run (a debug build on
aarch64), where starting just a single node took more than 15 seconds!

So in this patch I increase the timeouts significantly: the wait for
the node goes up to 60 seconds, and the injected sleep to 120 seconds.
These should definitely be enough for anyone (famous last words...).
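
A quick back-of-the-envelope check of the margins described above
(plain arithmetic, not part of the test):

```python
# Old budget: the joining node sleeps 20s, and up to 10s of that may be
# consumed waiting for the node to come up, leaving a 10s check window.
old_window = 20 - 10    # 10 seconds; too tight for a slow debug/aarch64 run
# New budget after this patch: same structure, much more slack.
new_window = 120 - 60   # 60 seconds left for the check
```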

The test doesn't actually wait out these timeouts: the polling loops
return as soon as their condition is met, so the ridiculously high
limits shouldn't affect the test's normal runtime.
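
To make that concrete, here is a minimal sketch of the polling pattern
the tests rely on (assuming a wait_for-style helper like the one in the
test framework; the names here are illustrative):

```python
import asyncio
import time

async def wait_for(check, deadline, period=0.1):
    # Poll check() until it returns a non-None result or the deadline
    # passes. The deadline is only an upper bound: a healthy run returns
    # on the first successful poll, so raising it from 10s to 60s adds
    # no time to a passing test.
    while True:
        result = await check()
        if result is not None:
            return result
        assert time.time() < deadline, "condition not met before deadline"
        await asyncio.sleep(period)
```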

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
(cherry picked from commit ca8b91f641)

Closes scylladb/scylladb#19940


@@ -1782,7 +1782,7 @@ future<> storage_service::join_token_ring(sharded<db::system_distributed_keyspac
set_mode(mode::JOINING);
-co_await utils::get_local_injector().inject("delay_bootstrap_20s", std::chrono::seconds(20));
+co_await utils::get_local_injector().inject("delay_bootstrap_120s", std::chrono::seconds(120));
if (raft_server) { // Raft is enabled. Check if we need to bootstrap ourself using raft
rtlogger.info("topology changes are using raft");
@@ -3822,7 +3822,7 @@ void storage_service::run_bootstrap_ops(std::unordered_set<token>& bootstrap_tok
// Step 3: Prepare to sync data
ctl.prepare(node_ops_cmd::bootstrap_prepare).get();
-utils::get_local_injector().inject("delay_bootstrap_20s", std::chrono::seconds(20)).get();
+utils::get_local_injector().inject("delay_bootstrap_120s", std::chrono::seconds(120)).get();
// Step 5: Sync data for bootstrap
_repair.local().bootstrap_with_repair(get_token_metadata_ptr(), bootstrap_tokens).get();
@@ -5501,7 +5501,7 @@ future<raft_topology_cmd_result> storage_service::raft_topology_cmd_handler(raft
if (!_topology_state_machine._topology.normal_nodes.empty()) { // stream only if there is a node in normal state
co_await retrier(_bootstrap_result, coroutine::lambda([&] () -> future<> {
if (is_repair_based_node_ops_enabled(streaming::stream_reason::bootstrap)) {
-co_await utils::get_local_injector().inject("delay_bootstrap_20s", std::chrono::seconds(20));
+co_await utils::get_local_injector().inject("delay_bootstrap_120s", std::chrono::seconds(120));
co_await _repair.local().bootstrap_with_repair(get_token_metadata_ptr(), rs.ring.value().tokens);
} else {
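
For context, the renamed injection is just a named, opt-in delay: it
does nothing unless the injection was enabled (for example via
error_injections_at_startup). A rough sketch of that mechanism, in
Python for brevity (the real code is Scylla's C++ error-injection
utility; the names below are illustrative):

```python
import asyncio

enabled_injections: set[str] = set()    # filled from error_injections_at_startup

async def inject(name: str, delay_seconds: float) -> None:
    # No-op unless the named injection was enabled, so normal runs
    # (and release builds, where injections are unsupported) pay nothing.
    if name in enabled_injections:
        await asyncio.sleep(delay_seconds)

# A test opting in would enable it first, e.g.:
# enabled_injections.add('delay_bootstrap_120s')
```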


@@ -214,7 +214,7 @@ async def test_localnodes_broadcast_rpc_address(manager: ManagerClient):
# bit of time to bootstrap after coming up, and only then will it
# appear on /localnodes (see #19694).
url = f"http://{server.ip_addr}:{config['alternator_port']}/localnodes"
-timeout = time.time() + 10
+timeout = time.time() + 60
while True:
assert time.time() < timeout
response = requests.get(url, verify=False)
@@ -246,7 +246,7 @@ async def test_localnodes_drained_node(manager: ManagerClient):
return None # try again
else:
return False
-assert await wait_for(check_localnodes_two, time.time() + 10)
+assert await wait_for(check_localnodes_two, time.time() + 60)
# Now "nodetool" drain on the second node, leaving the second node
# in DRAINED state.
await manager.api.client.post("/storage_service/drain", host=servers[1].ip_addr)
@@ -262,7 +262,7 @@ async def test_localnodes_drained_node(manager: ManagerClient):
return True
else:
return False
-assert await wait_for(check_localnodes_one, time.time() + 10)
+assert await wait_for(check_localnodes_one, time.time() + 60)
@pytest.mark.asyncio
@skip_mode('release', 'error injections are not supported in release mode')
@@ -276,7 +276,7 @@ async def test_localnodes_joining_nodes(manager: ManagerClient):
# We need to start the second node in the background, because server_add()
# will wait for the bootstrap to complete - which we don't want to do.
server = await manager.server_add(config=alternator_config)
-task = asyncio.create_task(manager.server_add(config=alternator_config | {'error_injections_at_startup': ['delay_bootstrap_20s']}))
+task = asyncio.create_task(manager.server_add(config=alternator_config | {'error_injections_at_startup': ['delay_bootstrap_120s']}))
# Sleep until the first node knows of the second one as a "live node"
# (we check this with the REST API's /gossiper/endpoint/live.
async def check_two_live_nodes():
@@ -287,7 +287,7 @@ async def test_localnodes_joining_nodes(manager: ManagerClient):
return True
else:
return False
-assert await wait_for(check_two_live_nodes, time.time() + 10)
+assert await wait_for(check_two_live_nodes, time.time() + 60)
# At this point the second node is live, but hasn't finished bootstrapping
# (we delayed that with the injection). So the "/localnodes" should still