test: increase timeouts for /localnodes test

In commit bac7c33313 we introduced a new
test for the Alternator "/localnodes" request, checking that a node
that is still joining does not get returned. The test used what I
thought were "very high" timeouts: a 10-second timeout for starting a
single node, plus an injected 20-second sleep, which left us a
10-second window after that wait to run the check.

But the test failed in one extremely slow run (a debug build on
aarch64), where starting just a single node took more than 15 seconds!

So in this patch I increase the timeouts significantly: the wait for
the node goes up to 60 seconds, and the injected sleep to 120 seconds.
These should definitely be enough for anyone (famous last words...).
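
A quick back-of-the-envelope check of the margins described above
(plain arithmetic, not part of the test):

```python
# Old budget: the joining node sleeps 20s, and up to 10s of that may be
# consumed waiting for the node to come up, leaving a 10s check window.
old_window = 20 - 10    # 10 seconds; too tight for a slow debug/aarch64 run
# New budget after this patch: same structure, much more slack.
new_window = 120 - 60   # 60 seconds left for the check
```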

The test doesn't actually wait out these timeouts: the polling loops
return as soon as their condition is met, so the ridiculously high
limits shouldn't affect the test's normal runtime.
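
To make that concrete, here is a minimal sketch of the polling pattern
the tests rely on (assuming a wait_for-style helper like the one in the
test framework; the names here are illustrative):

```python
import asyncio
import time

async def wait_for(check, deadline, period=0.1):
    # Poll check() until it returns a non-None result or the deadline
    # passes. The deadline is only an upper bound: a healthy run returns
    # on the first successful poll, so raising it from 10s to 60s adds
    # no time to a passing test.
    while True:
        result = await check()
        if result is not None:
            return result
        assert time.time() < deadline, "condition not met before deadline"
        await asyncio.sleep(period)
```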

Signed-off-by: Nadav Har'El <nyh@scylladb.com>
(cherry picked from commit ca8b91f641)

Closes scylladb/scylladb#19940


@@ -1782,7 +1782,7 @@ future<> storage_service::join_token_ring(sharded<db::system_distributed_keyspac
set_mode(mode::JOINING);
-co_await utils::get_local_injector().inject("delay_bootstrap_20s", std::chrono::seconds(20));
+co_await utils::get_local_injector().inject("delay_bootstrap_120s", std::chrono::seconds(120));
if (raft_server) { // Raft is enabled. Check if we need to bootstrap ourself using raft
rtlogger.info("topology changes are using raft");
@@ -3822,7 +3822,7 @@ void storage_service::run_bootstrap_ops(std::unordered_set<token>& bootstrap_tok
// Step 3: Prepare to sync data
ctl.prepare(node_ops_cmd::bootstrap_prepare).get();
-utils::get_local_injector().inject("delay_bootstrap_20s", std::chrono::seconds(20)).get();
+utils::get_local_injector().inject("delay_bootstrap_120s", std::chrono::seconds(120)).get();
// Step 5: Sync data for bootstrap
_repair.local().bootstrap_with_repair(get_token_metadata_ptr(), bootstrap_tokens).get();
@@ -5501,7 +5501,7 @@ future<raft_topology_cmd_result> storage_service::raft_topology_cmd_handler(raft
if (!_topology_state_machine._topology.normal_nodes.empty()) { // stream only if there is a node in normal state
co_await retrier(_bootstrap_result, coroutine::lambda([&] () -> future<> {
if (is_repair_based_node_ops_enabled(streaming::stream_reason::bootstrap)) {
-co_await utils::get_local_injector().inject("delay_bootstrap_20s", std::chrono::seconds(20));
+co_await utils::get_local_injector().inject("delay_bootstrap_120s", std::chrono::seconds(120));
co_await _repair.local().bootstrap_with_repair(get_token_metadata_ptr(), rs.ring.value().tokens);
} else {
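
For context, the renamed injection is just a named, opt-in delay: it
does nothing unless the injection was enabled (for example via
error_injections_at_startup). A rough sketch of that mechanism, in
Python for brevity (the real code is Scylla's C++ error-injection
utility; the names below are illustrative):

```python
import asyncio

enabled_injections: set[str] = set()    # filled from error_injections_at_startup

async def inject(name: str, delay_seconds: float) -> None:
    # No-op unless the named injection was enabled, so normal runs
    # (and release builds, where injections are unsupported) pay nothing.
    if name in enabled_injections:
        await asyncio.sleep(delay_seconds)

# A test opting in would enable it first, e.g.:
# enabled_injections.add('delay_bootstrap_120s')
```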


@@ -214,7 +214,7 @@ async def test_localnodes_broadcast_rpc_address(manager: ManagerClient):
# bit of time to bootstrap after coming up, and only then will it
# appear on /localnodes (see #19694).
url = f"http://{server.ip_addr}:{config['alternator_port']}/localnodes"
-timeout = time.time() + 10
+timeout = time.time() + 60
while True:
assert time.time() < timeout
response = requests.get(url, verify=False)
@@ -246,7 +246,7 @@ async def test_localnodes_drained_node(manager: ManagerClient):
return None # try again
else:
return False
-assert await wait_for(check_localnodes_two, time.time() + 10)
+assert await wait_for(check_localnodes_two, time.time() + 60)
# Now "nodetool" drain on the second node, leaving the second node
# in DRAINED state.
await manager.api.client.post("/storage_service/drain", host=servers[1].ip_addr)
@@ -262,7 +262,7 @@ async def test_localnodes_drained_node(manager: ManagerClient):
return True
else:
return False
-assert await wait_for(check_localnodes_one, time.time() + 10)
+assert await wait_for(check_localnodes_one, time.time() + 60)
@pytest.mark.asyncio
@skip_mode('release', 'error injections are not supported in release mode')
@@ -276,7 +276,7 @@ async def test_localnodes_joining_nodes(manager: ManagerClient):
# We need to start the second node in the background, because server_add()
# will wait for the bootstrap to complete - which we don't want to do.
server = await manager.server_add(config=alternator_config)
-task = asyncio.create_task(manager.server_add(config=alternator_config | {'error_injections_at_startup': ['delay_bootstrap_20s']}))
+task = asyncio.create_task(manager.server_add(config=alternator_config | {'error_injections_at_startup': ['delay_bootstrap_120s']}))
# Sleep until the first node knows of the second one as a "live node"
# (we check this with the REST API's /gossiper/endpoint/live.
async def check_two_live_nodes():
@@ -287,7 +287,7 @@ async def test_localnodes_joining_nodes(manager: ManagerClient):
return True
else:
return False
-assert await wait_for(check_two_live_nodes, time.time() + 10)
+assert await wait_for(check_two_live_nodes, time.time() + 60)
# At this point the second node is live, but hasn't finished bootstrapping
# (we delayed that with the injection). So the "/localnodes" should still