mirror of
https://github.com/scylladb/scylladb.git
synced 2026-05-12 19:02:12 +00:00
test: increase timeouts for /localnodes test
In commit bac7c33313 we introduced a new test for the Alternator "/localnodes" request, checking that a node that is still joining does not get returned. The tests used what I thought were "very high" timeouts - we had a timeout of 10 seconds for starting a single node, and injected a 20 second sleep to leave us 10 seconds after the first sleep. But the test failed in one extremely slow run (a debug build on aarch64), where starting just a single node took more than 15 seconds! So in this patch I increase the timeouts significantly: We increase the wait for the node to 60 seconds, and the sleeping injection to 120 seconds. These should definitely be enough for anyone (famous last words...). The test doesn't actually wait for these timeouts, so the ridiculously high timeouts shouldn't affect the normal runtime of this test. Signed-off-by: Nadav Har'El <nyh@scylladb.com> (cherry picked from commit ca8b91f641) Closes scylladb/scylladb#19940
This commit is contained in:
committed by
Botond Dénes
parent
753fc87efa
commit
78d7c953b0
@@ -1782,7 +1782,7 @@ future<> storage_service::join_token_ring(sharded<db::system_distributed_keyspac
|
||||
|
||||
set_mode(mode::JOINING);
|
||||
|
||||
co_await utils::get_local_injector().inject("delay_bootstrap_20s", std::chrono::seconds(20));
|
||||
co_await utils::get_local_injector().inject("delay_bootstrap_120s", std::chrono::seconds(120));
|
||||
|
||||
if (raft_server) { // Raft is enabled. Check if we need to bootstrap ourself using raft
|
||||
rtlogger.info("topology changes are using raft");
|
||||
@@ -3822,7 +3822,7 @@ void storage_service::run_bootstrap_ops(std::unordered_set<token>& bootstrap_tok
|
||||
// Step 3: Prepare to sync data
|
||||
ctl.prepare(node_ops_cmd::bootstrap_prepare).get();
|
||||
|
||||
utils::get_local_injector().inject("delay_bootstrap_20s", std::chrono::seconds(20)).get();
|
||||
utils::get_local_injector().inject("delay_bootstrap_120s", std::chrono::seconds(120)).get();
|
||||
|
||||
// Step 5: Sync data for bootstrap
|
||||
_repair.local().bootstrap_with_repair(get_token_metadata_ptr(), bootstrap_tokens).get();
|
||||
@@ -5501,7 +5501,7 @@ future<raft_topology_cmd_result> storage_service::raft_topology_cmd_handler(raft
|
||||
if (!_topology_state_machine._topology.normal_nodes.empty()) { // stream only if there is a node in normal state
|
||||
co_await retrier(_bootstrap_result, coroutine::lambda([&] () -> future<> {
|
||||
if (is_repair_based_node_ops_enabled(streaming::stream_reason::bootstrap)) {
|
||||
co_await utils::get_local_injector().inject("delay_bootstrap_20s", std::chrono::seconds(20));
|
||||
co_await utils::get_local_injector().inject("delay_bootstrap_120s", std::chrono::seconds(120));
|
||||
|
||||
co_await _repair.local().bootstrap_with_repair(get_token_metadata_ptr(), rs.ring.value().tokens);
|
||||
} else {
|
||||
|
||||
@@ -214,7 +214,7 @@ async def test_localnodes_broadcast_rpc_address(manager: ManagerClient):
|
||||
# bit of time to bootstrap after coming up, and only then will it
|
||||
# appear on /localnodes (see #19694).
|
||||
url = f"http://{server.ip_addr}:{config['alternator_port']}/localnodes"
|
||||
timeout = time.time() + 10
|
||||
timeout = time.time() + 60
|
||||
while True:
|
||||
assert time.time() < timeout
|
||||
response = requests.get(url, verify=False)
|
||||
@@ -246,7 +246,7 @@ async def test_localnodes_drained_node(manager: ManagerClient):
|
||||
return None # try again
|
||||
else:
|
||||
return False
|
||||
assert await wait_for(check_localnodes_two, time.time() + 10)
|
||||
assert await wait_for(check_localnodes_two, time.time() + 60)
|
||||
# Now "nodetool" drain on the second node, leaving the second node
|
||||
# in DRAINED state.
|
||||
await manager.api.client.post("/storage_service/drain", host=servers[1].ip_addr)
|
||||
@@ -262,7 +262,7 @@ async def test_localnodes_drained_node(manager: ManagerClient):
|
||||
return True
|
||||
else:
|
||||
return False
|
||||
assert await wait_for(check_localnodes_one, time.time() + 10)
|
||||
assert await wait_for(check_localnodes_one, time.time() + 60)
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@skip_mode('release', 'error injections are not supported in release mode')
|
||||
@@ -276,7 +276,7 @@ async def test_localnodes_joining_nodes(manager: ManagerClient):
|
||||
# We need to start the second node in the background, because server_add()
|
||||
# will wait for the bootstrap to complete - which we don't want to do.
|
||||
server = await manager.server_add(config=alternator_config)
|
||||
task = asyncio.create_task(manager.server_add(config=alternator_config | {'error_injections_at_startup': ['delay_bootstrap_20s']}))
|
||||
task = asyncio.create_task(manager.server_add(config=alternator_config | {'error_injections_at_startup': ['delay_bootstrap_120s']}))
|
||||
# Sleep until the first node knows of the second one as a "live node"
|
||||
# (we check this with the REST API's /gossiper/endpoint/live.
|
||||
async def check_two_live_nodes():
|
||||
@@ -287,7 +287,7 @@ async def test_localnodes_joining_nodes(manager: ManagerClient):
|
||||
return True
|
||||
else:
|
||||
return False
|
||||
assert await wait_for(check_two_live_nodes, time.time() + 10)
|
||||
assert await wait_for(check_two_live_nodes, time.time() + 60)
|
||||
|
||||
# At this point the second node is live, but hasn't finished bootstrapping
|
||||
# (we delayed that with the injection). So the "/localnodes" should still
|
||||
|
||||
Reference in New Issue
Block a user