From 8addbed0dcfb372d3c02e9cd358c81531e72c668 Mon Sep 17 00:00:00 2001 From: Petr Gusev Date: Wed, 20 May 2026 17:21:56 +0200 Subject: [PATCH] test_unfinished_writes_during_shutdown: add timeout and deadlock detection for shutdown_task Add a 15s timeout around the shutdown_task await. If the timeout fires, the deadlock is reproduced (shutdown hung because stale_versions_in_use blocks on a write handler holding a stale token_metadata version). When the timeout fires, explicitly kill the node via server_stop() so that the manager's _after_test handler does not wait 120s for the stuck stop_gracefully request. Then fail the test with a clear message. (cherry picked from commit fa01f74ae6945e9cc9f2fe049310536e56d61263) --- .../test_unfinished_writes_during_shutdown.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/test/cluster/test_unfinished_writes_during_shutdown.py b/test/cluster/test_unfinished_writes_during_shutdown.py index 7eba67aa90..448f6511e5 100644 --- a/test/cluster/test_unfinished_writes_during_shutdown.py +++ b/test/cluster/test_unfinished_writes_during_shutdown.py @@ -102,7 +102,19 @@ async def test_unfinished_writes_during_shutdown(request: pytest.FixtureRequest, await manager.api.message_injection(server_to_pause.ip_addr, 'storage_proxy_write_response_pause') logger.info("Waiting for the shutdown to complete") - await shutdown_task + try: + await asyncio.wait_for(shutdown_task, timeout=15) + except asyncio.TimeoutError: + # Deadlock reproduced — shutdown hung because stale_versions_in_use + # blocks on the write handler holding a stale token_metadata version. + # We must explicitly kill the node here: the manager's _after_test + # handler waits up to 120s for all outstanding tasks (including + # the stuck stop_gracefully request) before teardown proceeds. + # Killing the process lets stop_gracefully's cmd.wait() return, + # which unblocks _after_test. + logger.info("Shutdown did not complete within the timeout, killing the node") + await manager.server_stop(target_server.server_id) + pytest.fail("Shutdown did not complete within 15s — deadlock reproduced") logger.info("Cancelling addnode task") add_last_node_task.cancel()