From f337ecbafa30ac2d3ee7217e9bd37efdcd08683c Mon Sep 17 00:00:00 2001 From: Evgeniy Naydanov Date: Tue, 24 Dec 2024 14:24:09 +0000 Subject: [PATCH] test.py: topology_random_failures: handle more node's hangs during 30s sleep The node is hanging and the coordinator just rolls back the topology state. It's different from `stop_after_sending_join_node_request` and `stop_after_bootstrapping_initial_raft_configuration` because in these cases the coordinator is just not able to start the topology change at all and the message in the coordinator's log is different. Error injections handled: - `stop_after_updating_cdc_generation` - `stop_before_streaming` And, actually, it can be any cluster event which lasts more than 30s. --- .../error_injections.py | 8 ++++++++ .../test_random_failures.py | 17 +++++++++++------ 2 files changed, 19 insertions(+), 6 deletions(-) diff --git a/test/topology_random_failures/error_injections.py b/test/topology_random_failures/error_injections.py index 55fafab3da..817261bda1 100644 --- a/test/topology_random_failures/error_injections.py +++ b/test/topology_random_failures/error_injections.py @@ -27,3 +27,11 @@ ERROR_INJECTIONS = ( "stop_after_streaming", "stop_after_bootstrapping_initial_raft_configuration", ) + +# Error injections which can cause a node's hang due to some timeouts. 
+ERROR_INJECTIONS_NODE_MAY_HANG = ( + "stop_after_sending_join_node_request", + "stop_after_updating_cdc_generation", + "stop_before_streaming", + "stop_after_bootstrapping_initial_raft_configuration", +) diff --git a/test/topology_random_failures/test_random_failures.py b/test/topology_random_failures/test_random_failures.py index c74c1fc507..1d7a8ada2e 100644 --- a/test/topology_random_failures/test_random_failures.py +++ b/test/topology_random_failures/test_random_failures.py @@ -23,7 +23,7 @@ from test.topology.util import wait_for_token_ring_and_group0_consistency, get_c from test.topology.conftest import skip_mode from test.pylib.internal_types import ServerUpState from test.topology_random_failures.cluster_events import CLUSTER_EVENTS, TOPOLOGY_TIMEOUT -from test.topology_random_failures.error_injections import ERROR_INJECTIONS +from test.topology_random_failures.error_injections import ERROR_INJECTIONS, ERROR_INJECTIONS_NODE_MAY_HANG if TYPE_CHECKING: from test.pylib.random_tables import RandomTables @@ -150,14 +150,19 @@ async def test_random_failures(manager: ManagerClient, server_log = await manager.server_open_log(server_id=s_info.server_id) - if cluster_event_duration + 1 >= WAIT_FOR_IP_TIMEOUT and error_injection in ( # give one more second for a tolerance - "stop_after_sending_join_node_request", - "stop_after_bootstrapping_initial_raft_configuration", - ): + if cluster_event_duration + 1 >= WAIT_FOR_IP_TIMEOUT and error_injection in ERROR_INJECTIONS_NODE_MAY_HANG: LOGGER.info("Expecting the added node can hang and we'll have a message in the coordinator's log. See #18638.") coordinator = await get_coordinator_host(manager=manager) coordinator_log = await manager.server_open_log(server_id=coordinator.server_id) - if matches := await coordinator_log.grep(r"The node may hang\. It's safe to shut it down manually now\."): + coordinator_log_pattern = r"The node may hang\. It's safe to shut it down manually now\." 
+ if matches := await server_log.grep(r"init - Setting local host id to (?P<hostid>[0-9a-f-]+)"): + line, match = matches[-1] + LOGGER.info("Found following message in the coordinator's log:\n\t%s", line) + coordinator_log_pattern += ( + rf"|updating topology state: rollback {match.group('hostid')} after bootstrapping failure, moving" + rf" transition state to left token ring and setting cleanup flag" + ) + if matches := await coordinator_log.grep(coordinator_log_pattern): LOGGER.info("Found following message in the coordinator's log:\n\t%s", matches[-1][0]) await manager.server_stop(server_id=s_info.server_id)