From f337ecbafa30ac2d3ee7217e9bd37efdcd08683c Mon Sep 17 00:00:00 2001
From: Evgeniy Naydanov <evgeniy.naydanov@scylladb.com>
Date: Tue, 24 Dec 2024 14:24:09 +0000
Subject: [PATCH] test.py: topology_random_failures: handle more node's hangs
 during 30s sleep

The node hangs and the coordinator just rolls back the topology state.  This is different from
`stop_after_sending_join_node_request` and `stop_after_bootstrapping_initial_raft_configuration`
because in those cases the coordinator is not able to start the topology change at all and
the message in the coordinator's log is different.

Error injections handled:
  - `stop_after_updating_cdc_generation`
  - `stop_before_streaming`

And, actually, this can happen with any cluster event that lasts more than 30s.
---
 .../error_injections.py                         |  8 ++++++++
 .../test_random_failures.py                     | 17 +++++++++++------
 2 files changed, 19 insertions(+), 6 deletions(-)
diff --git a/test/topology_random_failures/error_injections.py b/test/topology_random_failures/error_injections.py
index 55fafab3da..817261bda1 100644
--- a/test/topology_random_failures/error_injections.py
+++ b/test/topology_random_failures/error_injections.py
@@ -27,3 +27,11 @@ ERROR_INJECTIONS = (
     "stop_after_streaming",
     "stop_after_bootstrapping_initial_raft_configuration",
 )
+
+# Error injections that can cause the added node to hang due to timeouts.
+ERROR_INJECTIONS_NODE_MAY_HANG = (
+    "stop_after_sending_join_node_request",
+    "stop_after_updating_cdc_generation",
+    "stop_before_streaming",
+    "stop_after_bootstrapping_initial_raft_configuration",
+)
diff --git a/test/topology_random_failures/test_random_failures.py b/test/topology_random_failures/test_random_failures.py
index c74c1fc507..1d7a8ada2e 100644
--- a/test/topology_random_failures/test_random_failures.py
+++ b/test/topology_random_failures/test_random_failures.py
@@ -23,7 +23,7 @@ from test.topology.util import wait_for_token_ring_and_group0_consistency, get_c
 from test.topology.conftest import skip_mode
 from test.pylib.internal_types import ServerUpState
 from test.topology_random_failures.cluster_events import CLUSTER_EVENTS, TOPOLOGY_TIMEOUT
-from test.topology_random_failures.error_injections import ERROR_INJECTIONS
+from test.topology_random_failures.error_injections import ERROR_INJECTIONS, ERROR_INJECTIONS_NODE_MAY_HANG
 
 if TYPE_CHECKING:
     from test.pylib.random_tables import RandomTables
@@ -150,14 +150,19 @@ async def test_random_failures(manager: ManagerClient,
 
     server_log = await manager.server_open_log(server_id=s_info.server_id)
 
-    if cluster_event_duration + 1 >= WAIT_FOR_IP_TIMEOUT and error_injection in (  # give one more second for a tolerance
-        "stop_after_sending_join_node_request",
-        "stop_after_bootstrapping_initial_raft_configuration",
-    ):
+    if cluster_event_duration + 1 >= WAIT_FOR_IP_TIMEOUT and error_injection in ERROR_INJECTIONS_NODE_MAY_HANG:
         LOGGER.info("Expecting the added node can hang and we'll have a message in the coordinator's log.  See #18638.")
         coordinator = await get_coordinator_host(manager=manager)
         coordinator_log = await manager.server_open_log(server_id=coordinator.server_id)
-        if matches := await coordinator_log.grep(r"The node may hang\. It's safe to shut it down manually now\."):
+        coordinator_log_pattern = r"The node may hang\. It's safe to shut it down manually now\."
+        if matches := await server_log.grep(r"init - Setting local host id to (?P<hostid>[0-9a-f-]+)"):
+            line, match = matches[-1]
+            LOGGER.info("Found following message in the coordinator's log:\n\t%s", line)
+            coordinator_log_pattern += (
+                rf"|updating topology state: rollback {match.group('hostid')} after bootstrapping failure, moving"
+                rf" transition state to left token ring and setting cleanup flag"
+            )
+        if matches := await coordinator_log.grep(coordinator_log_pattern):
             LOGGER.info("Found following message in the coordinator's log:\n\t%s", matches[-1][0])
             await manager.server_stop(server_id=s_info.server_id)