test.py: topology_random_failures: handle more node's hangs during 30s sleep

The node is hanging and the coordinator just rollback a topology state.  It's different from
`stop_after_sending_join_node_request` and `stop_after_bootstrapping_initial_raft_configuration`
because in these cases the coordinator just not able to start the topology change at all and
a message in the coordinator's log is different.

Error injections handled:
  - `stop_after_updating_cdc_generation`
  - `stop_before_streaming`

And, actually, it can be any cluster event which lasts more than 30s.
This commit is contained in:
Evgeniy Naydanov
2024-12-24 14:24:09 +00:00
parent a19ad3c655
commit f337ecbafa
2 changed files with 19 additions and 6 deletions

View File

@@ -27,3 +27,11 @@ ERROR_INJECTIONS = (
"stop_after_streaming",
"stop_after_bootstrapping_initial_raft_configuration",
)
# Error injections which can cause a node's hang due to some timeouts.
ERROR_INJECTIONS_NODE_MAY_HANG = (
"stop_after_sending_join_node_request",
"stop_after_updating_cdc_generation",
"stop_before_streaming",
"stop_after_bootstrapping_initial_raft_configuration",
)

View File

@@ -23,7 +23,7 @@ from test.topology.util import wait_for_token_ring_and_group0_consistency, get_c
from test.topology.conftest import skip_mode
from test.pylib.internal_types import ServerUpState
from test.topology_random_failures.cluster_events import CLUSTER_EVENTS, TOPOLOGY_TIMEOUT
from test.topology_random_failures.error_injections import ERROR_INJECTIONS
from test.topology_random_failures.error_injections import ERROR_INJECTIONS, ERROR_INJECTIONS_NODE_MAY_HANG
if TYPE_CHECKING:
from test.pylib.random_tables import RandomTables
@@ -150,14 +150,19 @@ async def test_random_failures(manager: ManagerClient,
server_log = await manager.server_open_log(server_id=s_info.server_id)
if cluster_event_duration + 1 >= WAIT_FOR_IP_TIMEOUT and error_injection in ( # give one more second for a tolerance
"stop_after_sending_join_node_request",
"stop_after_bootstrapping_initial_raft_configuration",
):
if cluster_event_duration + 1 >= WAIT_FOR_IP_TIMEOUT and error_injection in ERROR_INJECTIONS_NODE_MAY_HANG:
LOGGER.info("Expecting the added node can hang and we'll have a message in the coordinator's log. See #18638.")
coordinator = await get_coordinator_host(manager=manager)
coordinator_log = await manager.server_open_log(server_id=coordinator.server_id)
if matches := await coordinator_log.grep(r"The node may hang\. It's safe to shut it down manually now\."):
coordinator_log_pattern = r"The node may hang\. It's safe to shut it down manually now\."
if matches := await server_log.grep(r"init - Setting local host id to (?P<hostid>[0-9a-f-]+)"):
line, match = matches[-1]
LOGGER.info("Found following message in the coordinator's log:\n\t%s", line)
coordinator_log_pattern += (
rf"|updating topology state: rollback {match.group('hostid')} after bootstrapping failure, moving"
rf" transition state to left token ring and setting cleanup flag"
)
if matches := await coordinator_log.grep(coordinator_log_pattern):
LOGGER.info("Found following message in the coordinator's log:\n\t%s", matches[-1][0])
await manager.server_stop(server_id=s_info.server_id)