test.py: topology_random_failures: handle more node hangs during the 30s sleep
The node hangs and the coordinator just rolls back the topology state. This is different from `stop_after_sending_join_node_request` and `stop_after_bootstrapping_initial_raft_configuration`, because in those cases the coordinator is not able to start the topology change at all, and the message in the coordinator's log is different. Error injections handled: - `stop_after_updating_cdc_generation` - `stop_before_streaming` And, actually, this can be triggered by any cluster event which lasts more than 30s.
This commit is contained in:
@@ -27,3 +27,11 @@ ERROR_INJECTIONS = (
|
||||
"stop_after_streaming",
|
||||
"stop_after_bootstrapping_initial_raft_configuration",
|
||||
)
|
||||
|
||||
# Error injections which can cause a node's hang due to some timeouts.
|
||||
ERROR_INJECTIONS_NODE_MAY_HANG = (
|
||||
"stop_after_sending_join_node_request",
|
||||
"stop_after_updating_cdc_generation",
|
||||
"stop_before_streaming",
|
||||
"stop_after_bootstrapping_initial_raft_configuration",
|
||||
)
|
||||
|
||||
@@ -23,7 +23,7 @@ from test.topology.util import wait_for_token_ring_and_group0_consistency, get_c
|
||||
from test.topology.conftest import skip_mode
|
||||
from test.pylib.internal_types import ServerUpState
|
||||
from test.topology_random_failures.cluster_events import CLUSTER_EVENTS, TOPOLOGY_TIMEOUT
|
||||
from test.topology_random_failures.error_injections import ERROR_INJECTIONS
|
||||
from test.topology_random_failures.error_injections import ERROR_INJECTIONS, ERROR_INJECTIONS_NODE_MAY_HANG
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from test.pylib.random_tables import RandomTables
|
||||
@@ -150,14 +150,19 @@ async def test_random_failures(manager: ManagerClient,
|
||||
|
||||
server_log = await manager.server_open_log(server_id=s_info.server_id)
|
||||
|
||||
if cluster_event_duration + 1 >= WAIT_FOR_IP_TIMEOUT and error_injection in ( # give one more second for a tolerance
|
||||
"stop_after_sending_join_node_request",
|
||||
"stop_after_bootstrapping_initial_raft_configuration",
|
||||
):
|
||||
if cluster_event_duration + 1 >= WAIT_FOR_IP_TIMEOUT and error_injection in ERROR_INJECTIONS_NODE_MAY_HANG:
|
||||
LOGGER.info("Expecting the added node can hang and we'll have a message in the coordinator's log. See #18638.")
|
||||
coordinator = await get_coordinator_host(manager=manager)
|
||||
coordinator_log = await manager.server_open_log(server_id=coordinator.server_id)
|
||||
if matches := await coordinator_log.grep(r"The node may hang\. It's safe to shut it down manually now\."):
|
||||
coordinator_log_pattern = r"The node may hang\. It's safe to shut it down manually now\."
|
||||
if matches := await server_log.grep(r"init - Setting local host id to (?P<hostid>[0-9a-f-]+)"):
|
||||
line, match = matches[-1]
|
||||
LOGGER.info("Found following message in the coordinator's log:\n\t%s", line)
|
||||
coordinator_log_pattern += (
|
||||
rf"|updating topology state: rollback {match.group('hostid')} after bootstrapping failure, moving"
|
||||
rf" transition state to left token ring and setting cleanup flag"
|
||||
)
|
||||
if matches := await coordinator_log.grep(coordinator_log_pattern):
|
||||
LOGGER.info("Found following message in the coordinator's log:\n\t%s", matches[-1][0])
|
||||
await manager.server_stop(server_id=s_info.server_id)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user