From a2c23793abfa755ef9f1aac29aa5da13cfab5ec2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Patryk=20J=C4=99drzejczak?= Date: Fri, 27 Mar 2026 15:52:05 +0100 Subject: [PATCH] raft_group0: join_group0: fix join hang when node joins group 0 before post_server_start A joining node hung forever if the topology coordinator added it to the group 0 configuration before the node reached `post_server_start`. In that case, `server->get_configuration().contains(my_id)` returned true and the node broke out of the join loop early, skipping `post_server_start`. `_join_node_group0_started` was therefore never set, so the node's `join_node_response` RPC handler blocked indefinitely. Meanwhile the topology coordinator's `respond_to_joining_node` call (which has no timeout) hung forever waiting for the reply that never came. Fix by only taking the early-break path when not starting as a follower (i.e. when the node is the discovery leader or is restarting). A joining node must always reach `post_server_start`. We also provide a regression test. It takes 6s in dev mode. Fixes SCYLLADB-959 Closes scylladb/scylladb#29266 (cherry picked from commit b9f82f6f23a30c2405d26a039fabb79ac087d11c) Closes scylladb/scylladb#29291 Closes scylladb/scylladb#29308 --- service/raft/raft_group0.cc | 6 +- .../test_bootstrap_with_quick_group0_join.py | 71 +++++++++++++++++++ 2 files changed, 76 insertions(+), 1 deletion(-) create mode 100644 test/cluster/test_bootstrap_with_quick_group0_join.py diff --git a/service/raft/raft_group0.cc b/service/raft/raft_group0.cc index d6fbc19177..37ecc37e7c 100644 --- a/service/raft/raft_group0.cc +++ b/service/raft/raft_group0.cc @@ -555,6 +555,7 @@ future<> raft_group0::join_group0(std::vector seeds, shared_p group0_id = g0_info.group0_id; raft::server_address my_addr{my_id, {}}; + bool starting_server_as_follower = false; if (server == nullptr) { // This is the first time discovery is run. Create and start a Raft server for group 0 on this node. raft::configuration initial_configuration; @@ -582,6 +583,7 @@ future<> raft_group0::join_group0(std::vector seeds, shared_p // trigger an empty snapshot transfer. nontrivial_snapshot = true; } else { + starting_server_as_follower = true; co_await handshaker->pre_server_start(g0_info); } @@ -610,7 +612,9 @@ future<> raft_group0::join_group0(std::vector seeds, shared_p } SCYLLA_ASSERT(server); - if (server->get_configuration().contains(my_id)) { + co_await utils::get_local_injector().inject("join_group0_pause_before_config_check", + utils::wait_for_message(std::chrono::minutes{5})); + if (!starting_server_as_follower && server->get_configuration().contains(my_id)) { // True if we started a new group or completed a configuration change initiated earlier. group0_log.info("server {} already in group 0 (id {}) as {}", my_id, group0_id, server->get_configuration().can_vote(my_id)? "voter" : "non-voter"); diff --git a/test/cluster/test_bootstrap_with_quick_group0_join.py b/test/cluster/test_bootstrap_with_quick_group0_join.py new file mode 100644 index 0000000000..a074025f95 --- /dev/null +++ b/test/cluster/test_bootstrap_with_quick_group0_join.py @@ -0,0 +1,71 @@ +# +# Copyright (C) 2026-present ScyllaDB +# +# SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0 +# +import logging +import asyncio +import time + +import pytest + +from test.cluster.conftest import skip_mode +from test.cluster.util import get_current_group0_config +from test.pylib.manager_client import ManagerClient +from test.pylib.rest_client import read_barrier +from test.pylib.util import wait_for + + +logger = logging.getLogger(__name__) + + +@pytest.mark.asyncio +@skip_mode('release', 'error injections are not supported in release mode') +async def test_bootstrap_with_quick_group0_join(manager: ManagerClient): + """Regression test for https://scylladb.atlassian.net/browse/SCYLLADB-959. + + The bug was that when the bootstrapping node joined group0 before reaching + post_server_start, it skipped post_server_start and thus hung forever. + + The test simulates the scenario by starting the second node with the + join_group0_pause_before_config_check injection. Without the fix, the + startup times out. + """ + logger.info("Adding first server") + s1 = await manager.server_add() + + logger.info("Adding second server with join_group0_pause_before_config_check enabled") + s2 = await manager.server_add(start=False, config={ + 'error_injections_at_startup': ['join_group0_pause_before_config_check'] + }) + + logger.info(f"Starting {s2}") + start_task = asyncio.create_task(manager.server_start(s2.server_id)) + + s2_log = await manager.server_open_log(s2.server_id) + + await s2_log.wait_for("join_group0_pause_before_config_check: waiting for message", timeout=60) + + s1_host_id = await manager.get_host_id(s1.server_id) + s2_host_id = await manager.get_host_id(s2.server_id) + + async def s2_in_group0_config_on_s1(): + config = await get_current_group0_config(manager, s1) + ids = {m[0] for m in config} + assert s1_host_id in ids # sanity check + return True if s2_host_id in ids else None + + # Note: we would like to wait for s2 to see itself in the group0 config, but we can't execute + # get_current_group0_config for s2, as s2 doesn't handle CQL requests at this point. As a workaround, we wait for s1 + # to see s2 and then perform a read barrier on s2. + logger.info(f"Waiting for {s1} to see {s2} in the group0 config") + await wait_for(s2_in_group0_config_on_s1, deadline=time.time() + 60, period=0.1) + + logger.info(f"Performing read barrier on {s2} to make sure it sees itself in the group0 config") + await read_barrier(manager.api, s2.ip_addr) + + logger.info(f"Unblocking {s2}") + await manager.api.message_injection(s2.ip_addr, 'join_group0_pause_before_config_check') + + logger.info(f"Waiting for {s2} to complete bootstrap") + await asyncio.wait_for(start_task, timeout=60)