raft_group0: join_group0: fix join hang when node joins group 0 before post_server_start

A joining node hung forever if the topology coordinator added it to the group 0 configuration before the node reached `post_server_start`. In that case, `server->get_configuration().contains(my_id)` returned true and the node broke out of the join loop early, skipping `post_server_start`. `_join_node_group0_started` was therefore never set, so the node's `join_node_response` RPC handler blocked indefinitely. Meanwhile, the topology coordinator's `respond_to_joining_node` call (which has no timeout) hung forever waiting for a reply that never came. Fix this by taking the early-break path only when the server was not started as a follower (i.e. when the node is the discovery leader or is restarting). A joining node must always reach `post_server_start`. We also provide a regression test, which takes 6s in dev mode. Fixes SCYLLADB-959; Closes scylladb/scylladb#29266 (cherry picked from commit b9f82f6f23); Closes scylladb/scylladb#29291; Closes scylladb/scylladb#29308
2026-05-01 21:55:50 +00:00 · 2026-03-27 15:52:05 +01:00
parent 177996a385
commit a2c23793ab
2 changed files with 76 additions and 1 deletions
--- a/service/raft/raft_group0.cc
+++ b/service/raft/raft_group0.cc
@@ -555,6 +555,7 @@ future<> raft_group0::join_group0(std::vector<gms::inet_address> seeds, shared_p
        group0_id = g0_info.group0_id;
        raft::server_address my_addr{my_id, {}};

+        bool starting_server_as_follower = false;
        if (server == nullptr) {
            // This is the first time discovery is run. Create and start a Raft server for group 0 on this node.
            raft::configuration initial_configuration;
@@ -582,6 +583,7 @@ future<> raft_group0::join_group0(std::vector<gms::inet_address> seeds, shared_p
                // trigger an empty snapshot transfer.
                nontrivial_snapshot = true;
            } else {
+                starting_server_as_follower = true;
                co_await handshaker->pre_server_start(g0_info);
            }

@@ -610,7 +612,9 @@ future<> raft_group0::join_group0(std::vector<gms::inet_address> seeds, shared_p
        }

        SCYLLA_ASSERT(server);
-        if (server->get_configuration().contains(my_id)) {
+        co_await utils::get_local_injector().inject("join_group0_pause_before_config_check",
+                utils::wait_for_message(std::chrono::minutes{5}));
+        if (!starting_server_as_follower && server->get_configuration().contains(my_id)) {
            // True if we started a new group or completed a configuration change initiated earlier.
            group0_log.info("server {} already in group 0 (id {}) as {}", my_id, group0_id,
                    server->get_configuration().can_vote(my_id)? "voter" : "non-voter");
--- a/test/cluster/test_bootstrap_with_quick_group0_join.py
+++ b/test/cluster/test_bootstrap_with_quick_group0_join.py
@@ -0,0 +1,71 @@
+#
+# Copyright (C) 2026-present ScyllaDB
+#
+# SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
+#
+import logging
+import asyncio
+import time
+
+import pytest
+
+from test.cluster.conftest import skip_mode
+from test.cluster.util import get_current_group0_config
+from test.pylib.manager_client import ManagerClient
+from test.pylib.rest_client import read_barrier
+from test.pylib.util import wait_for
+
+
+logger = logging.getLogger(__name__)
+
+
+@pytest.mark.asyncio
+@skip_mode('release', 'error injections are not supported in release mode')
+async def test_bootstrap_with_quick_group0_join(manager: ManagerClient):
+    """Regression test for https://scylladb.atlassian.net/browse/SCYLLADB-959.
+
+    The bug was that when the bootstrapping node joined group0 before reaching
+    post_server_start, it skipped post_server_start and thus hung forever.
+
+    The test simulates the scenario by starting the second node with the
+    join_group0_pause_before_config_check injection. Without the fix, the
+    startup times out.
+    """
+    logger.info("Adding first server")
+    s1 = await manager.server_add()
+
+    logger.info("Adding second server with join_group0_pause_before_config_check enabled")
+    s2 = await manager.server_add(start=False, config={
+        'error_injections_at_startup': ['join_group0_pause_before_config_check']
+    })
+
+    logger.info(f"Starting {s2}")
+    start_task = asyncio.create_task(manager.server_start(s2.server_id))
+
+    s2_log = await manager.server_open_log(s2.server_id)
+
+    await s2_log.wait_for("join_group0_pause_before_config_check: waiting for message", timeout=60)
+
+    s1_host_id = await manager.get_host_id(s1.server_id)
+    s2_host_id = await manager.get_host_id(s2.server_id)
+
+    async def s2_in_group0_config_on_s1():
+        config = await get_current_group0_config(manager, s1)
+        ids = {m[0] for m in config}
+        assert s1_host_id in ids  # sanity check
+        return True if s2_host_id in ids else None
+
+    # Note: we would like to wait for s2 to see itself in the group0 config, but we can't execute
+    # get_current_group0_config for s2, as s2 doesn't handle CQL requests at this point. As a workaround, we wait for s1
+    # to see s2 and then perform a read barrier on s2.
+    logger.info(f"Waiting for {s1} to see {s2} in the group0 config")
+    await wait_for(s2_in_group0_config_on_s1, deadline=time.time() + 60, period=0.1)
+
+    logger.info(f"Performing read barrier on {s2} to make sure it sees itself in the group0 config")
+    await read_barrier(manager.api, s2.ip_addr)
+
+    logger.info(f"Unblocking {s2}")
+    await manager.api.message_injection(s2.ip_addr, 'join_group0_pause_before_config_check')
+
+    logger.info(f"Waiting for {s2} to complete bootstrap")
+    await asyncio.wait_for(start_task, timeout=60)