From 61794064678fda5fd8eb390405fc00a86b60bf3a Mon Sep 17 00:00:00 2001 From: Yaniv Michael Kaul Date: Thu, 30 Apr 2026 00:41:04 +0300 Subject: [PATCH] raft/group0: fix destroy assertion on startup failure If start_server_for_group0() successfully registers a server in _raft_gr._servers but a subsequent step (e.g. enable_in_memory_state_machine()) throws, the server is never destroyed because abort_and_drain()/destroy() check std::get_if(&_group0) which was only set after the entire with_scheduling_group block completed. Move _group0.emplace() inside the lambda, immediately after start_server_for_group() succeeds, so that cleanup paths can always find and destroy the registered server. This fixes the assertion: "raft_group_registry - stop(): server for group ... is not destroyed" which manifests during shutdown after an upgrade where topology_state_load() fails due to netw::unknown_address. Backport: Yes, to 2026.1, 2026.2, as it causes a crash on upgrades Refs: SCYLLADB-1217 Refs: CUSTOMER-340 Refs: CUSTOMER-335 Fixes: SCYLLADB-1801 Signed-off-by: Yaniv Kaul AI-assisted: Yes, Opencode/Opus 4.6 Closes scylladb/scylladb#29702 --- service/raft/group0_state_machine.cc | 2 + service/raft/raft_group0.cc | 8 ++- ...ailure_after_group0_server_registration.py | 61 +++++++++++++++++++ 3 files changed, 68 insertions(+), 3 deletions(-) create mode 100644 test/cluster/test_failure_after_group0_server_registration.py diff --git a/service/raft/group0_state_machine.cc b/service/raft/group0_state_machine.cc index f494bf73cd..5febdc4a20 100644 --- a/service/raft/group0_state_machine.cc +++ b/service/raft/group0_state_machine.cc @@ -434,6 +434,8 @@ future<> group0_state_machine::load_snapshot(raft::snapshot_id id) { } future<> group0_state_machine::enable_in_memory_state_machine() { + co_await utils::get_local_injector().inject("group0_state_machine_enable_in_memory_fail", + [] { return std::make_exception_ptr(std::runtime_error("injected failure in enable_in_memory_state_machine")); }); auto read_apply_mutex_holder = co_await _client.hold_read_apply_mutex(_abort_source); if (!_in_memory_state_machine_enabled) { _in_memory_state_machine_enabled = true; diff --git a/service/raft/raft_group0.cc b/service/raft/raft_group0.cc index 1623e14b08..657ccced6f 100644 --- a/service/raft/raft_group0.cc +++ b/service/raft/raft_group0.cc @@ -452,14 +452,16 @@ future<> raft_group0::start_server_for_group0(raft::group_id group0_id, service: auto srv_for_group0 = create_server_for_group0(group0_id, my_id, ss, qp, mm); auto& persistence = srv_for_group0.persistence; auto& server = *srv_for_group0.server; - co_await with_scheduling_group(_sg, [this, &srv_for_group0] (this auto self) -> future<> { + co_await with_scheduling_group(_sg, [this, &srv_for_group0, group0_id] (this auto self) -> future<> { auto& state_machine = dynamic_cast(srv_for_group0.state_machine); co_await _raft_gr.start_server_for_group(std::move(srv_for_group0)); + // Set _group0 immediately after the server is registered in _raft_gr._servers. + // This ensures abort_and_drain()/destroy() can find and clean up the server + // even if enable_in_memory_state_machine() or later steps throw. + _group0.emplace(group0_id); co_await state_machine.enable_in_memory_state_machine(); }); - _group0.emplace(group0_id); - // Fix for scylladb/scylladb#16683: // If the snapshot index is 0, trigger creation of a new snapshot // so bootstrapping nodes will receive a snapshot transfer. diff --git a/test/cluster/test_failure_after_group0_server_registration.py b/test/cluster/test_failure_after_group0_server_registration.py new file mode 100644 index 0000000000..76eaa84252 --- /dev/null +++ b/test/cluster/test_failure_after_group0_server_registration.py @@ -0,0 +1,61 @@ +# +# Copyright (C) 2026-present ScyllaDB +# +# SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.1 +# + +import logging + +import pytest +from test.pylib.manager_client import ManagerClient + +logger = logging.getLogger(__name__) + + +@pytest.mark.asyncio +@pytest.mark.skip_mode(mode='release', reason='error injections are not supported in release mode') +async def test_failure_after_group0_server_registration(manager: ManagerClient) -> None: + """Test that a node shuts down cleanly when group0 startup fails after server registration. + + Reproducer for: CUSTOMER-340, CUSTOMER-335, SCYLLADB-1217 + + On restart, setup_group0_if_exist() calls start_server_for_group0() which + registers the raft server in raft_group_registry._servers, then calls + enable_in_memory_state_machine(). If enable_in_memory_state_machine() throws + (e.g., because reload_state() -> auth_cache().load_all() fails due to topology + being in a transitional state), the exception propagates and stack unwinding + calls raft_group_registry::stop(). + + Previously, _group0 was set AFTER the with_scheduling_group lambda returned, + so a throw inside the lambda left _group0 as monostate. abort_and_drain() and + destroy() would be no-ops, leaving the server orphaned in _servers. + raft_group_registry::stop() would then hit on_internal_error + ("server for group ... is not destroyed") and abort. + + The fix moves _group0.emplace() inside the lambda, immediately after + start_server_for_group(), so destroy() can always find and clean up the server. + + This test: + 1. Starts a node normally (group0 established) + 2. Stops the node + 3. Restarts with an injection that fails enable_in_memory_state_machine() + 4. Verifies the node fails startup cleanly (no abort) + """ + # Start a node normally so group0 is established + srv = await manager.server_add() + logger.info("Server %s started successfully with group0", srv.server_id) + + logger.info("Stopping server %s", srv.server_id) + await manager.server_stop_gracefully(srv.server_id) + + logger.info("Restarting server %s with injection to fail enable_in_memory_state_machine", srv.server_id) + await manager.server_update_config(srv.server_id, + key='error_injections_at_startup', + value=['group0_state_machine_enable_in_memory_fail']) + await manager.server_start(srv.server_id, + expected_error="injected failure in enable_in_memory_state_machine") + + # If we get here without the test framework detecting a crash/abort, + # the node shut down cleanly. The fix ensures abort_and_drain()/destroy() + # can find and clean up the raft server even when startup fails. + logger.info("Server failed startup and shut down cleanly (no abort)")