mirror of
https://github.com/scylladb/scylladb.git
synced 2026-05-12 19:02:12 +00:00
raft/group0: fix destroy assertion on startup failure
If start_server_for_group0() successfully registers a server in _raft_gr._servers but a subsequent step (e.g. enable_in_memory_state_machine()) throws, the server is never destroyed because abort_and_drain()/destroy() check std::get_if<raft::group_id>(&_group0) which was only set after the entire with_scheduling_group block completed. Move _group0.emplace<raft::group_id>() inside the lambda, immediately after start_server_for_group() succeeds, so that cleanup paths can always find and destroy the registered server. This fixes the assertion: "raft_group_registry - stop(): server for group ... is not destroyed" which manifests during shutdown after an upgrade where topology_state_load() fails due to netw::unknown_address. Backport: Yes, to 2026.1, 2026.2, as it causes a crash on upgrades Refs: SCYLLADB-1217 Refs: CUSTOMER-340 Refs: CUSTOMER-335 Fixes: SCYLLADB-1801 Signed-off-by: Yaniv Kaul <yaniv.kaul@scylladb.com> AI-assisted: Yes, Opencode/Opus 4.6 Closes scylladb/scylladb#29702
This commit is contained in:
committed by
Patryk Jędrzejczak
parent
d33bb6ea00
commit
6179406467
@@ -434,6 +434,8 @@ future<> group0_state_machine::load_snapshot(raft::snapshot_id id) {
|
||||
}
|
||||
|
||||
future<> group0_state_machine::enable_in_memory_state_machine() {
|
||||
co_await utils::get_local_injector().inject("group0_state_machine_enable_in_memory_fail",
|
||||
[] { return std::make_exception_ptr(std::runtime_error("injected failure in enable_in_memory_state_machine")); });
|
||||
auto read_apply_mutex_holder = co_await _client.hold_read_apply_mutex(_abort_source);
|
||||
if (!_in_memory_state_machine_enabled) {
|
||||
_in_memory_state_machine_enabled = true;
|
||||
|
||||
@@ -452,14 +452,16 @@ future<> raft_group0::start_server_for_group0(raft::group_id group0_id, service:
|
||||
auto srv_for_group0 = create_server_for_group0(group0_id, my_id, ss, qp, mm);
|
||||
auto& persistence = srv_for_group0.persistence;
|
||||
auto& server = *srv_for_group0.server;
|
||||
co_await with_scheduling_group(_sg, [this, &srv_for_group0] (this auto self) -> future<> {
|
||||
co_await with_scheduling_group(_sg, [this, &srv_for_group0, group0_id] (this auto self) -> future<> {
|
||||
auto& state_machine = dynamic_cast<group0_state_machine&>(srv_for_group0.state_machine);
|
||||
co_await _raft_gr.start_server_for_group(std::move(srv_for_group0));
|
||||
// Set _group0 immediately after the server is registered in _raft_gr._servers.
|
||||
// This ensures abort_and_drain()/destroy() can find and clean up the server
|
||||
// even if enable_in_memory_state_machine() or later steps throw.
|
||||
_group0.emplace<raft::group_id>(group0_id);
|
||||
co_await state_machine.enable_in_memory_state_machine();
|
||||
});
|
||||
|
||||
_group0.emplace<raft::group_id>(group0_id);
|
||||
|
||||
// Fix for scylladb/scylladb#16683:
|
||||
// If the snapshot index is 0, trigger creation of a new snapshot
|
||||
// so bootstrapping nodes will receive a snapshot transfer.
|
||||
|
||||
@@ -0,0 +1,61 @@
|
||||
#
|
||||
# Copyright (C) 2026-present ScyllaDB
|
||||
#
|
||||
# SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.1
|
||||
#
|
||||
|
||||
import logging
|
||||
|
||||
import pytest
|
||||
from test.pylib.manager_client import ManagerClient
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.skip_mode(mode='release', reason='error injections are not supported in release mode')
|
||||
async def test_failure_after_group0_server_registration(manager: ManagerClient) -> None:
|
||||
"""Test that a node shuts down cleanly when group0 startup fails after server registration.
|
||||
|
||||
Reproducer for: CUSTOMER-340, CUSTOMER-335, SCYLLADB-1217
|
||||
|
||||
On restart, setup_group0_if_exist() calls start_server_for_group0() which
|
||||
registers the raft server in raft_group_registry._servers, then calls
|
||||
enable_in_memory_state_machine(). If enable_in_memory_state_machine() throws
|
||||
(e.g., because reload_state() -> auth_cache().load_all() fails due to topology
|
||||
being in a transitional state), the exception propagates and stack unwinding
|
||||
calls raft_group_registry::stop().
|
||||
|
||||
Previously, _group0 was set AFTER the with_scheduling_group lambda returned,
|
||||
so a throw inside the lambda left _group0 as monostate. abort_and_drain() and
|
||||
destroy() would be no-ops, leaving the server orphaned in _servers.
|
||||
raft_group_registry::stop() would then hit on_internal_error
|
||||
("server for group ... is not destroyed") and abort.
|
||||
|
||||
The fix moves _group0.emplace() inside the lambda, immediately after
|
||||
start_server_for_group(), so destroy() can always find and clean up the server.
|
||||
|
||||
This test:
|
||||
1. Starts a node normally (group0 established)
|
||||
2. Stops the node
|
||||
3. Restarts with an injection that fails enable_in_memory_state_machine()
|
||||
4. Verifies the node fails startup cleanly (no abort)
|
||||
"""
|
||||
# Start a node normally so group0 is established
|
||||
srv = await manager.server_add()
|
||||
logger.info("Server %s started successfully with group0", srv.server_id)
|
||||
|
||||
logger.info("Stopping server %s", srv.server_id)
|
||||
await manager.server_stop_gracefully(srv.server_id)
|
||||
|
||||
logger.info("Restarting server %s with injection to fail enable_in_memory_state_machine", srv.server_id)
|
||||
await manager.server_update_config(srv.server_id,
|
||||
key='error_injections_at_startup',
|
||||
value=['group0_state_machine_enable_in_memory_fail'])
|
||||
await manager.server_start(srv.server_id,
|
||||
expected_error="injected failure in enable_in_memory_state_machine")
|
||||
|
||||
# If we get here without the test framework detecting a crash/abort,
|
||||
# the node shut down cleanly. The fix ensures abort_and_drain()/destroy()
|
||||
# can find and clean up the raft server even when startup fails.
|
||||
logger.info("Server failed startup and shut down cleanly (no abort)")
|
||||
Reference in New Issue
Block a user