join_node_request_handler: add raft_timeout to make_nonvoters and add_entry

We also add a specific test_quorum_lost_during_node_join. It
exercises the case when the quorum is lost after start_operation
but before these methods are called.
This commit is contained in:
Petr Gusev
2024-02-21 20:41:20 +04:00
parent 0ad852e323
commit 099c756ba1
2 changed files with 56 additions and 2 deletions

View File

@@ -6006,10 +6006,17 @@ future<join_node_request_result> storage_service::join_node_request_handler(join
topology_change change{{std::move(mutation)}};
group0_command g0_cmd = _group0->client().prepare_command(std::move(change), guard,
format("raft topology: placing join request for {}", params.host_id));
co_await utils::get_local_injector().inject("join-node-before-add-entry", [] (auto& handler) -> future<> {
rtlogger.info("join-node-before-add-entry injection hit");
co_await handler.wait_for_message(std::chrono::steady_clock::now() + std::chrono::minutes{5});
rtlogger.info("join-node-before-add-entry injection done");
});
try {
// Make replaced node and ignored nodes non voters earlier for better HA
co_await _group0->make_nonvoters(ignored_nodes_from_join_params(params), _group0_as);
co_await _group0->client().add_entry(std::move(g0_cmd), std::move(guard), &_group0_as);
co_await _group0->make_nonvoters(ignored_nodes_from_join_params(params), _group0_as, raft_timeout{});
co_await _group0->client().add_entry(std::move(g0_cmd), std::move(guard), &_group0_as, raft_timeout{});
break;
} catch (group0_concurrent_modification&) {
rtlogger.info("join_node_request: concurrent operation is detected, retrying.");

View File

@@ -9,6 +9,7 @@ import pytest
import asyncio
from test.pylib.manager_client import ManagerClient
from test.topology.conftest import skip_mode
from test.pylib.rest_client import inject_error_one_shot
logger = logging.getLogger(__name__)
@@ -68,3 +69,49 @@ async def test_cannot_add_new_node(manager: ManagerClient, raft_op_timeout: int)
timeout=60)
logger.info("done")
@pytest.mark.asyncio
@skip_mode('release', 'error injections are not supported in release mode')
@skip_mode('debug', 'aarch64/debug is unpredictably slow', platform_key='aarch64')
async def test_quorum_lost_during_node_join(manager: ManagerClient, raft_op_timeout: int) -> None:
config = {
'error_injections_at_startup': [
{
'name': 'group0-raft-op-timeout-in-ms',
'value': raft_op_timeout
},
{
'name': 'raft-group-registry-fd-threshold-in-ms',
'value': '500'
}
]
}
logger.info("starting a first node (the leader)")
servers = [await manager.server_add(config=config)]
logger.info("starting a second node (the follower)")
servers += [await manager.server_add()]
logger.info(f"injecting join-node-before-add-entry into the leader node {servers[0]}")
injection_handler = await inject_error_one_shot(manager.api, servers[0].ip_addr, 'join-node-before-add-entry')
logger.info("starting a third node")
third_node_future = asyncio.create_task(manager.server_add(
seeds=[servers[0].ip_addr],
expected_error="raft operation [add_entry] timed out, there is no raft quorum",
timeout=60))
logger.info(f"waiting for the leader node {servers[0]} to start handling the join request")
log_file = await manager.server_open_log(servers[0].server_id)
await log_file.wait_for("join-node-before-add-entry injection hit",
timeout=60)
logger.info("stopping the second node")
await manager.server_stop_gracefully(servers[1].server_id)
logger.info("release join-node-before-add-entry injection")
await injection_handler.message()
logger.info("waiting for third node joining process to fail")
await third_node_future