mirror of
https://github.com/scylladb/scylladb.git
synced 2026-06-05 22:43:15 +00:00
join_node_request_handler: add raft_timeout to make_nonvoters and add_entry
We also add a specific test_quorum_lost_during_node_join. It exercises the case when the quorum is lost after start_operation but before these methods are called.
This commit is contained in:
@@ -6006,10 +6006,17 @@ future<join_node_request_result> storage_service::join_node_request_handler(join
|
||||
topology_change change{{std::move(mutation)}};
|
||||
group0_command g0_cmd = _group0->client().prepare_command(std::move(change), guard,
|
||||
format("raft topology: placing join request for {}", params.host_id));
|
||||
|
||||
co_await utils::get_local_injector().inject("join-node-before-add-entry", [] (auto& handler) -> future<> {
|
||||
rtlogger.info("join-node-before-add-entry injection hit");
|
||||
co_await handler.wait_for_message(std::chrono::steady_clock::now() + std::chrono::minutes{5});
|
||||
rtlogger.info("join-node-before-add-entry injection done");
|
||||
});
|
||||
|
||||
try {
|
||||
// Make replaced node and ignored nodes non voters earlier for better HA
|
||||
co_await _group0->make_nonvoters(ignored_nodes_from_join_params(params), _group0_as);
|
||||
co_await _group0->client().add_entry(std::move(g0_cmd), std::move(guard), &_group0_as);
|
||||
co_await _group0->make_nonvoters(ignored_nodes_from_join_params(params), _group0_as, raft_timeout{});
|
||||
co_await _group0->client().add_entry(std::move(g0_cmd), std::move(guard), &_group0_as, raft_timeout{});
|
||||
break;
|
||||
} catch (group0_concurrent_modification&) {
|
||||
rtlogger.info("join_node_request: concurrent operation is detected, retrying.");
|
||||
|
||||
@@ -9,6 +9,7 @@ import pytest
|
||||
import asyncio
|
||||
from test.pylib.manager_client import ManagerClient
|
||||
from test.topology.conftest import skip_mode
|
||||
from test.pylib.rest_client import inject_error_one_shot
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -68,3 +69,49 @@ async def test_cannot_add_new_node(manager: ManagerClient, raft_op_timeout: int)
|
||||
timeout=60)
|
||||
|
||||
logger.info("done")
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@skip_mode('release', 'error injections are not supported in release mode')
|
||||
@skip_mode('debug', 'aarch64/debug is unpredictably slow', platform_key='aarch64')
|
||||
async def test_quorum_lost_during_node_join(manager: ManagerClient, raft_op_timeout: int) -> None:
|
||||
config = {
|
||||
'error_injections_at_startup': [
|
||||
{
|
||||
'name': 'group0-raft-op-timeout-in-ms',
|
||||
'value': raft_op_timeout
|
||||
},
|
||||
{
|
||||
'name': 'raft-group-registry-fd-threshold-in-ms',
|
||||
'value': '500'
|
||||
}
|
||||
]
|
||||
}
|
||||
logger.info("starting a first node (the leader)")
|
||||
servers = [await manager.server_add(config=config)]
|
||||
|
||||
logger.info("starting a second node (the follower)")
|
||||
servers += [await manager.server_add()]
|
||||
|
||||
logger.info(f"injecting join-node-before-add-entry into the leader node {servers[0]}")
|
||||
injection_handler = await inject_error_one_shot(manager.api, servers[0].ip_addr, 'join-node-before-add-entry')
|
||||
|
||||
logger.info("starting a third node")
|
||||
third_node_future = asyncio.create_task(manager.server_add(
|
||||
seeds=[servers[0].ip_addr],
|
||||
expected_error="raft operation [add_entry] timed out, there is no raft quorum",
|
||||
timeout=60))
|
||||
|
||||
logger.info(f"waiting for the leader node {servers[0]} to start handling the join request")
|
||||
log_file = await manager.server_open_log(servers[0].server_id)
|
||||
await log_file.wait_for("join-node-before-add-entry injection hit",
|
||||
timeout=60)
|
||||
|
||||
logger.info("stopping the second node")
|
||||
await manager.server_stop_gracefully(servers[1].server_id)
|
||||
|
||||
logger.info("release join-node-before-add-entry injection")
|
||||
await injection_handler.message()
|
||||
|
||||
logger.info("waiting for third node joining process to fail")
|
||||
await third_node_future
|
||||
|
||||
Reference in New Issue
Block a user