From 2ea97d8c19dbad31991a93c93b940a773ab5f560 Mon Sep 17 00:00:00 2001 From: Sergey Zolotukhin Date: Thu, 9 Jan 2025 17:51:38 +0100 Subject: [PATCH] raft: Handle non-critical config update errors when changing status to voter. When a node is bootstrapped and joins a cluster as a non-voter, errors can occur while committing a new Raft record, for instance, if the Raft leader changes during this time. These errors are not critical and should not cause a node crash, as the action can be retried. Fixes scylladb/scylladb#20814 (cherry picked from commit 8c48f7ad62af06e2129cdf56abe058952e81e504) --- service/raft/raft_group0.cc | 10 +++++++++- test/topology_custom/test_error_becoming_voter.py | 1 - 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/service/raft/raft_group0.cc b/service/raft/raft_group0.cc index 703d6aff86..4c2be29541 100644 --- a/service/raft/raft_group0.cc +++ b/service/raft/raft_group0.cc @@ -825,7 +825,15 @@ future<> raft_group0::finish_setup_after_join(service::storage_service& ss, cql3 // Just bootstrapped and joined as non-voter. Become a voter. 
auto pause_shutdown = _shutdown_gate.hold(); raft::server_address my_addr{my_id, {}}; - co_await _raft_gr.group0().modify_config({{my_addr, true}}, {}, &_abort_source); + co_await run_op_with_retry(_abort_source, [this, my_addr]() -> future { + try { + co_await _raft_gr.group0().modify_config({{my_addr, true}}, {}, &_abort_source); + } catch (const raft::commit_status_unknown& e) { + group0_log.info("finish_setup_after_join({}): modify_config returned \"{}\", retrying", my_addr, e); + co_return operation_result::failure; + } + co_return operation_result::success; + }, "finish_setup_after_join->modify_config", {}); group0_log.info("finish_setup_after_join: became a group 0 voter."); // No need to run `upgrade_to_group0()` since we must have bootstrapped with Raft diff --git a/test/topology_custom/test_error_becoming_voter.py b/test/topology_custom/test_error_becoming_voter.py index 745ba68de8..62783a341a 100644 --- a/test/topology_custom/test_error_becoming_voter.py +++ b/test/topology_custom/test_error_becoming_voter.py @@ -17,7 +17,6 @@ from test.pylib.util import wait_for_cql_and_get_hosts logger = logging.getLogger(__name__) -@pytest.mark.xfail(reason="issue #20814") @pytest.mark.asyncio async def test_error_while_becoming_voter(request: pytest.FixtureRequest, manager: ManagerClient) -> None: """