raft: ensure topology coordinator retains votership

The limited voters feature did not account for the existing topology
coordinator (Raft leader) when selecting voters to be removed.
As a result, the limited voters calculator could inadvertently remove
the votership of the current topology coordinator, triggering
an unnecessary Raft leader re-election.

This change ensures that the existing topology coordinator keeps its
votership unless removing it is absolutely necessary. When choosing between
otherwise equivalent voters, the node that is not the topology coordinator
is prioritized for removal. This keeps the cluster stable by avoiding
unnecessary leader re-elections.

Additionally, only a live leader node is relevant to this logic. A dead
existing leader (topology coordinator) is excluded from consideration, as
it is already in the process of losing leadership.
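
Purely as an illustration, below is a minimal C++ sketch of that selection
rule. It is not the actual group0_voter_calculator code; the node_info
struct and the prefer_removal helper are invented for this example, loosely
mirroring the node attributes used by the tests below.

// Sketch only: simplified tie-break between otherwise equivalent voters.
#include <algorithm>
#include <cassert>
#include <string>
#include <vector>

struct node_info {
    std::string id;
    bool is_voter = false;
    bool is_alive = false;
    bool is_leader = false;  // current topology coordinator (Raft leader)
};

// Returns true if `a` should lose its vote before `b`, all other criteria
// (DC/rack balance, liveness) being equal.
bool prefer_removal(const node_info& a, const node_info& b) {
    // Only a live leader is protected; a dead leader is already in the
    // process of losing leadership, so it gets no special treatment.
    const bool a_protected = a.is_leader && a.is_alive;
    const bool b_protected = b.is_leader && b.is_alive;
    // Prefer removing the node that is not the (live) topology coordinator.
    return !a_protected && b_protected;
}

int main() {
    std::vector<node_info> candidates = {
        {.id = "node-1", .is_voter = true, .is_alive = true, .is_leader = false},
        {.id = "node-2", .is_voter = true, .is_alive = true, .is_leader = true},
    };
    // Among otherwise equivalent voters, the non-coordinator node is picked
    // as the removal victim, so the leader keeps its vote.
    auto victim = std::min_element(candidates.begin(), candidates.end(), prefer_removal);
    assert(victim->id == "node-1");
}

Under this ordering, an alive leader only loses its vote when every other
candidate is equally protected.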

Fixes: scylladb/scylladb#23588
Fixes: scylladb/scylladb#23786
Author: Emil Maskovsky
Date: 2025-04-22 18:28:50 +02:00
Parent: 2ae59e8a87
Commit: 24dfd2034b
4 changed files with 224 additions and 14 deletions


@@ -6,6 +6,7 @@
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
*/
#include <boost/test/data/test_case.hpp>
#include <boost/test/unit_test.hpp>
#include "service/raft/group0_voter_calculator.hh"
@@ -939,4 +940,106 @@ BOOST_AUTO_TEST_CASE(existing_voters_are_kept_across_racks) {
}

BOOST_DATA_TEST_CASE(leader_is_retained_as_voter, boost::unit_test::data::make({0, 1, 2}), leader_node_idx) {
    // Arrange: Set the voters limit and create the voter calculator.
    constexpr size_t max_voters = 3;
    const service::group0_voter_calculator voter_calc{max_voters};

    // Act: Add a third DC to a 2 DC cluster.
    const std::array ids = {raft::server_id::create_random_id(), raft::server_id::create_random_id(), raft::server_id::create_random_id(),
            raft::server_id::create_random_id(), raft::server_id::create_random_id(), raft::server_id::create_random_id()};

    const service::group0_voter_calculator::nodes_list_t nodes = {
            // The initial nodes (just 2 DCs)
            {ids[0], {.datacenter = "dc-1", .rack = "rack", .is_voter = true, .is_alive = true, .is_leader = leader_node_idx == 0}},
            {ids[1], {.datacenter = "dc-1", .rack = "rack", .is_voter = true, .is_alive = true, .is_leader = leader_node_idx == 1}},
            {ids[2], {.datacenter = "dc-2", .rack = "rack", .is_voter = true, .is_alive = true, .is_leader = leader_node_idx == 2}},
            {ids[3], {.datacenter = "dc-2", .rack = "rack", .is_voter = false, .is_alive = true}},
            // The new nodes (3rd DC)
            // - this will lead to 1 voter being removed from the DC-1
            {ids[4], {.datacenter = "dc-3", .rack = "rack", .is_voter = false, .is_alive = true}},
            {ids[5], {.datacenter = "dc-3", .rack = "rack", .is_voter = false, .is_alive = true}},
    };

    const auto& voters = voter_calc.distribute_voters(nodes);

    // Assert: The current leader is retained as a voter, even after adding
    // the third datacenter and redistributing voters.
    BOOST_CHECK_EQUAL(voters.size(), max_voters);
    BOOST_CHECK(voters.contains(ids[leader_node_idx]));
}

BOOST_DATA_TEST_CASE(leader_is_retained_as_voter_in_racks, boost::unit_test::data::make({0, 1, 3}), leader_node_idx) {
    // Arrange: Set the voters limit and create the voter calculator.
    constexpr size_t max_voters = 3;
    const service::group0_voter_calculator voter_calc{max_voters};

    // Act: Add a third DC to a 2 DC cluster.
    const std::array ids = {raft::server_id::create_random_id(), raft::server_id::create_random_id(), raft::server_id::create_random_id(),
            raft::server_id::create_random_id(), raft::server_id::create_random_id(), raft::server_id::create_random_id()};

    const service::group0_voter_calculator::nodes_list_t nodes = {
            // The initial nodes (just 2 DCs)
            {ids[0], {.datacenter = "dc-1", .rack = "rack-1", .is_voter = true, .is_alive = true, .is_leader = leader_node_idx == 0}},
            {ids[1], {.datacenter = "dc-1", .rack = "rack-2", .is_voter = true, .is_alive = true, .is_leader = leader_node_idx == 1}},
            {ids[2], {.datacenter = "dc-2", .rack = "rack-3", .is_voter = false, .is_alive = true}},
            {ids[3], {.datacenter = "dc-2", .rack = "rack-4", .is_voter = true, .is_alive = true, .is_leader = leader_node_idx == 3}},
            // The new nodes (3rd DC)
            // - this will lead to 1 voter being removed from the DC-1
            {ids[4], {.datacenter = "dc-3", .rack = "rack-5", .is_voter = false, .is_alive = true}},
            {ids[5], {.datacenter = "dc-3", .rack = "rack-6", .is_voter = false, .is_alive = true}},
    };

    const auto& voters = voter_calc.distribute_voters(nodes);

    // Assert: The current leader is retained as a voter, even after adding
    // the third datacenter and redistributing voters.
    BOOST_CHECK_EQUAL(voters.size(), max_voters);
    BOOST_CHECK(voters.contains(ids[leader_node_idx]));
}

BOOST_DATA_TEST_CASE(leader_is_retained_as_voter_in_two_dc_asymmetric_setup, boost::unit_test::data::make({0, 1}), leader_node_idx) {
    // Arrange: Set the voters limit and create the voter calculator.
    constexpr size_t max_voters = 3;
    const service::group0_voter_calculator voter_calc{max_voters};

    // Act: Remove nodes from a 2 DC cluster so that each DC has 1 node left.
    // - this will lead to 1 voter being removed from one DC
    // - the leader should be retained as a voter
    const std::array ids = {raft::server_id::create_random_id(), raft::server_id::create_random_id()};

    const service::group0_voter_calculator::nodes_list_t nodes = {
            // The result nodes (just one per DC)
            {ids[0], {.datacenter = "dc-1", .rack = "rack", .is_voter = true, .is_alive = true, .is_leader = leader_node_idx == 0}},
            {ids[1], {.datacenter = "dc-2", .rack = "rack", .is_voter = true, .is_alive = true, .is_leader = leader_node_idx == 1}},
            // no other nodes - all removed
    };

    const auto& voters = voter_calc.distribute_voters(nodes);

    // Assert: The current leader is retained as a voter.
    // TODO (scylladb/scylladb#23266): This should be equal to 1, after we enforce the odd number of voters
    BOOST_CHECK_GE(voters.size(), 1);
    BOOST_CHECK(voters.contains(ids[leader_node_idx]));
}

BOOST_AUTO_TEST_SUITE_END()


@@ -15,7 +15,7 @@ from test.pylib.internal_types import ServerInfo
from test.pylib.manager_client import ManagerClient
from test.pylib.rest_client import read_barrier
from test.cluster.conftest import cluster_con, skip_mode
from test.cluster.util import get_current_group0_config
from test.cluster.util import get_coordinator_host_ids, get_current_group0_config
GROUP0_VOTERS_LIMIT = 5
@@ -176,3 +176,60 @@ async def test_raft_limited_voters_upgrade(manager: ManagerClient):
    num_voters = await get_number_of_voters(manager, servers[0])
    assert num_voters == GROUP0_VOTERS_LIMIT, f"The number of voters should be limited to {GROUP0_VOTERS_LIMIT} (but is {num_voters})"


@pytest.mark.asyncio
async def test_raft_limited_voters_retain_coordinator(manager: ManagerClient):
    """
    Test that the topology coordinator is retained as a voter when possible.

    This test ensures that if a voter needs to be removed and the topology coordinator
    is among the candidates for removal, the coordinator is prioritized to remain a voter.

    Arrange:
    - Create 2 DCs with 3 nodes each.
    - Retrieve the current topology coordinator.
    Act:
    - Add a third DC with 1 node.
    - This will require removing one voter from either DC1 or DC2.
    Assert:
    - Verify that the topology coordinator is retained as a voter.
    """
    # Arrange: Create 2 DCs with 3 nodes each
    dc_setup = [
        {
            'property_file': {'dc': 'dc1', 'rack': 'rack1'},
            'num_nodes': 3,
        },
        {
            'property_file': {'dc': 'dc2', 'rack': 'rack2'},
            'num_nodes': 3,
        },
    ]
    dc_servers = []
    for dc in dc_setup:
        logging.info(
            f"Creating {dc['property_file']['dc']} with {dc['num_nodes']} nodes")
        dc_servers.append(await manager.servers_add(dc['num_nodes'],
                                                    property_file=dc['property_file']))
    assert len(dc_servers) == len(dc_setup)

    coordinator_ids = await get_coordinator_host_ids(manager)
    assert coordinator_ids, "At least 1 coordinator id should be found"
    coordinator_id = coordinator_ids[0]

    # Act: Add a new DC with one node
    logging.info('Adding a new DC with one node')
    await manager.servers_add(1, property_file={'dc': 'dc3', 'rack': 'rack3'})

    # Assert: Verify that the topology coordinator is still a voter
    group0_members = await get_current_group0_config(manager, dc_servers[0][0])
    assert any(m[0] == coordinator_id and m[1] for m in group0_members), \
        f"The coordinator {coordinator_id} should be a voter (but is not)"