mirror of
https://github.com/scylladb/scylladb.git
synced 2026-05-12 19:02:12 +00:00
Merge '[Backport 2025.4] raft topology: disable schema pulls in the Raft-based recovery procedure' from Scylladb[bot]
Schema pulls should always be disabled when group 0 is used. However, `migration_manager::disable_schema_pulls()` is never called during a restart with `recovery_leader` set in the Raft-based recovery procedure, which causes schema pulls to be re-enabled on all live nodes (excluding the nodes replacing the dead nodes). Moreover, schema pulls remain enabled on each node until the node is restarted, which could be a very long time. We fix this issue and add a regression test in this PR. Fixes #26569 This is an important bug fix, so it should be backported to all branches with the Raft-based recovery procedure (2025.2 and newer branches). - (cherry picked from commit ec3a35303d) - (cherry picked from commit da8748e2b1) - (cherry picked from commit 71de01cd41) Parent PR: #26572 Closes scylladb/scylladb#26599 * github.com:scylladb/scylladb: test: test_raft_recovery_entry_loss: fix the typo in the test case name test: verify that schema pulls are disabled in the Raft-based recovery procedure raft topology: disable schema pulls in the Raft-based recovery procedure
This commit is contained in:
@@ -745,7 +745,10 @@ future<> raft_group0::setup_group0_if_exist(db::system_keyspace& sys_ks, service
|
||||
} else {
|
||||
// We'll disable them once we complete the upgrade procedure.
|
||||
}
|
||||
} else if (!qp.db().get_config().recovery_leader.is_set()) {
|
||||
} else if (qp.db().get_config().recovery_leader.is_set()) {
|
||||
group0_log.info("Disabling migration_manager schema pulls in the Raft-based recovery procedure");
|
||||
co_await mm.disable_schema_pulls();
|
||||
} else {
|
||||
// Scylla has bootstrapped earlier but group 0 ID is not present and we are not recovering from majority loss
|
||||
// using the Raft-based procedure. This means we're upgrading.
|
||||
// Upgrade will start through a feature listener created after we enter NORMAL state.
|
||||
|
||||
@@ -19,7 +19,7 @@ from test.cluster.test_group0_schema_versioning import get_group0_schema_version
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_raft_recovery_entry_lose(manager: ManagerClient):
|
||||
async def test_raft_recovery_entry_loss(manager: ManagerClient):
|
||||
"""
|
||||
Test that the Raft-based recovery procedure works correctly if some committed group 0 entry has been permanently
|
||||
lost (it has been committed only by dead nodes).
|
||||
@@ -39,6 +39,9 @@ async def test_raft_recovery_entry_lose(manager: ManagerClient):
|
||||
5. Check that node 1 has moved its group 0 state to v2.
|
||||
6. Remove nodes 3-5 from topology using the standard removenode procedure.
|
||||
7. Add a new node (a sanity check verifying that the cluster is functioning properly).
|
||||
|
||||
Additionally, verify that no schema pulls take place during the recovery procedure at the end of the test. This is
|
||||
a regression test for https://github.com/scylladb/scylladb/issues/26569.
|
||||
"""
|
||||
logging.info('Adding initial servers')
|
||||
servers = await manager.servers_add(5)
|
||||
@@ -158,10 +161,17 @@ async def test_raft_recovery_entry_lose(manager: ManagerClient):
|
||||
|
||||
logging.info('Adding a new server')
|
||||
new_server = await manager.server_add()
|
||||
live_servers.append(new_server)
|
||||
|
||||
hosts = await wait_for_cql_and_get_hosts(cql, live_servers + [new_server], time.time() + 60)
|
||||
hosts = await wait_for_cql_and_get_hosts(cql, live_servers, time.time() + 60)
|
||||
|
||||
logging.info(f'Performing consistency checks after adding {new_server}')
|
||||
await wait_for_cdc_generations_publishing(cql, hosts, time.time() + 60)
|
||||
await check_token_ring_and_group0_consistency(manager)
|
||||
await check_system_topology_and_cdc_generations_v3_consistency(manager, hosts, ignored_hosts=dead_hosts)
|
||||
|
||||
logging.info(f'Checking that there were no schema pulls on {live_servers}')
|
||||
log_files = await asyncio.gather(*[manager.server_open_log(srv.server_id) for srv in live_servers])
|
||||
for log_file in log_files:
|
||||
matches = await log_file.grep('Requesting schema pull') + await log_file.grep('Pulling schema')
|
||||
assert not matches
|
||||
|
||||
Reference in New Issue
Block a user