From 95d4206585c3812bca22ebc80ede75f1058f34f6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Patryk=20J=C4=99drzejczak?= Date: Wed, 15 Oct 2025 16:37:08 +0200 Subject: [PATCH 1/3] raft topology: disable schema pulls in the Raft-based recovery procedure Schema pulls should always be disabled when group 0 is used. However, `migration_manager::disable_schema_pulls()` is never called during a restart with `recovery_leader` set in the Raft-based recovery procedure, which causes schema pulls to be re-enabled on all live nodes (excluding the nodes replacing the dead nodes). Moreover, schema pulls remain enabled on each node until the node is restarted, which could be a very long time. The old gossip-based recovery procedure doesn't have this problem because we disable schema pulls after completing the upgrade-to-group0 procedure, which is a part of the old recovery procedure. Fixes #26569 (cherry picked from commit ec3a35303d7916cf87dfed0998bf18c44d43c4a1) --- service/raft/raft_group0.cc | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/service/raft/raft_group0.cc b/service/raft/raft_group0.cc index d11d5326f2..df8662126a 100644 --- a/service/raft/raft_group0.cc +++ b/service/raft/raft_group0.cc @@ -745,7 +745,10 @@ future<> raft_group0::setup_group0_if_exist(db::system_keyspace& sys_ks, service } else { // We'll disable them once we complete the upgrade procedure. } - } else if (!qp.db().get_config().recovery_leader.is_set()) { + } else if (qp.db().get_config().recovery_leader.is_set()) { + group0_log.info("Disabling migration_manager schema pulls in the Raft-based recovery procedure"); + co_await mm.disable_schema_pulls(); + } else { // Scylla has bootstrapped earlier but group 0 ID is not present and we are not recovering from majority loss // using the Raft-based procedure. This means we're upgrading. // Upgrade will start through a feature listener created after we enter NORMAL state. 
From cd0bb11eef367fde1c07589d55c36968c5ae33ec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Patryk=20J=C4=99drzejczak?= Date: Wed, 15 Oct 2025 16:43:11 +0200 Subject: [PATCH 2/3] test: verify that schema pulls are disabled in the Raft-based recovery procedure We do this at the end of `test_raft_recovery_entry_loss`. It's not worth adding a separate regression test, as tests of the recovery procedure are complicated and have a long running time. Also, we choose `test_raft_recovery_entry_loss` out of all tests of the recovery procedure because it does some schema changes. (cherry picked from commit da8748e2b1bad5a7a1d645e80a5ddb8e05587b42) --- test/cluster/test_raft_recovery_entry_loss.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/test/cluster/test_raft_recovery_entry_loss.py b/test/cluster/test_raft_recovery_entry_loss.py index e5e1d6dde4..ad6057a1d6 100644 --- a/test/cluster/test_raft_recovery_entry_loss.py +++ b/test/cluster/test_raft_recovery_entry_loss.py @@ -39,6 +39,9 @@ async def test_raft_recovery_entry_lose(manager: ManagerClient): 5. Check that node 1 has moved its group 0 state to v2. 6. Remove nodes 3-5 from topology using the standard removenode procedure. 7. Add a new node (a sanity check verifying that the cluster is functioning properly). + + Additionally, verify that no schema pulls take place during the recovery procedure at the end of the test. This is + a regression test for https://github.com/scylladb/scylladb/issues/26569. 
""" logging.info('Adding initial servers') servers = await manager.servers_add(5) @@ -158,10 +161,17 @@ async def test_raft_recovery_entry_lose(manager: ManagerClient): logging.info('Adding a new server') new_server = await manager.server_add() + live_servers.append(new_server) - hosts = await wait_for_cql_and_get_hosts(cql, live_servers + [new_server], time.time() + 60) + hosts = await wait_for_cql_and_get_hosts(cql, live_servers, time.time() + 60) logging.info(f'Performing consistency checks after adding {new_server}') await wait_for_cdc_generations_publishing(cql, hosts, time.time() + 60) await check_token_ring_and_group0_consistency(manager) await check_system_topology_and_cdc_generations_v3_consistency(manager, hosts, ignored_hosts=dead_hosts) + + logging.info(f'Checking that there were no schema pulls on {live_servers}') + log_files = await asyncio.gather(*[manager.server_open_log(srv.server_id) for srv in live_servers]) + for log_file in log_files: + matches = await log_file.grep('Requesting schema pull') + await log_file.grep('Pulling schema') + assert not matches From 323a7b8c557221fa5c0ce0bf29c44dcae03b0037 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Patryk=20J=C4=99drzejczak?= Date: Wed, 15 Oct 2025 16:49:40 +0200 Subject: [PATCH 3/3] test: test_raft_recovery_entry_loss: fix the typo in the test case name (cherry picked from commit 71de01cd41d9260e613d2c16a0bc4b7c4b964945) --- test/cluster/test_raft_recovery_entry_loss.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/cluster/test_raft_recovery_entry_loss.py b/test/cluster/test_raft_recovery_entry_loss.py index ad6057a1d6..3bace25af1 100644 --- a/test/cluster/test_raft_recovery_entry_loss.py +++ b/test/cluster/test_raft_recovery_entry_loss.py @@ -19,7 +19,7 @@ from test.cluster.test_group0_schema_versioning import get_group0_schema_version @pytest.mark.asyncio -async def test_raft_recovery_entry_lose(manager: ManagerClient): +async def test_raft_recovery_entry_loss(manager: 
ManagerClient): """ Test that the Raft-based recovery procedure works correctly if some committed group 0 entry has been permanently lost (it has been committed only by dead nodes).