From 95d4206585c3812bca22ebc80ede75f1058f34f6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Patryk=20J=C4=99drzejczak?= Date: Wed, 15 Oct 2025 16:37:08 +0200 Subject: [PATCH 1/3] raft topology: disable schema pulls in the Raft-based recovery procedure Schema pulls should always be disabled when group 0 is used. However, `migration_manager::disable_schema_pulls()` is never called during a restart with `recovery_leader` set in the Raft-based recovery procedure, which causes schema pulls to be re-enabled on all live nodes (excluding the nodes replacing the dead nodes). Moreover, schema pulls remain enabled on each node until the node is restarted, which could be a very long time. The old gossip-based recovery procedure doesn't have this problem because we disable schema pulls after completing the upgrade-to-group0 procedure, which is a part of the old recovery procedure. Fixes #26569 (cherry picked from commit ec3a35303d7916cf87dfed0998bf18c44d43c4a1) --- service/raft/raft_group0.cc | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/service/raft/raft_group0.cc b/service/raft/raft_group0.cc index d11d5326f2..df8662126a 100644 --- a/service/raft/raft_group0.cc +++ b/service/raft/raft_group0.cc @@ -745,7 +745,10 @@ future<> raft_group0::setup_group0_if_exist(db::system_keyspace& sys_ks, service } else { // We'll disable them once we complete the upgrade procedure. } - } else if (!qp.db().get_config().recovery_leader.is_set()) { + } else if (qp.db().get_config().recovery_leader.is_set()) { + group0_log.info("Disabling migration_manager schema pulls in the Raft-based recovery procedure"); + co_await mm.disable_schema_pulls(); + } else { // Scylla has bootstrapped earlier but group 0 ID is not present and we are not recovering from majority loss // using the Raft-based procedure. This means we're upgrading. // Upgrade will start through a feature listener created after we enter NORMAL state. 
From cd0bb11eef367fde1c07589d55c36968c5ae33ec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Patryk=20J=C4=99drzejczak?= Date: Wed, 15 Oct 2025 16:43:11 +0200 Subject: [PATCH 2/3] test: verify that schema pulls are disabled in the Raft-based recovery procedure We do this at the end of `test_raft_recovery_entry_loss`. It's not worth adding a separate regression test, as tests of the recovery procedure are complicated and have a long running time. Also, we choose `test_raft_recovery_entry_loss` out of all tests of the recovery procedure because it does some schema changes. (cherry picked from commit da8748e2b1bad5a7a1d645e80a5ddb8e05587b42) --- test/cluster/test_raft_recovery_entry_loss.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/test/cluster/test_raft_recovery_entry_loss.py b/test/cluster/test_raft_recovery_entry_loss.py index e5e1d6dde4..ad6057a1d6 100644 --- a/test/cluster/test_raft_recovery_entry_loss.py +++ b/test/cluster/test_raft_recovery_entry_loss.py @@ -39,6 +39,9 @@ async def test_raft_recovery_entry_lose(manager: ManagerClient): 5. Check that node 1 has moved its group 0 state to v2. 6. Remove nodes 3-5 from topology using the standard removenode procedure. 7. Add a new node (a sanity check verifying that the cluster is functioning properly). + + Additionally, verify that no schema pulls take place during the recovery procedure at the end of the test. This is + a regression test for https://github.com/scylladb/scylladb/issues/26569. 
""" logging.info('Adding initial servers') servers = await manager.servers_add(5) @@ -158,10 +161,17 @@ async def test_raft_recovery_entry_lose(manager: ManagerClient): logging.info('Adding a new server') new_server = await manager.server_add() + live_servers.append(new_server) - hosts = await wait_for_cql_and_get_hosts(cql, live_servers + [new_server], time.time() + 60) + hosts = await wait_for_cql_and_get_hosts(cql, live_servers, time.time() + 60) logging.info(f'Performing consistency checks after adding {new_server}') await wait_for_cdc_generations_publishing(cql, hosts, time.time() + 60) await check_token_ring_and_group0_consistency(manager) await check_system_topology_and_cdc_generations_v3_consistency(manager, hosts, ignored_hosts=dead_hosts) + + logging.info(f'Checking that there were no schema pulls on {live_servers}') + log_files = await asyncio.gather(*[manager.server_open_log(srv.server_id) for srv in live_servers]) + for log_file in log_files: + matches = await log_file.grep('Requesting schema pull') + await log_file.grep('Pulling schema') + assert not matches From 323a7b8c557221fa5c0ce0bf29c44dcae03b0037 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Patryk=20J=C4=99drzejczak?= Date: Wed, 15 Oct 2025 16:49:40 +0200 Subject: [PATCH 3/3] test: test_raft_recovery_entry_loss: fix the typo in the test case name (cherry picked from commit 71de01cd41d9260e613d2c16a0bc4b7c4b964945) --- test/cluster/test_raft_recovery_entry_loss.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/cluster/test_raft_recovery_entry_loss.py b/test/cluster/test_raft_recovery_entry_loss.py index ad6057a1d6..3bace25af1 100644 --- a/test/cluster/test_raft_recovery_entry_loss.py +++ b/test/cluster/test_raft_recovery_entry_loss.py @@ -19,7 +19,7 @@ from test.cluster.test_group0_schema_versioning import get_group0_schema_version @pytest.mark.asyncio -async def test_raft_recovery_entry_lose(manager: ManagerClient): +async def test_raft_recovery_entry_loss(manager: 
ManagerClient): """ Test that the Raft-based recovery procedure works correctly if some committed group 0 entry has been permanently lost (it has been committed only by dead nodes).