From 6f1bba8faf7c8078924d07ed8b408102c08d849d Mon Sep 17 00:00:00 2001 From: Aleksandra Martyniuk Date: Tue, 17 Feb 2026 17:14:25 +0100 Subject: [PATCH 1/5] service: maybe_start_tablet_migration returns std::optional maybe_start_tablet_migration takes ownership of group0_guard and does not give it back, even if no work was done. In the following patches, we will proceed with different operations, if there are no migrations to be started. Thus, the guard would be needed. Return group0_guard from maybe_start_tablet_migration if no work was done. --- service/topology_coordinator.cc | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/service/topology_coordinator.cc b/service/topology_coordinator.cc index 459236aca9..610137e9a5 100644 --- a/service/topology_coordinator.cc +++ b/service/topology_coordinator.cc @@ -2583,8 +2583,10 @@ class topology_coordinator : public endpoint_lifecycle_subscriber } // If there is no other work, evaluate load and start tablet migration if there is imbalance. - if (co_await maybe_start_tablet_migration(std::move(guard))) { + if (auto guard_opt = co_await maybe_start_tablet_migration(std::move(guard)); !guard_opt) { co_return true; + } else { + guard = std::move(*guard_opt); } co_return false; } @@ -3790,11 +3792,11 @@ class topology_coordinator : public endpoint_lifecycle_subscriber // Returns the guard if no work done. Otherwise, performs a table migration and consumes the guard. future> maybe_migrate_system_tables(group0_guard guard); - // Returns true if the state machine was transitioned into tablet migration path. - future maybe_start_tablet_migration(group0_guard); + // Returns the guard if no work done. Otherwise, transitions the state machine into tablet migration path. + future> maybe_start_tablet_migration(group0_guard); - // Returns true if the state machine was transitioned into tablet resize finalization path. 
- future maybe_start_tablet_resize_finalization(group0_guard, const table_resize_plan& plan); + // Returns the guard if no work done. Otherwise, transitions the state machine into tablet resize finalization path. + future> maybe_start_tablet_resize_finalization(group0_guard, const table_resize_plan& plan); future<> refresh_tablet_load_stats(); future<> start_tablet_load_stats_refresher(); @@ -3900,14 +3902,14 @@ future> topology_coordinator::maybe_migrate_system_t co_return std::move(guard); } -future topology_coordinator::maybe_start_tablet_migration(group0_guard guard) { +future> topology_coordinator::maybe_start_tablet_migration(group0_guard guard) { rtlogger.debug("Evaluating tablet balance"); auto tm = get_token_metadata_ptr(); auto plan = co_await _tablet_allocator.balance_tablets(tm, &_topo_sm._topology, &_sys_ks, {}, get_dead_nodes()); if (plan.empty()) { rtlogger.debug("Tablet load balancer did not make any plan"); - co_return false; + co_return std::move(guard); } utils::chunked_vector updates; @@ -3927,15 +3929,15 @@ future topology_coordinator::maybe_start_tablet_migration(group0_guard gua .build()); co_await update_topology_state(std::move(guard), std::move(updates), "Starting tablet migration"); - co_return true; + co_return std::nullopt; } -future topology_coordinator::maybe_start_tablet_resize_finalization(group0_guard guard, const table_resize_plan& plan) { +future> topology_coordinator::maybe_start_tablet_resize_finalization(group0_guard guard, const table_resize_plan& plan) { if (plan.finalize_resize.empty()) { - co_return false; + co_return std::move(guard); } if (utils::get_local_injector().enter("tablet_split_finalization_postpone")) { - co_return false; + co_return std::move(guard); } auto resize_finalization_transition_state = [this] { @@ -3951,7 +3953,7 @@ future topology_coordinator::maybe_start_tablet_resize_finalization(group0 .build()); co_await update_topology_state(std::move(guard), std::move(updates), "Started tablet resize 
finalization"); - co_return true; + co_return std::nullopt; } future<> topology_coordinator::refresh_tablet_load_stats() { From 7951f922708d4597ed1c272afe1a8c5e9394d0f6 Mon Sep 17 00:00:00 2001 From: Aleksandra Martyniuk Date: Tue, 17 Feb 2026 17:28:20 +0100 Subject: [PATCH 2/5] service: retry failed tablet rebuilds RF change of tablet keyspace starts tablet rebuilds. Even if any of the rebuilds is rolled back (because pending replica was excluded), rf change request finishes successfully. In this case we end up with the state of the replicas that isn't compatible with the expected keyspace replication. After this change, if topology_coordinator has nothing to do, it proceeds to check if the state of replicas reflects the keyspace replication. If there are any mismatches, the tablet rebuilds are scheduled. All required rebuilds of a single keyspace are scheduled together without respecting the node's load (just as it happens in case of keyspace rf change). --- service/topology_coordinator.cc | 69 +++++++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) diff --git a/service/topology_coordinator.cc b/service/topology_coordinator.cc index 610137e9a5..1bd691d1fc 100644 --- a/service/topology_coordinator.cc +++ b/service/topology_coordinator.cc @@ -2588,6 +2588,10 @@ class topology_coordinator : public endpoint_lifecycle_subscriber } else { guard = std::move(*guard_opt); } + + if (co_await maybe_retry_failed_rf_change_tablet_rebuilds(std::move(guard))) { + co_return true; + } co_return false; } @@ -3798,6 +3802,9 @@ class topology_coordinator : public endpoint_lifecycle_subscriber // Returns the guard if no work done. Otherwise, transitions the state machine into tablet resize finalization path. future> maybe_start_tablet_resize_finalization(group0_guard, const table_resize_plan& plan); + // Returns true if the state machine was transitioned into tablet migration path. 
+ future maybe_retry_failed_rf_change_tablet_rebuilds(group0_guard guard); + future<> refresh_tablet_load_stats(); future<> start_tablet_load_stats_refresher(); @@ -3956,6 +3963,68 @@ future> topology_coordinator::maybe_start_tablet_res co_return std::nullopt; } +future topology_coordinator::maybe_retry_failed_rf_change_tablet_rebuilds(group0_guard guard) { + rtlogger.debug("Retrying failed rebuilds"); + + auto tmptr = get_token_metadata_ptr(); + utils::chunked_vector updates; + for (auto& ks_name : _db.get_tablets_keyspaces()) { + auto& ks = _db.find_keyspace(ks_name); + auto& strategy = ks.get_replication_strategy(); + auto tables_with_mvs = ks.metadata()->tables(); + auto views = ks.metadata()->views(); + tables_with_mvs.insert(tables_with_mvs.end(), views.begin(), views.end()); + for (const auto& table_or_mv : tables_with_mvs) { + if (!tmptr->tablets().is_base_table(table_or_mv->id())) { + continue; + } + + auto& tablet_map = tmptr->tablets().get_tablet_map(table_or_mv->id()); + auto new_tablet_map = co_await strategy.maybe_as_tablet_aware()->reallocate_tablets(table_or_mv, tmptr, co_await tablet_map.clone_gently()); + + replica::tablet_mutation_builder tablet_mutation_builder(guard.write_timestamp(), table_or_mv->id()); + co_await new_tablet_map.for_each_tablet([&](locator::tablet_id tablet_id, const locator::tablet_info& tablet_info) -> future<> { + auto& replicas = tablet_map.get_tablet_info(tablet_id).replicas; + auto it = std::find_if(tablet_info.replicas.begin(), tablet_info.replicas.end(), [&](const auto& replica) { + return std::find(replicas.begin(), replicas.end(), replica) == replicas.end(); + }); + if (it == tablet_info.replicas.end()) { + co_return; + } + auto new_replicas = replicas; + new_replicas.push_back(*it); + auto last_token = new_tablet_map.get_last_token(tablet_id); + updates.emplace_back(co_await make_canonical_mutation_gently( + replica::tablet_mutation_builder(guard.write_timestamp(), table_or_mv->id()) + .set_new_replicas(last_token, 
new_replicas) + .set_stage(last_token, locator::tablet_transition_stage::allow_write_both_read_old) + .set_transition(last_token, locator::choose_rebuild_transition_kind(_feature_service)) + .build() + )); + }); + } + + if (!updates.empty()) { + break; + } + } + + if (updates.empty()) { + rtlogger.debug("No failed RF change rebuilds to retry"); + co_return false; + } + + updates.emplace_back( + topology_mutation_builder(guard.write_timestamp()) + .set_transition_state(topology::transition_state::tablet_migration) + .set_version(_topo_sm._topology.version + 1) + .build()); + + sstring reason = "Retry failed tablet rebuilds"; + co_await update_topology_state(std::move(guard), std::move(updates), reason); + co_return true; +} + future<> topology_coordinator::refresh_tablet_load_stats() { auto tm = get_token_metadata_ptr(); From 200dc084c539ee143ee5546b194988fd24e17717 Mon Sep 17 00:00:00 2001 From: Aleksandra Martyniuk Date: Wed, 18 Feb 2026 13:57:03 +0100 Subject: [PATCH 3/5] service: fail ALTER KEYSPACE if replicas do not satisfy the replication RF change of tablet keyspace starts tablet rebuilds. Even if any of the rebuilds is rolled back (because pending replica was excluded), rf change request finishes successfully. Yet, we are left with not enough replicas. Then, a next new rf change request handler would generate a rebuild of two replicas of the same tablet. Such a transition would not be applied, as we don't allow many pending replicas. An exception would be thrown and the request would be retried infinitely, blocking the topology coordinator. Throw and fail rf change request if there are not enough replicas. The request should be retried later, after the issue is fixed by the mechanism introduced in previous changes. 
--- docs/cql/ddl.rst | 30 ++++++++++++++++++++++++++ service/topology_coordinator.cc | 11 +++++++++-- 2 files changed, 39 insertions(+), 2 deletions(-) diff --git a/docs/cql/ddl.rst b/docs/cql/ddl.rst index 4a36eedf0d..7ef3025208 100644 --- a/docs/cql/ddl.rst +++ b/docs/cql/ddl.rst @@ -437,6 +437,36 @@ To migrate a keyspace from a numeric replication factor to a rack-list replicati ALTER KEYSPACE Excelsior WITH replication = { 'class' : 'NetworkTopologyStrategy', 'dc1' : ['RAC1', 'RAC2', 'RAC3'], 'dc2' : ['RAC4']} AND tablets = { 'enabled': true }; +.. _fix-rf-change-tablet-rebuilds: + +Fixing invalid replica state with RF change +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +If a tablet rebuild fails during an RF change, the state of replicas will be invalid, even though the RF change is marked as successful. The missing replicas will be eventually added in the background. However, until then, the following RF changes will fail. + +To fix the state of replicas in the foreground, retry the previous ALTER KEYSPACE statement, i.e. update the replication factor to the same value it currently has. + +For example, if the following statement fails due to invalid replica state: + +.. code-block:: cql + + ALTER KEYSPACE Excelsior WITH replication = { 'class' : 'NetworkTopologyStrategy', 'dc1' : 3, 'dc2' : 1} AND tablets = { 'enabled': true }; + +Check the current replication factor with DESCRIBE KEYSPACE: + +.. code-block:: cql + + DESCRIBE KEYSPACE Excelsior; + CREATE KEYSPACE Excelsior WITH replication = { 'class' : 'NetworkTopologyStrategy', 'dc1' : 3, 'dc2' : 2} AND tablets = { 'enabled': true }; + +Ensure that reaching the valid replicas state is possible (e.g. there are enough non-excluded racks) and alter keyspace with the current replication factor: + +.. 
code-block:: cql + + ALTER KEYSPACE Excelsior WITH replication = { 'class' : 'NetworkTopologyStrategy', 'dc1' : 3, 'dc2' : 2} AND tablets = { 'enabled': true }; + +This should fix the state of replicas and allow future RF changes to succeed. + .. _drop-keyspace-statement: DROP KEYSPACE diff --git a/service/topology_coordinator.cc b/service/topology_coordinator.cc index 1bd691d1fc..74f4c7c282 100644 --- a/service/topology_coordinator.cc +++ b/service/topology_coordinator.cc @@ -1070,6 +1070,15 @@ class topology_coordinator : public endpoint_lifecycle_subscriber replica::tablet_mutation_builder tablet_mutation_builder(guard.write_timestamp(), table_or_mv->id()); co_await new_tablet_map.for_each_tablet([&](locator::tablet_id tablet_id, const locator::tablet_info& tablet_info) -> future<> { auto last_token = new_tablet_map.get_last_token(tablet_id); + auto old_tablet_info = old_tablets.get_tablet_info(last_token); + auto abandoning_replicas = locator::substract_sets(old_tablet_info.replicas, tablet_info.replicas); + auto new_replicas = locator::substract_sets(tablet_info.replicas, old_tablet_info.replicas); + if (abandoning_replicas.size() + new_replicas.size() > 1) { + throw std::runtime_error(fmt::format("Invalid state of a tablet {} of a table {}.{}. Expected replication factor: {}, but the tablet has replicas only on {}. 
" + "Try again later or use the \"Fixing invalid replica state with RF change\" procedure to fix the problem.", tablet_id, ks_name, table_or_mv->cf_name(), + ks.get_replication_strategy().get_replication_factor(*tmptr), old_tablet_info.replicas)); + } + updates.emplace_back(co_await make_canonical_mutation_gently( replica::tablet_mutation_builder(guard.write_timestamp(), table_or_mv->id()) .set_new_replicas(last_token, tablet_info.replicas) @@ -1079,8 +1088,6 @@ class topology_coordinator : public endpoint_lifecycle_subscriber )); // Calculate abandoning replica and abort view building tasks on them - auto old_tablet_info = old_tablets.get_tablet_info(last_token); - auto abandoning_replicas = locator::substract_sets(old_tablet_info.replicas, tablet_info.replicas); if (!abandoning_replicas.empty()) { if (abandoning_replicas.size() != 1) { on_internal_error(rtlogger, fmt::format("Keyspace RF abandons {} replicas for table {} and tablet id {}", abandoning_replicas.size(), table_or_mv->id(), tablet_id)); From 9ec54a820749ff3a5587e2d0fe4cb87d81e8f292 Mon Sep 17 00:00:00 2001 From: Aleksandra Martyniuk Date: Wed, 18 Feb 2026 15:34:34 +0100 Subject: [PATCH 4/5] test: add a test to ensure that failed rebuilds are retried --- test/cluster/test_tablets.py | 50 ++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/test/cluster/test_tablets.py b/test/cluster/test_tablets.py index a68e6d106b..355c9610a5 100644 --- a/test/cluster/test_tablets.py +++ b/test/cluster/test_tablets.py @@ -658,6 +658,56 @@ async def test_numeric_rf_to_rack_list_conversion_abort(request: pytest.FixtureR repl = await get_replication_options("ks1") assert repl['dc1'] == '1' +@pytest.mark.asyncio +@pytest.mark.skip_mode(mode='release', reason='error injections are not supported in release mode') +async def test_failed_tablet_rebuild_is_retried(request: pytest.FixtureRequest, manager: ManagerClient) -> None: + async def alter_keyspace(new_rf): + await cql.run_async(f"alter 
keyspace ks1 with replication = {{'class': 'NetworkTopologyStrategy', {new_rf}}};") + + injection = "rebuild_repair_stage_fail" + config = {"tablets_mode_for_new_keyspaces": "enabled", "error_injections_at_startup": [injection]} + cmdline = [ + '--logger-log-level', 'load_balancer=debug', + '--smp=2', + ] + + servers = [await manager.server_add(config=config, cmdline=cmdline, property_file={'dc': 'dc1', 'rack': 'rack1a'}), + await manager.server_add(config=config, cmdline=cmdline, property_file={'dc': 'dc1', 'rack': 'rack1b'}), + await manager.server_add(config=config, cmdline=cmdline, property_file={'dc': 'dc1', 'rack': 'rack1c'})] + + cql = manager.get_cql() + + await cql.run_async(f"create keyspace ks1 with replication = {{'class': 'NetworkTopologyStrategy', 'dc1': ['rack1a']}} and tablets = {{'initial': 4}};") + await cql.run_async("create table ks1.t (pk int primary key);") + await asyncio.gather(*[cql.run_async(f"INSERT INTO ks1.t (pk) VALUES ({pk});") for pk in range(16)]) + + coord = await get_topology_coordinator(manager) + coord_serv = await find_server_by_host_id(manager, servers, coord) + log = await manager.server_open_log(coord_serv.server_id) + mark = await log.mark() + + await alter_keyspace("'dc1': ['rack1a', 'rack1b']") + + await log.wait_for('updating topology state: Retry failed tablet rebuilds', from_mark=mark) + + failed = False + try: + await alter_keyspace("'dc1': ['rack1a', 'rack1b', 'rack1c']") + except Exception: + failed = True + assert failed + + [await manager.api.disable_injection(s.ip_addr, injection) for s in servers] + + log1 = await manager.server_open_log(coord_serv.server_id) + mark1 = await log1.mark() + + await log1.wait_for('No failed RF change rebuilds to retry', from_mark=mark1) + + await manager.api.quiesce_topology(coord_serv.ip_addr) + + await alter_keyspace("'dc1': ['rack1a', 'rack1b', 'rack1c']") + # Reproducer for https://github.com/scylladb/scylladb/issues/18110 # Check that an existing cached read, will be cleaned 
up when the tablet it reads # from is migrated away. From 166b293d06d33f1d3a3e954470ffb80ebe49591c Mon Sep 17 00:00:00 2001 From: Aleksandra Martyniuk Date: Tue, 17 Mar 2026 17:21:46 +0100 Subject: [PATCH 5/5] test: add test_failed_tablet_rebuild_is_retried_on_alter Test if alter keyspace statement with the current rf values will fix the state of replicas. --- service/topology_coordinator.cc | 5 ++++ test/cluster/test_tablets.py | 43 +++++++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+) diff --git a/service/topology_coordinator.cc b/service/topology_coordinator.cc index 74f4c7c282..233150b77a 100644 --- a/service/topology_coordinator.cc +++ b/service/topology_coordinator.cc @@ -3973,6 +3973,11 @@ future> topology_coordinator::maybe_start_tablet_res future topology_coordinator::maybe_retry_failed_rf_change_tablet_rebuilds(group0_guard guard) { rtlogger.debug("Retrying failed rebuilds"); + if (utils::get_local_injector().enter("maybe_retry_failed_rf_change_tablet_rebuilds_skip")) { + rtlogger.debug("Skipping retrying failed rebuilds due to error injection"); + co_return false; + } + auto tmptr = get_token_metadata_ptr(); utils::chunked_vector updates; for (auto& ks_name : _db.get_tablets_keyspaces()) { diff --git a/test/cluster/test_tablets.py b/test/cluster/test_tablets.py index 355c9610a5..caf9db69bc 100644 --- a/test/cluster/test_tablets.py +++ b/test/cluster/test_tablets.py @@ -708,6 +708,49 @@ async def test_failed_tablet_rebuild_is_retried(request: pytest.FixtureRequest, await alter_keyspace("'dc1': ['rack1a', 'rack1b', 'rack1c']") +@pytest.mark.asyncio +@pytest.mark.skip_mode(mode='release', reason='error injections are not supported in release mode') +async def test_failed_tablet_rebuild_is_retried_on_alter(manager: ManagerClient) -> None: + async def alter_keyspace(new_rf): + await cql.run_async(f"alter keyspace ks1 with replication = {{'class': 'NetworkTopologyStrategy', {new_rf}}};") + + fail_injection = "rebuild_repair_stage_fail" + 
skip_fix_injection = "maybe_retry_failed_rf_change_tablet_rebuilds_skip" + config = { + "tablets_mode_for_new_keyspaces": "enabled", + "error_injections_at_startup": [skip_fix_injection, fail_injection], + } + cmdline = [ + '--logger-log-level', 'load_balancer=debug', + '--smp=2', + ] + + servers = [await manager.server_add(config=config, cmdline=cmdline, property_file={'dc': 'dc1', 'rack': 'rack1a'}), + await manager.server_add(config=config, cmdline=cmdline, property_file={'dc': 'dc1', 'rack': 'rack1b'}), + await manager.server_add(config=config, cmdline=cmdline, property_file={'dc': 'dc1', 'rack': 'rack1c'})] + + cql = manager.get_cql() + + await cql.run_async(f"create keyspace ks1 with replication = {{'class': 'NetworkTopologyStrategy', 'dc1': ['rack1a']}} and tablets = {{'initial': 4}};") + await cql.run_async("create table ks1.t (pk int primary key);") + await asyncio.gather(*[cql.run_async(f"INSERT INTO ks1.t (pk) VALUES ({pk});") for pk in range(16)]) + + await alter_keyspace("'dc1': ['rack1a', 'rack1b']") + + tablet_replicas = await get_all_tablet_replicas(manager, servers[0], "ks1", "t") + assert len(tablet_replicas) == 4 + for r in tablet_replicas: + assert len(r.replicas) == 1 + + [await manager.api.disable_injection(s.ip_addr, fail_injection) for s in servers] + + await alter_keyspace("'dc1': ['rack1a', 'rack1b']") + + tablet_replicas = await get_all_tablet_replicas(manager, servers[0], "ks1", "t") + assert len(tablet_replicas) == 4 + for r in tablet_replicas: + assert len(r.replicas) == 2 + # Reproducer for https://github.com/scylladb/scylladb/issues/18110 # Check that an existing cached read, will be cleaned up when the tablet it reads # from is migrated away.