From 200dc084c539ee143ee5546b194988fd24e17717 Mon Sep 17 00:00:00 2001 From: Aleksandra Martyniuk Date: Wed, 18 Feb 2026 13:57:03 +0100 Subject: [PATCH] service: fail ALTER KEYSPACE if replicas do not satisfy the replication An RF change of a tablet keyspace starts tablet rebuilds. Even if any of the rebuilds is rolled back (because the pending replica was excluded), the RF change request finishes successfully. Yet we are left with too few replicas. The next RF change request handler would then generate a rebuild of two replicas of the same tablet. Such a transition would not be applied, as we don't allow multiple pending replicas. An exception would be thrown and the request would be retried indefinitely, blocking the topology coordinator. Throw and fail the RF change request if there are not enough replicas. The request should be retried later, after the issue is fixed by the mechanism introduced in previous changes. --- docs/cql/ddl.rst | 30 ++++++++++++++++++++++++++++++ service/topology_coordinator.cc | 11 +++++++++-- 2 files changed, 39 insertions(+), 2 deletions(-) diff --git a/docs/cql/ddl.rst b/docs/cql/ddl.rst index 4a36eedf0d..7ef3025208 100644 --- a/docs/cql/ddl.rst +++ b/docs/cql/ddl.rst @@ -437,6 +437,36 @@ To migrate a keyspace from a numeric replication factor to a rack-list replicati ALTER KEYSPACE Excelsior WITH replication = { 'class' : 'NetworkTopologyStrategy', 'dc1' : ['RAC1', 'RAC2', 'RAC3'], 'dc2' : ['RAC4']} AND tablets = { 'enabled': true }; +.. _fix-rf-change-tablet-rebuilds: + +Fixing invalid replica state with RF change +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +If a tablet rebuild fails during an RF change, the state of replicas will be invalid, even though the RF change is marked as successful. The missing replicas will eventually be added in the background. However, until then, subsequent RF changes will fail. + +To fix the state of replicas in the foreground, retry the previous ALTER KEYSPACE statement, i.e. 
update the replication factor to the same value it currently has. + +For example, if the following statement fails due to invalid replica state: + +.. code-block:: cql + + ALTER KEYSPACE Excelsior WITH replication = { 'class' : 'NetworkTopologyStrategy', 'dc1' : 3, 'dc2' : 1} AND tablets = { 'enabled': true }; + +Check the current replication factor with DESCRIBE KEYSPACE: + +.. code-block:: cql + + DESCRIBE KEYSPACE Excelsior; + CREATE KEYSPACE Excelsior WITH replication = { 'class' : 'NetworkTopologyStrategy', 'dc1' : 3, 'dc2' : 2} AND tablets = { 'enabled': true }; + +Ensure that reaching a valid replica state is possible (e.g. there are enough non-excluded racks), then alter the keyspace with the current replication factor: + +.. code-block:: cql + + ALTER KEYSPACE Excelsior WITH replication = { 'class' : 'NetworkTopologyStrategy', 'dc1' : 3, 'dc2' : 2} AND tablets = { 'enabled': true }; + +This should fix the state of replicas and allow future RF changes to succeed. + .. _drop-keyspace-statement: DROP KEYSPACE diff --git a/service/topology_coordinator.cc b/service/topology_coordinator.cc index 1bd691d1fc..74f4c7c282 100644 --- a/service/topology_coordinator.cc +++ b/service/topology_coordinator.cc @@ -1070,6 +1070,15 @@ class topology_coordinator : public endpoint_lifecycle_subscriber replica::tablet_mutation_builder tablet_mutation_builder(guard.write_timestamp(), table_or_mv->id()); co_await new_tablet_map.for_each_tablet([&](locator::tablet_id tablet_id, const locator::tablet_info& tablet_info) -> future<> { auto last_token = new_tablet_map.get_last_token(tablet_id); + auto old_tablet_info = old_tablets.get_tablet_info(last_token); + auto abandoning_replicas = locator::substract_sets(old_tablet_info.replicas, tablet_info.replicas); + auto new_replicas = locator::substract_sets(tablet_info.replicas, old_tablet_info.replicas); + if (abandoning_replicas.size() + new_replicas.size() > 1) { + throw std::runtime_error(fmt::format("Invalid state of a tablet {} of a 
table {}.{}. Expected replication factor: {}, but the tablet has replicas only on {}. " + "Try again later or use the \"Fixing invalid replica state with RF change\" procedure to fix the problem.", tablet_id, ks_name, table_or_mv->cf_name(), + ks.get_replication_strategy().get_replication_factor(*tmptr), old_tablet_info.replicas)); + } + updates.emplace_back(co_await make_canonical_mutation_gently( replica::tablet_mutation_builder(guard.write_timestamp(), table_or_mv->id()) .set_new_replicas(last_token, tablet_info.replicas) @@ -1079,8 +1088,6 @@ class topology_coordinator : public endpoint_lifecycle_subscriber )); // Calculate abandoning replica and abort view building tasks on them - auto old_tablet_info = old_tablets.get_tablet_info(last_token); - auto abandoning_replicas = locator::substract_sets(old_tablet_info.replicas, tablet_info.replicas); if (!abandoning_replicas.empty()) { if (abandoning_replicas.size() != 1) { on_internal_error(rtlogger, fmt::format("Keyspace RF abandons {} replicas for table {} and tablet id {}", abandoning_replicas.size(), table_or_mv->id(), tablet_id));