repair: postpone repair until topology is not busy

Currently, repair_service::repair_tablets starts repair if there
are no ongoing tablet operations. The check does not consider global
topology operations, like tablet resize finalization. This may cause
a data race and unexpected behavior.

Start repair only when the topology is not busy.
This commit is contained in:
Aleksandra Martyniuk
2025-05-16 14:44:29 +02:00
parent 0ade15df33
commit df152d9824

View File

@@ -2292,11 +2292,15 @@ future<> repair_service::repair_tablets(repair_uniq_id rid, sstring keyspace_nam
while (true) {
_repair_module->check_in_shutdown();
erm = t->get_effective_replication_map();
auto local_version = erm->get_token_metadata().get_version();
const locator::tablet_map& tmap = erm->get_token_metadata_ptr()->tablets().get_tablet_map(tid);
if (!tmap.has_transitions()) {
if (!tmap.has_transitions() && co_await container().invoke_on(0, [local_version] (repair_service& rs) {
// We need to ensure that there is no ongoing global request.
return local_version == rs._tsm.local()._topology.version && !rs._tsm.local()._topology.is_busy();
})) {
break;
}
rlogger.info("repair[{}] Table {}.{} has tablet transitions, waiting for topology to quiesce", rid.uuid(), keyspace_name, table_name);
rlogger.info("repair[{}] Topology is busy, waiting for it to quiesce", rid.uuid());
erm = nullptr;
co_await container().invoke_on(0, [] (repair_service& rs) {
return rs._tsm.local().await_not_busy();