From df152d9824811f25742ecfda68bf3a93c956be09 Mon Sep 17 00:00:00 2001 From: Aleksandra Martyniuk Date: Fri, 16 May 2025 14:44:29 +0200 Subject: [PATCH] repair: postpone repair until topology is not busy Currently, repair_service::repair_tablets starts repair if there are no ongoing tablet operations. The check does not consider global topology operations, like tablet resize finalization. This may cause a data race and unexpected behavior. Start repair when topology is not busy. --- repair/repair.cc | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/repair/repair.cc b/repair/repair.cc index b0e1e91c3b..d57746ac64 100644 --- a/repair/repair.cc +++ b/repair/repair.cc @@ -2292,11 +2292,15 @@ future<> repair_service::repair_tablets(repair_uniq_id rid, sstring keyspace_nam while (true) { _repair_module->check_in_shutdown(); erm = t->get_effective_replication_map(); + auto local_version = erm->get_token_metadata().get_version(); const locator::tablet_map& tmap = erm->get_token_metadata_ptr()->tablets().get_tablet_map(tid); - if (!tmap.has_transitions()) { + if (!tmap.has_transitions() && co_await container().invoke_on(0, [local_version] (repair_service& rs) { + // We need to ensure that there is no ongoing global request. + return local_version == rs._tsm.local()._topology.version && !rs._tsm.local()._topology.is_busy(); + })) { break; } - rlogger.info("repair[{}] Table {}.{} has tablet transitions, waiting for topology to quiesce", rid.uuid(), keyspace_name, table_name); + rlogger.info("repair[{}] Topology is busy, waiting for it to quiesce", rid.uuid()); erm = nullptr; co_await container().invoke_on(0, [] (repair_service& rs) { return rs._tsm.local().await_not_busy();