Revert "repair: plug in waiting for hints to be sent before repair"

This reverts commit 49f4a2f968. Waiting for hints to be replayed before repair is not always a good idea. For example, someone might want to repair only a small token range or just one table, but hinted handoff cannot replay hints that selectively. Waiting for hints before repair caused a small number of regressions (#8612, #8831). This commit removes the logic in repair that made it wait for hints. Additionally, the `storage_proxy.hh` include, which was introduced in the commit being reverted, is also removed, and the smaller header files `gossiper.hh` and `fb_utilities.hh` are included instead.
2026-05-30 03:30:49 +00:00 · 2021-05-27 16:09:25 +02:00
parent e3c32c897a
commit ecf854affc
1 changed files with 2 additions and 25 deletions
--- a/repair/repair.cc
+++ b/repair/repair.cc
@@ -26,7 +26,8 @@
 #include "dht/sharder.hh"
 #include "streaming/stream_reason.hh"
 #include "gms/inet_address.hh"
-#include "service/storage_proxy.hh"
+#include "utils/fb_utilities.hh"
+#include "gms/gossiper.hh"
 #include "service/priority_manager.hh"
 #include "message/messaging_service.hh"
 #include "sstables/sstables.hh"
@@ -1015,21 +1016,6 @@ static future<> repair_ranges(lw_shared_ptr<repair_info> ri) {
    });
 }

-static future<> try_wait_for_hints_to_be_replayed(repair_uniq_id id, std::vector<gms::inet_address> source_nodes, std::vector<gms::inet_address> target_nodes) {
-    auto get_elapsed_seconds = [start_time = lowres_clock::now()] {
-        return std::chrono::duration_cast<std::chrono::seconds>(lowres_clock::now() - start_time).count();
-    };
-    auto& sp = service::get_local_storage_proxy();
-    rlogger.info("repair id {}: started replaying hints before repair, source nodes: {}, target nodes: {}", id, source_nodes, target_nodes);
-    try {
-        co_await sp.wait_for_hints_to_be_replayed(id.uuid, std::move(source_nodes), std::move(target_nodes));
-        rlogger.info("repair id {}: finished replaying hints (took {}s), continuing with repair", id, get_elapsed_seconds());
-    } catch (...) {
-        rlogger.warn("repair id {}: failed to replay hints before repair (took {}s): {}, the repair will continue", id, get_elapsed_seconds(), std::current_exception());
-    }
-    co_return;
-}
-
 // repair_start() can run on any cpu; It runs on cpu0 the function
 // do_repair_start(). The benefit of always running that function on the same
 // CPU is that it allows us to keep some state (like a list of ongoing
@@ -1131,15 +1117,6 @@ int repair_service::do_repair_start(sstring keyspace, std::unordered_map<sstring
    (void)repair_tracker().run(id, [this, &db, id, keyspace = std::move(keyspace),
            cfs = std::move(cfs), ranges = std::move(ranges), options = std::move(options), ignore_nodes = std::move(ignore_nodes)] () mutable {
        auto participants = get_hosts_participating_in_repair(db.local(), keyspace, ranges, options.data_centers, options.hosts, ignore_nodes).get();
-
-        if (db.local().get_config().wait_for_hint_replay_before_repair()) {
-            auto waiting_nodes = db.local().get_token_metadata().get_all_endpoints();
-            std::erase_if(waiting_nodes, [&] (const auto& addr) {
-                return ignore_nodes.contains(addr);
-            });
-            try_wait_for_hints_to_be_replayed(id, std::move(waiting_nodes), participants).get();
-        }
-
        std::vector<future<>> repair_results;
        repair_results.reserve(smp::count);
        auto table_ids = get_table_ids(db.local(), keyspace, cfs);