Revert "repair: plug in waiting for hints to be sent before repair"

This reverts commit 49f4a2f968.

The idea to wait for hints to be replayed before repair is not always a
good one. For example, someone might want to repair a small token range
or just one table - but hinted handoff cannot selectively replay hints
like this.

The fact that we are waiting for hints before repair caused a small
number of regressions (#8612, #8831).

This commit removes the logic in repair which caused it to wait for
hints. Additionally, the `storage_proxy.hh` include, which was
introduced in the commit being reverted, is also removed, and smaller
header files are included instead (gossiper.hh and fb_utilities.hh).
This commit is contained in:
Piotr Dulikowski
2021-05-27 16:09:25 +02:00
parent e3c32c897a
commit ecf854affc

View File

@@ -26,7 +26,8 @@
#include "dht/sharder.hh"
#include "streaming/stream_reason.hh"
#include "gms/inet_address.hh"
#include "service/storage_proxy.hh"
#include "utils/fb_utilities.hh"
#include "gms/gossiper.hh"
#include "service/priority_manager.hh"
#include "message/messaging_service.hh"
#include "sstables/sstables.hh"
@@ -1015,21 +1016,6 @@ static future<> repair_ranges(lw_shared_ptr<repair_info> ri) {
});
}
// Best-effort wait for hints to be replayed before the repair identified by
// `id` proceeds.
//
// Logs the start of the wait, awaits storage_proxy's
// wait_for_hints_to_be_replayed() for the given source/target nodes, and logs
// how long it took (whole seconds, measured with lowres_clock).
//
// Any exception thrown while waiting is caught and logged as a warning; it is
// never propagated, so the returned future always resolves successfully and
// the caller can continue with the repair.
static future<> try_wait_for_hints_to_be_replayed(repair_uniq_id id, std::vector<gms::inet_address> source_nodes, std::vector<gms::inet_address> target_nodes) {
    const auto wait_started_at = lowres_clock::now();
    auto seconds_waited = [wait_started_at] {
        const auto elapsed = lowres_clock::now() - wait_started_at;
        return std::chrono::duration_cast<std::chrono::seconds>(elapsed).count();
    };

    auto& proxy = service::get_local_storage_proxy();

    rlogger.info("repair id {}: started replaying hints before repair, source nodes: {}, target nodes: {}", id, source_nodes, target_nodes);

    try {
        // The node lists are no longer needed locally once the wait starts,
        // so hand them over to the proxy.
        co_await proxy.wait_for_hints_to_be_replayed(id.uuid, std::move(source_nodes), std::move(target_nodes));
        rlogger.info("repair id {}: finished replaying hints (took {}s), continuing with repair", id, seconds_waited());
    } catch (...) {
        // Deliberately swallowed: a failed hint replay must not abort the repair.
        rlogger.warn("repair id {}: failed to replay hints before repair (took {}s): {}, the repair will continue", id, seconds_waited(), std::current_exception());
    }
}
// repair_start() can run on any cpu; it runs the function
// do_repair_start() on cpu0. The benefit of always running that function on the same
// CPU is that it allows us to keep some state (like a list of ongoing
@@ -1131,15 +1117,6 @@ int repair_service::do_repair_start(sstring keyspace, std::unordered_map<sstring
(void)repair_tracker().run(id, [this, &db, id, keyspace = std::move(keyspace),
cfs = std::move(cfs), ranges = std::move(ranges), options = std::move(options), ignore_nodes = std::move(ignore_nodes)] () mutable {
auto participants = get_hosts_participating_in_repair(db.local(), keyspace, ranges, options.data_centers, options.hosts, ignore_nodes).get();
if (db.local().get_config().wait_for_hint_replay_before_repair()) {
auto waiting_nodes = db.local().get_token_metadata().get_all_endpoints();
std::erase_if(waiting_nodes, [&] (const auto& addr) {
return ignore_nodes.contains(addr);
});
try_wait_for_hints_to_be_replayed(id, std::move(waiting_nodes), participants).get();
}
std::vector<future<>> repair_results;
repair_results.reserve(smp::count);
auto table_ids = get_table_ids(db.local(), keyspace, cfs);