From 3ebb124eb2d843f2faf2255a0bfc2bd267d383ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Botond=20D=C3=A9nes?= Date: Thu, 19 Sep 2024 07:13:17 -0400 Subject: [PATCH] repair/row_level: remove reader timeout This timeout was added to catch reader-related deadlocks. We have not seen such deadlocks for a long time, but we did see false timeouts caused by it (see the explanation below). Since the cost now outweighs the benefit, remove the timeout altogether. The false timeout happens during mixed-shard repair. `reader_permit::set_timeout()` is called on the top-level permit that repair holds a handle on. In the case of the mixed-shard repair, this belongs to the multishard reader. Calling set_timeout() on the multishard reader has no effect on the actual shard readers, except in one case: when the shard reader is created, it inherits the multishard reader's current timeout. As the shard reader can be alive for a long time, this timeout is not refreshed and ultimately causes a timeout and fails the repair. Refs: #18269 Closes scylladb/scylladb#20703 --- repair/row_level.cc | 6 ------ 1 file changed, 6 deletions(-) diff --git a/repair/row_level.cc b/repair/row_level.cc index cb869d220c..2456d69b81 100644 --- a/repair/row_level.cc +++ b/repair/row_level.cc @@ -349,11 +349,6 @@ repair_reader::repair_reader( future repair_reader::read_mutation_fragment() { ++_reads_issued; - // Use a very long timeout for the reader to break out any eventual - // deadlock within the reader. Thirty minutes should be more than - // enough to read a single mutation fragment. - auto timeout = db::timeout_clock::now() + std::chrono::minutes(30); - _reader.set_timeout(timeout); // reset to db::no_timeout in pause() return _reader().then_wrapped([this] (future f) { try { auto mfopt = f.get(); @@ -397,7 +392,6 @@ void repair_reader::check_current_dk() { } void repair_reader::pause() { - _reader.set_timeout(db::no_timeout); if (_reader_handle) { _reader_handle->pause(); }