From f85cd289bcfca774e34794cd751a68af711f25fa Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 16 Jan 2022 18:22:29 +0200 Subject: [PATCH] Merge "repair: make sure there is one permit per repair with count res" from Botond " Repair obtains a permit for each repair-meta instance it creates. This permit is supposed to track all resources consumed by that repair as well as ensure concurrency limit is respected. However when the non-local reader path is used (shard config of master != shard config of follower), a second permit will be obtained -- for the shard reader of the multishard reader. This creates a situation where the repair-meta's permit can block the shard permit, creating a deadlock situation. This patch solves this by dropping the count resource on the repair-meta's permit when a non-local reader path is executed -- that is a multishard reader is created. Fixes: #9751 " * 'repair-double-permit-block/v4' of https://github.com/denesb/scylla: repair: make sure there is one permit per repair with count res reader_permit: add release_base_resource() (cherry picked from commit 52b7778ae69190adbaccb15364de22b1045b6399) --- reader_concurrency_semaphore.cc | 8 ++++++++ reader_permit.hh | 2 ++ repair/row_level.cc | 4 ++++ 3 files changed, 14 insertions(+) diff --git a/reader_concurrency_semaphore.cc b/reader_concurrency_semaphore.cc index 9fb18317c4..006139a265 100644 --- a/reader_concurrency_semaphore.cc +++ b/reader_concurrency_semaphore.cc @@ -249,6 +249,10 @@ public: return _base_resources; } + void release_base_resources() noexcept { + _semaphore.signal(std::exchange(_base_resources, {})); + } + sstring description() const { return format("{}.{}:{}", _schema ? _schema->ks_name() : "*", @@ -394,6 +398,10 @@ reader_resources reader_permit::base_resources() const { return _impl->base_resources(); } +void reader_permit::release_base_resources() noexcept { + return _impl->release_base_resources(); +} + sstring reader_permit::description() const { return _impl->description(); } diff --git a/reader_permit.hh b/reader_permit.hh index 8a0c85e659..6cefbda790 100644 --- a/reader_permit.hh +++ b/reader_permit.hh @@ -161,6 +161,8 @@ public: reader_resources base_resources() const; + void release_base_resources() noexcept; + sstring description() const; db::timeout_clock::time_point timeout() const noexcept; diff --git a/repair/row_level.cc b/repair/row_level.cc index 4a4e1af66b..02217ad5dc 100644 --- a/repair/row_level.cc +++ b/repair/row_level.cc @@ -407,6 +407,10 @@ public: {}, mutation_reader::forwarding::no); } else { + // We can't have two permits with count resource for 1 repair. + // So we release the one on _permit so the only one is the one the + // shard reader will obtain. + _permit.release_base_resources(); _reader = make_multishard_streaming_reader(db, _schema, _permit, [this] { auto shard_range = _sharder.next(); if (shard_range) {