From 1fefe597e60f7ffa58d97fb1b9cbc276958bb488 Mon Sep 17 00:00:00 2001 From: "Raphael S. Carvalho" Date: Wed, 5 Jul 2023 11:05:04 -0300 Subject: [PATCH] sstables: Close SSTable reader if index exhaustion is detected in fast forward call When wiring multi range reader with cleanup, I found that cleanup wouldn't be able to release disk space of input SSTables earlier. The reason is that multi range reader fast forward to the next range, therefore it enables mutation_reader::forwarding, and as a result, combined reader cannot release readers proactively as it cannot tell for sure that the underlying reader is exhausted. It may have reached EOS for the current range, but it may have data for the next one. The concept of EOS actually only applies to the current range being read. A reader that returned EOS will actually get out of this state once the combined reader fast forward to the next range. Therefore, only the underlying reader, i.e. the sstable reader, can for certain know that the data source is completely exhausted, given that tokens are read in monotonically increasing order. For reversed reads, that's not true but fast forward to range is not actually supported yet for it. Today, the SSTable reader already knows that the underlying SSTable was exhausted in fast_forward_to(), after it call index_reader's advance_to(partition_range), therefore it disables subsequent reads. We can take a step further and also check that the index was exhausted, i.e. reached EOF. So if the index is exhausted, and there's no partition to read after the fast_forward_to() call, we know that there's nothing left to do in this reader, and therefore the reader can be closed proactively, allowing the disk space of SSTable to be reclaimed if it was already deleted. We can see that the combined reader, under multi range reader, will incrementally find a set of disjoint SSTable exhausted, as it fast foward to owned ranges 1: INFO 2023-07-05 10:51:09,570 [shard 0] mutation_reader - flat_multi_range_mutation_reader(): fast forwarding to range [{-4525396453480898112, start},{-4525396453480898112, end}] INFO 2023-07-05 10:51:09,570 [shard 0] sstable - sstable /tmp/scylla-9831a31a-66f3-4541-8681-000ac8e21bbb/me-1-big-Data.db, start == *end, eof ? true INFO 2023-07-05 10:51:09,570 [shard 0] sstable - closing reader 0x60100029d800 for /tmp/scylla-9831a31a-66f3-4541-8681-000ac8e21bbb/me-1-big-Data.db INFO 2023-07-05 10:51:09,570 [shard 0] sstable - sstable /tmp/scylla-9831a31a-66f3-4541-8681-000ac8e21bbb/me-3-big-Data.db, start == *end, eof ? false INFO 2023-07-05 10:51:09,570 [shard 0] sstable - sstable /tmp/scylla-9831a31a-66f3-4541-8681-000ac8e21bbb/me-4-big-Data.db, start == *end, eof ? false INFO 2023-07-05 10:51:09,570 [shard 0] sstable - sstable /tmp/scylla-9831a31a-66f3-4541-8681-000ac8e21bbb/me-5-big-Data.db, start == *end, eof ? false INFO 2023-07-05 10:51:09,570 [shard 0] sstable - sstable /tmp/scylla-9831a31a-66f3-4541-8681-000ac8e21bbb/me-6-big-Data.db, start == *end, eof ? false INFO 2023-07-05 10:51:09,570 [shard 0] sstable - sstable /tmp/scylla-9831a31a-66f3-4541-8681-000ac8e21bbb/me-7-big-Data.db, start == *end, eof ? false INFO 2023-07-05 10:51:09,570 [shard 0] sstable - sstable /tmp/scylla-9831a31a-66f3-4541-8681-000ac8e21bbb/me-8-big-Data.db, start == *end, eof ? false INFO 2023-07-05 10:51:09,570 [shard 0] sstable - sstable /tmp/scylla-9831a31a-66f3-4541-8681-000ac8e21bbb/me-9-big-Data.db, start == *end, eof ? false INFO 2023-07-05 10:51:09,570 [shard 0] sstable - sstable /tmp/scylla-9831a31a-66f3-4541-8681-000ac8e21bbb/me-10-big-Data.db, start == *end, eof ? false 2: INFO 2023-07-05 10:51:09,572 [shard 0] mutation_reader - flat_multi_range_mutation_reader(): fast forwarding to range [{-2253424581619911583, start},{-2253424581619911583, end}] INFO 2023-07-05 10:51:09,572 [shard 0] sstable - sstable /tmp/scylla-9831a31a-66f3-4541-8681-000ac8e21bbb/me-2-big-Data.db, start == *end, eof ? true INFO 2023-07-05 10:51:09,572 [shard 0] sstable - closing reader 0x60100029d400 for /tmp/scylla-9831a31a-66f3-4541-8681-000ac8e21bbb/me-2-big-Data.db INFO 2023-07-05 10:51:09,572 [shard 0] sstable - sstable /tmp/scylla-9831a31a-66f3-4541-8681-000ac8e21bbb/me-4-big-Data.db, start == *end, eof ? false INFO 2023-07-05 10:51:09,572 [shard 0] sstable - sstable /tmp/scylla-9831a31a-66f3-4541-8681-000ac8e21bbb/me-5-big-Data.db, start == *end, eof ? false INFO 2023-07-05 10:51:09,572 [shard 0] sstable - sstable /tmp/scylla-9831a31a-66f3-4541-8681-000ac8e21bbb/me-6-big-Data.db, start == *end, eof ? false INFO 2023-07-05 10:51:09,572 [shard 0] sstable - sstable /tmp/scylla-9831a31a-66f3-4541-8681-000ac8e21bbb/me-7-big-Data.db, start == *end, eof ? false INFO 2023-07-05 10:51:09,572 [shard 0] sstable - sstable /tmp/scylla-9831a31a-66f3-4541-8681-000ac8e21bbb/me-8-big-Data.db, start == *end, eof ? false INFO 2023-07-05 10:51:09,572 [shard 0] sstable - sstable /tmp/scylla-9831a31a-66f3-4541-8681-000ac8e21bbb/me-9-big-Data.db, start == *end, eof ? false INFO 2023-07-05 10:51:09,572 [shard 0] sstable - sstable /tmp/scylla-9831a31a-66f3-4541-8681-000ac8e21bbb/me-10-big-Data.db, start == *end, eof ? false And so on. Signed-off-by: Raphael S. Carvalho --- readers/mutation_readers.cc | 1 + sstables/mx/reader.cc | 22 +++++++++++++++++++++- 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/readers/mutation_readers.cc b/readers/mutation_readers.cc index d109902356..5f6d3c8680 100644 --- a/readers/mutation_readers.cc +++ b/readers/mutation_readers.cc @@ -532,6 +532,7 @@ public: return make_ready_future<>(); } if (auto r = next()) { + mrlog.trace("flat_multi_range_mutation_reader {}: fast forwarding to range {}", fmt::ptr(this), *r); return _reader.fast_forward_to(*r); } else { _end_of_stream = true; diff --git a/sstables/mx/reader.cc b/sstables/mx/reader.cc index 11007fc30c..88595172b2 100644 --- a/sstables/mx/reader.cc +++ b/sstables/mx/reader.cc @@ -1511,6 +1511,9 @@ private: } }); } + bool has_sstable_attached() const noexcept { + return bool(_sst); + } bool is_initialized() const { return bool(_context); } @@ -1519,6 +1522,14 @@ private: if (is_initialized()) { co_return true; } + // If the reader has no SSTable attached, the reader was proactively closed in the + // context of fast-forward calls. The higher level code has no way to know that + // underlying reader is really exhausted, so reader is responsible for releasing + // its resources beforehand. From there on, the reader has the same semantics + // as that of an empty reader. + if (!has_sstable_attached()) { + co_return false; + } if (_single_partition_read) { _sst->get_stats().on_single_partition_read(); const auto& key = dht::ring_position_view(_pr.start()->value()); @@ -1612,6 +1623,15 @@ public: } _index_in_current_partition = false; _read_enabled = false; + if (_index_reader->eof()) { + // Close the SSTable reader proactively, if the index is completely exhausted + // and no partition was found in the current fast-forward call. This allows + // disk space of SSTables to be reclaimed earlier if they take part in a + // long-living read and they're deleted midway. + sstlog.trace("Closing reader {} for {} after fast-forward call found that index reached EOF and there's nothing left to read", + fmt::ptr(this), _sst->get_filename()); + return close(); + } return make_ready_future<>(); }); } @@ -1699,7 +1719,7 @@ public: close_index_reader = _index_reader->close().finally([_ = std::move(_index_reader)] {}); } - return when_all_succeed(std::move(close_context), std::move(close_index_reader)).discard_result().handle_exception([] (std::exception_ptr ep) { + return when_all_succeed(std::move(close_context), std::move(close_index_reader)).discard_result().handle_exception([sst = std::move(_sst)] (std::exception_ptr ep) { // close can not fail as it is called either from the destructor or from flat_mutation_reader::close sstlog.warn("Failed closing of sstable_mutation_reader: {}. Ignored since the reader is already done.", ep); });