From d78536437580538bca37b47758826e804d9f1051 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Chojnowski?= Date: Mon, 3 Oct 2022 18:50:19 +0200 Subject: [PATCH 1/3] cache: make all cache unlinks explicit Our LSA cache is implemented as an auto_unlink Boost intrusive list, meaning that elements of the list unlink themselves from the list automatically on destruction. Some parts of the code rely on that, and don't unlink them manually. However, this precludes accurate bookkeeping about the cache. Elements only have access to themselves and their neighbours, not to any bookkeeping context. Therefore, a destructor cannot update the relevant metadata. In this patch, we fix this by adding explicit unlink calls to places where it would be done by a destructor. In a following patch, we will add an assert to the destructor to check that every element is unlinked before destruction. --- cache_flat_mutation_reader.hh | 4 +++- db/cache_tracker.hh | 7 +++++-- mutation_partition.cc | 8 ++++---- sstables/partition_index_cache.hh | 4 +++- test/boost/mvcc_test.cc | 8 ++++++-- utils/cached_file.hh | 2 ++ 6 files changed, 23 insertions(+), 10 deletions(-) diff --git a/cache_flat_mutation_reader.hh b/cache_flat_mutation_reader.hh index 901cf0bf93..d5cccdc916 100644 --- a/cache_flat_mutation_reader.hh +++ b/cache_flat_mutation_reader.hh @@ -737,7 +737,9 @@ void cache_flat_mutation_reader::maybe_drop_last_entry() noexcept { && _snp->at_oldest_version()) { with_allocator(_snp->region().allocator(), [&] { - _last_row->on_evicted(_read_context.cache()._tracker); + cache_tracker& tracker = _read_context.cache()._tracker; + tracker.get_lru().remove(*_last_row); + _last_row->on_evicted(tracker); }); _last_row = nullptr; diff --git a/db/cache_tracker.hh b/db/cache_tracker.hh index 434072dcb7..c484a555c7 100644 --- a/db/cache_tracker.hh +++ b/db/cache_tracker.hh @@ -92,7 +92,7 @@ public: void insert(partition_entry&) noexcept; void insert(partition_version&) noexcept; void 
insert(rows_entry&) noexcept; - void on_remove() noexcept; + void remove(rows_entry&) noexcept; void clear_continuity(cache_entry& ce) noexcept; void on_partition_erase() noexcept; void on_partition_merge() noexcept; @@ -123,9 +123,12 @@ public: }; inline -void cache_tracker::on_remove() noexcept { +void cache_tracker::remove(rows_entry& entry) noexcept { --_stats.rows; ++_stats.row_removals; + if (entry.is_linked()) { + _lru.remove(entry); + } } inline diff --git a/mutation_partition.cc b/mutation_partition.cc index a1aa120a34..c0bf77c18a 100644 --- a/mutation_partition.cc +++ b/mutation_partition.cc @@ -336,7 +336,7 @@ stop_iteration mutation_partition::apply_monotonically(const schema& s, mutation ++app_stats.rows_dropped_by_tombstones; i = _rows.erase(i); if (tracker) { - tracker->on_remove(); + tracker->remove(e); } del(&e); } else { @@ -428,7 +428,7 @@ stop_iteration mutation_partition::apply_monotonically(const schema& s, mutation if (src_e.dummy()) { p_i = p._rows.erase(p_i); if (tracker) { - tracker->on_remove(); + tracker->remove(src_e); } del(&src_e); insert = false; @@ -448,9 +448,9 @@ stop_iteration mutation_partition::apply_monotonically(const schema& s, mutation // violating exception guarantees. 
src_e.set_continuous(false); if (tracker) { - tracker->on_remove(); // Newer evictable versions store complete rows i->replace_with(std::move(src_e)); + tracker->remove(src_e); } else { memory::on_alloc_point(); i->apply_monotonically(s, std::move(src_e)); @@ -2394,7 +2394,7 @@ stop_iteration mutation_partition::clear_gently(cache_tracker* tracker) noexcept auto end = _rows.end(); while (i != end) { if (tracker) { - tracker->on_remove(); + tracker->remove(*i); } i = _rows.erase_and_dispose(i, del); diff --git a/sstables/partition_index_cache.hh b/sstables/partition_index_cache.hh index 15046d9727..080de873c0 100644 --- a/sstables/partition_index_cache.hh +++ b/sstables/partition_index_cache.hh @@ -118,7 +118,7 @@ public: entry_ptr(const entry_ptr&) noexcept = default; entry_ptr& operator=(std::nullptr_t) noexcept { if (_ref) { - if (_ref.unique()) { + if (_ref.unique() && _ref->ready()) { _ref->_parent->_lru.add(*_ref); } _ref = nullptr; @@ -172,6 +172,7 @@ public: ~partition_index_cache() { with_allocator(_region.allocator(), [&] { _cache.clear_and_dispose([this] (entry* e) noexcept { + _lru.remove(*e); on_evicted(*e); }); }); @@ -260,6 +261,7 @@ public: if (i->is_referenced()) { ++i; } else { + _lru.remove(*i); on_evicted(*i); i = i.erase(key_less_comparator()); } diff --git a/test/boost/mvcc_test.cc b/test/boost/mvcc_test.cc index 3631c785ab..a0c17152fb 100644 --- a/test/boost/mvcc_test.cc +++ b/test/boost/mvcc_test.cc @@ -320,6 +320,7 @@ public: mvcc_partition(mvcc_partition&&) = default; ~mvcc_partition() { + evict(); with_allocator(region().allocator(), [&] { _e = {}; }); @@ -569,8 +570,6 @@ SEASTAR_TEST_CASE(test_eviction_with_active_reader) { BOOST_REQUIRE(cursor.continuous()); BOOST_REQUIRE(cursor.key().equal(s, ck1)); - e.evict(); - { logalloc::reclaim_lock rl(ms.region()); cursor.maybe_refresh(); @@ -769,6 +768,7 @@ SEASTAR_TEST_CASE(test_continuity_merging_in_evictable) { assert_that(s, actual2) .has_same_continuity(expected) 
.is_equal_to_compacted(expected); + e.evict(tracker.cleaner()); } }); }); @@ -940,6 +940,7 @@ SEASTAR_TEST_CASE(test_partition_snapshot_row_cursor) { BOOST_REQUIRE(eq(cur.position(), table.make_ckey(5))); BOOST_REQUIRE(cur.continuous()); } + e.evict(tracker.cleaner()); }); }); } @@ -1238,6 +1239,7 @@ SEASTAR_TEST_CASE(test_cursor_tracks_continuity_in_reversed_mode) { BOOST_REQUIRE(!cur.next()); } + e.evict(tracker.cleaner()); }); }); } @@ -1293,6 +1295,7 @@ SEASTAR_TEST_CASE(test_ensure_entry_in_latest_in_reversed_mode) { BOOST_REQUIRE(!res.inserted); } } + e.evict(tracker.cleaner()); }); }); } @@ -1344,6 +1347,7 @@ SEASTAR_TEST_CASE(test_ensure_entry_in_latest_does_not_set_continuity_in_reverse // the entry for ckey 2 in latest version should not be marked as continuous. BOOST_REQUIRE(!cur.continuous()); } + e.evict(tracker.cleaner()); }); }); } diff --git a/utils/cached_file.hh b/utils/cached_file.hh index 2de749001e..61890ed4f3 100644 --- a/utils/cached_file.hh +++ b/utils/cached_file.hh @@ -332,6 +332,7 @@ public: while (start != end) { if (start->is_linked()) { ++count; + _lru.remove(*start); on_evicted(*start); start = start.erase_and_dispose(disposer, page_idx_less_comparator()); } else { @@ -449,6 +450,7 @@ public: auto i = _cache.begin(); while (i != _cache.end()) { if (i->is_linked()) { + _lru.remove(*i); on_evicted(*i); i = i.erase(page_idx_less_comparator()); } else { From f340c9cca5dbbc230885d1fc5dd88945679ed3fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Chojnowski?= Date: Mon, 3 Oct 2022 19:12:57 +0200 Subject: [PATCH 2/3] utils: lru: remove unlink_from_lru() unlink_from_lru() allows for unlinking elements from the cache without notifying the cache. This messes up any potential cache bookkeeping. Fix this by replacing all uses of unlink_from_lru() with calls to lru::remove(), which does have access to the cache's metadata. 
--- row_cache.cc | 13 ++++++++----- sstables/partition_index_cache.hh | 4 +++- utils/cached_file.hh | 4 +++- utils/lru.hh | 4 ---- 4 files changed, 14 insertions(+), 11 deletions(-) diff --git a/row_cache.cc b/row_cache.cc index 47b6caa25b..6b1a1677f4 100644 --- a/row_cache.cc +++ b/row_cache.cc @@ -165,9 +165,10 @@ void cache_tracker::clear() { } void cache_tracker::touch(rows_entry& e) { - // last dummy may not be linked if evicted, but - // the unlink_from_lru() handles it - e.unlink_from_lru(); + // last dummy may not be linked if evicted + if (e.is_linked()) { + _lru.remove(e); + } _lru.add(e); } @@ -1096,7 +1097,10 @@ void row_cache::unlink_from_lru(const dht::decorated_key& dk) { if (i != _partitions.end()) { for (partition_version& pv : i->partition().versions_from_oldest()) { for (rows_entry& row : pv.partition().clustered_rows()) { - row.unlink_from_lru(); + // Last dummy may already be unlinked. + if (row.is_linked()) { + _tracker.get_lru().remove(row); + } } } } @@ -1231,7 +1235,6 @@ void rows_entry::on_evicted(cache_tracker& tracker) noexcept { // so don't remove it, just unlink from the LRU. // That dummy is linked in the LRU, because there may be partitions // with no regular rows, and we need to track them. - unlink_from_lru(); } else { // When evicting a dummy with both sides continuous we don't need to break continuity. 
// diff --git a/sstables/partition_index_cache.hh b/sstables/partition_index_cache.hh index 080de873c0..764979a25c 100644 --- a/sstables/partition_index_cache.hh +++ b/sstables/partition_index_cache.hh @@ -111,7 +111,9 @@ public: explicit entry_ptr(lsa::weak_ptr ref) : _ref(std::move(ref)) { - _ref->unlink_from_lru(); + if (_ref->is_linked()) { + _ref->_parent->_lru.remove(*_ref); + } } ~entry_ptr() { *this = nullptr; } entry_ptr(entry_ptr&&) noexcept = default; diff --git a/utils/cached_file.hh b/utils/cached_file.hh index 61890ed4f3..6348943fa6 100644 --- a/utils/cached_file.hh +++ b/utils/cached_file.hh @@ -79,7 +79,9 @@ private: // because it will not be linked in the LRU. ptr_type share() noexcept { if (_use_count++ == 0) { - unlink_from_lru(); + if (is_linked()) { + parent->_lru.remove(*this); + } } return std::unique_ptr(this); } diff --git a/utils/lru.hh b/utils/lru.hh index 7d4fb40d5b..01e2a99e0e 100644 --- a/utils/lru.hh +++ b/utils/lru.hh @@ -30,10 +30,6 @@ public: return _lru_link.is_linked(); } - void unlink_from_lru() { - _lru_link.unlink(); - } - void swap(evictable& o) noexcept { _lru_link.swap_nodes(o._lru_link); } From a96433d3a46d2b4e281fd47cff5667c8e7c88aaa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Chojnowski?= Date: Mon, 3 Oct 2022 19:18:11 +0200 Subject: [PATCH 3/3] utils: lru: assert that evictables are unlinked before destruction Previous patches introduce the assumption that evictables are manually unlinked before destruction, to allow for correct bookkeeping within the cache. This assert ensures that this assumption is correct. This is particularly important because the switch from automatic to explicit unlinking had to be done manually. Destructor calls are invisible, so it's possible that we have missed some automatic destruction site. 
--- utils/lru.hh | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/utils/lru.hh b/utils/lru.hh index 01e2a99e0e..89af7eed99 100644 --- a/utils/lru.hh +++ b/utils/lru.hh @@ -13,12 +13,24 @@ class evictable { friend class lru; + // For bookkeeping, we want the unlinking of evictables to be explicit. + // E.g. if the cache's internal data structure consists of multiple lists, we would + // like to know which list an element is being removed from. + // Therefore, we are using auto_unlink only to be able to call unlink() in the move constructor + // and we do NOT rely on automatic unlinking in _lru_link's destructor. + // It's the programmer's responsibility to call lru::remove on the evictable before its destruction. + // Failure to do so is a bug, and it will trigger an assertion in the destructor. using lru_link_type = boost::intrusive::list_member_hook< boost::intrusive::link_mode>; lru_link_type _lru_link; protected: // Prevent destruction via evictable pointer. LRU is not aware of allocation strategy. - ~evictable() = default; + // Prevent destruction of a linked evictable. While we could unlink the evictable here + // in the destructor, we can't perform proper accounting for that without access to the + // head of the containing list. + ~evictable() { + assert(!_lru_link.is_linked()); + } public: evictable() = default; evictable(evictable&& o) noexcept;