From 7b3f55a65fbee2ee4a7df511cd08c977cefdcf28 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micha=C5=82=20Chojnowski?= <michal.chojnowski@scylladb.com>
Date: Thu, 4 Jul 2024 12:10:05 +0200
Subject: [PATCH] logalloc: add hold_reserve

mutation_partition_v2::apply_monotonically() needs to perform some allocations
in a destructor, to ensure that the invariants of the data structure are
restored before returning. But it is usually called with reclaiming disabled,
so the allocations might fail even in a perfectly healthy node with plenty of
reclaimable memory.

This patch adds a mechanism which makes it possible to reserve some LSA memory
(by asking the allocator to keep it unused) and to make it available for
allocation right when we need to guarantee allocation success.
---
 test/boost/logalloc_test.cc  | 67 ++++++++++++++++++++++++++++++++++++
 utils/allocation_strategy.hh | 28 +++++++++++++++
 utils/logalloc.cc            | 48 ++++++++++++++++++++++++++
 3 files changed, 143 insertions(+)
diff --git a/test/boost/logalloc_test.cc b/test/boost/logalloc_test.cc
index 16a7cf6161..fd40313ae9 100644
--- a/test/boost/logalloc_test.cc
+++ b/test/boost/logalloc_test.cc
@@ -519,6 +519,73 @@ SEASTAR_TEST_CASE(test_zone_reclaiming_preserves_free_size) {
     });
 }
 
+// Tests the intended usage of hold_reserve.
+//
+// Sets up a reserve, exhausts memory, opens the reserve,
+// checks that this allows us to do multiple additional allocations
+// without failing.
+SEASTAR_THREAD_TEST_CASE(test_hold_reserve) {
+    logalloc::region region;
+    logalloc::allocating_section as;
+
+    // We will fill LSA with an intrusive list of small entries.
+    // The list is intrusive to avoid any containers which do std allocations,
+    // since those could make the test imprecise.
+    struct entry {
+        using link = boost::intrusive::list_member_hook<boost::intrusive::link_mode<boost::intrusive::auto_unlink>>;
+        link _link;
+        // We are going to fill the entire memory with this.
+        // Padding makes the entries bigger to speed up the test.
+        std::array<char, 8192> _padding;
+    };
+    using list = boost::intrusive::list<entry,
+        boost::intrusive::member_hook<entry, entry::link, &entry::_link>,
+        boost::intrusive::constant_time_size<false>>;
+
+    as.with_reserve(region, [&] {
+        with_allocator(region.allocator(), [&] {
+            BOOST_REQUIRE(sizeof(entry) + 128 < current_allocator().preferred_max_contiguous_allocation()); // Reported by the test framework; not compiled out by NDEBUG.
+            logalloc::reclaim_lock rl(region);
+
+            // Reserve a segment.
+            auto guard = std::make_optional<hold_reserve>(128*1024);
+
+            // Fill the entire available memory with LSA objects.
+            list entries;
+            auto clean_up = defer([&entries] {
+                entries.clear_and_dispose([] (entry *e) {current_allocator().destroy(e);});
+            });
+            auto alloc_entry = [] () {
+                return current_allocator().construct<entry>();
+            };
+            try {
+                while (true) {
+                    entries.push_back(*alloc_entry());
+                }
+            } catch (const std::bad_alloc&) {
+                // Expected: this is how we detect that memory has been exhausted.
+            }
+
+            // Sanity check. We should be OOM at this point.
+            BOOST_REQUIRE_THROW(hold_reserve(128*1024), std::bad_alloc);
+            BOOST_REQUIRE_THROW(alloc_entry(), std::bad_alloc);
+
+            // Release the reserve.
+            guard.reset();
+
+            // Sanity check. Each temporary reserve is dropped again at the end of its statement.
+            BOOST_REQUIRE_NO_THROW(hold_reserve(128*1024));
+            BOOST_REQUIRE_NO_THROW(hold_reserve(128*1024));
+            BOOST_REQUIRE_NO_THROW(hold_reserve(128*1024));
+
+            // Freeing up a segment should be enough to allocate multiple small entries.
+            for (int i = 0; i < 10; ++i) {
+                entries.push_back(*alloc_entry());
+            }
+        });
+    });
+}
+
 // No point in testing contiguous memory allocation in debug mode
 #ifndef SEASTAR_DEFAULT_ALLOCATOR
 SEASTAR_THREAD_TEST_CASE(test_can_reclaim_contiguous_memory_with_mixed_allocations) {
diff --git a/utils/allocation_strategy.hh b/utils/allocation_strategy.hh
index 399be20572..3d088f6087 100644
--- a/utils/allocation_strategy.hh
+++ b/utils/allocation_strategy.hh
@@ -188,6 +188,24 @@ public:
     void invalidate_references() noexcept {
         ++_invalidate_counter;
     }
+
+    // Asks the allocator to set aside some free memory (of size `memory`),
+    // preventing it from being allocated until the matching unreserve() call.
+    // Can be used to preallocate some memory for a critical section where
+    // allocations can't fail. Returns an opaque value which must be passed to unreserve().
+    //
+    // This is a hack designed with the implementation details of the
+    // log-structured allocator in mind. In other allocators, it doesn't do
+    // anything useful: this default implementation ignores `memory` and returns 0.
+    //
+    // Don't use this unless you understand exactly what you are doing.
+    virtual uintptr_t reserve(size_t memory) {
+        return 0;
+    }
+
+    // Undoes a reserve(). `opaque` must be the *return value* of the matching reserve(). This default implementation does nothing.
+    virtual void unreserve(uintptr_t opaque) noexcept {
+    }
 };
 
 class standard_allocation_strategy : public allocation_strategy {
@@ -257,6 +275,16 @@ struct alloc_strategy_deleter {
     }
 };
 
+// RAII for allocation_strategy::reserve(): reserves `memory` on construction, unreserves it on destruction.
+class hold_reserve {
+    uintptr_t _opaque; // Value returned by reserve(); handed back to unreserve().
+public:
+    explicit hold_reserve(size_t memory) : _opaque(current_allocator().reserve(memory)) {}
+    ~hold_reserve() { current_allocator().unreserve(_opaque); } // NB: uses the allocator which is current at destruction time.
+    // Copying and moving are disallowed (the deleted move constructor also suppresses the implicit copy operations).
+    hold_reserve(hold_reserve&&) = delete;
+};
+
 // std::unique_ptr which can be used for owning an object allocated using allocation_strategy.
 // Must be destroyed before the pointer is invalidated. For compacting allocators, that
 // means it must not escape outside allocating_section or reclaim lock.
diff --git a/utils/logalloc.cc b/utils/logalloc.cc
index eaaf59be15..fb7d70e50f 100644
--- a/utils/logalloc.cc
+++ b/utils/logalloc.cc
@@ -1007,7 +1007,17 @@ class segment_pool {
     utils::dynamic_bitset _lsa_owned_segments_bitmap; // owned by this
     utils::dynamic_bitset _lsa_free_segments_bitmap;  // owned by this, but not in use
     size_t _free_segments = 0;
+
+    // Invariant: _free_segments > _current_emergency_reserve_goal.
+    // Used to ensure that some critical allocations won't fail.
+    // (We grow _current_emergency_reserve_goal in advance and shrink it right
+    // before the critical allocations, which allows them to utilize the pre-reserved
+    // segments).
     size_t _current_emergency_reserve_goal = 1;
+    // Used by allocating_section to request a certain number of free segments
+    // to be prepared for usage when the section is entered.
+    // This is more of a side-channel argument to refill_emergency_reserve() than a real piece of state.
+    // Passing it via a variable makes it easier to debug.
     size_t _emergency_reserve_max = 30;
     bool _allocation_failure_flag = false;
     bool _allocation_enabled = true;
@@ -2347,6 +2357,44 @@ public:
         return _eviction_fn;
     }
 
+    // LSA holds an internal "emergency reserve" of free segments that
+    // is only "opened" for usage before some critical allocations
+    // (in particular: the ones performed during memory compaction)
+    // to ensure that they won't fail.
+    //
+    // Here we hijack this mechanism to let the rest of the application implement
+    // some critical sections with infallible LSA allocations.
+    //
+    // reserve() increments the size of the internal emergency reserve,
+    // unreserve() decrements it.
+    //
+    // When you want to have some critical section that has to do some LSA
+    // allocations infallibly (e.g. to restore some invariants
+    // of an LSA-managed data structure in a destructor), you can call reserve()
+    // beforehand to ensure that some extra memory will be held unused,
+    // and then call unreserve() (with reserve()'s return value as the argument)
+    // to make the reserved free segments available to the critical section.
+    // Returns the number of segments added to the goal. If ensure_free_segments() throws, the goal is left unchanged.
+    uintptr_t reserve(size_t memory) override {
+        // We round up the requested reserve to full segments. Division plus remainder is used
+        // rather than `memory + segment::size - 1`, because that sum wraps around for a huge `memory`.
+        const size_t n_segments = memory / segment::size + (memory % segment::size != 0);
+        auto& pool = segment_pool();
+        const size_t new_goal = pool.current_emergency_reserve_goal() + n_segments;
+        pool.ensure_free_segments(new_goal);
+        pool.set_current_emergency_reserve_goal(new_goal);
+
+        static_assert(sizeof(uintptr_t) >= sizeof(size_t));
+        return n_segments;
+    }
+
+    void unreserve(uintptr_t n_segments) noexcept override { // Undoes a reserve(): lowers the emergency reserve goal.
+        auto& segments = segment_pool(); // n_segments must be the value returned by the matching reserve().
+        const size_t goal_before = segments.current_emergency_reserve_goal();
+        assert(goal_before >= n_segments);
+        segments.set_current_emergency_reserve_goal(goal_before - n_segments);
+    }
+
     friend class region;
     friend class lsa_buffer;
     friend class region_evictable_occupancy_ascending_less_comparator;