From 7b3f55a65fbee2ee4a7df511cd08c977cefdcf28 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Chojnowski?= Date: Thu, 4 Jul 2024 12:10:05 +0200 Subject: [PATCH] logalloc: add hold_reserve mutation_partition_v2::apply_monotonically() needs to perform some allocations in a destructor, to ensure that the invariants of the data structure are restored before returning. But it is usually called with reclaiming disabled, so the allocations might fail even in a perfectly healthy node with plenty of reclaimable memory. This patch adds a mechanism which allows to reserve some LSA memory (by asking the allocator to keep it unused) and make it available for allocation right when we need to guarantee allocation success. --- test/boost/logalloc_test.cc | 67 ++++++++++++++++++++++++++++++++++++ utils/allocation_strategy.hh | 28 +++++++++++++++ utils/logalloc.cc | 48 ++++++++++++++++++++++++++ 3 files changed, 143 insertions(+) diff --git a/test/boost/logalloc_test.cc b/test/boost/logalloc_test.cc index 16a7cf6161..fd40313ae9 100644 --- a/test/boost/logalloc_test.cc +++ b/test/boost/logalloc_test.cc @@ -519,6 +519,73 @@ SEASTAR_TEST_CASE(test_zone_reclaiming_preserves_free_size) { }); } +// Tests the intended usage of hold_reserve. +// +// Sets up a reserve, exhausts memory, opens the reserve, +// checks that this allows us to do multiple additional allocations +// without failing. +SEASTAR_THREAD_TEST_CASE(test_hold_reserve) { + logalloc::region region; + logalloc::allocating_section as; + + // We will fill LSA with an intrusive list of small entries. + // We make it intrusive to avoid any containers which do std allocations, + // since those allocations could make the test imprecise. + struct entry { + using link = boost::intrusive::list_member_hook>; + link _link; + // We are going to fill the entire memory with this. + // Padding makes the entries bigger to speed up the test. 
+ std::array _padding; + }; + using list = boost::intrusive::list, + boost::intrusive::constant_time_size>; + + as.with_reserve(region, [&] { + with_allocator(region.allocator(), [&] { + assert(sizeof(entry) + 128 < current_allocator().preferred_max_contiguous_allocation()); + logalloc::reclaim_lock rl(region); + + // Reserve a segment. + auto guard = std::make_optional(128*1024); + + // Fill the entire available memory with LSA objects. + list entries; + auto clean_up = defer([&entries] { + entries.clear_and_dispose([] (entry *e) {current_allocator().destroy(e);}); + }); + auto alloc_entry = [] () { + return current_allocator().construct(); + }; + try { + while (true) { + entries.push_back(*alloc_entry()); + } + } catch (const std::bad_alloc&) { + // Expected: this is how the fill loop above terminates. + } + + // Sanity check. We should be OOM at this point. + BOOST_REQUIRE_THROW(hold_reserve(128*1024), std::bad_alloc); + BOOST_REQUIRE_THROW(alloc_entry(), std::bad_alloc); + + // Release the reserve. + guard.reset(); + + // Sanity check. Taking a new (temporary) reserve should succeed now. + BOOST_REQUIRE_NO_THROW(hold_reserve(128*1024)); + BOOST_REQUIRE_NO_THROW(hold_reserve(128*1024)); + BOOST_REQUIRE_NO_THROW(hold_reserve(128*1024)); + + // Freeing up a segment should be enough to allocate multiple small entries. + for (int i = 0; i < 10; ++i) { + entries.push_back(*alloc_entry()); + } + }); + }); +} + // No point in testing contiguous memory allocation in debug mode #ifndef SEASTAR_DEFAULT_ALLOCATOR SEASTAR_THREAD_TEST_CASE(test_can_reclaim_contiguous_memory_with_mixed_allocations) { diff --git a/utils/allocation_strategy.hh b/utils/allocation_strategy.hh index 399be20572..3d088f6087 100644 --- a/utils/allocation_strategy.hh +++ b/utils/allocation_strategy.hh @@ -188,6 +188,24 @@ public: void invalidate_references() noexcept { ++_invalidate_counter; } + + // Asks the allocator to set aside some free memory, + // preventing it from being allocated until the matching + // unreserve() call. 
Can be used to preallocate some memory + // for a critical section where allocations can't fail. + // + // This is a hack designed with the implementation details of the + // log-structured allocator in mind. In other allocators, + // it doesn't do anything useful (this default implementation just returns 0). + // + // Don't use this unless you understand exactly what you are doing. + virtual uintptr_t reserve(size_t memory) { + return 0; + } + + // As the argument to this function, you must pass the *return value* of the matching reserve(). + virtual void unreserve(uintptr_t opaque) noexcept { + } }; class standard_allocation_strategy : public allocation_strategy { @@ -257,6 +275,16 @@ struct alloc_strategy_deleter { } }; +// RAII for allocation_strategy::reserve(). The destructor looks up current_allocator() again, so the guard should be destroyed under the same allocator it was created with. +class hold_reserve { + uintptr_t _opaque; +public: + hold_reserve(size_t memory) : _opaque(current_allocator().reserve(memory)) {} + ~hold_reserve() { current_allocator().unreserve(_opaque); } + // Disallow copying and moving. They *could* be implemented, but I just didn't bother. + hold_reserve(hold_reserve&&) = delete; +}; + // std::unique_ptr which can be used for owning an object allocated using allocation_strategy. // Must be destroyed before the pointer is invalidated. For compacting allocators, that // means it must not escape outside allocating_section or reclaim lock. diff --git a/utils/logalloc.cc b/utils/logalloc.cc index eaaf59be15..fb7d70e50f 100644 --- a/utils/logalloc.cc +++ b/utils/logalloc.cc @@ -1007,7 +1007,17 @@ class segment_pool { utils::dynamic_bitset _lsa_owned_segments_bitmap; // owned by this utils::dynamic_bitset _lsa_free_segments_bitmap; // owned by this, but not in use size_t _free_segments = 0; + + // Invariant: _free_segments > _current_emergency_reserve_goal. + // Used to ensure that some critical allocations won't fail. + // (We grow _current_emergency_reserve_goal in advance and shrink it right + // before the critical allocations, which allows them to utilize the pre-reserved + // segments). 
size_t _current_emergency_reserve_goal = 1; + // Used by allocating_section to request a certain number of free segments + // to be prepared for usage when the section is entered. + // This is more of a side-channel argument to refill_emergency_reserve() than a real piece of state. + // Passing it via a variable makes it easier to debug. size_t _emergency_reserve_max = 30; bool _allocation_failure_flag = false; bool _allocation_enabled = true; @@ -2347,6 +2357,44 @@ public: return _eviction_fn; } + // LSA holds an internal "emergency reserve" of free segments that + // is only "opened" for usage before some critical allocations + // (in particular: the ones performed during memory compaction) + // to ensure that they won't fail. + // + // Here we hijack this mechanism to let the rest of the application implement + // some critical sections with infallible LSA allocations. + // + // reserve() increments the size of the internal emergency reserve, + // unreserve() decrements it. + // + // When you want to have some critical section that has to do some LSA + // allocations infallibly (e.g. to restore some invariants + // of an LSA-managed data structure in a destructor), you can call reserve() + // beforehand to ensure that some extra memory will be held unused, + // and then call unreserve() (with reserve()'s return value as the argument) + // to make the reserved free segments available to the critical section. + // + uintptr_t reserve(size_t memory) override { + // We round up the requested reserve to full segments. The returned value is that segment count, which unreserve() subtracts from the goal. 
+ size_t n_segments = (memory + segment::size - 1) >> segment::size_shift; + + auto& pool = segment_pool(); + size_t new_goal = pool.current_emergency_reserve_goal() + n_segments; + pool.ensure_free_segments(new_goal); + pool.set_current_emergency_reserve_goal(new_goal); + + static_assert(sizeof(uintptr_t) >= sizeof(size_t)); + return n_segments; + } + + void unreserve(uintptr_t n_segments) noexcept override { + auto& pool = segment_pool(); + assert(pool.current_emergency_reserve_goal() >= n_segments); + size_t new_goal = pool.current_emergency_reserve_goal() - n_segments; + pool.set_current_emergency_reserve_goal(new_goal); + } + friend class region; friend class lsa_buffer; friend class region_evictable_occupancy_ascending_less_comparator;