mutation_partition_v2::apply_monotonically() needs to perform some allocations
in a destructor, to ensure that the invariants of the data structure are
restored before returning. But it is usually called with reclaiming disabled,
so the allocations might fail even in a perfectly healthy node with plenty of
reclaimable memory.
This patch adds a mechanism which allows reserving some LSA memory (by
asking the allocator to keep it unused) and making it available for allocation
right when we need to guarantee allocation success.
(cherry picked from commit 7b3f55a65f)
348 lines
12 KiB
C++
348 lines
12 KiB
C++
/*
|
|
* Copyright (C) 2015-present ScyllaDB
|
|
*/
|
|
|
|
/*
|
|
* SPDX-License-Identifier: AGPL-3.0-or-later
|
|
*/
|
|
|
|
#pragma once
|
|
|
|
#include <any>
|
|
#include <cstdlib>
|
|
#include <seastar/core/memory.hh>
|
|
#include <seastar/util/alloc_failure_injector.hh>
|
|
#include <malloc.h>
|
|
|
|
// A function used by compacting collectors to migrate objects during
// compaction. The function should reconstruct the object located at src
// in the location pointed by dst. The object at old location should be
// destroyed. See standard_migrator below for an example. Both src and dst
// are aligned as requested during alloc()/construct().
//
// Every instance obtains an index from register_migrator() on construction
// and hands it back to unregister_migrator() on destruction. Instances are
// therefore not copyable: a copy would carry the same _index and pass it to
// unregister_migrator() a second time.
class migrate_fn_type {
    // Migrators may be registered by thread-local objects. The table of all
    // registered migrators is also thread-local which may cause problems with
    // the order of object destruction and lead to use-after-free.
    // This can be worked around by making migrators keep a shared pointer
    // to the table of migrators. std::any is used so that its type doesn't
    // have to be made public.
    std::any _migrators;
    // Alignment requirement of the migrated objects, as passed to the constructor.
    uint32_t _align = 0;
    // Value returned by register_migrator() for this instance.
    uint32_t _index;
private:
    static uint32_t register_migrator(migrate_fn_type* m);
    static void unregister_migrator(uint32_t index);
protected:
    // Registers the new migrator; align is the alignment reported by align().
    explicit migrate_fn_type(size_t align) : _align(align), _index(register_migrator(this)) {}
public:
    // Each instance owns exactly one registration, so copying is disabled.
    migrate_fn_type(const migrate_fn_type&) = delete;
    migrate_fn_type& operator=(const migrate_fn_type&) = delete;

    virtual ~migrate_fn_type() { unregister_migrator(_index); }

    // Reconstructs the object located at src in the storage at dst and
    // destroys the object at src. Must not throw.
    virtual void migrate(void* src, void* dst, size_t size) const noexcept = 0;

    // Returns the size of the object pointed to by obj.
    virtual size_t size(const void* obj) const = 0;

    // Alignment this migrator was constructed with.
    size_t align() const { return _align; }

    // Index assigned to this migrator by register_migrator().
    uint32_t index() const { return _index; }

    // Returns a name identifying this migrator.
    virtual std::string name() const = 0;
};
|
|
|
|
// Non-constant-size classes (ending with `char data[0]`) must provide
|
|
// the method telling the underlying storage size
|
|
template <typename T>
|
|
concept DynamicObject = requires (const T& obj) {
|
|
{ obj.storage_size() } noexcept -> std::same_as<size_t>;
|
|
};
|
|
|
|
template <typename T>
|
|
inline
|
|
size_t
|
|
size_for_allocation_strategy(const T& obj) noexcept {
|
|
if constexpr (DynamicObject<T>) {
|
|
return obj.storage_size();
|
|
} else {
|
|
return sizeof(T);
|
|
}
|
|
}
|
|
|
|
template <typename T>
|
|
requires std::is_nothrow_move_constructible_v<T> && std::is_nothrow_destructible_v<T>
|
|
class standard_migrator final : public migrate_fn_type {
|
|
friend class allocation_strategy;
|
|
standard_migrator() : migrate_fn_type(alignof(T)) {}
|
|
|
|
public:
|
|
virtual void migrate(void* src, void* dst, size_t size) const noexcept override {
|
|
T* src_t = static_cast<T*>(src);
|
|
new (static_cast<T*>(dst)) T(std::move(*src_t));
|
|
src_t->~T();
|
|
}
|
|
virtual size_t size(const void* obj) const override {
|
|
return size_for_allocation_strategy(*static_cast<const T*>(obj));
|
|
}
|
|
|
|
virtual std::string name() const override {
|
|
return typeid(T).name();
|
|
}
|
|
};
|
|
|
|
//
|
|
// Abstracts allocation strategy for managed objects.
|
|
//
|
|
// Managed objects may be moved by the allocator during compaction, which
|
|
// invalidates any references to those objects. Compaction may be started
|
|
// synchronously with allocations. To ensure that references remain valid, use
|
|
// logalloc::compaction_lock.
|
|
//
|
|
// Because references may get invalidated, managing allocators can't be used
|
|
// with standard containers, because they assume the reference is valid until freed.
|
|
//
|
|
// For example containers compatible with compacting allocators see:
|
|
// - managed_ref - managed version of std::unique_ptr<>
|
|
// - managed_bytes - managed version of "bytes"
|
|
//
|
|
// Note: When object is used as an element inside intrusive containers,
|
|
// typically no extra measures need to be taken for reference tracking, if the
|
|
// link member is movable. When object is moved, the member hook will be moved
|
|
// too and it should take care of updating any back-references. The user must
|
|
// be aware though that any iterators into such container may be invalidated
|
|
// across deferring points.
|
|
//
|
|
class allocation_strategy {
|
|
template <typename T>
|
|
standard_migrator<T>& get_standard_migrator()
|
|
{
|
|
seastar::memory::scoped_critical_alloc_section dfg;
|
|
static thread_local standard_migrator<T> instance;
|
|
return instance;
|
|
}
|
|
|
|
protected:
|
|
size_t _preferred_max_contiguous_allocation = std::numeric_limits<size_t>::max();
|
|
uint64_t _invalidate_counter = 1;
|
|
public:
|
|
using migrate_fn = const migrate_fn_type*;
|
|
|
|
virtual ~allocation_strategy() {}
|
|
|
|
virtual void* alloc(migrate_fn, size_t size, size_t alignment) = 0;
|
|
//
|
|
// Allocates space for a new ManagedObject. The caller must construct the
|
|
// object before compaction runs. "size" is the amount of space to reserve
|
|
// in bytes. It can be larger than MangedObjects's size.
|
|
//
|
|
// Throws std::bad_alloc on allocation failure.
|
|
//
|
|
// Doesn't invalidate references to objects allocated with this strategy.
|
|
//
|
|
template <typename T>
|
|
requires DynamicObject<T>
|
|
void* alloc(size_t size) {
|
|
return alloc(&get_standard_migrator<T>(), size, alignof(T));
|
|
}
|
|
|
|
// Releases storage for the object. Doesn't invoke object's destructor.
|
|
// Doesn't invalidate references to objects allocated with this strategy.
|
|
virtual void free(void* object, size_t size) = 0;
|
|
virtual void free(void* object) = 0;
|
|
|
|
// Returns the total immutable memory size used by the allocator to host
|
|
// this object. This will be at least the size of the object itself, plus
|
|
// any immutable overhead needed to represent the object (if any).
|
|
//
|
|
// The immutable overhead is the overhead that cannot change over the
|
|
// lifetime of the object (such as padding, etc).
|
|
virtual size_t object_memory_size_in_allocator(const void* obj) const noexcept = 0;
|
|
|
|
// Like alloc() but also constructs the object with a migrator using
|
|
// standard move semantics. Allocates respecting object's alignment
|
|
// requirement.
|
|
template<typename T, typename... Args>
|
|
T* construct(Args&&... args) {
|
|
void* storage = alloc(&get_standard_migrator<T>(), sizeof(T), alignof(T));
|
|
try {
|
|
return new (storage) T(std::forward<Args>(args)...);
|
|
} catch (...) {
|
|
free(storage, sizeof(T));
|
|
throw;
|
|
}
|
|
}
|
|
|
|
// Destroys T and releases its storage.
|
|
// Doesn't invalidate references to allocated objects.
|
|
template<typename T>
|
|
void destroy(T* obj) {
|
|
size_t size = size_for_allocation_strategy(*obj);
|
|
obj->~T();
|
|
free(obj, size);
|
|
}
|
|
|
|
size_t preferred_max_contiguous_allocation() const noexcept {
|
|
return _preferred_max_contiguous_allocation;
|
|
}
|
|
|
|
// Returns a number which is increased when references to objects managed by this allocator
|
|
// are invalidated, e.g. due to internal events like compaction or eviction.
|
|
// When the value returned by this method doesn't change, references obtained
|
|
// between invocations remain valid.
|
|
uint64_t invalidate_counter() const noexcept {
|
|
return _invalidate_counter;
|
|
}
|
|
|
|
void invalidate_references() noexcept {
|
|
++_invalidate_counter;
|
|
}
|
|
|
|
// Asks the allocator to set aside some free memory,
|
|
// preventing it from being allocated until the matching
|
|
// unreserve() call. Can be used to preallocate some memory
|
|
// for a critical section where allocations can't fail.
|
|
//
|
|
// This is hack designed with the implementation details of the
|
|
// log-structured allocator in mind. In other allocators,
|
|
// it doesn't do anything useful.
|
|
//
|
|
// Don't use this unless you understand exactly what you are doing.
|
|
virtual uintptr_t reserve(size_t memory) {
|
|
return 0;
|
|
}
|
|
|
|
// As the argument to this function, you must pass the *return value* of the matching reserve().
|
|
virtual void unreserve(uintptr_t opaque) noexcept {
|
|
}
|
|
};
|
|
|
|
class standard_allocation_strategy : public allocation_strategy {
|
|
public:
|
|
constexpr standard_allocation_strategy() {
|
|
_preferred_max_contiguous_allocation = 128 * 1024;
|
|
}
|
|
virtual void* alloc(migrate_fn, size_t size, size_t alignment) override {
|
|
seastar::memory::on_alloc_point();
|
|
// ASAN doesn't intercept aligned_alloc() and complains on free().
|
|
void* ret;
|
|
// The system posix_memalign will return EINVAL if alignment is not
|
|
// a multiple of pointer size.
|
|
if (alignment < sizeof(void*)) {
|
|
alignment = sizeof(void*);
|
|
}
|
|
if (posix_memalign(&ret, alignment, size) != 0) {
|
|
throw std::bad_alloc();
|
|
}
|
|
return ret;
|
|
}
|
|
|
|
virtual void free(void* obj, size_t size) override {
|
|
::free(obj);
|
|
}
|
|
|
|
virtual void free(void* obj) override {
|
|
::free(obj);
|
|
}
|
|
|
|
virtual size_t object_memory_size_in_allocator(const void* obj) const noexcept override {
|
|
return ::malloc_usable_size(const_cast<void *>(obj));
|
|
}
|
|
};
|
|
|
|
extern standard_allocation_strategy standard_allocation_strategy_instance;
|
|
|
|
inline
|
|
standard_allocation_strategy& standard_allocator() {
|
|
return standard_allocation_strategy_instance;
|
|
}
|
|
|
|
inline
|
|
allocation_strategy*& current_allocation_strategy_ptr() {
|
|
static thread_local allocation_strategy* current = &standard_allocation_strategy_instance;
|
|
return current;
|
|
}
|
|
|
|
inline
|
|
allocation_strategy& current_allocator() {
|
|
return *current_allocation_strategy_ptr();
|
|
}
|
|
|
|
template<typename T>
|
|
inline
|
|
auto current_deleter() {
|
|
auto& alloc = current_allocator();
|
|
return [&alloc] (T* obj) noexcept {
|
|
alloc.destroy(obj);
|
|
};
|
|
}
|
|
|
|
template<typename T>
|
|
struct alloc_strategy_deleter {
|
|
void operator()(T* ptr) const noexcept {
|
|
current_allocator().destroy(ptr);
|
|
}
|
|
};
|
|
|
|
// RAII for allocation_strategy::reserve().
|
|
class hold_reserve {
|
|
uintptr_t _opaque;
|
|
public:
|
|
hold_reserve(size_t memory) : _opaque(current_allocator().reserve(memory)) {}
|
|
~hold_reserve() { current_allocator().unreserve(_opaque); }
|
|
// Disallow copying and moving. They *could* be implemented, but I just didn't bother.
|
|
hold_reserve(hold_reserve&&) = delete;
|
|
};
|
|
|
|
// std::unique_ptr which can be used for owning an object allocated using allocation_strategy.
|
|
// Must be destroyed before the pointer is invalidated. For compacting allocators, that
|
|
// means it must not escape outside allocating_section or reclaim lock.
|
|
// Must be destroyed in the same allocating context in which T was allocated.
|
|
template<typename T>
|
|
using alloc_strategy_unique_ptr = std::unique_ptr<T, alloc_strategy_deleter<T>>;
|
|
|
|
//
|
|
// Passing allocators to objects.
|
|
//
|
|
// The same object type can be allocated using different allocators, for
|
|
// example standard allocator (for temporary data), or log-structured
|
|
// allocator for long-lived data. In case of LSA, objects may be allocated
|
|
// inside different LSA regions. Objects should be freed only from the region
|
|
// which owns it.
|
|
//
|
|
// There's a problem of how to ensure correct usage of allocators. Storing the
|
|
// reference to the allocator used for construction of some object inside that
|
|
// object is a possible solution. This has a disadvantage of extra space
|
|
// overhead per-object though. We could avoid that if the code which decides
|
|
// about which allocator to use is also the code which controls object's life
|
|
// time. That seems to be the case in current uses, so a simplified scheme of
|
|
// passing allocators will do. Allocation strategy is set in a thread-local
|
|
// context, as shown below. From there, aware objects pick up the allocation
|
|
// strategy. The code controlling the objects must ensure that object allocated
|
|
// in one regime is also freed in the same regime.
|
|
//
|
|
// with_allocator() provides a way to set the current allocation strategy used
|
|
// within given block of code. with_allocator() can be nested, which will
|
|
// temporarily shadow enclosing strategy. Use current_allocator() to obtain
|
|
// currently active allocation strategy. Use current_deleter() to obtain a
|
|
// Deleter object using current allocation strategy to destroy objects.
|
|
//
|
|
// Example:
|
|
//
|
|
// logalloc::region r;
|
|
// with_allocator(r.allocator(), [] {
|
|
// auto obj = make_managed<int>();
|
|
// });
|
|
//
|
|
|
|
class allocator_lock {
|
|
allocation_strategy* _prev;
|
|
public:
|
|
allocator_lock(allocation_strategy& alloc) {
|
|
_prev = current_allocation_strategy_ptr();
|
|
current_allocation_strategy_ptr() = &alloc;
|
|
}
|
|
|
|
~allocator_lock() {
|
|
current_allocation_strategy_ptr() = _prev;
|
|
}
|
|
};
|
|
|
|
template<typename Func>
|
|
inline
|
|
decltype(auto) with_allocator(allocation_strategy& alloc, Func&& func) {
|
|
allocator_lock l(alloc);
|
|
return func();
|
|
}
|