Files
scylladb/utils/logalloc.hh
Tomasz Grabiec 082342ecad Attach names to allocating sections for better debuggability
Large reserves in allocating_section can cause stalls. We already log
reserve increase, but we don't know which table it belongs to:

  lsa - LSA allocation failure, increasing reserve in section 0x600009f94590 to 128 segments;

Allocating sections used for updating row cache on memtable flush are
notoriously problematic. Each table has its own row_cache, so its own
allocating_section(s). If we attached table name to those sections, we
could identify which table is causing problems. In some issues we
suspected system.raft, but we can't be sure.

This patch allows naming allocating_sections for the purpose of
identifying them in such log messages. I use abstract_formatter for
this purpose to avoid the cost of formatting strings on the hot path
(e.g. index_reader). And also to avoid duplicating strings which are
already stored elsewhere.

Fixes #25799

Closes scylladb/scylladb#27470
2025-12-07 14:14:25 +02:00

555 lines
19 KiB
C++

/*
* Copyright (C) 2015-present ScyllaDB
*/
/*
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
*/
#pragma once
#include <memory>
#include <seastar/core/memory.hh>
#include <seastar/core/shard_id.hh>
#include <seastar/core/shared_ptr.hh>
#include "allocation_strategy.hh"
#include "seastarx.hh"
#include "utils/assert.hh"
#include "utils/entangled.hh"
#include "utils/memory_limit_reached.hh"
#include "utils/abstract_formatter.hh"
namespace logalloc {
// Forward declarations of types defined later in this header or in the
// implementation file.
class occupancy_stats; // defined below with the `class` key; keep the keys consistent
class region;
class region_impl;
class allocating_section;

// log2 of the LSA segment size.
constexpr int segment_size_shift = 17; // 128K; see #151, #152
// Size of a single LSA segment, in bytes. The shift is performed on size_t so
// that the expression stays well-defined if the shift is ever raised past the
// width of int.
constexpr size_t segment_size = size_t(1) << segment_size_shift;
// NOTE(review): presumably the maximum number of segments in a zone; the
// consumer of this constant is not visible in this header.
constexpr size_t max_zone_segments = 256;
// Largest object size handled as a regular managed object: 10% of a segment.
// Computed with integer arithmetic (same value as the former `* 0.1`) to
// avoid an implicit floating-point -> size_t conversion.
constexpr size_t max_managed_object_size = segment_size / 10;
// NOTE(review): presumably a free-memory threshold in bytes used by the
// background reclaimer; confirm against the implementation file.
constexpr size_t background_reclaim_free_memory_threshold = 60'000'000;
//
// Callback type used to evict data from a region.
//
// Frees some amount of objects from the region to which it's attached and
// reports the outcome as a memory::reclaiming_result.
//
// This should eventually stop given no new objects are added:
//
// while (eviction_fn() == memory::reclaiming_result::reclaimed_something) ;
//
using eviction_fn = std::function<memory::reclaiming_result()>;
// Listens for events from a region
//
// Abstract interface: every callback below is pure virtual and must be
// provided by the implementer. A listener is attached to a region with
// region::listen() and detached with region::unlisten().
class region_listener {
public:
// Virtual so that listeners can be destroyed through a base pointer.
// Defined out of line.
virtual ~region_listener();
// NOTE(review): presumably called when region `r` starts being observed by
// this listener; the call sites are not visible in this header.
virtual void add(region* r) = 0;
// NOTE(review): presumably the counterpart of add(); confirm against callers.
virtual void del(region* r) = 0;
// Notifies that a region object changed its address (regions are movable);
// `old_address` and `new_address` identify the same logical region.
virtual void moved(region* old_address, region* new_address) = 0;
// NOTE(review): `delta` looks like a byte count by which the usage of `r`
// grew; units and sign convention to be confirmed.
virtual void increase_usage(region* r, ssize_t delta) = 0;
// NOTE(review): presumably related to region::ground_evictable_occupancy();
// confirm against the implementation.
virtual void decrease_evictable_usage(region* r) = 0;
// NOTE(review): counterpart of increase_usage(); units and sign convention of
// `delta` to be confirmed.
virtual void decrease_usage(region* r, ssize_t delta) = 0;
};
// Per-shard controller of all LSA regions (one instance per shard).
class tracker {
public:
    class impl;

    // Settings applied with configure().
    struct config {
        bool defragment_on_idle;
        bool abort_on_lsa_bad_alloc;
        bool sanitizer_report_backtrace = false; // Better reports but slower
        size_t lsa_reclamation_step;
        scheduling_group background_reclaim_sched_group;
    };

    // Counters returned by statistics(). The arithmetic operators work
    // member-wise, so two snapshots can be added or subtracted.
    struct stats {
        size_t segments_compacted;
        size_t lsa_buffer_segments;
        uint64_t memory_allocated;
        uint64_t memory_freed;
        uint64_t memory_compacted;
        uint64_t memory_evicted;
        uint64_t num_allocations;

        // Member-wise sum of two snapshots.
        friend stats operator+(const stats& s1, const stats& s2) {
            stats sum = s1;
            sum += s2;
            return sum;
        }

        // Member-wise difference of two snapshots.
        friend stats operator-(const stats& s1, const stats& s2) {
            stats diff = s1;
            diff -= s2;
            return diff;
        }

        // Adds every counter of `rhs` to the matching counter of *this.
        stats& operator+=(const stats& rhs) {
            segments_compacted += rhs.segments_compacted;
            lsa_buffer_segments += rhs.lsa_buffer_segments;
            memory_allocated += rhs.memory_allocated;
            memory_freed += rhs.memory_freed;
            memory_compacted += rhs.memory_compacted;
            memory_evicted += rhs.memory_evicted;
            num_allocations += rhs.num_allocations;
            return *this;
        }

        // Subtracts every counter of `rhs` from the matching counter of *this.
        stats& operator-=(const stats& rhs) {
            segments_compacted -= rhs.segments_compacted;
            lsa_buffer_segments -= rhs.lsa_buffer_segments;
            memory_allocated -= rhs.memory_allocated;
            memory_freed -= rhs.memory_freed;
            memory_compacted -= rhs.memory_compacted;
            memory_evicted -= rhs.memory_evicted;
            num_allocations -= rhs.num_allocations;
            return *this;
        }
    };

    void configure(const config& cfg);
    future<> stop();

private:
    std::unique_ptr<impl> _impl;
    memory::reclaimer _reclaimer;

    friend class region;
    friend class region_impl;

    // Reclaim entry point taking a seastar reclaimer request.
    memory::reclaiming_result reclaim(seastar::memory::reclaimer::request);

public:
    tracker();
    ~tracker();

    stats statistics() const;

    //
    // Attempts to reclaim the requested number of bytes in total, drawing on
    // all compactible and evictable regions. The return value is the number of
    // bytes actually reclaimed; it may be lower than requested when evictable
    // pools are empty and compactible pools can't compact any more.
    //
    // References to objects in all compactible and evictable regions are
    // invalidated.
    //
    size_t reclaim(size_t bytes);

    // Compacts as much as possible. Very expensive, mainly for testing.
    // Every live object from reclaimable regions is guaranteed to be moved.
    // References to objects in all compactible and evictable regions are
    // invalidated.
    void full_compaction();

    void reclaim_all_free_segments();

    occupancy_stats global_occupancy() const noexcept;

    // Aggregate statistics for all pools.
    occupancy_stats region_occupancy() const noexcept;

    // Statistics for all segments allocated by LSA on this shard.
    occupancy_stats occupancy() const noexcept;

    // Amount of allocated memory not managed by LSA.
    size_t non_lsa_used_space() const noexcept;

    // Access to the implementation object owned by this tracker.
    impl& get_impl() noexcept {
        return *_impl;
    }

    // Minimum number of segments reclaimed during a single reclamation cycle.
    size_t reclamation_step() const noexcept;

    bool should_abort_on_bad_alloc() const noexcept;
};
// Scoped object bound to a tracker's implementation for its whole lifetime.
// NOTE(review): the name suggests it keeps the tracker's reclaimer locked
// between construction and destruction; the constructor taking tracker::impl&
// and the destructor are defined out of line, so confirm there.
class tracker_reclaimer_lock {
tracker::impl& _tracker_impl;
public:
tracker_reclaimer_lock(tracker::impl& impl) noexcept;
// Convenience overload: delegates to the tracker::impl& constructor using
// t.get_impl().
tracker_reclaimer_lock(tracker& t) noexcept : tracker_reclaimer_lock(t.get_impl()) { }
~tracker_reclaimer_lock();
};
// Returns a reference to a tracker instance.
// NOTE(review): presumably the tracker of the calling shard (there is one
// tracker per shard); the definition is not visible in this header.
tracker& shard_tracker() noexcept;
// Forward declaration; lsa_buffer keeps a pointer to its segment_descriptor.
class segment_descriptor;
/// A unique pointer to a chunk of memory allocated inside an LSA region.
///
/// The pointer can be in disengaged state in which case it doesn't point at any buffer (nullptr state).
/// When the pointer points at some buffer, it is said to be engaged.
///
/// The pointer owns the object.
/// When the pointer is destroyed or it transitions from engaged to disengaged state, the buffer is freed.
/// The buffer is never leaked when operating by the API of lsa_buffer.
/// The pointer object can be safely destroyed in any allocator context.
///
/// The pointer object is never invalidated.
/// The pointed-to buffer can be moved around by LSA, so the pointer returned by get() can be
/// invalidated, but the pointer object itself is updated automatically and get() always returns
/// a pointer which is valid at the time of the call.
///
/// Must not outlive the region.
class lsa_buffer {
    friend class region_impl;
    entangled _link;                     // Paired with segment_descriptor::_buf_pointers[...]
    segment_descriptor* _desc = nullptr; // Valid only when engaged
    char* _buf = nullptr;                // Valid only when engaged
    size_t _size = 0;
public:
    using char_type = char;

    /// Creates a disengaged pointer.
    lsa_buffer() = default;

    /// Takes over the state of another pointer (member-wise move).
    lsa_buffer(lsa_buffer&&) noexcept = default;

    /// Defined out of line.
    ~lsa_buffer();

    /// Makes this instance point to the buffer pointed to by the other pointer.
    /// If this pointer was engaged before, the owned buffer is freed.
    /// The other pointer will be in disengaged state after this.
    lsa_buffer& operator=(lsa_buffer&& other) noexcept {
        if (this != &other) {
            // Destroy-and-reconstruct: release whatever we own, then
            // move-construct a fresh object in the same storage.
            this->~lsa_buffer();
            new (this) lsa_buffer(std::move(other));
        }
        return *this;
    }

    /// Disengages the pointer.
    /// If the pointer was engaged before, the owned buffer is freed.
    /// Postcondition: !bool(*this)
    lsa_buffer& operator=(std::nullptr_t) noexcept {
        this->~lsa_buffer();
        // The explicit destructor call ends the lifetime of *this. A new,
        // default-constructed (disengaged) object must be created in the same
        // storage; otherwise any later use of *this, including its eventual
        // destruction, would operate on a dead object, and _buf/_size would
        // keep stale values.
        new (this) lsa_buffer();
        return *this;
    }

    /// Returns a pointer to the first element of the buffer.
    /// Valid only when engaged.
    char_type* get() noexcept { return _buf; }
    const char_type* get() const noexcept { return _buf; }

    /// Returns the number of bytes in the buffer.
    size_t size() const noexcept { return _size; }

    /// Returns true iff the pointer is engaged.
    explicit operator bool() const noexcept { return bool(_link); }
};
// Pool occupancy statistics, forming a monoid under operator+.
// The natural order puts sparser pools (lower used fraction) first.
// All sizes are expressed in bytes.
class occupancy_stats {
    size_t _free_space;
    size_t _total_space;
public:
    // Empty occupancy: no free space and no total space.
    occupancy_stats() noexcept
        : occupancy_stats(0, 0)
    { }

    occupancy_stats(size_t free_space, size_t total_space) noexcept
        : _free_space(free_space)
        , _total_space(total_space)
    { }

    // Orders by used fraction, so that sparser pools compare as smaller.
    bool operator<(const occupancy_stats& other) const noexcept {
        return other.used_fraction() > used_fraction();
    }

    // Component-wise sum.
    friend occupancy_stats operator+(const occupancy_stats& s1, const occupancy_stats& s2) noexcept {
        return occupancy_stats(s1._free_space + s2._free_space,
                               s1._total_space + s2._total_space);
    }

    // Component-wise difference.
    friend occupancy_stats operator-(const occupancy_stats& s1, const occupancy_stats& s2) noexcept {
        return occupancy_stats(s1._free_space - s2._free_space,
                               s1._total_space - s2._total_space);
    }

    occupancy_stats& operator+=(const occupancy_stats& other) noexcept {
        _free_space += other._free_space;
        _total_space += other._total_space;
        return *this;
    }

    occupancy_stats& operator-=(const occupancy_stats& other) noexcept {
        _free_space -= other._free_space;
        _total_space -= other._total_space;
        return *this;
    }

    // Bytes in use: total space minus free space.
    size_t used_space() const noexcept {
        return _total_space - _free_space;
    }

    size_t free_space() const noexcept {
        return _free_space;
    }

    size_t total_space() const noexcept {
        return _total_space;
    }

    // Ratio of used to total space; 0 when there is no space at all.
    float used_fraction() const noexcept {
        if (_total_space == 0) {
            return 0;
        }
        return float(used_space()) / total_space();
    }

    // True iff the total space is non-zero.
    explicit operator bool() const noexcept {
        return _total_space != 0;
    }
};
// Base of region implementations: an allocation_strategy bound to a tracker,
// carrying the "reclaiming enabled" flag and the shard it was created on.
class basic_region_impl : public allocation_strategy {
protected:
    tracker& _tracker;
    bool _reclaiming_enabled = true;
    // Shard on which this object was constructed.
    seastar::shard_id _cpu = this_shard_id();
public:
    basic_region_impl(tracker& tracker)
        : _tracker(tracker)
    { }

    tracker& get_tracker() {
        return _tracker;
    }

    // Switches reclaiming on or off. Must be called on the shard which
    // created this object; this is asserted.
    void set_reclaiming_enabled(bool enabled) noexcept {
        SCYLLA_ASSERT(this_shard_id() == _cpu);
        _reclaiming_enabled = enabled;
    }

    // Current value of the reclaiming flag (true after construction).
    bool reclaiming_enabled() const noexcept {
        return _reclaiming_enabled;
    }
};
//
// Log-structured allocator region.
//
// Objects allocated using this region are said to be owned by this region.
// Objects must be freed only using the region which owns them. Ownership can
// be transferred across regions using the merge() method. Region must be live
// as long as it owns any objects.
//
// Each region has separate memory accounting and can be compacted
// independently from other regions. To reclaim memory from all regions use
// shard_tracker().
//
// Region is automatically added to the set of
// compactible regions when constructed.
//
class region {
public:
using impl = region_impl;
private:
shared_ptr<basic_region_impl> _impl;
private:
region_impl& get_impl() noexcept;
const region_impl& get_impl() const noexcept;
public:
region();
~region();
region(region&& other) noexcept;
region& operator=(region&& other) noexcept;
region(const region& other) = delete;
void listen(region_listener* listener);
void unlisten();
occupancy_stats occupancy() const noexcept;
tracker& get_tracker() const {
return _impl->get_tracker();
}
allocation_strategy& allocator() noexcept {
return *_impl;
}
const allocation_strategy& allocator() const noexcept {
return *_impl;
}
// Allocates a buffer of a given size.
// The buffer's pointer will be aligned to 4KB.
// Note: it is wasteful to allocate buffers of sizes which are not a multiple of the alignment.
lsa_buffer alloc_buf(size_t buffer_size);
// Merges another region into this region. The other region is left empty.
// Doesn't invalidate references to allocated objects.
void merge(region& other) noexcept;
// Compacts everything. Mainly for testing.
// Invalidates references to allocated objects.
void full_compaction();
// Runs eviction function once. Mainly for testing.
memory::reclaiming_result evict_some();
// Changes the reclaimability state of this region. When region is not
// reclaimable, it won't be considered by tracker::reclaim(). By default region is
// reclaimable after construction.
void set_reclaiming_enabled(bool e) noexcept { _impl->set_reclaiming_enabled(e); }
// Returns the reclaimability state of this region.
bool reclaiming_enabled() const noexcept { return _impl->reclaiming_enabled(); }
// Returns a value which is increased when this region is either compacted or
// evicted from, which invalidates references into the region.
// When the value returned by this method doesn't change, references remain valid.
uint64_t reclaim_counter() const noexcept {
return allocator().invalidate_counter();
}
// Will cause subsequent calls to evictable_occupancy() to report empty occupancy.
void ground_evictable_occupancy();
// Follows region's occupancy in the parent region group. Less fine-grained than occupancy().
// After ground_evictable_occupancy() is called returns 0.
occupancy_stats evictable_occupancy() const noexcept;
// Makes this region an evictable region. Supplied function will be called
// when data from this region needs to be evicted in order to reclaim space.
// The function should free some space from this region.
void make_evictable(eviction_fn) noexcept;
const eviction_fn& evictor() const noexcept;
uint64_t id() const noexcept;
std::unordered_map<std::string, uint64_t> collect_stats() const;
friend class allocating_section;
};
// Scoped guard which keeps references into the region valid for as long as
// it is alive, by switching off compaction and eviction for that region.
// Guards may be nested: each one restores the state it found on entry.
struct reclaim_lock {
    region& _region;
    bool _prev; // reclaiming state observed at construction

    reclaim_lock(region& r) noexcept
        : _region(r)
        , _prev(_region.reclaiming_enabled())
    {
        r.set_reclaiming_enabled(false);
    }

    ~reclaim_lock() {
        // Put back whatever state was in effect before this guard.
        _region.set_reclaiming_enabled(_prev);
    }
};
// Utility for running critical sections which need to lock some region and
// also allocate LSA memory. The object learns from failures how much it
// should reserve up front in order to not cause allocation failures.
//
// A section can optionally carry a name (an abstract_formatter) which is meant
// to identify the section in log messages, e.g. when its reserve is increased.
class allocating_section {
// Do not decay below these minimal values
static constexpr size_t s_min_lsa_reserve = 1;
static constexpr size_t s_min_std_reserve = 1024;
// Initial values of the decay countdown counters declared below.
// NOTE(review): presumably the amount of std bytes / LSA segments to go
// through between two reserve decays; the decay logic lives in
// maybe_decay_reserve(), which is defined out of line.
static constexpr uint64_t s_bytes_per_decay = 10'000'000'000;
static constexpr unsigned s_segments_per_decay = 100'000;
size_t _lsa_reserve = s_min_lsa_reserve; // in segments
size_t _std_reserve = s_min_std_reserve; // in bytes
// Set by with_reserve() from guard::_prev.
size_t _minimum_lsa_emergency_reserve = 0;
int64_t _remaining_std_bytes_until_decay = s_bytes_per_decay;
int _remaining_lsa_segments_until_decay = s_segments_per_decay;
// Optional name of this section; empty for default-constructed sections.
abstract_formatter _name;
private:
// Scoped helper bound to the tracker implementation. `_prev` is read by
// with_reserve() right after construction.
// NOTE(review): presumably `_prev` is the tracker's previous emergency reserve
// which the destructor restores; constructor and destructor are defined out
// of line.
struct guard {
tracker::impl& _tracker;
size_t _prev;
explicit guard(tracker::impl& tracker) noexcept;
~guard();
};
// Out-of-line helpers used by the templates below.
void reserve(tracker::impl& tracker);
void maybe_decay_reserve() noexcept;
void on_alloc_failure(logalloc::region&);
public:
// Creates an unnamed section.
allocating_section() = default;
// Creates a section identified by `name`.
explicit allocating_section(abstract_formatter name) : _name(std::move(name)) {}
void set_lsa_reserve(size_t) noexcept;
void set_std_reserve(size_t) noexcept;
//
// Reserves standard allocator and LSA memory for subsequent operations that
// have to be performed with memory reclamation disabled.
//
// Returns whatever fn() returns.
//
// Throws std::bad_alloc when reserves can't be increased to a sufficient level.
//
template<typename Func>
decltype(auto) with_reserve(region& r, Func&& fn) {
// Remember the reserves on entry so they can be restored if the whole
// operation fails with std::bad_alloc.
auto prev_lsa_reserve = _lsa_reserve;
auto prev_std_reserve = _std_reserve;
try {
// The guard must be alive for the duration of reserve() and fn().
guard g(r.get_tracker().get_impl());
_minimum_lsa_emergency_reserve = g._prev;
reserve(r.get_tracker().get_impl());
return fn();
} catch (const std::bad_alloc&) {
// roll-back limits to protect against pathological requests
// preventing future requests from succeeding.
_lsa_reserve = prev_lsa_reserve;
_std_reserve = prev_std_reserve;
throw;
}
}
//
// Invokes func with reclaim_lock on region r. If LSA allocation fails
// inside func it is retried after increasing LSA segment reserve. The
// memory reserves are increased with region lock off allowing for memory
// reclamation to take place in the region.
//
// References in the region are invalidated when allocating section is re-entered
// on allocation failure.
//
// Precondition (asserted): reclaiming is enabled on r when this is called.
// fn may be invoked more than once, so it must be safe to re-run.
//
// Throws std::bad_alloc when reserves can't be increased to a sufficient level.
//
template<typename Func>
decltype(auto) with_reclaiming_disabled(logalloc::region& r, Func&& fn) {
SCYLLA_ASSERT(r.reclaiming_enabled());
maybe_decay_reserve();
while (true) {
try {
// Both objects are scoped to the try block, so they are destroyed
// before on_alloc_failure() runs in the handler below.
logalloc::reclaim_lock _(r);
memory::disable_abort_on_alloc_failure_temporarily dfg;
return fn();
} catch (const utils::memory_limit_reached&) {
// Do not retry, bumping reserves won't help.
// The read reached a memory limit in the semaphore and is being
// terminated.
// NOTE(review): this handler has to precede the std::bad_alloc one;
// presumably memory_limit_reached derives from std::bad_alloc.
throw;
} catch (const std::bad_alloc&) {
// Returning normally from here makes the loop call fn() again.
on_alloc_failure(r);
}
}
}
//
// Reserves standard allocator and LSA memory and
// invokes func with reclaim_lock on region r. If LSA allocation fails
// inside func it is retried after increasing LSA segment reserve. The
// memory reserves are increased with region lock off allowing for memory
// reclamation to take place in the region.
//
// References in the region are invalidated when allocating section is re-entered
// on allocation failure.
//
// Equivalent to with_reserve(r, ...) wrapping with_reclaiming_disabled(r, func).
//
// Throws std::bad_alloc when reserves can't be increased to a sufficient level.
//
template<typename Func>
decltype(auto) operator()(logalloc::region& r, Func&& func) {
// func is passed on as an lvalue (not forwarded) because the retry loop
// may invoke it several times.
return with_reserve(r, [this, &r, &func] {
return with_reclaiming_disabled(r, func);
});
}
};
// NOTE(review): presumably pre-populates the LSA segment pool given the amount
// of available memory and the amount which must stay free (both look like
// byte counts); the definition is not visible in this header.
future<> prime_segment_pool(size_t available_memory, size_t min_free_memory);
// Use the segment pool appropriate for the standard allocator.
//
// In debug mode, this will use the release standard allocator store.
// Call once, when initializing the application, before any LSA allocation takes place.
future<> use_standard_allocator_segment_pool_backend(size_t available_memory);
}
// fmt support for logalloc::occupancy_stats.
// Output has the form "<used %>%, <used bytes> / <total bytes> [B]", with the
// percentage printed with two decimal places.
template <> struct fmt::formatter<logalloc::occupancy_stats> : fmt::formatter<string_view> {
    auto format(const logalloc::occupancy_stats& stats, fmt::format_context& ctx) const {
        const auto used_percent = stats.used_fraction() * 100;
        const auto used_bytes = stats.used_space();
        const auto total_bytes = stats.total_space();
        return fmt::format_to(ctx.out(), "{:.2f}%, {:d} / {:d} [B]",
                used_percent, used_bytes, total_bytes);
    }
};