The segment heap is a max-heap, with sparser segments on top. When we free from a segment, its occupancy decreases, but its position in the heap increases. This bug caused us to pick segments for compaction in the wrong order. In extreme cases this can lead to a livelock; in milder cases it merely increases compaction latency.
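
A minimal sketch of the invariant the fix maintains (illustrative, taken from region_impl::free() below): freeing lowers a segment's occupancy, which under segment_occupancy_descending_less_compare raises its key in the max-heap, so the heap handle must be repositioned with increase():

    // Occupancy just dropped, so the heap key went up under the
    // descending-occupancy comparator:
    _segments.increase(seg_desc.heap_handle());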
/*
 * Copyright 2015 Cloudius Systems
 */

#include <boost/range/algorithm/heap_algorithm.hpp>
#include <boost/range/algorithm/remove.hpp>
#include <boost/heap/binomial_heap.hpp>
#include <stack>

#include <seastar/core/memory.hh>
#include <seastar/core/align.hh>
#include <seastar/core/print.hh>

#include "utils/logalloc.hh"
#include "log.hh"

standard_allocation_strategy standard_allocation_strategy_instance;

namespace logalloc {

struct segment;

static logging::logger logger("lsa");
static logging::logger timing_logger("lsa-timing");
static thread_local tracker tracker_instance;

using clock = std::chrono::high_resolution_clock;

class tracker::impl {
    std::vector<region::impl*> _regions;
    scollectd::registrations _collectd_registrations;
    bool _reclaiming_enabled = true;
private:
    // Prevents tracker's reclaimer from running while live. Reclaimer may be
    // invoked synchronously with allocator. This guard ensures that this
    // object is not re-entered while inside one of the tracker's methods.
    struct reclaiming_lock {
        impl& _ref;
        bool _prev;
        reclaiming_lock(impl& ref)
            : _ref(ref)
            , _prev(ref._reclaiming_enabled)
        {
            _ref._reclaiming_enabled = false;
        }
        ~reclaiming_lock() {
            _ref._reclaiming_enabled = _prev;
        }
    };
    void register_collectd_metrics();
public:
    impl() {
        register_collectd_metrics();
    }
    ~impl() {
        assert(_regions.empty());
    }
    void register_region(region::impl*);
    void unregister_region(region::impl*);
    size_t reclaim(size_t bytes);
    void full_compaction();
    occupancy_stats occupancy();
};

tracker::tracker()
    : _impl(std::make_unique<impl>())
    , _reclaimer([this] () {
        return reclaim(10*1024*1024)
             ? memory::reclaiming_result::reclaimed_something
             : memory::reclaiming_result::reclaimed_nothing;
    }, memory::reclaimer_scope::sync)
{ }

tracker::~tracker() {
}

size_t tracker::reclaim(size_t bytes) {
    return _impl->reclaim(bytes);
}

occupancy_stats tracker::occupancy() {
    return _impl->occupancy();
}

void tracker::full_compaction() {
    return _impl->full_compaction();
}

tracker& shard_tracker() {
    return tracker_instance;
}

struct segment_occupancy_descending_less_compare {
    inline bool operator()(segment* s1, segment* s2) const;
};

// FIXME: The choice of data structure was arbitrary, evaluate different heap variants.
// Consider using an intrusive container leveraging segment_descriptor objects.
using segment_heap = boost::heap::binomial_heap<
    segment*, boost::heap::compare<segment_occupancy_descending_less_compare>>;

struct segment {
    static constexpr int size_shift = segment_size_shift;
    using size_type = std::conditional_t<(size_shift < 16), uint16_t, uint32_t>;
    static constexpr size_t size = segment_size;

    uint8_t data[size];

    template<typename T = void>
    const T* at(size_t offset) const {
        return reinterpret_cast<const T*>(data + offset);
    }

    template<typename T = void>
    T* at(size_t offset) {
        return reinterpret_cast<T*>(data + offset);
    }

    bool is_empty() const;
    void record_alloc(size_type size);
    void record_free(size_type size);
    occupancy_stats occupancy() const;

    void set_heap_handle(segment_heap::handle_type);
    const segment_heap::handle_type& heap_handle();
};

inline bool
segment_occupancy_descending_less_compare::operator()(segment* s1, segment* s2) const {
    return s2->occupancy() < s1->occupancy();
}
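
// Note: boost::heap provides a max-heap; with the comparison inverted
// (s2 < s1), the segment with the lowest occupancy (the sparsest one,
// and thus the best compaction candidate) sits on top of the heap.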

struct segment_descriptor {
    bool _lsa_managed;
    segment::size_type _offset;
    segment::size_type _free_space;
    segment_heap::handle_type _heap_handle;

    segment_descriptor()
        : _lsa_managed(false)
    { }

    bool is_empty() const {
        return _free_space == segment::size;
    }

    occupancy_stats occupancy() const {
        return { _free_space, segment::size };
    }

    void record_alloc(segment::size_type size) {
        _free_space -= size;
    }

    void record_free(segment::size_type size) {
        _free_space += size;
    }

    void set_heap_handle(segment_heap::handle_type h) {
        _heap_handle = h;
    }

    const segment_heap::handle_type& heap_handle() const {
        return _heap_handle;
    }
};

#ifndef DEFAULT_ALLOCATOR

struct free_segment {
    free_segment* next;
} __attribute__((packed));
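
// The link to the next free segment lives inside the free segment's own
// storage, so keeping segments in the reserve costs no extra memory.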

class segment_stack {
    free_segment* _head = nullptr;
    size_t _size = 0;
public:
    segment* pop() noexcept {
        segment* seg = reinterpret_cast<segment*>(_head);
        _head = _head->next;
        --_size;
        return seg;
    }
    void push(segment* seg) noexcept {
        free_segment* fs = reinterpret_cast<free_segment*>(seg);
        fs->next = _head;
        _head = fs;
        ++_size;
    }
    size_t size() const {
        return _size;
    }
};
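
// Note: pop() assumes a non-empty stack (it dereferences _head
// unconditionally); callers check size() first.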

// Segment pool implementation for the seastar allocator.
// Stores segment descriptors in a vector which is indexed using the most
// significant bits of the segment address.
class segment_pool {
    std::vector<segment_descriptor> _segments;
    uintptr_t _segments_base; // The address of the first segment
    size_t _segments_in_use{};
    memory::memory_layout _layout;
    size_t _current_emergency_reserve_goal = 1;
    size_t _emergency_reserve_max = 30;
    segment_stack _emergency_reserve;
    bool _allocation_failure_flag = false;
private:
    segment* allocate_or_fallback_to_reserve();
    void free_or_restore_to_reserve(segment* seg) noexcept;
public:
    segment_pool();
    segment* new_segment();
    segment_descriptor& descriptor(const segment*);
    // Returns the segment containing the given object, or nullptr.
    segment* containing_segment(void* obj) const;
    void free_segment(segment*) noexcept;
    void free_segment(segment*, segment_descriptor&) noexcept;
    size_t segments_in_use() const;
    size_t current_emergency_reserve_goal() const { return _current_emergency_reserve_goal; }
    void set_emergency_reserve_max(size_t new_size) { _emergency_reserve_max = new_size; }
    size_t emergency_reserve_max() { return _emergency_reserve_max; }
    void set_current_emergency_reserve_goal(size_t goal) { _current_emergency_reserve_goal = goal; }
    void clear_allocation_failure_flag() { _allocation_failure_flag = false; }
    bool allocation_failure_flag() { return _allocation_failure_flag; }
    void refill_emergency_reserve();
    size_t trim_emergency_reserve_to_max();
    struct reservation_goal;
};

void segment_pool::refill_emergency_reserve() {
    while (_emergency_reserve.size() < _emergency_reserve_max) {
        auto seg = new segment;
        _emergency_reserve.push(seg);
    }
}

size_t segment_pool::trim_emergency_reserve_to_max() {
    size_t n_released = 0;
    while (_emergency_reserve.size() > _emergency_reserve_max) {
        // Return excess segments to the allocator; merely popping them
        // would leak them and misreport the amount reclaimed.
        delete _emergency_reserve.pop();
        ++n_released;
    }
    return n_released;
}

segment_descriptor&
segment_pool::descriptor(const segment* seg) {
    uintptr_t seg_addr = reinterpret_cast<uintptr_t>(seg);
    uintptr_t index = (seg_addr - _segments_base) >> segment::size_shift;
    return _segments[index];
}

segment*
segment_pool::containing_segment(void* obj) const {
    auto addr = reinterpret_cast<uintptr_t>(obj);
    auto offset = addr & (segment::size - 1);
    auto index = (addr - _segments_base) >> segment::size_shift;
    auto& desc = _segments[index];
    if (desc._lsa_managed && offset >= desc._offset) {
        return reinterpret_cast<segment*>(addr - offset + desc._offset);
    } else {
        if (index == 0) {
            return nullptr;
        }
        auto& prev = _segments[index - 1];
        if (prev._lsa_managed && offset < prev._offset) {
            return reinterpret_cast<segment*>(addr - offset - segment::size + prev._offset);
        } else {
            return nullptr;
        }
    }
}
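
// Note on the lookup above: segments are not necessarily aligned to
// segment::size, so a segment may span two descriptor slots. _offset
// records where the segment starts within its slot; an object whose
// in-slot offset is below that belongs to the segment starting in the
// previous slot and spilling into this one.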

segment*
segment_pool::allocate_or_fallback_to_reserve() {
    if (_emergency_reserve.size() <= _current_emergency_reserve_goal) {
        try {
            return new segment;
        } catch (const std::bad_alloc&) {
            _allocation_failure_flag = true;
            throw;
        }
    }
    return _emergency_reserve.pop();
}
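
// Reserve policy: while the reserve holds no more than the current goal,
// allocate fresh segments so the reserve stays intact for allocations made
// during compaction; only above the goal is the reserve itself consumed.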

void
segment_pool::free_or_restore_to_reserve(segment* seg) noexcept {
    if (_emergency_reserve.size() < emergency_reserve_max()) {
        _emergency_reserve.push(seg);
    } else {
        delete seg;
    }
}

segment*
segment_pool::new_segment() {
    auto seg = allocate_or_fallback_to_reserve();
    ++_segments_in_use;
    segment_descriptor& desc = descriptor(seg);
    desc._lsa_managed = true;
    desc._offset = reinterpret_cast<uintptr_t>(seg) & (segment::size - 1);
    desc._free_space = segment::size;
    return seg;
}

void segment_pool::free_segment(segment* seg) noexcept {
    free_segment(seg, descriptor(seg));
}

void segment_pool::free_segment(segment* seg, segment_descriptor& desc) noexcept {
    logger.trace("Releasing segment {}", seg);
    desc._lsa_managed = false;
    free_or_restore_to_reserve(seg);
    --_segments_in_use;
}

segment_pool::segment_pool()
    : _layout(memory::get_memory_layout())
{
    _segments_base = align_down(_layout.start, (uintptr_t)segment::size);
    _segments.resize((_layout.end - _segments_base) / segment::size);
    for (size_t i = 0; i < _current_emergency_reserve_goal; ++i) {
        _emergency_reserve.push(new segment);
    }
}

#else

// Segment pool version for the standard allocator. Slightly less efficient
// than the version for seastar's allocator.
class segment_pool {
    std::unordered_map<const segment*, segment_descriptor> _segments;
    size_t _segments_in_use{};
public:
    segment* new_segment() {
        ++_segments_in_use;
        auto seg = new (with_alignment(segment::size)) segment;
        assert((reinterpret_cast<uintptr_t>(seg) & (sizeof(segment) - 1)) == 0);
        segment_descriptor& desc = _segments[seg];
        desc._lsa_managed = true;
        desc._free_space = segment::size;
        return seg;
    }
    segment_descriptor& descriptor(const segment* seg) {
        auto i = _segments.find(seg);
        if (i != _segments.end()) {
            return i->second;
        } else {
            segment_descriptor& desc = _segments[seg];
            desc._lsa_managed = false;
            return desc;
        }
    }
    void free_segment(segment* seg, segment_descriptor& desc) {
        free_segment(seg);
    }
    void free_segment(segment* seg) {
        --_segments_in_use;
        auto i = _segments.find(seg);
        assert(i != _segments.end());
        _segments.erase(i);
        delete seg;
    }
    segment* containing_segment(void* obj) const {
        uintptr_t addr = reinterpret_cast<uintptr_t>(obj);
        auto seg = reinterpret_cast<segment*>(align_down(addr, static_cast<uintptr_t>(segment::size)));
        auto i = _segments.find(seg);
        if (i == _segments.end()) {
            return nullptr;
        }
        return seg;
    }
    size_t segments_in_use() const;
    size_t current_emergency_reserve_goal() const { return 0; }
    void set_current_emergency_reserve_goal(size_t goal) { }
    void set_emergency_reserve_max(size_t new_size) { }
    size_t emergency_reserve_max() { return 0; }
    void clear_allocation_failure_flag() { }
    bool allocation_failure_flag() { return false; }
    void refill_emergency_reserve() {}
    size_t trim_emergency_reserve_to_max() { return 0; }
public:
    class reservation_goal;
};

#endif

// RAII wrapper to maintain segment_pool::current_emergency_reserve_goal()
class segment_pool::reservation_goal {
    segment_pool& _sp;
    size_t _old_goal;
public:
    reservation_goal(segment_pool& sp, size_t goal)
        : _sp(sp), _old_goal(_sp.current_emergency_reserve_goal()) {
        _sp.set_current_emergency_reserve_goal(goal);
    }
    ~reservation_goal() {
        _sp.set_current_emergency_reserve_goal(_old_goal);
    }
};

size_t segment_pool::segments_in_use() const {
    return _segments_in_use;
}

static thread_local segment_pool shard_segment_pool;

void segment::record_alloc(segment::size_type size) {
    shard_segment_pool.descriptor(this).record_alloc(size);
}

void segment::record_free(segment::size_type size) {
    shard_segment_pool.descriptor(this).record_free(size);
}

bool segment::is_empty() const {
    return shard_segment_pool.descriptor(this).is_empty();
}

occupancy_stats
segment::occupancy() const {
    return { shard_segment_pool.descriptor(this)._free_space, segment::size };
}

void
segment::set_heap_handle(segment_heap::handle_type handle) {
    shard_segment_pool.descriptor(this)._heap_handle = handle;
}

const segment_heap::handle_type&
segment::heap_handle() {
    return shard_segment_pool.descriptor(this)._heap_handle;
}

//
// For interface documentation see logalloc::region and allocation_strategy.
//
// Allocation dynamics.
//
// Objects are allocated inside fixed-size segments. Objects don't cross
// segment boundaries. Active allocations are served from a single segment
// using the bump-the-pointer method. That segment is called the active
// segment. When the active segment fills up, it is closed. Closed segments
// are kept in a heap which orders them by occupancy. As objects are freed,
// segments become sparser and are eventually released. Objects which are
// too large are allocated using the standard allocator.
//
// Segment layout.
//
// Objects in a segment are laid out sequentially. Each object is preceded by
// a descriptor (see object_descriptor). Object alignment is respected, so if
// there is a gap between the end of the current object and the next object's
// descriptor, a trunk of the object descriptor is left right after the
// current object with the flags byte indicating the amount of padding.
//
// Per-segment metadata is kept in a separate array, managed by the
// segment_pool object.
//
class region_impl : public allocation_strategy {
    static constexpr float max_occupancy_for_compaction = 0.85; // FIXME: make configurable
    static constexpr size_t max_managed_object_size = segment::size * 0.1;

    // single-byte flags
    struct obj_flags {
        static constexpr uint8_t live_flag = 0x01;
        static constexpr uint8_t eos_flag = 0x02;
        static constexpr size_t max_alignment = (0xff >> 2) + 1;

        static uint8_t with_padding(uint8_t padding) {
            assert(padding < max_alignment);
            return uint8_t(padding << 2);
        }

        //
        // bit 0: 0 = dead, 1 = live
        // bit 1: when set, end-of-segment marker
        // bits 2-7: The value represents padding in bytes between the end of the previous
        //   object and this object's descriptor. Must be smaller than the object's
        //   alignment, so the maximum alignment is 64.
        //
        uint8_t _value;

        obj_flags(uint8_t value)
            : _value(value)
        { }

        static obj_flags make_end_of_segment() {
            return { eos_flag };
        }

        static obj_flags make_live(uint8_t padding) {
            return obj_flags(live_flag | with_padding(padding));
        }

        static obj_flags make_padding(uint8_t padding) {
            return obj_flags(with_padding(padding));
        }

        static obj_flags make_dead(uint8_t padding) {
            return obj_flags(with_padding(padding));
        }

        // Number of bytes preceding this descriptor after the end of the previous object
        uint8_t padding() const {
            return _value >> 2;
        }

        bool is_live() const {
            return _value & live_flag;
        }

        bool is_end_of_segment() const {
            return _value & eos_flag;
        }

        void mark_dead() {
            _value &= ~live_flag;
        }
    } __attribute__((packed));
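
    // Example (illustrative): a live object preceded by 3 bytes of padding
    // has _value == (3 << 2) | live_flag == 0x0d, and padding() recovers
    // 0x0d >> 2 == 3.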

    class object_descriptor {
    private:
        obj_flags _flags;
        uint8_t _alignment;
        segment::size_type _size;
        allocation_strategy::migrate_fn _migrator;
    public:
        object_descriptor(allocation_strategy::migrate_fn migrator, segment::size_type size, uint8_t alignment, uint8_t padding)
            : _flags(obj_flags::make_live(padding))
            , _alignment(alignment)
            , _size(size)
            , _migrator(migrator)
        { }

        void mark_dead() {
            _flags.mark_dead();
        }

        allocation_strategy::migrate_fn migrator() const {
            return _migrator;
        }

        uint8_t alignment() const {
            return _alignment;
        }

        segment::size_type size() const {
            return _size;
        }

        obj_flags flags() const {
            return _flags;
        }

        bool is_live() const {
            return _flags.is_live();
        }

        bool is_end_of_segment() const {
            return _flags.is_end_of_segment();
        }

        uint8_t padding() const {
            return _flags.padding();
        }

        friend std::ostream& operator<<(std::ostream& out, const object_descriptor& desc) {
            return out << sprint("{flags = %x, migrator=%p, alignment=%d, size=%d}",
                (int)desc._flags._value, desc._migrator, desc._alignment, desc._size);
        }
    } __attribute__((packed));
private:
    region_group* _group = nullptr;
    segment* _active = nullptr;
    size_t _active_offset;
    segment_heap _segments; // Contains only closed segments
    occupancy_stats _closed_occupancy;
    bool _reclaiming_enabled = true;
    bool _evictable = false;
    uint64_t _id;
    uint64_t _reclaim_counter = 0;
    eviction_fn _eviction_fn;
private:
    struct compaction_lock {
        region_impl& _region;
        bool _prev;
        compaction_lock(region_impl& r)
            : _region(r)
            , _prev(r._reclaiming_enabled)
        {
            _region._reclaiming_enabled = false;
        }
        ~compaction_lock() {
            _region._reclaiming_enabled = _prev;
        }
    };
    void* alloc_small(allocation_strategy::migrate_fn migrator, segment::size_type size, size_t alignment) {
        assert(alignment < obj_flags::max_alignment);

        if (!_active) {
            _active = new_segment();
            _active_offset = 0;
        }

        size_t obj_offset = align_up(_active_offset + sizeof(object_descriptor), alignment);
        if (obj_offset + size > segment::size) {
            close_and_open();
            return alloc_small(migrator, size, alignment);
        }

        auto descriptor_offset = obj_offset - sizeof(object_descriptor);
        auto padding = descriptor_offset - _active_offset;

        new (_active->at(_active_offset)) obj_flags(obj_flags::make_padding(padding));
        new (_active->at(descriptor_offset)) object_descriptor(migrator, size, alignment, padding);

        void* obj = _active->at(obj_offset);
        _active_offset = obj_offset + size;
        _active->record_alloc(size + sizeof(object_descriptor) + padding);
        return obj;
    }
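
    // Layout sketch (illustrative): allocating size == 24 with alignment == 8
    // at _active_offset == 10 places the object at
    // align_up(10 + sizeof(object_descriptor), 8), the descriptor in the
    // bytes just below it, and a padding obj_flags byte at offset 10
    // recording the gap.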

    template<typename Func>
    void for_each_live(segment* seg, Func&& func) {
        static_assert(std::is_same<void, std::result_of_t<Func(object_descriptor*, void*)>>::value, "bad Func signature");

        size_t offset = 0;
        while (offset < segment::size) {
            object_descriptor* desc = seg->at<object_descriptor>(offset);
            offset += desc->flags().padding();
            desc = seg->at<object_descriptor>(offset);
            if (desc->is_end_of_segment()) {
                break;
            }
            offset += sizeof(object_descriptor);
            if (desc->is_live()) {
                func(desc, seg->at(offset));
            }
            offset += desc->size();
        }
    }

    void close_active() {
        if (!_active) {
            return;
        }
        if (_active_offset < segment::size) {
            new (_active->at(_active_offset)) obj_flags(obj_flags::make_end_of_segment());
        }
        logger.trace("Closing segment {}, used={}, waste={} [B]", _active, _active->occupancy(), segment::size - _active_offset);
        _closed_occupancy += _active->occupancy();
        auto handle = _segments.push(_active);
        _active->set_heap_handle(handle);
        _active = nullptr;
    }

    void free_segment(segment* seg) noexcept {
        shard_segment_pool.free_segment(seg);
        if (_group) {
            _group->update(-segment::size);
        }
    }

    segment* new_segment() {
        segment* seg = shard_segment_pool.new_segment();
        if (_group) {
            _group->update(segment::size);
        }
        return seg;
    }

    void compact(segment* seg) {
        ++_reclaim_counter;

        for_each_live(seg, [this] (object_descriptor* desc, void* obj) {
            auto dst = alloc_small(desc->migrator(), desc->size(), desc->alignment());
            desc->migrator()(obj, dst, desc->size());
        });

        free_segment(seg);
    }

    void close_and_open() {
        segment* new_active = new_segment();
        close_active();
        _active = new_active;
        _active_offset = 0;
    }

    static uint64_t next_id() {
        static std::atomic<uint64_t> id{0};
        return id.fetch_add(1);
    }
    struct degroup_temporarily {
        region_impl* impl;
        region_group* group;
        explicit degroup_temporarily(region_impl* impl)
            : impl(impl), group(impl->_group) {
            if (group) {
                group->del(impl);
            }
        }
        ~degroup_temporarily() {
            if (group) {
                group->add(impl);
            }
        }
    };

public:
    explicit region_impl(region_group* group = nullptr)
        : _group(group), _id(next_id())
    {
        tracker_instance._impl->register_region(this);
        if (group) {
            group->add(this);
        }
    }

    virtual ~region_impl() {
        tracker_instance._impl->unregister_region(this);

        while (!_segments.empty()) {
            segment* seg = _segments.top();
            _segments.pop();
            assert(seg->is_empty());
            free_segment(seg);
        }
        if (_active) {
            assert(_active->is_empty());
            free_segment(_active);
        }
        if (_group) {
            _group->del(this);
        }
    }

    region_impl(region_impl&&) = delete;
    region_impl(const region_impl&) = delete;

    bool empty() const {
        return occupancy().used_space() == 0;
    }

    occupancy_stats occupancy() const {
        occupancy_stats total{};
        total += _closed_occupancy;
        if (_active) {
            total += _active->occupancy();
        }
        return total;
    }

    occupancy_stats compactible_occupancy() const {
        return _closed_occupancy;
    }

    //
    // Returns true if this region can be compacted and compact() will make forward progress,
    // so that this will eventually stop:
    //
    //    while (is_compactible()) { compact(); }
    //
    bool is_compactible() const {
        return _reclaiming_enabled
            && (_closed_occupancy.free_space() >= 2 * segment::size)
            && (_closed_occupancy.used_fraction() < max_occupancy_for_compaction)
            && (_segments.top()->occupancy().free_space() >= max_managed_object_size);
    }
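
    // Rationale (informal): with at least 2 * segment::size free among the
    // closed segments, below-threshold occupancy, and a sparsest segment that
    // can absorb any managed object, compacting the top segment can repack
    // its survivors without, on balance, opening more segments than it
    // releases, so the loop shown above terminates.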

    virtual void* alloc(allocation_strategy::migrate_fn migrator, size_t size, size_t alignment) override {
        compaction_lock _(*this);
        if (size > max_managed_object_size) {
            return standard_allocator().alloc(migrator, size, alignment);
        } else {
            return alloc_small(migrator, (segment::size_type) size, alignment);
        }
    }

    virtual void free(void* obj) noexcept override {
        compaction_lock _(*this);
        segment* seg = shard_segment_pool.containing_segment(obj);

        if (!seg) {
            standard_allocator().free(obj);
            return;
        }

        segment_descriptor& seg_desc = shard_segment_pool.descriptor(seg);

        auto desc = reinterpret_cast<object_descriptor*>(reinterpret_cast<uintptr_t>(obj) - sizeof(object_descriptor));
        desc->mark_dead();

        if (seg != _active) {
            _closed_occupancy -= seg->occupancy();
        }

        seg_desc.record_free(desc->size() + sizeof(object_descriptor) + desc->padding());

        if (seg != _active) {
            if (seg_desc.is_empty()) {
                _segments.erase(seg_desc.heap_handle());
                free_segment(seg);
            } else {
                _closed_occupancy += seg_desc.occupancy();
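                // Freeing made the segment sparser, which raises its key
                // under segment_occupancy_descending_less_compare, so the
                // handle is repositioned with increase().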
                _segments.increase(seg_desc.heap_handle());
            }
        }
    }

    // Merges another region into this region. The other region is made
    // to refer to this region.
    // Doesn't invalidate references to allocated objects.
    void merge(region_impl& other) {
        compaction_lock dct1(*this);
        compaction_lock dct2(other);
        degroup_temporarily dgt1(this);
        degroup_temporarily dgt2(&other);

        if (_active && _active->is_empty()) {
            shard_segment_pool.free_segment(_active);
            _active = nullptr;
        }
        if (!_active) {
            _active = other._active;
            other._active = nullptr;
            _active_offset = other._active_offset;
        } else {
            other.close_active();
        }

        _segments.merge(other._segments);

        _closed_occupancy += other._closed_occupancy;
        other._closed_occupancy = {};

        // Make sure both regions will notice a future increment
        // of the reclaim counter.
        _reclaim_counter = std::max(_reclaim_counter, other._reclaim_counter);
    }

    // Returns the occupancy of the sparsest compactible segment.
    occupancy_stats min_occupancy() const {
        if (_segments.empty()) {
            return {};
        }
        return _segments.top()->occupancy();
    }

    // Tries to release one full segment back to the segment pool.
    void compact() {
        if (!is_compactible()) {
            return;
        }

        compaction_lock _(*this);

        auto in_use = shard_segment_pool.segments_in_use();
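
        // Each compact(seg) call migrates seg's live objects into the active
        // segment and frees seg; loop until the shard-wide in-use count drops
        // below its value on entry, i.e. one segment has been released net.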

        while (shard_segment_pool.segments_in_use() >= in_use) {
            segment* seg = _segments.top();
            logger.debug("Compacting segment {} from region {}, {}", seg, id(), seg->occupancy());
            _segments.pop();
            _closed_occupancy -= seg->occupancy();
            compact(seg);
        }
    }

    // Compacts everything. Mainly for testing.
    // Invalidates references to allocated objects.
    void full_compaction() {
        compaction_lock _(*this);
        logger.debug("Full compaction, {}", occupancy());
        close_and_open();
        segment_heap all;
        std::swap(all, _segments);
        _closed_occupancy = {};
        while (!all.empty()) {
            segment* seg = all.top();
            all.pop();
            compact(seg);
        }
        logger.debug("Done, {}", occupancy());
    }

    allocation_strategy& allocator() {
        return *this;
    }

    uint64_t id() const {
        return _id;
    }

    void set_reclaiming_enabled(bool enabled) {
        _reclaiming_enabled = enabled;
    }

    bool reclaiming_enabled() const {
        return _reclaiming_enabled;
    }

    // Returns true if this pool is evictable, so that evict_some() can be called.
    bool is_evictable() const {
        return _evictable && _reclaiming_enabled;
    }

    memory::reclaiming_result evict_some() {
        ++_reclaim_counter;
        return _eviction_fn();
    }

    void make_not_evictable() {
        _evictable = false;
        _eviction_fn = {};
    }

    void make_evictable(eviction_fn fn) {
        _evictable = true;
        _eviction_fn = std::move(fn);
    }

    uint64_t reclaim_counter() const {
        return _reclaim_counter;
    }

    friend class region_group;
};

region::region()
    : _impl(make_shared<impl>())
{ }

region::region(region_group& group)
    : _impl(make_shared<impl>(&group)) {
}

region::~region() {
}

occupancy_stats region::occupancy() const {
    return _impl->occupancy();
}

void region::merge(region& other) {
    if (_impl != other._impl) {
        _impl->merge(*other._impl);
        other._impl = _impl;
    }
}

void region::full_compaction() {
    _impl->full_compaction();
}

void region::make_evictable(eviction_fn fn) {
    _impl->make_evictable(std::move(fn));
}

allocation_strategy& region::allocator() {
    return *_impl;
}

void region::set_reclaiming_enabled(bool compactible) {
    _impl->set_reclaiming_enabled(compactible);
}

bool region::reclaiming_enabled() const {
    return _impl->reclaiming_enabled();
}

uint64_t region::reclaim_counter() const {
    return _impl->reclaim_counter();
}

std::ostream& operator<<(std::ostream& out, const occupancy_stats& stats) {
    return out << sprint("%.2f%%, %d / %d [B]",
        stats.used_fraction() * 100, stats.used_space(), stats.total_space());
}

occupancy_stats tracker::impl::occupancy() {
    reclaiming_lock _(*this);
    occupancy_stats total{};
    for (auto&& r: _regions) {
        total += r->occupancy();
    }
    return total;
}

void tracker::impl::full_compaction() {
    reclaiming_lock _(*this);

    logger.debug("Full compaction on all regions, {}", occupancy());

    for (region_impl* r : _regions) {
        if (r->is_compactible()) {
            r->full_compaction();
        }
    }

    logger.debug("Compaction done, {}", occupancy());
}

static void reclaim_from_evictable(region::impl& r, size_t target_segments_in_use) {
    while (true) {
        auto deficit = (shard_segment_pool.segments_in_use() - target_segments_in_use) * segment::size;
        auto occupancy = r.occupancy();
        auto used = occupancy.used_space();
        if (used == 0) {
            // FIXME: There could still be some objects which are allocated
            // using that region but were too large and are not managed by
            // LSA. We should avoid having large objects in the first place,
            // and make the managed_blob object fracture them internally. To
            // handle eviction of large objects we should first move the
            // segment pool service into the seastar allocator, so that
            // evicting large objects counts towards that pool. It also makes
            // sense to have the reclaimer coupled with that segment pool, and
            // not with the page pool like it is now.
            break;
        }
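        // Evict only what compaction cannot recover: the region's free space
        // is reclaimable by compacting, so the amount to evict is the deficit
        // minus that free space, clamped so used_target stays non-negative.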
        auto used_target = used - std::min(used, deficit - std::min(deficit, occupancy.free_space()));
        logger.debug("Evicting {} bytes from region {}, occupancy={}", used - used_target, r.id(), r.occupancy());
        while (r.occupancy().used_space() > used_target || !r.is_compactible()) {
            if (r.evict_some() == memory::reclaiming_result::reclaimed_nothing) {
                logger.debug("Unable to evict more, evicted {} bytes", used - r.occupancy().used_space());
                return;
            }
            if (shard_segment_pool.segments_in_use() <= target_segments_in_use) {
                logger.debug("Target met after evicting {} bytes", used - r.occupancy().used_space());
                return;
            }
            if (r.empty()) {
                return;
            }
        }
        logger.debug("Compacting after evicting {} bytes", used - r.occupancy().used_space());
        r.compact();
    }
}

struct reclaim_timer {
    clock::time_point start;
    bool enabled;
    reclaim_timer() {
        if (timing_logger.is_enabled(logging::log_level::debug)) {
            start = clock::now();
            enabled = true;
        } else {
            enabled = false;
        }
    }
    ~reclaim_timer() {
        if (enabled) {
            auto duration = clock::now() - start;
            timing_logger.debug("Reclamation cycle took {} us.",
                std::chrono::duration_cast<std::chrono::duration<double, std::micro>>(duration).count());
        }
    }
};

size_t tracker::impl::reclaim(size_t bytes) {
    //
    // Algorithm outline.
    //
    // Regions are kept in a max-heap ordered so that regions with
    // sparser segments are picked first. Non-compactible regions will be
    // picked last. In each iteration we try to release one whole segment from
    // the region which has the sparsest segment. We do this until we have
    // released enough segments or there are no more regions we can compact.
    //
    // When compaction is not sufficient to reclaim space, we evict data from
    // evictable regions.
    //

    // This may run synchronously with allocation, so we should not allocate
    // memory, otherwise we may get std::bad_alloc. Currently we only allocate
    // in the logger when debug level is enabled. It's disabled during normal
    // operation. Having it is still valuable during testing and in most cases
    // should work just fine even if it allocates.

    constexpr auto max_bytes = std::numeric_limits<size_t>::max() - segment::size;
    auto segments_to_release = align_up(std::min(max_bytes, bytes), segment::size) >> segment::size_shift;
    size_t nr_released = 0;

    size_t released_from_reserve = shard_segment_pool.trim_emergency_reserve_to_max();
    nr_released += released_from_reserve;
    if (nr_released >= segments_to_release) {
        return nr_released * segment::size;
    }

    if (!_reclaiming_enabled) {
        return nr_released * segment::size;
    }

    reclaiming_lock _(*this);
    reclaim_timer timing_guard;

    size_t in_use = shard_segment_pool.segments_in_use();
    auto target = in_use - std::min(in_use, segments_to_release - nr_released);

    logger.debug("Compacting, requested {} ({} B), {} segments in use ({} B), target is {}",
        segments_to_release, bytes, in_use, in_use * segment::size, target);

    // Allow dipping into reserves while compacting
    segment_pool::reservation_goal open_emergency_pool(shard_segment_pool, 0);

    auto cmp = [] (region::impl* c1, region::impl* c2) {
        if (c1->is_compactible() != c2->is_compactible()) {
            return !c1->is_compactible();
        }
        return c2->min_occupancy() < c1->min_occupancy();
    };
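
    // Heap order: compactible regions sort above non-compactible ones, and
    // among compactible ones pop_heap yields the region whose sparsest
    // segment has the lowest occupancy.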

    boost::range::make_heap(_regions, cmp);

    if (logger.is_enabled(logging::log_level::debug)) {
        logger.debug("Occupancy of regions:");
        for (region::impl* r : _regions) {
            logger.debug(" - {}: min={}, avg={}", r->id(), r->min_occupancy(), r->compactible_occupancy());
        }
    }

    while (shard_segment_pool.segments_in_use() > target) {
        boost::range::pop_heap(_regions, cmp);
        region::impl* r = _regions.back();

        if (!r->is_compactible()) {
            logger.trace("Unable to release segments, no compactible pools.");
            break;
        }

        r->compact();

        boost::range::push_heap(_regions, cmp);
    }

    auto released_during_compaction = in_use - shard_segment_pool.segments_in_use();

    if (shard_segment_pool.segments_in_use() > target) {
        logger.debug("Considering evictable regions.");
        // FIXME: Fair eviction
        for (region::impl* r : _regions) {
            if (r->is_evictable()) {
                reclaim_from_evictable(*r, target);
                if (shard_segment_pool.segments_in_use() <= target) {
                    break;
                }
            }
        }
    }

    nr_released += in_use - shard_segment_pool.segments_in_use();

    logger.debug("Released {} segments (wanted {}), {} during compaction, {} from reserve",
        nr_released, segments_to_release, released_during_compaction, released_from_reserve);

    return nr_released * segment::size;
}

void tracker::impl::register_region(region::impl* r) {
    reclaiming_lock _(*this);
    _regions.push_back(r);
    logger.debug("Registered region @{} with id={}", r, r->id());
}

void tracker::impl::unregister_region(region::impl* r) {
    reclaiming_lock _(*this);
    logger.debug("Unregistering region, id={}", r->id());
    _regions.erase(std::remove(_regions.begin(), _regions.end(), r));
}

void tracker::impl::register_collectd_metrics() {
    _collectd_registrations = scollectd::registrations({
        scollectd::add_polled_metric(
            scollectd::type_instance_id("lsa", scollectd::per_cpu_plugin_instance, "bytes", "total_space"),
            scollectd::make_typed(scollectd::data_type::GAUGE, [this] { return occupancy().total_space(); })
        ),
        scollectd::add_polled_metric(
            scollectd::type_instance_id("lsa", scollectd::per_cpu_plugin_instance, "bytes", "used_space"),
            scollectd::make_typed(scollectd::data_type::GAUGE, [this] { return occupancy().used_space(); })
        ),
        scollectd::add_polled_metric(
            scollectd::type_instance_id("lsa", scollectd::per_cpu_plugin_instance, "bytes", "non_lsa_used_space"),
            scollectd::make_typed(scollectd::data_type::GAUGE, [this] { return memory::stats().allocated_memory() - occupancy().total_space(); })
        ),
        scollectd::add_polled_metric(
            scollectd::type_instance_id("lsa", scollectd::per_cpu_plugin_instance, "percent", "occupancy"),
            scollectd::make_typed(scollectd::data_type::GAUGE, [this] { return occupancy().used_fraction() * 100; })
        ),
    });
}

region_group::region_group(region_group&& o) noexcept
    : _parent(o._parent), _total_memory(o._total_memory)
    , _subgroups(std::move(o._subgroups)), _regions(std::move(o._regions)) {
    if (_parent) {
        _parent->del(&o);
        _parent->add(this);
    }
    o._total_memory = 0;
    for (auto&& sg : _subgroups) {
        sg->_parent = this;
    }
    for (auto&& r : _regions) {
        r->_group = this;
    }
}

void
region_group::add(region_group* child) {
    _subgroups.push_back(child);
    update(child->_total_memory);
}

void
region_group::del(region_group* child) {
    _subgroups.erase(boost::range::remove(_subgroups, child), _subgroups.end());
    update(-child->_total_memory);
}

void
region_group::add(region_impl* child) {
    _regions.push_back(child);
    update(child->occupancy().total_space());
}

void
region_group::del(region_impl* child) {
    _regions.erase(boost::range::remove(_regions, child), _regions.end());
    update(-child->occupancy().total_space());
}

allocating_section::guard::guard()
    : _prev(shard_segment_pool.emergency_reserve_max())
{ }

allocating_section::guard::~guard() {
    shard_segment_pool.set_emergency_reserve_max(_prev);
}

#ifndef DEFAULT_ALLOCATOR

void allocating_section::guard::enter(allocating_section& self) {
    shard_segment_pool.set_emergency_reserve_max(std::max(self._lsa_reserve, _prev));
    shard_segment_pool.refill_emergency_reserve();

    while (true) {
        size_t free = memory::stats().free_memory();
        if (free >= self._std_reserve) {
            break;
        }
        if (!tracker_instance.reclaim(self._std_reserve - free)) {
            throw std::bad_alloc();
        }
    }

    shard_segment_pool.clear_allocation_failure_flag();
}
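
// Note: guard::enter() provisions both reserves up front: LSA segments via
// refill_emergency_reserve(), and standard-allocator head-room by reclaiming
// until free memory reaches _std_reserve.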

void allocating_section::on_alloc_failure() {
    if (shard_segment_pool.allocation_failure_flag()) {
        _lsa_reserve *= 2; // FIXME: decay?
        logger.debug("LSA allocation failure, increasing reserve in section {} to {} segments", this, _lsa_reserve);
    } else {
        _std_reserve *= 2; // FIXME: decay?
        logger.debug("Standard allocator failure, increasing head-room in section {} to {} [B]", this, _std_reserve);
    }
}

#else

void allocating_section::guard::enter(allocating_section& self) {
}

void allocating_section::on_alloc_failure() {
    throw std::bad_alloc();
}

#endif

}