scylladb/utils/logalloc.hh
Glauber Costa 2bffa8af74 logalloc: make sure allocations in release_requests don't recurse back into the allocator
Calls like later() and with_gate() may allocate memory, although that is not
very common. This is a problem: the allocation can potentially recurse back
into the allocator during free - the very thing the call to later() is trying
to avoid.

This patch wraps the relevant calls in the reclaimer lock. This does mean that the
allocation may fail if we are under severe pressure - which includes having
exhausted all reserved space - but at least we won't recurse back to the
allocator.

To make sure we do this as early as possible, we fold both release_requests
and do_release_requests into a single function.

Thanks Tomek for the suggestion.

Signed-off-by: Glauber Costa <glauber@scylladb.com>
Message-Id: <980245ccc17960cf4fcbbfedb29d1878a98d85d8.1470254846.git.glauber@scylladb.com>
(cherry picked from commit fe6a0d97d1)
2016-08-04 11:17:54 +02:00


/*
* Copyright (C) 2015 ScyllaDB
*/
/*
* This file is part of Scylla.
*
* Scylla is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Scylla is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
*/
#pragma once
#include <memory>
#include <experimental/optional>
#include <seastar/core/scollectd.hh>
#include <seastar/core/memory.hh>
#include <seastar/core/reactor.hh>
#include <seastar/core/shared_ptr.hh>
#include <seastar/core/shared_future.hh>
#include <seastar/core/gate.hh>
#include <seastar/core/future-util.hh>
#include <seastar/core/circular_buffer.hh>
#include "allocation_strategy.hh"
#include <boost/heap/binomial_heap.hpp>
namespace logalloc {
struct occupancy_stats;
class region;
class region_impl;
class allocating_section;
constexpr int segment_size_shift = 18; // 256K; see #151, #152
constexpr size_t segment_size = 1 << segment_size_shift;
//
// Frees some number of objects from the region to which it's attached.
//
// Provided no new objects are added, the following loop should eventually terminate:
//
//     while (eviction_fn() == memory::reclaiming_result::reclaimed_something) ;
//
using eviction_fn = std::function<memory::reclaiming_result()>;
//
// Users of a region_group can pass an instance of the class region_group_reclaimer and
// override its methods start_reclaiming() and stop_reclaiming(). Those methods will be
// called when the LSA sees relevant changes in the memory pressure conditions for this
// region_group. By overriding those methods - which are no-ops by default - callers can
// take action to aid the LSA in alleviating pressure.
class region_group_reclaimer {
protected:
size_t _threshold;
bool _under_pressure = false;
virtual void start_reclaiming() {}
virtual void stop_reclaiming() {}
public:
bool under_pressure() const {
return _under_pressure;
}
void notify_pressure() {
if (!_under_pressure) {
_under_pressure = true;
start_reclaiming();
}
}
void notify_relief() {
if (_under_pressure) {
_under_pressure = false;
stop_reclaiming();
}
}
region_group_reclaimer(size_t threshold = std::numeric_limits<size_t>::max()) : _threshold(threshold) {}
virtual ~region_group_reclaimer() {}
size_t throttle_threshold() const {
return _threshold;
}
};
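//
// A minimal sketch of a custom reclaimer (the class name and hook bodies are
// hypothetical; only the overridden virtuals are part of the API):
//
//     class demo_reclaimer : public region_group_reclaimer {
//         virtual void start_reclaiming() override { /* e.g. kick off a flush */ }
//         virtual void stop_reclaiming() override { /* e.g. stop flushing */ }
//     public:
//         demo_reclaimer(size_t threshold) : region_group_reclaimer(threshold) {}
//     };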
// Groups regions for the purpose of statistics. Can be nested.
class region_group {
static region_group_reclaimer no_reclaimer;
struct region_evictable_occupancy_ascending_less_comparator {
bool operator()(region_impl* r1, region_impl* r2) const;
};
// We want to sort the subgroups so that we can easily find the one that holds the biggest
// region for freeing purposes. Please note that this is not the biggest of the region groups,
// since a big region group can have a big collection of very small regions, and freeing them
// won't achieve anything. An example of such a scenario is a ScyllaDB region with a lot of very
// small memtables that add up, versus one with a very big memtable. The small memtables are
// likely still growing, and freeing the big memtable will guarantee that the most memory is
// freed up, while maximizing disk throughput.
//
// As asynchronous reclaim will likely involve disk operations, and those tend to be
// more efficient when done in bulk, this behavior is not specific to ScyllaDB memtables.
//
// The maximal score is recursively defined as:
//
// max(our_biggest_region, our_subtree_biggest_region)
struct subgroup_maximal_region_ascending_less_comparator {
bool operator()(region_group* rg1, region_group* rg2) const {
return rg1->maximal_score() < rg2->maximal_score();
}
};
friend struct subgroup_maximal_region_ascending_less_comparator;
using region_heap = boost::heap::binomial_heap<region_impl*,
boost::heap::compare<region_evictable_occupancy_ascending_less_comparator>,
boost::heap::allocator<std::allocator<region_impl*>>,
//constant_time_size<true> causes corruption with boost < 1.60
boost::heap::constant_time_size<false>>;
using subgroup_heap = boost::heap::binomial_heap<region_group*,
boost::heap::compare<subgroup_maximal_region_ascending_less_comparator>,
boost::heap::allocator<std::allocator<region_group*>>,
//constant_time_size<true> causes corruption with boost < 1.60
boost::heap::constant_time_size<false>>;
region_group* _parent = nullptr;
size_t _total_memory = 0;
region_group_reclaimer& _reclaimer;
subgroup_heap _subgroups;
subgroup_heap::handle_type _subgroup_heap_handle;
region_heap _regions;
region_group* _maximal_rg = nullptr;
// We need to store the score separately; otherwise we'd need an extra pass
// before updating the region occupancy.
size_t _maximal_score = 0;
struct allocating_function {
virtual ~allocating_function() = default;
virtual void allocate() = 0;
};
template <typename Func>
struct concrete_allocating_function : public allocating_function {
using futurator = futurize<std::result_of_t<Func()>>;
typename futurator::promise_type pr;
Func func;
public:
void allocate() override {
futurator::apply(func).forward_to(std::move(pr));
}
concrete_allocating_function(Func&& func) : func(std::forward<Func>(func)) {}
typename futurator::type get_future() {
return pr.get_future();
}
};
// It is a more common idiom to just hold the promises in the circular buffer and make them
// ready. However, in the time between the promise being made ready and the function execution,
// it could be that our memory usage went up again. To protect against that, we have to recheck
// if memory is still available after the future resolves.
//
// But we can greatly simplify it if we store the function itself in the circular_buffer, and
// execute it synchronously in release_requests() when we are sure memory is available.
//
// This allows us to easily provide strong execution guarantees, keeping all of the
// re-check complication in release_requests() and the main request execution path simple.
circular_buffer<std::unique_ptr<allocating_function>> _blocked_requests;
// All requests waiting for execution are kept in _blocked_requests (explained above) in the
// region_group they were executed against. However, it could be that they are blocked not due
// to their region group but to an ancestor. To handle these cases we will keep a list of
// descendant region_groups that have requests that are waiting on us.
//
// Please note that what we keep here are not requests; they can be thought of as just messages. The
// requests themselves are kept in the region_group in which they originated. When we see that
// there are region_groups waiting on us, we broadcast these messages to the waiters and they
// will then decide whether they can now run or if they have to wait on us again (or potentially
// a different ancestor).
std::experimental::optional<shared_promise<>> _descendant_blocked_requests = {};
region_group* _waiting_on_ancestor = nullptr;
seastar::gate _asynchronous_gate;
bool _shutdown_requested = false;
public:
// When creating a region_group, one can specify an optional throttle_threshold parameter. This
// parameter won't affect normal allocations, but an API is provided, through the region_group's
// method run_when_memory_available(), to make sure that a given function is only executed when
// the total memory for the region group (and all of its parents) is lower than or
// equal to the region_group's throttle_threshold (and respectively for its parents).
region_group(region_group_reclaimer& reclaimer = no_reclaimer) : region_group(nullptr, reclaimer) {}
region_group(region_group* parent, region_group_reclaimer& reclaimer = no_reclaimer) : _parent(parent), _reclaimer(reclaimer) {
if (_parent) {
_parent->add(this);
}
}
region_group(region_group&& o) = delete;
region_group(const region_group&) = delete;
~region_group() {
// If we set a throttle threshold, we'd be postponing many operations. So shutdown must be
// called.
if (_reclaimer.throttle_threshold() != std::numeric_limits<size_t>::max()) {
assert(_shutdown_requested);
}
if (_parent) {
_parent->del(this);
}
}
region_group& operator=(const region_group&) = delete;
region_group& operator=(region_group&&) = delete;
size_t memory_used() const {
return _total_memory;
}
void update(ssize_t delta) {
do_for_each_parent(this, [delta] (auto rg) mutable {
rg->update_maximal_rg();
rg->_total_memory += delta;
// It is okay to call release_requests for a region_group that can't allow execution.
// But that can generate spurious messages to groups waiting on us, which would be
// woken up just to go back to waiting. So let's filter that out.
if (rg->execution_permitted()) {
rg->release_requests();
}
return stop_iteration::no;
});
}
// It would be easier to call the heap's update() method, but it is unfortunately
// broken in boost versions up to at least 1.59.
//
// One possibility would be to just test the delta's signedness, but we adopt an
// explicit call for two reasons:
//
// 1) it saves us a branch
// 2) some callers would like to pass delta = 0, for instance when we are making a region
// evictable / non-evictable. Because the evictable occupancy changes, we would like to
// run the full update cycle even then.
void increase_usage(region_heap::handle_type& r_handle, ssize_t delta) {
_regions.increase(r_handle);
update(delta);
}
void decrease_usage(region_heap::handle_type& r_handle, ssize_t delta) {
_regions.decrease(r_handle);
update(delta);
}
//
// Make sure that the function specified by the parameter func only runs when this
// region_group, as well as each of its ancestors, has a memory_used() that is less
// than or equal to the throttle_threshold specified in the region_group's constructor.
//
// region_groups that did not specify a throttle_threshold will always allow for execution.
//
// If the current memory_used() is over the threshold, a non-ready future is returned;
// it will be made ready at some point in the future, once memory usage in the offending
// region_group (either this one or an ancestor) falls below the threshold.
//
// Requests that are not allowed for execution are queued and released in FIFO order within the
// same region_group, but no guarantees are made regarding release ordering across different
// region_groups.
template <typename Func>
futurize_t<std::result_of_t<Func()>> run_when_memory_available(Func&& func) {
// We disallow future-returning functions here, because otherwise memory may be available
// when we start executing it, but no longer available in the middle of the execution.
static_assert(!is_future<std::result_of_t<Func()>>::value, "future-returning functions are not permitted.");
using futurator = futurize<std::result_of_t<Func()>>;
auto blocked_at = do_for_each_parent(this, [] (auto rg) {
return (rg->_blocked_requests.empty() && rg->execution_permitted()) ? stop_iteration::no : stop_iteration::yes;
});
if (!blocked_at) {
return futurator::apply(func);
}
subscribe_for_ancestor_available_memory_notification(blocked_at);
auto fn = std::make_unique<concrete_allocating_function<Func>>(std::forward<Func>(func));
auto fut = fn->get_future();
_blocked_requests.push_back(std::move(fn));
// This is called here, and not at update(), for two reasons. The first is that things
// done during the free() path should be done carefully, since they can trigger another
// update call and put us in a loop; we would also like to keep them from throwing
// exceptions. We solve that for release_requests by using later(), but here we can do
// away with that need altogether.
//
// Second, and most important, until we actually block a request the pressure condition
// may very well be transient: there are opportunities for compaction, the condition can
// go away on its own, etc.
//
// The reason we check execution_permitted() is that we'll still block requests if we
// have free memory but existing requests in the queue - that is how we keep our FIFO
// ordering guarantee. So we need to distinguish the case in which we're blocking merely
// to serialize requests, so that we don't signal pressure and evict more than we should.
if (!blocked_at->execution_permitted()) {
blocked_at->_reclaimer.notify_pressure();
}
return fut;
}
// Returns a pointer to the largest region (in terms of memory usage) that sits below this
// region group. This includes the regions owned by this region group as well as all of its
// children.
region* get_largest_region();
// Shutdown is mandatory for every user who has set a threshold.
future<> shutdown() {
_shutdown_requested = true;
return _asynchronous_gate.close();
}
private:
// Make sure we get a notification and can call release_requests when one of our ancestors that
// used to block us is no longer under memory pressure.
void subscribe_for_ancestor_available_memory_notification(region_group *ancestor) {
if ((this == ancestor) || (_waiting_on_ancestor)) {
return; // already subscribed, or no need to
}
_waiting_on_ancestor = ancestor;
with_gate(_asynchronous_gate, [this] {
// We reevaluate _waiting_on_ancestor here so we make sure there is no deferring point
// between determining the ancestor and registering with it for a notification. We start
// with _waiting_on_ancestor set to the initial value, and after we are notified, we
// will set _waiting_on_ancestor to nullptr to force this lambda to reevaluate it.
auto evaluate_ancestor_and_stop = [this] {
if (!_waiting_on_ancestor) {
auto new_blocking_point = do_for_each_parent(this, [] (auto rg) {
return (rg->execution_permitted()) ? stop_iteration::no : stop_iteration::yes;
});
if (!new_blocking_point) {
release_requests();
}
_waiting_on_ancestor = (new_blocking_point == this) ? nullptr : new_blocking_point;
}
return _waiting_on_ancestor == nullptr;
};
return do_until(evaluate_ancestor_and_stop, [this] {
if (!_waiting_on_ancestor->_descendant_blocked_requests) {
_waiting_on_ancestor->_descendant_blocked_requests = shared_promise<>();
}
return _waiting_on_ancestor->_descendant_blocked_requests->get_shared_future().then([this] {
_waiting_on_ancestor = nullptr;
});
});
});
}
// Executes the function func for each region_group upwards in the hierarchy, starting with the
// parameter node. The function func may return stop_iteration::no, in which case it proceeds to
// the next ancestor in the hierarchy, or stop_iteration::yes, in which case it stops at this
// level.
//
// This method returns a pointer to the region_group that was processed last, or nullptr if the
// root was reached.
template <typename Func>
static region_group* do_for_each_parent(region_group *node, Func&& func) {
auto rg = node;
while (rg) {
if (func(rg) == stop_iteration::yes) {
return rg;
}
rg = rg->_parent;
}
return nullptr;
}
inline bool execution_permitted() const {
return _total_memory <= _reclaimer.throttle_threshold();
}
void release_requests() noexcept;
uint64_t top_region_evictable_space() const;
uint64_t maximal_score() const {
return _maximal_score;
}
void update_maximal_rg() {
auto my_score = top_region_evictable_space();
auto children_score = _subgroups.empty() ? 0 : _subgroups.top()->maximal_score();
auto old_maximal_score = _maximal_score;
if (children_score > my_score) {
_maximal_rg = _subgroups.top()->_maximal_rg;
} else {
_maximal_rg = this;
}
_maximal_score = _maximal_rg->top_region_evictable_space();
if (_parent) {
// binomial heap update boost bug.
if (_maximal_score > old_maximal_score) {
_parent->_subgroups.increase(_subgroup_heap_handle);
} else if (_maximal_score < old_maximal_score) {
_parent->_subgroups.decrease(_subgroup_heap_handle);
}
}
}
void add(region_group* child);
void del(region_group* child);
void add(region_impl* child);
void del(region_impl* child);
friend class region_impl;
};
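//
// Usage sketch for region_group::run_when_memory_available() (the names and the
// 100 MiB budget are hypothetical): the lambda runs immediately if this group and
// all of its ancestors are at or below their thresholds, and is queued in FIFO
// order otherwise.
//
//     region_group_reclaimer demo_reclaimer(100 << 20);
//     region_group demo_group(demo_reclaimer);
//
//     future<> f = demo_group.run_when_memory_available([] {
//         // synchronous, non-future-returning work that allocates in the group
//     });
//
// Because a threshold was set, demo_group.shutdown() must be waited on before
// the group is destroyed.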
// Controller for all LSA regions. There's one per shard.
class tracker {
public:
class impl;
private:
std::unique_ptr<impl> _impl;
memory::reclaimer _reclaimer;
friend class region;
friend class region_impl;
memory::reclaiming_result reclaim();
public:
tracker();
~tracker();
//
// Tries to reclaim the given number of bytes in total using all compactible
// and evictable regions. Returns the number of bytes actually reclaimed.
// That value may be smaller than requested when evictable pools are empty
// and compactible pools can't compact any more.
//
// Invalidates references to objects in all compactible and evictable regions.
//
size_t reclaim(size_t bytes);
// Compacts one segment at a time from sparsest segment to least sparse until work_waiting_on_reactor returns true
// or there are no more segments to compact.
reactor::idle_cpu_handler_result compact_on_idle(reactor::work_waiting_on_reactor);
// Compacts as much as possible. Very expensive, mainly for testing.
// Invalidates references to objects in all compactible and evictable regions.
void full_compaction();
void reclaim_all_free_segments();
// Returns aggregate statistics for all pools.
occupancy_stats region_occupancy();
// Returns statistics for all segments allocated by LSA on this shard.
occupancy_stats occupancy();
impl& get_impl() { return *_impl; }
// Sets the minimum number of segments reclaimed during a single reclamation cycle.
void set_reclamation_step(size_t step_in_segments);
// Returns the minimum number of segments reclaimed during a single reclamation cycle.
size_t reclamation_step() const;
};
tracker& shard_tracker();
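//
// Usage sketch: ask this shard's tracker to reclaim two segments' worth of
// memory. The return value is the number of bytes actually reclaimed, which
// may be smaller than requested.
//
//     size_t reclaimed = logalloc::shard_tracker().reclaim(2 * logalloc::segment_size);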
// Monoid representing pool occupancy statistics.
// Naturally ordered so that sparser pools come first.
// All sizes in bytes.
class occupancy_stats {
size_t _free_space;
size_t _total_space;
public:
occupancy_stats() : _free_space(0), _total_space(0) {}
occupancy_stats(size_t free_space, size_t total_space)
: _free_space(free_space), _total_space(total_space) { }
bool operator<(const occupancy_stats& other) const {
return used_fraction() < other.used_fraction();
}
friend occupancy_stats operator+(const occupancy_stats& s1, const occupancy_stats& s2) {
occupancy_stats result(s1);
result += s2;
return result;
}
friend occupancy_stats operator-(const occupancy_stats& s1, const occupancy_stats& s2) {
occupancy_stats result(s1);
result -= s2;
return result;
}
occupancy_stats& operator+=(const occupancy_stats& other) {
_total_space += other._total_space;
_free_space += other._free_space;
return *this;
}
occupancy_stats& operator-=(const occupancy_stats& other) {
_total_space -= other._total_space;
_free_space -= other._free_space;
return *this;
}
size_t used_space() const {
return _total_space - _free_space;
}
size_t free_space() const {
return _free_space;
}
size_t total_space() const {
return _total_space;
}
float used_fraction() const {
return _total_space ? float(used_space()) / total_space() : 0;
}
friend std::ostream& operator<<(std::ostream&, const occupancy_stats&);
};
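//
// A small worked example of the monoid and its ordering (free space, total space):
//
//     occupancy_stats a(512, 1024);   // 50% used, i.e. sparser
//     occupancy_stats b(256, 1024);   // 75% used
//     assert(a < b);                  // sparser pools order first
//     occupancy_stats sum = a + b;    // free: 768, total: 2048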
//
// Log-structured allocator region.
//
// Objects allocated using this region are said to be owned by this region.
// Objects must be freed only using the region which owns them. Ownership can
// be transferred across regions using the merge() method. Region must be live
// as long as it owns any objects.
//
// Each region has separate memory accounting and can be compacted
// independently from other regions. To reclaim memory from all regions use
// shard_tracker().
//
// Region is automatically added to the set of
// compactible regions when constructed.
//
class region {
public:
using impl = region_impl;
private:
shared_ptr<impl> _impl;
public:
region();
explicit region(region_group& group);
~region();
region(region&& other);
region& operator=(region&& other);
region(const region& other) = delete;
occupancy_stats occupancy() const;
allocation_strategy& allocator();
// Merges another region into this region. The other region is left empty.
// Doesn't invalidate references to allocated objects.
void merge(region& other);
// Compacts everything. Mainly for testing.
// Invalidates references to allocated objects.
void full_compaction();
// Changes the reclaimability state of this region. When a region is not
// reclaimable, it won't be considered by tracker::reclaim(). By default, a region
// is reclaimable after construction.
void set_reclaiming_enabled(bool);
// Returns the reclaimability state of this region.
bool reclaiming_enabled() const;
// Returns a value which is increased when this region is either compacted or
// evicted from, which invalidates references into the region.
// When the value returned by this method doesn't change, references remain valid.
uint64_t reclaim_counter() const;
// Makes this region an evictable region. Supplied function will be called
// when data from this region needs to be evicted in order to reclaim space.
// The function should free some space from this region.
void make_evictable(eviction_fn);
friend class region_group;
friend class allocating_section;
};
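//
// Usage sketch, assuming allocation_strategy exposes the construct()/destroy()
// helpers from allocation_strategy.hh: objects are allocated and freed through
// the owning region's allocator.
//
//     logalloc::region r;
//     auto* obj = r.allocator().construct<int>(42);
//     ...
//     r.allocator().destroy(obj);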
// Forces references into the region to remain valid as long as this guard is
// live by disabling compaction and eviction.
// Can be nested.
struct reclaim_lock {
region& _region;
bool _prev;
reclaim_lock(region& r)
: _region(r)
, _prev(r.reclaiming_enabled())
{
_region.set_reclaiming_enabled(false);
}
~reclaim_lock() {
_region.set_reclaiming_enabled(_prev);
}
};
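//
// Usage sketch: references into the region stay valid for the scope of the lock.
//
//     logalloc::region r;
//     ...
//     {
//         logalloc::reclaim_lock lock(r);
//         // safe to hold raw references into r here; locks may nest
//     }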
// Utility for running critical sections which need to lock some region and
// also allocate LSA memory. The object learns from failures how much it
// should reserve up front in order not to cause allocation failures.
class allocating_section {
size_t _lsa_reserve = 10; // in segments
size_t _std_reserve = 1024; // in bytes
private:
struct guard {
size_t _prev;
guard();
~guard();
void enter(allocating_section&);
};
void on_alloc_failure();
public:
//
// Invokes func with a reclaim_lock held on region r. If an LSA allocation fails
// inside func, func is retried after increasing the LSA segment reserve. The
// memory reserves are increased with the region lock off, allowing memory
// reclamation to take place in the region.
//
// Throws std::bad_alloc when reserves can't be increased to a sufficient level.
//
template<typename Func>
decltype(auto) operator()(logalloc::region& r, Func&& func) {
auto prev_lsa_reserve = _lsa_reserve;
auto prev_std_reserve = _std_reserve;
try {
while (true) {
assert(r.reclaiming_enabled());
guard g;
g.enter(*this);
try {
logalloc::reclaim_lock _(r);
return func();
} catch (const std::bad_alloc&) {
on_alloc_failure();
}
}
} catch (const std::bad_alloc&) {
// roll-back limits to protect against pathological requests
// preventing future requests from succeeding.
_lsa_reserve = prev_lsa_reserve;
_std_reserve = prev_std_reserve;
throw;
}
}
};
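//
// Usage sketch (the lambda body is hypothetical): run an allocating critical
// section against region r, retrying with larger reserves on std::bad_alloc.
//
//     logalloc::region r;
//     logalloc::allocating_section as;
//     as(r, [&] {
//         // allocate LSA objects in r; references remain valid inside func
//     });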
}