Memtables are a replica-side entity, and so are moved to the replica module and namespace. Memtables are also used outside the replica, in two places: - in some virtual tables; this is also in some way inside the replica (virtual readers are installed at the replica level, not the coordinator), so I don't consider it a layering violation - in many sstable unit tests, as a convenient way to create sstables with known input. This is a layering violation. We could make memtables their own module, but I think this is wrong. Memtables are deeply tied into replica memory management, and trying to make them a low-level primitive (at a lower level than sstables) will be difficult, not least because memtables use sstables. Instead, we should have a memtable-like thing that doesn't support merging and doesn't have all the other funky memtable machinery, and replace the uses of memtables in sstable tests with some kind of make_flat_mutation_reader_from_unsorted_mutations() that does the sorting that is the reason memtables are used in tests (and live with the layering violation meanwhile). Test: unit (dev) Closes #10120
228 lines · 9.0 KiB · C++
/*
 * Copyright (C) 2017-present ScyllaDB
 */

/*
 * SPDX-License-Identifier: AGPL-3.0-or-later
 */
#pragma once
|
|
|
|
#include <boost/intrusive/parent_from_member.hpp>
|
|
#include <seastar/core/condition-variable.hh>
|
|
#include <seastar/core/future.hh>
|
|
#include <seastar/core/metrics_registration.hh>
|
|
#include <seastar/core/semaphore.hh>
|
|
#include "replica/database_fwd.hh"
|
|
#include "utils/logalloc.hh"
|
|
|
|
class dirty_memory_manager;
|
|
|
|
class sstable_write_permit final {
|
|
friend class dirty_memory_manager;
|
|
std::optional<semaphore_units<>> _permit;
|
|
|
|
sstable_write_permit() noexcept = default;
|
|
explicit sstable_write_permit(semaphore_units<>&& units) noexcept
|
|
: _permit(std::move(units)) {
|
|
}
|
|
|
|
public:
|
|
sstable_write_permit(sstable_write_permit&&) noexcept = default;
|
|
sstable_write_permit& operator=(sstable_write_permit&&) noexcept = default;
|
|
|
|
static sstable_write_permit unconditional() {
|
|
return sstable_write_permit();
|
|
}
|
|
};
|
|
|
|
// Move-only token representing permission to flush one memtable. It bundles
// two resources granted by a dirty_memory_manager: an sstable_write_permit
// (serializing the data-write phase of flushes) and a background-work
// semaphore unit (capping the number of flushes in their post-write phase).
// Only dirty_memory_manager can construct one; see
// dirty_memory_manager::get_flush_permit().
class flush_permit {
    friend class dirty_memory_manager;
    // The manager that issued this permit; used to reacquire the write permit.
    dirty_memory_manager* _manager;
    // Serializes the sstable data write; may be released early via
    // release_sstable_write_permit() so the next flush can start writing.
    sstable_write_permit _sstable_write_permit;
    // One unit of the manager's background-work semaphore, held for the
    // permit's whole lifetime to bound outstanding post-flush work.
    semaphore_units<> _background_permit;

    flush_permit(dirty_memory_manager* manager, sstable_write_permit&& sstable_write_permit, semaphore_units<>&& background_permit)
        : _manager(manager)
        , _sstable_write_permit(std::move(sstable_write_permit))
        , _background_permit(std::move(background_permit)) {
    }
public:
    // Movable but not copyable: a permit is a unique token.
    flush_permit(flush_permit&&) noexcept = default;
    flush_permit& operator=(flush_permit&&) noexcept = default;

    // Transfers the sstable write permit out of this flush permit, leaving
    // this permit holding only the background-work unit. Called once the data
    // write is done, so the next flush's write phase can begin.
    sstable_write_permit release_sstable_write_permit() {
        return std::move(_sstable_write_permit);
    }

    // Obtains a fresh sstable write permit from the issuing manager, consuming
    // this permit (rvalue-qualified). Defined out of line.
    future<flush_permit> reacquire_sstable_write_permit() &&;
};
|
|
|
|
// Tracks and throttles "dirty" memory held by memtables on this shard.
// It maintains two LSA region groups — real dirty (bytes actually resident)
// and virtual dirty (real dirty minus bytes already written to disk) — and,
// acting as the reclaimer for the virtual group, triggers memtable flushes
// when memory pressure builds. It also serializes flushes and caps the amount
// of post-flush background work via two semaphores.
class dirty_memory_manager: public logalloc::region_group_reclaimer {
    // Reclaimer for the real-dirty region group, configured with the full
    // (non-halved) threshold; `*this` serves as reclaimer for the virtual group.
    logalloc::region_group_reclaimer _real_dirty_reclaimer;
    // We need a separate boolean, because from the LSA point of view, pressure may still be
    // mounting, in which case the pressure flag could be set back on if we force it off.
    bool _db_shutdown_requested = false;

    // Owning database, or nullptr for the default (standalone) manager.
    replica::database* _db;
    // The _real_region_group protects against actual dirty memory usage hitting the maximum. Usage
    // for this group is the real dirty memory usage of the system.
    logalloc::region_group _real_region_group;
    // The _virtual_region_group accounts for virtual memory usage. It is defined as the real dirty
    // memory usage minus bytes that were already written to disk.
    logalloc::region_group _virtual_region_group;

    // We would like to serialize the flushing of memtables. While flushing many memtables
    // simultaneously can sustain high levels of throughput, the memory is not freed until the
    // memtable is totally gone. That means that if we have throttled requests, they will stay
    // throttled for a long time. Even when we have virtual dirty, that only provides a rough
    // estimate, and we can't release requests that early.
    semaphore _flush_serializer;
    // We will accept a new flush before another one ends, once it is done with the data write.
    // That is so we can keep the disk always busy. But there is still some background work that is
    // left to be done. Mostly, update the caches and seal the auxiliary components of the SSTable.
    // This semaphore will cap the amount of background work that we have. Note that we're not
    // overly concerned about memtable memory, because dirty memory will put a limit to that. This
    // is mostly about dangling continuations. So that doesn't have to be a small number.
    static constexpr unsigned _max_background_work = 20;
    semaphore _background_work_flush_serializer = { _max_background_work };
    condition_variable _should_flush;
    // Bytes moved from virtual to real accounting (i.e. already flushed but
    // not yet fully released); see account_potentially_cleaned_up_memory().
    int64_t _dirty_bytes_released_pre_accounted = 0;

    // Background fiber that flushes memtables whenever pressure demands it.
    // Defined out of line; its future is kept in _waiting_flush.
    future<> flush_when_needed();

    // Future of the flush_when_needed() fiber, awaited on shutdown.
    future<> _waiting_flush;
    // region_group_reclaimer hook: invoked by LSA when the virtual group needs
    // memory reclaimed. Defined out of line.
    virtual void start_reclaiming() noexcept override;

    // True while the soft limit is exceeded, i.e. flushing should be under way.
    bool has_pressure() const {
        return over_soft_limit();
    }

    // Count of explicitly requested ("extraneous") flushes currently active,
    // beyond those started due to memory pressure.
    unsigned _extraneous_flushes = 0;

    seastar::metrics::metric_groups _metrics;
public:
    // Registers this manager's metrics under the given name. Defined out of
    // line; presumably named for the legacy collectd metric backend.
    void setup_collectd(sstring namestr);

    // Stops the background flush fiber and releases resources. Defined out of
    // line.
    future<> shutdown();

    // Limits and pressure conditions:
    // ===============================
    //
    // Virtual Dirty
    // -------------
    // We can't free memory until the whole memtable is flushed because we need to keep it in memory
    // until the end, but we can fake freeing memory. When we are done with an element of the
    // memtable, we will update the region group pretending memory just went down by that amount.
    //
    // Because the amount of memory that we pretend to free should be close enough to the actual
    // memory used by the memtables, that effectively creates two sub-regions inside the dirty
    // region group, of equal size. In the worst case, we will have <memtable_total_space> dirty
    // bytes used, and half of that already virtually freed.
    //
    // Hard Limit
    // ----------
    // The total space that can be used by memtables in each group is defined by the threshold, but
    // we will only allow the region_group to grow to half of that. This is because of virtual_dirty
    // as explained above. Because virtual dirty is implemented by reducing the usage in the
    // region_group directly on partition written, we want to throttle every time half of the memory
    // as seen by the region_group. To achieve that we need to set the hard limit (first parameter
    // of the region_group_reclaimer) to 1/2 of the user-supplied threshold
    //
    // Soft Limit
    // ----------
    // When the soft limit is hit, no throttle happens. The soft limit exists because we don't want
    // to start flushing only when the limit is hit, but a bit earlier instead. If we were to start
    // flushing only when the hard limit is hit, workloads in which the disk is fast enough to cope
    // would see latency added to some requests unnecessarily.
    //
    // We then set the soft limit to 80 % of the virtual dirty hard limit, which is equal to 40 % of
    // the user-supplied threshold.
    dirty_memory_manager(replica::database& db, size_t threshold, double soft_limit, scheduling_group deferred_work_sg)
        : logalloc::region_group_reclaimer(threshold / 2, threshold * soft_limit / 2)
        , _real_dirty_reclaimer(threshold)
        , _db(&db)
        , _real_region_group("memtable", _real_dirty_reclaimer, deferred_work_sg)
        , _virtual_region_group("memtable (virtual)", &_real_region_group, *this, deferred_work_sg)
        , _flush_serializer(1)
        , _waiting_flush(flush_when_needed()) {}

    // Default manager with no database and no limits enforced (default
    // reclaimer); the flush fiber is not started.
    dirty_memory_manager() : logalloc::region_group_reclaimer()
        , _db(nullptr)
        , _real_region_group("memtable", _real_dirty_reclaimer)
        , _virtual_region_group("memtable (virtual)", &_real_region_group, *this)
        , _flush_serializer(1)
        , _waiting_flush(make_ready_future<>()) {}

    // Recovers the owning manager from a pointer to its virtual region group
    // (the member handed out by region_group()).
    static dirty_memory_manager& from_region_group(logalloc::region_group *rg) {
        return *(boost::intrusive::get_parent_from_member(rg, &dirty_memory_manager::_virtual_region_group));
    }

    // The region group memtables should allocate against: the virtual-dirty one.
    logalloc::region_group& region_group() {
        return _virtual_region_group;
    }

    const logalloc::region_group& region_group() const {
        return _virtual_region_group;
    }

    // Undoes a prior account_potentially_cleaned_up_memory() for `delta`
    // bytes: moves the bytes back from real to virtual accounting.
    // `from` is currently unused here.
    void revert_potentially_cleaned_up_memory(logalloc::region* from, int64_t delta) {
        _real_region_group.update(-delta);
        _virtual_region_group.update(delta);
        _dirty_bytes_released_pre_accounted -= delta;
    }

    // Records that `delta` bytes of memtable data were written to disk: they
    // remain real dirty (still resident) but are no longer virtual dirty, so
    // throttled writers can be released earlier. `from` is currently unused here.
    void account_potentially_cleaned_up_memory(logalloc::region* from, int64_t delta) {
        _real_region_group.update(delta);
        _virtual_region_group.update(-delta);
        _dirty_bytes_released_pre_accounted += delta;
    }

    // Adds `delta` bytes to the real-dirty accounting only (no virtual change).
    void pin_real_dirty_memory(int64_t delta) {
        _real_region_group.update(delta);
    }

    // Reverts a prior pin_real_dirty_memory(delta).
    void unpin_real_dirty_memory(int64_t delta) {
        _real_region_group.update(-delta);
    }

    // Bytes of memtable memory actually resident on this shard.
    size_t real_dirty_memory() const {
        return _real_region_group.memory_used();
    }

    // Real dirty memory minus bytes already written to disk.
    size_t virtual_dirty_memory() const {
        return _virtual_region_group.memory_used();
    }

    // Flushes one memtable from `cf`, consuming `permit`. Defined out of line.
    future<> flush_one(replica::memtable_list& cf, flush_permit&& permit);

    // Acquires a flush permit: first a background-work unit, then the flush
    // serializer unit (wrapped as the sstable write permit).
    future<flush_permit> get_flush_permit() {
        return get_units(_background_work_flush_serializer, 1).then([this] (auto&& units) {
            return this->get_flush_permit(std::move(units));
        });
    }

    // True while at least one explicitly requested flush is in progress.
    bool has_extraneous_flushes_requested() const {
        return _extraneous_flushes > 0;
    }

    void start_extraneous_flush() {
        ++_extraneous_flushes;
    }

    void finish_extraneous_flush() {
        --_extraneous_flushes;
    }
private:
    // Second stage of get_flush_permit(): waits for the flush serializer and
    // bundles both resources into a flush_permit.
    future<flush_permit> get_flush_permit(semaphore_units<>&& background_permit) {
        return get_units(_flush_serializer, 1).then([this, background_permit = std::move(background_permit)] (auto&& units) mutable {
            return flush_permit(this, sstable_write_permit(std::move(units)), std::move(background_permit));
        });
    }

    friend class flush_permit;
};
|
|
|
|
extern thread_local dirty_memory_manager default_dirty_memory_manager;
|
|
|