The presence checker is constructed and destroyed in the standard allocator context, but the presence check was invoked in the LSA context. If the presence checker allocates and caches some managed objects, there will be an alloc-dealloc mismatch. That is the case with LeveledCompactionStrategy, which uses incremental_selector. Fix by invoking the presence check in the standard allocator context. Fixes #4063. Message-Id: <1547547700-16599-1-git-send-email-tgrabiec@scylladb.com>
1290 lines
53 KiB
C++
1290 lines
53 KiB
C++
/*
|
|
* Copyright (C) 2015 ScyllaDB
|
|
*/
|
|
|
|
/*
|
|
* This file is part of Scylla.
|
|
*
|
|
* Scylla is free software: you can redistribute it and/or modify
|
|
* it under the terms of the GNU Affero General Public License as published by
|
|
* the Free Software Foundation, either version 3 of the License, or
|
|
* (at your option) any later version.
|
|
*
|
|
* Scylla is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
* GNU General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU General Public License
|
|
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
|
|
*/
|
|
|
|
#include "row_cache.hh"
|
|
#include <seastar/core/memory.hh>
|
|
#include <seastar/core/do_with.hh>
|
|
#include <seastar/core/future-util.hh>
|
|
#include <seastar/core/metrics.hh>
|
|
#include <seastar/util/defer.hh>
|
|
#include "memtable.hh"
|
|
#include "partition_snapshot_reader.hh"
|
|
#include <chrono>
|
|
#include <boost/version.hpp>
|
|
#include <sys/sdt.h>
|
|
#include "read_context.hh"
|
|
#include "schema_upgrader.hh"
|
|
#include "dirty_memory_manager.hh"
|
|
#include "cache_flat_mutation_reader.hh"
|
|
#include "real_dirty_memory_accounter.hh"
|
|
|
|
namespace cache {
|
|
|
|
logging::logger clogger("cache");
|
|
|
|
}
|
|
|
|
using namespace std::chrono_literals;
|
|
using namespace cache;
|
|
|
|
// Creates a reader over `src` for the partition range `pr`.
//
// The context is notified through on_underlying_created() before the reader
// is built. Slice, priority class and trace state are taken from `ctx`, and
// the reader is always requested with streamed_mutation::forwarding::yes.
flat_mutation_reader
row_cache::create_underlying_reader(read_context& ctx, mutation_source& src, const dht::partition_range& pr) {
    ctx.on_underlying_created();
    return src.make_reader(_schema,
                           pr,
                           ctx.slice(),
                           ctx.pc(),
                           ctx.trace_state(),
                           streamed_mutation::forwarding::yes);
}
|
|
|
|
// Registers metrics and installs the eviction callback on the tracker's region.
//
// Each invocation of the callback frees at most one unit of work, trying in
// order: pending garbage, pending memtable cleaning, the entry at the back of
// the LRU. It reports reclaimed_nothing only when all three are empty.
cache_tracker::cache_tracker()
    : _garbage(_region, this)
    , _memtable_cleaner(_region, nullptr)
{
    setup_metrics();

    _region.make_evictable([this] {
        return with_allocator(_region.allocator(), [this] {
            // Removing a partition may require reading large keys when we rebalance
            // the rbtree, so linearize anything we read
            return with_linearized_managed_bytes([&] {
                try {
                    if (!_garbage.empty()) {
                        _garbage.clear_some();
                    } else if (!_memtable_cleaner.empty()) {
                        _memtable_cleaner.clear_some();
                    } else if (_lru.empty()) {
                        return memory::reclaiming_result::reclaimed_nothing;
                    } else {
                        _lru.back().on_evicted(*this);
                    }
                    return memory::reclaiming_result::reclaimed_something;
                } catch (std::bad_alloc&) {
                    // Bad luck, linearization during partition removal caused us to
                    // fail. Drop the entire cache so we can make forward progress.
                    clear();
                    return memory::reclaiming_result::reclaimed_something;
                }
            });
        });
    });
}
|
|
|
|
// Drops everything still tracked (see clear()) before the tracker goes away.
cache_tracker::~cache_tracker()
{
    clear();
}
|
|
|
|
// Forwards the scheduling group `sg` to both background cleaners owned by the
// tracker: the memtable cleaner first, then the garbage cleaner.
void cache_tracker::set_compaction_scheduling_group(seastar::scheduling_group sg)
{
    _memtable_cleaner.set_scheduling_group(sg);
    _garbage.set_scheduling_group(sg);
}
|
|
|
|
void
|
|
cache_tracker::setup_metrics() {
|
|
namespace sm = seastar::metrics;
|
|
_metrics.add_group("cache", {
|
|
sm::make_gauge("bytes_used", sm::description("current bytes used by the cache out of the total size of memory"), [this] { return _region.occupancy().used_space(); }),
|
|
sm::make_gauge("bytes_total", sm::description("total size of memory for the cache"), [this] { return _region.occupancy().total_space(); }),
|
|
sm::make_derive("partition_hits", sm::description("number of partitions needed by reads and found in cache"), _stats.partition_hits),
|
|
sm::make_derive("partition_misses", sm::description("number of partitions needed by reads and missing in cache"), _stats.partition_misses),
|
|
sm::make_derive("partition_insertions", sm::description("total number of partitions added to cache"), _stats.partition_insertions),
|
|
sm::make_derive("row_hits", sm::description("total number of rows needed by reads and found in cache"), _stats.row_hits),
|
|
sm::make_derive("row_misses", sm::description("total number of rows needed by reads and missing in cache"), _stats.row_misses),
|
|
sm::make_derive("row_insertions", sm::description("total number of rows added to cache"), _stats.row_insertions),
|
|
sm::make_derive("row_evictions", sm::description("total number of rows evicted from cache"), _stats.row_evictions),
|
|
sm::make_derive("row_removals", sm::description("total number of invalidated rows"), _stats.row_removals),
|
|
sm::make_derive("static_row_insertions", sm::description("total number of static rows added to cache"), _stats.static_row_insertions),
|
|
sm::make_derive("concurrent_misses_same_key", sm::description("total number of operation with misses same key"), _stats.concurrent_misses_same_key),
|
|
sm::make_derive("partition_merges", sm::description("total number of partitions merged"), _stats.partition_merges),
|
|
sm::make_derive("partition_evictions", sm::description("total number of evicted partitions"), _stats.partition_evictions),
|
|
sm::make_derive("partition_removals", sm::description("total number of invalidated partitions"), _stats.partition_removals),
|
|
sm::make_derive("mispopulations", sm::description("number of entries not inserted by reads"), _stats.mispopulations),
|
|
sm::make_gauge("partitions", sm::description("total number of cached partitions"), _stats.partitions),
|
|
sm::make_gauge("rows", sm::description("total number of cached rows"), _stats.rows),
|
|
sm::make_derive("reads", sm::description("number of started reads"), _stats.reads),
|
|
sm::make_derive("reads_with_misses", sm::description("number of reads which had to read from sstables"), _stats.reads_with_misses),
|
|
sm::make_gauge("active_reads", sm::description("number of currently active reads"), [this] { return _stats.active_reads(); }),
|
|
sm::make_derive("sstable_reader_recreations", sm::description("number of times sstable reader was recreated due to memtable flush"), _stats.underlying_recreations),
|
|
sm::make_derive("sstable_partition_skips", sm::description("number of times sstable reader was fast forwarded across partitions"), _stats.underlying_partition_skips),
|
|
sm::make_derive("sstable_row_skips", sm::description("number of times sstable reader was fast forwarded within a partition"), _stats.underlying_row_skips),
|
|
sm::make_derive("pinned_dirty_memory_overload", sm::description("amount of pinned bytes that we tried to unpin over the limit. This should sit constantly at 0, and any number different than 0 is indicative of a bug"), _stats.pinned_dirty_memory_overload),
|
|
sm::make_derive("rows_processed_from_memtable", _stats.rows_processed_from_memtable,
|
|
sm::description("total number of rows in memtables which were processed during cache update on memtable flush")),
|
|
sm::make_derive("rows_dropped_from_memtable", _stats.rows_dropped_from_memtable,
|
|
sm::description("total number of rows in memtables which were dropped during cache update on memtable flush")),
|
|
sm::make_derive("rows_merged_from_memtable", _stats.rows_merged_from_memtable,
|
|
sm::description("total number of rows in memtables which were merged with existing rows during cache update on memtable flush")),
|
|
});
|
|
}
|
|
|
|
void cache_tracker::clear() {
|
|
auto partitions_before = _stats.partitions;
|
|
auto rows_before = _stats.rows;
|
|
// We need to clear garbage first because garbage versions cannot be evicted from,
|
|
// mutation_partition::clear_gently() destroys intrusive tree invariants.
|
|
with_allocator(_region.allocator(), [this] {
|
|
_garbage.clear();
|
|
_memtable_cleaner.clear();
|
|
while (!_lru.empty()) {
|
|
_lru.back().on_evicted(*this);
|
|
}
|
|
});
|
|
_stats.partition_removals += partitions_before;
|
|
_stats.row_removals += rows_before;
|
|
allocator().invalidate_references();
|
|
}
|
|
|
|
// Moves `e` to the front of the LRU, linking it if it is not linked yet.
void cache_tracker::touch(rows_entry& e) {
    // last dummy may not be linked if evicted.
    const bool already_linked = e._lru_link.is_linked();
    if (already_linked) {
        _lru.erase(_lru.iterator_to(e));
    }
    _lru.push_front(e);
}
|
|
|
|
// Starts tracking a newly created cache entry: its partition is inserted via
// the partition overload of insert(), then the partition counters are bumped.
void cache_tracker::insert(cache_entry& entry) {
    insert(entry.partition());

    ++_stats.partition_insertions;
    ++_stats.partitions;

    // partition_range_cursor depends on this to detect invalidation of _end
    _region.allocator().invalidate_references();
}
|
|
|
|
// Accounts for one partition being erased: one fewer cached partition, one
// more removal, and references into the allocator are invalidated.
void cache_tracker::on_partition_erase()
{
    --_stats.partitions;
    ++_stats.partition_removals;
    allocator().invalidate_references();
}
|
|
|
|
// Detaches `row` from the LRU by unlinking its hook.
void cache_tracker::unlink(rows_entry& row) noexcept
{
    row._lru_link.unlink();
}
|
|
|
|
// Counts one partition merge.
void cache_tracker::on_partition_merge()
{
    ++_stats.partition_merges;
}
|
|
|
|
// Counts one partition-level cache hit.
void cache_tracker::on_partition_hit()
{
    ++_stats.partition_hits;
}
|
|
|
|
// Counts one partition-level cache miss.
void cache_tracker::on_partition_miss()
{
    ++_stats.partition_misses;
}
|
|
|
|
// Accounts for one evicted partition: one fewer cached partition and one more
// eviction.
void cache_tracker::on_partition_eviction()
{
    --_stats.partitions;
    ++_stats.partition_evictions;
}
|
|
|
|
// Accounts for one evicted row: one fewer cached row and one more eviction.
void cache_tracker::on_row_eviction()
{
    --_stats.rows;
    ++_stats.row_evictions;
}
|
|
|
|
// Counts one row-level cache hit.
void cache_tracker::on_row_hit()
{
    ++_stats.row_hits;
}
|
|
|
|
// Counts one row-level cache miss.
void cache_tracker::on_row_miss()
{
    ++_stats.row_misses;
}
|
|
|
|
// Counts one mispopulation (exported as "number of entries not inserted by reads").
void cache_tracker::on_mispopulate()
{
    ++_stats.mispopulations;
}
|
|
|
|
// Counts a miss whose entry turned out to be already present when the reader
// went to create it.
void cache_tracker::on_miss_already_populated()
{
    ++_stats.concurrent_misses_same_key;
}
|
|
|
|
// Adds `bytes` to the pinned-dirty-memory overload counter.
void cache_tracker::pinned_dirty_memory_overload(uint64_t bytes)
{
    _stats.pinned_dirty_memory_overload += bytes;
}
|
|
|
|
// Returns the allocation strategy of the tracker's region.
allocation_strategy& cache_tracker::allocator()
{
    return _region.allocator();
}
|
|
|
|
// Mutable access to the region owned by the tracker.
logalloc::region& cache_tracker::region()
{
    return _region;
}
|
|
|
|
// Read-only access to the region owned by the tracker.
const logalloc::region& cache_tracker::region() const
{
    return _region;
}
|
|
|
|
// Stable cursor over partition entries from given range.
|
|
//
|
|
// Must be accessed with reclaim lock held on the cache region.
|
|
// The position of the cursor is always valid, but cache entry reference
|
|
// is not always valid. It remains valid as long as the iterators
|
|
// into _cache._partitions remain valid. Cache entry reference can be
|
|
// brought back to validity by calling refresh().
|
|
//
|
|
class partition_range_cursor final {
|
|
std::reference_wrapper<row_cache> _cache;
|
|
row_cache::partitions_type::iterator _it;
|
|
row_cache::partitions_type::iterator _end;
|
|
dht::ring_position_view _start_pos;
|
|
dht::ring_position_view _end_pos;
|
|
std::optional<dht::decorated_key> _last;
|
|
uint64_t _last_reclaim_count;
|
|
private:
|
|
void set_position(cache_entry& e) {
|
|
// FIXME: make ring_position_view convertible to ring_position, so we can use e.position()
|
|
if (e.is_dummy_entry()) {
|
|
_last = {};
|
|
_start_pos = dht::ring_position_view::max();
|
|
} else {
|
|
_last = e.key();
|
|
_start_pos = dht::ring_position_view(*_last);
|
|
}
|
|
}
|
|
public:
|
|
// Creates a cursor positioned at the lower bound of the range.
|
|
// The cache entry reference is not valid.
|
|
// The range reference must remain live as long as this instance is used.
|
|
partition_range_cursor(row_cache& cache, const dht::partition_range& range)
|
|
: _cache(cache)
|
|
, _start_pos(dht::ring_position_view::for_range_start(range))
|
|
, _end_pos(dht::ring_position_view::for_range_end(range))
|
|
, _last_reclaim_count(std::numeric_limits<uint64_t>::max())
|
|
{ }
|
|
|
|
// Returns true iff the cursor is valid
|
|
bool valid() const {
|
|
return _cache.get().get_cache_tracker().allocator().invalidate_counter() == _last_reclaim_count;
|
|
}
|
|
|
|
// Repositions the cursor to the first entry with position >= pos.
|
|
// Returns true iff the position of the cursor is equal to pos.
|
|
// Can be called on invalid cursor, in which case it brings it back to validity.
|
|
// Strong exception guarantees.
|
|
bool advance_to(dht::ring_position_view pos) {
|
|
auto cmp = cache_entry::compare(_cache.get()._schema);
|
|
if (cmp(_end_pos, pos)) { // next() may have moved _start_pos past the _end_pos.
|
|
_end_pos = pos;
|
|
}
|
|
_end = _cache.get()._partitions.lower_bound(_end_pos, cmp);
|
|
_it = _cache.get()._partitions.lower_bound(pos, cmp);
|
|
auto same = !cmp(pos, _it->position());
|
|
set_position(*_it);
|
|
_last_reclaim_count = _cache.get().get_cache_tracker().allocator().invalidate_counter();
|
|
return same;
|
|
}
|
|
|
|
// Ensures that cache entry reference is valid.
|
|
// The cursor will point at the first entry with position >= the current position.
|
|
// Returns true if and only if the position of the cursor did not change.
|
|
// Strong exception guarantees.
|
|
bool refresh() {
|
|
if (valid()) {
|
|
return true;
|
|
}
|
|
return advance_to(_start_pos);
|
|
}
|
|
|
|
// Positions the cursor at the next entry.
|
|
// May advance past the requested range. Use in_range() after the call to determine that.
|
|
// Call only when in_range() and cache entry reference is valid.
|
|
// Strong exception guarantees.
|
|
void next() {
|
|
auto next = std::next(_it);
|
|
set_position(*next);
|
|
_it = std::move(next);
|
|
}
|
|
|
|
// Valid only after refresh() and before _cache._partitions iterators are invalidated.
|
|
// Points inside the requested range if in_range().
|
|
cache_entry& entry() {
|
|
return *_it;
|
|
}
|
|
|
|
// Call only when cache entry reference is valid.
|
|
bool in_range() {
|
|
return _it != _end;
|
|
}
|
|
|
|
// Returns current position of the cursor.
|
|
// Result valid as long as this instance is valid and not advanced.
|
|
dht::ring_position_view position() const {
|
|
return _start_pos;
|
|
}
|
|
};
|
|
|
|
// Fast-forwards the underlying reader to the partition identified by _key,
// using the stored snapshot and phase, then drops the snapshot.
//
// When `skip_first_fragment` is set, one fragment is additionally read from
// the underlying reader and discarded before the returned future resolves.
future<> read_context::create_underlying(bool skip_first_fragment, db::timeout_clock::time_point timeout) {
    if (!_range_query) {
        _sm_range = dht::partition_range::make_singular({dht::ring_position(*_key)});
    } else {
        // FIXME: Singular-range mutation readers don't support fast_forward_to(), so need to use a wide range
        // here in case the same reader will need to be fast forwarded later.
        _sm_range = dht::partition_range({dht::ring_position(*_key)}, {dht::ring_position(*_key)});
    }

    auto forwarded = _underlying.fast_forward_to(std::move(_sm_range), *_underlying_snapshot, _phase, timeout);
    return forwarded.then([this, skip_first_fragment, timeout] {
        _underlying_snapshot = {};
        if (!skip_first_fragment) {
            return make_ready_future<>();
        }
        return _underlying.underlying()(timeout).then([](auto &&mf) {});
    });
}
|
|
|
|
// Builds a reader that bypasses the cache and delegates to the context's
// underlying reader. If the context's schema version differs from the
// underlying reader's, fragments are passed through schema_upgrader. The
// result is wrapped with make_nonforwardable(..., true).
static flat_mutation_reader read_directly_from_underlying(read_context& reader) {
    auto& underlying = reader.underlying().underlying();
    flat_mutation_reader delegate = make_delegating_reader(underlying);
    const bool needs_upgrade = reader.schema()->version() != underlying.schema()->version();
    if (needs_upgrade) {
        delegate = transform(std::move(delegate), schema_upgrader(reader.schema()));
    }
    return make_nonforwardable(std::move(delegate), true);
}
|
|
|
|
// Reader which populates the cache using data from the delegate.
|
|
class single_partition_populating_reader final : public flat_mutation_reader::impl {
|
|
row_cache& _cache;
|
|
lw_shared_ptr<read_context> _read_context;
|
|
flat_mutation_reader_opt _reader;
|
|
private:
|
|
future<> create_reader(db::timeout_clock::time_point timeout) {
|
|
auto src_and_phase = _cache.snapshot_of(_read_context->range().start()->value());
|
|
auto phase = src_and_phase.phase;
|
|
_read_context->enter_partition(_read_context->range().start()->value().as_decorated_key(), src_and_phase.snapshot, phase);
|
|
return _read_context->create_underlying(false, timeout).then([this, phase, timeout] {
|
|
return _read_context->underlying().underlying()(timeout).then([this, phase] (auto&& mfopt) {
|
|
if (!mfopt) {
|
|
if (phase == _cache.phase_of(_read_context->range().start()->value())) {
|
|
_cache._read_section(_cache._tracker.region(), [this] {
|
|
with_allocator(_cache._tracker.allocator(), [this] {
|
|
dht::decorated_key dk = _read_context->range().start()->value().as_decorated_key();
|
|
_cache.do_find_or_create_entry(dk, nullptr, [&] (auto i) {
|
|
mutation_partition mp(_cache._schema);
|
|
cache_entry* entry = current_allocator().construct<cache_entry>(
|
|
_cache._schema, std::move(dk), std::move(mp));
|
|
_cache._tracker.insert(*entry);
|
|
entry->set_continuous(i->continuous());
|
|
return _cache._partitions.insert_before(i, *entry);
|
|
}, [&] (auto i) {
|
|
_cache._tracker.on_miss_already_populated();
|
|
});
|
|
});
|
|
});
|
|
} else {
|
|
_cache._tracker.on_mispopulate();
|
|
}
|
|
_end_of_stream = true;
|
|
} else if (phase == _cache.phase_of(_read_context->range().start()->value())) {
|
|
_reader = _cache._read_section(_cache._tracker.region(), [&] {
|
|
cache_entry& e = _cache.find_or_create(mfopt->as_partition_start().key(), mfopt->as_partition_start().partition_tombstone(), phase);
|
|
return e.read(_cache, *_read_context, phase);
|
|
});
|
|
} else {
|
|
_cache._tracker.on_mispopulate();
|
|
_reader = read_directly_from_underlying(*_read_context);
|
|
this->push_mutation_fragment(std::move(*mfopt));
|
|
}
|
|
});
|
|
});
|
|
}
|
|
public:
|
|
single_partition_populating_reader(row_cache& cache,
|
|
lw_shared_ptr<read_context> context)
|
|
: impl(context->schema())
|
|
, _cache(cache)
|
|
, _read_context(std::move(context))
|
|
{ }
|
|
|
|
virtual future<> fill_buffer(db::timeout_clock::time_point timeout) override {
|
|
if (!_reader) {
|
|
return create_reader(timeout).then([this, timeout] {
|
|
if (_end_of_stream) {
|
|
return make_ready_future<>();
|
|
}
|
|
return fill_buffer(timeout);
|
|
});
|
|
}
|
|
return do_until([this] { return is_end_of_stream() || is_buffer_full(); }, [this, timeout] {
|
|
return fill_buffer_from(*_reader, timeout).then([this] (bool reader_finished) {
|
|
if (reader_finished) {
|
|
_end_of_stream = true;
|
|
}
|
|
});
|
|
});
|
|
}
|
|
virtual void next_partition() override {
|
|
if (_reader) {
|
|
clear_buffer();
|
|
_end_of_stream = true;
|
|
}
|
|
}
|
|
virtual future<> fast_forward_to(const dht::partition_range&, db::timeout_clock::time_point timeout) override {
|
|
clear_buffer();
|
|
_end_of_stream = true;
|
|
return make_ready_future<>();
|
|
}
|
|
virtual future<> fast_forward_to(position_range pr, db::timeout_clock::time_point timeout) override {
|
|
throw std::bad_function_call();
|
|
}
|
|
virtual size_t buffer_size() const override {
|
|
if (_reader) {
|
|
return flat_mutation_reader::impl::buffer_size() + _reader->buffer_size();
|
|
}
|
|
return flat_mutation_reader::impl::buffer_size();
|
|
}
|
|
};
|
|
|
|
// Marks `ce` as not continuous.
void cache_tracker::clear_continuity(cache_entry& ce)
{
    ce.set_continuous(false);
}
|
|
|
|
// Reports a partition hit to the tracker.
void row_cache::on_partition_hit()
{
    _tracker.on_partition_hit();
}
|
|
|
|
// Reports a partition miss to the tracker.
void row_cache::on_partition_miss()
{
    _tracker.on_partition_miss();
}
|
|
|
|
// Records a row hit both in this cache's own stats and in the tracker.
void row_cache::on_row_hit()
{
    _stats.hits.mark();
    _tracker.on_row_hit();
}
|
|
|
|
// Reports a mispopulation to the tracker.
void row_cache::on_mispopulate()
{
    _tracker.on_mispopulate();
}
|
|
|
|
// Records a row miss both in this cache's own stats and in the tracker.
void row_cache::on_row_miss()
{
    _stats.misses.mark();
    _tracker.on_row_miss();
}
|
|
|
|
// Counts one static row insertion in the tracker's stats.
void row_cache::on_static_row_insert()
{
    ++_tracker._stats.static_row_insertions;
}
|
|
|
|
class range_populating_reader {
|
|
row_cache& _cache;
|
|
autoupdating_underlying_reader& _reader;
|
|
std::optional<row_cache::previous_entry_pointer> _last_key;
|
|
read_context& _read_context;
|
|
private:
|
|
bool can_set_continuity() const {
|
|
return _last_key && _reader.creation_phase() == _cache.phase_of(_reader.population_range_start());
|
|
}
|
|
void handle_end_of_stream() {
|
|
if (!can_set_continuity()) {
|
|
_cache.on_mispopulate();
|
|
return;
|
|
}
|
|
if (!_reader.range().end() || !_reader.range().end()->is_inclusive()) {
|
|
cache_entry::compare cmp(_cache._schema);
|
|
auto it = _reader.range().end() ? _cache._partitions.find(_reader.range().end()->value(), cmp)
|
|
: std::prev(_cache._partitions.end());
|
|
if (it != _cache._partitions.end()) {
|
|
if (it == _cache._partitions.begin()) {
|
|
if (!_last_key->_key) {
|
|
it->set_continuous(true);
|
|
} else {
|
|
_cache.on_mispopulate();
|
|
}
|
|
} else {
|
|
auto prev = std::prev(it);
|
|
if (prev->key().equal(*_cache._schema, *_last_key->_key)) {
|
|
it->set_continuous(true);
|
|
} else {
|
|
_cache.on_mispopulate();
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
public:
|
|
range_populating_reader(row_cache& cache, read_context& ctx)
|
|
: _cache(cache)
|
|
, _reader(ctx.underlying())
|
|
, _read_context(ctx)
|
|
{}
|
|
|
|
future<flat_mutation_reader_opt, mutation_fragment_opt > operator()(db::timeout_clock::time_point timeout) {
|
|
return _reader.move_to_next_partition(timeout).then([this] (auto&& mfopt) mutable {
|
|
{
|
|
if (!mfopt) {
|
|
this->handle_end_of_stream();
|
|
return make_ready_future<flat_mutation_reader_opt, mutation_fragment_opt>(std::nullopt, std::nullopt);
|
|
}
|
|
_cache.on_partition_miss();
|
|
const partition_start& ps = mfopt->as_partition_start();
|
|
const dht::decorated_key& key = ps.key();
|
|
if (_reader.creation_phase() == _cache.phase_of(key)) {
|
|
return _cache._read_section(_cache._tracker.region(), [&] {
|
|
cache_entry& e = _cache.find_or_create(key,
|
|
ps.partition_tombstone(),
|
|
_reader.creation_phase(),
|
|
this->can_set_continuity() ? &*_last_key : nullptr);
|
|
_last_key = row_cache::previous_entry_pointer(key);
|
|
return make_ready_future<flat_mutation_reader_opt, mutation_fragment_opt>(
|
|
e.read(_cache, _read_context, _reader.creation_phase()), std::nullopt);
|
|
});
|
|
} else {
|
|
_cache._tracker.on_mispopulate();
|
|
_last_key = row_cache::previous_entry_pointer(key);
|
|
return make_ready_future<flat_mutation_reader_opt, mutation_fragment_opt>(
|
|
read_directly_from_underlying(_read_context), std::move(mfopt));
|
|
}
|
|
}
|
|
});
|
|
}
|
|
|
|
future<> fast_forward_to(dht::partition_range&& pr, db::timeout_clock::time_point timeout) {
|
|
if (!pr.start()) {
|
|
_last_key = row_cache::previous_entry_pointer();
|
|
} else if (!pr.start()->is_inclusive() && pr.start()->value().has_key()) {
|
|
_last_key = row_cache::previous_entry_pointer(pr.start()->value().as_decorated_key());
|
|
} else {
|
|
// Inclusive start bound, cannot set continuity flag.
|
|
_last_key = {};
|
|
}
|
|
|
|
return _reader.fast_forward_to(std::move(pr), timeout);
|
|
}
|
|
};
|
|
|
|
class scanning_and_populating_reader final : public flat_mutation_reader::impl {
|
|
const dht::partition_range* _pr;
|
|
row_cache& _cache;
|
|
lw_shared_ptr<read_context> _read_context;
|
|
partition_range_cursor _primary;
|
|
range_populating_reader _secondary_reader;
|
|
bool _secondary_in_progress = false;
|
|
bool _advance_primary = false;
|
|
std::optional<dht::partition_range::bound> _lower_bound;
|
|
dht::partition_range _secondary_range;
|
|
flat_mutation_reader_opt _reader;
|
|
private:
|
|
flat_mutation_reader read_from_entry(cache_entry& ce) {
|
|
_cache.upgrade_entry(ce);
|
|
_cache.on_partition_hit();
|
|
return ce.read(_cache, *_read_context);
|
|
}
|
|
|
|
static dht::ring_position_view as_ring_position_view(const std::optional<dht::partition_range::bound>& lower_bound) {
|
|
return lower_bound ? dht::ring_position_view(lower_bound->value(), dht::ring_position_view::after_key(!lower_bound->is_inclusive()))
|
|
: dht::ring_position_view::min();
|
|
}
|
|
|
|
flat_mutation_reader_opt do_read_from_primary(db::timeout_clock::time_point timeout) {
|
|
return _cache._read_section(_cache._tracker.region(), [this] {
|
|
return with_linearized_managed_bytes([&] () -> flat_mutation_reader_opt {
|
|
bool not_moved = true;
|
|
if (!_primary.valid()) {
|
|
not_moved = _primary.advance_to(as_ring_position_view(_lower_bound));
|
|
}
|
|
|
|
if (_advance_primary && not_moved) {
|
|
_primary.next();
|
|
not_moved = false;
|
|
}
|
|
_advance_primary = false;
|
|
|
|
if (not_moved || _primary.entry().continuous()) {
|
|
if (!_primary.in_range()) {
|
|
return std::nullopt;
|
|
}
|
|
cache_entry& e = _primary.entry();
|
|
auto fr = read_from_entry(e);
|
|
_lower_bound = dht::partition_range::bound{e.key(), false};
|
|
// Delay the call to next() so that we don't see stale continuity on next invocation.
|
|
_advance_primary = true;
|
|
return flat_mutation_reader_opt(std::move(fr));
|
|
} else {
|
|
if (_primary.in_range()) {
|
|
cache_entry& e = _primary.entry();
|
|
_secondary_range = dht::partition_range(_lower_bound,
|
|
dht::partition_range::bound{e.key(), false});
|
|
_lower_bound = dht::partition_range::bound{e.key(), true};
|
|
_secondary_in_progress = true;
|
|
return std::nullopt;
|
|
} else {
|
|
dht::ring_position_comparator cmp(*_read_context->schema());
|
|
auto range = _pr->trim_front(std::optional<dht::partition_range::bound>(_lower_bound), cmp);
|
|
if (!range) {
|
|
return std::nullopt;
|
|
}
|
|
_lower_bound = dht::partition_range::bound{dht::ring_position::max()};
|
|
_secondary_range = std::move(*range);
|
|
_secondary_in_progress = true;
|
|
return std::nullopt;
|
|
}
|
|
}
|
|
});
|
|
});
|
|
}
|
|
|
|
future<flat_mutation_reader_opt> read_from_primary(db::timeout_clock::time_point timeout) {
|
|
auto fro = do_read_from_primary(timeout);
|
|
if (!_secondary_in_progress) {
|
|
return make_ready_future<flat_mutation_reader_opt>(std::move(fro));
|
|
}
|
|
return _secondary_reader.fast_forward_to(std::move(_secondary_range), timeout).then([this, timeout] {
|
|
return read_from_secondary(timeout);
|
|
});
|
|
}
|
|
|
|
future<flat_mutation_reader_opt> read_from_secondary(db::timeout_clock::time_point timeout) {
|
|
return _secondary_reader(timeout).then([this, timeout] (flat_mutation_reader_opt fropt, mutation_fragment_opt ps) {
|
|
if (fropt) {
|
|
if (ps) {
|
|
push_mutation_fragment(std::move(*ps));
|
|
}
|
|
return make_ready_future<flat_mutation_reader_opt>(std::move(fropt));
|
|
} else {
|
|
_secondary_in_progress = false;
|
|
return read_from_primary(timeout);
|
|
}
|
|
});
|
|
}
|
|
future<> read_next_partition(db::timeout_clock::time_point timeout) {
|
|
return (_secondary_in_progress ? read_from_secondary(timeout) : read_from_primary(timeout)).then([this] (auto&& fropt) {
|
|
if (bool(fropt)) {
|
|
_reader = std::move(fropt);
|
|
} else {
|
|
_end_of_stream = true;
|
|
}
|
|
});
|
|
}
|
|
void on_end_of_stream() {
|
|
_reader = {};
|
|
}
|
|
public:
|
|
scanning_and_populating_reader(row_cache& cache,
|
|
const dht::partition_range& range,
|
|
lw_shared_ptr<read_context> context)
|
|
: impl(context->schema())
|
|
, _pr(&range)
|
|
, _cache(cache)
|
|
, _read_context(std::move(context))
|
|
, _primary(cache, range)
|
|
, _secondary_reader(cache, *_read_context)
|
|
, _lower_bound(range.start())
|
|
{ }
|
|
virtual future<> fill_buffer(db::timeout_clock::time_point timeout) override {
|
|
return do_until([this] { return is_end_of_stream() || is_buffer_full(); }, [this, timeout] {
|
|
if (!_reader) {
|
|
return read_next_partition(timeout);
|
|
} else {
|
|
return fill_buffer_from(*_reader, timeout).then([this] (bool reader_finished) {
|
|
if (reader_finished) {
|
|
on_end_of_stream();
|
|
}
|
|
});
|
|
}
|
|
});
|
|
}
|
|
virtual void next_partition() override {
|
|
clear_buffer_to_next_partition();
|
|
if (_reader && is_buffer_empty()) {
|
|
_reader->next_partition();
|
|
}
|
|
}
|
|
virtual future<> fast_forward_to(const dht::partition_range& pr, db::timeout_clock::time_point timeout) override {
|
|
clear_buffer();
|
|
_reader = {};
|
|
_end_of_stream = false;
|
|
_secondary_in_progress = false;
|
|
_advance_primary = false;
|
|
_pr = ≺
|
|
_primary = partition_range_cursor{_cache, pr};
|
|
_lower_bound = pr.start();
|
|
return make_ready_future<>();
|
|
}
|
|
virtual future<> fast_forward_to(position_range cr, db::timeout_clock::time_point timeout) override {
|
|
throw std::bad_function_call();
|
|
}
|
|
virtual size_t buffer_size() const override {
|
|
if (_reader) {
|
|
return flat_mutation_reader::impl::buffer_size() + _reader->buffer_size();
|
|
}
|
|
return flat_mutation_reader::impl::buffer_size();
|
|
}
|
|
};
|
|
|
|
// Wraps a scanning_and_populating_reader over `range` into a flat_mutation_reader.
flat_mutation_reader
row_cache::make_scanning_reader(const dht::partition_range& range, lw_shared_ptr<read_context> context)
{
    return make_flat_mutation_reader<scanning_and_populating_reader>(
            *this, range, std::move(context));
}
|
|
|
|
// Creates a reader over the cache for `range`.
//
// A read which is not a range query and does not request partition-range
// forwarding is served by a point lookup: from the cached entry when present,
// as an empty reader when the successor entry is continuous (the partition is
// known not to exist), or by a single_partition_populating_reader otherwise.
// All other reads go through make_scanning_reader().
//
// When `fwd` is streamed_mutation::forwarding::yes the result is wrapped with
// make_forwardable().
flat_mutation_reader
row_cache::make_reader(schema_ptr s,
                       const dht::partition_range& range,
                       const query::partition_slice& slice,
                       const io_priority_class& pc,
                       tracing::trace_state_ptr trace_state,
                       streamed_mutation::forwarding fwd,
                       mutation_reader::forwarding fwd_mr)
{
    auto ctx = make_lw_shared<read_context>(*this, s, range, slice, pc, trace_state, fwd_mr);

    auto apply_forwarding = [fwd] (flat_mutation_reader rd) -> flat_mutation_reader {
        if (fwd == streamed_mutation::forwarding::yes) {
            return make_forwardable(std::move(rd));
        }
        return rd;
    };

    if (!ctx->is_range_query() && !fwd_mr) {
        auto point_reader = _read_section(_tracker.region(), [&] {
            return with_linearized_managed_bytes([&] {
                cache_entry::compare cmp(_schema);
                auto&& pos = ctx->range().start()->value();
                auto i = _partitions.lower_bound(pos, cmp);
                const bool exact_match = i != _partitions.end() && !cmp(pos, i->position());
                if (exact_match) {
                    cache_entry& e = *i;
                    upgrade_entry(e);
                    on_partition_hit();
                    return e.read(*this, *ctx);
                }
                if (i->continuous()) {
                    return make_empty_flat_reader(std::move(s));
                }
                on_partition_miss();
                return make_flat_mutation_reader<single_partition_populating_reader>(*this, std::move(ctx));
            });
        });
        return apply_forwarding(std::move(point_reader));
    }

    return apply_forwarding(make_scanning_reader(range, std::move(ctx)));
}
|
|
|
|
|
|
// Disposes of every entry in _partitions under the tracker's allocator.
// Non-dummy entries are reported to the tracker as erased; every entry is
// evicted and then released with the deleter captured for that allocator.
row_cache::~row_cache() {
    with_allocator(_tracker.allocator(), [this] {
        auto dispose = [this, deleter = current_deleter<cache_entry>()] (auto&& p) mutable {
            const bool is_dummy = p->is_dummy_entry();
            if (!is_dummy) {
                _tracker.on_partition_erase();
            }
            p->evict(_tracker);
            deleter(p);
        };
        _partitions.clear_and_dispose(std::move(dispose));
    });
}
|
|
|
|
void row_cache::clear_now() noexcept {
|
|
with_allocator(_tracker.allocator(), [this] {
|
|
auto it = _partitions.erase_and_dispose(_partitions.begin(), partitions_end(), [this, deleter = current_deleter<cache_entry>()] (auto&& p) mutable {
|
|
_tracker.on_partition_erase();
|
|
p->evict(_tracker);
|
|
deleter(p);
|
|
});
|
|
_tracker.clear_continuity(*it);
|
|
});
|
|
}
|
|
|
|
template<typename CreateEntry, typename VisitEntry>
|
|
//requires requires(CreateEntry create, VisitEntry visit, row_cache::partitions_type::iterator it) {
|
|
// { create(it) } -> row_cache::partitions_type::iterator;
|
|
// { visit(it) } -> void;
|
|
// }
|
|
cache_entry& row_cache::do_find_or_create_entry(const dht::decorated_key& key,
|
|
const previous_entry_pointer* previous, CreateEntry&& create_entry, VisitEntry&& visit_entry)
|
|
{
|
|
return with_allocator(_tracker.allocator(), [&] () -> cache_entry& {
|
|
return with_linearized_managed_bytes([&] () -> cache_entry& {
|
|
auto i = _partitions.lower_bound(key, cache_entry::compare(_schema));
|
|
if (i == _partitions.end() || !i->key().equal(*_schema, key)) {
|
|
i = create_entry(i);
|
|
} else {
|
|
visit_entry(i);
|
|
}
|
|
|
|
if (!previous) {
|
|
return *i;
|
|
}
|
|
|
|
if ((!previous->_key && i == _partitions.begin())
|
|
|| (previous->_key && i != _partitions.begin()
|
|
&& std::prev(i)->key().equal(*_schema, *previous->_key))) {
|
|
i->set_continuous(true);
|
|
} else {
|
|
on_mispopulate();
|
|
}
|
|
|
|
return *i;
|
|
});
|
|
});
|
|
}
|
|
|
|
// Returns the cache entry for `key`, creating an incomplete one carrying
// tombstone `t` if it is missing. For an existing entry, `t` is applied to the
// version opened for `phase` and the entry is upgraded.
cache_entry& row_cache::find_or_create(const dht::decorated_key& key, tombstone t, row_cache::phase_type phase, const previous_entry_pointer* previous) {
    auto on_missing = [&] (auto i) { // create
        auto entry = current_allocator().construct<cache_entry>(cache_entry::incomplete_tag{}, _schema, key, t);
        _tracker.insert(*entry);
        return _partitions.insert_before(i, *entry);
    };
    auto on_present = [&] (auto i) { // visit
        _tracker.on_miss_already_populated();
        cache_entry& e = *i;
        e.partition().open_version(*e.schema(), &_tracker, phase).partition().apply(t);
        upgrade_entry(e);
    };
    return do_find_or_create_entry(key, previous, on_missing, on_present);
}
|
|
|
|
void row_cache::populate(const mutation& m, const previous_entry_pointer* previous) {
|
|
_populate_section(_tracker.region(), [&] {
|
|
do_find_or_create_entry(m.decorated_key(), previous, [&] (auto i) {
|
|
cache_entry* entry = current_allocator().construct<cache_entry>(
|
|
m.schema(), m.decorated_key(), m.partition());
|
|
_tracker.insert(*entry);
|
|
entry->set_continuous(i->continuous());
|
|
i = _partitions.insert_before(i, *entry);
|
|
upgrade_entry(*i);
|
|
return i;
|
|
}, [&] (auto i) {
|
|
throw std::runtime_error(format("cache already contains entry for {}", m.key()));
|
|
});
|
|
});
|
|
}
|
|
|
|
// Returns the snapshot to read from for `phase`: _underlying for the current phase,
// otherwise *_prev_snapshot.
// Throws std::runtime_error when `phase` is more than one behind _underlying_phase.
mutation_source& row_cache::snapshot_for_phase(phase_type phase) {
    if (phase == _underlying_phase) {
        return _underlying;
    }
    const bool retired = phase + 1 < _underlying_phase;
    if (retired) {
        throw std::runtime_error(format("attempted to read from retired phase {} (current={})", phase, _underlying_phase));
    }
    return *_prev_snapshot;
}
|
|
|
|
// Returns the snapshot and phase for position `pos`.
// Positions ordered before _prev_snapshot_pos (or any position when it is unset) map to
// the current snapshot and phase; all others map to the previous snapshot and phase.
row_cache::snapshot_and_phase row_cache::snapshot_of(dht::ring_position_view pos) {
    dht::ring_position_less_comparator is_before(*_schema);
    const bool use_current = !_prev_snapshot_pos || is_before(pos, *_prev_snapshot_pos);
    if (use_current) {
        return {_underlying, _underlying_phase};
    } else {
        return {*_prev_snapshot, _underlying_phase - 1};
    }
}
|
|
|
|
// Synchronously empties memtable `m`, invalidating the cache entry for each of its keys.
// Each memtable entry is evicted through the tracker's memtable cleaner and freed.
// If invalidating any key throws, the whole cache is dropped with clear_now() at the end.
void row_cache::invalidate_sync(memtable& m) noexcept {
    with_allocator(_tracker.allocator(), [&m, this] () {
        logalloc::reclaim_lock no_reclaim(_tracker.region());
        bool must_clear_all = false;
        // clear_and_dispose() itself should not need to look up keys and therefore would not
        // need with_linearized_managed_bytes(); invalidating an entry does, though.
        m.partitions.clear_and_dispose([this, deleter = current_deleter<memtable_entry>(), &must_clear_all] (memtable_entry* mt_entry) {
            with_linearized_managed_bytes([&] {
                try {
                    invalidate_locked(mt_entry->key());
                } catch (...) {
                    // Remember the failure; recovery happens after all entries are disposed.
                    must_clear_all = true;
                }
                mt_entry->partition().evict(_tracker.memtable_cleaner());
                deleter(mt_entry);
            });
        });
        if (must_clear_all) {
            // Some key could not be invalidated, presumably because
            // with_linearized_managed_bytes() ran out of memory. Fall back to
            // clear_now(), which is noexcept.
            clear_now();
        }
    });
}
|
|
|
|
// Returns the phase for position `pos`: the current phase when _prev_snapshot_pos is unset
// or `pos` is ordered before it, and the previous phase otherwise.
row_cache::phase_type row_cache::phase_of(dht::ring_position_view pos) {
    dht::ring_position_less_comparator is_before(*_schema);
    const bool in_current_phase = !_prev_snapshot_pos || is_before(pos, *_prev_snapshot_pos);
    if (in_current_phase) {
        return _underlying_phase;
    } else {
        return _underlying_phase - 1;
    }
}
|
|
|
|
template <typename Updater>
|
|
future<> row_cache::do_update(external_updater eu, memtable& m, Updater updater) {
|
|
return do_update(std::move(eu), [this, &m, updater = std::move(updater)] {
|
|
real_dirty_memory_accounter real_dirty_acc(m, _tracker);
|
|
m.on_detach_from_region_group();
|
|
_tracker.region().merge(m); // Now all data in memtable belongs to cache
|
|
_tracker.memtable_cleaner().merge(m._cleaner);
|
|
STAP_PROBE(scylla, row_cache_update_start);
|
|
auto cleanup = defer([&m, this] {
|
|
invalidate_sync(m);
|
|
STAP_PROBE(scylla, row_cache_update_end);
|
|
});
|
|
|
|
return seastar::async([this, &m, updater = std::move(updater), real_dirty_acc = std::move(real_dirty_acc)] () mutable {
|
|
coroutine update;
|
|
size_t size_entry;
|
|
// In case updater fails, we must bring the cache to consistency without deferring.
|
|
auto cleanup = defer([&m, this] {
|
|
invalidate_sync(m);
|
|
_prev_snapshot_pos = {};
|
|
_prev_snapshot = {};
|
|
});
|
|
partition_presence_checker is_present = _prev_snapshot->make_partition_presence_checker();
|
|
while (!m.partitions.empty()) {
|
|
with_allocator(_tracker.allocator(), [&] () {
|
|
auto cmp = cache_entry::compare(_schema);
|
|
{
|
|
size_t partition_count = 0;
|
|
{
|
|
STAP_PROBE(scylla, row_cache_update_one_batch_start);
|
|
// FIXME: we should really be checking should_yield() here instead of
|
|
// need_preempt(). However, should_yield() is currently quite
|
|
// expensive and we need to amortize it somehow.
|
|
do {
|
|
STAP_PROBE(scylla, row_cache_update_partition_start);
|
|
with_linearized_managed_bytes([&] {
|
|
if (!update) {
|
|
_update_section(_tracker.region(), [&] {
|
|
memtable_entry& mem_e = *m.partitions.begin();
|
|
size_entry = mem_e.size_in_allocator_without_rows(_tracker.allocator());
|
|
auto cache_i = _partitions.lower_bound(mem_e.key(), cmp);
|
|
update = updater(_update_section, cache_i, mem_e, is_present, real_dirty_acc);
|
|
});
|
|
}
|
|
// We use cooperative deferring instead of futures so that
|
|
// this layer has a chance to restore invariants before deferring,
|
|
// in particular set _prev_snapshot_pos to the correct value.
|
|
if (update.run() == stop_iteration::no) {
|
|
return;
|
|
}
|
|
update = {};
|
|
real_dirty_acc.unpin_memory(size_entry);
|
|
_update_section(_tracker.region(), [&] {
|
|
auto i = m.partitions.begin();
|
|
memtable_entry& mem_e = *i;
|
|
m.partitions.erase(i);
|
|
mem_e.partition().evict(_tracker.memtable_cleaner());
|
|
current_allocator().destroy(&mem_e);
|
|
});
|
|
++partition_count;
|
|
});
|
|
STAP_PROBE(scylla, row_cache_update_partition_end);
|
|
} while (!m.partitions.empty() && !need_preempt());
|
|
with_allocator(standard_allocator(), [&] {
|
|
if (m.partitions.empty()) {
|
|
_prev_snapshot_pos = {};
|
|
} else {
|
|
_update_section(_tracker.region(), [&] {
|
|
_prev_snapshot_pos = dht::ring_position(m.partitions.begin()->key());
|
|
});
|
|
}
|
|
});
|
|
STAP_PROBE1(scylla, row_cache_update_one_batch_end, partition_count);
|
|
}
|
|
}
|
|
});
|
|
real_dirty_acc.commit();
|
|
seastar::thread::yield();
|
|
}
|
|
}).finally([cleanup = std::move(cleanup)] {});
|
|
});
|
|
}
|
|
|
|
// Updates the cache with the contents of memtable `m` through do_update().
//
// Per memtable partition:
//  - if the cache already has an entry for the key, the memtable data is merged into it;
//  - otherwise, if the entry at the insertion position is continuous or the presence
//    checker reports the key as definitely absent, a new evictable entry is inserted and
//    the memtable data is merged into that;
//  - otherwise nothing is done for this partition.
future<> row_cache::update(external_updater eu, memtable& m) {
    return do_update(std::move(eu), m, [this] (logalloc::allocating_section& alloc,
            row_cache::partitions_type::iterator cache_pos, memtable_entry& mt_entry, partition_presence_checker& is_present,
            real_dirty_memory_accounter& acc) mutable {
        const bool already_cached = cache_pos != partitions_end()
            && cache_pos->key().equal(*_schema, mt_entry.key());
        if (already_cached) {
            cache_entry& existing = *cache_pos;
            upgrade_entry(existing);
            _tracker.on_partition_merge();
            return existing.partition().apply_to_incomplete(*_schema, std::move(mt_entry.partition()), *mt_entry.schema(), _tracker.memtable_cleaner(),
                alloc, _tracker.region(), _tracker, _underlying_phase, acc);
        }

        // An entry missing from the cache cannot simply be inserted, because the mutation
        // may be incomplete.
        // FIXME: keep a bitmap of the sstables we do cover, so that no search is needed.
        //
        // The presence check is invoked under the standard allocator because the checker
        // is constructed and destroyed in that context and may allocate and cache objects.
        auto absent_in_underlying = [&] {
            return with_allocator(standard_allocator(), [&] { return is_present(mt_entry.key()); })
                == partition_presence_checker_result::definitely_doesnt_exist;
        };
        if (!cache_pos->continuous() && !absent_in_underlying()) {
            return make_empty_coroutine();
        }

        // The partition is absent in underlying. Start from a neutral partition entry.
        cache_entry* fresh = current_allocator().construct<cache_entry>(cache_entry::evictable_tag(),
            _schema, dht::decorated_key(mt_entry.key()),
            partition_entry::make_evictable(*_schema, mutation_partition(_schema)));
        fresh->set_continuous(cache_pos->continuous());
        _tracker.insert(*fresh);
        _partitions.insert_before(cache_pos, *fresh);
        return fresh->partition().apply_to_incomplete(*_schema, std::move(mt_entry.partition()), *mt_entry.schema(), _tracker.memtable_cleaner(),
            alloc, _tracker.region(), _tracker, _underlying_phase, acc);
    });
}
|
|
|
|
// Processes memtable `m` through do_update(), invalidating rather than merging:
// a cached entry with the same key is evicted, and when there is no such entry the
// continuity of the entry at the lower-bound position is cleared.
future<> row_cache::update_invalidating(external_updater eu, memtable& m) {
    return do_update(std::move(eu), m, [this] (logalloc::allocating_section&,
        row_cache::partitions_type::iterator cache_pos, memtable_entry& mt_entry, partition_presence_checker&,
        real_dirty_memory_accounter&)
    {
        const bool not_cached = cache_pos == partitions_end()
            || !cache_pos->key().equal(*_schema, mt_entry.key());
        if (not_cached) {
            _tracker.clear_continuity(*cache_pos);
        } else {
            // FIXME: Invalidate only affected row ranges.
            // As written, all information about the partition is dropped.
            cache_entry& victim = *cache_pos;
            victim.evict(_tracker);
            victim.on_evicted(_tracker);
        }
        // FIXME: subtract gradually from the dirty memory accounter.
        return make_empty_coroutine();
    });
}
|
|
|
|
// Replaces _underlying with a new snapshot obtained from _snapshot_source.
void row_cache::refresh_snapshot() {
    auto fresh_snapshot = _snapshot_source();
    _underlying = std::move(fresh_snapshot);
}
|
|
|
|
// Calls _tracker.touch() on every clustered row of every version of the partition cached
// for `dk`. Does nothing when there is no entry for `dk`.
void row_cache::touch(const dht::decorated_key& dk) {
    _read_section(_tracker.region(), [&] {
        with_linearized_managed_bytes([&] {
            auto found = _partitions.find(dk, cache_entry::compare(_schema));
            if (found == _partitions.end()) {
                return;
            }
            for (partition_version& version : found->partition().versions_from_oldest()) {
                for (rows_entry& row : version.partition().clustered_rows()) {
                    _tracker.touch(row);
                }
            }
        });
    });
}
|
|
|
|
// Calls _tracker.unlink() on every clustered row of every version of the partition cached
// for `dk`. Does nothing when there is no entry for `dk`.
void row_cache::unlink_from_lru(const dht::decorated_key& dk) {
    _read_section(_tracker.region(), [&] {
        with_linearized_managed_bytes([&] {
            auto found = _partitions.find(dk, cache_entry::compare(_schema));
            if (found == _partitions.end()) {
                return;
            }
            for (partition_version& version : found->partition().versions_from_oldest()) {
                for (rows_entry& row : version.partition().clustered_rows()) {
                    _tracker.unlink(row);
                }
            }
        });
    });
}
|
|
|
|
// Invalidates the cache's knowledge about key `dk`.
//
// If an entry with an equal key exists it is erased (reported to the tracker, evicted and
// freed) and continuity is cleared on the entry that follows it. If there is no such entry,
// continuity is cleared on the entry at the lower-bound position for `dk`.
//
// NOTE(review): the name suggests the caller is expected to hold the region's reclaim lock
// (invalidate_sync() does) - confirm against other callers.
void row_cache::invalidate_locked(const dht::decorated_key& dk) {
    auto pos = _partitions.lower_bound(dk, cache_entry::compare(_schema));
    const bool present = pos != partitions_end() && pos->key().equal(*_schema, dk);
    if (!present) {
        _tracker.clear_continuity(*pos);
        return;
    }
    // The disposer does not need `dk`; the original captured it by reference without using it.
    auto successor = _partitions.erase_and_dispose(pos,
        [this, deleter = current_deleter<cache_entry>()] (auto&& entry) mutable {
            _tracker.on_partition_erase();
            entry->evict(_tracker);
            deleter(entry);
        });
    _tracker.clear_continuity(*successor);
}
|
|
|
|
// Invalidates the single partition `dk` by delegating to the partition_range overload
// with a singular range.
future<> row_cache::invalidate(external_updater eu, const dht::decorated_key& dk) {
    auto singular = dht::partition_range::make_singular(dk);
    return invalidate(std::move(eu), singular);
}
|
|
|
|
// Invalidates a single partition range by delegating to the range-vector overload with a
// one-element vector holding a copy of `range`.
future<> row_cache::invalidate(external_updater eu, const dht::partition_range& range) {
    dht::partition_range_vector single_range({range});
    return invalidate(std::move(eu), std::move(single_range));
}
|
|
|
|
// Invalidates every range in `ranges` as the internal step of do_update().
// If invalidating any range throws, the whole cache is dropped with clear_now().
future<> row_cache::invalidate(external_updater eu, dht::partition_range_vector&& ranges) {
    return do_update(std::move(eu), [this, ranges = std::move(ranges)] {
        // Armed until all ranges are processed; fires only if an exception escapes.
        auto clear_on_error = defer([this] { clear_now(); });
        with_linearized_managed_bytes([&] {
            for (const auto& one_range : ranges) {
                invalidate_unwrapped(one_range);
            }
        });
        clear_on_error.cancel();
        return make_ready_future<>();
    });
}
|
|
|
|
// Removes the entries covered by `range` from the cache.
// Thin wrapper around invalidate_unwrapped().
void row_cache::evict(const dht::partition_range& range) {
    this->invalidate_unwrapped(range);
}
|
|
|
|
// Erases all cache entries between the lower bounds of the start and the end of `range`,
// then clears continuity on the entry following the erased span.
// Holds a reclaim lock on the tracker's region for the duration of the call.
void row_cache::invalidate_unwrapped(const dht::partition_range& range) {
    logalloc::reclaim_lock no_reclaim(_tracker.region());

    auto entry_cmp = cache_entry::compare(_schema);
    auto first = _partitions.lower_bound(dht::ring_position_view::for_range_start(range), entry_cmp);
    auto last = _partitions.lower_bound(dht::ring_position_view::for_range_end(range), entry_cmp);
    with_allocator(_tracker.allocator(), [this, first, last] {
        auto successor = _partitions.erase_and_dispose(first, last, [this, deleter = current_deleter<cache_entry>()] (auto&& entry) mutable {
            _tracker.on_partition_erase();
            entry->evict(_tracker);
            deleter(entry);
        });
        // The erase is expected to leave an entry at the returned position.
        assert(successor != _partitions.end());
        _tracker.clear_continuity(*successor);
    });
}
|
|
|
|
// Constructs a cache for schema `s` over snapshot source `src`, tracked by `tracker`.
// The initial underlying snapshot is taken from `src` right away. A dummy entry is inserted
// at the end of _partitions, with its continuity flag taken from `cont`.
row_cache::row_cache(schema_ptr s, snapshot_source src, cache_tracker& tracker, is_continuous cont)
    : _tracker(tracker)
    , _schema(std::move(s))
    , _partitions(cache_entry::compare(_schema))
    , _underlying(src())
    , _snapshot_source(std::move(src))
{
    with_allocator(_tracker.allocator(), [this, cont] {
        cache_entry* dummy = current_allocator().construct<cache_entry>(cache_entry::dummy_entry_tag());
        _partitions.insert_before(_partitions.end(), *dummy);
        dummy->set_continuous(static_cast<bool>(cont));
    });
}
|
|
|
|
// Move constructor. Takes over o's schema, key and partition entry and copies its flags.
// This object's link then replaces o's link in the intrusive container, and o's link is
// re-initialized.
cache_entry::cache_entry(cache_entry&& o) noexcept
    : _schema(std::move(o._schema))
    , _key(std::move(o._key))
    , _pe(std::move(o._pe))
    , _flags(o._flags)
    , _cache_link()
{
    using node_algorithms = row_cache::partitions_type::node_algorithms;
    node_algorithms::replace_node(o._cache_link.this_ptr(), _cache_link.this_ptr());
    node_algorithms::init(o._cache_link.this_ptr());
}
|
|
|
|
// Out-of-line destructor; performs no work beyond destroying the members.
cache_entry::~cache_entry() = default;
|
|
|
|
// Evicts this entry's partition data through the tracker's cleaner.
// The entry object itself is neither unlinked nor freed here.
void cache_entry::evict(cache_tracker& tracker) noexcept {
    this->_pe.evict(tracker.cleaner());
}
|
|
|
|
// Replaces the cache's schema pointer with `new_schema`.
// Existing entries are not touched here; upgrade_entry() brings an entry to the cache's
// schema when it is called on it.
void row_cache::set_schema(schema_ptr new_schema) noexcept {
    this->_schema = std::move(new_schema);
}
|
|
|
|
// Handles eviction of this whole entry: marks the next entry in the container as not
// continuous, evicts the partition data, frees this object and notifies the tracker.
// `this` is destroyed by the call; no member may be touched once the deleter has run.
void cache_entry::on_evicted(cache_tracker& tracker) noexcept {
    auto successor = std::next(row_cache::partitions_type::s_iterator_to(*this));
    successor->set_continuous(false);
    evict(tracker);
    auto free_entry = current_deleter<cache_entry>();
    free_entry(this);
    tracker.on_partition_eviction();
}
|
|
|
|
// Handles eviction of this row entry.
//
// The last dummy row is only unlinked from the LRU. Any other row marks its successor as
// not continuous, is freed, and the tracker is notified. If the row that is left is the only
// member of its rows container and its partition version is referenced from an entry, the
// owning cache_entry is evicted too.
void rows_entry::on_evicted(cache_tracker& tracker) noexcept {
    using rows_type = mutation_partition::rows_type;

    auto it = rows_type::iterator_to(*this);
    if (!is_last_dummy()) {
        // Move to the successor before this object is freed; `it` is the only thing used
        // afterwards.
        ++it;
        it->set_continuous(false);
        current_deleter<rows_entry>()(this);
        tracker.on_row_eviction();
    } else {
        // Every evictable partition entry must keep a dummy entry at the end, so it is
        // not removed, only unlinked from the LRU. That dummy is linked in the LRU
        // because partitions with no regular rows exist and have to be tracked as well.
        _lru_link.unlink();
    }

    if (!rows_type::is_only_member(*it)) {
        return;
    }

    assert(it->is_last_dummy());
    partition_version& pv = partition_version::container_of(
        mutation_partition::container_of(rows_type::container_of_only_member(*it)));
    if (pv.is_referenced_from_entry()) {
        cache_entry& owner = cache_entry::container_of(partition_entry::container_of(pv));
        owner.on_evicted(tracker);
    }
}
|
|
|
|
// Creates a reader for this entry. The snapshot and phase are picked by the cache based on
// this entry's key, and `reader` enters the partition with them before the read starts.
flat_mutation_reader cache_entry::read(row_cache& rc, read_context& reader) {
    auto chosen = rc.snapshot_of(_key);
    reader.enter_partition(_key, chosen.snapshot, chosen.phase);
    return do_read(rc, reader);
}
|
|
|
|
// Creates a reader for this entry at an explicitly given `phase`: `reader` enters the
// partition with that phase before the read starts.
flat_mutation_reader cache_entry::read(row_cache& rc, read_context& reader, row_cache::phase_type phase) {
    reader.enter_partition(_key, phase);
    return this->do_read(rc, reader);
}
|
|
|
|
// Assumes reader is in the corresponding partition
|
|
flat_mutation_reader cache_entry::do_read(row_cache& rc, read_context& reader) {
|
|
auto snp = _pe.read(rc._tracker.region(), rc._tracker.cleaner(), _schema, &rc._tracker, reader.phase());
|
|
auto ckr = query::clustering_key_filter_ranges::get_ranges(*_schema, reader.slice(), _key.key());
|
|
auto r = make_cache_flat_mutation_reader(_schema, _key, std::move(ckr), rc, reader.shared_from_this(), std::move(snp));
|
|
if (reader.schema()->version() != _schema->version()) {
|
|
r = transform(std::move(r), schema_upgrader(reader.schema()));
|
|
}
|
|
return r;
|
|
}
|
|
|
|
// Returns a reference to the schema pointer currently held by the cache.
const schema_ptr& row_cache::schema() const {
    return this->_schema;
}
|
|
|
|
// Upgrades entry `e` to the cache's current schema when its schema pointer differs.
// Asserts that reclaiming is disabled on the tracker's region.
void row_cache::upgrade_entry(cache_entry& e) {
    if (e._schema == _schema) {
        return; // already on the current schema
    }
    auto& region = _tracker.region();
    assert(!region.reclaiming_enabled());
    with_allocator(region.allocator(), [this, &e] {
        with_linearized_managed_bytes([&] {
            e.partition().upgrade(e._schema, _schema, _tracker.cleaner(), &_tracker);
            // Record the new schema only once the partition upgrade has returned.
            e._schema = _schema;
        });
    });
}
|
|
|
|
// Prints all entries of `rc`, comma-separated, inside "{row_cache: ...}".
// The printing runs inside the cache's read section.
std::ostream& operator<<(std::ostream& out, row_cache& rc) {
    rc._read_section(rc._tracker.region(), [&] {
        out << "{row_cache: ";
        out << ::join(", ", rc._partitions.begin(), rc._partitions.end());
        out << "}";
    });
    return out;
}
|
|
|
|
// Runs one cache update, serialized on _update_sem.
//
// After acquiring a unit of the semaphore:
//  1. `eu` is invoked;
//  2. in a noexcept step, _prev_snapshot_pos is set to the minimum ring position, the
//     current underlying snapshot becomes _prev_snapshot, a fresh snapshot becomes
//     _underlying, and the phase counter is incremented;
//  3. `iu` is invoked, and once its future resolves _prev_snapshot_pos and _prev_snapshot
//     are reset.
// A failure of `iu` is logged as a warning and not propagated.
future<> row_cache::do_update(row_cache::external_updater eu, row_cache::internal_updater iu) noexcept {
    return futurize_apply([this] {
        return get_units(_update_sem, 1);
    }).then([this, eu = std::move(eu), iu = std::move(iu)] (auto units) mutable {
        // Created before eu() runs, so that the noexcept step below only has to move it.
        auto start_pos = dht::ring_position::min();
        eu();
        [&] () noexcept {
            _prev_snapshot_pos = std::move(start_pos);
            _prev_snapshot = std::exchange(_underlying, _snapshot_source());
            ++_underlying_phase;
        }();
        return futurize_apply([&iu] {
            return iu();
        }).then_wrapped([this, units = std::move(units)] (auto result) {
            _prev_snapshot_pos = {};
            _prev_snapshot = {};
            if (result.failed()) {
                clogger.warn("Failure during cache update: {}", result.get_exception());
            }
        });
    });
}
|
|
|
|
std::ostream& operator<<(std::ostream& out, cache_entry& e) {
|
|
return out << "{cache_entry: " << e.position()
|
|
<< ", cont=" << e.continuous()
|
|
<< ", dummy=" << e.is_dummy_entry()
|
|
<< ", " << partition_entry::printer(*e.schema(), e.partition())
|
|
<< "}";
|
|
}
|