Currently readers always use the latest snapshot. This is fine for respecting write atomicity if partitions are fully continuous in cache (as they are now), but will break write atomicity once partial population is allowed.

Consider the following case:

    flush write(ck=1), write(ck=2) -> snapshot_1
    cache reader 1 reads and inserts ck=1 @snapshot_1
    flush write(ck=1), write(ck=2) -> snapshot_2
    cache reader 2 reads and inserts ck=2 @snapshot_2

Because the cache update is not atomic, reader 2 may complete while the partition has not yet been updated for snapshot_2. In that case, after read 2 the partition would contain ck=1 from snapshot_1 and ck=2 from snapshot_2. It would match neither snapshot, which could violate write atomicity.

To solve this problem we conceptually assign to each partition key in the ring the snapshot which it currently reflects. The update process gradually converts entries, in ring order, to the new snapshot. Reads no longer use the latest snapshot, but rather the snapshot current for the ring position they are at.

There is a race between the update process and populating reads. Since after the update all entries must reflect the new snapshot, reads using the old snapshot cannot be allowed to insert data which can no longer be reached by the update process. Before this patch the race was prevented with a phased_barrier: readers kept a phased_barrier::operation alive between starting a read of a partition and inserting it into cache, and a cache update waited for all prior operations before starting. Any later read which was not waited for would use the latest snapshot, so the update process didn't have to fix anything up for such reads.

After this change, later reads cannot always use the latest snapshot; they have to use the snapshot corresponding to the given entry. So it is no longer enough for update() to wait for prior reads in order to prevent stale populations. The (simple) solution implemented in this patch is to detect the conflict and abandon population of the given sub-range. In general, reads are allowed to populate a given range only if it belongs to a single snapshot. Note that the range here is not the whole query range. For population of continuity, it is the range starting after the previous key and ending after the key being inserted. When populating a partition entry, the range is a singular range containing only the partition key. Readers switch to new snapshots automatically as they move across the ring.

It's possible that the insertion of the partition doesn't conflict but continuity does. In that case the entry will be inserted but continuity will not be set.
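
A minimal sketch of the conflict check, with hypothetical names (the real logic lives in phase_of(), populate() and the readers below): a reader remembers the phase of the snapshot it read from and, just before inserting, re-checks the phase currently assigned to the position being populated; on a mismatch it abandons population of that sub-range.

    // Sketch only; assumes a cache object exposing phase_of()/populate()
    // and a tracker counting abandoned populations.
    auto phase = cache.phase_of(pos);          // phase the read started in
    auto mo = read_from_snapshot_for(phase);   // data from that snapshot
    if (phase == cache.phase_of(pos)) {        // position still in the same phase?
        cache.populate(*mo);                   // safe to insert
    } else {
        tracker.on_mispopulate();              // conflict: abandon population
    }
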
/*
 * Copyright (C) 2015 ScyllaDB
 */

/*
 * This file is part of Scylla.
 *
 * Scylla is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Scylla is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Scylla. If not, see <http://www.gnu.org/licenses/>.
 */

#include "row_cache.hh"
#include "core/memory.hh"
#include "core/do_with.hh"
#include "core/future-util.hh"
#include <seastar/core/metrics.hh>
#include <seastar/util/defer.hh>
#include "memtable.hh"
#include "partition_snapshot_reader.hh"
#include <chrono>
#include "utils/move.hh"
#include <boost/version.hpp>
#include <sys/sdt.h>
#include "stdx.hh"
#include "read_context.hh"

using namespace std::chrono_literals;
using namespace cache;

static logging::logger clogger("cache");

thread_local seastar::thread_scheduling_group row_cache::_update_thread_scheduling_group(1ms, 0.2);

mutation_reader
row_cache::create_underlying_reader(read_context& ctx, mutation_source& src, const dht::partition_range& pr) {
    return src(_schema, pr, query::full_slice, ctx.pc(), ctx.trace_state(), streamed_mutation::forwarding::no);
}

cache_tracker& global_cache_tracker() {
    static thread_local cache_tracker instance;
    return instance;
}

cache_tracker::cache_tracker() {
    setup_metrics();

    _region.make_evictable([this] {
        return with_allocator(_region.allocator(), [this] {
            // Removing a partition may require reading large keys when we rebalance
            // the rbtree, so linearize anything we read
            return with_linearized_managed_bytes([&] {
                try {
                    auto evict_last = [this](lru_type& lru) {
                        cache_entry& ce = lru.back();
                        auto it = row_cache::partitions_type::s_iterator_to(ce);
                        clear_continuity(*std::next(it));
                        lru.pop_back_and_dispose(current_deleter<cache_entry>());
                    };
                    if (_lru.empty()) {
                        return memory::reclaiming_result::reclaimed_nothing;
                    }
                    evict_last(_lru);
                    --_stats.partitions;
                    ++_stats.evictions;
                    ++_stats.modification_count;
                    return memory::reclaiming_result::reclaimed_something;
                } catch (std::bad_alloc&) {
                    // Bad luck, linearization during partition removal caused us to
                    // fail. Drop the entire cache so we can make forward progress.
                    clear();
                    return memory::reclaiming_result::reclaimed_something;
                }
            });
        });
    });
}

cache_tracker::~cache_tracker() {
    clear();
}

void
cache_tracker::setup_metrics() {
    namespace sm = seastar::metrics;
    _metrics.add_group("cache", {
        sm::make_gauge("bytes_used", sm::description("current bytes used by the cache out of the total size of memory"), [this] { return _region.occupancy().used_space(); }),
        sm::make_gauge("bytes_total", sm::description("total size of memory for the cache"), [this] { return _region.occupancy().total_space(); }),
        sm::make_derive("total_operations_hits", sm::description("total number of operation hits"), _stats.hits),
        sm::make_derive("total_operations_misses", sm::description("total number of operation misses"), _stats.misses),
        sm::make_derive("total_operations_insertions", sm::description("total number of operation insert"), _stats.insertions),
        sm::make_derive("total_operations_concurrent_misses_same_key", sm::description("total number of operation with misses same key"), _stats.concurrent_misses_same_key),
        sm::make_derive("total_operations_merges", sm::description("total number of operation merged"), _stats.merges),
        sm::make_derive("total_operations_evictions", sm::description("total number of operation eviction"), _stats.evictions),
        sm::make_derive("total_operations_removals", sm::description("total number of operation removals"), _stats.removals),
        sm::make_derive("total_operations_mispopulations", sm::description("number of entries not inserted by reads"), _stats.mispopulations),
        sm::make_gauge("objects_partitions", sm::description("total number of partition objects"), _stats.partitions)
    });
}

void cache_tracker::clear() {
    with_allocator(_region.allocator(), [this] {
        auto clear = [this] (lru_type& lru) {
            while (!lru.empty()) {
                cache_entry& ce = lru.back();
                auto it = row_cache::partitions_type::s_iterator_to(ce);
                while (it->is_evictable()) {
                    cache_entry& to_remove = *it;
                    ++it;
                    to_remove._lru_link.unlink();
                    current_deleter<cache_entry>()(&to_remove);
                }
                clear_continuity(*it);
            }
        };
        clear(_lru);
    });
    _stats.removals += _stats.partitions;
    _stats.partitions = 0;
    ++_stats.modification_count;
}

void cache_tracker::touch(cache_entry& e) {
    auto move_to_front = [this] (lru_type& lru, cache_entry& e) {
        lru.erase(lru.iterator_to(e));
        lru.push_front(e);
    };
    move_to_front(_lru, e);
}

void cache_tracker::insert(cache_entry& entry) {
    ++_stats.insertions;
    ++_stats.partitions;
    ++_stats.modification_count;
    _lru.push_front(entry);
}

void cache_tracker::on_erase() {
    --_stats.partitions;
    ++_stats.removals;
    ++_stats.modification_count;
}

void cache_tracker::on_merge() {
    ++_stats.merges;
}

void cache_tracker::on_hit() {
    ++_stats.hits;
}

void cache_tracker::on_miss() {
    ++_stats.misses;
}

void cache_tracker::on_mispopulate() {
    ++_stats.mispopulations;
}

void cache_tracker::on_miss_already_populated() {
    ++_stats.concurrent_misses_same_key;
}

allocation_strategy& cache_tracker::allocator() {
    return _region.allocator();
}

logalloc::region& cache_tracker::region() {
    return _region;
}

const logalloc::region& cache_tracker::region() const {
    return _region;
}

/*
 * Represents a reader to the underlying source.
 * This reader automatically makes sure that it's up to date with all cache updates.
 */
class autoupdating_underlying_reader final {
    row_cache& _cache;
    read_context& _read_context;
    stdx::optional<mutation_reader> _reader;
    utils::phased_barrier::phase_type _reader_creation_phase;
    dht::partition_range _range = { };
    stdx::optional<dht::decorated_key> _last_key;
    stdx::optional<dht::decorated_key> _new_last_key;
public:
    autoupdating_underlying_reader(row_cache& cache, read_context& context)
        : _cache(cache)
        , _read_context(context)
    { }
    future<streamed_mutation_opt> operator()() {
        _last_key = std::move(_new_last_key);
        auto start = population_range_start();
        auto phase = _cache.phase_of(start);
        if (!_reader || _reader_creation_phase != phase) {
            if (_last_key) {
                auto cmp = dht::ring_position_comparator(*_cache._schema);
                auto&& new_range = _range.split_after(*_last_key, cmp);
                if (!new_range) {
                    return make_ready_future<streamed_mutation_opt>(streamed_mutation_opt());
                }
                _range = std::move(*new_range);
                _last_key = {};
            }
            auto& snap = _cache.snapshot_for_phase(phase);
            _reader = _cache.create_underlying_reader(_read_context, snap, _range);
            _reader_creation_phase = phase;
        }
        return (*_reader)().then([this] (auto&& smopt) {
            if (smopt) {
                _new_last_key = smopt->decorated_key();
            }
            return std::move(smopt);
        });
    }
    future<> fast_forward_to(dht::partition_range&& range) {
        _range = std::move(range);
        _last_key = { };
        _new_last_key = { };
        auto phase = _cache.phase_of(dht::ring_position_view::for_range_start(_range));
        if (_reader && _reader_creation_phase == phase) {
            return _reader->fast_forward_to(_range);
        }
        _reader = _cache.create_underlying_reader(_read_context, _cache.snapshot_for_phase(phase), _range);
        _reader_creation_phase = phase;
        return make_ready_future<>();
    }
    utils::phased_barrier::phase_type creation_phase() const {
        assert(_reader);
        return _reader_creation_phase;
    }
    const dht::partition_range& range() const {
        return _range;
    }
    dht::ring_position_view population_range_start() const {
        return _last_key ? dht::ring_position_view::for_after_key(*_last_key)
                         : dht::ring_position_view::for_range_start(_range);
    }
};

// Reader which populates the cache using data from the delegate.
class single_partition_populating_reader final : public mutation_reader::impl {
    row_cache& _cache;
    mutation_reader _delegate;
    lw_shared_ptr<read_context> _read_context;
    bool done = false;
public:
    single_partition_populating_reader(row_cache& cache,
            lw_shared_ptr<read_context> context)
        : _cache(cache)
        , _read_context(std::move(context))
    { }

    virtual future<streamed_mutation_opt> operator()() override {
        if (done) {
            return make_ready_future<streamed_mutation_opt>(streamed_mutation_opt());
        }
        done = true;
        auto src_and_phase = _cache.snapshot_of(_read_context->range().start()->value());
        auto phase = src_and_phase.phase;
        _delegate = _cache.create_underlying_reader(*_read_context, src_and_phase.snapshot, _read_context->range());
        return _delegate().then([this, phase] (auto sm) mutable {
            if (!sm) {
                return make_ready_future<streamed_mutation_opt>(streamed_mutation_opt());
            }
            return mutation_from_streamed_mutation(std::move(sm)).then([this, phase] (mutation_opt&& mo) {
                if (mo) {
                    if (phase == _cache.phase_of(_read_context->range().start()->value())) {
                        _cache.populate(*mo);
                    } else {
                        _cache._tracker.on_mispopulate();
                    }
                    mo->upgrade(_read_context->schema());
                    auto ck_ranges = query::clustering_key_filter_ranges::get_ranges(*_read_context->schema(), _read_context->slice(), mo->key());
                    auto filtered_partition = mutation_partition(std::move(mo->partition()), *(mo->schema()), std::move(ck_ranges));
                    mo->partition() = std::move(filtered_partition);
                    return make_ready_future<streamed_mutation_opt>(streamed_mutation_from_mutation(std::move(*mo), _read_context->fwd()));
                }
                return make_ready_future<streamed_mutation_opt>(streamed_mutation_opt());
            });
        });
    }
};

void cache_tracker::clear_continuity(cache_entry& ce) {
    ce.set_continuous(false);
}

void row_cache::on_hit() {
    _stats.hits.mark();
    _tracker.on_hit();
}

void row_cache::on_miss() {
    _stats.misses.mark();
    _tracker.on_miss();
}
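
// Reader which reads only from the cache, without populating it. For each
// position it returns the cached mutation (if any) together with the entry's
// continuity flag, so the caller knows whether the range up to that entry is
// fully represented in cache.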
class just_cache_scanning_reader final {
    row_cache& _cache;
    row_cache::partitions_type::iterator _it;
    row_cache::partitions_type::iterator _end;
    const dht::partition_range* _range;
    stdx::optional<dht::decorated_key> _last;
    uint64_t _last_reclaim_count;
    size_t _last_modification_count;
    read_context& _read_context;
private:
    void update_iterators() {
        auto cmp = cache_entry::compare(_cache._schema);
        auto update_end = [&] {
            if (_range->end()) {
                if (_range->end()->is_inclusive()) {
                    _end = _cache._partitions.upper_bound(_range->end()->value(), cmp);
                } else {
                    _end = _cache._partitions.lower_bound(_range->end()->value(), cmp);
                }
            } else {
                _end = _cache.partitions_end();
            }
        };

        auto reclaim_count = _cache.get_cache_tracker().region().reclaim_counter();
        auto modification_count = _cache.get_cache_tracker().modification_count();
        if (!_last) {
            if (_range->start()) {
                if (_range->start()->is_inclusive()) {
                    _it = _cache._partitions.lower_bound(_range->start()->value(), cmp);
                } else {
                    _it = _cache._partitions.upper_bound(_range->start()->value(), cmp);
                }
            } else {
                _it = _cache._partitions.begin();
            }
            update_end();
        } else if (reclaim_count != _last_reclaim_count || modification_count != _last_modification_count) {
            _it = _cache._partitions.upper_bound(*_last, cmp);
            update_end();
        }
        _last_reclaim_count = reclaim_count;
        _last_modification_count = modification_count;
    }
public:
    struct cache_data {
        streamed_mutation_opt mut;
        bool continuous;
    };
    just_cache_scanning_reader(row_cache& cache,
            const dht::partition_range& range,
            read_context& ctx)
        : _cache(cache)
        , _range(&range)
        , _read_context(ctx)
    { }
    future<cache_data> operator()() {
        return _cache._read_section(_cache._tracker.region(), [this] {
            return with_linearized_managed_bytes([&] {
                update_iterators();
                if (_it == _end) {
                    return make_ready_future<cache_data>(cache_data { {}, _it->continuous() });
                }
                cache_entry& ce = *_it;
                ++_it;
                _last = ce.key();
                _cache.upgrade_entry(ce);
                _cache._tracker.touch(ce);
                _cache.on_hit();
                cache_data cd { { }, ce.continuous() };
                cd.mut = ce.read(_cache, _read_context);
                return make_ready_future<cache_data>(std::move(cd));
            });
        });
    }
    future<> fast_forward_to(const dht::partition_range& pr) {
        _last = {};
        _range = &pr;
        return make_ready_future<>();
    }
};
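
// Reader which reads from the underlying mutation source and populates the
// cache with the results. Continuity is set only when the reader's creation
// phase still matches the cache's phase for the populated position (see
// can_set_continuity()), so stale reads never mark ranges as continuous.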
class range_populating_reader {
    row_cache& _cache;
    autoupdating_underlying_reader _reader;
    stdx::optional<row_cache::previous_entry_pointer> _last_key;
    read_context& _read_context;
private:
    bool can_set_continuity() const {
        return _last_key && _reader.creation_phase() == _cache.phase_of(_reader.population_range_start());
    }
    void handle_end_of_stream() {
        if (!can_set_continuity()) {
            return;
        }
        if (!_reader.range().end() || !_reader.range().end()->is_inclusive()) {
            cache_entry::compare cmp(_cache._schema);
            auto it = _reader.range().end() ? _cache._partitions.find(_reader.range().end()->value(), cmp)
                                            : std::prev(_cache._partitions.end());
            if (it != _cache._partitions.end()) {
                if (it == _cache._partitions.begin()) {
                    if (!_last_key->_key) {
                        it->set_continuous(true);
                    }
                } else {
                    auto prev = std::prev(it);
                    if (prev->key().equal(*_cache._schema, *_last_key->_key)) {
                        it->set_continuous(true);
                    }
                }
            }
        }
    }
public:
    range_populating_reader(row_cache& cache, read_context& ctx)
        : _cache(cache)
        , _reader(cache, ctx)
        , _read_context(ctx)
    {}

    future<streamed_mutation_opt> operator()() {
        return _reader().then([this] (streamed_mutation_opt smopt) mutable {
            return mutation_from_streamed_mutation(std::move(smopt)).then(
                [this] (mutation_opt&& mo) mutable {
                    if (!mo) {
                        handle_end_of_stream();
                        return make_ready_future<streamed_mutation_opt>();
                    }

                    _cache.on_miss();
                    if (_reader.creation_phase() == _cache.phase_of(mo->decorated_key())) {
                        _cache.populate(*mo, can_set_continuity() ? &*_last_key : nullptr);
                    } else {
                        _cache._tracker.on_mispopulate();
                    }
                    _last_key = mo->decorated_key();

                    mo->upgrade(_read_context.schema());
                    auto ck_ranges = query::clustering_key_filter_ranges::get_ranges(*_read_context.schema(), _read_context.slice(), mo->key());
                    auto filtered_partition = mutation_partition(std::move(mo->partition()), *mo->schema(), std::move(ck_ranges));
                    mo->partition() = std::move(filtered_partition);
                    return make_ready_future<streamed_mutation_opt>(streamed_mutation_from_mutation(std::move(*mo), _read_context.fwd()));
            });
        });
    }

    future<> fast_forward_to(dht::partition_range&& pr) {
        if (!pr.start()) {
            _last_key = row_cache::previous_entry_pointer();
        } else if (!pr.start()->is_inclusive() && pr.start()->value().has_key()) {
            _last_key = pr.start()->value().as_decorated_key();
        } else {
            // Inclusive start bound, cannot set continuity flag.
            _last_key = {};
        }

        return _reader.fast_forward_to(std::move(pr));
    }
};
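
// Combines the two readers above: serves data from the cache where entries
// are continuous, and falls back to the populating reader for the gaps.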
class scanning_and_populating_reader final : public mutation_reader::impl {
    const dht::partition_range* _pr;
    lw_shared_ptr<read_context> _read_context;
    just_cache_scanning_reader _primary_reader;
    range_populating_reader _secondary_reader;
    streamed_mutation_opt _next_primary;
    bool _secondary_in_progress = false;
    bool _first_element = true;
    stdx::optional<dht::decorated_key> _last_key;
private:
    void update_last_key(const streamed_mutation_opt& smopt) {
        if (smopt) {
            _last_key = smopt->decorated_key();
        }
    }

    bool is_inclusive_start_bound(const dht::decorated_key& dk) {
        if (!_first_element) {
            return false;
        }
        return _pr->start() && _pr->start()->is_inclusive() && _pr->start()->value().equal(*_read_context->schema(), dk);
    }

    future<streamed_mutation_opt> read_from_primary() {
        return _primary_reader().then([this] (just_cache_scanning_reader::cache_data cd) {
            auto& smopt = cd.mut;
            if (cd.continuous || (smopt && is_inclusive_start_bound(smopt->decorated_key()))) {
                _first_element = false;
                update_last_key(smopt);
                return make_ready_future<streamed_mutation_opt>(std::move(smopt));
            } else {
                _next_primary = std::move(smopt);

                dht::partition_range secondary_range = { };
                if (!_next_primary) {
                    if (!_last_key) {
                        secondary_range = *_pr;
                    } else {
                        dht::ring_position_comparator cmp(*_read_context->schema());
                        auto&& new_range = _pr->split_after(*_last_key, cmp);
                        if (!new_range) {
                            return make_ready_future<streamed_mutation_opt>();
                        }
                        secondary_range = std::move(*new_range);
                    }
                } else {
                    if (_last_key) {
                        secondary_range = dht::partition_range::make({ *_last_key, false }, { _next_primary->decorated_key(), false });
                    } else {
                        if (!_pr->start()) {
                            secondary_range = dht::partition_range::make_ending_with({ _next_primary->decorated_key(), false });
                        } else {
                            secondary_range = dht::partition_range::make(*_pr->start(), { _next_primary->decorated_key(), false });
                        }
                    }
                }

                _secondary_in_progress = true;
                return _secondary_reader.fast_forward_to(std::move(secondary_range)).then([this] {
                    return read_from_secondary();
                });
            }
        });
    }

    future<streamed_mutation_opt> read_from_secondary() {
        return _secondary_reader().then([this] (streamed_mutation_opt smopt) {
            if (smopt) {
                return smopt;
            } else {
                _secondary_in_progress = false;
                update_last_key(_next_primary);
                return std::move(_next_primary);
            }
        });
    }
public:
    scanning_and_populating_reader(row_cache& cache,
            const dht::partition_range& range,
            lw_shared_ptr<read_context> context)
        : _pr(&range)
        , _read_context(std::move(context))
        , _primary_reader(cache, range, *_read_context)
        , _secondary_reader(cache, *_read_context)
    { }

    future<streamed_mutation_opt> operator()() {
        if (_secondary_in_progress) {
            return read_from_secondary();
        } else {
            return read_from_primary();
        }
    }

    future<> fast_forward_to(const dht::partition_range& pr) {
        _secondary_in_progress = false;
        _first_element = true;
        _pr = &pr;
        return _primary_reader.fast_forward_to(pr);
    }
};

mutation_reader
row_cache::make_scanning_reader(const dht::partition_range& range, lw_shared_ptr<read_context> context) {
    return make_mutation_reader<scanning_and_populating_reader>(*this, range, std::move(context));
}

mutation_reader
row_cache::make_reader(schema_ptr s,
                       const dht::partition_range& range,
                       const query::partition_slice& slice,
                       const io_priority_class& pc,
                       tracing::trace_state_ptr trace_state,
                       streamed_mutation::forwarding fwd,
                       mutation_reader::forwarding fwd_mr)
{
    auto ctx = make_lw_shared<read_context>(*this, std::move(s), range, slice, pc, trace_state, fwd, fwd_mr);

    if (range.is_singular()) {
        const query::ring_position& pos = range.start()->value();

        if (!pos.has_key()) {
            return make_scanning_reader(range, std::move(ctx));
        }

        return _read_section(_tracker.region(), [&] {
            return with_linearized_managed_bytes([&] {
                const dht::decorated_key& dk = pos.as_decorated_key();
                auto i = _partitions.find(dk, cache_entry::compare(_schema));
                if (i != _partitions.end()) {
                    cache_entry& e = *i;
                    _tracker.touch(e);
                    upgrade_entry(e);
                    mutation_reader reader;
                    reader = make_reader_returning(e.read(*this, *ctx));
                    on_hit();
                    return reader;
                } else {
                    on_miss();
                    return make_mutation_reader<single_partition_populating_reader>(*this, std::move(ctx));
                }
            });
        });
    }

    return make_scanning_reader(range, std::move(ctx));
}

row_cache::~row_cache() {
    with_allocator(_tracker.allocator(), [this] {
        _partitions.clear_and_dispose([this, deleter = current_deleter<cache_entry>()] (auto&& p) mutable {
            if (!p->is_dummy_entry()) {
                _tracker.on_erase();
            }
            deleter(p);
        });
    });
}

void row_cache::clear_now() noexcept {
    with_allocator(_tracker.allocator(), [this] {
        auto it = _partitions.erase_and_dispose(_partitions.begin(), partitions_end(), [this, deleter = current_deleter<cache_entry>()] (auto&& p) mutable {
            _tracker.on_erase();
            deleter(p);
        });
        _tracker.clear_continuity(*it);
    });
}

template<typename CreateEntry, typename VisitEntry>
//requires requires(CreateEntry create, VisitEntry visit, row_cache::partitions_type::iterator it) {
//        { create(it) } -> row_cache::partitions_type::iterator;
//        { visit(it) } -> void;
//    }
void row_cache::do_find_or_create_entry(const dht::decorated_key& key,
        const previous_entry_pointer* previous, CreateEntry&& create_entry, VisitEntry&& visit_entry)
{
    with_allocator(_tracker.allocator(), [&] {
        _populate_section(_tracker.region(), [&] {
            with_linearized_managed_bytes([&] {
                auto i = _partitions.lower_bound(key, cache_entry::compare(_schema));
                if (i == _partitions.end() || !i->key().equal(*_schema, key)) {
                    i = create_entry(i);
                } else {
                    visit_entry(i);
                }

                if (!previous) {
                    return;
                }

                if ((!previous->_key && i == _partitions.begin())
                        || (previous->_key && i != _partitions.begin()
                            && std::prev(i)->key().equal(*_schema, *previous->_key))) {
                    i->set_continuous(true);
                }
            });
        });
    });
}

void row_cache::populate(const mutation& m, const previous_entry_pointer* previous) {
    do_find_or_create_entry(m.decorated_key(), previous, [&] (auto i) {
        cache_entry* entry = current_allocator().construct<cache_entry>(
                m.schema(), m.decorated_key(), m.partition());
        upgrade_entry(*entry);
        _tracker.insert(*entry);
        return _partitions.insert(i, *entry);
    }, [&] (auto i) {
        _tracker.touch(*i);
        // We cache whole partitions right now, so if cache already has this partition,
        // it must be complete, so do nothing.
        _tracker.on_miss_already_populated(); // #1534
    });
}
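
// Each update() or invalidate() starts a new phase. The cache keeps the
// snapshot of the underlying source for the current phase and, while an
// update is in progress, for the immediately preceding one; reading from
// any older phase is an error.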
mutation_source& row_cache::snapshot_for_phase(phase_type phase) {
    if (phase == _underlying_phase) {
        return _underlying;
    } else {
        if (phase + 1 < _underlying_phase) {
            throw std::runtime_error(sprint("attempted to read from retired phase {} (current={})", phase, _underlying_phase));
        }
        return *_prev_snapshot;
    }
}

row_cache::snapshot_and_phase row_cache::snapshot_of(dht::ring_position_view pos) {
    dht::ring_position_less_comparator less(*_schema);
    if (!_prev_snapshot_pos || less(pos, *_prev_snapshot_pos)) {
        return {_underlying, _underlying_phase};
    }
    return {*_prev_snapshot, _underlying_phase - 1};
}

row_cache::phase_type row_cache::phase_of(dht::ring_position_view pos) {
    dht::ring_position_less_comparator less(*_schema);
    if (!_prev_snapshot_pos || less(pos, *_prev_snapshot_pos)) {
        return _underlying_phase;
    }
    return _underlying_phase - 1;
}
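
// Merges the memtable into the cache, entry by entry in ring order.
// _prev_snapshot_pos marks how far the conversion to the new phase has
// progressed: positions before it are already served from the new snapshot,
// positions after it still belong to the previous phase (see phase_of()).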
future<> row_cache::update(memtable& m, partition_presence_checker presence_checker) {
    m.on_detach_from_region_group();
    _tracker.region().merge(m); // Now all data in memtable belongs to cache
    auto attr = seastar::thread_attributes();
    attr.scheduling_group = &_update_thread_scheduling_group;
    STAP_PROBE(scylla, row_cache_update_start);
    auto t = seastar::thread(attr, [this, &m, presence_checker = std::move(presence_checker)] {
        auto cleanup = defer([&] {
            with_allocator(_tracker.allocator(), [&m, this] () {
                logalloc::reclaim_lock _(_tracker.region());
                bool blow_cache = false;
                // Note: clear_and_dispose() ought not to look up any keys, so it doesn't require
                // with_linearized_managed_bytes(), but invalidate() does.
                m.partitions.clear_and_dispose([this, deleter = current_deleter<memtable_entry>(), &blow_cache] (memtable_entry* entry) {
                    with_linearized_managed_bytes([&] {
                        try {
                            invalidate_locked(entry->key());
                        } catch (...) {
                            blow_cache = true;
                        }
                        deleter(entry);
                    });
                });
                if (blow_cache) {
                    // We failed to invalidate the key, presumably due to with_linearized_managed_bytes()
                    // running out of memory. Recover using clear_now(), which doesn't throw.
                    clear_now();
                }
            });
        });
        auto permit = get_units(_update_sem, 1).get0();
        ++_underlying_phase;
        _prev_snapshot = std::exchange(_underlying, _snapshot_source());
        _prev_snapshot_pos = dht::ring_position::min();
        auto cleanup_prev_snapshot = defer([this] {
            _prev_snapshot_pos = {};
            _prev_snapshot = {};
        });
        while (!m.partitions.empty()) {
            with_allocator(_tracker.allocator(), [this, &m, &presence_checker] () {
                unsigned quota = 30;
                auto cmp = cache_entry::compare(_schema);
                {
                    _update_section(_tracker.region(), [&] {
                        STAP_PROBE(scylla, row_cache_update_one_batch_start);
                        unsigned quota_before = quota;
                        // FIXME: we should really be checking should_yield() here instead of
                        // need_preempt() + quota. However, should_yield() is currently quite
                        // expensive and we need to amortize it somehow.
                        do {
                            auto i = m.partitions.begin();
                            STAP_PROBE(scylla, row_cache_update_partition_start);
                            with_linearized_managed_bytes([&] {
                                {
                                    memtable_entry& mem_e = *i;
                                    // FIXME: Optimize knowing we lookup in-order.
                                    auto cache_i = _partitions.lower_bound(mem_e.key(), cmp);
                                    // If cache doesn't contain the entry we cannot insert it because the mutation may be incomplete.
                                    // FIXME: keep a bitmap indicating which sstables we do cover, so we don't have to
                                    //        search it.
                                    if (cache_i != partitions_end() && cache_i->key().equal(*_schema, mem_e.key())) {
                                        cache_entry& entry = *cache_i;
                                        upgrade_entry(entry);
                                        entry.partition().apply(*_schema, std::move(mem_e.partition()), *mem_e.schema());
                                        _tracker.touch(entry);
                                        _tracker.on_merge();
                                    } else if (presence_checker(mem_e.key()) ==
                                            partition_presence_checker_result::definitely_doesnt_exist) {
                                        cache_entry* entry = current_allocator().construct<cache_entry>(
                                                mem_e.schema(), std::move(mem_e.key()), std::move(mem_e.partition()));
                                        _tracker.insert(*entry);
                                        _partitions.insert(cache_i, *entry);
                                    } else {
                                        _tracker.clear_continuity(*cache_i);
                                    }
                                    i = m.partitions.erase(i);
                                    current_allocator().destroy(&mem_e);
                                    --quota;
                                }
                            });
                            STAP_PROBE(scylla, row_cache_update_partition_end);
                        } while (!m.partitions.empty() && quota && !need_preempt());
                        with_allocator(standard_allocator(), [&] {
                            if (m.partitions.empty()) {
                                _prev_snapshot_pos = {};
                            } else {
                                _prev_snapshot_pos = m.partitions.begin()->key();
                            }
                        });
                        STAP_PROBE1(scylla, row_cache_update_one_batch_end, quota_before - quota);
                    });
                    if (quota == 0 && seastar::thread::should_yield()) {
                        return;
                    }
                }
            });
            seastar::thread::yield();
        }
    });
    STAP_PROBE(scylla, row_cache_update_end);
    return do_with(std::move(t), [] (seastar::thread& t) {
        return t.join();
    });
}

void row_cache::touch(const dht::decorated_key& dk) {
    _read_section(_tracker.region(), [&] {
        with_linearized_managed_bytes([&] {
            auto i = _partitions.find(dk, cache_entry::compare(_schema));
            if (i != _partitions.end()) {
                _tracker.touch(*i);
            }
        });
    });
}

void row_cache::invalidate_locked(const dht::decorated_key& dk) {
    auto pos = _partitions.lower_bound(dk, cache_entry::compare(_schema));
    if (pos == partitions_end() || !pos->key().equal(*_schema, dk)) {
        _tracker.clear_continuity(*pos);
    } else {
        auto it = _partitions.erase_and_dispose(pos,
            [this, &dk, deleter = current_deleter<cache_entry>()](auto&& p) mutable {
                _tracker.on_erase();
                deleter(p);
            });
        _tracker.clear_continuity(*it);
    }
}

future<> row_cache::invalidate(const dht::decorated_key& dk) {
    return invalidate(dht::partition_range::make_singular(dk));
}

future<> row_cache::invalidate(const dht::partition_range& range) {
    return invalidate(dht::partition_range_vector({range}));
}

future<> row_cache::invalidate(dht::partition_range_vector&& ranges) {
    return get_units(_update_sem, 1).then([this, ranges = std::move(ranges)] (auto permit) mutable {
        _underlying = _snapshot_source();
        ++_underlying_phase;
        auto on_failure = defer([this] { this->clear_now(); });
        with_linearized_managed_bytes([&] {
            for (auto&& range : ranges) {
                this->invalidate_unwrapped(range);
            }
        });
        on_failure.cancel();
    });
}

void row_cache::invalidate_unwrapped(const dht::partition_range& range) {
    logalloc::reclaim_lock _(_tracker.region());

    auto cmp = cache_entry::compare(_schema);
    auto begin = _partitions.begin();
    if (range.start()) {
        if (range.start()->is_inclusive()) {
            begin = _partitions.lower_bound(range.start()->value(), cmp);
        } else {
            begin = _partitions.upper_bound(range.start()->value(), cmp);
        }
    }
    auto end = partitions_end();
    if (range.end()) {
        if (range.end()->is_inclusive()) {
            end = _partitions.upper_bound(range.end()->value(), cmp);
        } else {
            end = _partitions.lower_bound(range.end()->value(), cmp);
        }
    }
    with_allocator(_tracker.allocator(), [this, begin, end] {
        auto it = _partitions.erase_and_dispose(begin, end, [this, deleter = current_deleter<cache_entry>()] (auto&& p) mutable {
            _tracker.on_erase();
            deleter(p);
        });
        assert(it != _partitions.end());
        _tracker.clear_continuity(*it);
    });
}

row_cache::row_cache(schema_ptr s, snapshot_source src, cache_tracker& tracker)
    : _tracker(tracker)
    , _schema(std::move(s))
    , _partitions(cache_entry::compare(_schema))
    , _underlying(src())
    , _snapshot_source(std::move(src))
{
    with_allocator(_tracker.allocator(), [this] {
        cache_entry* entry = current_allocator().construct<cache_entry>(cache_entry::dummy_entry_tag());
        _partitions.insert(*entry);
    });
}

cache_entry::cache_entry(cache_entry&& o) noexcept
    : _schema(std::move(o._schema))
    , _key(std::move(o._key))
    , _pe(std::move(o._pe))
    , _flags(o._flags)
    , _lru_link()
    , _cache_link()
{
    if (o._lru_link.is_linked()) {
        auto prev = o._lru_link.prev_;
        o._lru_link.unlink();
        cache_tracker::lru_type::node_algorithms::link_after(prev, _lru_link.this_ptr());
    }

    {
        using container_type = row_cache::partitions_type;
        container_type::node_algorithms::replace_node(o._cache_link.this_ptr(), _cache_link.this_ptr());
        container_type::node_algorithms::init(o._cache_link.this_ptr());
    }
}

void row_cache::set_schema(schema_ptr new_schema) noexcept {
    _schema = std::move(new_schema);
}

streamed_mutation cache_entry::read(row_cache& rc, read_context& ctx) {
    auto s = ctx.schema();
    auto& slice = ctx.slice();
    auto fwd = ctx.fwd();
    if (_schema->version() != s->version()) {
        auto ck_ranges = query::clustering_key_filter_ranges::get_ranges(*s, slice, _key.key());
        auto mp = mutation_partition(_pe.squashed(_schema, s), *s, std::move(ck_ranges));
        auto m = mutation(s, _key, std::move(mp));
        return streamed_mutation_from_mutation(std::move(m), fwd);
    }
    auto ckr = query::clustering_key_filter_ranges::get_ranges(*s, slice, _key.key());
    auto snp = _pe.read(_schema);
    return make_partition_snapshot_reader(_schema, _key, std::move(ckr), snp, rc._tracker.region(), rc._read_section, { }, fwd);
}

const schema_ptr& row_cache::schema() const {
    return _schema;
}

void row_cache::upgrade_entry(cache_entry& e) {
    if (e._schema != _schema) {
        auto& r = _tracker.region();
        assert(!r.reclaiming_enabled());
        with_allocator(r.allocator(), [this, &e] {
            with_linearized_managed_bytes([&] {
                e.partition().upgrade(e._schema, _schema);
                e._schema = _schema;
            });
        });
    }
}