/*
 * Copyright (C) 2015 ScyllaDB
 */

/*
 * This file is part of Scylla.
 *
 * Scylla is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Scylla is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Scylla. If not, see <http://www.gnu.org/licenses/>.
 */

#include "row_cache.hh"
#include "core/memory.hh"
#include "core/do_with.hh"
#include "core/future-util.hh"
// Note: the names of the angle-bracket includes were lost in extraction; the
// headers below are reconstructed from what this file demonstrably uses.
#include <seastar/core/metrics.hh>  // seastar::metrics, used in setup_metrics()
#include <seastar/util/defer.hh>    // defer(), used in do_update()
#include "memtable.hh"
#include "partition_snapshot_reader.hh"
#include <chrono>                   // std::chrono_literals
#include <sys/sdt.h>                // STAP_PROBE markers
#include "stdx.hh"
#include "read_context.hh"
#include "schema_upgrader.hh"
#include "dirty_memory_manager.hh"
#include "cache_flat_mutation_reader.hh"
#include "real_dirty_memory_accounter.hh"

namespace cache {

logging::logger clogger("cache");

}

using namespace std::chrono_literals;
using namespace cache;

flat_mutation_reader
row_cache::create_underlying_reader(read_context& ctx, mutation_source& src, const dht::partition_range& pr) {
    ctx.on_underlying_created();
    return src.make_reader(_schema, pr, ctx.slice(), ctx.pc(), ctx.trace_state(), streamed_mutation::forwarding::yes);
}

cache_tracker::cache_tracker()
    : _garbage(_region, this)
    , _memtable_cleaner(_region, nullptr)
{
    setup_metrics();

    _region.make_evictable([this] {
        return with_allocator(_region.allocator(), [this] {
            // Removing a partition may require reading large keys when we rebalance
            // the rbtree, so linearize anything we read.
            return with_linearized_managed_bytes([&] {
                try {
                    if (!_garbage.empty()) {
                        _garbage.clear_some();
                        return memory::reclaiming_result::reclaimed_something;
                    }
                    if (!_memtable_cleaner.empty()) {
                        _memtable_cleaner.clear_some();
                        return memory::reclaiming_result::reclaimed_something;
                    }
                    if (_lru.empty()) {
                        return memory::reclaiming_result::reclaimed_nothing;
                    }
                    _lru.back().on_evicted(*this);
                    return memory::reclaiming_result::reclaimed_something;
                } catch (std::bad_alloc&) {
                    // Bad luck, linearization during partition removal caused us to
                    // fail. Drop the entire cache so we can make forward progress.
                    clear();
                    return memory::reclaiming_result::reclaimed_something;
                }
            });
        });
    });
}

cache_tracker::~cache_tracker() {
    clear();
}

void cache_tracker::set_compaction_scheduling_group(seastar::scheduling_group sg) {
    _memtable_cleaner.set_scheduling_group(sg);
    _garbage.set_scheduling_group(sg);
}

void
cache_tracker::setup_metrics() {
    namespace sm = seastar::metrics;
    _metrics.add_group("cache", {
        sm::make_gauge("bytes_used", sm::description("current bytes used by the cache out of the total size of memory"),
            [this] { return _region.occupancy().used_space(); }),
        sm::make_gauge("bytes_total", sm::description("total size of memory for the cache"),
            [this] { return _region.occupancy().total_space(); }),
        sm::make_derive("partition_hits", sm::description("number of partitions needed by reads and found in cache"),
            _stats.partition_hits),
        sm::make_derive("partition_misses", sm::description("number of partitions needed by reads and missing in cache"),
            _stats.partition_misses),
        sm::make_derive("partition_insertions", sm::description("total number of partitions added to cache"),
            _stats.partition_insertions),
        sm::make_derive("row_hits", sm::description("total number of rows needed by reads and found in cache"),
            _stats.row_hits),
        sm::make_derive("row_misses", sm::description("total number of rows needed by reads and missing in cache"),
            _stats.row_misses),
        sm::make_derive("row_insertions", sm::description("total number of rows added to cache"),
            _stats.row_insertions),
        sm::make_derive("row_evictions", sm::description("total number of rows evicted from cache"),
            _stats.row_evictions),
        sm::make_derive("row_removals", sm::description("total number of invalidated rows"),
            _stats.row_removals),
        sm::make_derive("static_row_insertions", sm::description("total number of static rows added to cache"),
            _stats.static_row_insertions),
        sm::make_derive("concurrent_misses_same_key", sm::description("total number of operations with misses on the same key"),
            _stats.concurrent_misses_same_key),
        sm::make_derive("partition_merges", sm::description("total number of partitions merged"),
            _stats.partition_merges),
        sm::make_derive("partition_evictions", sm::description("total number of evicted partitions"),
            _stats.partition_evictions),
        sm::make_derive("partition_removals", sm::description("total number of invalidated partitions"),
            _stats.partition_removals),
        sm::make_derive("mispopulations", sm::description("number of entries not inserted by reads"),
            _stats.mispopulations),
        sm::make_gauge("partitions", sm::description("total number of cached partitions"),
            _stats.partitions),
        sm::make_gauge("rows", sm::description("total number of cached rows"),
            _stats.rows),
        sm::make_derive("reads", sm::description("number of started reads"),
            _stats.reads),
        sm::make_derive("reads_with_misses", sm::description("number of reads which had to read from sstables"),
            _stats.reads_with_misses),
        sm::make_gauge("active_reads", sm::description("number of currently active reads"),
            [this] { return _stats.active_reads(); }),
        sm::make_derive("sstable_reader_recreations", sm::description("number of times sstable reader was recreated due to memtable flush"),
            _stats.underlying_recreations),
        sm::make_derive("sstable_partition_skips", sm::description("number of times sstable reader was fast forwarded across partitions"),
            _stats.underlying_partition_skips),
        sm::make_derive("sstable_row_skips", sm::description("number of times sstable reader was fast forwarded within a partition"),
            _stats.underlying_row_skips),
        sm::make_derive("pinned_dirty_memory_overload", sm::description("amount of "
            "pinned bytes that we tried to unpin over the limit. This should sit constantly at 0, "
            "and any number other than 0 is indicative of a bug"),
            _stats.pinned_dirty_memory_overload),
        sm::make_derive("rows_processed_from_memtable", _stats.rows_processed_from_memtable,
            sm::description("total number of rows in memtables which were processed during cache update on memtable flush")),
        sm::make_derive("rows_dropped_from_memtable", _stats.rows_dropped_from_memtable,
            sm::description("total number of rows in memtables which were dropped during cache update on memtable flush")),
        sm::make_derive("rows_merged_from_memtable", _stats.rows_merged_from_memtable,
            sm::description("total number of rows in memtables which were merged with existing rows during cache update on memtable flush")),
    });
}

void cache_tracker::clear() {
    auto partitions_before = _stats.partitions;
    auto rows_before = _stats.rows;
    // We need to clear garbage first, because garbage versions cannot be evicted from:
    // mutation_partition::clear_gently() destroys intrusive tree invariants.
    with_allocator(_region.allocator(), [this] {
        _garbage.clear();
        _memtable_cleaner.clear();
        while (!_lru.empty()) {
            _lru.back().on_evicted(*this);
        }
    });
    _stats.partition_removals += partitions_before;
    _stats.row_removals += rows_before;
    allocator().invalidate_references();
}

void cache_tracker::touch(rows_entry& e) {
    if (e._lru_link.is_linked()) { // the last dummy may not be linked if evicted
        _lru.erase(_lru.iterator_to(e));
    }
    _lru.push_front(e);
}

void cache_tracker::insert(cache_entry& entry) {
    insert(entry.partition());
    ++_stats.partition_insertions;
    ++_stats.partitions;
    // partition_range_cursor depends on this to detect invalidation of _end
    _region.allocator().invalidate_references();
}

void cache_tracker::on_partition_erase() {
    --_stats.partitions;
    ++_stats.partition_removals;
    allocator().invalidate_references();
}

void cache_tracker::unlink(rows_entry& row) noexcept {
    row._lru_link.unlink();
}

void cache_tracker::on_partition_merge() {
    ++_stats.partition_merges;
}

void cache_tracker::on_partition_hit() {
    ++_stats.partition_hits;
}

void cache_tracker::on_partition_miss() {
    ++_stats.partition_misses;
}

void cache_tracker::on_partition_eviction() {
    --_stats.partitions;
    ++_stats.partition_evictions;
}

void cache_tracker::on_row_eviction() {
    --_stats.rows;
    ++_stats.row_evictions;
}

void cache_tracker::on_row_hit() {
    ++_stats.row_hits;
}

void cache_tracker::on_row_miss() {
    ++_stats.row_misses;
}

void cache_tracker::on_mispopulate() {
    ++_stats.mispopulations;
}

void cache_tracker::on_miss_already_populated() {
    ++_stats.concurrent_misses_same_key;
}

void cache_tracker::pinned_dirty_memory_overload(uint64_t bytes) {
    _stats.pinned_dirty_memory_overload += bytes;
}

allocation_strategy& cache_tracker::allocator() {
    return _region.allocator();
}

logalloc::region& cache_tracker::region() {
    return _region;
}

const logalloc::region& cache_tracker::region() const {
    return _region;
}

// Stable cursor over partition entries from a given range.
//
// Must be accessed with the reclaim lock held on the cache region.
// The position of the cursor is always valid, but the cache entry reference
// is not always valid. It remains valid as long as the iterators
// into _cache._partitions remain valid. The cache entry reference can be
// brought back to validity by calling refresh().
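//
// A sketch of the intended call pattern (process() is a hypothetical
// consumer, not part of this file):
//
//   partition_range_cursor cur(cache, range);
//   cur.refresh();                 // bring the entry reference to validity
//   while (cur.in_range()) {
//       process(cur.entry());
//       cur.next();
//       cur.refresh();             // reclaim may have invalidated iterators
//   }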
//
class partition_range_cursor final {
    std::reference_wrapper<row_cache> _cache;
    row_cache::partitions_type::iterator _it;
    row_cache::partitions_type::iterator _end;
    dht::ring_position_view _start_pos;
    dht::ring_position_view _end_pos;
    stdx::optional<dht::decorated_key> _last;
    uint64_t _last_reclaim_count;
private:
    void set_position(cache_entry& e) {
        // FIXME: make ring_position_view convertible to ring_position, so we can use e.position()
        if (e.is_dummy_entry()) {
            _last = {};
            _start_pos = dht::ring_position_view::max();
        } else {
            _last = e.key();
            _start_pos = dht::ring_position_view(*_last);
        }
    }
public:
    // Creates a cursor positioned at the lower bound of the range.
    // The cache entry reference is not valid.
    // The range reference must remain live as long as this instance is used.
    partition_range_cursor(row_cache& cache, const dht::partition_range& range)
        : _cache(cache)
        , _start_pos(dht::ring_position_view::for_range_start(range))
        , _end_pos(dht::ring_position_view::for_range_end(range))
        , _last_reclaim_count(std::numeric_limits<uint64_t>::max())
    { }

    // Returns true iff the cursor is valid.
    bool valid() const {
        return _cache.get().get_cache_tracker().allocator().invalidate_counter() == _last_reclaim_count;
    }

    // Repositions the cursor to the first entry with position >= pos.
    // Returns true iff the position of the cursor is equal to pos.
    // Can be called on an invalid cursor, in which case it brings it back to validity.
    // Strong exception guarantees.
    bool advance_to(dht::ring_position_view pos) {
        auto cmp = cache_entry::compare(_cache.get()._schema);
        if (cmp(_end_pos, pos)) { // next() may have moved _start_pos past the _end_pos.
            _end_pos = pos;
        }
        _end = _cache.get()._partitions.lower_bound(_end_pos, cmp);
        _it = _cache.get()._partitions.lower_bound(pos, cmp);
        auto same = !cmp(pos, _it->position());
        set_position(*_it);
        _last_reclaim_count = _cache.get().get_cache_tracker().allocator().invalidate_counter();
        return same;
    }

    // Ensures that the cache entry reference is valid.
    // The cursor will point at the first entry with position >= the current position.
    // Returns true if and only if the position of the cursor did not change.
    // Strong exception guarantees.
    bool refresh() {
        if (valid()) {
            return true;
        }
        return advance_to(_start_pos);
    }

    // Positions the cursor at the next entry.
    // May advance past the requested range. Use in_range() after the call to determine that.
    // Call only when in_range() and the cache entry reference is valid.
    // Strong exception guarantees.
    void next() {
        auto next = std::next(_it);
        set_position(*next);
        _it = std::move(next);
    }

    // Valid only after refresh() and before _cache._partitions iterators are invalidated.
    // Points inside the requested range if in_range().
    cache_entry& entry() {
        return *_it;
    }

    // Call only when the cache entry reference is valid.
    bool in_range() {
        return _it != _end;
    }

    // Returns the current position of the cursor.
    // Result valid as long as this instance is valid and not advanced.
    dht::ring_position_view position() const {
        return _start_pos;
    }
};

future<> read_context::create_underlying(bool skip_first_fragment, db::timeout_clock::time_point timeout) {
    if (_range_query) {
        // FIXME: Singular-range mutation readers don't support fast_forward_to(), so need to use a wide range
        // here in case the same reader will need to be fast forwarded later.
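        // The range below anchors both bounds at the same key: it behaves like
        // a singular range for the initial read, but, unlike make_singular(),
        // the resulting reader still accepts fast_forward_to().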
        _sm_range = dht::partition_range({dht::ring_position(*_key)}, {dht::ring_position(*_key)});
    } else {
        _sm_range = dht::partition_range::make_singular({dht::ring_position(*_key)});
    }
    return _underlying.fast_forward_to(std::move(_sm_range), *_underlying_snapshot, _phase, timeout).then([this, skip_first_fragment, timeout] {
        _underlying_snapshot = {};
        if (skip_first_fragment) {
            return _underlying.underlying()(timeout).then([] (auto&& mf) {});
        } else {
            return make_ready_future<>();
        }
    });
}

static flat_mutation_reader read_directly_from_underlying(read_context& reader) {
    flat_mutation_reader res = make_delegating_reader(reader.underlying().underlying());
    if (reader.schema()->version() != reader.underlying().underlying().schema()->version()) {
        res = transform(std::move(res), schema_upgrader(reader.schema()));
    }
    if (reader.fwd() == streamed_mutation::forwarding::no) {
        res = make_nonforwardable(std::move(res), true);
    }
    return std::move(res);
}

// Reader which populates the cache using data from the delegate.
class single_partition_populating_reader final : public flat_mutation_reader::impl {
    row_cache& _cache;
    lw_shared_ptr<read_context> _read_context;
    flat_mutation_reader_opt _reader;
private:
    future<> create_reader(db::timeout_clock::time_point timeout) {
        auto src_and_phase = _cache.snapshot_of(_read_context->range().start()->value());
        auto phase = src_and_phase.phase;
        _read_context->enter_partition(_read_context->range().start()->value().as_decorated_key(), src_and_phase.snapshot, phase);
        return _read_context->create_underlying(false, timeout).then([this, phase, timeout] {
            return _read_context->underlying().underlying()(timeout).then([this, phase] (auto&& mfopt) {
                if (!mfopt) {
                    if (phase == _cache.phase_of(_read_context->range().start()->value())) {
                        _cache._read_section(_cache._tracker.region(), [this] {
                            with_allocator(_cache._tracker.allocator(), [this] {
                                dht::decorated_key dk = _read_context->range().start()->value().as_decorated_key();
                                _cache.do_find_or_create_entry(dk, nullptr, [&] (auto i) {
                                    mutation_partition mp(_cache._schema);
                                    cache_entry* entry = current_allocator().construct<cache_entry>(
                                        _cache._schema, std::move(dk), std::move(mp));
                                    _cache._tracker.insert(*entry);
                                    entry->set_continuous(i->continuous());
                                    return _cache._partitions.insert_before(i, *entry);
                                }, [&] (auto i) {
                                    _cache._tracker.on_miss_already_populated();
                                });
                            });
                        });
                    } else {
                        _cache._tracker.on_mispopulate();
                    }
                    _end_of_stream = true;
                } else if (phase == _cache.phase_of(_read_context->range().start()->value())) {
                    _reader = _cache._read_section(_cache._tracker.region(), [&] {
                        cache_entry& e = _cache.find_or_create(mfopt->as_partition_start().key(),
                            mfopt->as_partition_start().partition_tombstone(), phase);
                        return e.read(_cache, *_read_context, phase);
                    });
                } else {
                    _cache._tracker.on_mispopulate();
                    _reader = read_directly_from_underlying(*_read_context);
                    this->push_mutation_fragment(std::move(*mfopt));
                }
            });
        });
    }
public:
    single_partition_populating_reader(row_cache& cache, lw_shared_ptr<read_context> context)
        : impl(context->schema())
        , _cache(cache)
        , _read_context(std::move(context))
    { }

    virtual future<> fill_buffer(db::timeout_clock::time_point timeout) override {
        if (!_reader) {
            return create_reader(timeout).then([this, timeout] {
                if (_end_of_stream) {
                    return make_ready_future<>();
                }
                return fill_buffer(timeout);
            });
        }
        return do_until([this] { return is_end_of_stream() || is_buffer_full(); }, [this, timeout] {
            return fill_buffer_from(*_reader, timeout).then([this] (bool reader_finished) {
                if (reader_finished) {
                    _end_of_stream = true;
                }
            });
        });
    }
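    // This reader emits at most one partition, so advancing to the next
    // partition simply drops anything buffered and ends the stream (it is a
    // no-op if the underlying reader was never created).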
    virtual void next_partition() override {
        if (_reader) {
            clear_buffer();
            _end_of_stream = true;
        }
    }

    virtual future<> fast_forward_to(const dht::partition_range&, db::timeout_clock::time_point timeout) override {
        clear_buffer();
        _end_of_stream = true;
        return make_ready_future<>();
    }

    virtual future<> fast_forward_to(position_range pr, db::timeout_clock::time_point timeout) override {
        if (!_reader) {
            _end_of_stream = true;
            return make_ready_future<>();
        }
        assert(bool(_read_context->fwd()));
        _end_of_stream = false;
        forward_buffer_to(pr.start());
        return _reader->fast_forward_to(std::move(pr), timeout);
    }

    virtual size_t buffer_size() const override {
        if (_reader) {
            return flat_mutation_reader::impl::buffer_size() + _reader->buffer_size();
        }
        return flat_mutation_reader::impl::buffer_size();
    }
};

void cache_tracker::clear_continuity(cache_entry& ce) {
    ce.set_continuous(false);
}

void row_cache::on_partition_hit() {
    _tracker.on_partition_hit();
}

void row_cache::on_partition_miss() {
    _tracker.on_partition_miss();
}

void row_cache::on_row_hit() {
    _stats.hits.mark();
    _tracker.on_row_hit();
}

void row_cache::on_mispopulate() {
    _tracker.on_mispopulate();
}

void row_cache::on_row_miss() {
    _stats.misses.mark();
    _tracker.on_row_miss();
}

void row_cache::on_static_row_insert() {
    ++_tracker._stats.static_row_insertions;
}

class range_populating_reader {
    row_cache& _cache;
    autoupdating_underlying_reader& _reader;
    stdx::optional<row_cache::previous_entry_pointer> _last_key;
    read_context& _read_context;
private:
    bool can_set_continuity() const {
        return _last_key && _reader.creation_phase() == _cache.phase_of(_reader.population_range_start());
    }

    void handle_end_of_stream() {
        if (!can_set_continuity()) {
            _cache.on_mispopulate();
            return;
        }
        if (!_reader.range().end() || !_reader.range().end()->is_inclusive()) {
            cache_entry::compare cmp(_cache._schema);
            auto it = _reader.range().end() ? _cache._partitions.find(_reader.range().end()->value(), cmp)
                                            : std::prev(_cache._partitions.end());
            if (it != _cache._partitions.end()) {
                if (it == _cache._partitions.begin()) {
                    if (!_last_key->_key) {
                        it->set_continuous(true);
                    } else {
                        _cache.on_mispopulate();
                    }
                } else {
                    auto prev = std::prev(it);
                    if (prev->key().equal(*_cache._schema, *_last_key->_key)) {
                        it->set_continuous(true);
                    } else {
                        _cache.on_mispopulate();
                    }
                }
            }
        }
    }
public:
    range_populating_reader(row_cache& cache, read_context& ctx)
        : _cache(cache)
        , _reader(ctx.underlying())
        , _read_context(ctx)
    { }
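    // Produces a reader over the next partition in the population range,
    // together with the already-consumed partition_start fragment in the case
    // where the data has to be read directly from the underlying source (the
    // caller must re-emit that fragment before draining the reader).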
    future<flat_mutation_reader_opt, mutation_fragment_opt> operator()(db::timeout_clock::time_point timeout) {
        return _reader.move_to_next_partition(timeout).then([this] (auto&& mfopt) mutable {
            if (!mfopt) {
                this->handle_end_of_stream();
                return make_ready_future<flat_mutation_reader_opt, mutation_fragment_opt>(stdx::nullopt, stdx::nullopt);
            }
            _cache.on_partition_miss();
            const partition_start& ps = mfopt->as_partition_start();
            const dht::decorated_key& key = ps.key();
            if (_reader.creation_phase() == _cache.phase_of(key)) {
                return _cache._read_section(_cache._tracker.region(), [&] {
                    cache_entry& e = _cache.find_or_create(key, ps.partition_tombstone(), _reader.creation_phase(),
                        this->can_set_continuity() ? &*_last_key : nullptr);
                    _last_key = row_cache::previous_entry_pointer(key);
                    return make_ready_future<flat_mutation_reader_opt, mutation_fragment_opt>(
                        e.read(_cache, _read_context, _reader.creation_phase()), stdx::nullopt);
                });
            } else {
                _cache._tracker.on_mispopulate();
                _last_key = row_cache::previous_entry_pointer(key);
                return make_ready_future<flat_mutation_reader_opt, mutation_fragment_opt>(
                    read_directly_from_underlying(_read_context), std::move(mfopt));
            }
        });
    }

    future<> fast_forward_to(dht::partition_range&& pr, db::timeout_clock::time_point timeout) {
        if (!pr.start()) {
            _last_key = row_cache::previous_entry_pointer();
        } else if (!pr.start()->is_inclusive() && pr.start()->value().has_key()) {
            _last_key = row_cache::previous_entry_pointer(pr.start()->value().as_decorated_key());
        } else {
            // Inclusive start bound, cannot set continuity flag.
            _last_key = {};
        }
        return _reader.fast_forward_to(std::move(pr), timeout);
    }
};

class scanning_and_populating_reader final : public flat_mutation_reader::impl {
    const dht::partition_range* _pr;
    row_cache& _cache;
    lw_shared_ptr<read_context> _read_context;
    partition_range_cursor _primary;
    range_populating_reader _secondary_reader;
    bool _secondary_in_progress = false;
    bool _advance_primary = false;
    stdx::optional<dht::partition_range::bound> _lower_bound;
    dht::partition_range _secondary_range;
    flat_mutation_reader_opt _reader;
private:
    flat_mutation_reader read_from_entry(cache_entry& ce) {
        _cache.upgrade_entry(ce);
        _cache.on_partition_hit();
        return ce.read(_cache, *_read_context);
    }

    static dht::ring_position_view as_ring_position_view(const stdx::optional<dht::partition_range::bound>& lower_bound) {
        return lower_bound ? dht::ring_position_view(lower_bound->value(),
                                 dht::ring_position_view::after_key(!lower_bound->is_inclusive()))
                           : dht::ring_position_view::min();
    }
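    // The scan alternates between two sources: the "primary" is the cache
    // itself, used while entries are marked continuous; the "secondary" is the
    // underlying mutation source, used (and populated from) wherever cache
    // continuity is not established.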
    flat_mutation_reader_opt do_read_from_primary(db::timeout_clock::time_point timeout) {
        return _cache._read_section(_cache._tracker.region(), [this] {
            return with_linearized_managed_bytes([&] () -> flat_mutation_reader_opt {
                bool not_moved = true;
                if (!_primary.valid()) {
                    not_moved = _primary.advance_to(as_ring_position_view(_lower_bound));
                }
                if (_advance_primary && not_moved) {
                    _primary.next();
                    not_moved = false;
                }
                _advance_primary = false;
                if (not_moved || _primary.entry().continuous()) {
                    if (!_primary.in_range()) {
                        return stdx::nullopt;
                    }
                    cache_entry& e = _primary.entry();
                    auto fr = read_from_entry(e);
                    _lower_bound = dht::partition_range::bound{e.key(), false};
                    // Delay the call to next() so that we don't see stale continuity on the next invocation.
                    _advance_primary = true;
                    return flat_mutation_reader_opt(std::move(fr));
                } else {
                    if (_primary.in_range()) {
                        cache_entry& e = _primary.entry();
                        _secondary_range = dht::partition_range(_lower_bound,
                            dht::partition_range::bound{e.key(), false});
                        _lower_bound = dht::partition_range::bound{e.key(), true};
                        _secondary_in_progress = true;
                        return stdx::nullopt;
                    } else {
                        dht::ring_position_comparator cmp(*_read_context->schema());
                        auto range = _pr->trim_front(stdx::optional<dht::partition_range::bound>(_lower_bound), cmp);
                        if (!range) {
                            return stdx::nullopt;
                        }
                        _lower_bound = dht::partition_range::bound{dht::ring_position::max()};
                        _secondary_range = std::move(*range);
                        _secondary_in_progress = true;
                        return stdx::nullopt;
                    }
                }
            });
        });
    }

    future<flat_mutation_reader_opt> read_from_primary(db::timeout_clock::time_point timeout) {
        auto fro = do_read_from_primary(timeout);
        if (!_secondary_in_progress) {
            return make_ready_future<flat_mutation_reader_opt>(std::move(fro));
        }
        return _secondary_reader.fast_forward_to(std::move(_secondary_range), timeout).then([this, timeout] {
            return read_from_secondary(timeout);
        });
    }

    future<flat_mutation_reader_opt> read_from_secondary(db::timeout_clock::time_point timeout) {
        return _secondary_reader(timeout).then([this, timeout] (flat_mutation_reader_opt fropt, mutation_fragment_opt ps) {
            if (fropt) {
                if (ps) {
                    push_mutation_fragment(std::move(*ps));
                }
                return make_ready_future<flat_mutation_reader_opt>(std::move(fropt));
            } else {
                _secondary_in_progress = false;
                return read_from_primary(timeout);
            }
        });
    }

    future<> read_next_partition(db::timeout_clock::time_point timeout) {
        return (_secondary_in_progress ? read_from_secondary(timeout) : read_from_primary(timeout)).then([this] (auto&& fropt) {
            if (bool(fropt)) {
                _reader = std::move(fropt);
            } else {
                _end_of_stream = true;
            }
        });
    }

    void on_end_of_stream() {
        if (_read_context->fwd() == streamed_mutation::forwarding::yes) {
            _end_of_stream = true;
        } else {
            _reader = {};
        }
    }
public:
    scanning_and_populating_reader(row_cache& cache, const dht::partition_range& range, lw_shared_ptr<read_context> context)
        : impl(context->schema())
        , _pr(&range)
        , _cache(cache)
        , _read_context(std::move(context))
        , _primary(cache, range)
        , _secondary_reader(cache, *_read_context)
        , _lower_bound(range.start())
    { }

    virtual future<> fill_buffer(db::timeout_clock::time_point timeout) override {
        return do_until([this] { return is_end_of_stream() || is_buffer_full(); }, [this, timeout] {
            if (!_reader) {
                return read_next_partition(timeout);
            } else {
                return fill_buffer_from(*_reader, timeout).then([this] (bool reader_finished) {
                    if (reader_finished) {
                        on_end_of_stream();
                    }
                });
            }
        });
    }

    virtual void next_partition() override {
        if (_read_context->fwd() == streamed_mutation::forwarding::yes) {
            if (_reader) {
                clear_buffer();
                _reader->next_partition();
                _end_of_stream = false;
            }
        } else {
            clear_buffer_to_next_partition();
            if (_reader && is_buffer_empty()) {
                _reader->next_partition();
            }
        }
    }

    virtual future<> fast_forward_to(const dht::partition_range& pr, db::timeout_clock::time_point timeout) override {
        clear_buffer();
        _reader = {};
        _end_of_stream = false;
        _secondary_in_progress = false;
        _advance_primary = false;
        _pr = &pr;
        _primary = partition_range_cursor{_cache, pr};
        _lower_bound = pr.start();
        return make_ready_future<>();
    }

    virtual future<> fast_forward_to(position_range cr, db::timeout_clock::time_point timeout) override {
        forward_buffer_to(cr.start());
        if (_reader) {
            _end_of_stream = false;
            return _reader->fast_forward_to(std::move(cr), timeout);
        } else {
            _end_of_stream = true;
            return make_ready_future<>();
        }
    }
    virtual size_t buffer_size() const override {
        if (_reader) {
            return flat_mutation_reader::impl::buffer_size() + _reader->buffer_size();
        }
        return flat_mutation_reader::impl::buffer_size();
    }
};

flat_mutation_reader
row_cache::make_scanning_reader(const dht::partition_range& range, lw_shared_ptr<read_context> context) {
    return make_flat_mutation_reader<scanning_and_populating_reader>(*this, range, std::move(context));
}

flat_mutation_reader
row_cache::make_reader(schema_ptr s,
                       const dht::partition_range& range,
                       const query::partition_slice& slice,
                       const io_priority_class& pc,
                       tracing::trace_state_ptr trace_state,
                       streamed_mutation::forwarding fwd,
                       mutation_reader::forwarding fwd_mr)
{
    auto ctx = make_lw_shared<read_context>(*this, s, range, slice, pc, trace_state, fwd, fwd_mr);

    if (!ctx->is_range_query()) {
        return _read_section(_tracker.region(), [&] {
          return with_linearized_managed_bytes([&] {
            cache_entry::compare cmp(_schema);
            auto&& pos = ctx->range().start()->value();
            auto i = _partitions.lower_bound(pos, cmp);
            if (i != _partitions.end() && !cmp(pos, i->position())) {
                cache_entry& e = *i;
                upgrade_entry(e);
                on_partition_hit();
                return e.read(*this, *ctx);
            } else if (i->continuous()) {
                return make_empty_flat_reader(std::move(s));
            } else {
                on_partition_miss();
                return make_flat_mutation_reader<single_partition_populating_reader>(*this, std::move(ctx));
            }
          });
        });
    }

    return make_scanning_reader(range, std::move(ctx));
}

row_cache::~row_cache() {
    with_allocator(_tracker.allocator(), [this] {
        _partitions.clear_and_dispose([this, deleter = current_deleter<cache_entry>()] (auto&& p) mutable {
            if (!p->is_dummy_entry()) {
                _tracker.on_partition_erase();
            }
            p->evict(_tracker);
            deleter(p);
        });
    });
}

void row_cache::clear_now() noexcept {
    with_allocator(_tracker.allocator(), [this] {
        auto it = _partitions.erase_and_dispose(_partitions.begin(), partitions_end(),
            [this, deleter = current_deleter<cache_entry>()] (auto&& p) mutable {
                _tracker.on_partition_erase();
                p->evict(_tracker);
                deleter(p);
            });
        _tracker.clear_continuity(*it);
    });
}

template<typename CreateEntry, typename VisitEntry>
//requires requires(CreateEntry create, VisitEntry visit, row_cache::partitions_type::iterator it) {
//        { create(it) } -> row_cache::partitions_type::iterator;
//        { visit(it) } -> void;
//    }
cache_entry& row_cache::do_find_or_create_entry(const dht::decorated_key& key,
    const previous_entry_pointer* previous, CreateEntry&& create_entry, VisitEntry&& visit_entry)
{
    return with_allocator(_tracker.allocator(), [&] () -> cache_entry& {
        return with_linearized_managed_bytes([&] () -> cache_entry& {
            auto i = _partitions.lower_bound(key, cache_entry::compare(_schema));
            if (i == _partitions.end() || !i->key().equal(*_schema, key)) {
                i = create_entry(i);
            } else {
                visit_entry(i);
            }

            if (!previous) {
                return *i;
            }

            if ((!previous->_key && i == _partitions.begin())
                || (previous->_key && i != _partitions.begin()
                    && std::prev(i)->key().equal(*_schema, *previous->_key))) {
                i->set_continuous(true);
            } else {
                on_mispopulate();
            }

            return *i;
        });
    });
}

cache_entry& row_cache::find_or_create(const dht::decorated_key& key, tombstone t, row_cache::phase_type phase, const previous_entry_pointer* previous) {
    return do_find_or_create_entry(key, previous, [&] (auto i) { // create
        auto entry = current_allocator().construct<cache_entry>(cache_entry::incomplete_tag{}, _schema, key, t);
        _tracker.insert(*entry);
        return _partitions.insert_before(i, *entry);
    }, [&] (auto i) { // visit
        _tracker.on_miss_already_populated();
        cache_entry& e = *i;
        e.partition().open_version(*e.schema(), &_tracker, phase).partition().apply(t);
        upgrade_entry(e);
    });
}

void row_cache::populate(const mutation& m, const previous_entry_pointer* previous) {
    _populate_section(_tracker.region(), [&] {
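        // populate() is only called for keys that a read found to be missing,
        // so the entry must not be present yet; hitting the visit branch below
        // is a programming error, hence the throw.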
        do_find_or_create_entry(m.decorated_key(), previous, [&] (auto i) {
            cache_entry* entry = current_allocator().construct<cache_entry>(
                m.schema(), m.decorated_key(), m.partition());
            _tracker.insert(*entry);
            entry->set_continuous(i->continuous());
            i = _partitions.insert_before(i, *entry);
            upgrade_entry(*i);
            return i;
        }, [&] (auto i) {
            throw std::runtime_error(format("cache already contains entry for {}", m.key()));
        });
    });
}

mutation_source& row_cache::snapshot_for_phase(phase_type phase) {
    if (phase == _underlying_phase) {
        return _underlying;
    } else {
        if (phase + 1 < _underlying_phase) {
            throw std::runtime_error(format("attempted to read from retired phase {} (current={})", phase, _underlying_phase));
        }
        return *_prev_snapshot;
    }
}

row_cache::snapshot_and_phase row_cache::snapshot_of(dht::ring_position_view pos) {
    dht::ring_position_less_comparator less(*_schema);
    if (!_prev_snapshot_pos || less(pos, *_prev_snapshot_pos)) {
        return {_underlying, _underlying_phase};
    }
    return {*_prev_snapshot, _underlying_phase - 1};
}

void row_cache::invalidate_sync(memtable& m) noexcept {
    with_allocator(_tracker.allocator(), [&m, this] () {
        logalloc::reclaim_lock _(_tracker.region());
        bool blow_cache = false;
        // Note: clear_and_dispose() ought not to look up any keys, so it doesn't require
        // with_linearized_managed_bytes(), but invalidate() does.
        m.partitions.clear_and_dispose([this, deleter = current_deleter<memtable_entry>(), &blow_cache] (memtable_entry* entry) {
            with_linearized_managed_bytes([&] {
                try {
                    invalidate_locked(entry->key());
                } catch (...) {
                    blow_cache = true;
                }
                entry->partition().evict(_tracker.memtable_cleaner());
                deleter(entry);
            });
        });
        if (blow_cache) {
            // We failed to invalidate the key, presumably due to with_linearized_managed_bytes()
            // running out of memory. Recover using clear_now(), which doesn't throw.
            clear_now();
        }
    });
}

row_cache::phase_type row_cache::phase_of(dht::ring_position_view pos) {
    dht::ring_position_less_comparator less(*_schema);
    if (!_prev_snapshot_pos || less(pos, *_prev_snapshot_pos)) {
        return _underlying_phase;
    }
    return _underlying_phase - 1;
}

template <typename Updater>
future<> row_cache::do_update(external_updater eu, memtable& m, Updater updater) {
    return do_update(std::move(eu), [this, &m, updater = std::move(updater)] {
        real_dirty_memory_accounter real_dirty_acc(m, _tracker);
        m.on_detach_from_region_group();
        _tracker.region().merge(m); // Now all data in the memtable belongs to the cache
        _tracker.memtable_cleaner().merge(m._cleaner);
        STAP_PROBE(scylla, row_cache_update_start);
        auto cleanup = defer([&m, this] {
            invalidate_sync(m);
            STAP_PROBE(scylla, row_cache_update_end);
        });

        return seastar::async([this, &m, updater = std::move(updater), real_dirty_acc = std::move(real_dirty_acc)] () mutable {
            coroutine update;
            size_t size_entry;
            // In case the updater fails, we must bring the cache to consistency without deferring.
            auto cleanup = defer([&m, this] {
                invalidate_sync(m);
                _prev_snapshot_pos = {};
                _prev_snapshot = {};
            });
            partition_presence_checker is_present = _prev_snapshot->make_partition_presence_checker();
            while (!m.partitions.empty()) {
                with_allocator(_tracker.allocator(), [&] () {
                    auto cmp = cache_entry::compare(_schema);
                    {
                        size_t partition_count = 0;
                        {
                            STAP_PROBE(scylla, row_cache_update_one_batch_start);
                            // FIXME: we should really be checking should_yield() here instead of
                            // need_preempt(). However, should_yield() is currently quite
                            // expensive and we need to amortize it somehow.
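                            // Each iteration of the loop below moves one memtable partition
                            // into the cache: the updater coroutine is driven to completion
                            // first, and only then is the consumed memtable entry destroyed
                            // and its pinned memory released.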
                            do {
                              STAP_PROBE(scylla, row_cache_update_partition_start);
                              with_linearized_managed_bytes([&] {
                                if (!update) {
                                    _update_section(_tracker.region(), [&] {
                                        memtable_entry& mem_e = *m.partitions.begin();
                                        size_entry = mem_e.size_in_allocator_without_rows(_tracker.allocator());
                                        auto cache_i = _partitions.lower_bound(mem_e.key(), cmp);
                                        update = updater(_update_section, cache_i, mem_e, is_present, real_dirty_acc);
                                    });
                                }
                                // We use cooperative deferring instead of futures so that
                                // this layer has a chance to restore invariants before deferring,
                                // in particular to set _prev_snapshot_pos to the correct value.
                                if (update.run() == stop_iteration::no) {
                                    return;
                                }
                                update = {};
                                real_dirty_acc.unpin_memory(size_entry);
                                _update_section(_tracker.region(), [&] {
                                    auto i = m.partitions.begin();
                                    memtable_entry& mem_e = *i;
                                    m.partitions.erase(i);
                                    mem_e.partition().evict(_tracker.memtable_cleaner());
                                    current_allocator().destroy(&mem_e);
                                });
                                ++partition_count;
                              });
                              STAP_PROBE(scylla, row_cache_update_partition_end);
                            } while (!m.partitions.empty() && !need_preempt());
                            with_allocator(standard_allocator(), [&] {
                                if (m.partitions.empty()) {
                                    _prev_snapshot_pos = {};
                                } else {
                                    _update_section(_tracker.region(), [&] {
                                        _prev_snapshot_pos = dht::ring_position(m.partitions.begin()->key());
                                    });
                                }
                            });
                            STAP_PROBE1(scylla, row_cache_update_one_batch_end, partition_count);
                        }
                    }
                });
                real_dirty_acc.commit();
                seastar::thread::yield();
            }
        }).finally([cleanup = std::move(cleanup)] {});
    });
}

future<> row_cache::update(external_updater eu, memtable& m) {
    return do_update(std::move(eu), m, [this] (logalloc::allocating_section& alloc,
            row_cache::partitions_type::iterator cache_i, memtable_entry& mem_e,
            partition_presence_checker& is_present, real_dirty_memory_accounter& acc) mutable {
        // If the cache doesn't contain the entry we cannot insert it, because the mutation may be incomplete.
        // FIXME: keep a bitmap indicating which sstables we do cover, so we don't have to
        // search it.
        if (cache_i != partitions_end() && cache_i->key().equal(*_schema, mem_e.key())) {
            cache_entry& entry = *cache_i;
            upgrade_entry(entry);
            _tracker.on_partition_merge();
            return entry.partition().apply_to_incomplete(*_schema, std::move(mem_e.partition()), *mem_e.schema(),
                _tracker.memtable_cleaner(), alloc, _tracker.region(), _tracker, _underlying_phase, acc);
        } else if (cache_i->continuous()
                   || is_present(mem_e.key()) == partition_presence_checker_result::definitely_doesnt_exist) {
            // Partition is absent in underlying. First, insert a neutral partition entry.
            cache_entry* entry = current_allocator().construct<cache_entry>(cache_entry::evictable_tag(),
                _schema, dht::decorated_key(mem_e.key()),
                partition_entry::make_evictable(*_schema, mutation_partition(_schema)));
            entry->set_continuous(cache_i->continuous());
            _tracker.insert(*entry);
            _partitions.insert_before(cache_i, *entry);
            return entry->partition().apply_to_incomplete(*_schema, std::move(mem_e.partition()), *mem_e.schema(),
                _tracker.memtable_cleaner(), alloc, _tracker.region(), _tracker, _underlying_phase, acc);
        } else {
            return make_empty_coroutine();
        }
    });
}

future<> row_cache::update_invalidating(external_updater eu, memtable& m) {
    return do_update(std::move(eu), m, [this] (logalloc::allocating_section& alloc,
            row_cache::partitions_type::iterator cache_i, memtable_entry& mem_e,
            partition_presence_checker& is_present, real_dirty_memory_accounter& acc) {
        if (cache_i != partitions_end() && cache_i->key().equal(*_schema, mem_e.key())) {
            // FIXME: Invalidate only affected row ranges.
            // This invalidates all information about the partition.
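            // evict() drops the partition's rows; on_evicted() then clears the
            // continuity flag of the successor entry and unlinks and frees the
            // entry itself, so subsequent reads of this key go to the
            // underlying source.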
            cache_entry& e = *cache_i;
            e.evict(_tracker);
            e.on_evicted(_tracker);
        } else {
            _tracker.clear_continuity(*cache_i);
        }
        // FIXME: subtract gradually from acc.
        return make_empty_coroutine();
    });
}

void row_cache::refresh_snapshot() {
    _underlying = _snapshot_source();
}

void row_cache::touch(const dht::decorated_key& dk) {
    _read_section(_tracker.region(), [&] {
        with_linearized_managed_bytes([&] {
            auto i = _partitions.find(dk, cache_entry::compare(_schema));
            if (i != _partitions.end()) {
                for (partition_version& pv : i->partition().versions_from_oldest()) {
                    for (rows_entry& row : pv.partition().clustered_rows()) {
                        _tracker.touch(row);
                    }
                }
            }
        });
    });
}

void row_cache::unlink_from_lru(const dht::decorated_key& dk) {
    _read_section(_tracker.region(), [&] {
        with_linearized_managed_bytes([&] {
            auto i = _partitions.find(dk, cache_entry::compare(_schema));
            if (i != _partitions.end()) {
                for (partition_version& pv : i->partition().versions_from_oldest()) {
                    for (rows_entry& row : pv.partition().clustered_rows()) {
                        _tracker.unlink(row);
                    }
                }
            }
        });
    });
}

void row_cache::invalidate_locked(const dht::decorated_key& dk) {
    auto pos = _partitions.lower_bound(dk, cache_entry::compare(_schema));
    if (pos == partitions_end() || !pos->key().equal(*_schema, dk)) {
        _tracker.clear_continuity(*pos);
    } else {
        auto it = _partitions.erase_and_dispose(pos,
            [this, &dk, deleter = current_deleter<cache_entry>()] (auto&& p) mutable {
                _tracker.on_partition_erase();
                p->evict(_tracker);
                deleter(p);
            });
        _tracker.clear_continuity(*it);
    }
}

future<> row_cache::invalidate(external_updater eu, const dht::decorated_key& dk) {
    return invalidate(std::move(eu), dht::partition_range::make_singular(dk));
}

future<> row_cache::invalidate(external_updater eu, const dht::partition_range& range) {
    return invalidate(std::move(eu), dht::partition_range_vector({range}));
}

future<> row_cache::invalidate(external_updater eu, dht::partition_range_vector&& ranges) {
    return do_update(std::move(eu), [this, ranges = std::move(ranges)] {
        auto on_failure = defer([this] { this->clear_now(); });
        with_linearized_managed_bytes([&] {
            for (auto&& range : ranges) {
                this->invalidate_unwrapped(range);
            }
        });
        on_failure.cancel();
        return make_ready_future<>();
    });
}

void row_cache::evict(const dht::partition_range& range) {
    invalidate_unwrapped(range);
}

void row_cache::invalidate_unwrapped(const dht::partition_range& range) {
    logalloc::reclaim_lock _(_tracker.region());

    auto cmp = cache_entry::compare(_schema);
    auto begin = _partitions.lower_bound(dht::ring_position_view::for_range_start(range), cmp);
    auto end = _partitions.lower_bound(dht::ring_position_view::for_range_end(range), cmp);
    with_allocator(_tracker.allocator(), [this, begin, end] {
        auto it = _partitions.erase_and_dispose(begin, end,
            [this, deleter = current_deleter<cache_entry>()] (auto&& p) mutable {
                _tracker.on_partition_erase();
                p->evict(_tracker);
                deleter(p);
            });
        assert(it != _partitions.end());
        _tracker.clear_continuity(*it);
    });
}

row_cache::row_cache(schema_ptr s, snapshot_source src, cache_tracker& tracker, is_continuous cont)
    : _tracker(tracker)
    , _schema(std::move(s))
    , _partitions(cache_entry::compare(_schema))
    , _underlying(src())
    , _snapshot_source(std::move(src))
{
    with_allocator(_tracker.allocator(), [this, cont] {
        cache_entry* entry = current_allocator().construct<cache_entry>(cache_entry::dummy_entry_tag());
        _partitions.insert_before(_partitions.end(), *entry);
        entry->set_continuous(bool(cont));
    });
}
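// The move constructor transfers the intrusive-set linkage: the source node is
// replaced with this entry's node inside the container, and the source link is
// re-initialized to an unlinked state.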
cache_entry::cache_entry(cache_entry&& o) noexcept
    : _schema(std::move(o._schema))
    , _key(std::move(o._key))
    , _pe(std::move(o._pe))
    , _flags(o._flags)
    , _cache_link()
{
    {
        using container_type = row_cache::partitions_type;
        container_type::node_algorithms::replace_node(o._cache_link.this_ptr(), _cache_link.this_ptr());
        container_type::node_algorithms::init(o._cache_link.this_ptr());
    }
}

cache_entry::~cache_entry() {
}

void cache_entry::evict(cache_tracker& tracker) noexcept {
    _pe.evict(tracker.cleaner());
}

void row_cache::set_schema(schema_ptr new_schema) noexcept {
    _schema = std::move(new_schema);
}

void cache_entry::on_evicted(cache_tracker& tracker) noexcept {
    auto it = row_cache::partitions_type::s_iterator_to(*this);
    std::next(it)->set_continuous(false);
    evict(tracker);
    current_deleter<cache_entry>()(this);
    tracker.on_partition_eviction();
}

void rows_entry::on_evicted(cache_tracker& tracker) noexcept {
    auto it = mutation_partition::rows_type::iterator_to(*this);
    if (is_last_dummy()) {
        // Every evictable partition entry must have a dummy entry at the end,
        // so don't remove it, just unlink it from the LRU.
        // That dummy is linked in the LRU, because there may be partitions
        // with no regular rows, and we need to track them.
        _lru_link.unlink();
    } else {
        ++it;
        it->set_continuous(false);
        current_deleter<rows_entry>()(this);
        tracker.on_row_eviction();
    }
    if (mutation_partition::rows_type::is_only_member(*it)) {
        assert(it->is_last_dummy());
        partition_version& pv = partition_version::container_of(mutation_partition::container_of(
            mutation_partition::rows_type::container_of_only_member(*it)));
        if (pv.is_referenced_from_entry()) {
            cache_entry& ce = cache_entry::container_of(partition_entry::container_of(pv));
            ce.on_evicted(tracker);
        }
    }
}

flat_mutation_reader cache_entry::read(row_cache& rc, read_context& reader) {
    auto source_and_phase = rc.snapshot_of(_key);
    reader.enter_partition(_key, source_and_phase.snapshot, source_and_phase.phase);
    return do_read(rc, reader);
}

flat_mutation_reader cache_entry::read(row_cache& rc, read_context& reader, row_cache::phase_type phase) {
    reader.enter_partition(_key, phase);
    return do_read(rc, reader);
}

// Assumes the reader is in the corresponding partition.
flat_mutation_reader cache_entry::do_read(row_cache& rc, read_context& reader) {
    auto snp = _pe.read(rc._tracker.region(), rc._tracker.cleaner(), _schema, &rc._tracker, reader.phase());
    auto ckr = query::clustering_key_filter_ranges::get_ranges(*_schema, reader.slice(), _key.key());
    auto r = make_cache_flat_mutation_reader(_schema, _key, std::move(ckr), rc, reader.shared_from_this(), std::move(snp));
    if (reader.schema()->version() != _schema->version()) {
        r = transform(std::move(r), schema_upgrader(reader.schema()));
    }
    if (reader.fwd() == streamed_mutation::forwarding::yes) {
        r = make_forwardable(std::move(r));
    }
    return std::move(r);
}

const schema_ptr& row_cache::schema() const {
    return _schema;
}

void row_cache::upgrade_entry(cache_entry& e) {
    if (e._schema != _schema) {
        auto& r = _tracker.region();
        assert(!r.reclaiming_enabled());
        with_allocator(r.allocator(), [this, &e] {
            with_linearized_managed_bytes([&] {
                e.partition().upgrade(e._schema, _schema, _tracker.cleaner(), &_tracker);
                e._schema = _schema;
            });
        });
    }
}

std::ostream& operator<<(std::ostream& out, row_cache& rc) {
    rc._read_section(rc._tracker.region(), [&] {
        out << "{row_cache: " << ::join(", ", rc._partitions.begin(), rc._partitions.end()) << "}";
    });
    return out;
}
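// Update protocol: while holding a unit of _update_sem, first apply the
// external update, then, in a noexcept block, atomically retire the current
// snapshot to _prev_snapshot, obtain a fresh one from _snapshot_source and
// bump the phase; finally run the internal updater and clear the
// previous-snapshot state once it resolves.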
future<> row_cache::do_update(row_cache::external_updater eu, row_cache::internal_updater iu) noexcept {
    return futurize_apply([this] {
        return get_units(_update_sem, 1);
    }).then([this, eu = std::move(eu), iu = std::move(iu)] (auto permit) mutable {
        auto pos = dht::ring_position::min();
        eu();
        [&] () noexcept {
            _prev_snapshot_pos = std::move(pos);
            _prev_snapshot = std::exchange(_underlying, _snapshot_source());
            ++_underlying_phase;
        }();
        return futurize_apply([&iu] {
            return iu();
        }).then_wrapped([this, permit = std::move(permit)] (auto f) {
            _prev_snapshot_pos = {};
            _prev_snapshot = {};
            if (f.failed()) {
                clogger.warn("Failure during cache update: {}", f.get_exception());
            }
        });
    });
}

std::ostream& operator<<(std::ostream& out, cache_entry& e) {
    return out << "{cache_entry: " << e.position()
               << ", cont=" << e.continuous()
               << ", dummy=" << e.is_dummy_entry()
               << ", " << partition_entry::printer(*e.schema(), e.partition())
               << "}";
}