Add the following counters: (1) querier_cache_lookups (2) querier_cache_misses (3) querier_cache_drops (4) querier_cache_time_based_evictions (5) querier_cache_resource_based_evictions (6) querier_cache_memory_based_evictions (7) querier_cache_population. (1) counts the total number of querier cache lookups. Not all page-fetches will result in a querier lookup. For example, the first page of a query will not do a lookup as there was no previous page to reuse the querier from. The second and all subsequent pages, however, should attempt to reuse the querier from the previous page. (2) counts the subset of (1) where the read missed the querier cache (failed to find a matching saved querier). (3) counts the subset of (1) where the querier was recalled and dropped immediately. This can happen, for example, if the querier was at the wrong position. (4) counts the cached queriers that were evicted due to their TTL expiring. (5) counts the cached queriers that were evicted due to a shortage of reader resources (those limited by reader-concurrency limits). (6) counts the cached queriers that were evicted due to reaching the cache's memory limit (currently set to 4% of the shard's memory). (7) is the current number of entries in the cache. Note: * The count of cache hits can be derived from these counters as (1) - (2). * cache_drop (3) also implies a cache hit (see above). This means that the number of actually reused queriers is: (1) - (2) - (3)
406 lines
16 KiB
C++
406 lines
16 KiB
C++
/*
|
|
* Copyright (C) 2018 ScyllaDB
|
|
*/
|
|
|
|
/*
|
|
* This file is part of Scylla.
|
|
*
|
|
* Scylla is free software: you can redistribute it and/or modify
|
|
* it under the terms of the GNU Affero General Public License as published by
|
|
* the Free Software Foundation, either version 3 of the License, or
|
|
* (at your option) any later version.
|
|
*
|
|
* Scylla is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
* GNU General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU General Public License
|
|
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
|
|
*/
|
|
|
|
#pragma once
|
|
|
|
#include "mutation_compactor.hh"
|
|
#include "mutation_reader.hh"
|
|
|
|
#include <seastar/core/weak_ptr.hh>
|
|
#include <variant>
|
|
|
|
/// One-stop object for serving queries.
|
|
///
|
|
/// Encapsulates all state and logic for serving all pages for a given range
|
|
/// of a query on a given shard. Can serve mutation or data queries. Can be
|
|
/// used with any CompactedMutationsConsumer certified result-builder.
|
|
/// Intended to be created on the first page of a query then saved and reused on
|
|
/// subsequent pages.
|
|
/// (1) Create with the parameters of your query.
|
|
/// (2) Call consume_page() with your consumer to consume the contents of the
|
|
/// next page.
|
|
/// (3) At the end of the page save the querier if you expect more pages.
|
|
/// The are_limits_reached() method can be used to determine whether the
|
|
/// page was filled or not. Also check your result builder for short reads.
|
|
/// Most result builders have memory-accounters that will stop the read
|
|
/// once some memory limit was reached. This is called a short read as the
|
|
/// read stops before the row and/or partition limits are reached.
|
|
/// (4) At the beginning of the next page use can_be_used_for_page() to
|
|
/// determine whether it can be used with the page's schema and start
|
|
/// position. If a schema or position mismatch is detected the querier
|
|
/// cannot be used to produce the next page and a new one has to be created
|
|
/// instead.
|
|
class querier {
|
|
public:
|
|
enum class can_use {
|
|
yes,
|
|
no_emit_only_live_rows_mismatch,
|
|
no_schema_version_mismatch,
|
|
no_ring_pos_mismatch,
|
|
no_clustering_pos_mismatch
|
|
};
|
|
private:
|
|
template <typename Consumer>
|
|
class clustering_position_tracker {
|
|
std::unique_ptr<Consumer> _consumer;
|
|
lw_shared_ptr<std::optional<clustering_key_prefix>> _last_ckey;
|
|
|
|
public:
|
|
clustering_position_tracker(std::unique_ptr<Consumer>&& consumer, lw_shared_ptr<std::optional<clustering_key_prefix>> last_ckey)
|
|
: _consumer(std::move(consumer))
|
|
, _last_ckey(std::move(last_ckey)) {
|
|
}
|
|
|
|
void consume_new_partition(const dht::decorated_key& dk) {
|
|
_last_ckey->reset();
|
|
_consumer->consume_new_partition(dk);
|
|
}
|
|
void consume(tombstone t) {
|
|
_consumer->consume(t);
|
|
}
|
|
stop_iteration consume(static_row&& sr, tombstone t, bool is_live) {
|
|
return _consumer->consume(std::move(sr), std::move(t), is_live);
|
|
}
|
|
stop_iteration consume(clustering_row&& cr, row_tombstone t, bool is_live) {
|
|
*_last_ckey = cr.key();
|
|
return _consumer->consume(std::move(cr), std::move(t), is_live);
|
|
}
|
|
stop_iteration consume(range_tombstone&& rt) {
|
|
return _consumer->consume(std::move(rt));
|
|
}
|
|
stop_iteration consume_end_of_partition() {
|
|
return _consumer->consume_end_of_partition();
|
|
}
|
|
auto consume_end_of_stream() {
|
|
return _consumer->consume_end_of_stream();
|
|
}
|
|
};
|
|
|
|
struct position {
|
|
const dht::decorated_key* partition_key;
|
|
const clustering_key_prefix* clustering_key;
|
|
};
|
|
|
|
schema_ptr _schema;
|
|
std::unique_ptr<dht::partition_range> _range;
|
|
std::unique_ptr<query::partition_slice> _slice;
|
|
flat_mutation_reader _reader;
|
|
std::variant<lw_shared_ptr<compact_for_mutation_query_state>, lw_shared_ptr<compact_for_data_query_state>> _compaction_state;
|
|
lw_shared_ptr<std::optional<clustering_key_prefix>> _last_ckey;
|
|
|
|
std::variant<lw_shared_ptr<compact_for_mutation_query_state>, lw_shared_ptr<compact_for_data_query_state>> make_compaction_state(
|
|
const schema& s,
|
|
uint32_t row_limit,
|
|
uint32_t partition_limit,
|
|
gc_clock::time_point query_time,
|
|
emit_only_live_rows only_live) const {
|
|
if (only_live == emit_only_live_rows::yes) {
|
|
return make_lw_shared<compact_for_query_state<emit_only_live_rows::yes>>(s, query_time, *_slice, row_limit, partition_limit);
|
|
} else {
|
|
return make_lw_shared<compact_for_query_state<emit_only_live_rows::no>>(s, query_time, *_slice, row_limit, partition_limit);
|
|
}
|
|
}
|
|
|
|
position current_position() const;
|
|
|
|
bool ring_position_matches(const dht::partition_range& range, const position& pos) const;
|
|
|
|
bool clustering_position_matches(const query::partition_slice& slice, const position& pos) const;
|
|
|
|
public:
|
|
querier(const mutation_source& ms,
|
|
schema_ptr schema,
|
|
const dht::partition_range& range,
|
|
const query::partition_slice& slice,
|
|
const io_priority_class& pc,
|
|
tracing::trace_state_ptr trace_ptr,
|
|
emit_only_live_rows only_live)
|
|
: _schema(schema)
|
|
, _range(std::make_unique<dht::partition_range>(range))
|
|
, _slice(std::make_unique<query::partition_slice>(slice))
|
|
, _reader(ms.make_reader(schema, *_range, *_slice, pc, std::move(trace_ptr),
|
|
streamed_mutation::forwarding::no, mutation_reader::forwarding::no))
|
|
, _compaction_state(make_compaction_state(*schema, 0, 0, gc_clock::time_point{}, only_live))
|
|
, _last_ckey(make_lw_shared<std::optional<clustering_key_prefix>>()) {
|
|
}
|
|
|
|
bool is_reversed() const {
|
|
return _slice->options.contains(query::partition_slice::option::reversed);
|
|
}
|
|
|
|
bool are_limits_reached() const {
|
|
return std::visit([] (const auto& cs) { return cs->are_limits_reached(); }, _compaction_state);
|
|
}
|
|
|
|
/// Does the querier's range matches `range`?
|
|
///
|
|
/// A query can have more then one querier executing parallelly for
|
|
/// different sub-ranges on the same shard. This method helps identifying
|
|
/// the appropriate one for the `range'.
|
|
/// Since ranges can be narrowed from page-to-page (as the query moves
|
|
/// through it) we cannot just check the two ranges for equality.
|
|
/// Instead we exploit the fact the a query-range will always be split into
|
|
/// non-overlapping sub ranges and thus each bound of a range is unique.
|
|
/// Thus if any of the range's bounds are equal we have a match.
|
|
/// For singular ranges we just check the one bound.
|
|
bool matches(const dht::partition_range& range) const;
|
|
|
|
/// Can the querier be used for the next page?
|
|
///
|
|
/// The querier can only be used for the next page if the only_live, the
|
|
/// schema versions, the ring and the clustering positions match.
|
|
can_use can_be_used_for_page(emit_only_live_rows only_live, const schema& s,
|
|
const dht::partition_range& range, const query::partition_slice& slice) const;
|
|
|
|
template <typename Consumer>
|
|
GCC6_CONCEPT(
|
|
requires CompactedFragmentsConsumer<Consumer>
|
|
)
|
|
auto consume_page(Consumer&& consumer,
|
|
uint32_t row_limit,
|
|
uint32_t partition_limit,
|
|
gc_clock::time_point query_time,
|
|
db::timeout_clock::time_point timeout) {
|
|
return std::visit([=, consumer = std::move(consumer)] (auto& compaction_state) mutable {
|
|
// FIXME: #3158
|
|
// consumer cannot be moved after consume_new_partition() is called
|
|
// on it because it stores references to some of it's own members.
|
|
// Move it to the heap before any consumption begins to avoid
|
|
// accidents.
|
|
return _reader.peek().then([=, consumer = std::make_unique<Consumer>(std::move(consumer))] (mutation_fragment* next_fragment) mutable {
|
|
const auto next_fragment_kind = next_fragment ? next_fragment->mutation_fragment_kind() : mutation_fragment::kind::partition_end;
|
|
compaction_state->start_new_page(row_limit, partition_limit, query_time, next_fragment_kind, *consumer);
|
|
|
|
const auto is_reversed = flat_mutation_reader::consume_reversed_partitions(
|
|
_slice->options.contains(query::partition_slice::option::reversed));
|
|
|
|
using compaction_state_type = typename std::remove_reference<decltype(*compaction_state)>::type;
|
|
constexpr auto only_live = compaction_state_type::parameters::only_live;
|
|
auto reader_consumer = make_stable_flattened_mutations_consumer<compact_for_query<only_live, clustering_position_tracker<Consumer>>>(
|
|
compaction_state,
|
|
clustering_position_tracker(std::move(consumer), _last_ckey));
|
|
|
|
return _reader.consume(std::move(reader_consumer), is_reversed, timeout);
|
|
});
|
|
}, _compaction_state);
|
|
}
|
|
|
|
size_t memory_usage() const {
|
|
return _reader.buffer_size();
|
|
}
|
|
|
|
};
|
|
|
|
/// Special-purpose cache for saving queriers between pages.
|
|
///
|
|
/// Queriers are saved at the end of the page and looked up at the beginning of
|
|
/// the next page. The lookup() always removes the querier from the cache, it
|
|
/// has to be inserted again at the end of the page.
|
|
/// Lookup provides the following extra logic, special to queriers:
|
|
/// * It accepts a factory function which is used to create a new querier if
|
|
/// the lookup fails (see below). This allows for simple call sites.
|
|
/// * It does range matching. A query sometimes will result in multiple querier
|
|
/// objects executing on the same node and shard paralelly. To identify the
|
|
/// appropriate querier lookup() will consider - in addition to the lookup
|
|
/// key - the read range.
|
|
/// * It does schema version and position checking. In some case a subsequent
|
|
/// page will have a different schema version or will start from a position
|
|
/// that is before the end position of the previous page. lookup() will
|
|
/// recognize these cases and drop the previous querier and create a new one.
|
|
///
|
|
/// Inserted queriers will have a TTL. When this expires the querier is
|
|
/// evicted. This is to avoid excess and unnecessary resource usage due to
|
|
/// abandoned queriers.
|
|
/// Provides a way to evict readers one-by-one via `evict_one()`. This can be
|
|
/// used by the concurrency-limiting code to evict cached readers to free up
|
|
/// resources for admitting new ones.
|
|
/// Keeps the total memory consumption of cached queriers
|
|
/// below max_queriers_memory_usage by evicting older entries upon inserting
|
|
/// new ones if the the memory consupmtion would go above the limit.
|
|
class querier_cache {
|
|
public:
|
|
static const std::chrono::seconds default_entry_ttl;
|
|
static const size_t max_queriers_memory_usage;
|
|
|
|
struct stats {
|
|
// The number of cache lookups.
|
|
uint64_t lookups = 0;
|
|
// The subset of lookups that missed.
|
|
uint64_t misses = 0;
|
|
// The subset of lookups that hit but the looked up querier had to be
|
|
// dropped due to position mismatch.
|
|
uint64_t drops = 0;
|
|
// The number of queriers evicted due to their TTL expiring.
|
|
uint64_t time_based_evictions = 0;
|
|
// The number of queriers evicted to free up resources to be able to
|
|
// create new readers.
|
|
uint64_t resource_based_evictions = 0;
|
|
// The number of queriers evicted to because the maximum memory usage
|
|
// was reached.
|
|
uint64_t memory_based_evictions = 0;
|
|
// The number of queriers currently in the cache.
|
|
uint64_t population = 0;
|
|
};
|
|
|
|
private:
|
|
class entry : public weakly_referencable<entry> {
|
|
querier _querier;
|
|
lowres_clock::time_point _expires;
|
|
public:
|
|
// Since entry cannot be moved and unordered_map::emplace can pass only
|
|
// a single param to it's mapped-type we need to force a single-param
|
|
// constructor for entry. Oh C++...
|
|
struct param {
|
|
querier q;
|
|
std::chrono::seconds ttl;
|
|
};
|
|
|
|
explicit entry(param p)
|
|
: _querier(std::move(p.q))
|
|
, _expires(lowres_clock::now() + p.ttl) {
|
|
}
|
|
|
|
bool is_expired(const lowres_clock::time_point& now) const {
|
|
return _expires <= now;
|
|
}
|
|
|
|
const querier& get() const & {
|
|
return _querier;
|
|
}
|
|
|
|
querier&& get() && {
|
|
return std::move(_querier);
|
|
}
|
|
|
|
size_t memory_usage() const {
|
|
return _querier.memory_usage();
|
|
}
|
|
};
|
|
|
|
using entries = std::unordered_map<utils::UUID, entry>;
|
|
|
|
class meta_entry {
|
|
entries& _entries;
|
|
weak_ptr<entry> _entry_ptr;
|
|
entries::iterator _entry_it;
|
|
|
|
public:
|
|
meta_entry(entries& e, entries::iterator it)
|
|
: _entries(e)
|
|
, _entry_ptr(it->second.weak_from_this())
|
|
, _entry_it(it) {
|
|
}
|
|
|
|
~meta_entry() {
|
|
if (_entry_ptr) {
|
|
_entries.erase(_entry_it);
|
|
}
|
|
}
|
|
|
|
bool is_expired(const lowres_clock::time_point& now) const {
|
|
return !_entry_ptr || _entry_ptr->is_expired(now);
|
|
}
|
|
|
|
explicit operator bool() const {
|
|
return bool(_entry_ptr);
|
|
}
|
|
|
|
const entry& get_entry() const {
|
|
return *_entry_ptr;
|
|
}
|
|
};
|
|
|
|
entries _entries;
|
|
std::list<meta_entry> _meta_entries;
|
|
timer<lowres_clock> _expiry_timer;
|
|
std::chrono::seconds _entry_ttl;
|
|
stats _stats;
|
|
|
|
entries::iterator find_querier(utils::UUID key, const dht::partition_range& range, tracing::trace_state_ptr trace_state);
|
|
|
|
void scan_cache_entries();
|
|
|
|
public:
|
|
querier_cache(std::chrono::seconds entry_ttl = default_entry_ttl);
|
|
|
|
querier_cache(const querier_cache&) = delete;
|
|
querier_cache& operator=(const querier_cache&) = delete;
|
|
|
|
// this is captured
|
|
querier_cache(querier_cache&&) = delete;
|
|
querier_cache& operator=(querier_cache&&) = delete;
|
|
|
|
void insert(utils::UUID key, querier&& q, tracing::trace_state_ptr trace_state);
|
|
|
|
/// Lookup a querier in the cache.
|
|
///
|
|
/// If the querier doesn't exist, use `create_fun' to create it.
|
|
///
|
|
/// Queriers are found based on `key` and `range`. There may be multiple
|
|
/// queriers for the same `key` differentiated by their read range. Since
|
|
/// each subsequent page may have a narrower read range then the one before
|
|
/// it ranges cannot be simply matched based on equality. For matching we
|
|
/// use the fact that the coordinator splits the query range into
|
|
/// non-overlapping ranges. Thus both bounds of any range, or in case of
|
|
/// singular ranges only the start bound are guaranteed to be unique.
|
|
///
|
|
/// The found querier is checked for a matching read-kind and schema
|
|
/// version. The start position is also checked against the current
|
|
/// position of the querier using the `range' and `slice'. If there is a
|
|
/// mismatch drop the querier and create a new one with `create_fun'.
|
|
querier lookup(utils::UUID key,
|
|
emit_only_live_rows only_live,
|
|
const schema& s,
|
|
const dht::partition_range& range,
|
|
const query::partition_slice& slice,
|
|
tracing::trace_state_ptr trace_state,
|
|
const noncopyable_function<querier()>& create_fun);
|
|
|
|
void set_entry_ttl(std::chrono::seconds entry_ttl);
|
|
|
|
/// Evict a querier.
|
|
///
|
|
/// Return true if a querier was evicted and false otherwise (if the cache
|
|
/// is empty).
|
|
bool evict_one();
|
|
|
|
const stats& get_stats() const {
|
|
return _stats;
|
|
}
|
|
};
|
|
|
|
class querier_cache_context {
|
|
querier_cache* _cache{};
|
|
utils::UUID _key;
|
|
bool _is_first_page;
|
|
|
|
public:
|
|
querier_cache_context() = default;
|
|
querier_cache_context(querier_cache& cache, utils::UUID key, bool is_first_page);
|
|
void insert(querier&& q, tracing::trace_state_ptr trace_state);
|
|
querier lookup(emit_only_live_rows only_live,
|
|
const schema& s,
|
|
const dht::partition_range& range,
|
|
const query::partition_slice& slice,
|
|
tracing::trace_state_ptr trace_state,
|
|
const noncopyable_function<querier()>& create_fun);
|
|
};
|