/*
 * Copyright (C) 2018 ScyllaDB
 */

/*
 * This file is part of Scylla.
 *
 * Scylla is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Scylla is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Scylla. If not, see <http://www.gnu.org/licenses/>.
 */

#pragma once

#include "mutation_compactor.hh"
#include "mutation_reader.hh"

#include <optional>
#include <variant>

/// One-stop object for serving queries.
///
/// Encapsulates all state and logic for serving all pages for a given range
/// of a query on a given shard. Can serve mutation or data queries. Can be
/// used with any CompactedMutationsConsumer certified result-builder.
/// Intended to be created on the first page of a query then saved and reused
/// on subsequent pages.
/// (1) Create with the parameters of your query.
/// (2) Call consume_page() with your consumer to consume the contents of the
///     next page.
/// (3) At the end of the page save the querier if you expect more pages.
///     The are_limits_reached() method can be used to determine whether the
///     page was filled or not. Also check your result builder for short
///     reads. Most result builders have memory-accounters that will stop the
///     read once some memory limit was reached. This is called a short read
///     as the read stops before the row and/or partition limits are reached.
/// (4) At the beginning of the next page use can_be_used_for_page() to
///     determine whether it can be used with the page's schema and start
///     position.
If a schema or position mismatch is detected the querier /// cannot be used to produce the next page and a new one has to be created /// instead. class querier { public: enum class can_use { yes, no_emit_only_live_rows_mismatch, no_schema_version_mismatch, no_ring_pos_mismatch, no_clustering_pos_mismatch }; private: template class clustering_position_tracker { std::unique_ptr _consumer; lw_shared_ptr> _last_ckey; public: clustering_position_tracker(std::unique_ptr&& consumer, lw_shared_ptr> last_ckey) : _consumer(std::move(consumer)) , _last_ckey(std::move(last_ckey)) { } void consume_new_partition(const dht::decorated_key& dk) { _last_ckey->reset(); _consumer->consume_new_partition(dk); } void consume(tombstone t) { _consumer->consume(t); } stop_iteration consume(static_row&& sr, tombstone t, bool is_live) { return _consumer->consume(std::move(sr), std::move(t), is_live); } stop_iteration consume(clustering_row&& cr, row_tombstone t, bool is_live) { *_last_ckey = cr.key(); return _consumer->consume(std::move(cr), std::move(t), is_live); } stop_iteration consume(range_tombstone&& rt) { return _consumer->consume(std::move(rt)); } stop_iteration consume_end_of_partition() { return _consumer->consume_end_of_partition(); } auto consume_end_of_stream() { return _consumer->consume_end_of_stream(); } }; struct position { const dht::decorated_key* partition_key; const clustering_key_prefix* clustering_key; }; schema_ptr _schema; std::unique_ptr _range; std::unique_ptr _slice; flat_mutation_reader _reader; std::variant, lw_shared_ptr> _compaction_state; lw_shared_ptr> _last_ckey; std::variant, lw_shared_ptr> make_compaction_state( const schema& s, uint32_t row_limit, uint32_t partition_limit, gc_clock::time_point query_time, emit_only_live_rows only_live) const { if (only_live == emit_only_live_rows::yes) { return make_lw_shared>(s, query_time, *_slice, row_limit, partition_limit); } else { return make_lw_shared>(s, query_time, *_slice, row_limit, partition_limit); } } 
position current_position() const; bool ring_position_matches(const dht::partition_range& range, const position& pos) const; bool clustering_position_matches(const query::partition_slice& slice, const position& pos) const; public: querier(const mutation_source& ms, schema_ptr schema, const dht::partition_range& range, const query::partition_slice& slice, const io_priority_class& pc, tracing::trace_state_ptr trace_ptr, emit_only_live_rows only_live) : _schema(schema) , _range(std::make_unique(range)) , _slice(std::make_unique(slice)) , _reader(ms.make_reader(schema, *_range, *_slice, pc, std::move(trace_ptr), streamed_mutation::forwarding::no, mutation_reader::forwarding::no)) , _compaction_state(make_compaction_state(*schema, 0, 0, gc_clock::time_point{}, only_live)) , _last_ckey(make_lw_shared>()) { } bool is_reversed() const { return _slice->options.contains(query::partition_slice::option::reversed); } bool are_limits_reached() const { return std::visit([] (const auto& cs) { return cs->are_limits_reached(); }, _compaction_state); } /// Does the querier's range matches `range`? /// /// A query can have more then one querier executing parallelly for /// different sub-ranges on the same shard. This method helps identifying /// the appropriate one for the `range'. /// Since ranges can be narrowed from page-to-page (as the query moves /// through it) we cannot just check the two ranges for equality. /// Instead we exploit the fact the a query-range will always be split into /// non-overlapping sub ranges and thus each bound of a range is unique. /// Thus if any of the range's bounds are equal we have a match. /// For singular ranges we just check the one bound. bool matches(const dht::partition_range& range) const; /// Can the querier be used for the next page? /// /// The querier can only be used for the next page if the only_live, the /// schema versions, the ring and the clustering positions match. 
can_use can_be_used_for_page(emit_only_live_rows only_live, const schema& s, const dht::partition_range& range, const query::partition_slice& slice) const; template GCC6_CONCEPT( requires CompactedFragmentsConsumer ) auto consume_page(Consumer&& consumer, uint32_t row_limit, uint32_t partition_limit, gc_clock::time_point query_time, db::timeout_clock::time_point timeout) { return std::visit([=, consumer = std::move(consumer)] (auto& compaction_state) mutable { // FIXME: #3158 // consumer cannot be moved after consume_new_partition() is called // on it because it stores references to some of it's own members. // Move it to the heap before any consumption begins to avoid // accidents. return _reader.peek().then([=, consumer = std::make_unique(std::move(consumer))] (mutation_fragment* next_fragment) mutable { const auto next_fragment_kind = next_fragment ? next_fragment->mutation_fragment_kind() : mutation_fragment::kind::partition_end; compaction_state->start_new_page(row_limit, partition_limit, query_time, next_fragment_kind, *consumer); const auto is_reversed = flat_mutation_reader::consume_reversed_partitions( _slice->options.contains(query::partition_slice::option::reversed)); using compaction_state_type = typename std::remove_reference::type; constexpr auto only_live = compaction_state_type::parameters::only_live; auto reader_consumer = make_stable_flattened_mutations_consumer>>( compaction_state, clustering_position_tracker(std::move(consumer), _last_ckey)); return _reader.consume(std::move(reader_consumer), is_reversed, timeout); }); }, _compaction_state); } size_t memory_usage() const { return _reader.buffer_size(); } }; /// Special-purpose cache for saving queriers between pages. /// /// Queriers are saved at the end of the page and looked up at the beginning of /// the next page. The lookup() always removes the querier from the cache, it /// has to be inserted again at the end of the page. 
/// Lookup provides the following extra logic, special to queriers: /// * It accepts a factory function which is used to create a new querier if /// the lookup fails (see below). This allows for simple call sites. /// * It does range matching. A query sometimes will result in multiple querier /// objects executing on the same node and shard paralelly. To identify the /// appropriate querier lookup() will consider - in addition to the lookup /// key - the read range. /// * It does schema version and position checking. In some case a subsequent /// page will have a different schema version or will start from a position /// that is before the end position of the previous page. lookup() will /// recognize these cases and drop the previous querier and create a new one. /// /// Inserted queriers will have a TTL. When this expires the querier is /// evicted. This is to avoid excess and unnecessary resource usage due to /// abandoned queriers. /// Provides a way to evict readers one-by-one via `evict_one()`. This can be /// used by the concurrency-limiting code to evict cached readers to free up /// resources for admitting new ones. /// Keeps the total memory consumption of cached queriers /// below max_queriers_memory_usage by evicting older entries upon inserting /// new ones if the the memory consupmtion would go above the limit. class querier_cache { public: static const std::chrono::seconds default_entry_ttl; static const size_t max_queriers_memory_usage; struct stats { // The number of cache lookups. uint64_t lookups = 0; // The subset of lookups that missed. uint64_t misses = 0; // The subset of lookups that hit but the looked up querier had to be // dropped due to position mismatch. uint64_t drops = 0; // The number of queriers evicted due to their TTL expiring. uint64_t time_based_evictions = 0; // The number of queriers evicted to free up resources to be able to // create new readers. 
uint64_t resource_based_evictions = 0; // The number of queriers evicted to because the maximum memory usage // was reached. uint64_t memory_based_evictions = 0; // The number of queriers currently in the cache. uint64_t population = 0; }; private: class entry : public weakly_referencable { querier _querier; lowres_clock::time_point _expires; public: // Since entry cannot be moved and unordered_map::emplace can pass only // a single param to it's mapped-type we need to force a single-param // constructor for entry. Oh C++... struct param { querier q; std::chrono::seconds ttl; }; explicit entry(param p) : _querier(std::move(p.q)) , _expires(lowres_clock::now() + p.ttl) { } bool is_expired(const lowres_clock::time_point& now) const { return _expires <= now; } const querier& get() const & { return _querier; } querier&& get() && { return std::move(_querier); } size_t memory_usage() const { return _querier.memory_usage(); } }; using entries = std::unordered_map; class meta_entry { entries& _entries; weak_ptr _entry_ptr; entries::iterator _entry_it; public: meta_entry(entries& e, entries::iterator it) : _entries(e) , _entry_ptr(it->second.weak_from_this()) , _entry_it(it) { } ~meta_entry() { if (_entry_ptr) { _entries.erase(_entry_it); } } bool is_expired(const lowres_clock::time_point& now) const { return !_entry_ptr || _entry_ptr->is_expired(now); } explicit operator bool() const { return bool(_entry_ptr); } const entry& get_entry() const { return *_entry_ptr; } }; entries _entries; std::list _meta_entries; timer _expiry_timer; std::chrono::seconds _entry_ttl; stats _stats; entries::iterator find_querier(utils::UUID key, const dht::partition_range& range, tracing::trace_state_ptr trace_state); void scan_cache_entries(); public: querier_cache(std::chrono::seconds entry_ttl = default_entry_ttl); querier_cache(const querier_cache&) = delete; querier_cache& operator=(const querier_cache&) = delete; // this is captured querier_cache(querier_cache&&) = delete; querier_cache& 
operator=(querier_cache&&) = delete; void insert(utils::UUID key, querier&& q, tracing::trace_state_ptr trace_state); /// Lookup a querier in the cache. /// /// If the querier doesn't exist, use `create_fun' to create it. /// /// Queriers are found based on `key` and `range`. There may be multiple /// queriers for the same `key` differentiated by their read range. Since /// each subsequent page may have a narrower read range then the one before /// it ranges cannot be simply matched based on equality. For matching we /// use the fact that the coordinator splits the query range into /// non-overlapping ranges. Thus both bounds of any range, or in case of /// singular ranges only the start bound are guaranteed to be unique. /// /// The found querier is checked for a matching read-kind and schema /// version. The start position is also checked against the current /// position of the querier using the `range' and `slice'. If there is a /// mismatch drop the querier and create a new one with `create_fun'. querier lookup(utils::UUID key, emit_only_live_rows only_live, const schema& s, const dht::partition_range& range, const query::partition_slice& slice, tracing::trace_state_ptr trace_state, const noncopyable_function& create_fun); void set_entry_ttl(std::chrono::seconds entry_ttl); /// Evict a querier. /// /// Return true if a querier was evicted and false otherwise (if the cache /// is empty). bool evict_one(); const stats& get_stats() const { return _stats; } }; class querier_cache_context { querier_cache* _cache{}; utils::UUID _key; bool _is_first_page; public: querier_cache_context() = default; querier_cache_context(querier_cache& cache, utils::UUID key, bool is_first_page); void insert(querier&& q, tracing::trace_state_ptr trace_state); querier lookup(emit_only_live_rows only_live, const schema& s, const dht::partition_range& range, const query::partition_slice& slice, tracing::trace_state_ptr trace_state, const noncopyable_function& create_fun); };