Files
scylladb/query-result.hh
Botond Dénes 6a7dedfe34 service/storage_proxy: set smallest continue pos as query's continue pos
We expect each replica to stop at exactly the same position when the
digests match. Soon, however, if replicas have a lot of tombstones, some
may stop earlier than the others. As long as all digests match, this is
fine, but we need to make sure we continue from the smallest such
position on the next page.
2022-08-10 06:03:38 +03:00

460 lines
17 KiB
C++

/*
* Copyright (C) 2015-present ScyllaDB
*/
/*
* SPDX-License-Identifier: AGPL-3.0-or-later
*/
#pragma once
#include "bytes_ostream.hh"
#include "digest_algorithm.hh"
#include "query-request.hh"
#include "full_position.hh"
#include <optional>
#include <seastar/util/bool_class.hh>
#include "seastarx.hh"
namespace query {
// Tag type that makes short_read a distinct bool_class instantiation.
struct short_read_tag { };
// Type-safe boolean used in this header both as a result's short-read flag
// and as the "short read allowed" switch passed to the memory accounters.
using short_read = bool_class<short_read_tag>;
// result_memory_limiter, result_memory_accounter and result_memory_tracker
// form an infrastructure for limiting size of query results.
//
// result_memory_limiter is a shard-local object which ensures that all results
// combined do not use more than 10% of the shard memory.
//
// result_memory_accounter is used by result producers. It updates the
// shard-local limits and keeps track of the individual maximum result size
// limit, which is 1 MB.
//
// result_memory_tracker is just an object that makes sure the
// result_memory_limiter is notified when memory is released (but not sooner).
class result_memory_accounter;
// Shard-wide budget for the memory held by query results that are being
// built. The budget lives in a semaphore: units are taken from it as results
// grow (consume) and handed back through release() or through the semaphore
// reference exposed by sem().
class result_memory_limiter {
    // NOTE: declaration order matters, the semaphore is initialised from the
    // budget member below.
    const size_t _maximum_total_result_memory;
    semaphore _memory_limiter;
public:
    static constexpr size_t minimum_result_size = 4 * 1024;
    static constexpr size_t maximum_result_size = 1 * 1024 * 1024;
    static constexpr size_t unlimited_result_size = std::numeric_limits<size_t>::max();

    // Creates a limiter whose semaphore starts with
    // maximum_total_result_memory units.
    explicit result_memory_limiter(size_t maximum_total_result_memory)
        : _maximum_total_result_memory(maximum_total_result_memory)
        , _memory_limiter(_maximum_total_result_memory)
    { }

    // Neither copyable nor movable.
    result_memory_limiter(const result_memory_limiter&) = delete;
    result_memory_limiter(result_memory_limiter&&) = delete;

    // The configured budget minus the units currently available in the
    // semaphore.
    ssize_t total_used_memory() const {
        const auto available = _memory_limiter.available_units();
        return _maximum_total_result_memory - available;
    }

    // Mutation query: waits for minimum_result_size units and then yields a
    // new accounter. The accounter honours the given maximum result size and
    // may additionally stop early when the shard budget is exhausted.
    future<result_memory_accounter> new_mutation_read(query::max_result_size max_result_size, short_read short_read_allowed);

    // Data query: waits for minimum_result_size units and then yields a new
    // accounter. The accounter honours the given maximum result size but is
    // *not* stopped by shard memory pressure, so that digest mismatches are
    // avoided.
    future<result_memory_accounter> new_data_read(query::max_result_size max_result_size, short_read short_read_allowed);

    // Digest query: yields an accounter that does not contribute to the shard
    // memory usage, yet still stops producing the result once the individual
    // limit has been reached.
    future<result_memory_accounter> new_digest_read(query::max_result_size max_result_size, short_read short_read_allowed);

    // Tells whether results may keep growing, looking only at the per-shard
    // budget: stop is requested once the semaphore has no units left.
    stop_iteration check() const {
        const bool exhausted = _memory_limiter.current() <= 0;
        return stop_iteration(exhausted);
    }

    // Takes n units from the budget and then reports the same verdict as
    // check() (per-shard limit only).
    stop_iteration update_and_check(size_t n) {
        _memory_limiter.consume(n);
        return check();
    }

    // Hands n units back to the budget.
    void release(size_t n) noexcept {
        _memory_limiter.signal(n);
    }

    // Direct access to the underlying semaphore.
    semaphore& sem() noexcept { return _memory_limiter; }
};
// Owns the semaphore units that were charged for a finished result and
// remembers how much memory that result used. The units are held in a
// semaphore_units<> member, so their lifetime is tied to this object.
class result_memory_tracker {
    semaphore_units<> _units;
    size_t _used_memory;

    // Placeholder semaphore referenced by default-constructed trackers, which
    // hold zero units.
    static thread_local semaphore _dummy;
public:
    // Tracker that holds no units and reports zero used memory.
    result_memory_tracker() noexcept
        : _units(_dummy, 0)
        , _used_memory(0)
    { }

    // Tracker holding `blocked` units of `sem`; `used` is what used_memory()
    // will report.
    result_memory_tracker(semaphore& sem, size_t blocked, size_t used) noexcept
        : _units(sem, blocked)
        , _used_memory(used)
    { }

    size_t used_memory() const { return _used_memory; }
};
class result_memory_accounter {
result_memory_limiter* _limiter = nullptr;
size_t _blocked_bytes = 0;
size_t _used_memory = 0;
size_t _total_used_memory = 0;
query::max_result_size _maximum_result_size;
stop_iteration _stop_on_global_limit;
short_read _short_read_allowed;
mutable bool _below_soft_limit = true;
private:
// Mutation query accounter. Uses provided individual result size limit and
// will stop when shard memory pressure grows too high.
struct mutation_query_tag { };
explicit result_memory_accounter(mutation_query_tag, result_memory_limiter& limiter, query::max_result_size max_size, short_read short_read_allowed) noexcept
: _limiter(&limiter)
, _blocked_bytes(result_memory_limiter::minimum_result_size)
, _maximum_result_size(max_size)
, _stop_on_global_limit(true)
, _short_read_allowed(short_read_allowed)
{ }
// Data query accounter. Uses provided individual result size limit and
// will *not* stop even though shard memory pressure grows too high.
struct data_query_tag { };
explicit result_memory_accounter(data_query_tag, result_memory_limiter& limiter, query::max_result_size max_size, short_read short_read_allowed) noexcept
: _limiter(&limiter)
, _blocked_bytes(result_memory_limiter::minimum_result_size)
, _maximum_result_size(max_size)
, _short_read_allowed(short_read_allowed)
{ }
// Digest query accounter. Uses provided individual result size limit and
// will *not* stop even though shard memory pressure grows too high. This
// accounter does not contribute to the shard memory limits.
struct digest_query_tag { };
explicit result_memory_accounter(digest_query_tag, result_memory_limiter&, query::max_result_size max_size, short_read short_read_allowed) noexcept
: _blocked_bytes(0)
, _maximum_result_size(max_size)
, _short_read_allowed(short_read_allowed)
{ }
stop_iteration check_local_limit() const;
friend class result_memory_limiter;
public:
explicit result_memory_accounter(size_t max_size) noexcept
: _blocked_bytes(0)
, _maximum_result_size(max_size) {
}
result_memory_accounter(result_memory_accounter&& other) noexcept
: _limiter(std::exchange(other._limiter, nullptr))
, _blocked_bytes(other._blocked_bytes)
, _used_memory(other._used_memory)
, _total_used_memory(other._total_used_memory)
, _maximum_result_size(other._maximum_result_size)
, _stop_on_global_limit(other._stop_on_global_limit)
, _short_read_allowed(other._short_read_allowed)
, _below_soft_limit(other._below_soft_limit)
{ }
result_memory_accounter& operator=(result_memory_accounter&& other) noexcept {
if (this != &other) {
this->~result_memory_accounter();
new (this) result_memory_accounter(std::move(other));
}
return *this;
}
~result_memory_accounter() {
if (_limiter) {
_limiter->release(_blocked_bytes);
}
}
size_t used_memory() const { return _used_memory; }
// Consume n more bytes for the result. Returns stop_iteration::yes if
// the result cannot grow any more (taking into account both individual
// and per-shard limits).
stop_iteration update_and_check(size_t n) {
_used_memory += n;
_total_used_memory += n;
auto stop = check_local_limit();
if (_limiter && _used_memory > _blocked_bytes) {
auto to_block = std::min(_used_memory - _blocked_bytes, n);
_blocked_bytes += to_block;
stop = (_limiter->update_and_check(to_block) && _stop_on_global_limit) || stop;
if (stop && !_short_read_allowed) {
// If we are here we stopped because of the global limit.
throw std::runtime_error("Maximum amount of memory for building query results is exhausted, unpaged query cannot be finished");
}
}
return stop;
}
// Checks whether the result can grow any more.
stop_iteration check() const {
auto stop = check_local_limit();
if (!stop && _used_memory >= _blocked_bytes && _limiter) {
return _limiter->check() && _stop_on_global_limit;
}
return stop;
}
// Consume n more bytes for the result.
void update(size_t n) {
update_and_check(n);
}
result_memory_tracker done() && {
if (!_limiter) {
return { };
}
auto& sem = std::exchange(_limiter, nullptr)->sem();
return result_memory_tracker(sem, _blocked_bytes, _used_memory);
}
};
// Waits until minimum_result_size units can be taken from the shard budget,
// then yields an accounter built with the mutation-query tag.
inline future<result_memory_accounter> result_memory_limiter::new_mutation_read(query::max_result_size max_size, short_read short_read_allowed) {
    return _memory_limiter.wait(minimum_result_size).then([this, max_size, short_read_allowed] {
        using tag = result_memory_accounter::mutation_query_tag;
        return result_memory_accounter(tag(), *this, max_size, short_read_allowed);
    });
}
// Waits until minimum_result_size units can be taken from the shard budget,
// then yields an accounter built with the data-query tag.
inline future<result_memory_accounter> result_memory_limiter::new_data_read(query::max_result_size max_size, short_read short_read_allowed) {
    return _memory_limiter.wait(minimum_result_size).then([this, max_size, short_read_allowed] {
        using tag = result_memory_accounter::data_query_tag;
        return result_memory_accounter(tag(), *this, max_size, short_read_allowed);
    });
}
// Yields, without waiting on the shard budget, an accounter built with the
// digest-query tag.
inline future<result_memory_accounter> result_memory_limiter::new_digest_read(query::max_result_size max_size, short_read short_read_allowed) {
    result_memory_accounter accounter(result_memory_accounter::digest_query_tag(), *this, max_size, short_read_allowed);
    return make_ready_future<result_memory_accounter>(std::move(accounter));
}
// Selects which artifacts a query is asked to produce.
// NOTE(review): the enumerator meanings (serialized result only, digest only,
// or both) are inferred from their names; confirm against the result builder.
enum class result_request {
only_result,
only_digest,
result_and_digest,
};
struct result_options {
result_request request = result_request::only_result;
digest_algorithm digest_algo = query::digest_algorithm::none;
static result_options only_result() {
return result_options{};
}
static result_options only_digest(digest_algorithm da) {
return {result_request::only_digest, da};
}
};
// Value wrapper around a 16-byte digest.
class result_digest {
public:
    using type = std::array<uint8_t, 16>;
private:
    type _digest;
public:
    // Leaves the digest bytes default-initialized.
    result_digest() = default;
    // Takes over the given digest bytes.
    result_digest(type&& digest) : _digest(std::move(digest)) {}

    // Read-only access to the raw digest bytes.
    const type& get() const { return _digest; }

    // Two digests are equal when all 16 bytes match.
    bool operator==(const result_digest& rh) const {
        return get() == rh.get();
    }
    bool operator!=(const result_digest& rh) const {
        return !(*this == rh);
    }
};
//
// The query results are stored in a serialized form. This is in order to
// address the following problems, which a structured format has:
//
// - high level of indirection (vector of vectors of vectors of blobs), which
// is not CPU cache friendly
//
// - high allocation rate due to fine-grained object structure
//
// On replica side, the query results are probably going to be serialized in
// the transport layer anyway, so serializing the results up-front doesn't add
// net work. There is no processing of the query results on replica other than
// concatenation in case of range queries and checksum calculation. If query
// results are collected in serialized form from different cores, we can
// concatenate them without copying by simply appending the fragments into the
// packet.
//
// On coordinator side, the query results would have to be parsed from the
// transport layer buffers anyway, so the fact that iterators parse it also
// doesn't add net work, but again saves allocations and copying. The CQL
// server doesn't need complex data structures to process the results, it just
// goes over it linearly consuming it.
//
// The coordinator side could be optimized even further for CQL queries which
// do not need processing (eg. select * from cf where ...). We could make the
// replica send the query results in the format which is expected by the CQL
// binary protocol client. So in the typical case the coordinator would just
// pass the data using zero-copy to the client, prepending a header.
//
// Users which need more complex structure of query results can convert this
// to query::result_set.
//
// Related headers:
// - query-result-reader.hh
// - query-result-writer.hh
// A query result kept in serialized form (the rationale is given in the
// comment preceding this class), together with its metadata: an optional
// digest, row and partition counts, a last-modified timestamp, the short-read
// flag, the memory tracker and an optional last position.
//
// The 64-bit row count is stored as two optional 32-bit halves
// (_row_count_low_bits / _row_count_high_bits).
class result {
    bytes_ostream _w;
    std::optional<result_digest> _digest;
    std::optional<uint32_t> _row_count_low_bits;
    api::timestamp_type _last_modified = api::missing_timestamp;
    short_read _short_read;
    query::result_memory_tracker _memory_tracker;
    std::optional<uint32_t> _partition_count;
    std::optional<uint32_t> _row_count_high_bits;
    std::optional<full_position> _last_position;
public:
    class builder;
    class partition_writer;
    friend class result_merger;

    result();

    // Result without digest; the row count is supplied as separate low/high
    // 32-bit halves.
    result(bytes_ostream&& w, short_read sr, std::optional<uint32_t> c_low_bits, std::optional<uint32_t> pc,
           std::optional<uint32_t> c_high_bits, std::optional<full_position> last_position, result_memory_tracker memory_tracker = { })
        : _w(std::move(w))
        , _row_count_low_bits(c_low_bits)
        , _short_read(sr)
        , _memory_tracker(std::move(memory_tracker))
        , _partition_count(pc)
        , _row_count_high_bits(c_high_bits)
        , _last_position(std::move(last_position))
    {
        // Operate on the member: `w` has been moved from above, so calling
        // this on the parameter would leave the stored buffer untouched.
        _w.reduce_chunk_count();
    }

    // Result with digest and last-modified timestamp; the row count is
    // supplied as separate low/high 32-bit halves.
    result(bytes_ostream&& w, std::optional<result_digest> d, api::timestamp_type last_modified,
           short_read sr, std::optional<uint32_t> c_low_bits, std::optional<uint32_t> pc, std::optional<uint32_t> c_high_bits,
           std::optional<full_position> last_position, result_memory_tracker memory_tracker = { })
        : _w(std::move(w))
        , _digest(d)
        , _row_count_low_bits(c_low_bits)
        , _last_modified(last_modified)
        , _short_read(sr)
        , _memory_tracker(std::move(memory_tracker))
        , _partition_count(pc)
        , _row_count_high_bits(c_high_bits)
        , _last_position(std::move(last_position))
    {
        // See the first constructor: the member, not the moved-from
        // parameter, is the buffer that has to be processed.
        _w.reduce_chunk_count();
    }

    // Result without digest; the 64-bit row count `c` is split into its low
    // and high 32-bit halves.
    result(bytes_ostream&& w, short_read sr, uint64_t c, std::optional<uint32_t> pc,
           std::optional<full_position> last_position, result_memory_tracker memory_tracker = { })
        : _w(std::move(w))
        , _row_count_low_bits(static_cast<uint32_t>(c))
        , _short_read(sr)
        , _memory_tracker(std::move(memory_tracker))
        , _partition_count(pc)
        , _row_count_high_bits(static_cast<uint32_t>(c >> 32))
        , _last_position(std::move(last_position))
    {
        // See the first constructor: the member, not the moved-from
        // parameter, is the buffer that has to be processed.
        _w.reduce_chunk_count();
    }

    // Result with digest and last-modified timestamp; the 64-bit row count
    // `c` is split into its low and high 32-bit halves.
    result(bytes_ostream&& w, std::optional<result_digest> d, api::timestamp_type last_modified,
           short_read sr, uint64_t c, std::optional<uint32_t> pc, std::optional<full_position> last_position, result_memory_tracker memory_tracker = { })
        : _w(std::move(w))
        , _digest(d)
        , _row_count_low_bits(static_cast<uint32_t>(c))
        , _last_modified(last_modified)
        , _short_read(sr)
        , _memory_tracker(std::move(memory_tracker))
        , _partition_count(pc)
        , _row_count_high_bits(static_cast<uint32_t>(c >> 32))
        , _last_position(std::move(last_position))
    {
        // See the first constructor: the member, not the moved-from
        // parameter, is the buffer that has to be processed.
        _w.reduce_chunk_count();
    }

    result(result&&) = default;
    result(const result&) = default;
    result& operator=(result&&) = default;
    result& operator=(const result&) = default;

    // The serialized result bytes.
    const bytes_ostream& buf() const {
        return _w;
    }

    const std::optional<result_digest>& digest() const {
        return _digest;
    }

    const std::optional<uint32_t> row_count_low_bits() const {
        return _row_count_low_bits;
    }

    const std::optional<uint32_t> row_count_high_bits() const {
        return _row_count_high_bits;
    }

    // Full 64-bit row count, or an empty optional when the low half is not
    // set. A missing high half is treated as zero.
    const std::optional<uint64_t> row_count() const {
        if (!_row_count_low_bits) {
            return std::nullopt;
        }
        return (static_cast<uint64_t>(_row_count_high_bits.value_or(0)) << 32) | _row_count_low_bits.value();
    }

    // Stores the row count as two 32-bit halves, or clears both halves when
    // `row_count` is empty.
    void set_row_count(std::optional<uint64_t> row_count) {
        if (!row_count) {
            _row_count_low_bits = std::nullopt;
            _row_count_high_bits = std::nullopt;
        } else {
            _row_count_low_bits = std::make_optional(static_cast<uint32_t>(row_count.value()));
            _row_count_high_bits = std::make_optional(static_cast<uint32_t>(row_count.value() >> 32));
        }
    }

    const api::timestamp_type last_modified() const {
        return _last_modified;
    }

    short_read is_short_read() const {
        return _short_read;
    }

    const std::optional<uint32_t>& partition_count() const {
        return _partition_count;
    }

    void ensure_counts();

    const std::optional<full_position>& last_position() const {
        return _last_position;
    }

    void set_last_position(std::optional<full_position> last_position) {
        _last_position = std::move(last_position);
    }

    // Return _last_position if replica filled it, otherwise calculate it based
    // on the content (by looking up the last row in the last partition).
    full_position get_or_calculate_last_position() const;

    // Bundles a result with the schema and slice; this is the type accepted
    // by the operator<< declared after this class.
    struct printer {
        schema_ptr s;
        const query::partition_slice& slice;
        const query::result& res;
    };

    sstring pretty_print(schema_ptr, const query::partition_slice&) const;
    printer pretty_printer(schema_ptr, const query::partition_slice&) const;
};
std::ostream& operator<<(std::ostream& os, const query::result::printer&);
}