/* * Copyright (C) 2015-present ScyllaDB */ /* * SPDX-License-Identifier: AGPL-3.0-or-later */ #include #include #include #include "query-request.hh" #include "query-result.hh" #include "query-result-writer.hh" #include "query-result-set.hh" #include "seastar/core/shared_ptr.hh" #include "seastar/core/thread.hh" #include "to_string.hh" #include "bytes.hh" #include "mutation_partition_serializer.hh" #include "query-result-reader.hh" #include "query_result_merger.hh" #include "partition_slice_builder.hh" #include "schema_registry.hh" #include "utils/overloaded_functor.hh" namespace query { static logging::logger qlogger("query"); constexpr size_t result_memory_limiter::minimum_result_size; constexpr size_t result_memory_limiter::maximum_result_size; constexpr size_t result_memory_limiter::unlimited_result_size; thread_local semaphore result_memory_tracker::_dummy { 0 }; const dht::partition_range full_partition_range = dht::partition_range::make_open_ended_both_sides(); const clustering_range full_clustering_range = clustering_range::make_open_ended_both_sides(); std::ostream& operator<<(std::ostream& out, const specific_ranges& s); std::ostream& operator<<(std::ostream& out, const partition_slice& ps) { out << "{" << "regular_cols=[" << join(", ", ps.regular_columns) << "]" << ", static_cols=[" << join(", ", ps.static_columns) << "]" << ", rows=[" << join(", ", ps._row_ranges) << "]" ; if (ps._specific_ranges) { out << ", specific=[" << *ps._specific_ranges << "]"; } out << ", options=" << format("{:x}", ps.options.mask()); // FIXME: pretty print options out << ", cql_format=" << ps.cql_format(); out << ", partition_row_limit=" << ps.partition_row_limit(); return out << "}"; } std::ostream& operator<<(std::ostream& out, const read_command& r) { return out << "read_command{" << "cf_id=" << r.cf_id << ", version=" << r.schema_version << ", slice=" << r.slice << "" << ", limit=" << r.get_row_limit() << ", timestamp=" << r.timestamp.time_since_epoch().count() << ", partition_limit=" << r.partition_limit << ", query_uuid=" << r.query_uuid << ", is_first_page=" << r.is_first_page << ", read_timestamp=" << r.read_timestamp << "}"; } std::ostream& operator<<(std::ostream& out, const forward_request::reduction_type& r) { out << "reduction_type{"; switch (r) { case forward_request::reduction_type::count: out << "count"; break; case forward_request::reduction_type::aggregate: out << "aggregate"; break; } return out << "}"; } std::ostream& operator<<(std::ostream& out, const forward_request::aggregation_info& a) { return out << "aggregation_info{" << ", name=" << a.name << ", column_names=[" << join(",", a.column_names) << "]" << "}"; } std::ostream& operator<<(std::ostream& out, const forward_request& r) { auto ms = std::chrono::time_point_cast(r.timeout).time_since_epoch().count(); out << "forward_request{" << "reductions=[" << join(",", r.reduction_types) << "]"; if(r.aggregation_infos) { out << ", aggregation_infos=[" << join(",", r.aggregation_infos.value()) << "]"; } return out << ", cmd=" << r.cmd << ", pr=" << r.pr << ", cl=" << r.cl << ", timeout(ms)=" << ms << "}"; } std::ostream& operator<<(std::ostream& out, const specific_ranges& s) { return out << "{" << s._pk << " : " << join(", ", s._ranges) << "}"; } void trim_clustering_row_ranges_to(const schema& s, clustering_row_ranges& ranges, position_in_partition_view pos, bool reversed) { auto cmp = [reversed, cmp = position_in_partition::composite_tri_compare(s)] (const auto& a, const auto& b) { return reversed ? cmp(b, a) : cmp(a, b); }; auto start_bound = [reversed] (const auto& range) -> position_in_partition_view { return reversed ? position_in_partition_view::for_range_end(range) : position_in_partition_view::for_range_start(range); }; auto end_bound = [reversed] (const auto& range) -> position_in_partition_view { return reversed ? position_in_partition_view::for_range_start(range) : position_in_partition_view::for_range_end(range); }; auto it = ranges.begin(); while (it != ranges.end()) { if (cmp(end_bound(*it), pos) <= 0) { it = ranges.erase(it); continue; } else if (cmp(start_bound(*it), pos) <= 0) { assert(cmp(pos, end_bound(*it)) < 0); auto r = reversed ? clustering_range(it->start(), clustering_range::bound(pos.key(), pos.get_bound_weight() != bound_weight::before_all_prefixed)) : clustering_range(clustering_range::bound(pos.key(), pos.get_bound_weight() != bound_weight::after_all_prefixed), it->end()); *it = std::move(r); } ++it; } } void trim_clustering_row_ranges_to(const schema& s, clustering_row_ranges& ranges, const clustering_key& key, bool reversed) { if (key.is_full(s)) { return trim_clustering_row_ranges_to(s, ranges, reversed ? position_in_partition_view::before_key(key) : position_in_partition_view::after_key(key), reversed); } auto full_key = key; clustering_key::make_full(s, full_key); return trim_clustering_row_ranges_to(s, ranges, reversed ? position_in_partition_view::after_key(full_key) : position_in_partition_view::before_key(full_key), reversed); } clustering_range reverse(const clustering_range& range) { if (range.is_singular()) { return range; } return clustering_range(range.end(), range.start()); } static void reverse_clustering_ranges_bounds(clustering_row_ranges& ranges) { for (auto& range : ranges) { range = reverse(range); } } partition_slice legacy_reverse_slice_to_native_reverse_slice(const schema& schema, partition_slice slice) { return partition_slice_builder(schema, std::move(slice)) .mutate_ranges([] (clustering_row_ranges& ranges) { reverse_clustering_ranges_bounds(ranges); }) .mutate_specific_ranges([] (specific_ranges& ranges) { reverse_clustering_ranges_bounds(ranges.ranges()); }) .build(); } partition_slice native_reverse_slice_to_legacy_reverse_slice(const schema& schema, partition_slice slice) { // They are the same, we give them different names to express intent return legacy_reverse_slice_to_native_reverse_slice(schema, std::move(slice)); } partition_slice reverse_slice(const schema& schema, partition_slice slice) { return partition_slice_builder(schema, std::move(slice)) .mutate_ranges([] (clustering_row_ranges& ranges) { std::reverse(ranges.begin(), ranges.end()); reverse_clustering_ranges_bounds(ranges); }) .mutate_specific_ranges([] (specific_ranges& sranges) { auto& ranges = sranges.ranges(); std::reverse(ranges.begin(), ranges.end()); reverse_clustering_ranges_bounds(ranges); }) .with_option_toggled() .build(); } partition_slice half_reverse_slice(const schema& schema, partition_slice slice) { return partition_slice_builder(schema, std::move(slice)) .mutate_ranges([] (clustering_row_ranges& ranges) { std::reverse(ranges.begin(), ranges.end()); }) .mutate_specific_ranges([] (specific_ranges& sranges) { auto& ranges = sranges.ranges(); std::reverse(ranges.begin(), ranges.end()); }) .with_option_toggled() .build(); } partition_slice::partition_slice(clustering_row_ranges row_ranges, query::column_id_vector static_columns, query::column_id_vector regular_columns, option_set options, std::unique_ptr specific_ranges, cql_serialization_format cql_format, uint32_t partition_row_limit_low_bits, uint32_t partition_row_limit_high_bits) : _row_ranges(std::move(row_ranges)) , static_columns(std::move(static_columns)) , regular_columns(std::move(regular_columns)) , options(options) , _specific_ranges(std::move(specific_ranges)) , _cql_format(std::move(cql_format)) , _partition_row_limit_low_bits(partition_row_limit_low_bits) , _partition_row_limit_high_bits(partition_row_limit_high_bits) {} partition_slice::partition_slice(clustering_row_ranges row_ranges, query::column_id_vector static_columns, query::column_id_vector regular_columns, option_set options, std::unique_ptr specific_ranges, cql_serialization_format cql_format, uint64_t partition_row_limit) : partition_slice(std::move(row_ranges), std::move(static_columns), std::move(regular_columns), options, std::move(specific_ranges), std::move(cql_format), static_cast(partition_row_limit), static_cast(partition_row_limit >> 32)) {} partition_slice::partition_slice(clustering_row_ranges ranges, const schema& s, const column_set& columns, option_set options) : partition_slice(ranges, query::column_id_vector{}, query::column_id_vector{}, options) { regular_columns.reserve(columns.count()); for (ordinal_column_id id = columns.find_first(); id != column_set::npos; id = columns.find_next(id)) { const column_definition& def = s.column_at(id); if (def.is_static()) { static_columns.push_back(def.id); } else if (def.is_regular()) { regular_columns.push_back(def.id); } // else clustering or partition key column - skip, these are controlled by options } } partition_slice::partition_slice(partition_slice&&) = default; partition_slice& partition_slice::operator=(partition_slice&& other) noexcept = default; // Only needed because selection_statement::execute does copies of its read_command // in the map-reduce op. partition_slice::partition_slice(const partition_slice& s) : _row_ranges(s._row_ranges) , static_columns(s.static_columns) , regular_columns(s.regular_columns) , options(s.options) , _specific_ranges(s._specific_ranges ? std::make_unique(*s._specific_ranges) : nullptr) , _cql_format(s._cql_format) , _partition_row_limit_low_bits(s._partition_row_limit_low_bits) , _partition_row_limit_high_bits(s._partition_row_limit_high_bits) {} partition_slice::~partition_slice() {} const clustering_row_ranges& partition_slice::row_ranges(const schema& s, const partition_key& k) const { auto* r = _specific_ranges ? _specific_ranges->range_for(s, k) : nullptr; return r ? *r : _row_ranges; } void partition_slice::set_range(const schema& s, const partition_key& k, clustering_row_ranges range) { if (!_specific_ranges) { _specific_ranges = std::make_unique(k, std::move(range)); } else { _specific_ranges->add(s, k, std::move(range)); } } void partition_slice::clear_range(const schema& s, const partition_key& k) { if (_specific_ranges && _specific_ranges->contains(s, k)) { // just in case someone changes the impl above, // we should do actual remove if specific_ranges suddenly // becomes an actual map assert(_specific_ranges->size() == 1); _specific_ranges = nullptr; } } clustering_row_ranges partition_slice::get_all_ranges() const { auto all_ranges = default_row_ranges(); const auto& specific_ranges = get_specific_ranges(); if (specific_ranges) { all_ranges.insert(all_ranges.end(), specific_ranges->ranges().begin(), specific_ranges->ranges().end()); } return all_ranges; } sstring result::pretty_print(schema_ptr s, const query::partition_slice& slice) const { std::ostringstream out; out << "{ result: " << result_set::from_raw_result(s, slice, *this); out << " digest: "; if (_digest) { out << std::hex << std::setw(2); for (auto&& c : _digest->get()) { out << unsigned(c) << " "; } } else { out << "{}"; } out << ", short_read=" << is_short_read() << " }"; return out.str(); } query::result::printer result::pretty_printer(schema_ptr s, const query::partition_slice& slice) const { return query::result::printer{s, slice, *this}; } std::ostream& operator<<(std::ostream& os, const query::result::printer& p) { os << p.res.pretty_print(p.s, p.slice); return os; } void result::ensure_counts() { if (!_partition_count || !row_count()) { uint64_t row_count; std::tie(_partition_count, row_count) = result_view::do_with(*this, [this] (auto&& view) { return view.count_partitions_and_rows(); }); set_row_count(row_count); } } full_position result::get_or_calculate_last_position() const { if (_last_position) { return *_last_position; } return result_view::do_with(*this, [] (const result_view& v) { return v.calculate_last_position(); }); } result::result() : result([] { bytes_ostream out; ser::writer_of_query_result(out).skip_partitions().end_query_result(); return out; }(), short_read::no, 0, 0, {}) { } static void write_partial_partition(ser::writer_of_qr_partition&& pw, const ser::qr_partition_view& pv, uint64_t rows_to_include) { auto key = pv.key(); auto static_cells_wr = (key ? std::move(pw).write_key(*key) : std::move(pw).skip_key()) .start_static_row() .start_cells(); for (auto&& cell : pv.static_row().cells()) { static_cells_wr.add(cell); } auto rows_wr = std::move(static_cells_wr) .end_cells() .end_static_row() .start_rows(); auto rows = pv.rows(); // rows.size() can be 0 is there's a single static row auto it = rows.begin(); for (uint64_t i = 0; i < std::min(rows.size(), rows_to_include); ++i) { rows_wr.add(*it++); } std::move(rows_wr).end_rows().end_qr_partition(); } foreign_ptr> result_merger::get() { if (_partial.size() == 1) { return std::move(_partial[0]); } bytes_ostream w; auto partitions = ser::writer_of_query_result(w).start_partitions(); uint64_t row_count = 0; short_read is_short_read; uint32_t partition_count = 0; std::optional last_position; for (auto&& r : _partial) { result_view::do_with(*r, [&] (result_view rv) { last_position.reset(); for (auto&& pv : rv._v.partitions()) { auto rows = pv.rows(); // If rows.empty(), then there's a static row, or there wouldn't be a partition const uint64_t rows_in_partition = rows.size() ? : 1; const uint64_t rows_to_include = std::min(_max_rows - row_count, rows_in_partition); row_count += rows_to_include; if (rows_to_include >= rows_in_partition) { partitions.add(pv); if (++partition_count >= _max_partitions) { return; } } else if (rows_to_include > 0) { ++partition_count; write_partial_partition(partitions.add(), pv, rows_to_include); return; } else { return; } } last_position = r->last_position(); }); if (r->is_short_read()) { is_short_read = short_read::yes; last_position.reset(); break; } if (row_count >= _max_rows || partition_count >= _max_partitions) { last_position.reset(); break; } } std::move(partitions).end_partitions().end_query_result(); return make_foreign(make_lw_shared(std::move(w), is_short_read, row_count, partition_count, std::move(last_position))); } std::ostream& operator<<(std::ostream& out, const query::forward_result::printer& p) { if (p.functions.size() != p.res.query_results.size()) { return out << "[malformed forward_result (" << p.res.query_results.size() << " results, " << p.functions.size() << " aggregates)]"; } out << "["; for (size_t i = 0; i < p.functions.size(); i++) { auto& return_type = p.functions[i]->return_type(); out << return_type->to_string(bytes_view(*p.res.query_results[i])); if (i + 1 < p.functions.size()) { out << ", "; } } return out << "]"; } } std::optional position_range_to_clustering_range(const position_range& r, const schema& s) { assert(r.start().get_type() == partition_region::clustered); assert(r.end().get_type() == partition_region::clustered); if (r.start().has_key() && r.end().has_key() && clustering_key_prefix::equality(s)(r.start().key(), r.end().key())) { assert(r.start().get_bound_weight() != r.end().get_bound_weight()); if (r.end().get_bound_weight() == bound_weight::after_all_prefixed && r.start().get_bound_weight() != bound_weight::after_all_prefixed) { // [before x, after x) and [for x, after x) get converted to [x, x]. return query::clustering_range::make_singular(r.start().key()); } // [before x, for x) does not contain any keys. return std::nullopt; } // position_range -> clustering_range // (recall that position_ranges are always left-closed, right opened): // [before x, ...), [for x, ...) -> [x, ... // [after x, ...) -> (x, ... // [..., before x), [..., for x) -> ..., x) // [..., after x) -> ..., x] auto to_bound = [&s] (const position_in_partition& p, bool left) -> std::optional { if (p.is_before_all_clustered_rows(s)) { assert(left); return {}; } if (p.is_after_all_clustered_rows(s)) { assert(!left); return {}; } assert(p.has_key()); auto bw = p.get_bound_weight(); bool inclusive = left ? bw != bound_weight::after_all_prefixed : bw == bound_weight::after_all_prefixed; return query::clustering_range::bound{p.key(), inclusive}; }; return query::clustering_range{to_bound(r.start(), true), to_bound(r.end(), false)}; }