1160 lines
50 KiB
C++
1160 lines
50 KiB
C++
/*
|
|
* Copyright (C) 2020-present ScyllaDB
|
|
*/
|
|
|
|
/*
|
|
* SPDX-License-Identifier: AGPL-3.0-or-later
|
|
*/
|
|
|
|
#include <seastar/util/defer.hh>
|
|
|
|
#include <boost/icl/interval_map.hpp>
|
|
#include <boost/range/adaptor/map.hpp>
|
|
#include <boost/range/algorithm/remove_if.hpp>
|
|
#include <boost/range/algorithm/sort.hpp>
|
|
|
|
#include "compatible_ring_position.hh"
|
|
#include "compaction/compaction_strategy_impl.hh"
|
|
#include "compaction/leveled_compaction_strategy.hh"
|
|
#include "compaction/time_window_compaction_strategy.hh"
|
|
|
|
#include "sstable_set_impl.hh"
|
|
|
|
#include "replica/database.hh"
|
|
#include "readers/from_mutations_v2.hh"
|
|
#include "readers/empty_v2.hh"
|
|
#include "readers/combined.hh"
|
|
|
|
namespace sstables {
|
|
|
|
// Adds `sst` to the collection of sstables that make up this run.
void sstable_run::insert(shared_sstable sst) {
    _all.insert(std::move(sst));
}
|
|
|
|
// Removes `sst` from the collection of sstables that make up this run.
void sstable_run::erase(shared_sstable sst) {
    _all.erase(sst);
}
|
|
|
|
uint64_t sstable_run::data_size() const {
|
|
return boost::accumulate(_all | boost::adaptors::transformed(std::mem_fn(&sstable::data_size)), uint64_t(0));
|
|
}
|
|
|
|
// Returns the arithmetic mean of the per-sstable droppable tombstone ratio
// estimates (computed against `gc_before`), or 0 when the run has no sstables.
double sstable_run::estimate_droppable_tombstone_ratio(gc_clock::time_point gc_before) const {
    auto ratio_of = [gc_before] (const shared_sstable& sst) -> double {
        return sst->estimate_droppable_tombstone_ratio(gc_before);
    };
    const double total = boost::accumulate(_all | boost::adaptors::transformed(ratio_of), double(0));
    if (!_all.size()) {
        return double(0);
    }
    return total / _all.size();
}
|
|
|
|
// Prints a human-readable description of `run`: its run identifier (taken from
// the first sstable, if any) followed by one line per fragment, giving the
// fragment's generation and first/last tokens. Fragments are listed in order
// of their first token.
std::ostream& operator<<(std::ostream& os, const sstables::sstable_run& run) {
    const auto& members = run.all();

    os << "Run = {\n";
    if (!members.empty()) {
        os << format(" Identifier: {}\n", (*members.begin())->run_identifier());
    } else {
        os << " Identifier: not found\n";
    }

    auto by_first_token = [] (const shared_sstable& lhs, const shared_sstable& rhs) {
        return lhs->get_first_decorated_key().token() < rhs->get_first_decorated_key().token();
    };
    auto ordered = boost::copy_range<std::vector<shared_sstable>>(members);
    boost::sort(ordered, by_first_token);

    os << " Fragments = {\n";
    for (const auto& fragment : ordered) {
        os << format(" {}={}:{}\n", fragment->generation(), fragment->get_first_decorated_key().token(), fragment->get_last_decorated_key().token());
    }
    os << " }\n}\n";
    return os;
}
|
|
|
|
// Builds an sstable_set that takes ownership of the implementation `impl`
// and keeps a reference to schema `s`.
sstable_set::sstable_set(std::unique_ptr<sstable_set_impl> impl, schema_ptr s)
        : _impl(std::move(impl)), _schema(std::move(s))
{ }
|
|
|
|
// Copy constructor: the implementation object is duplicated through clone(),
// the schema pointer is copied.
sstable_set::sstable_set(const sstable_set& x)
        : _impl(x._impl->clone()), _schema(x._schema)
{ }
|
|
|
|
// Move constructor: defaulted (member-wise move), declared noexcept.
sstable_set::sstable_set(sstable_set&&) noexcept = default;
|
|
|
|
// Copy assignment, implemented as copy-construct followed by move-assign.
// Self-assignment is a no-op.
sstable_set& sstable_set::operator=(const sstable_set& x) {
    if (this == &x) {
        return *this;
    }
    sstable_set copy(x);
    *this = std::move(copy);
    return *this;
}
|
|
|
|
// Move assignment: defaulted (member-wise move), declared noexcept.
sstable_set& sstable_set::operator=(sstable_set&&) noexcept = default;
|
|
|
|
std::vector<shared_sstable>
|
|
sstable_set::select(const dht::partition_range& range) const {
|
|
return _impl->select(range);
|
|
}
|
|
|
|
std::vector<sstable_run>
|
|
sstable_set::select_sstable_runs(const std::vector<shared_sstable>& sstables) const {
|
|
return _impl->select_sstable_runs(sstables);
|
|
}
|
|
|
|
std::vector<sstable_run>
|
|
partitioned_sstable_set::select_sstable_runs(const std::vector<shared_sstable>& sstables) const {
|
|
auto has_run = [this] (const shared_sstable& sst) { return _all_runs.contains(sst->run_identifier()); };
|
|
auto run_ids = boost::copy_range<std::unordered_set<utils::UUID>>(sstables | boost::adaptors::filtered(has_run) | boost::adaptors::transformed(std::mem_fn(&sstable::run_identifier)));
|
|
return boost::copy_range<std::vector<sstable_run>>(run_ids | boost::adaptors::transformed([this] (utils::UUID run_id) {
|
|
return _all_runs.at(run_id);
|
|
}));
|
|
}
|
|
|
|
lw_shared_ptr<sstable_list>
|
|
sstable_set::all() const {
|
|
return _impl->all();
|
|
}
|
|
|
|
void sstable_set::for_each_sstable(std::function<void(const shared_sstable&)> func) const {
|
|
return _impl->for_each_sstable(std::move(func));
|
|
}
|
|
|
|
void
|
|
sstable_set::insert(shared_sstable sst) {
|
|
_impl->insert(sst);
|
|
}
|
|
|
|
void
|
|
sstable_set::erase(shared_sstable sst) {
|
|
_impl->erase(sst);
|
|
}
|
|
|
|
// Destructor: defaulted, defined out of line in this translation unit.
sstable_set::~sstable_set() = default;
|
|
|
|
// Wraps the selector implementation `impl`; the comparator used to test
// positions against the cached range is built from schema `s`.
sstable_set::incremental_selector::incremental_selector(std::unique_ptr<incremental_selector_impl> impl, const schema& s)
        : _impl(std::move(impl)), _cmp(s)
{ }
|
|
|
|
// Destructor: defaulted, defined out of line in this translation unit.
sstable_set::incremental_selector::~incremental_selector() = default;
|
|
|
|
// Move constructor: defaulted (member-wise move), declared noexcept.
sstable_set::incremental_selector::incremental_selector(sstable_set::incremental_selector&&) noexcept = default;
|
|
|
|
// Returns the sstables selected for `pos` together with the next position
// reported by the implementation.
//
// The result of the last implementation call is cached: the implementation is
// consulted again only when there is no cached range yet or `pos` falls
// outside of it.
sstable_set::incremental_selector::selection
sstable_set::incremental_selector::select(const dht::ring_position_view& pos) const {
    const bool cache_hit = _current_range_view && _current_range_view->contains(pos, _cmp);
    if (!cache_hit) {
        std::tie(_current_range, _current_sstables, _current_next_position) = _impl->select(pos);
        _current_range_view = _current_range->transform([] (const dht::ring_position& rp) {
            return dht::ring_position_view(rp);
        });
    }
    return {_current_sstables, _current_next_position};
}
|
|
|
|
sstable_set::incremental_selector
|
|
sstable_set::make_incremental_selector() const {
|
|
return incremental_selector(_impl->make_incremental_selector(), *_schema);
|
|
}
|
|
|
|
partitioned_sstable_set::interval_type partitioned_sstable_set::make_interval(const schema& s, const dht::partition_range& range) {
|
|
return interval_type::closed(
|
|
compatible_ring_position_or_view(s, dht::ring_position_view(range.start()->value())),
|
|
compatible_ring_position_or_view(s, dht::ring_position_view(range.end()->value())));
|
|
}
|
|
|
|
partitioned_sstable_set::interval_type partitioned_sstable_set::make_interval(const dht::partition_range& range) const {
|
|
return make_interval(*_schema, range);
|
|
}
|
|
|
|
partitioned_sstable_set::interval_type partitioned_sstable_set::make_interval(const schema_ptr& s, const sstable& sst) {
|
|
return interval_type::closed(
|
|
compatible_ring_position_or_view(s, dht::ring_position(sst.get_first_decorated_key())),
|
|
compatible_ring_position_or_view(s, dht::ring_position(sst.get_last_decorated_key())));
|
|
}
|
|
|
|
partitioned_sstable_set::interval_type partitioned_sstable_set::make_interval(const sstable& sst) {
|
|
return make_interval(_schema, sst);
|
|
}
|
|
|
|
partitioned_sstable_set::interval_type partitioned_sstable_set::singular(const dht::ring_position& rp) const {
|
|
// We should use the view here, since this is used for queries.
|
|
auto rpv = dht::ring_position_view(rp);
|
|
auto crp = compatible_ring_position_or_view(*_schema, std::move(rpv));
|
|
return interval_type::closed(crp, crp);
|
|
}
|
|
|
|
std::pair<partitioned_sstable_set::map_iterator, partitioned_sstable_set::map_iterator>
|
|
partitioned_sstable_set::query(const dht::partition_range& range) const {
|
|
if (range.start() && range.end()) {
|
|
return _leveled_sstables.equal_range(make_interval(range));
|
|
}
|
|
else if (range.start() && !range.end()) {
|
|
auto start = singular(range.start()->value());
|
|
return { _leveled_sstables.lower_bound(start), _leveled_sstables.end() };
|
|
} else if (!range.start() && range.end()) {
|
|
auto end = singular(range.end()->value());
|
|
return { _leveled_sstables.begin(), _leveled_sstables.upper_bound(end) };
|
|
} else {
|
|
return { _leveled_sstables.begin(), _leveled_sstables.end() };
|
|
}
|
|
}
|
|
|
|
bool partitioned_sstable_set::store_as_unleveled(const shared_sstable& sst) const {
|
|
return _use_level_metadata && sst->get_sstable_level() == 0;
|
|
}
|
|
|
|
// Converts an interval bound into an owning dht::ring_position made of the
// bound's token and key.
dht::ring_position partitioned_sstable_set::to_ring_position(const compatible_ring_position_or_view& crp) {
    // Ring position views that represent bounds of sstable intervals are
    // guaranteed to have key() != nullptr, so the dereference below is safe.
    const auto& view = crp.position();
    return dht::ring_position(view.token(), *view.key());
}
|
|
|
|
// Converts interval `i` into a dht::partition_range with the same bounds;
// each bound is inclusive iff the interval is closed on that side.
dht::partition_range partitioned_sstable_set::to_partition_range(const interval_type& i) {
    using bound = dht::partition_range::bound;
    bound lower(to_ring_position(i.lower()), boost::icl::is_left_closed(i.bounds()));
    bound upper(to_ring_position(i.upper()), boost::icl::is_right_closed(i.bounds()));
    return dht::partition_range::make(std::move(lower), std::move(upper));
}
|
|
|
|
// Builds the range that starts at `pos` and ends at the lower bound of
// interval `i`.
//
// Start bound: if `pos` has a key, the bound is (token, key) and is inclusive
// iff `pos` is not an after-key position; otherwise it is the inclusive token
// bound of `pos`.
// End bound: the interval's lower bound, inclusive iff the interval is NOT
// closed on the left.
dht::partition_range partitioned_sstable_set::to_partition_range(const dht::ring_position_view& pos, const interval_type& i) {
    using bound = dht::partition_range::bound;

    auto start = pos.key()
            ? bound(dht::ring_position(pos.token(), *pos.key()),
                    pos.is_after_key() == dht::ring_position_view::after_key::no)
            : bound(dht::ring_position(pos.token(), pos.get_token_bound()), true);
    auto end = bound(to_ring_position(i.lower()), !boost::icl::is_left_closed(i.bounds()));
    return dht::partition_range::make(std::move(start), std::move(end));
}
|
|
|
|
// Creates a set over `schema` that adopts the list `all` as its list of all
// sstables. The unleveled vector, interval map and run map start empty.
partitioned_sstable_set::partitioned_sstable_set(schema_ptr schema, lw_shared_ptr<sstable_list> all, bool use_level_metadata)
        : _schema(std::move(schema))
        , _all(std::move(all))
        , _use_level_metadata(use_level_metadata)
{ }
|
|
|
|
// Creates a set from already-built state (used by clone()). Every container
// is copied; in particular `all` is copied into a freshly allocated list, so
// the new set does not share its sstable list with the source.
partitioned_sstable_set::partitioned_sstable_set(
        schema_ptr schema,
        const std::vector<shared_sstable>& unleveled_sstables,
        const interval_map_type& leveled_sstables,
        const lw_shared_ptr<sstable_list>& all,
        const std::unordered_map<utils::UUID, sstable_run>& all_runs,
        bool use_level_metadata)
        : _schema(schema)
        , _unleveled_sstables(unleveled_sstables)
        , _leveled_sstables(leveled_sstables)
        , _all(make_lw_shared<sstable_list>(*all))
        , _all_runs(all_runs)
        , _use_level_metadata(use_level_metadata)
{ }
|
|
|
|
// Returns a new partitioned_sstable_set constructed from copies of this
// set's state.
auto partitioned_sstable_set::clone() const -> std::unique_ptr<sstable_set_impl> {
    return std::make_unique<partitioned_sstable_set>(
            _schema, _unleveled_sstables, _leveled_sstables, _all, _all_runs, _use_level_metadata);
}
|
|
|
|
// Returns all unleveled sstables followed by the leveled sstables found in
// the interval map entries selected by query(range). The leveled sstables are
// gathered through a value_set first, so each one is reported once even if it
// appears in several entries.
std::vector<shared_sstable> partitioned_sstable_set::select(const dht::partition_range& range) const {
    auto [entry, entries_end] = query(range);

    value_set leveled;
    for (; entry != entries_end; ++entry) {
        boost::copy(entry->second, std::inserter(leveled, leveled.end()));
    }

    auto selected = _unleveled_sstables;
    selected.insert(selected.end(), leveled.begin(), leveled.end());
    return selected;
}
|
|
|
|
lw_shared_ptr<sstable_list> partitioned_sstable_set::all() const {
|
|
return _all;
|
|
}
|
|
|
|
void partitioned_sstable_set::for_each_sstable(std::function<void(const shared_sstable&)> func) const {
|
|
for (auto& sst : *_all) {
|
|
func(sst);
|
|
}
|
|
}
|
|
|
|
// Inserts `sst` into the set: into the list of all sstables, into its run,
// and into either the unleveled vector or the leveled interval map
// (see store_as_unleveled()).
//
// Inserting an sstable that is already in the list of all sstables is a
// no-op. Without this guard a repeated insert would push a duplicate into
// _unleveled_sstables, and a failure further down would roll back by erasing
// the pre-existing entry from _all.
// NOTE(review): this assumes that an sstable present in _all is also present
// in the other containers, i.e. that _all is only populated through insert().
//
// If a later step throws, the insertions already performed by this call are
// undone by the deferred actions below.
void partitioned_sstable_set::insert(shared_sstable sst) {
    if (!_all->insert(sst).second) {
        // Already tracked; nothing to do.
        return;
    }
    auto undo_all_insert = defer([&] () { _all->erase(sst); });

    _all_runs[sst->run_identifier()].insert(sst);
    auto undo_all_runs_insert = defer([&] () { _all_runs[sst->run_identifier()].erase(sst); });

    if (store_as_unleveled(sst)) {
        _unleveled_sstables.push_back(sst);
    } else {
        // Bump the change counter so that live incremental selectors
        // re-position their iterator into the interval map.
        _leveled_sstables_change_cnt++;
        _leveled_sstables.add({make_interval(*sst), value_set({sst})});
    }

    // Everything succeeded: disarm the rollback actions.
    undo_all_insert.cancel();
    undo_all_runs_insert.cancel();
}
|
|
|
|
// Removes `sst` from the set: from its run, from the list of all sstables,
// and from either the unleveled vector or the leveled interval map
// (see store_as_unleveled()).
//
// The run is looked up with find() rather than operator[], so erasing an
// sstable whose run is not tracked no longer creates an empty run entry.
// A run that becomes empty is dropped from _all_runs, so that
// select_sstable_runs() does not report runs without sstables.
void partitioned_sstable_set::erase(shared_sstable sst) {
    if (auto run_it = _all_runs.find(sst->run_identifier()); run_it != _all_runs.end()) {
        run_it->second.erase(sst);
        if (run_it->second.all().empty()) {
            _all_runs.erase(run_it);
        }
    }
    _all->erase(sst);
    if (store_as_unleveled(sst)) {
        _unleveled_sstables.erase(std::remove(_unleveled_sstables.begin(), _unleveled_sstables.end(), sst), _unleveled_sstables.end());
    } else {
        // Bump the change counter so that live incremental selectors
        // re-position their iterator into the interval map.
        _leveled_sstables_change_cnt++;
        _leveled_sstables.subtract({make_interval(*sst), value_set({sst})});
    }
}
|
|
|
|
// Incremental selector over a partitioned_sstable_set.
//
// Holds references to the owning set's unleveled vector, leveled interval map
// and change counter, plus an iterator into the interval map that is advanced
// as select() is called. When the change counter differs from the last value
// seen, the iterator is re-positioned before being used again.
class partitioned_sstable_set::incremental_selector : public incremental_selector_impl {
    schema_ptr _schema;
    const std::vector<shared_sstable>& _unleveled_sstables;
    const interval_map_type& _leveled_sstables;
    const uint64_t& _leveled_sstables_change_cnt;
    uint64_t _last_known_leveled_sstables_change_cnt;
    map_iterator _it;
private:
    // Position at which the interval pointed to by `it` starts, or the maximum
    // ring position when `it` is the end of the map. The result is an
    // after-key position iff the interval is not closed on the left.
    dht::ring_position_ext next_position(map_iterator it) {
        if (it != _leveled_sstables.end()) {
            auto&& interval_start = partitioned_sstable_set::to_ring_position(it->first.lower());
            const bool left_open = !boost::icl::is_left_closed(it->first.bounds());
            return dht::ring_position_ext(interval_start, dht::ring_position_ext::after_key(left_open));
        }
        return dht::ring_position_view::max();
    }

    // True iff `crp` lies strictly before `interval`. For a left-open interval
    // the lower bound itself counts as lying before it.
    static bool is_before_interval(const compatible_ring_position_or_view& crp, const interval_type& interval) {
        return boost::icl::is_left_closed(interval.bounds())
                ? crp < interval.lower()
                : crp <= interval.lower();
    }

    // Re-positions _it with lower_bound([crp, crp]) if the leveled map changed
    // since the change counter was last recorded.
    void maybe_invalidate_iterator(const compatible_ring_position_or_view& crp) {
        if (_last_known_leveled_sstables_change_cnt == _leveled_sstables_change_cnt) {
            return;
        }
        _it = _leveled_sstables.lower_bound(interval_type::closed(crp, crp));
        _last_known_leveled_sstables_change_cnt = _leveled_sstables_change_cnt;
    }
public:
    incremental_selector(schema_ptr schema,
            const std::vector<shared_sstable>& unleveled_sstables,
            const interval_map_type& leveled_sstables,
            const uint64_t& leveled_sstables_change_cnt)
        : _schema(std::move(schema))
        , _unleveled_sstables(unleveled_sstables)
        , _leveled_sstables(leveled_sstables)
        , _leveled_sstables_change_cnt(leveled_sstables_change_cnt)
        , _last_known_leveled_sstables_change_cnt(leveled_sstables_change_cnt)
        , _it(leveled_sstables.begin())
    { }

    // Returns (range the selection is valid for, selected sstables, next position).
    // The selected sstables always start with all unleveled sstables.
    //  - `pos` inside the current interval: that interval's sstables are
    //    appended; the range is the interval and the next position is the
    //    start of the following interval.
    //  - `pos` before the current interval: nothing is appended; the range
    //    spans from `pos` to the interval's start, which is also the next
    //    position.
    //  - no interval left: the range is fully open and the next position is
    //    the maximum ring position.
    virtual std::tuple<dht::partition_range, std::vector<shared_sstable>, dht::ring_position_ext> select(const dht::ring_position_view& pos) override {
        auto crp = compatible_ring_position_or_view(*_schema, pos);
        auto selected = _unleveled_sstables;

        maybe_invalidate_iterator(crp);

        for (; _it != _leveled_sstables.end(); ++_it) {
            const auto& interval = _it->first;
            if (boost::icl::contains(interval, crp)) {
                const auto& in_interval = _it->second;
                selected.insert(selected.end(), in_interval.begin(), in_interval.end());
                return std::make_tuple(partitioned_sstable_set::to_partition_range(interval), std::move(selected), next_position(std::next(_it)));
            }
            // The current interval must not be skipped if pos lies before it.
            if (is_before_interval(crp, interval)) {
                return std::make_tuple(partitioned_sstable_set::to_partition_range(pos, interval), std::move(selected), next_position(_it));
            }
        }
        return std::make_tuple(dht::partition_range::make_open_ended_both_sides(), std::move(selected), dht::ring_position_view::max());
    }
};
|
|
|
|
// Creates an empty set. Two containers are maintained: one ordered with the
// schema's position comparator and one ordered with the reversed schema's
// position comparator.
time_series_sstable_set::time_series_sstable_set(schema_ptr schema)
        : _schema(std::move(schema))
        , _reversed_schema(_schema->make_reversed())
        , _sstables(make_lw_shared<container_t>(position_in_partition::less_compare(*_schema)))
        , _sstables_reversed(make_lw_shared<container_t>(position_in_partition::less_compare(*_reversed_schema)))
{ }
|
|
|
|
// Copy constructor: the schemas are shared with `s`, while both containers
// are copied into freshly allocated ones.
time_series_sstable_set::time_series_sstable_set(const time_series_sstable_set& s)
        : _schema(s._schema)
        , _reversed_schema(s._reversed_schema)
        , _sstables(make_lw_shared(*s._sstables))
        , _sstables_reversed(make_lw_shared(*s._sstables_reversed))
{ }
|
|
|
|
// Returns a copy of this set created with the copy constructor.
auto time_series_sstable_set::clone() const -> std::unique_ptr<sstable_set_impl> {
    return std::make_unique<time_series_sstable_set>(*this);
}
|
|
|
|
// Returns every sstable in the set, in container order; `range` is not
// consulted.
std::vector<shared_sstable> time_series_sstable_set::select(const dht::partition_range& range) const {
    auto every_sstable = *_sstables | boost::adaptors::map_values;
    return boost::copy_range<std::vector<shared_sstable>>(every_sstable);
}
|
|
|
|
// Builds and returns a new sstable_list holding every sstable in the set.
// A fresh list is created on each call.
lw_shared_ptr<sstable_list> time_series_sstable_set::all() const {
    auto listed = boost::copy_range<sstable_list>(*_sstables | boost::adaptors::map_values);
    return make_lw_shared<sstable_list>(std::move(listed));
}
|
|
|
|
void time_series_sstable_set::for_each_sstable(std::function<void(const shared_sstable&)> func) const {
|
|
for (auto& entry : *_sstables) {
|
|
func(entry.second);
|
|
}
|
|
}
|
|
|
|
// O(log n)
|
|
void time_series_sstable_set::insert(shared_sstable sst) {
|
|
auto min_pos = sst->min_position();
|
|
auto max_pos_reversed = sst->max_position().reversed();
|
|
_sstables->emplace(std::move(min_pos), sst);
|
|
_sstables_reversed->emplace(std::move(max_pos_reversed), std::move(sst));
|
|
}
|
|
|
|
// O(n) worst case, but should be close to O(log n) most of the time
|
|
void time_series_sstable_set::erase(shared_sstable sst) {
|
|
{
|
|
auto [first, last] = _sstables->equal_range(sst->min_position());
|
|
auto it = std::find_if(first, last,
|
|
[&sst] (const std::pair<position_in_partition, shared_sstable>& p) { return sst == p.second; });
|
|
if (it != last) {
|
|
_sstables->erase(it);
|
|
}
|
|
}
|
|
|
|
auto [first, last] = _sstables_reversed->equal_range(sst->max_position().reversed());
|
|
auto it = std::find_if(first, last,
|
|
[&sst] (const std::pair<position_in_partition, shared_sstable>& p) { return sst == p.second; });
|
|
if (it != last) {
|
|
_sstables_reversed->erase(it);
|
|
}
|
|
}
|
|
|
|
std::unique_ptr<incremental_selector_impl> time_series_sstable_set::make_incremental_selector() const {
|
|
struct selector : public incremental_selector_impl {
|
|
const time_series_sstable_set& _set;
|
|
|
|
selector(const time_series_sstable_set& set) : _set(set) {}
|
|
|
|
virtual std::tuple<dht::partition_range, std::vector<shared_sstable>, dht::ring_position_ext>
|
|
select(const dht::ring_position_view&) override {
|
|
return std::make_tuple(dht::partition_range::make_open_ended_both_sides(), _set.select(), dht::ring_position_view::max());
|
|
}
|
|
};
|
|
|
|
return std::make_unique<selector>(*this);
|
|
}
|
|
|
|
// Queue of readers of sstables in a time_series_sstable_set,
|
|
// returning readers in order of the sstables' clustering key lower bounds.
|
|
//
|
|
// For sstable `s` we take `s.min_position()` as the lower bound for non-reversed reads,
|
|
// and `s.max_position().reversed()` for reversed reads (in reversed reads comparisons
|
|
// are performed using a reversed schema). Let `lower_bound(s)` denote this lower bound
|
|
// in the comments below.
|
|
//
|
|
// Skips sstables that don't pass the supplied filter.
|
|
// Guarantees that the filter will be called at most once for each sstable;
|
|
// exactly once after all sstables are iterated over.
|
|
//
|
|
// The readers are created lazily on-demand using the supplied factory function.
|
|
//
|
|
// Additionally to the sstable readers, the queue always returns one ``dummy reader''
|
|
// that contains only the partition_start/end markers. This dummy reader is always
|
|
// returned as the first on the first `pop(b)` call for any `b`. Its upper bound
|
|
// is `before_all_clustered_rows`.
|
|
class sstable_position_reader_queue : public position_reader_queue {
|
|
using container_t = time_series_sstable_set::container_t;
|
|
using value_t = container_t::value_type;
|
|
|
|
schema_ptr _query_schema;
|
|
lw_shared_ptr<const container_t> _sstables;
|
|
|
|
// Iterates over sstables in order of their lower bounds.
|
|
// Invariant: _it == _end or filter(it->second) == true
|
|
container_t::const_iterator _it;
|
|
const container_t::const_iterator _end;
|
|
|
|
position_in_partition::tri_compare _cmp;
|
|
|
|
std::function<flat_mutation_reader_v2(sstable&)> _create_reader;
|
|
std::function<bool(const sstable&)> _filter;
|
|
|
|
// After construction contains a reader which returns only the partition
|
|
// start (and end, if not in forwarding mode) markers. This is the first
|
|
// returned reader.
|
|
std::optional<flat_mutation_reader_v2> _dummy_reader;
|
|
|
|
bool _reversed;
|
|
|
|
flat_mutation_reader_v2 create_reader(sstable& sst) {
|
|
return _create_reader(sst);
|
|
}
|
|
|
|
bool filter(const sstable& sst) const {
|
|
return _filter(sst);
|
|
}
|
|
|
|
public:
|
|
// Assumes that `create_reader` returns readers that emit only fragments from partition `pk`.
|
|
//
|
|
// For reversed reads `query_schema` must be reversed (see docs/design-notes/reverse-reads.md).
|
|
sstable_position_reader_queue(const time_series_sstable_set& set,
|
|
schema_ptr query_schema,
|
|
std::function<flat_mutation_reader_v2(sstable&)> create_reader,
|
|
std::function<bool(const sstable&)> filter,
|
|
partition_key pk,
|
|
reader_permit permit,
|
|
streamed_mutation::forwarding fwd_sm,
|
|
bool reversed)
|
|
: _query_schema(std::move(query_schema))
|
|
, _sstables(reversed ? set._sstables_reversed : set._sstables)
|
|
, _it(_sstables->begin())
|
|
, _end(_sstables->end())
|
|
, _cmp(*_query_schema)
|
|
, _create_reader(std::move(create_reader))
|
|
, _filter(std::move(filter))
|
|
, _dummy_reader(make_flat_mutation_reader_from_mutations_v2(_query_schema,
|
|
std::move(permit), {mutation(_query_schema, std::move(pk))}, _query_schema->full_slice(), fwd_sm))
|
|
, _reversed(reversed)
|
|
{
|
|
while (_it != _end && !this->filter(*_it->second)) {
|
|
++_it;
|
|
}
|
|
}
|
|
|
|
virtual ~sstable_position_reader_queue() override = default;
|
|
|
|
// If the dummy reader was not yet returned, return the dummy reader.
|
|
// Otherwise, open sstable readers to all sstables with smallest lower_bound() from the set
|
|
// {S: filter(S) and prev_min_pos < lower_bound(S) <= bound}, where `prev_min_pos` is the lower_bound()
|
|
// of the sstables returned from last non-empty pop() or -infinity if no sstables were previously returned,
|
|
// and `filter` is the filtering function provided when creating the queue.
|
|
//
|
|
// Note that there may be multiple returned sstables (all with the same position) or none.
|
|
//
|
|
// Note that lower_bound(S) is global for sstable S; if the readers are used to inspect specific partitions,
|
|
// the minimal positions in these partitions might actually all be greater than lower_bound(S).
|
|
virtual std::vector<reader_and_upper_bound> pop(position_in_partition_view bound) override {
|
|
if (empty(bound)) {
|
|
return {};
|
|
}
|
|
|
|
if (_dummy_reader) {
|
|
std::vector<reader_and_upper_bound> ret;
|
|
ret.emplace_back(*std::exchange(_dummy_reader, std::nullopt), position_in_partition::before_all_clustered_rows());
|
|
return ret;
|
|
}
|
|
|
|
// by !empty(bound) and `_it` invariant:
|
|
// _it != _end, _it->first <= bound, and filter(*_it->second) == true
|
|
assert(_cmp(_it->first, bound) <= 0);
|
|
// we don't assert(filter(*_it->second)) due to the requirement that `filter` is called at most once for each sstable
|
|
|
|
// Find all sstables with the same position as `_it` (they form a contiguous range in the container).
|
|
auto next = std::find_if(std::next(_it), _end, [this] (const value_t& v) { return _cmp(v.first, _it->first) != 0; });
|
|
|
|
// We'll return all sstables in the range [_it, next) which pass the filter
|
|
std::vector<reader_and_upper_bound> ret;
|
|
do {
|
|
// loop invariant: filter(*_it->second) == true
|
|
auto upper_bound = _reversed ? _it->second->min_position().reversed() : _it->second->max_position();
|
|
ret.emplace_back(create_reader(*_it->second), std::move(upper_bound));
|
|
// restore loop invariant
|
|
do {
|
|
++_it;
|
|
} while (_it != next && !filter(*_it->second));
|
|
} while (_it != next);
|
|
|
|
// filter(*_it->second) wasn't called yet since the inner `do..while` above checks _it != next first
|
|
// restore the `_it` invariant before returning
|
|
while (_it != _end && !filter(*_it->second)) {
|
|
++_it;
|
|
}
|
|
|
|
return ret;
|
|
}
|
|
|
|
// If the dummy reader was not returned yet, returns false.
|
|
// Otherwise checks if the set of sstables {S: filter(S) and prev_min_pos < lower_bound(S) <= bound}
|
|
// is empty (see pop() for definition of `prev_min_pos`).
|
|
virtual bool empty(position_in_partition_view bound) const override {
|
|
return !_dummy_reader && (_it == _end || _cmp(_it->first, bound) > 0);
|
|
}
|
|
|
|
virtual future<> close() noexcept override {
|
|
_it = _end;
|
|
return make_ready_future<>();
|
|
}
|
|
};
|
|
|
|
std::unique_ptr<position_reader_queue> time_series_sstable_set::make_position_reader_queue(
|
|
std::function<flat_mutation_reader_v2(sstable&)> create_reader,
|
|
std::function<bool(const sstable&)> filter,
|
|
partition_key pk, schema_ptr query_schema, reader_permit permit,
|
|
streamed_mutation::forwarding fwd_sm, bool reversed) const {
|
|
return std::make_unique<sstable_position_reader_queue>(*this,
|
|
std::move(query_schema), std::move(create_reader), std::move(filter),
|
|
std::move(pk), std::move(permit), fwd_sm, reversed);
|
|
}
|
|
|
|
// Creates an incremental selector that refers to this set's unleveled
// vector, leveled interval map and change counter.
auto partitioned_sstable_set::make_incremental_selector() const -> std::unique_ptr<incremental_selector_impl> {
    return std::make_unique<incremental_selector>(
            _schema, _unleveled_sstables, _leveled_sstables, _leveled_sstables_change_cnt);
}
|
|
|
|
// Default sstable set for a compaction strategy: an empty
// partitioned_sstable_set with level metadata enabled.
std::unique_ptr<sstable_set_impl> compaction_strategy_impl::make_sstable_set(schema_ptr schema) const {
    // With use_level_metadata enabled, L0 sstables are kept out of the interval
    // map, which suits STCS well.
    auto no_sstables = make_lw_shared<sstable_list>();
    return std::make_unique<partitioned_sstable_set>(schema, std::move(no_sstables), true);
}
|
|
|
|
std::unique_ptr<sstable_set_impl> leveled_compaction_strategy::make_sstable_set(schema_ptr schema) const {
|
|
return std::make_unique<partitioned_sstable_set>(std::move(schema), make_lw_shared<sstable_list>());
|
|
}
|
|
|
|
std::unique_ptr<sstable_set_impl> time_window_compaction_strategy::make_sstable_set(schema_ptr schema) const {
|
|
return std::make_unique<time_series_sstable_set>(std::move(schema));
|
|
}
|
|
|
|
// Creates an sstable_set backed by a partitioned_sstable_set built from
// `schema`, the sstable list `all` and `use_level_metadata`.
sstable_set make_partitioned_sstable_set(schema_ptr schema, lw_shared_ptr<sstable_list> all, bool use_level_metadata) {
    auto impl = std::make_unique<partitioned_sstable_set>(schema, std::move(all), use_level_metadata);
    return sstable_set(std::move(impl), schema);
}
|
|
|
|
// Creates an sstable_set whose implementation is provided by the underlying
// compaction strategy implementation.
sstable_set compaction_strategy::make_sstable_set(schema_ptr schema) const {
    auto impl = _compaction_strategy_impl->make_sstable_set(schema);
    return sstable_set(std::move(impl), schema);
}
|
|
|
|
// Factory callback that creates a reader for the given sstable over partition range `pr`.
using sstable_reader_factory_type = std::function<flat_mutation_reader_v2(shared_sstable&, const dht::partition_range& pr)>;
|
|
|
|
// Logger used by incremental_reader_selector for its trace-level messages.
static logging::logger irclogger("incremental_reader_selector");
|
|
|
|
// Incremental selector implementation for combined_mutation_reader that
|
|
// selects readers on-demand as the read progresses through the token
|
|
// range.
|
|
class incremental_reader_selector : public reader_selector {
|
|
const dht::partition_range* _pr;
|
|
lw_shared_ptr<const sstable_set> _sstables;
|
|
tracing::trace_state_ptr _trace_state;
|
|
std::optional<sstable_set::incremental_selector> _selector;
|
|
std::unordered_set<int64_t> _read_sstable_gens;
|
|
sstable_reader_factory_type _fn;
|
|
|
|
flat_mutation_reader_v2 create_reader(shared_sstable sst) {
|
|
tracing::trace(_trace_state, "Reading partition range {} from sstable {}", *_pr, seastar::value_of([&sst] { return sst->get_filename(); }));
|
|
return _fn(sst, *_pr);
|
|
}
|
|
|
|
public:
|
|
explicit incremental_reader_selector(schema_ptr s,
|
|
lw_shared_ptr<const sstable_set> sstables,
|
|
const dht::partition_range& pr,
|
|
tracing::trace_state_ptr trace_state,
|
|
sstable_reader_factory_type fn)
|
|
: reader_selector(s, pr.start() ? pr.start()->value() : dht::ring_position_view::min())
|
|
, _pr(&pr)
|
|
, _sstables(std::move(sstables))
|
|
, _trace_state(std::move(trace_state))
|
|
, _selector(_sstables->make_incremental_selector())
|
|
, _fn(std::move(fn)) {
|
|
|
|
irclogger.trace("{}: created for range: {} with {} sstables",
|
|
fmt::ptr(this),
|
|
*_pr,
|
|
_sstables->all()->size());
|
|
}
|
|
|
|
incremental_reader_selector(const incremental_reader_selector&) = delete;
|
|
incremental_reader_selector& operator=(const incremental_reader_selector&) = delete;
|
|
|
|
incremental_reader_selector(incremental_reader_selector&&) = delete;
|
|
incremental_reader_selector& operator=(incremental_reader_selector&&) = delete;
|
|
|
|
virtual std::vector<flat_mutation_reader_v2> create_new_readers(const std::optional<dht::ring_position_view>& pos) override {
|
|
irclogger.trace("{}: {}({})", fmt::ptr(this), __FUNCTION__, seastar::lazy_deref(pos));
|
|
|
|
auto readers = std::vector<flat_mutation_reader_v2>();
|
|
|
|
do {
|
|
auto selection = _selector->select(_selector_position);
|
|
_selector_position = selection.next_position;
|
|
|
|
irclogger.trace("{}: {} sstables to consider, advancing selector to {}", fmt::ptr(this), selection.sstables.size(),
|
|
_selector_position);
|
|
|
|
readers = boost::copy_range<std::vector<flat_mutation_reader_v2>>(selection.sstables
|
|
| boost::adaptors::filtered([this] (auto& sst) { return _read_sstable_gens.emplace(sst->generation()).second; })
|
|
| boost::adaptors::transformed([this] (auto& sst) { return this->create_reader(sst); }));
|
|
} while (!_selector_position.is_max() && readers.empty() && (!pos || dht::ring_position_tri_compare(*_s, *pos, _selector_position) >= 0));
|
|
|
|
irclogger.trace("{}: created {} new readers", fmt::ptr(this), readers.size());
|
|
|
|
// prevents sstable_set::incremental_selector::_current_sstables from holding reference to
|
|
// sstables when done selecting.
|
|
if (_selector_position.is_max()) {
|
|
_selector.reset();
|
|
}
|
|
|
|
return readers;
|
|
}
|
|
|
|
virtual std::vector<flat_mutation_reader_v2> fast_forward_to(const dht::partition_range& pr) override {
|
|
_pr = ≺
|
|
|
|
auto pos = dht::ring_position_view::for_range_start(*_pr);
|
|
if (dht::ring_position_tri_compare(*_s, pos, _selector_position) >= 0) {
|
|
return create_new_readers(pos);
|
|
}
|
|
|
|
return {};
|
|
}
|
|
};
|
|
|
|
// The returned function uses the bloom filter to check whether the given sstable
|
|
// may have a partition given by the ring position `pos`.
|
|
//
|
|
// Returning `false` means the sstable doesn't have such a partition.
|
|
// Returning `true` means it may, i.e. we don't know whether or not it does.
|
|
//
|
|
// Assumes the given `pos` and `schema` are alive during the function's lifetime.
|
|
static std::predicate<const sstable&> auto
|
|
make_pk_filter(const dht::ring_position& pos, const schema& schema) {
|
|
return [&pos, key = key::from_partition_key(schema, *pos.key()), cmp = dht::ring_position_comparator(schema)] (const sstable& sst) {
|
|
return cmp(pos, sst.get_first_decorated_key()) >= 0 &&
|
|
cmp(pos, sst.get_last_decorated_key()) <= 0 &&
|
|
sst.filter_has_key(key);
|
|
};
|
|
}
|
|
|
|
// Filter out sstables for reader using bloom filter
|
|
static std::vector<shared_sstable>
|
|
filter_sstable_for_reader_by_pk(std::vector<shared_sstable>&& sstables, const schema& schema, const dht::ring_position& pos) {
|
|
auto filter = [_filter = make_pk_filter(pos, schema)] (const shared_sstable& sst) { return !_filter(*sst); };
|
|
sstables.erase(boost::remove_if(sstables, filter), sstables.end());
|
|
return std::move(sstables);
|
|
}
|
|
|
|
// Filter out sstables for reader using sstable metadata that keeps track
|
|
// of a range for each clustering component.
|
|
static std::vector<shared_sstable>
|
|
filter_sstable_for_reader_by_ck(std::vector<shared_sstable>&& sstables, replica::column_family& cf, const schema_ptr& schema,
|
|
const query::partition_slice& slice) {
|
|
// no clustering filtering is applied if schema defines no clustering key or
|
|
// compaction strategy thinks it will not benefit from such an optimization,
|
|
// or the partition_slice includes static columns.
|
|
if (!schema->clustering_key_size() || !cf.get_compaction_strategy().use_clustering_key_filter() || slice.static_columns.size()) {
|
|
return std::move(sstables);
|
|
}
|
|
|
|
replica::cf_stats* stats = cf.cf_stats();
|
|
stats->clustering_filter_count++;
|
|
stats->sstables_checked_by_clustering_filter += sstables.size();
|
|
|
|
auto ck_filtering_all_ranges = slice.get_all_ranges();
|
|
// fast path to include all sstables if only one full range was specified.
|
|
// For example, this happens if query only specifies a partition key.
|
|
if (ck_filtering_all_ranges.size() == 1 && ck_filtering_all_ranges[0].is_full()) {
|
|
stats->clustering_filter_fast_path_count++;
|
|
stats->surviving_sstables_after_clustering_filter += sstables.size();
|
|
return std::move(sstables);
|
|
}
|
|
|
|
auto skipped = std::partition(sstables.begin(), sstables.end(), [&ranges = ck_filtering_all_ranges] (const shared_sstable& sst) {
|
|
return sst->may_contain_rows(ranges);
|
|
});
|
|
sstables.erase(skipped, sstables.end());
|
|
stats->surviving_sstables_after_clustering_filter += sstables.size();
|
|
|
|
return std::move(sstables);
|
|
}
|
|
|
|
std::vector<sstable_run>
|
|
sstable_set_impl::select_sstable_runs(const std::vector<shared_sstable>& sstables) const {
|
|
throw_with_backtrace<std::bad_function_call>();
|
|
}
|
|
|
|
// Builds a reader for the single partition designated by `pr`.
//
// Candidate sstables are selected for `pr`, pruned by partition key and then
// by clustering key; one reader is opened per surviving sstable and all of
// them are merged with a combined reader. `sstable_histogram` is fed the
// number of sstable readers that were opened.
flat_mutation_reader_v2
sstable_set_impl::create_single_key_sstable_reader(
        replica::column_family* cf,
        schema_ptr schema,
        reader_permit permit,
        utils::estimated_histogram& sstable_histogram,
        const dht::partition_range& pr,
        const query::partition_slice& slice,
        const io_priority_class& pc,
        tracing::trace_state_ptr trace_state,
        streamed_mutation::forwarding fwd,
        mutation_reader::forwarding fwd_mr) const
{
    const auto& pos = pr.start()->value();

    // Stage 1: partition-key pruning.
    auto pk_candidates = filter_sstable_for_reader_by_pk(select(pr), *schema, pos);
    const auto num_sstables = pk_candidates.size();
    if (num_sstables == 0) {
        return make_empty_flat_reader_v2(schema, permit);
    }

    // Stage 2: clustering-key pruning, then one reader per surviving sstable.
    auto ck_survivors = filter_sstable_for_reader_by_ck(std::move(pk_candidates), *cf, schema, slice);
    std::vector<flat_mutation_reader_v2> readers;
    for (const shared_sstable& sst : ck_survivors) {
        tracing::trace(trace_state, "Reading key {} from sstable {}", pos, seastar::value_of([&sst] { return sst->get_filename(); }));
        readers.push_back(sst->make_reader(schema, permit, pr, slice, pc, trace_state, fwd));
    }

    // If the clustering filter dropped any sstable that contains the partition,
    // partition_start/end must still be emitted even when no rows are found,
    // to prevent https://github.com/scylladb/scylla/issues/3552.
    //
    // An extra reader over an empty mutation for the queried key produces that
    // partition_start/end pair; appending it to the readers given to
    // make_combined_reader guarantees the pair is emitted even if every sstable
    // actually containing the partition was filtered out.
    const auto num_readers = readers.size();
    if (num_readers != num_sstables) {
        readers.push_back(make_flat_mutation_reader_from_mutations_v2(schema, permit, {mutation(schema, *pos.key())}, slice, fwd));
    }
    // Only real sstable readers are counted; the synthetic reader is excluded.
    sstable_histogram.add(num_readers);
    return make_combined_reader(schema, std::move(permit), std::move(readers), fwd, fwd_mr);
}
|
|
|
|
// Single-partition reader for time-series sstable sets.
//
// Uses the optimized TWCS single-partition path (a clustering combined reader
// fed by a position reader queue) when its preconditions hold, and otherwise
// falls back to sstable_set_impl::create_single_key_sstable_reader().
flat_mutation_reader_v2
time_series_sstable_set::create_single_key_sstable_reader(
        replica::column_family* cf,
        schema_ptr schema,
        reader_permit permit,
        utils::estimated_histogram& sstable_histogram,
        const dht::partition_range& pr,
        const query::partition_slice& slice,
        const io_priority_class& pc,
        tracing::trace_state_ptr trace_state,
        streamed_mutation::forwarding fwd_sm,
        mutation_reader::forwarding fwd_mr) const {
    const auto& pos = pr.start()->value();
    using sst_entry = std::pair<position_in_partition, shared_sstable>;

    // The optimized algorithm for TWCS single partition queries applies only if
    // all of the following hold:
    //  1. The optimized query path is enabled.
    //  2. The schema has no static columns, since new readers will be opened
    //     into new sstables in the middle of the partition query.
    //     TWCS sstables will usually pass this condition.
    //  3. Every sstable is modern enough to carry the min/max column metadata.
    //  4. No sstable may have partition tombstones (same reason as 2).
    //     TWCS sstables will usually pass this condition.
    auto disqualifies = [] (const sst_entry& entry) {
        const auto& sst = entry.second;
        return sst->get_version() < sstable_version_types::md
            || sst->may_have_partition_tombstones();
    };
    const bool use_standard_path = !cf->get_config().enable_optimized_twcs_queries
            || schema->has_static_columns()
            || std::any_of(_sstables->begin(), _sstables->end(), disqualifies);
    if (use_standard_path) {
        // Some of the conditions were not satisfied: take the standard query path.
        return sstable_set_impl::create_single_key_sstable_reader(
                cf, std::move(schema), std::move(permit), sstable_histogram,
                pr, slice, pc, std::move(trace_state), fwd_sm, fwd_mr);
    }

    auto pk_filter = make_pk_filter(pos, *schema);
    const bool any_sstable_matches = std::any_of(_sstables->begin(), _sstables->end(),
            [&] (const sst_entry& entry) { return pk_filter(*entry.second); });
    if (!any_sstable_matches) {
        // No sstables contain data for the queried partition.
        return make_empty_flat_reader_v2(std::move(schema), std::move(permit));
    }

    auto& stats = *cf->cf_stats();
    stats.clustering_filter_count++;

    auto create_reader = [schema, permit, &pr, &slice, &pc, trace_state, fwd_sm] (sstable& target) {
        return target.make_reader(schema, permit, pr, slice, pc, trace_state, fwd_sm);
    };

    auto ck_filter = [ranges = slice.get_all_ranges()] (const sstable& candidate) {
        return candidate.may_contain_rows(ranges);
    };

    // This filter is handed to sstable_position_reader_queue. The queue guarantees
    // that the filter is called at most once for each sstable and exactly once
    // after the queue is exhausted. That fact is relied upon to gather statistics.
    auto filter = [pk_filter = std::move(pk_filter), ck_filter = std::move(ck_filter), &stats]
            (const sstable& candidate) {
        if (!pk_filter(candidate)) {
            return false;
        }

        ++stats.sstables_checked_by_clustering_filter;
        const bool survives = ck_filter(candidate);
        if (survives) {
            ++stats.surviving_sstables_after_clustering_filter;
        }
        return survives;
    };

    auto reversed = slice.is_reversed();
    // Note that `sstable_position_reader_queue` always includes a reader which emits a
    // `partition_start` fragment, guaranteeing that the reader returned here emits it
    // as well; this avoids the problem from #3552.
    return make_clustering_combined_reader(
            schema, permit, fwd_sm,
            make_position_reader_queue(
                    std::move(create_reader), std::move(filter), *pos.key(), schema, permit, fwd_sm, reversed));
}
|
|
|
|
// Stores the schema and the list of underlying sstable sets.
// Both arguments are taken by value and moved into the members.
compound_sstable_set::compound_sstable_set(schema_ptr schema, std::vector<lw_shared_ptr<sstable_set>> sets)
    : _schema(std::move(schema)), _sets(std::move(sets))
{ }
|
|
|
|
std::unique_ptr<sstable_set_impl> compound_sstable_set::clone() const {
|
|
std::vector<lw_shared_ptr<sstable_set>> cloned_sets;
|
|
cloned_sets.reserve(_sets.size());
|
|
for (auto& set : _sets) {
|
|
// implicit clone by using sstable_set's copy ctor.
|
|
cloned_sets.push_back(make_lw_shared(std::move(*set)));
|
|
}
|
|
return std::make_unique<compound_sstable_set>(_schema, std::move(cloned_sets));
|
|
}
|
|
|
|
// Returns the concatenation of the sstables each underlying set selects for
// `range`, in the order of the underlying sets.
std::vector<shared_sstable> compound_sstable_set::select(const dht::partition_range& range) const {
    std::vector<shared_sstable> selected;
    for (const auto& set : _sets) {
        auto from_set = set->select(range);
        if (selected.empty()) {
            // Nothing accumulated yet: adopt this set's result wholesale.
            selected = std::move(from_set);
            continue;
        }
        selected.reserve(selected.size() + from_set.size());
        selected.insert(selected.end(),
                std::make_move_iterator(from_set.begin()),
                std::make_move_iterator(from_set.end()));
    }
    return selected;
}
|
|
|
|
std::vector<sstable_run> compound_sstable_set::select_sstable_runs(const std::vector<shared_sstable>& sstables) const {
|
|
std::vector<sstable_run> ret;
|
|
for (auto& set : _sets) {
|
|
auto runs = set->select_sstable_runs(sstables);
|
|
if (ret.empty()) {
|
|
ret = std::move(runs);
|
|
} else {
|
|
ret.reserve(ret.size() + runs.size());
|
|
std::move(runs.begin(), runs.end(), std::back_inserter(ret));
|
|
}
|
|
}
|
|
return ret;
|
|
}
|
|
|
|
// Returns the sstables of all underlying sets as a single list.
//  - no set has sstables: a fresh empty list;
//  - exactly one set has sstables: that set's own list, without copying;
//  - otherwise: a new list holding the sstables of every non-empty set.
lw_shared_ptr<sstable_list> compound_sstable_set::all() const {
    // Work on a copy so the non-empty sets can be grouped at the front.
    auto sets = _sets;
    auto non_empty_end = std::partition(sets.begin(), sets.end(), [] (const auto& set) {
        return !set->all()->empty();
    });

    switch (std::distance(sets.begin(), non_empty_end)) {
    case 0:
        return make_lw_shared<sstable_list>();
    case 1: {
        // Common case: the primary set contains sstables, while the secondary
        // one is empty most of the time.
        const auto& only_non_empty = *std::begin(sets);
        return only_non_empty->all();
    }
    default:
        break;
    }

    auto merged = make_lw_shared<sstable_list>();
    for (auto cur = sets.begin(); cur != non_empty_end; ++cur) {
        auto ssts = (*cur)->all();
        merged->reserve(merged->size() + ssts->size());
        merged->insert(ssts->begin(), ssts->end());
    }
    return merged;
}
|
|
|
|
void compound_sstable_set::for_each_sstable(std::function<void(const shared_sstable&)> func) const {
|
|
for (auto& set : _sets) {
|
|
set->for_each_sstable([&func] (const shared_sstable& sst) {
|
|
func(sst);
|
|
});
|
|
}
|
|
}
|
|
|
|
// Insertion is not supported on a compound set.
// Always throws std::bad_function_call (with a backtrace attached).
void compound_sstable_set::insert(shared_sstable sst)
{
    throw_with_backtrace<std::bad_function_call>();
}
|
|
// Erasure is not supported on a compound set.
// Always throws std::bad_function_call (with a backtrace attached).
void compound_sstable_set::erase(shared_sstable sst)
{
    throw_with_backtrace<std::bad_function_call>();
}
|
|
|
|
// Incremental selector for a compound set: keeps one incremental selector per
// underlying sstable_set, forwards every select() to all of them and merges
// what they return.
class compound_sstable_set::incremental_selector : public incremental_selector_impl {
    const schema& _schema;
    // Held by reference: the vector passed to the constructor must outlive this object.
    const std::vector<lw_shared_ptr<sstable_set>>& _sets;
    std::vector<sstable_set::incremental_selector> _selectors;
private:
    // Builds one incremental selector for each set in `sets`.
    // Static and driven purely by its argument, so it is safe to call from the
    // constructor's initializer list regardless of member declaration order.
    static std::vector<sstable_set::incremental_selector> make_selectors(const std::vector<lw_shared_ptr<sstable_set>>& sets) {
        return boost::copy_range<std::vector<sstable_set::incremental_selector>>(sets | boost::adaptors::transformed([] (const auto& set) {
            return set->make_incremental_selector();
        }));
    }
public:
    incremental_selector(const schema& schema, const std::vector<lw_shared_ptr<sstable_set>>& sets)
            : _schema(schema)
            , _sets(sets)
            , _selectors(make_selectors(sets)) {
    }

    // Selects on `pos` from every underlying selector.
    // Returns a tuple of:
    //  - the range for which the selection is valid (always the minimum singular range),
    //  - all sstables selected by all selectors,
    //  - the lowest "next position" reported by any selector.
    virtual std::tuple<dht::partition_range, std::vector<shared_sstable>, dht::ring_position_ext> select(const dht::ring_position_view& pos) override {
        // Return all sstables selected on the requested position from all selectors.
        std::vector<shared_sstable> sstables;
        // Return the lowest next position from all selectors, such that this function will be called again to select the
        // lowest next position from the selector which previously returned it.
        dht::ring_position_view lowest_next_position = dht::ring_position_view::max();
        // Always return minimum singular range, such that incremental_selector::select() will always call this function,
        // which in turn will call the selectors to decide on whether or not any select should be actually performed.
        // Deliberately not const, so that it can really be moved into the returned tuple.
        dht::partition_range current_range = dht::partition_range::make_singular(dht::ring_position::min());
        auto cmp = dht::ring_position_comparator(_schema);

        for (auto& selector : _selectors) {
            auto ret = selector.select(pos);
            sstables.reserve(sstables.size() + ret.sstables.size());
            std::copy(ret.sstables.begin(), ret.sstables.end(), std::back_inserter(sstables));
            if (cmp(ret.next_position, lowest_next_position) < 0) {
                lowest_next_position = ret.next_position;
            }
        }

        return std::make_tuple(std::move(current_range), std::move(sstables), dht::ring_position_ext(lowest_next_position));
    }
};
|
|
|
|
std::unique_ptr<incremental_selector_impl> compound_sstable_set::make_incremental_selector() const {
|
|
if (_sets.empty()) {
|
|
// compound_sstable_set must manage one sstable set at least.
|
|
abort();
|
|
}
|
|
auto sets = _sets;
|
|
auto it = std::partition(sets.begin(), sets.end(), [] (const lw_shared_ptr<sstable_set>& set) { return !set->all()->empty(); });
|
|
auto non_empty_set_count = std::distance(sets.begin(), it);
|
|
|
|
// optimize for common case where only primary set contains sstables, so its selector can be built without an interposer.
|
|
// optimization also applies when no set contains sstable, so any set can be picked as selection will be a no-op anyway.
|
|
if (non_empty_set_count <= 1) {
|
|
const auto& set = sets.front();
|
|
return set->_impl->make_incremental_selector();
|
|
}
|
|
return std::make_unique<incremental_selector>(*_schema, _sets);
|
|
}
|
|
|
|
// Wraps `sets` in a compound_sstable_set implementation and returns an
// sstable_set for `schema` backed by it.
sstable_set make_compound_sstable_set(schema_ptr schema, std::vector<lw_shared_ptr<sstable_set>> sets) {
    auto impl = std::make_unique<compound_sstable_set>(schema, std::move(sets));
    return sstable_set(std::move(impl), std::move(schema));
}
|
|
|
|
// Builds a single-partition reader over all underlying sets.
//  - no set has sstables: an empty reader;
//  - exactly one set has sstables: that set's reader, used directly;
//  - otherwise: a combined reader over one reader per non-empty set.
flat_mutation_reader_v2
compound_sstable_set::create_single_key_sstable_reader(
        replica::column_family* cf,
        schema_ptr schema,
        reader_permit permit,
        utils::estimated_histogram& sstable_histogram,
        const dht::partition_range& pr,
        const query::partition_slice& slice,
        const io_priority_class& pc,
        tracing::trace_state_ptr trace_state,
        streamed_mutation::forwarding fwd,
        mutation_reader::forwarding fwd_mr) const {
    // Work on a copy so the non-empty sets can be grouped at the front.
    auto sets = _sets;
    auto non_empty_end = std::partition(sets.begin(), sets.end(), [] (const auto& set) {
        return !set->all()->empty();
    });
    const auto non_empty_set_count = std::distance(sets.begin(), non_empty_end);

    if (non_empty_set_count == 0) {
        return make_empty_flat_reader_v2(schema, permit);
    }

    if (non_empty_set_count == 1) {
        // Common case of a single populated set: skip the expensive combined reader.
        const auto& only_non_empty = *std::begin(sets);
        return only_non_empty->create_single_key_sstable_reader(cf, std::move(schema), std::move(permit), sstable_histogram, pr, slice, pc, trace_state, fwd, fwd_mr);
    }

    std::vector<flat_mutation_reader_v2> readers;
    for (auto cur = sets.begin(); cur != non_empty_end; ++cur) {
        const lw_shared_ptr<sstable_set>& non_empty_set = *cur;
        readers.push_back(non_empty_set->create_single_key_sstable_reader(cf, schema, permit, sstable_histogram, pr, slice, pc, trace_state, fwd, fwd_mr));
    }
    return make_combined_reader(std::move(schema), std::move(permit), std::move(readers), fwd, fwd_mr);
}
|
|
|
|
// Creates a reader for a single partition.
// Precondition (asserted): `pr` is singular and its start bound carries a key.
// The actual work is delegated to the set implementation.
flat_mutation_reader_v2
sstable_set::create_single_key_sstable_reader(
        replica::column_family* cf,
        schema_ptr schema,
        reader_permit permit,
        utils::estimated_histogram& sstable_histogram,
        const dht::partition_range& pr,
        const query::partition_slice& slice,
        const io_priority_class& pc,
        tracing::trace_state_ptr trace_state,
        streamed_mutation::forwarding fwd,
        mutation_reader::forwarding fwd_mr) const
{
    assert(pr.is_singular() && pr.start()->value().has_key());
    return _impl->create_single_key_sstable_reader(
            cf,
            std::move(schema),
            std::move(permit),
            sstable_histogram,
            pr,
            slice,
            pc,
            std::move(trace_state),
            fwd,
            fwd_mr);
}
|
|
|
|
// Creates a combined reader over the partition range `pr`, whose per-sstable
// readers are produced on demand by an incremental_reader_selector over this set.
// Each sstable reader is given a monitor obtained from `monitor_generator`.
flat_mutation_reader_v2
sstable_set::make_range_sstable_reader(
        schema_ptr s,
        reader_permit permit,
        const dht::partition_range& pr,
        const query::partition_slice& slice,
        const io_priority_class& pc,
        tracing::trace_state_ptr trace_state,
        streamed_mutation::forwarding fwd,
        mutation_reader::forwarding fwd_mr,
        read_monitor_generator& monitor_generator) const
{
    // Factory handed to the selector. `slice`, `pc` and `monitor_generator` are
    // captured by reference and therefore must outlive the returned reader.
    auto reader_factory_fn = [s, permit, &slice, &pc, trace_state, fwd, fwd_mr, &monitor_generator]
            (shared_sstable& sst, const dht::partition_range& range) mutable {
        return sst->make_reader(s, permit, range, slice, pc, trace_state, fwd, fwd_mr, monitor_generator(sst));
    };

    auto selector = std::make_unique<incremental_reader_selector>(
            s,
            shared_from_this(),
            pr,
            std::move(trace_state),
            std::move(reader_factory_fn));
    return make_combined_reader(s, std::move(permit), std::move(selector), fwd, fwd_mr);
}
|
|
|
|
// Same as make_range_sstable_reader(), but for sstables that are asserted not
// to be shared. When the set holds exactly one sstable, its reader is returned
// directly instead of being wrapped in a combined reader.
flat_mutation_reader_v2
sstable_set::make_local_shard_sstable_reader(
        schema_ptr s,
        reader_permit permit,
        const dht::partition_range& pr,
        const query::partition_slice& slice,
        const io_priority_class& pc,
        tracing::trace_state_ptr trace_state,
        streamed_mutation::forwarding fwd,
        mutation_reader::forwarding fwd_mr,
        read_monitor_generator& monitor_generator) const
{
    // Factory for per-sstable readers. `slice`, `pc` and `monitor_generator` are
    // captured by reference and therefore must outlive the returned reader.
    auto reader_factory_fn = [s, permit, &slice, &pc, trace_state, fwd, fwd_mr, &monitor_generator]
            (shared_sstable& sst, const dht::partition_range& range) mutable {
        assert(!sst->is_shared());
        return sst->make_reader(s, permit, range, slice, pc, trace_state, fwd, fwd_mr, monitor_generator(sst));
    };

    // Single-sstable shortcut: no selector and no combined reader are needed.
    if (auto all_sstables = _impl->all(); all_sstables->size() == 1) [[unlikely]] {
        auto only_sst = *all_sstables->begin();
        return reader_factory_fn(only_sst, pr);
    }

    auto selector = std::make_unique<incremental_reader_selector>(
            s,
            shared_from_this(),
            pr,
            std::move(trace_state),
            std::move(reader_factory_fn));
    return make_combined_reader(s, std::move(permit), std::move(selector), fwd, fwd_mr);
}
|
|
|
|
// Creates a combined, non-forwarding reader over the crawling readers of every
// sstable in the set. Each crawling reader is given a monitor obtained from
// `monitor_generator`.
flat_mutation_reader_v2 sstable_set::make_crawling_reader(
        schema_ptr schema,
        reader_permit permit,
        const io_priority_class& pc,
        tracing::trace_state_ptr trace_ptr,
        read_monitor_generator& monitor_generator) const {
    std::vector<flat_mutation_reader_v2> crawlers;
    auto open_crawler = [&] (const shared_sstable& sst) mutable {
        crawlers.emplace_back(sst->make_crawling_reader(schema, permit, pc, trace_ptr, monitor_generator(sst)));
    };
    _impl->for_each_sstable(open_crawler);
    return make_combined_reader(
            schema,
            std::move(permit),
            std::move(crawlers),
            streamed_mutation::forwarding::no,
            mutation_reader::forwarding::no);
}
|
|
|
|
unsigned sstable_set_overlapping_count(const schema_ptr& schema, const std::vector<shared_sstable>& sstables) {
|
|
unsigned overlapping_sstables = 0;
|
|
auto prev_last = dht::ring_position::min();
|
|
for (auto& sst : sstables) {
|
|
if (dht::ring_position(sst->get_first_decorated_key()).tri_compare(*schema, prev_last) <= 0) {
|
|
overlapping_sstables++;
|
|
}
|
|
prev_last = dht::ring_position(sst->get_last_decorated_key());
|
|
}
|
|
return overlapping_sstables;
|
|
}
|
|
|
|
} // namespace sstables
|