View building from staging creates a reader from scratch (memtable
+ sstables - staging) for every partition, in order to calculate
the diff between new staging data and data in base sstable set,
and then pushes the result into the view replicas.
perf shows that the reader creation is very expensive:
+ 12.15% 10.75% reactor-3 scylla [.] lexicographical_tri_compare<compound_type<(allow_prefixes)0>::iterator, compound_type<(allow_prefixes)0>::iterator, legacy_compound_view<compound_type<(allow_prefixes)0> >::tri_comparator::operator()(managed_bytes_basic_view<(mutable_view)0>, managed_bytes
+ 10.01% 9.99% reactor-3 scylla [.] boost::icl::is_empty<boost::icl::continuous_interval<compatible_ring_position_or_view, std::less> >
+ 8.95% 8.94% reactor-3 scylla [.] legacy_compound_view<compound_type<(allow_prefixes)0> >::tri_comparator::operator()
+ 7.29% 7.28% reactor-3 scylla [.] dht::ring_position_tri_compare
+ 6.28% 6.27% reactor-3 scylla [.] dht::tri_compare
+ 4.11% 3.52% reactor-3 scylla [.] boost::icl::interval_base_map<boost::icl::interval_map<compatible_ring_position_or_view, std::unordered_set<seastar::lw_shared_ptr<sstables::sstable>, std::hash<seastar::lw_shared_ptr<sstables::sstable> >, std::equal_to<seastar::lw_shared_ptr<sstables::sst
+ 4.09% 4.07% reactor-3 scylla [.] sstables::index_consume_entry_context<sstables::index_consumer>::process_state
+ 3.46% 0.93% reactor-3 scylla [.] sstables::sstable_run::will_introduce_overlapping
+ 2.53% 2.53% reactor-3 libstdc++.so.6 [.] std::_Rb_tree_increment
+ 2.45% 2.45% reactor-3 scylla [.] boost::icl::non_empty::exclusive_less<boost::icl::continuous_interval<compatible_ring_position_or_view, std::less> >
+ 2.14% 2.13% reactor-3 scylla [.] boost::icl::exclusive_less<boost::icl::continuous_interval<compatible_ring_position_or_view, std::less> >
+ 2.07% 2.07% reactor-3 scylla [.] logalloc::region_impl::free
+ 2.06% 1.91% reactor-3 scylla [.] sstables::index_consumer::consume_entry(sstables::parsed_partition_index_entry&&)::{lambda()#1}::operator()() const::{lambda()#1}::operator()
+ 2.04% 2.04% reactor-3 scylla [.] boost::icl::interval_base_map<boost::icl::interval_map<compatible_ring_position_or_view, std::unordered_set<seastar::lw_shared_ptr<sstables::sstable>, std::hash<seastar::lw_shared_ptr<sstables::sstable> >, std::equal_to<seastar::lw_shared_ptr<sstables::sst
+ 1.87% 0.00% reactor-3 [kernel.kallsyms] [k] entry_SYSCALL_64_after_hwframe
+ 1.86% 0.00% reactor-3 [kernel.kallsyms] [k] do_syscall_64
+ 1.39% 1.38% reactor-3 libc.so.6 [.] __memcmp_avx2_movbe
+ 1.37% 0.92% reactor-3 scylla [.] boost::icl::segmental::join_left<boost::icl::interval_map<compatible_ring_position_or_view, std::unordered_set<seastar::lw_shared_ptr<sstables::sstable>, std::hash<seastar::lw_shared_ptr<sstables::sstable> >, std::equal_to<seastar::lw_shared_ptr<sstables::
+ 1.34% 1.33% reactor-3 scylla [.] logalloc::region_impl::alloc_small
+ 1.33% 1.33% reactor-3 scylla [.] seastar::memory::small_pool::add_more_objects
+ 1.30% 0.35% reactor-3 scylla [.] seastar::reactor::do_run
+ 1.29% 1.29% reactor-3 scylla [.] seastar::memory::allocate
+ 1.19% 0.05% reactor-3 libc.so.6 [.] syscall
+ 1.16% 1.04% reactor-3 scylla [.] boost::icl::interval_base_map<boost::icl::interval_map<compatible_ring_position_or_view, std::unordered_set<seastar::lw_shared_ptr<sstables::sstable>, std::hash<seastar::lw_shared_ptr<sstables::sstable> >, std::equal_to<seastar::lw_shared_ptr<sstables::sst
+ 1.07% 0.79% reactor-3 scylla [.] sstables::partitioned_sstable_set::insert
That shows a significant amount of work spent inserting sstables
into the interval map and maintaining the sstable run (which sorts
fragments by first key and checks for overlapping).
The interval map is known for having issues with L0 sstables, as
each of them has to be replicated to almost every single interval
stored by the map, causing terrible space and time complexity.
With enough L0 sstables, it can fall into quadratic behavior.
This overhead is fixed by not building a new fresh sstable set
when recreating the reader, but rather supplying a predicate
to sstable set that will filter out staging sstables when
creating either a single-key or range scan reader.
This could have another benefit over today's approach, which
may incorrectly treat a staging sstable as non-staging if
the staging sstable wasn't included in the current batch for view
building.
With this improvement, view building was measured to be 3x faster.
from
INFO 2023-06-16 12:36:40,014 [shard 0] view_update_generator - Processed keyspace1.standard1: 5 sstables in 963957ms = 50kB/s
to
INFO 2023-06-16 14:47:12,129 [shard 0] view_update_generator - Processed keyspace1.standard1: 5 sstables in 319899ms = 150kB/s
Refs #14089.
Fixes #14244.
Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
158 lines
7.7 KiB
C++
158 lines
7.7 KiB
C++
/*
 * Copyright (C) 2020-present ScyllaDB
 */

/*
 * SPDX-License-Identifier: AGPL-3.0-or-later
 */
|
|
|
|
#pragma once
|
|
|
|
#include <boost/icl/interval_map.hpp>
|
|
|
|
#include "compatible_ring_position.hh"
|
|
#include "sstable_set.hh"
|
|
#include "readers/clustering_combined.hh"
|
|
#include "sstables/types_fwd.hh"
|
|
|
|
namespace sstables {
|
|
|
|
// specialized when sstables are partitioned in the token range space
|
|
// e.g. leveled compaction strategy
|
|
class partitioned_sstable_set : public sstable_set_impl {
|
|
using value_set = std::unordered_set<shared_sstable>;
|
|
using interval_map_type = boost::icl::interval_map<compatible_ring_position_or_view, value_set>;
|
|
using interval_type = interval_map_type::interval_type;
|
|
using map_iterator = interval_map_type::const_iterator;
|
|
private:
|
|
schema_ptr _schema;
|
|
std::vector<shared_sstable> _unleveled_sstables;
|
|
interval_map_type _leveled_sstables;
|
|
lw_shared_ptr<sstable_list> _all;
|
|
std::unordered_map<run_id, sstable_run> _all_runs;
|
|
// Change counter on interval map for leveled sstables which is used by
|
|
// incremental selector to determine whether or not to invalidate iterators.
|
|
uint64_t _leveled_sstables_change_cnt = 0;
|
|
bool _use_level_metadata = false;
|
|
private:
|
|
static interval_type make_interval(const schema& s, const dht::partition_range& range);
|
|
interval_type make_interval(const dht::partition_range& range) const;
|
|
static interval_type make_interval(const schema_ptr& s, const sstable& sst);
|
|
interval_type make_interval(const sstable& sst);
|
|
interval_type singular(const dht::ring_position& rp) const;
|
|
std::pair<map_iterator, map_iterator> query(const dht::partition_range& range) const;
|
|
// SSTables are stored separately to avoid interval map's fragmentation issue when level 0 falls behind.
|
|
bool store_as_unleveled(const shared_sstable& sst) const;
|
|
public:
|
|
static dht::ring_position to_ring_position(const compatible_ring_position_or_view& crp);
|
|
static dht::partition_range to_partition_range(const interval_type& i);
|
|
static dht::partition_range to_partition_range(const dht::ring_position_view& pos, const interval_type& i);
|
|
|
|
partitioned_sstable_set(const partitioned_sstable_set&) = delete;
|
|
explicit partitioned_sstable_set(schema_ptr schema, bool use_level_metadata = true);
|
|
// For cloning the partitioned_sstable_set (makes a deep copy, including *_all)
|
|
explicit partitioned_sstable_set(
|
|
schema_ptr schema,
|
|
const std::vector<shared_sstable>& unleveled_sstables,
|
|
const interval_map_type& leveled_sstables,
|
|
const lw_shared_ptr<sstable_list>& all,
|
|
const std::unordered_map<run_id, sstable_run>& all_runs,
|
|
bool use_level_metadata);
|
|
|
|
virtual std::unique_ptr<sstable_set_impl> clone() const override;
|
|
virtual std::vector<shared_sstable> select(const dht::partition_range& range) const override;
|
|
virtual std::vector<sstable_run> select_sstable_runs(const std::vector<shared_sstable>& sstables) const override;
|
|
virtual lw_shared_ptr<const sstable_list> all() const override;
|
|
virtual stop_iteration for_each_sstable_until(std::function<stop_iteration(const shared_sstable&)> func) const override;
|
|
virtual future<stop_iteration> for_each_sstable_gently_until(std::function<future<stop_iteration>(const shared_sstable&)> func) const override;
|
|
virtual void insert(shared_sstable sst) override;
|
|
virtual void erase(shared_sstable sst) override;
|
|
virtual size_t size() const noexcept override;
|
|
virtual std::unique_ptr<incremental_selector_impl> make_incremental_selector() const override;
|
|
class incremental_selector;
|
|
};
|
|
|
|
class time_series_sstable_set : public sstable_set_impl {
|
|
private:
|
|
using container_t = std::multimap<position_in_partition, shared_sstable, position_in_partition::less_compare>;
|
|
|
|
schema_ptr _schema;
|
|
schema_ptr _reversed_schema; // == _schema->make_reversed();
|
|
bool _enable_optimized_twcs_queries;
|
|
// s.min_position() -> s, ordered using _schema
|
|
lw_shared_ptr<container_t> _sstables;
|
|
// s.max_position().reversed() -> s, ordered using _reversed_schema; the set of values is the same as in _sstables
|
|
lw_shared_ptr<container_t> _sstables_reversed;
|
|
|
|
public:
|
|
time_series_sstable_set(schema_ptr schema, bool enable_optimized_twcs_queries);
|
|
time_series_sstable_set(const time_series_sstable_set& s);
|
|
|
|
virtual std::unique_ptr<sstable_set_impl> clone() const override;
|
|
virtual std::vector<shared_sstable> select(const dht::partition_range& range = query::full_partition_range) const override;
|
|
virtual lw_shared_ptr<const sstable_list> all() const override;
|
|
virtual stop_iteration for_each_sstable_until(std::function<stop_iteration(const shared_sstable&)> func) const override;
|
|
virtual future<stop_iteration> for_each_sstable_gently_until(std::function<future<stop_iteration>(const shared_sstable&)> func) const override;
|
|
virtual void insert(shared_sstable sst) override;
|
|
virtual void erase(shared_sstable sst) override;
|
|
virtual size_t size() const noexcept override;
|
|
virtual std::unique_ptr<incremental_selector_impl> make_incremental_selector() const override;
|
|
|
|
std::unique_ptr<position_reader_queue> make_position_reader_queue(
|
|
std::function<flat_mutation_reader_v2(sstable&)> create_reader,
|
|
std::function<bool(const sstable&)> filter,
|
|
partition_key pk, schema_ptr schema, reader_permit permit,
|
|
streamed_mutation::forwarding fwd_sm,
|
|
bool reversed) const;
|
|
|
|
virtual flat_mutation_reader_v2 create_single_key_sstable_reader(
|
|
replica::column_family*,
|
|
schema_ptr,
|
|
reader_permit,
|
|
utils::estimated_histogram&,
|
|
const dht::partition_range&,
|
|
const query::partition_slice&,
|
|
tracing::trace_state_ptr,
|
|
streamed_mutation::forwarding,
|
|
mutation_reader::forwarding,
|
|
const sstable_predicate&) const override;
|
|
|
|
friend class sstable_position_reader_queue;
|
|
};
|
|
|
|
// this compound set holds reference to N sstable sets and allow their operations to be combined.
|
|
// the managed sets cannot be modified through compound_sstable_set, but only jointly read from, so insert() and erase() are disabled.
|
|
class compound_sstable_set : public sstable_set_impl {
|
|
schema_ptr _schema;
|
|
std::vector<lw_shared_ptr<sstable_set>> _sets;
|
|
public:
|
|
compound_sstable_set(schema_ptr schema, std::vector<lw_shared_ptr<sstable_set>> sets);
|
|
|
|
virtual std::unique_ptr<sstable_set_impl> clone() const override;
|
|
virtual std::vector<shared_sstable> select(const dht::partition_range& range = query::full_partition_range) const override;
|
|
virtual std::vector<sstable_run> select_sstable_runs(const std::vector<shared_sstable>& sstables) const override;
|
|
virtual lw_shared_ptr<const sstable_list> all() const override;
|
|
virtual stop_iteration for_each_sstable_until(std::function<stop_iteration(const shared_sstable&)> func) const override;
|
|
virtual future<stop_iteration> for_each_sstable_gently_until(std::function<future<stop_iteration>(const shared_sstable&)> func) const override;
|
|
virtual void insert(shared_sstable sst) override;
|
|
virtual void erase(shared_sstable sst) override;
|
|
virtual size_t size() const noexcept override;
|
|
virtual std::unique_ptr<incremental_selector_impl> make_incremental_selector() const override;
|
|
|
|
virtual flat_mutation_reader_v2 create_single_key_sstable_reader(
|
|
replica::column_family*,
|
|
schema_ptr,
|
|
reader_permit,
|
|
utils::estimated_histogram&,
|
|
const dht::partition_range&,
|
|
const query::partition_slice&,
|
|
tracing::trace_state_ptr,
|
|
streamed_mutation::forwarding,
|
|
mutation_reader::forwarding,
|
|
const sstable_predicate&) const override;
|
|
|
|
class incremental_selector;
|
|
};
|
|
|
|
} // namespace sstables
|