diff --git a/configure.py b/configure.py index 46aa71950a..75b81c71f0 100755 --- a/configure.py +++ b/configure.py @@ -333,6 +333,7 @@ scylla_tests = set([ 'test/boost/estimated_histogram_test', 'test/boost/logalloc_test', 'test/boost/managed_vector_test', + 'test/boost/intrusive_array_test', 'test/boost/map_difference_test', 'test/boost/memtable_test', 'test/boost/meta_test', @@ -388,6 +389,8 @@ scylla_tests = set([ 'test/boost/view_schema_ckey_test', 'test/boost/vint_serialization_test', 'test/boost/virtual_reader_test', + 'test/boost/bptree_test', + 'test/boost/double_decker_test', 'test/manual/ec2_snitch_test', 'test/manual/gce_snitch_test', 'test/manual/gossip', @@ -404,6 +407,7 @@ scylla_tests = set([ 'test/perf/perf_fast_forward', 'test/perf/perf_hash', 'test/perf/perf_mutation', + 'test/perf/perf_bptree', 'test/perf/perf_row_cache_update', 'test/perf/perf_simple_query', 'test/perf/perf_sstable', @@ -411,6 +415,8 @@ scylla_tests = set([ 'test/unit/lsa_sync_eviction_test', 'test/unit/row_cache_alloc_stress_test', 'test/unit/row_cache_stress_test', + 'test/unit/bptree_stress_test', + 'test/unit/bptree_compaction_test', ]) perf_tests = set([ @@ -958,6 +964,7 @@ pure_boost_tests = set([ 'test/boost/small_vector_test', 'test/boost/top_k_test', 'test/boost/vint_serialization_test', + 'test/boost/bptree_test', 'test/manual/streaming_histogram_test', ]) @@ -971,10 +978,13 @@ tests_not_using_seastar_test_framework = set([ 'test/perf/perf_cql_parser', 'test/perf/perf_hash', 'test/perf/perf_mutation', + 'test/perf/perf_bptree', 'test/perf/perf_row_cache_update', 'test/unit/lsa_async_eviction_test', 'test/unit/lsa_sync_eviction_test', 'test/unit/row_cache_alloc_stress_test', + 'test/unit/bptree_stress_test', + 'test/unit/bptree_compaction_test', 'test/manual/sstable_scan_footprint_test', ]) | pure_boost_tests diff --git a/dht/i_partitioner.cc b/dht/i_partitioner.cc index 665a81415e..1d269dea85 100644 --- a/dht/i_partitioner.cc +++ b/dht/i_partitioner.cc @@ -316,11 
+316,7 @@ int ring_position_tri_compare(const schema& s, ring_position_view lh, ring_posit } } -int ring_position_comparator::operator()(ring_position_view lh, ring_position_view rh) const { - return ring_position_tri_compare(s, lh, rh); -} - -int ring_position_comparator::operator()(ring_position_view lh, sstables::decorated_key_view rh) const { +int ring_position_comparator_for_sstables::operator()(ring_position_view lh, sstables::decorated_key_view rh) const { auto token_cmp = tri_compare(*lh._token, rh.token()); if (token_cmp) { return token_cmp; @@ -334,7 +330,7 @@ int ring_position_comparator::operator()(ring_position_view lh, sstables::decora return lh._weight; } -int ring_position_comparator::operator()(sstables::decorated_key_view a, ring_position_view b) const { +int ring_position_comparator_for_sstables::operator()(sstables::decorated_key_view a, ring_position_view b) const { return -(*this)(b, a); } diff --git a/dht/i_partitioner.hh b/dht/i_partitioner.hh index 115423d92a..9d8fa99ea9 100644 --- a/dht/i_partitioner.hh +++ b/dht/i_partitioner.hh @@ -330,6 +330,7 @@ public: class ring_position_view { friend int ring_position_tri_compare(const schema& s, ring_position_view lh, ring_position_view rh); friend class ring_position_comparator; + friend class ring_position_comparator_for_sstables; friend class ring_position_ext; // Order is lexicographical on (_token, _key) tuples, where _key part may be missing, and @@ -566,11 +567,40 @@ public: int ring_position_tri_compare(const schema& s, ring_position_view lh, ring_position_view rh); +template +requires std::is_convertible::value +ring_position_view ring_position_view_to_compare(const T& val) { + return val; +} + // Trichotomic comparator for ring order struct ring_position_comparator { const schema& s; ring_position_comparator(const schema& s_) : s(s_) {} - int operator()(ring_position_view, ring_position_view) const; + + int operator()(ring_position_view lh, ring_position_view rh) const { + return 
ring_position_tri_compare(s, lh, rh); + } + + template + int operator()(const T& lh, ring_position_view rh) const { + return ring_position_tri_compare(s, ring_position_view_to_compare(lh), rh); + } + + template + int operator()(ring_position_view lh, const T& rh) const { + return ring_position_tri_compare(s, lh, ring_position_view_to_compare(rh)); + } + + template + int operator()(const T1& lh, const T2& rh) const { + return ring_position_tri_compare(s, ring_position_view_to_compare(lh), ring_position_view_to_compare(rh)); + } +}; + +struct ring_position_comparator_for_sstables { + const schema& s; + ring_position_comparator_for_sstables(const schema& s_) : s(s_) {} int operator()(ring_position_view, sstables::decorated_key_view) const; int operator()(sstables::decorated_key_view, ring_position_view) const; }; diff --git a/dht/token.cc b/dht/token.cc index 1056153d33..8b44757d7d 100644 --- a/dht/token.cc +++ b/dht/token.cc @@ -59,13 +59,7 @@ int tri_compare(const token& t1, const token& t2) { } else if (t1._kind > t2._kind) { return 1; } else if (t1._kind == token_kind::key) { - auto l1 = long_token(t1); - auto l2 = long_token(t2); - if (l1 == l2) { - return 0; - } else { - return l1 < l2 ? -1 : 1; - } + return tri_compare_raw(long_token(t1), long_token(t2)); } return 0; } diff --git a/dht/token.hh b/dht/token.hh index 8ba1b0b53d..de7f4b14f4 100644 --- a/dht/token.hh +++ b/dht/token.hh @@ -160,6 +160,47 @@ public: return 0; // hardcoded for now; unlikely to change } + int64_t raw() const noexcept { + if (is_minimum()) { + return std::numeric_limits::min(); + } + if (is_maximum()) { + return std::numeric_limits::max(); + } + + return _data; + } +}; + +static inline int tri_compare_raw(const int64_t l1, const int64_t l2) noexcept { + if (l1 == l2) { + return 0; + } else { + return l1 < l2 ? 
-1 : 1; + } +} + +template +concept TokenCarrier = requires (const T& v) { + { v.token() } -> std::same_as; +}; + +struct raw_token_less_comparator { + bool operator()(const int64_t k1, const int64_t k2) const noexcept { + return dht::tri_compare_raw(k1, k2) < 0; + } + + template + requires TokenCarrier + bool operator()(const Key& k1, const int64_t k2) const noexcept { + return dht::tri_compare_raw(k1.token().raw(), k2) < 0; + } + + template + requires TokenCarrier + bool operator()(const int64_t k1, const Key& k2) const noexcept { + return dht::tri_compare_raw(k1, k2.token().raw()) < 0; + } }; const token& minimum_token() noexcept; diff --git a/memtable.cc b/memtable.cc index 14c9b02e9e..95472cb9f4 100644 --- a/memtable.cc +++ b/memtable.cc @@ -117,7 +117,7 @@ memtable::memtable(schema_ptr schema, dirty_memory_manager& dmm, table_stats& ta , _cleaner(*this, no_cache_tracker, table_stats.memtable_app_stats, compaction_scheduling_group) , _memtable_list(memtable_list) , _schema(std::move(schema)) - , partitions(memtable_entry::compare(_schema)) + , partitions(dht::raw_token_less_comparator{}) , _table_stats(table_stats) { } @@ -137,12 +137,16 @@ uint64_t memtable::dirty_size() const { return occupancy().total_space(); } +void memtable::evict_entry(memtable_entry& e, mutation_cleaner& cleaner) noexcept { + e.partition().evict(cleaner); + nr_partitions--; +} + void memtable::clear() noexcept { auto dirty_before = dirty_size(); with_allocator(allocator(), [this] { - partitions.clear_and_dispose([this] (memtable_entry* e) { - e->partition().evict(_cleaner); - current_deleter()(e); + partitions.clear_and_dispose([this] (memtable_entry* e) noexcept { + evict_entry(*e, _cleaner); }); }); remove_flushed_memory(dirty_before - dirty_size()); @@ -154,6 +158,7 @@ future<> memtable::clear_gently() noexcept { auto& alloc = allocator(); auto p = std::move(partitions); + nr_partitions = 0; while (!p.empty()) { auto dirty_before = dirty_size(); with_allocator(alloc, [&] () noexcept 
{ @@ -161,9 +166,7 @@ future<> memtable::clear_gently() noexcept { if (p.begin()->clear_gently() == stop_iteration::no) { break; } - p.erase_and_dispose(p.begin(), [&] (auto e) { - alloc.destroy(e); - }); + p.begin().erase(dht::raw_token_less_comparator{}); if (need_preempt()) { break; } @@ -172,6 +175,13 @@ future<> memtable::clear_gently() noexcept { remove_flushed_memory(dirty_before - dirty_size()); seastar::thread::yield(); } + + /* + * The collection is not guaranteed to free everything + * with the last erase. If anything gets freed in destructor, + * it will be unaccounted from wrong allocator, so handle it + */ + with_allocator(alloc, [&p] { p.clear(); }); }); auto f = t->join(); return f.then([t = std::move(t)] {}); @@ -205,12 +215,17 @@ memtable::find_or_create_partition(const dht::decorated_key& key) { assert(!reclaiming_enabled()); // call lower_bound so we have a hint for the insert, just in case. - auto i = partitions.lower_bound(key, memtable_entry::compare(_schema)); - if (i == partitions.end() || !key.equal(*_schema, i->key())) { - memtable_entry* entry = current_allocator().construct( - _schema, dht::decorated_key(key), mutation_partition(_schema)); - partitions.insert_before(i, *entry); + partitions_type::bound_hint hint; + auto i = partitions.lower_bound(key, dht::ring_position_comparator(*_schema), hint); + if (i == partitions.end() || !hint.match) { + partitions_type::iterator entry = partitions.emplace_before(i, + key.token().raw(), hint, + _schema, dht::decorated_key(key), mutation_partition(_schema)); + ++nr_partitions; ++_table_stats.memtable_partition_insertions; + if (!hint.emplace_keeps_iterators()) { + current_allocator().invalidate_references(); + } return entry->partition(); } else { ++_table_stats.memtable_partition_hits; @@ -223,14 +238,14 @@ boost::iterator_range memtable::slice(const dht::partition_range& range) const { if (query::is_single_partition(range)) { const query::ring_position& pos = range.start()->value(); - auto i = 
partitions.find(pos, memtable_entry::compare(_schema)); + auto i = partitions.find(pos, dht::ring_position_comparator(*_schema)); if (i != partitions.end()) { return boost::make_iterator_range(i, std::next(i)); } else { return boost::make_iterator_range(i, i); } } else { - auto cmp = memtable_entry::compare(_schema); + auto cmp = dht::ring_position_comparator(*_schema); auto i1 = range.start() ? (range.start()->is_inclusive() @@ -259,7 +274,7 @@ class iterator_reader { size_t _last_partition_count = 0; memtable::partitions_type::iterator lookup_end() { - auto cmp = memtable_entry::compare(_memtable->_schema); + auto cmp = dht::ring_position_comparator(*_memtable->_schema); return _range->end() ? (_range->end()->is_inclusive() ? _memtable->partitions.upper_bound(_range->end()->value(), cmp) @@ -269,7 +284,7 @@ class iterator_reader { void update_iterators() { // We must be prepared that iterators may get invalidated during compaction. auto current_reclaim_counter = _memtable->reclaim_counter(); - auto cmp = memtable_entry::compare(_memtable->_schema); + auto cmp = dht::ring_position_comparator(*_memtable->_schema); if (_last) { if (current_reclaim_counter != _last_reclaim_counter || _last_partition_count != _memtable->partition_count()) { @@ -652,7 +667,7 @@ memtable::make_flat_reader(schema_ptr s, const query::ring_position& pos = range.start()->value(); auto snp = _read_section(*this, [&] () -> partition_snapshot_ptr { managed_bytes::linearization_context_guard lcg; - auto i = partitions.find(pos, memtable_entry::compare(_schema)); + auto i = partitions.find(pos, dht::ring_position_comparator(*_schema)); if (i != partitions.end()) { upgrade_entry(*i); return i->snapshot(*this); @@ -759,20 +774,12 @@ mutation_source memtable::as_data_source() { }); } -size_t memtable::partition_count() const { - return partitions.size(); -} - memtable_entry::memtable_entry(memtable_entry&& o) noexcept - : _link() - , _schema(std::move(o._schema)) + : _schema(std::move(o._schema)) , 
_key(std::move(o._key)) , _pe(std::move(o._pe)) -{ - using container_type = memtable::partitions_type; - container_type::node_algorithms::replace_node(o._link.this_ptr(), _link.this_ptr()); - container_type::node_algorithms::init(o._link.this_ptr()); -} + , _flags(o._flags) +{ } stop_iteration memtable_entry::clear_gently() noexcept { return _pe.clear_gently(no_cache_tracker); @@ -808,6 +815,10 @@ void memtable::set_schema(schema_ptr new_schema) noexcept { _schema = std::move(new_schema); } +size_t memtable_entry::object_memory_size(allocation_strategy& allocator) { + return memtable::partitions_type::estimated_object_memory_size_in_allocator(allocator, this); +} + std::ostream& operator<<(std::ostream& out, memtable& mt) { logalloc::reclaim_lock rl(mt); return out << "{memtable: [" << ::join(",\n", mt.partitions) << "]}"; diff --git a/memtable.hh b/memtable.hh index 47361b7237..5a810c9c66 100644 --- a/memtable.hh +++ b/memtable.hh @@ -32,11 +32,11 @@ #include "db/commitlog/replay_position.hh" #include "db/commitlog/rp_set.hh" #include "utils/extremum_tracking.hh" -#include "utils/logalloc.hh" #include "partition_version.hh" #include "flat_mutation_reader.hh" #include "mutation_cleaner.hh" #include "sstables/types.hh" +#include "utils/double-decker.hh" class frozen_mutation; @@ -44,11 +44,22 @@ class frozen_mutation; namespace bi = boost::intrusive; class memtable_entry { - bi::set_member_hook<> _link; schema_ptr _schema; dht::decorated_key _key; partition_entry _pe; + struct { + bool _head : 1; + bool _tail : 1; + bool _train : 1; + } _flags{}; public: + bool is_head() const noexcept { return _flags._head; } + void set_head(bool v) noexcept { _flags._head = v; } + bool is_tail() const noexcept { return _flags._tail; } + void set_tail(bool v) noexcept { _flags._tail = v; } + bool with_train() const noexcept { return _flags._train; } + void set_train(bool v) noexcept { _flags._train = v; } + friend class memtable; memtable_entry(schema_ptr s, dht::decorated_key key, 
mutation_partition p) @@ -77,8 +88,10 @@ public: return _key.key().external_memory_usage(); } + size_t object_memory_size(allocation_strategy& allocator); + size_t size_in_allocator_without_rows(allocation_strategy& allocator) { - return allocator.object_memory_size_in_allocator(this) + external_memory_usage_without_rows(); + return object_memory_size(allocator) + external_memory_usage_without_rows(); } size_t size_in_allocator(allocation_strategy& allocator) { @@ -89,34 +102,7 @@ public: return size; } - struct compare { - dht::decorated_key::less_comparator _c; - - compare(schema_ptr s) - : _c(std::move(s)) - {} - - bool operator()(const dht::decorated_key& k1, const memtable_entry& k2) const { - return _c(k1, k2._key); - } - - bool operator()(const memtable_entry& k1, const memtable_entry& k2) const { - return _c(k1._key, k2._key); - } - - bool operator()(const memtable_entry& k1, const dht::decorated_key& k2) const { - return _c(k1._key, k2); - } - - bool operator()(const memtable_entry& k1, const dht::ring_position& k2) const { - return _c(k1._key, k2); - } - - bool operator()(const dht::ring_position& k1, const memtable_entry& k2) const { - return _c(k1, k2._key); - } - }; - + friend dht::ring_position_view ring_position_view_to_compare(const memtable_entry& mt) { return mt._key; } friend std::ostream& operator<<(std::ostream&, const memtable_entry&); }; @@ -126,9 +112,9 @@ struct table_stats; // Managed by lw_shared_ptr<>. 
class memtable final : public enable_lw_shared_from_this, private logalloc::region { public: - using partitions_type = bi::set, &memtable_entry::_link>, - bi::compare>; + using partitions_type = double_decker; private: dirty_memory_manager& _dirty_mgr; mutation_cleaner _cleaner; @@ -137,6 +123,7 @@ private: logalloc::allocating_section _read_section; logalloc::allocating_section _allocating_section; partitions_type partitions; + size_t nr_partitions = 0; db::replay_position _replay_position; db::rp_set _rp_set; // mutation source to which reads fall-back after mark_flushed() @@ -203,6 +190,7 @@ public: void apply(const mutation& m, db::rp_handle&& = {}); // The mutation is upgraded to current schema. void apply(const frozen_mutation& m, const schema_ptr& m_schema, db::rp_handle&& = {}); + void evict_entry(memtable_entry& e, mutation_cleaner& cleaner) noexcept; static memtable& from_region(logalloc::region& r) { return static_cast(r); @@ -236,7 +224,7 @@ public: return _memtable_list; } - size_t partition_count() const; + size_t partition_count() const { return nr_partitions; } logalloc::occupancy_stats occupancy() const; // Creates a reader of data in this memtable for given partition range. diff --git a/row_cache.cc b/row_cache.cc index 873c535338..5f7bc52295 100644 --- a/row_cache.cc +++ b/row_cache.cc @@ -283,13 +283,13 @@ public: // Can be called on invalid cursor, in which case it brings it back to validity. // Strong exception guarantees. bool advance_to(dht::ring_position_view pos) { - auto cmp = cache_entry::compare(_cache.get()._schema); - if (cmp(_end_pos, pos)) { // next() may have moved _start_pos past the _end_pos. + dht::ring_position_comparator cmp(*_cache.get()._schema); + if (cmp(_end_pos, pos) < 0) { // next() may have moved _start_pos past the _end_pos. 
_end_pos = pos; } _end = _cache.get()._partitions.lower_bound(_end_pos, cmp); _it = _cache.get()._partitions.lower_bound(pos, cmp); - auto same = !cmp(pos, _it->position()); + auto same = cmp(pos, _it->position()) >= 0; set_position(*_it); _last_reclaim_count = _cache.get().get_cache_tracker().allocator().invalidate_counter(); return same; @@ -375,13 +375,14 @@ private: _cache._read_section(_cache._tracker.region(), [this] { with_allocator(_cache._tracker.allocator(), [this] { dht::decorated_key dk = _read_context->range().start()->value().as_decorated_key(); - _cache.do_find_or_create_entry(dk, nullptr, [&] (auto i) { + _cache.do_find_or_create_entry(dk, nullptr, [&] (auto i, const row_cache::partitions_type::bound_hint& hint) { mutation_partition mp(_cache._schema); - cache_entry* entry = current_allocator().construct( + bool cont = i->continuous(); + row_cache::partitions_type::iterator entry = _cache._partitions.emplace_before(i, dk.token().raw(), hint, _cache._schema, std::move(dk), std::move(mp)); _cache._tracker.insert(*entry); - entry->set_continuous(i->continuous()); - return _cache._partitions.insert_before(i, *entry); + entry->set_continuous(cont); + return entry; }, [&] (auto i) { _cache._tracker.on_miss_already_populated(); }); @@ -496,7 +497,7 @@ private: return; } if (!_reader.range().end() || !_reader.range().end()->is_inclusive()) { - cache_entry::compare cmp(_cache._schema); + dht::ring_position_comparator cmp(*_cache._schema); auto it = _reader.range().end() ? 
_cache._partitions.find(_reader.range().end()->value(), cmp) : std::prev(_cache._partitions.end()); if (it != _cache._partitions.end()) { @@ -754,10 +755,10 @@ row_cache::make_reader(schema_ptr s, if (!ctx->is_range_query() && !fwd_mr) { auto mr = _read_section(_tracker.region(), [&] { return with_linearized_managed_bytes([&] { - cache_entry::compare cmp(_schema); + dht::ring_position_comparator cmp(*_schema); auto&& pos = ctx->range().start()->value(); auto i = _partitions.lower_bound(pos, cmp); - if (i != _partitions.end() && !cmp(pos, i->position())) { + if (i != _partitions.end() && cmp(pos, i->position()) >= 0) { cache_entry& e = *i; upgrade_entry(e); on_partition_hit(); @@ -789,22 +790,20 @@ row_cache::make_reader(schema_ptr s, row_cache::~row_cache() { with_allocator(_tracker.allocator(), [this] { - _partitions.clear_and_dispose([this, deleter = current_deleter()] (auto&& p) mutable { + _partitions.clear_and_dispose([this] (cache_entry* p) mutable noexcept { if (!p->is_dummy_entry()) { _tracker.on_partition_erase(); } p->evict(_tracker); - deleter(p); }); }); } void row_cache::clear_now() noexcept { with_allocator(_tracker.allocator(), [this] { - auto it = _partitions.erase_and_dispose(_partitions.begin(), partitions_end(), [this, deleter = current_deleter()] (auto&& p) mutable { + auto it = _partitions.erase_and_dispose(_partitions.begin(), partitions_end(), [this] (cache_entry* p) noexcept { _tracker.on_partition_erase(); p->evict(_tracker); - deleter(p); }); _tracker.clear_continuity(*it); }); @@ -820,9 +819,11 @@ cache_entry& row_cache::do_find_or_create_entry(const dht::decorated_key& key, { return with_allocator(_tracker.allocator(), [&] () -> cache_entry& { return with_linearized_managed_bytes([&] () -> cache_entry& { - auto i = _partitions.lower_bound(key, cache_entry::compare(_schema)); - if (i == _partitions.end() || !i->key().equal(*_schema, key)) { - i = create_entry(i); + partitions_type::bound_hint hint; + dht::ring_position_comparator 
cmp(*_schema); + auto i = _partitions.lower_bound(key, cmp, hint); + if (i == _partitions.end() || !hint.match) { + i = create_entry(i, hint); } else { visit_entry(i); } @@ -845,10 +846,11 @@ cache_entry& row_cache::do_find_or_create_entry(const dht::decorated_key& key, } cache_entry& row_cache::find_or_create(const dht::decorated_key& key, tombstone t, row_cache::phase_type phase, const previous_entry_pointer* previous) { - return do_find_or_create_entry(key, previous, [&] (auto i) { // create - auto entry = current_allocator().construct(cache_entry::incomplete_tag{}, _schema, key, t); + return do_find_or_create_entry(key, previous, [&] (auto i, const partitions_type::bound_hint& hint) { // create + partitions_type::iterator entry = _partitions.emplace_before(i, key.token().raw(), hint, + cache_entry::incomplete_tag{}, _schema, key, t); _tracker.insert(*entry); - return _partitions.insert_before(i, *entry); + return entry; }, [&] (auto i) { // visit _tracker.on_miss_already_populated(); cache_entry& e = *i; @@ -859,14 +861,13 @@ cache_entry& row_cache::find_or_create(const dht::decorated_key& key, tombstone void row_cache::populate(const mutation& m, const previous_entry_pointer* previous) { _populate_section(_tracker.region(), [&] { - do_find_or_create_entry(m.decorated_key(), previous, [&] (auto i) { - cache_entry* entry = current_allocator().construct( + do_find_or_create_entry(m.decorated_key(), previous, [&] (auto i, const partitions_type::bound_hint& hint) { + partitions_type::iterator entry = _partitions.emplace_before(i, m.decorated_key().token().raw(), hint, m.schema(), m.decorated_key(), m.partition()); _tracker.insert(*entry); entry->set_continuous(i->continuous()); - i = _partitions.insert_before(i, *entry); - upgrade_entry(*i); - return i; + upgrade_entry(*entry); + return entry; }, [&] (auto i) { throw std::runtime_error(format("cache already contains entry for {}", m.key())); }); @@ -898,15 +899,14 @@ void row_cache::invalidate_sync(memtable& m) 
noexcept { bool blow_cache = false; // Note: clear_and_dispose() ought not to look up any keys, so it doesn't require // with_linearized_managed_bytes(), but invalidate() does. - m.partitions.clear_and_dispose([this, deleter = current_deleter(), &blow_cache] (memtable_entry* entry) { + m.partitions.clear_and_dispose([this, &m, &blow_cache] (memtable_entry* entry) noexcept { with_linearized_managed_bytes([&] () noexcept { try { invalidate_locked(entry->key()); } catch (...) { blow_cache = true; } - entry->partition().evict(_tracker.memtable_cleaner()); - deleter(entry); + m.evict_entry(*entry, _tracker.memtable_cleaner()); }); }); if (blow_cache) { @@ -950,7 +950,7 @@ future<> row_cache::do_update(external_updater eu, memtable& m, Updater updater) partition_presence_checker is_present = _prev_snapshot->make_partition_presence_checker(); while (!m.partitions.empty()) { with_allocator(_tracker.allocator(), [&] () { - auto cmp = cache_entry::compare(_schema); + auto cmp = dht::ring_position_comparator(*_schema); { size_t partition_count = 0; { @@ -966,8 +966,9 @@ future<> row_cache::do_update(external_updater eu, memtable& m, Updater updater) with_linearized_managed_bytes([&] { memtable_entry& mem_e = *m.partitions.begin(); size_entry = mem_e.size_in_allocator_without_rows(_tracker.allocator()); - auto cache_i = _partitions.lower_bound(mem_e.key(), cmp); - update = updater(_update_section, cache_i, mem_e, is_present, real_dirty_acc); + partitions_type::bound_hint hint; + auto cache_i = _partitions.lower_bound(mem_e.key(), cmp, hint); + update = updater(_update_section, cache_i, mem_e, is_present, real_dirty_acc, hint); }); }); } @@ -982,10 +983,9 @@ future<> row_cache::do_update(external_updater eu, memtable& m, Updater updater) _update_section(_tracker.region(), [&] { with_linearized_managed_bytes([&] { auto i = m.partitions.begin(); - memtable_entry& mem_e = *i; - m.partitions.erase(i); - mem_e.partition().evict(_tracker.memtable_cleaner()); - 
current_allocator().destroy(&mem_e); + i.erase_and_dispose(dht::raw_token_less_comparator{}, [&] (memtable_entry* e) noexcept { + m.evict_entry(*e, _tracker.memtable_cleaner()); + }); }); }); ++partition_count; @@ -1015,11 +1015,11 @@ future<> row_cache::do_update(external_updater eu, memtable& m, Updater updater) future<> row_cache::update(external_updater eu, memtable& m) { return do_update(std::move(eu), m, [this] (logalloc::allocating_section& alloc, row_cache::partitions_type::iterator cache_i, memtable_entry& mem_e, partition_presence_checker& is_present, - real_dirty_memory_accounter& acc) mutable { + real_dirty_memory_accounter& acc, const partitions_type::bound_hint& hint) mutable { // If cache doesn't contain the entry we cannot insert it because the mutation may be incomplete. // FIXME: keep a bitmap indicating which sstables we do cover, so we don't have to // search it. - if (cache_i != partitions_end() && cache_i->key().equal(*_schema, mem_e.key())) { + if (cache_i != partitions_end() && hint.match) { cache_entry& entry = *cache_i; upgrade_entry(entry); assert(entry._schema == _schema); @@ -1031,12 +1031,11 @@ future<> row_cache::update(external_updater eu, memtable& m) { || with_allocator(standard_allocator(), [&] { return is_present(mem_e.key()); }) == partition_presence_checker_result::definitely_doesnt_exist) { // Partition is absent in underlying. First, insert a neutral partition entry. 
- cache_entry* entry = current_allocator().construct(cache_entry::evictable_tag(), - _schema, dht::decorated_key(mem_e.key()), + partitions_type::iterator entry = _partitions.emplace_before(cache_i, mem_e.key().token().raw(), hint, + cache_entry::evictable_tag(), _schema, dht::decorated_key(mem_e.key()), partition_entry::make_evictable(*_schema, mutation_partition(_schema))); entry->set_continuous(cache_i->continuous()); _tracker.insert(*entry); - _partitions.insert_before(cache_i, *entry); mem_e.upgrade_schema(_schema, _tracker.memtable_cleaner()); return entry->partition().apply_to_incomplete(*_schema, std::move(mem_e.partition()), _tracker.memtable_cleaner(), alloc, _tracker.region(), _tracker, _underlying_phase, acc); @@ -1049,7 +1048,7 @@ future<> row_cache::update(external_updater eu, memtable& m) { future<> row_cache::update_invalidating(external_updater eu, memtable& m) { return do_update(std::move(eu), m, [this] (logalloc::allocating_section& alloc, row_cache::partitions_type::iterator cache_i, memtable_entry& mem_e, partition_presence_checker& is_present, - real_dirty_memory_accounter& acc) + real_dirty_memory_accounter& acc, const partitions_type::bound_hint&) { if (cache_i != partitions_end() && cache_i->key().equal(*_schema, mem_e.key())) { // FIXME: Invalidate only affected row ranges. 
@@ -1072,7 +1071,7 @@ void row_cache::refresh_snapshot() { void row_cache::touch(const dht::decorated_key& dk) { _read_section(_tracker.region(), [&] { with_linearized_managed_bytes([&] { - auto i = _partitions.find(dk, cache_entry::compare(_schema)); + auto i = _partitions.find(dk, dht::ring_position_comparator(*_schema)); if (i != _partitions.end()) { for (partition_version& pv : i->partition().versions_from_oldest()) { for (rows_entry& row : pv.partition().clustered_rows()) { @@ -1087,7 +1086,7 @@ void row_cache::touch(const dht::decorated_key& dk) { void row_cache::unlink_from_lru(const dht::decorated_key& dk) { _read_section(_tracker.region(), [&] { with_linearized_managed_bytes([&] { - auto i = _partitions.find(dk, cache_entry::compare(_schema)); + auto i = _partitions.find(dk, dht::ring_position_comparator(*_schema)); if (i != _partitions.end()) { for (partition_version& pv : i->partition().versions_from_oldest()) { for (rows_entry& row : pv.partition().clustered_rows()) { @@ -1100,15 +1099,14 @@ void row_cache::unlink_from_lru(const dht::decorated_key& dk) { } void row_cache::invalidate_locked(const dht::decorated_key& dk) { - auto pos = _partitions.lower_bound(dk, cache_entry::compare(_schema)); + auto pos = _partitions.lower_bound(dk, dht::ring_position_comparator(*_schema)); if (pos == partitions_end() || !pos->key().equal(*_schema, dk)) { _tracker.clear_continuity(*pos); } else { - auto it = _partitions.erase_and_dispose(pos, - [this, &dk, deleter = current_deleter()](auto&& p) mutable { + auto it = pos.erase_and_dispose(dht::raw_token_less_comparator{}, + [this](cache_entry* p) mutable noexcept { _tracker.on_partition_erase(); p->evict(_tracker); - deleter(p); }); _tracker.clear_continuity(*it); } @@ -1138,17 +1136,16 @@ future<> row_cache::invalidate(external_updater eu, dht::partition_range_vector& while (true) { auto done = _update_section(_tracker.region(), [&] { return with_linearized_managed_bytes([&] { - auto cmp = cache_entry::compare(_schema); 
+ auto cmp = dht::ring_position_comparator(*_schema); auto it = _partitions.lower_bound(*_prev_snapshot_pos, cmp); auto end = _partitions.lower_bound(dht::ring_position_view::for_range_end(range), cmp); return with_allocator(_tracker.allocator(), [&] { - auto deleter = current_deleter(); while (it != end) { - it = _partitions.erase_and_dispose(it, [&] (cache_entry* p) mutable { - _tracker.on_partition_erase(); - p->evict(_tracker); - deleter(p); - }); + it = it.erase_and_dispose(dht::raw_token_less_comparator{}, + [&] (cache_entry* p) mutable noexcept { + _tracker.on_partition_erase(); + p->evict(_tracker); + }); // it != end is necessary for correctness. We cannot set _prev_snapshot_pos to end->position() // because after resuming something may be inserted before "end" which falls into the next range. if (need_preempt() && it != end) { @@ -1185,14 +1182,14 @@ void row_cache::evict() { row_cache::row_cache(schema_ptr s, snapshot_source src, cache_tracker& tracker, is_continuous cont) : _tracker(tracker) , _schema(std::move(s)) - , _partitions(cache_entry::compare(_schema)) + , _partitions(dht::raw_token_less_comparator{}) , _underlying(src()) , _snapshot_source(std::move(src)) { with_allocator(_tracker.allocator(), [this, cont] { - cache_entry* entry = current_allocator().construct(cache_entry::dummy_entry_tag()); - _partitions.insert_before(_partitions.end(), *entry); - entry->set_continuous(bool(cont)); + cache_entry entry(cache_entry::dummy_entry_tag{}); + entry.set_continuous(bool(cont)); + _partitions.insert(entry.position().token().raw(), std::move(entry), dht::ring_position_comparator{*_schema}); }); } @@ -1201,13 +1198,7 @@ cache_entry::cache_entry(cache_entry&& o) noexcept , _key(std::move(o._key)) , _pe(std::move(o._pe)) , _flags(o._flags) - , _cache_link() { - { - using container_type = row_cache::partitions_type; - container_type::node_algorithms::replace_node(o._cache_link.this_ptr(), _cache_link.this_ptr()); - 
container_type::node_algorithms::init(o._cache_link.this_ptr()); - } } cache_entry::~cache_entry() { @@ -1222,11 +1213,11 @@ void row_cache::set_schema(schema_ptr new_schema) noexcept { } void cache_entry::on_evicted(cache_tracker& tracker) noexcept { - auto it = row_cache::partitions_type::s_iterator_to(*this); + row_cache::partitions_type::iterator it(this); std::next(it)->set_continuous(false); evict(tracker); - current_deleter()(this); tracker.on_partition_eviction(); + it.erase(dht::raw_token_less_comparator{}); } void rows_entry::on_evicted(cache_tracker& tracker) noexcept { diff --git a/row_cache.hh b/row_cache.hh index 75b06d15a1..f80aa20579 100644 --- a/row_cache.hh +++ b/row_cache.hh @@ -31,7 +31,6 @@ #include "mutation_reader.hh" #include "mutation_partition.hh" -#include "utils/logalloc.hh" #include "utils/phased_barrier.hh" #include "utils/histogram.hh" #include "partition_version.hh" @@ -40,6 +39,7 @@ #include #include "flat_mutation_reader.hh" #include "mutation_cleaner.hh" +#include "utils/double-decker.hh" namespace bi = boost::intrusive; @@ -61,11 +61,6 @@ class lsa_manager; // // TODO: Make memtables use this format too. class cache_entry { - // We need auto_unlink<> option on the _cache_link because when entry is - // evicted from cache via LRU we don't have a reference to the container - // and don't want to store it with each entry. 
- using cache_link_type = bi::set_member_hook>; - schema_ptr _schema; dht::decorated_key _key; partition_entry _pe; @@ -73,8 +68,10 @@ class cache_entry { struct { bool _continuous : 1; bool _dummy_entry : 1; + bool _head : 1; + bool _tail : 1; + bool _train : 1; } _flags{}; - cache_link_type _cache_link; friend class size_calculator; flat_mutation_reader do_read(row_cache&, cache::read_context& reader); @@ -82,6 +79,13 @@ public: friend class row_cache; friend class cache_tracker; + bool is_head() const noexcept { return _flags._head; } + void set_head(bool v) noexcept { _flags._head = v; } + bool is_tail() const noexcept { return _flags._tail; } + void set_tail(bool v) noexcept { _flags._tail = v; } + bool with_train() const noexcept { return _flags._train; } + void set_train(bool v) noexcept { _flags._train = v; } + struct dummy_entry_tag{}; struct incomplete_tag{}; struct evictable_tag{}; @@ -137,6 +141,9 @@ public: } return _key; } + + friend dht::ring_position_view ring_position_view_to_compare(const cache_entry& ce) noexcept { return ce.position(); } + const partition_entry& partition() const noexcept { return _pe; } partition_entry& partition() { return _pe; } const schema_ptr& schema() const noexcept { return _schema; } @@ -148,38 +155,6 @@ public: bool is_dummy_entry() const noexcept { return _flags._dummy_entry; } - struct compare { - dht::ring_position_less_comparator _c; - - compare(schema_ptr s) - : _c(*s) - {} - - bool operator()(const dht::decorated_key& k1, const cache_entry& k2) const { - return _c(k1, k2.position()); - } - - bool operator()(dht::ring_position_view k1, const cache_entry& k2) const { - return _c(k1, k2.position()); - } - - bool operator()(const cache_entry& k1, const cache_entry& k2) const { - return _c(k1.position(), k2.position()); - } - - bool operator()(const cache_entry& k1, const dht::decorated_key& k2) const { - return _c(k1.position(), k2); - } - - bool operator()(const cache_entry& k1, dht::ring_position_view k2) const { - 
return _c(k1.position(), k2); - } - - bool operator()(dht::ring_position_view k1, dht::ring_position_view k2) const { - return _c(k1, k2); - } - }; - friend std::ostream& operator<<(std::ostream&, cache_entry&); }; @@ -315,10 +290,9 @@ void cache_tracker::insert(partition_entry& pe) noexcept { class row_cache final { public: using phase_type = utils::phased_barrier::phase_type; - using partitions_type = bi::set, - bi::constant_time_size, // we need this to have bi::auto_unlink on hooks - bi::compare>; + using partitions_type = double_decker; friend class cache::autoupdating_underlying_reader; friend class single_partition_populating_reader; friend class cache_entry; diff --git a/sstables/index_reader.hh b/sstables/index_reader.hh index 3879d0cdb6..37fb995d00 100644 --- a/sstables/index_reader.hh +++ b/sstables/index_reader.hh @@ -299,7 +299,7 @@ public: // Less-comparator for lookups in the partition index. class index_comparator { - dht::ring_position_comparator _tri_cmp; + dht::ring_position_comparator_for_sstables _tri_cmp; public: index_comparator(const schema& s) : _tri_cmp(s) {} diff --git a/test/boost/bptree_test.cc b/test/boost/bptree_test.cc new file mode 100644 index 0000000000..e398cbfe6e --- /dev/null +++ b/test/boost/bptree_test.cc @@ -0,0 +1,332 @@ + +/* + * Copyright (C) 2020 ScyllaDB + */ + +/* + * This file is part of Scylla. + * + * Scylla is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * Scylla is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Scylla. If not, see . 
+ */ + +#define BOOST_TEST_MODULE bptree + +#include +#include + +#include "utils/bptree.hh" +#include "test/unit/tree_test_key.hh" + +struct int_compare { + bool operator()(const int& a, const int& b) const noexcept { return a < b; } +}; + +using namespace bplus; +using test_key = tree_test_key_base; +using test_tree = tree; + +BOOST_AUTO_TEST_CASE(test_ops_empty_tree) { + /* Sanity checks for no nullptr dereferences */ + test_tree t(int_compare{}); + t.erase(1); + t.find(1); +} + +BOOST_AUTO_TEST_CASE(test_double_insert) { + /* No assertions should happen in ~tree */ + test_tree t(int_compare{}); + auto i = t.emplace(1, 1); + BOOST_REQUIRE(i.second); + i = t.emplace(1, 1); + BOOST_REQUIRE(!i.second); + t.erase(1); +} + +BOOST_AUTO_TEST_CASE(test_cookie_find) { + struct int_to_key_compare { + bool operator()(const test_key& a, const int& b) const noexcept { return (int)a < b; } + bool operator()(const int& a, const test_key& b) const noexcept { return a < (int)b; } + bool operator()(const test_key& a, const test_key& b) const noexcept { + test_key_compare cmp; + return cmp(a, b); + } + }; + + using test_tree = tree; + + test_tree t(int_to_key_compare{}); + t.emplace(test_key{1}, 132); + + auto i = t.find(1); + BOOST_REQUIRE(*i == 132); +} + +BOOST_AUTO_TEST_CASE(test_double_erase) { + test_tree t(int_compare{}); + t.emplace(1, 1); + t.emplace(2, 2); + auto i = t.erase(1); + BOOST_REQUIRE(*i == 2); + i = t.erase(1); + BOOST_REQUIRE(i == t.end()); + i = t.erase(2); + BOOST_REQUIRE(i == t.end()); + t.erase(2); +} + +BOOST_AUTO_TEST_CASE(test_remove_corner_case) { + /* Sanity check for erasure to be precise */ + test_tree t(int_compare{}); + t.emplace(1, 1); + t.emplace(2, 123); + t.emplace(3, 3); + t.erase(1); + t.erase(3); + auto f = t.find(2); + BOOST_REQUIRE(*f == 123); + t.erase(2); +} + +BOOST_AUTO_TEST_CASE(test_end_iterator) { + /* Check std::prev(end()) */ + test_tree t(int_compare{}); + t.emplace(1, 123); + auto i = std::prev(t.end()); + BOOST_REQUIRE(*i == 
123); + t.erase(1); +} + +BOOST_AUTO_TEST_CASE(test_next_to_end_iterator) { + /* Same, but with "artificial" end iterator */ + test_tree t(int_compare{}); + auto i = t.emplace(1, 123).first; + i++; + BOOST_REQUIRE(i == t.end()); + i--; + BOOST_REQUIRE(*i == 123); + t.erase(1); +} + +BOOST_AUTO_TEST_CASE(test_clear) { + /* Quick check for tree::clear */ + test_tree t(int_compare{}); + + for (int i = 0; i < 32; i++) { + t.emplace(i, i); + } + + t.clear(); +} + +BOOST_AUTO_TEST_CASE(test_post_clear) { + /* Check that tree is work-able after clear */ + test_tree t(int_compare{}); + + t.emplace(1, 1); + t.clear(); + t.emplace(2, 2); + t.erase(2); +} + +BOOST_AUTO_TEST_CASE(test_iterator_erase) { + /* Check iterator::erase */ + test_tree t(int_compare{}); + auto it = t.emplace(2, 2); + t.emplace(1, 321); + it.first.erase(int_compare{}); + BOOST_REQUIRE(*t.find(1) == 321); + t.erase(1); +} + +BOOST_AUTO_TEST_CASE(test_iterator_equal) { + test_tree t(int_compare{}); + auto i1 = t.emplace(1, 1); + auto i2 = t.emplace(2, 2); + auto i3 = t.find(1); + BOOST_REQUIRE(i1.first == i3); + BOOST_REQUIRE(i1.first != i2.first); +} + +BOOST_AUTO_TEST_CASE(test_lower_bound) { + test_tree t(int_compare{}); + t.emplace(1, 11); + t.emplace(3, 13); + + bool match; + BOOST_REQUIRE(*t.lower_bound(0, match) == 11 && !match); + BOOST_REQUIRE(*t.lower_bound(1, match) == 11 && match); + BOOST_REQUIRE(*t.lower_bound(2, match) == 13 && !match); + BOOST_REQUIRE(*t.lower_bound(3, match) == 13 && match); + BOOST_REQUIRE(t.lower_bound(4, match) == t.end() && !match); +} + +BOOST_AUTO_TEST_CASE(test_upper_bound) { + test_tree t(int_compare{}); + t.emplace(1, 11); + t.emplace(3, 13); + + BOOST_REQUIRE(*t.upper_bound(0) == 11); + BOOST_REQUIRE(*t.upper_bound(1) == 13); + BOOST_REQUIRE(*t.upper_bound(2) == 13); + BOOST_REQUIRE(t.upper_bound(3) == t.end()); + BOOST_REQUIRE(t.upper_bound(4) == t.end()); +} + +BOOST_AUTO_TEST_CASE(test_insert_iterator_index) { + /* Check insertion iterator ++ and duplicate key 
*/ + test_tree t(int_compare{}); + t.emplace(1, 10); + t.emplace(3, 13); + auto i = t.emplace(2, 2).first; + i++; + BOOST_REQUIRE(*i == 13); + auto i2 = t.emplace(2, 2); /* 2nd insert finds the previous */ + BOOST_REQUIRE(!i2.second); + i2.first++; + BOOST_REQUIRE(*(i2.first) == 13); +} + +BOOST_AUTO_TEST_CASE(test_insert_before) { + /* Check iterator::insert_before */ + test_tree t(int_compare{}); + auto i3 = t.emplace(3, 13).first; + auto i2 = i3.emplace_before(2, int_compare{}, 12); + BOOST_REQUIRE(++i2 == i3); + BOOST_REQUIRE(*i3 == 13); + BOOST_REQUIRE(*--i2 == 12); + BOOST_REQUIRE(*--i3 == 12); +} + +BOOST_AUTO_TEST_CASE(test_insert_before_end) { + /* The same but for end() iterator */ + test_tree t(int_compare{}); + auto i = t.emplace(1, 1).first; + auto i2 = t.end().emplace_before(2, int_compare{}, 12); + BOOST_REQUIRE(++i == i2); + BOOST_REQUIRE(++i2 == t.end()); +} + +BOOST_AUTO_TEST_CASE(test_insert_before_end_empty) { + /* The same, but for empty tree */ + test_tree t(int_compare{}); + auto i = t.end().emplace_before(42, int_compare{}, 142); + BOOST_REQUIRE(i == t.begin()); + t.erase(42); +} + +BOOST_AUTO_TEST_CASE(test_iterators) { + test_tree t(int_compare{}); + + for (auto i = t.rbegin(); i != t.rend(); i++) { + BOOST_REQUIRE(false); + } + for (auto i = t.begin(); i != t.end(); i++) { + BOOST_REQUIRE(false); + } + + t.emplace(1, 7); + t.emplace(2, 9); + + { + auto i = t.begin(); + BOOST_REQUIRE(*(i++) == 7); + BOOST_REQUIRE(*(i++) == 9); + BOOST_REQUIRE(i == t.end()); + } + + { + auto i = t.rbegin(); + BOOST_REQUIRE(*(i++) == 9); + BOOST_REQUIRE(*(i++) == 7); + BOOST_REQUIRE(i == t.rend()); + } +} + +/* + * Special test that makes sure "self-iterator" works OK. + * See comment near the bptree::iterator(T* d) constructor + * for details. 
+ */ +class tree_data { + int _key; + int _cookie; +public: + explicit tree_data(int cookie) : _key(-1), _cookie(cookie) {} + tree_data(int key, int cookie) : _key(key), _cookie(cookie) {} + int cookie() const { return _cookie; } + int key() const { + assert(_key != -1); + return _key; + } +}; + +BOOST_AUTO_TEST_CASE(test_data_self_iterator) { + using test_tree = tree; + + test_tree t(int_compare{}); + auto i = t.emplace(1, 42); + BOOST_REQUIRE(i.second); + + tree_data* d = &(*i.first); + BOOST_REQUIRE(d->cookie() == 42); + + test_tree::iterator di(d); + BOOST_REQUIRE(di->cookie() == 42); + + di.erase(int_compare{}); + BOOST_REQUIRE(t.find(1) == t.end()); +} + +BOOST_AUTO_TEST_CASE(test_insert_before_nokey) { + using test_tree = tree; + + test_tree t(int_compare{}); + auto i = t.emplace(2, 52).first; + auto ni = i.emplace_before(int_compare{}, 1, 42); + BOOST_REQUIRE(ni->cookie() == 42); + ni++; + BOOST_REQUIRE(ni == i); +} + + +BOOST_AUTO_TEST_CASE(test_self_iterator_rover) { + test_tree t(int_compare{}); + auto i = t.emplace(2, 42).first; + unsigned long* d = &(*i); + test_tree::iterator di(d); + + i = di.emplace_before(1, int_compare{}, 31); + BOOST_REQUIRE(*i == 31); + BOOST_REQUIRE(*(++i) == 42); + BOOST_REQUIRE(++i == t.end()); + BOOST_REQUIRE(++di == t.end()); +} + +BOOST_AUTO_TEST_CASE(test_erase_range) { + /* Quick check for tree::erase(from, to) */ + test_tree t(int_compare{}); + + for (int i = 0; i < 32; i++) { + t.emplace(i, i); + } + + auto b = t.find(8); + auto e = t.find(25); + t.erase(b, e); + + BOOST_REQUIRE(*t.find(7) == 7); + BOOST_REQUIRE(t.find(8) == t.end()); + BOOST_REQUIRE(t.find(24) == t.end()); + BOOST_REQUIRE(*t.find(25) == 25); +} diff --git a/test/boost/double_decker_test.cc b/test/boost/double_decker_test.cc new file mode 100644 index 0000000000..0f18c69dad --- /dev/null +++ b/test/boost/double_decker_test.cc @@ -0,0 +1,397 @@ + +/* + * Copyright (C) 2020 ScyllaDB + */ + +/* + * This file is part of Scylla. 
+ * + * Scylla is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * Scylla is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Scylla. If not, see . + */ + +#define BOOST_TEST_MODULE double_decker + +#include +#include +#include +#include + +#include "utils/double-decker.hh" +#include "test/lib/random_utils.hh" + +class compound_key { +public: + int key; + std::string sub_key; + + compound_key(int k, std::string sk) noexcept : key(k), sub_key(sk) {} + + compound_key(const compound_key& other) = delete; + compound_key(compound_key&& other) noexcept : key(other.key), sub_key(std::move(other.sub_key)) {} + + compound_key& operator=(const compound_key& other) = delete; + compound_key& operator=(compound_key&& other) noexcept { + key = other.key; + sub_key = std::move(other.sub_key); + return *this; + } + + std::string format() const { + return seastar::format("{}.{}", key, sub_key); + } + + bool operator==(const compound_key& other) const { + return key == other.key && sub_key == other.sub_key; + } + + bool operator!=(const compound_key& other) const { return !(*this == other); } + + struct compare { + int operator()(const int& a, const int& b) const { return a - b; } + int operator()(const int& a, const compound_key& b) const { return a - b.key; } + int operator()(const compound_key& a, const int& b) const { return a.key - b; } + + int operator()(const compound_key& a, const compound_key& b) const { + if (a.key != b.key) { + return this->operator()(a.key, b.key); + } else { + return 
a.sub_key.compare(b.sub_key); + } + } + }; + + struct less_compare { + compare cmp; + + template + bool operator()(const A& a, const B& b) const noexcept { + return cmp(a, b) < 0; + } + }; +}; + +class test_data { + compound_key _key; + bool _head = false; + bool _tail = false; + bool _train = false; + + int *_cookie; + int *_cookie2; +public: + bool is_head() const noexcept { return _head; } + bool is_tail() const noexcept { return _tail; } + bool with_train() const noexcept { return _train; } + void set_head(bool v) noexcept { _head = v; } + void set_tail(bool v) noexcept { _tail = v; } + void set_train(bool v) noexcept { _train = v; } + + test_data(int key, std::string sub) : _key(key, sub), _cookie(new int(0)), _cookie2(new int(0)) {} + + test_data(const test_data& other) = delete; + test_data(test_data&& other) noexcept : _key(std::move(other._key)), + _head(other._head), _tail(other._tail), _train(other._train), + _cookie(other._cookie), _cookie2(new int(0)) { + other._cookie = nullptr; + } + + ~test_data() { + if (_cookie != nullptr) { + delete _cookie; + } + delete _cookie2; + } + + bool operator==(const compound_key& k) { return _key == k; } + + test_data& operator=(const test_data& other) = delete; + test_data& operator=(test_data&& other) = delete; + + std::string format() const { return _key.format(); } + + struct compare { + compound_key::compare kcmp; + int operator()(const int& a, const int& b) { return kcmp(a, b); } + int operator()(const compound_key& a, const int& b) { return kcmp(a.key, b); } + int operator()(const int& a, const compound_key& b) { return kcmp(a, b.key); } + int operator()(const compound_key& a, const compound_key& b) { return kcmp(a, b); } + int operator()(const compound_key& a, const test_data& b) { return kcmp(a, b._key); } + int operator()(const test_data& a, const compound_key& b) { return kcmp(a._key, b); } + int operator()(const test_data& a, const test_data& b) { return kcmp(a._key, b._key); } + }; +}; + +using collection 
= double_decker; +using oracle = std::set; + +BOOST_AUTO_TEST_CASE(test_lower_bound) { + collection c(compound_key::less_compare{}); + test_data::compare cmp; + + c.insert(3, test_data(3, "e"), cmp); + c.insert(5, test_data(5, "i"), cmp); + c.insert(5, test_data(5, "o"), cmp); + + collection::bound_hint h; + + BOOST_REQUIRE(*c.lower_bound(compound_key(2, "a"), cmp, h) == compound_key(3, "e") && !h.key_match); + BOOST_REQUIRE(*c.lower_bound(compound_key(3, "a"), cmp, h) == compound_key(3, "e") && h.key_match && !h.key_tail && !h.match); + BOOST_REQUIRE(*c.lower_bound(compound_key(3, "e"), cmp, h) == compound_key(3, "e") && h.key_match && !h.key_tail && h.match); + BOOST_REQUIRE(*c.lower_bound(compound_key(3, "o"), cmp, h) == compound_key(5, "i") && h.key_match && h.key_tail && !h.match); + BOOST_REQUIRE(*c.lower_bound(compound_key(4, "i"), cmp, h) == compound_key(5, "i") && !h.key_match); + BOOST_REQUIRE(*c.lower_bound(compound_key(5, "a"), cmp, h) == compound_key(5, "i") && h.key_match && !h.key_tail && !h.match); + BOOST_REQUIRE(*c.lower_bound(compound_key(5, "i"), cmp, h) == compound_key(5, "i") && h.key_match && !h.key_tail && h.match); + BOOST_REQUIRE(*c.lower_bound(compound_key(5, "l"), cmp, h) == compound_key(5, "o") && h.key_match && !h.key_tail && !h.match); + BOOST_REQUIRE(*c.lower_bound(compound_key(5, "o"), cmp, h) == compound_key(5, "o") && h.key_match && !h.key_tail && h.match); + BOOST_REQUIRE(c.lower_bound(compound_key(5, "q"), cmp, h) == c.end() && h.key_match && h.key_tail); + BOOST_REQUIRE(c.lower_bound(compound_key(6, "q"), cmp, h) == c.end() && !h.key_match); + + c.clear(); +} + +BOOST_AUTO_TEST_CASE(test_upper_bound) { + collection c(compound_key::less_compare{}); + test_data::compare cmp; + + c.insert(3, test_data(3, "e"), cmp); + c.insert(5, test_data(5, "i"), cmp); + c.insert(5, test_data(5, "o"), cmp); + + BOOST_REQUIRE(*c.upper_bound(compound_key(2, "a"), cmp) == compound_key(3, "e")); + BOOST_REQUIRE(*c.upper_bound(compound_key(3, "a"), 
cmp) == compound_key(3, "e")); + BOOST_REQUIRE(*c.upper_bound(compound_key(3, "e"), cmp) == compound_key(5, "i")); + BOOST_REQUIRE(*c.upper_bound(compound_key(3, "o"), cmp) == compound_key(5, "i")); + BOOST_REQUIRE(*c.upper_bound(compound_key(4, "i"), cmp) == compound_key(5, "i")); + BOOST_REQUIRE(*c.upper_bound(compound_key(5, "a"), cmp) == compound_key(5, "i")); + BOOST_REQUIRE(*c.upper_bound(compound_key(5, "i"), cmp) == compound_key(5, "o")); + BOOST_REQUIRE(*c.upper_bound(compound_key(5, "l"), cmp) == compound_key(5, "o")); + BOOST_REQUIRE(c.upper_bound(compound_key(5, "o"), cmp) == c.end()); + BOOST_REQUIRE(c.upper_bound(compound_key(5, "q"), cmp) == c.end()); + BOOST_REQUIRE(c.upper_bound(compound_key(6, "q"), cmp) == c.end()); + + c.clear(); +} +BOOST_AUTO_TEST_CASE(test_self_iterator) { + collection c(compound_key::less_compare{}); + test_data::compare cmp; + + c.insert(1, std::move(test_data(1, "a")), cmp); + c.insert(1, std::move(test_data(1, "b")), cmp); + c.insert(2, std::move(test_data(2, "c")), cmp); + c.insert(3, std::move(test_data(3, "d")), cmp); + c.insert(3, std::move(test_data(3, "e")), cmp); + + auto erase_by_ptr = [&] (int key, std::string sub) { + test_data* d = &*c.find(compound_key(key, sub), cmp); + collection::iterator di(d); + di.erase(compound_key::less_compare{}); + }; + + erase_by_ptr(1, "b"); + erase_by_ptr(2, "c"); + erase_by_ptr(3, "d"); + + auto i = c.begin(); + BOOST_REQUIRE(*i++ == compound_key(1, "a")); + BOOST_REQUIRE(*i++ == compound_key(3, "e")); + BOOST_REQUIRE(i == c.end()); + + c.clear(); +} + +BOOST_AUTO_TEST_CASE(test_end_iterator) { + collection c(compound_key::less_compare{}); + test_data::compare cmp; + + c.insert(1, std::move(test_data(1, "a")), cmp); + auto i = std::prev(c.end()); + BOOST_REQUIRE(*i == compound_key(1, "a")); + + c.clear(); +} + +void validate_sorted(collection& c) { + auto i = c.begin(); + if (i == c.end()) { + return; + } + + while (1) { + auto cur = i; + i++; + if (i == c.end()) { + break; + } + 
test_data::compare cmp; + BOOST_REQUIRE(cmp(*cur, *i) < 0); + } +} + +void compare_with_set(collection& c, oracle& s) { + test_data::compare cmp; + /* All keys must be findable */ + for (auto i = s.begin(); i != s.end(); i++) { + auto j = c.find(*i, cmp); + BOOST_REQUIRE(j != c.end() && *j == *i); + } + + /* Both iterators must coincide */ + auto i = c.begin(); + auto j = s.begin(); + + while (i != c.end()) { + BOOST_REQUIRE(*i == *j); + i++; + j++; + } +} + +BOOST_AUTO_TEST_CASE(test_insert_via_emplace) { + collection c(compound_key::less_compare{}); + test_data::compare cmp; + oracle s; + int nr = 0; + + while (nr < 4000) { + compound_key k(tests::random::get_int(900), tests::random::get_sstring(4)); + + collection::bound_hint h; + auto i = c.lower_bound(k, cmp, h); + + if (i == c.end() || !h.match) { + auto it = c.emplace_before(i, k.key, h, k.key, k.sub_key); + BOOST_REQUIRE(*it == k); + s.insert(std::move(k)); + nr++; + } + } + + compare_with_set(c, s); + c.clear(); +} + +BOOST_AUTO_TEST_CASE(test_insert_and_erase) { + collection c(compound_key::less_compare{}); + test_data::compare cmp; + int nr = 0; + + while (nr < 500) { + compound_key k(tests::random::get_int(100), tests::random::get_sstring(3)); + + if (c.find(k, cmp) == c.end()) { + auto it = c.insert(k.key, std::move(test_data(k.key, k.sub_key)), cmp); + BOOST_REQUIRE(*it == k); + nr++; + } + } + + validate_sorted(c); + + while (nr > 0) { + int n = tests::random::get_int() % nr; + + auto i = c.begin(); + while (n > 0) { + i++; + n--; + } + + i.erase(compound_key::less_compare{}); + nr--; + + validate_sorted(c); + } +} + +BOOST_AUTO_TEST_CASE(test_compaction) { + logalloc::region reg; + with_allocator(reg.allocator(), [&] { + collection c(compound_key::less_compare{}); + test_data::compare cmp; + oracle s; + + { + logalloc::reclaim_lock rl(reg); + + int nr = 0; + + while (nr < 1500) { + compound_key k(tests::random::get_int(400), tests::random::get_sstring(3)); + + if (c.find(k, cmp) == c.end()) { + auto 
it = c.insert(k.key, std::move(test_data(k.key, k.sub_key)), cmp); + BOOST_REQUIRE(*it == k); + s.insert(std::move(k)); + nr++; + } + } + } + + reg.full_compaction(); + + compare_with_set(c, s); + c.clear(); + }); +} + +BOOST_AUTO_TEST_CASE(test_range_erase) { + std::vector keys; + test_data::compare cmp; + + keys.emplace_back(1, "a"); + keys.emplace_back(1, "b"); + keys.emplace_back(1, "c"); + keys.emplace_back(1, "d"); + keys.emplace_back(2, "a"); + keys.emplace_back(2, "b"); + keys.emplace_back(2, "c"); + keys.emplace_back(2, "d"); + keys.emplace_back(2, "e"); + keys.emplace_back(3, "a"); + keys.emplace_back(3, "b"); + keys.emplace_back(3, "c"); + + for (size_t f = 0; f < keys.size(); f++) { + for (size_t t = f; t <= keys.size(); t++) { + collection c(compound_key::less_compare{}); + + for (auto&& k : keys) { + c.insert(k.key, std::move(test_data(k.key, k.sub_key)), cmp); + } + + auto iter_at = [&c] (size_t at) -> collection::iterator { + auto it = c.begin(); + for (size_t i = 0; i < at; i++, it++) ; + return it; + }; + + auto n = c.erase(iter_at(f), iter_at(t)); + + auto r = c.begin(); + for (size_t i = 0; i < keys.size(); i++) { + if (!(i >= f && i < t)) { + if (i == t) { + BOOST_REQUIRE(*n == keys[i]); + } + BOOST_REQUIRE(*(r++) == keys[i]); + } + } + if (t == keys.size()) { + BOOST_REQUIRE(n == c.end()); + } + BOOST_REQUIRE(r == c.end()); + } + } +} diff --git a/test/boost/intrusive_array_test.cc b/test/boost/intrusive_array_test.cc new file mode 100644 index 0000000000..66297060ca --- /dev/null +++ b/test/boost/intrusive_array_test.cc @@ -0,0 +1,243 @@ +/* + * Copyright (C) 2020 ScyllaDB + */ + +/* + * This file is part of Scylla. + * + * Scylla is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
+ * + * Scylla is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Scylla. If not, see . + */ + +#include +#include +#include + +#include "utils/intrusive-array.hh" +#include "utils/logalloc.hh" + +class element { + bool _head = false; + bool _tail = false; + bool _train = false; + + long _data; + int *_cookie; + int *_cookie2; + +public: + explicit element(long val) : _data(val), _cookie(new int(0)), _cookie2(new int(0)) { } + + element(const element& other) = delete; + element(element&& other) noexcept : _head(other._head), _tail(other._tail), _train(other._train), + _data(other._data), _cookie(other._cookie), _cookie2(new int(0)) { + other._cookie = nullptr; + } + + ~element() { + if (_cookie != nullptr) { + delete _cookie; + } + + delete _cookie2; + } + + bool is_head() const noexcept { return _head; } + void set_head(bool v) noexcept { _head = v; } + bool is_tail() const noexcept { return _tail; } + void set_tail(bool v) noexcept { _tail = v; } + bool with_train() const noexcept { return _train; } + void set_train(bool v) noexcept { _train = v; } + + bool operator==(long v) const { return v == _data; } + long operator*() const { return _data; } + + bool bound_check(int idx, int size) { + return ((idx == 0) == is_head()) && ((idx == size - 1) == is_tail()); + } +}; + +using test_array = intrusive_array; + +static bool size_check(test_array& a, size_t size, unsigned short tlen) { + return a[size - 1].is_tail() && a.size() == size && + size_for_allocation_strategy(a) == (size + tlen) * sizeof(element) && + ((tlen != 0) == a[0].with_train()) && + ((tlen == 0) || *reinterpret_cast(&a[size]) == tlen); +} + +void show(const char *pfx, test_array& a, int sz) { + int i; + + fmt::print("{}", pfx); + for (i 
= 0; i < sz; i++) { + fmt::print("{}{}{}", a[i].is_head() ? 'H' : ' ', *a[i], a[i].is_tail() ? 'T' : ' '); + } + if (a[0].with_train()) { + fmt::print(" ~{}", *reinterpret_cast(&a[i])); + } + fmt::print("\n"); +} + +SEASTAR_THREAD_TEST_CASE(test_basic_construct) { + test_array array(12); + + for (auto i = array.begin(); i != array.end(); i++) { + BOOST_REQUIRE(*i == 12); + } +} + +test_array* grow(test_array& from, size_t nsize, int npos, long ndat) { + BOOST_REQUIRE(from.size() + 1 == nsize); + auto ptr = current_allocator().alloc(&get_standard_migrator(), sizeof(element) * nsize, alignof(test_array)); + return new (ptr) test_array(from, test_array::grow_tag{npos}, ndat); +} + +test_array* shrink(test_array& from, size_t nszie, int spos) { + BOOST_REQUIRE(from.size() - 1 == nszie); + auto ptr = current_allocator().alloc(&get_standard_migrator(), sizeof(element) * nszie, alignof(test_array)); + return new (ptr) test_array(from, test_array::shrink_tag{spos}); +} + +void grow_shrink_and_check(test_array& cur, int size, int depth) { + for (int i = 0; i <= size; i++) { + long nel = size + 12; + test_array* narr = grow(cur, size + 1, i, nel); + int idx = 0; + + BOOST_REQUIRE(size_check(*narr, size + 1, 0)); + + for (auto ni = narr->begin(); ni != narr->end(); ni++) { + if (idx == i) { + BOOST_REQUIRE(*ni == nel); + } else if (idx < i) { + BOOST_REQUIRE(*ni == *cur[idx]); + } else { + BOOST_REQUIRE(*ni == *cur[idx - 1]); + } + + BOOST_REQUIRE(ni->bound_check(idx, size + 1)); + idx++; + } + + if (size < depth) { + grow_shrink_and_check(*narr, size + 1, depth); + } + + current_allocator().destroy(narr); + } + + if (size > 1) { + for (int i = 0; i < size; i++) { + test_array* narr = shrink(cur, size - 1, i); + int idx = 0; + + BOOST_REQUIRE(size_check(*narr, size - 1, 0)); + + for (auto ni = narr->begin(); ni != narr->end(); ni++) { + if (idx < i) { + BOOST_REQUIRE(*ni == *cur[idx]); + } else { + /* Slot i was dropped from cur, so the tail */ + /* of narr maps to the next element of cur. */ + BOOST_REQUIRE(*ni == *cur[idx + 1]); + } 
+ + BOOST_REQUIRE(ni->bound_check(idx, size - 1)); + idx++; + } + + current_allocator().destroy(narr); + } + } +} + +SEASTAR_THREAD_TEST_CASE(test_grow_shrink_construct) { + test_array array(12); + grow_shrink_and_check(array, 1, 5); +} + +SEASTAR_THREAD_TEST_CASE(test_erase) { + test_array a1(10); + test_array *a2 = grow(a1, 2, 1, 20); + test_array *a3 = grow(*a2, 3, 2, 30); + + for (int i = 0; i < 4; i++) { + for (int j = 0; j < 3; j++) { + for (int k = 0; k < 2; k++) { + std::vector x({10, 20, 30, 40}); + test_array *a4 = grow(*a3, 4, 3, 40); + + auto test_fn = [&] (int idx, int sz) { + a4->erase(idx); + x.erase(x.begin() + idx); + BOOST_REQUIRE(size_check(*a4, sz, 4 - sz)); + for (int a = 0; a < sz; a++) { + BOOST_REQUIRE(x[a] == *(*a4)[a]); + } + }; + + test_fn(i, 3); + test_fn(j, 2); + test_fn(k, 1); + + current_allocator().destroy(a4); + } + } + } + + current_allocator().destroy(a3); + current_allocator().destroy(a2); +} + +SEASTAR_THREAD_TEST_CASE(test_lower_bound) { + test_array a1(12); + struct compare { + int operator()(const element& a, const element& b) const { return *a - *b; } + }; + + test_array *a2 = grow(a1, 2, 1, 14); + + auto i = a2->lower_bound(element(13), compare{}); + BOOST_REQUIRE(*i == 14 && a2->index_of(i) == 1); + + test_array *a3 = grow(*a2, 3, 2, 17); + + bool match; + BOOST_REQUIRE(*a3->lower_bound(element(11), compare{}, match) == 12 && !match); + BOOST_REQUIRE(*a3->lower_bound(element(12), compare{}, match) == 12 && match); + BOOST_REQUIRE(*a3->lower_bound(element(13), compare{}, match) == 14 && !match); + BOOST_REQUIRE(*a3->lower_bound(element(14), compare{}, match) == 14 && match); + BOOST_REQUIRE(*a3->lower_bound(element(15), compare{}, match) == 17 && !match); + BOOST_REQUIRE(*a3->lower_bound(element(16), compare{}, match) == 17 && !match); + BOOST_REQUIRE(*a3->lower_bound(element(17), compare{}, match) == 17 && match); + BOOST_REQUIRE(a3->lower_bound(element(18), compare{}, match) == a3->end()); + + 
current_allocator().destroy(a3); + current_allocator().destroy(a2); +} + +SEASTAR_THREAD_TEST_CASE(test_from_element) { + test_array a1(12); + test_array *a2 = grow(a1, 2, 1, 14); + test_array *a3 = grow(*a2, 3, 2, 17); + + element* i = &((*a3)[2]); + BOOST_REQUIRE(*i == 17); + int idx; + test_array& x = test_array::from_element(i, idx); + BOOST_REQUIRE(&x == a3 && idx == 2); + + current_allocator().destroy(a3); + current_allocator().destroy(a2); +} diff --git a/test/perf/memory_footprint_test.cc b/test/perf/memory_footprint_test.cc index fcfa55405a..104e7a5cb0 100644 --- a/test/perf/memory_footprint_test.cc +++ b/test/perf/memory_footprint_test.cc @@ -57,11 +57,13 @@ class size_calculator { public: static void print_cache_entry_size() { std::cout << prefix() << "sizeof(cache_entry) = " << sizeof(cache_entry) << "\n"; + std::cout << prefix() << "sizeof(memtable_entry) = " << sizeof(memtable_entry) << "\n"; + std::cout << prefix() << "sizeof(bptree::node) = " << sizeof(row_cache::partitions_type::outer_tree::node) << "\n"; + std::cout << prefix() << "sizeof(bptree::data) = " << sizeof(row_cache::partitions_type::outer_tree::data) << "\n"; { nest n; std::cout << prefix() << "sizeof(decorated_key) = " << sizeof(dht::decorated_key) << "\n"; - std::cout << prefix() << "sizeof(cache_link_type) = " << sizeof(cache_entry::cache_link_type) << "\n"; print_mutation_partition_size(); } diff --git a/test/perf/perf.hh b/test/perf/perf.hh index 9de2b25410..e73ac859ae 100644 --- a/test/perf/perf.hh +++ b/test/perf/perf.hh @@ -24,7 +24,10 @@ #include #include #include +#include #include "seastarx.hh" +#include "utils/extremum_tracking.hh" +#include "utils/estimated_histogram.hh" #include #include @@ -126,3 +129,71 @@ std::vector time_parallel(Func func, unsigned concurrency_per_core, int } return results; } + +template +auto duration_in_seconds(Func&& f) { + using clk = std::chrono::steady_clock; + auto start = clk::now(); + f(); + auto end = clk::now(); + return 
std::chrono::duration_cast>(end - start); +} + +class scheduling_latency_measurer : public weakly_referencable { + using clk = std::chrono::steady_clock; + clk::time_point _last = clk::now(); + utils::estimated_histogram _hist{300}; + min_max_tracker _minmax; + bool _stop = false; +private: + void schedule_tick(); + void tick() { + auto old = _last; + _last = clk::now(); + auto latency = _last - old; + _minmax.update(latency); + _hist.add(latency.count()); + if (!_stop) { + schedule_tick(); + } + } +public: + void start() { + schedule_tick(); + } + void stop() { + _stop = true; + later().get(); // so that the last scheduled tick is counted + } + const utils::estimated_histogram& histogram() const { + return _hist; + } + clk::duration min() const { return _minmax.min(); } + clk::duration max() const { return _minmax.max(); } +}; + +void scheduling_latency_measurer::schedule_tick() { + seastar::schedule(make_task(default_scheduling_group(), [self = weak_from_this()] () mutable { + if (self) { + self->tick(); + } + })); +} + +std::ostream& operator<<(std::ostream& out, const scheduling_latency_measurer& slm) { + auto to_ms = [] (int64_t nanos) { + return float(nanos) / 1e6; + }; + return out << sprint("{count: %d, " + //"min: %.6f [ms], " + //"50%%: %.6f [ms], " + //"90%%: %.6f [ms], " + "99%%: %.6f [ms], " + "max: %.6f [ms]}", + slm.histogram().count(), + //to_ms(slm.min().count()), + //to_ms(slm.histogram().percentile(0.5)), + //to_ms(slm.histogram().percentile(0.9)), + to_ms(slm.histogram().percentile(0.99)), + to_ms(slm.max().count())); +} diff --git a/test/perf/perf_bptree.cc b/test/perf/perf_bptree.cc new file mode 100644 index 0000000000..51271da2ea --- /dev/null +++ b/test/perf/perf_bptree.cc @@ -0,0 +1,240 @@ +/* + * Copyright (C) 2020 ScyllaDB + */ + +/* + * This file is part of Scylla. 
+ * + * Scylla is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * Scylla is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Scylla. If not, see . + */ + +#include +#include +#include +#include +#include +#include +#include "perf.hh" + +using per_key_t = int64_t; + +struct key_compare { + bool operator()(const per_key_t& a, const per_key_t& b) const noexcept { return a < b; } +}; + +#include "utils/bptree.hh" + +using namespace bplus; +using namespace seastar; + +constexpr int TEST_NODE_SIZE = 4; + +/* On small node sizes (TEST_NODE_SIZE in this test) linear search works better */ +using test_tree = tree; + +class collection_tester { +public: + virtual void insert(per_key_t k) = 0; + virtual void lower_bound(per_key_t k) = 0; + virtual void erase(per_key_t k) = 0; + virtual void drain(int batch) = 0; + virtual void show_stats() = 0; + virtual ~collection_tester() {}; +}; + +class bptree_tester : public collection_tester { + test_tree _t; +public: + bptree_tester() : _t(key_compare{}) {} + virtual void insert(per_key_t k) override { _t.emplace(k, 0); } + virtual void lower_bound(per_key_t k) override { + auto i = _t.lower_bound(k); + assert(i != _t.end()); + } + virtual void erase(per_key_t k) override { _t.erase(k); } + virtual void drain(int batch) override { + int x = 0; + auto i = _t.begin(); + while (i != _t.end()) { + i = i.erase(key_compare{}); + if (++x % batch == 0) { + seastar::thread::yield(); + } + } + } + virtual void show_stats() override { /* prints node/leaf counts and a per-fill-level breakdown with percentages */ + struct bplus::stats st = _t.get_stats(); + fmt::print("nodes: {}\n",
st.nodes); + for (int i = 0; i < (int)st.nodes_filled.size(); i++) { + fmt::print(" {}: {} ({}%)\n", i, st.nodes_filled[i], st.nodes != 0 ? st.nodes_filled[i] * 100 / st.nodes : 0); /* guard: avoid integer division by a zero count */ + } + fmt::print("leaves: {}\n", st.leaves); + for (int i = 0; i < (int)st.leaves_filled.size(); i++) { + fmt::print(" {}: {} ({}%)\n", i, st.leaves_filled[i], st.leaves != 0 ? st.leaves_filled[i] * 100 / st.leaves : 0); /* guard: avoid integer division by a zero count */ + } + fmt::print("datas: {}\n", st.datas); + } + virtual ~bptree_tester() { + _t.clear(); + } +}; + +class set_tester : public collection_tester { + std::set _s; +public: + virtual void insert(per_key_t k) override { _s.insert(k); } + virtual void lower_bound(per_key_t k) override { + auto i = _s.lower_bound(k); + assert(i != _s.end()); + } + virtual void erase(per_key_t k) override { _s.erase(k); } + virtual void drain(int batch) override { + int x = 0; + auto i = _s.begin(); + while (i != _s.end()) { + i = _s.erase(i); + if (++x % batch == 0) { + seastar::thread::yield(); + } + } + } + virtual void show_stats() override { } + virtual ~set_tester() = default; +}; + +class map_tester : public collection_tester { + std::map _m; +public: + virtual void insert(per_key_t k) override { _m[k] = 0; } + virtual void lower_bound(per_key_t k) override { + auto i = _m.lower_bound(k); + assert(i != _m.end()); + } + virtual void erase(per_key_t k) override { _m.erase(k); } + virtual void drain(int batch) override { + int x = 0; + auto i = _m.begin(); + while (i != _m.end()) { + i = _m.erase(i); + if (++x % batch == 0) { + seastar::thread::yield(); + } + } + } + virtual void show_stats() override { } + virtual ~map_tester() = default; +}; + +int main(int argc, char **argv) { + namespace bpo = boost::program_options; + app_template app; + app.add_options() + ("count", bpo::value()->default_value(5000000), "number of keys to fill the tree with") + ("batch", bpo::value()->default_value(50), "number of operations between deferring points") + ("iters", bpo::value()->default_value(1), "number of iterations") + ("col",
bpo::value()->default_value("bptree"), "collection to test") + ("test", bpo::value()->default_value("erase"), "what to test (erase, drain, find)") + ("stats", bpo::value()->default_value(false), "show stats"); + + return app.run(argc, argv, [&app] { + auto count = app.configuration()["count"].as(); + auto iters = app.configuration()["iters"].as(); + auto batch = app.configuration()["batch"].as(); + auto col = app.configuration()["col"].as(); + auto tst = app.configuration()["test"].as(); + auto stats = app.configuration()["stats"].as(); + + return seastar::async([count, iters, batch, col, tst, stats] { + std::unique_ptr c; + + if (col == "bptree") { + c = std::make_unique(); + } else if (col == "set") { + c = std::make_unique(); + } else if (col == "map") { + c = std::make_unique(); + } else { + fmt::print("Unknown collection\n"); + return; + } + + std::vector keys; + + for (per_key_t i = 0; i < count; i++) { + keys.push_back(i + 1); + } + + std::random_device rd; + std::mt19937 g(rd()); + + fmt::print("Inserting {:d} k:v pairs into {} {:d} times\n", count, col, iters); + + for (auto rep = 0; rep < iters; rep++) { + std::shuffle(keys.begin(), keys.end(), g); + seastar::thread::yield(); + + auto d = duration_in_seconds([&] { + for (int i = 0; i < count; i++) { + c->insert(keys[i]); + if ((i + 1) % batch == 0) { + seastar::thread::yield(); + } + } + }); + + fmt::print("fill: {:.6f} ms\n", d.count() * 1000); + + if (stats) { + c->show_stats(); + } + + if (tst == "erase") { + std::shuffle(keys.begin(), keys.end(), g); + seastar::thread::yield(); + + d = duration_in_seconds([&] { + for (int i = 0; i < count; i++) { + c->erase(keys[i]); + if ((i + 1) % batch == 0) { + seastar::thread::yield(); + } + } + }); + + fmt::print("erase: {:.6f} ms\n", d.count() * 1000); + } else if (tst == "drain") { + d = duration_in_seconds([&] { + c->drain(batch); + }); + + fmt::print("drain: {:.6f} ms\n", d.count() * 1000); + } else if (tst == "find") { + std::shuffle(keys.begin(), 
keys.end(), g); + seastar::thread::yield(); + + d = duration_in_seconds([&] { + for (int i = 0; i < count; i++) { + c->lower_bound(keys[i]); + if ((i + 1) % batch == 0) { + seastar::thread::yield(); + } + } + }); + + fmt::print("find: {:.6f} ms\n", d.count() * 1000); + } + } + }); + }); +} diff --git a/test/perf/perf_row_cache_update.cc b/test/perf/perf_row_cache_update.cc index e4d37fad12..ad4d92115b 100644 --- a/test/perf/perf_row_cache_update.cc +++ b/test/perf/perf_row_cache_update.cc @@ -19,16 +19,13 @@ * along with Scylla. If not, see . */ -#include #include #include #include #include -#include #include #include "utils/managed_bytes.hh" -#include "utils/extremum_tracking.hh" #include "utils/logalloc.hh" #include "row_cache.hh" #include "log.hh" @@ -41,74 +38,6 @@ static const int update_iterations = 16; static const int cell_size = 128; static bool cancelled = false; -template -auto duration_in_seconds(Func&& f) { - using clk = std::chrono::steady_clock; - auto start = clk::now(); - f(); - auto end = clk::now(); - return std::chrono::duration_cast>(end - start); -} - -class scheduling_latency_measurer : public weakly_referencable { - using clk = std::chrono::steady_clock; - clk::time_point _last = clk::now(); - utils::estimated_histogram _hist{300}; - min_max_tracker _minmax; - bool _stop = false; -private: - void schedule_tick(); - void tick() { - auto old = _last; - _last = clk::now(); - auto latency = _last - old; - _minmax.update(latency); - _hist.add(latency.count()); - if (!_stop) { - schedule_tick(); - } - } -public: - void start() { - schedule_tick(); - } - void stop() { - _stop = true; - later().get(); // so that the last scheduled tick is counted - } - const utils::estimated_histogram& histogram() const { - return _hist; - } - clk::duration min() const { return _minmax.min(); } - clk::duration max() const { return _minmax.max(); } -}; - -void scheduling_latency_measurer::schedule_tick() { - seastar::schedule(make_task(default_scheduling_group(), 
[self = weak_from_this()] () mutable { - if (self) { - self->tick(); - } - })); -} - -std::ostream& operator<<(std::ostream& out, const scheduling_latency_measurer& slm) { - auto to_ms = [] (int64_t nanos) { - return float(nanos) / 1e6; - }; - return out << sprint("{count: %d, " - //"min: %.6f [ms], " - //"50%%: %.6f [ms], " - //"90%%: %.6f [ms], " - "99%%: %.6f [ms], " - "max: %.6f [ms]}", - slm.histogram().count(), - //to_ms(slm.min().count()), - //to_ms(slm.histogram().percentile(0.5)), - //to_ms(slm.histogram().percentile(0.9)), - to_ms(slm.histogram().percentile(0.99)), - to_ms(slm.max().count())); -} - template void run_test(const sstring& name, schema_ptr s, MutationGenerator&& gen) { cache_tracker tracker; diff --git a/test/unit/bptree_compaction_test.cc b/test/unit/bptree_compaction_test.cc new file mode 100644 index 0000000000..0687e43b08 --- /dev/null +++ b/test/unit/bptree_compaction_test.cc @@ -0,0 +1,207 @@ +/* + * Copyright (C) 2020 ScyllaDB + */ + +/* + * This file is part of Scylla. + * + * Scylla is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * Scylla is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Scylla. If not, see . 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "utils/logalloc.hh" + +constexpr int TEST_NODE_SIZE = 7; + +#include "tree_test_key.hh" +#include "utils/bptree.hh" +#include "bptree_validation.hh" + +using namespace bplus; +using namespace seastar; + +using test_key = tree_test_key_base; + +class test_data { + int _value; +public: + test_data() : _value(0) {} + test_data(test_key& k) : _value((int)k + 10) {} + + operator unsigned long() const { return _value; } + bool match_key(const test_key& k) const { return _value == (int)k + 10; } +}; +using test_tree = tree; +using test_validator = validator; + +class reference { + reference* _ref = nullptr; +public: + reference() = default; + reference(const reference& other) = delete; + + reference(reference&& other) noexcept : _ref(other._ref) { + if (_ref != nullptr) { + _ref->_ref = this; + } + other._ref = nullptr; + } + + ~reference() { + if (_ref != nullptr) { + _ref->_ref = nullptr; + } + } + + void link(reference& other) { + assert(_ref == nullptr); + _ref = &other; + other._ref = this; + } + + reference* get() { + assert(_ref != nullptr); + return _ref; + } +}; + +class tree_pointer { + reference _ref; + + class tree_wrapper { + friend class tree_pointer; + test_tree _tree; + reference _ref; + public: + tree_wrapper() : _tree(test_key_compare{}) {} + }; + + tree_wrapper* get_wrapper() { + return boost::intrusive::get_parent_from_member(_ref.get(), &tree_wrapper::_ref); + } + +public: + + tree_pointer(const tree_pointer& other) = delete; + tree_pointer(tree_pointer&& other) = delete; + + tree_pointer() { + tree_wrapper *t = current_allocator().construct(); + _ref.link(t->_ref); + } + + test_tree* operator->() { + tree_wrapper *tw = get_wrapper(); + return &tw->_tree; + } + + test_tree& operator*() { + tree_wrapper *tw = get_wrapper(); + return tw->_tree; + } + + ~tree_pointer() { + tree_wrapper *tw = get_wrapper(); + current_allocator().destroy(tw); + } +}; + +int 
main(int argc, char **argv) { + namespace bpo = boost::program_options; + app_template app; + app.add_options() + ("count", bpo::value()->default_value(10000), "number of keys to fill the tree with") + ("iters", bpo::value()->default_value(13), "number of iterations") + ("verb", bpo::value()->default_value(false), "be verbose"); + + return app.run(argc, argv, [&app] { + auto count = app.configuration()["count"].as(); + auto iter = app.configuration()["iters"].as(); + auto verb = app.configuration()["verb"].as(); + + return seastar::async([count, iter, verb] { + std::vector keys; + for (int i = 0; i < count; i++) { + keys.push_back(i + 1); + } + + std::random_device rd; + std::mt19937 g(rd()); + + fmt::print("Compacting {:d} k:v pairs {:d} times\n", count, iter); + + test_validator tv; + + logalloc::region mem; + + with_allocator(mem.allocator(), [&] { + tree_pointer t; + + for (auto rep = 0; rep < iter; rep++) { + { + std::shuffle(keys.begin(), keys.end(), g); + + logalloc::reclaim_lock rl(mem); + + for (int i = 0; i < count; i++) { + test_key k(keys[i]); + + auto ti = t->emplace(std::move(copy_key(k)), k); + assert(ti.second); + seastar::thread::maybe_yield(); + } + } + + mem.full_compaction(); + + if (verb) { + fmt::print("After fill + compact\n"); + tv.print_tree(*t, '|'); + } + + tv.validate(*t); + + { + std::shuffle(keys.begin(), keys.end(), g); + + logalloc::reclaim_lock rl(mem); + + for (int i = 0; i < count; i++) { + test_key k(keys[i]); + + t->erase(k); + seastar::thread::maybe_yield(); + } + } + + mem.full_compaction(); + + if (verb) { + fmt::print("After erase + compact\n"); + tv.print_tree(*t, '|'); + } + + tv.validate(*t); + } + }); + }); + }); +} diff --git a/test/unit/bptree_stress_test.cc b/test/unit/bptree_stress_test.cc new file mode 100644 index 0000000000..50e7b5eeda --- /dev/null +++ b/test/unit/bptree_stress_test.cc @@ -0,0 +1,232 @@ +/* + * Copyright (C) 2020 ScyllaDB + */ + +/* + * This file is part of Scylla. 
+ * + * Scylla is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * Scylla is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Scylla. If not, see . + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +constexpr int TEST_NODE_SIZE = 16; + +#include "tree_test_key.hh" +#include "utils/bptree.hh" +#include "bptree_validation.hh" + +using namespace bplus; +using namespace seastar; + +using test_key = tree_test_key_base; + +class test_data { + int _value; +public: + test_data() : _value(0) {} + test_data(test_key& k) : _value((int)k + 10) {} + + operator unsigned long() const { return _value; } + bool match_key(const test_key& k) const { return _value == (int)k + 10; } +}; + +std::ostream& operator<<(std::ostream& os, test_data d) { + os << (unsigned long)d; + return os; +} + +using test_tree = tree; +using test_node = typename test_tree::node; +using test_validator = validator; +using test_iterator_checker = iterator_checker; + +int main(int argc, char **argv) { + namespace bpo = boost::program_options; + app_template app; + app.add_options() + ("count", bpo::value()->default_value(4132), "number of keys to fill the tree with") + ("iters", bpo::value()->default_value(9), "number of iterations") + ("keys", bpo::value()->default_value("rand"), "how to generate keys (rand, asc, desc)") + ("verb", bpo::value()->default_value(false), "be verbose"); + + return app.run(argc, argv, [&app] { + auto count = app.configuration()["count"].as(); + auto iters = 
app.configuration()["iters"].as(); + auto ks = app.configuration()["keys"].as(); + auto verb = app.configuration()["verb"].as(); + + return seastar::async([count, iters, ks, verb] { + auto t = std::make_unique(test_key_compare{}); + std::map oracle; + + int p = count / 10; + if (p == 0) { + p = 1; + } + + std::vector keys; + + for (int i = 0; i < count; i++) { + keys.push_back(i + 1); + } + + std::random_device rd; + std::mt19937 g(rd()); + + fmt::print("Inserting {:d} k:v pairs {:d} times\n", count, iters); + + test_validator tv; + + if (ks == "desc") { + fmt::print("Reversing keys vector\n"); + std::reverse(keys.begin(), keys.end()); + } + + bool shuffle = ks == "rand"; + if (shuffle) { + fmt::print("Will shuffle keys each iteration\n"); + } + + + for (auto rep = 0; rep < iters; rep++) { + if (verb) { + fmt::print("Iteration {:d}\n", rep); + } + + auto* itc = new test_iterator_checker(tv, *t); + + if (shuffle) { + std::shuffle(keys.begin(), keys.end(), g); + } + + for (int i = 0; i < count; i++) { + test_key k(keys[i]); + + if (verb) { + fmt::print("+++ {}\n", (int)k); + } + + if (rep % 2 != 1) { + auto ir = t->emplace(std::move(copy_key(k)), k); + assert(ir.second); + } else { + auto ir = t->lower_bound(k); + ir.emplace_before(std::move(copy_key(k)), test_key_compare{}, k); + } + oracle[keys[i]] = keys[i] + 10; + + if (verb) { + fmt::print("Validating\n"); + tv.print_tree(*t, '|'); + } + + /* Limit validation rate for many keys */ + if (i % (i/1000 + 1) == 0) { + tv.validate(*t); + } + + if (i % 7 == 0) { + if (!itc->step()) { + delete itc; + itc = new test_iterator_checker(tv, *t); + } + } + + seastar::thread::maybe_yield(); + } + + auto sz = t->size_slow(); + if (sz != (size_t)count) { + fmt::print("Size {} != count {}\n", sz, count); + throw "size"; + } + + auto ti = t->begin(); + for (auto oe : oracle) { + if (*ti != oe.second) { + fmt::print("Data mismatch {} vs {}\n", oe.second, *ti); + throw "oracle"; + } + ti++; + } + + if (shuffle) { + 
std::shuffle(keys.begin(), keys.end(), g); + } + + for (int i = 0; i < count; i++) { + test_key k(keys[i]); + + /* + * kill iterator if we're removing what it points to, + * otherwise it's not invalidated + */ + if (itc->here(k)) { + delete itc; + itc = nullptr; + } + + if (verb) { + fmt::print("--- {}\n", (int)k); + } + + if (rep % 3 != 2) { + t->erase(k); + } else { + auto ri = t->find(k); + auto ni = ri; + ni++; + auto eni = ri.erase(test_key_compare{}); + assert(ni == eni); + } + + oracle.erase(keys[i]); + + if (verb) { + fmt::print("Validating\n"); + tv.print_tree(*t, '|'); + } + + if ((count-i) % ((count-i)/1000 + 1) == 0) { + tv.validate(*t); + } + + if (itc == nullptr) { + itc = new test_iterator_checker(tv, *t); + } + + if (i % 5 == 0) { + if (!itc->step()) { + delete itc; + itc = new test_iterator_checker(tv, *t); + } + } + + seastar::thread::maybe_yield(); + } + + delete itc; + } + }); + }); +} diff --git a/test/unit/bptree_validation.hh b/test/unit/bptree_validation.hh new file mode 100644 index 0000000000..766b88c8ff --- /dev/null +++ b/test/unit/bptree_validation.hh @@ -0,0 +1,318 @@ +/* + * Copyright (C) 2020 ScyllaDB + */ + +/* + * This file is part of Scylla. + * + * Scylla is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * Scylla is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Scylla. If not, see . 
+ */ + +#pragma once + +namespace bplus { + +template +class validator { + using tree = class tree; + using node = typename tree::node; + + void validate_node(const tree& t, const node& n, int& prev, int& min, bool is_root); + void validate_list(const tree& t); + +public: + void print_tree(const tree& t, char pfx) const { + fmt::print("/ {} <- | {} | -> {}\n", t._left->id(), t._root->id(), t._right->id()); + print_node(*t._root, pfx, 2); + fmt::print("\\\n"); + } + + void print_node(const node& n, char pfx, int indent) const { + int i; + + fmt::print("{:<{}c}{:s} {:d} ({:d} keys, {:x} flags):", pfx, indent, + n.is_leaf() ? "leaf" : "node", n.id(), n._num_keys, n._flags); + if (n.is_leaf()) { + for (i = 0; i < n._num_keys; i++) { + fmt::print(" {}", (int)n._keys[i].v); + } + fmt::print("\n"); + + return; + } + fmt::print("\n"); + + if (n._kids[0].n != nullptr) { + print_node(*n._kids[0].n, pfx, indent + 2); + } + for (i = 0; i < n._num_keys; i++) { + fmt::print("{:<{}c}---{}---\n", pfx, indent, (int)n._keys[i].v); + print_node(*n._kids[i + 1].n, pfx, indent + 2); + } + } + + void validate(const tree& t); +}; + + +template +void validator::validate_node(const tree& t, const node& n, int& prev_key, int& min_key, bool is_root) { + int i; + + if (n.is_root() != is_root) { + fmt::print("node {} needs to {} root, but {}\n", n.id(), is_root ? 
"be" : "be not", n._flags); + throw "root broken"; + } + + for (i = 0; i < n._num_keys; i++) { + if (!n._keys[i].v.is_alive()) { + fmt::print("node {} key {} is not alive\n", n.id(), i); + throw "key dead"; + } + } + + if (n.is_leaf()) { + for (i = 0; i < n._num_keys; i++) { + if (t._less(n._keys[i].v, K(prev_key))) { + fmt::print("node misordered @{} (prev {})\n", (int)n._keys[i].v, prev_key); + throw "misorder"; + } + if (n._kids[i + 1].d->_leaf != &n) { + fmt::print("data mispoint\n"); + throw "data backlink"; + } + + prev_key = n._keys[i].v; + if (!n._kids[i + 1].d->value.match_key(n._keys[i].v)) { + fmt::print("node value corrupted @{:d}.{:d}\n", n.id(), i); + throw "data corruption"; + } + } + + if (n._num_keys > 0) { + min_key = (int)n._keys[0].v; + } + } else if (n._num_keys > 0) { + node* k = n._kids[0].n; + + if (k->_parent != &n) { + fmt::print("node {:d} -parent-> {:d}, expect {:d}\n", k->id(), k->_parent->id(), n.id()); + throw "mis-parented node"; + } + validate_node(t, *k, prev_key, min_key, false); + for (i = 0; i < n._num_keys; i++) { + k = n._kids[i + 1].n; + if (k->_parent != &n) { + fmt::print("node {:d} -parent-> {:d}, expect {:d}\n", + k->id(), k->_parent ? 
k->_parent->id() : -1, n.id()); + throw "mis-parented node"; + } + if (t._less(k->_keys[0].v, n._keys[i].v)) { + fmt::print("node {:d}.{:d}, separation key {}, kid has {}\n", n.id(), k->id(), + (int)n._keys[i].v, (int)k->_keys[0].v); + throw "separation key mismatch"; + } + + int min = 0; + validate_node(t, *k, prev_key, min, false); + if (t._less(n._keys[i].v, K(min)) || t._less(K(min), n._keys[i].v)) { + fmt::print("node {:d}.[{:d}]{:d}, separation key {}, min {}\n", + n.id(), i, k->id(), (int)n._keys[i].v, min); + if (strict_separation_key || t._less(K(min), n._keys[i].v)) { + throw "separation key screw"; + } + } + } + } +} + +template +void validator::validate_list(const tree& t) { + int prev = 0; + + node* lh = t.left_leaf_slow(); + node* rh = t.right_leaf_slow(); + + if (lh != t._left) { + fmt::print("left {:d}, slow {:d}\n", t._left->id(), lh->id()); + throw "list broken"; + } + + if (!(lh->_flags & node::NODE_LEFTMOST)) { + fmt::print("left {:d} is not marked as such {}\n", t._left->id(), t._left->_flags);; + throw "list broken"; + } + + if (rh != t._right) { + fmt::print("right {:d}, slow {:d}\n", t._right->id(), rh->id()); + throw "list broken"; + } + + if (!(rh->_flags & node::NODE_RIGHTMOST)) { + fmt::print("right {:d} is not marked as such {}\n", t._right->id(), t._right->_flags);; + throw "list broken"; + } + + node* r = lh; + while (1) { + node *ln; + + if (!r->is_rightmost()) { + ln = r->get_next(); + if (ln->get_prev() != r) { + fmt::print("next leaf {:d} points to {:d}, expect {:d}\n", ln->id(), ln->get_prev()->id(), r->id()); + throw "list broken"; + } + } else if (r->_rightmost_tree != &t) { + fmt::print("right leaf doesn't point to tree\n"); + throw "list broken"; + } + + if (!r->is_leftmost()) { + ln = r->get_prev(); + if (ln->get_next() != r) { + fmt::print("prev leaf {:d} points to {:d}, expect {:d}\n", ln->id(), ln->get_next()->id(), r->id()); + throw "list broken"; + } + } else if (r->_kids[0]._leftmost_tree != &t) { + fmt::print("left 
leaf doesn't point to tree\n"); + throw "list broken"; + } + + if (r->_num_keys > 0 && t._less(r->_keys[0].v, K(prev))) { + fmt::print("list misorder on element {:d}, keys {}..., prev {:d}\n", r->id(), (int)r->_keys[0].v, prev); + throw "list broken"; + } + + if (!r->is_root() && r->_parent != nullptr) { + const auto p = r->_parent; + int i = p->index_for(r->_keys[0].v, t._less); + if (i > 0) { + if (p->_kids[i - 1].n != r->get_prev()) { + fmt::print("list misorder on parent check: node {:d}.{:d}, parent prev {:d}, list prev {:d}\n", + p->id(), r->id(), p->_kids[i - 1].n->id(), r->get_prev()->id()); + throw "list broken"; + } + } + if (i < p->_num_keys - 1) { + if (p->_kids[i + 1].n != r->get_next()) { + fmt::print("list misorder on parent check: node {:d}.{:d}, parent next {:d}, list next {:d}\n", + p->id(), r->id(), p->_kids[i + 1].n->id(), r->get_next()->id()); + throw "list broken"; + } + } + } + + if (r->_num_keys > 0) { + prev = (int)r->_keys[r->_num_keys - 1].v; + } + + if (r != t._left && r != t._right && (r->_flags & (node::NODE_LEFTMOST | node::NODE_RIGHTMOST))) { + fmt::print("middle {:d} is marked as left/right {}\n", r->id(), r->_flags);; + throw "list broken"; + } + + if (r->is_rightmost()) { + break; + } + + r = r->get_next(); + } +} + +template +void validator::validate(const tree& t) { + try { + validate_list(t); + int min = 0, prev = 0; + if (t._root->_root_tree != &t) { + fmt::print("root doesn't point to tree\n"); + throw "root broken"; + } + + validate_node(t, *t._root, prev, min, true); + } catch (...) 
{ + print_tree(t, '|'); + fmt::print("[ "); + node* lh = t._left; + while (1) { + fmt::print(" {:d}", lh->id()); + if (lh->is_rightmost()) { + break; + } + lh = lh->get_next(); + } + fmt::print("]\n"); + throw; + } +} + +template +class iterator_checker { + using tree = class tree; + + validator& _tv; + tree& _t; + typename tree::iterator _fwd, _fend; + T _fprev; + +public: + iterator_checker(validator& tv, tree& t) : _tv(tv), _t(t), + _fwd(t.begin()), _fend(t.end()) { + } + + bool step() { + try { + return forward_check(); + } catch(...) { + _tv.print_tree(_t, ':'); + throw; + } + } + + bool here(const K& k) { + return _fwd != _fend && _fwd->match_key(k); + } + +private: + bool forward_check() { + if (_fwd == _fend) { + return false; + } + _fwd++; + if (_fwd == _fend) { + return false; + } + T val = *_fwd; + _fwd++; + if (_fwd == _fend) { + return false; + } + _fwd--; + if (val != *_fwd) { + fmt::print("Iterator broken, {:d} != {:d}\n", val, *_fwd); + throw "iterator"; + } + if (val < _fprev) { + fmt::print("Iterator broken, {:d} < {:d}\n", val, _fprev); + throw "iterator"; + } + _fprev = val; + + return true; + } +}; + +} // namespace + diff --git a/test/unit/tree_test_key.hh b/test/unit/tree_test_key.hh new file mode 100644 index 0000000000..14ef31df4d --- /dev/null +++ b/test/unit/tree_test_key.hh @@ -0,0 +1,101 @@ +/* + * Copyright (C) 2020 ScyllaDB + */ + +/* + * This file is part of Scylla. + * + * Scylla is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * Scylla is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with Scylla. If not, see <http://www.gnu.org/licenses/>. + */ + +#pragma once + +/* + * Helper class that helps to check that tree + * - works with keys without default constructor + * - moves the keys around properly + */ +class tree_test_key_base { + int _val; + int* _cookie; + int* _p_cookie; + +public: + bool is_alive() const { + if (_val == -1) { + fmt::print("key value is reset\n"); + return false; + } + + if (_cookie == nullptr) { + fmt::print("key cookie is reset\n"); + return false; + } + + if (*_cookie != 0) { + fmt::print("key cookie value is corrupted {}\n", *_cookie); + return false; + } + + return true; + } + + bool less(const tree_test_key_base& o) const noexcept { + return _val < o._val; + } + + explicit tree_test_key_base(int nr, int cookie = 0) : _val(nr) { + _cookie = new int(cookie); + _p_cookie = new int(1); + } + + operator int() const noexcept { return _val; } + + tree_test_key_base& operator=(const tree_test_key_base& other) = delete; + tree_test_key_base& operator=(tree_test_key_base&& other) = delete; + +private: + /* + * Keep this private to make bptree.hh explicitly call the + * copy_key in the places where the key is copied + */ + tree_test_key_base(const tree_test_key_base& other) : _val(other._val) { + _cookie = new int(*other._cookie); + _p_cookie = new int(*other._p_cookie); + } + + friend tree_test_key_base copy_key(const tree_test_key_base&); + +public: + tree_test_key_base(tree_test_key_base&& other) noexcept : _val(other._val) { + other._val = -1; + _cookie = other._cookie; + other._cookie = nullptr; + _p_cookie = new int(*other._p_cookie); + } + + ~tree_test_key_base() { + if (_cookie != nullptr) { + delete _cookie; + } + assert(_p_cookie != nullptr); + delete _p_cookie; + } +}; + +inline tree_test_key_base copy_key(const tree_test_key_base& other) { return tree_test_key_base(other); } // inline: defined in a header, must stay ODR-safe + +struct test_key_compare { + bool operator()(const tree_test_key_base& a, const 
tree_test_key_base& b) const noexcept { return a.less(b); } +}; diff --git a/utils/bptree.hh b/utils/bptree.hh new file mode 100644 index 0000000000..e43da75308 --- /dev/null +++ b/utils/bptree.hh @@ -0,0 +1,1941 @@ +/* + * Copyright (C) 2020 ScyllaDB + */ + +/* + * This file is part of Scylla. + * + * Scylla is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * Scylla is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Scylla. If not, see . + */ + +#pragma once + +#include +#include +#include +#include "utils/logalloc.hh" +#include "utils/collection-concepts.hh" +#include "utils/neat-object-id.hh" + +namespace bplus { + +enum class with_debug { no, yes }; + +/* + * Linear search in a sorted array of keys slightly beats the + * binary one on small sizes. For debugging purposes both methods + * should be used (and the result must coincide). + */ +enum class key_search { linear, binary, both }; + +/* + * This wrapper prevents the value from being default-constructed + * when its container is created. The intended usage is to wrap + * elements of static arrays or containers with .emplace() methods + * that can live some time without the value in it. + * + * Similarly, the value is _not_ automatically destructed when this + * thing is, so ~Value() must be called by hand. 
For this there is the + * .reset() method and two helpers for common cases -- std::move-ing + * the value into another maybe-location (.emplace(maybe&&)) and + * constructing the new in place of the existing one (.replace(args...)) + */ +template +union maybe_key { + Value v; + maybe_key() noexcept {} + ~maybe_key() {} + maybe_key(const maybe_key&) = delete; + maybe_key(maybe_key&&) = delete; + + void reset() noexcept { v.~Value(); } + + /* + * Constructs the value inside the empty maybe wrapper. + */ + template + void emplace(Args&&... args) noexcept { + new (&v) Value (std::forward(args)...); + } + + /* + * The special-case handling of moving some other alive maybe-value. + * Calls the source destructor after the move. + */ + void emplace(maybe_key&& other) noexcept { + new (&v) Value(std::move(other.v)); + other.reset(); + } + + /* + * Similar to emplace, but to be used on the alive maybe. + * Calls the destructor on it before constructing the new value. + */ + template + void replace(Args&&... args) noexcept { + reset(); + emplace(std::forward(args)...); + } + + void replace(maybe_key&& other) = delete; // not to be called by chance +}; + +// For .{do_something_with_data}_and_dispose methods below +template +void default_dispose(T* value) noexcept { } + +/* + * Helper to explicitly capture all keys copying. + * Check tree_test_key_base (test/unit/tree_test_key.hh) for more information. + */ +template +SEASTAR_CONCEPT(requires std::is_nothrow_copy_constructible_v) +Key copy_key(const Key& other) noexcept { + return Key(other); +} + +/* + * Consider a small 2-level tree like this + * + * [ . 5 . ] + * | | + * +------+ +-----+ + * | | + * [ 1 . 2 . 3 . ] [ 5 . 6 . 7 . ] + * + * And we remove key 5 from it. First -- the key is removed + * from the leaf entry + * + * [ . 5 . ] + * | | + * +------+ +-----+ + * | | + * [ 1 . 2 . 3 . ] [ 6 . 7. ] + * + * At this point we have a choice -- whether or not to update + * the separation key on the parent (root). 
Strictly speaking, + * the whole tree is correct now -- all the keys on the right + * are greater-or-equal than their separation key, though the + * "equal" never happens. + * + * This can be problematic if the keys are stored on data nodes + * and are referenced from the (non-)leaf nodes. In this case + * the separation key must be updated to point to some real key + * in its sub-tree. + * + * [ . 6 . ] <--- this key updated + * | | + * +------+ +-----+ + * | | + * [ 1 . 2 . 3 . ] [ 6 . 7. ] + * + * As this update takes some time, this behaviour is tunable. + * + */ +constexpr bool strict_separation_key = true; + +/* + * This is for testing, validator will be everybody's friend + * to have rights to check if the tree is internally correct. + */ +template class validator; +template class statistics; + +template class node; +template class data; + +/* + * The tree itself. + * Equipped with O(1) (with little constant) begin() and end() + * and the iterator, that scans through sorted keys and is not + * invalidated on insert/remove. + * + * The NodeSize parameter describes the amount of keys to be + * held on each node. Inner nodes will thus have N+1 sub-trees, + * leaf nodes will have N data pointers. 
+ */ + +SEASTAR_CONCEPT( + template + concept CanGetKeyFromValue = requires (T val) { + { val.key() } -> std::same_as; + }; +) + +struct stats { + unsigned long nodes; + std::vector nodes_filled; + unsigned long leaves; + std::vector leaves_filled; + unsigned long datas; +}; + +template +SEASTAR_CONCEPT( requires LessNothrowComparable && + std::is_nothrow_move_constructible_v && + std::is_nothrow_move_constructible_v +) +class tree { +public: + class iterator; + class const_iterator; + + friend class validator; + friend class node; + + // Sanity not to allow slow key-search in non-debug mode + static_assert(Debug == with_debug::yes || Search != key_search::both); + + using node = class node; + using data = class data; + using kid_index = node::kid_index; + +private: + + node* _root = nullptr; + node* _left = nullptr; + node* _right = nullptr; + [[no_unique_address]] Less _less; + + template + node& find_leaf_for(const K& k) const noexcept { + node* cur = _root; + + while (!cur->is_leaf()) { + kid_index i = cur->index_for(k, _less); + cur = cur->_kids[i].n; + } + + return *cur; + } + + void maybe_init_empty_tree() { + if (_root != nullptr) { + return; + } + + node* n = node::create(); + n->_flags |= node::NODE_LEAF | node::NODE_ROOT | node::NODE_RIGHTMOST | node::NODE_LEFTMOST; + do_set_root(n); + do_set_left(n); + do_set_right(n); + } + + node* left_leaf_slow() const noexcept { + node* cur = _root; + while (!cur->is_leaf()) { + cur = cur->_kids[0].n; + } + return cur; + } + + node* right_leaf_slow() const noexcept { + node* cur = _root; + while (!cur->is_leaf()) { + cur = cur->_kids[cur->_num_keys].n; + } + return cur; + } + + template + SEASTAR_CONCEPT(requires LessNothrowComparable) + const_iterator get_bound(const K& k, bool upper, bool& match) const noexcept { + match = false; + if (empty()) { + return end(); + } + + node& n = find_leaf_for(k); + kid_index i = n.index_for(k, _less); + + /* + * Element at i (key at i - 1) is less or equal to the k, + * the next 
element is greater. Mind corner cases. + */ + + if (i == 0) { + assert(n.is_leftmost()); + return begin(); + } else if (i <= n._num_keys) { + const_iterator cur = const_iterator(n._kids[i].d, i); + if (upper || _less(n._keys[i - 1].v, k)) { + cur++; + } else { + match = true; + } + + return cur; + } else { + assert(n.is_rightmost()); + return end(); + } + } + + template + iterator get_bound(const K& k, bool upper, bool& match) noexcept { + return iterator(const_cast(this)->get_bound(k, upper, match)); + } + +public: + + tree(const tree& other) = delete; + const tree& operator=(const tree& other) = delete; + tree& operator=(tree&& other) = delete; + + explicit tree(Less less) noexcept : _less(less) { } + ~tree() { clear(); } + + Less less() const noexcept { return _less; } + + tree(tree&& other) noexcept : _less(std::move(other._less)) { + if (other._root) { + do_set_root(other._root); + do_set_left(other._left); + do_set_right(other._right); + + other._root = nullptr; + other._left = nullptr; + other._right = nullptr; + } + } + + // XXX -- this uses linear scan over the leaf nodes + size_t size_slow() const noexcept { + if (_root == nullptr) { + return 0; + } + + size_t ret = 0; + const node* leaf = _left; + while (1) { + assert(leaf->is_leaf()); + ret += leaf->_num_keys; + if (leaf == _right) { + break; + } + leaf = leaf->get_next(); + } + + return ret; + } + + // Returns result that is equal (both not less than each other) + template + SEASTAR_CONCEPT(requires LessNothrowComparable) + const_iterator find(const K& k) const noexcept { + if (empty()) { + return end(); + } + + node& n = find_leaf_for(k); + kid_index i = n.index_for(k, _less); + + if (i >= 1 && !_less(n._keys[i - 1].v, k)) { + return const_iterator(n._kids[i].d, i); + } else { + return end(); + } + } + + template + SEASTAR_CONCEPT(requires LessNothrowComparable) + iterator find(const K& k) noexcept { + return iterator(const_cast(this)->find(k)); + } + + // Returns the least x out of those !less(x, k) 
+ template + iterator lower_bound(const K& k) noexcept { + bool match; + return get_bound(k, false, match); + } + + template + const_iterator lower_bound(const K& k) const noexcept { + bool match; + return get_bound(k, false, match); + } + + template + iterator lower_bound(const K& k, bool& match) noexcept { + return get_bound(k, false, match); + } + + template + const_iterator lower_bound(const K& k, bool& match) const noexcept { + return get_bound(k, false, match); + } + + // Returns the least x out of those less(k, x) + template + iterator upper_bound(const K& k) noexcept { + bool match; + return get_bound(k, true, match); + } + + template + const_iterator upper_bound(const K& k) const noexcept { + bool match; + return get_bound(k, true, match); + } + + /* + * Constructs the element with key k inside the tree and returns + * iterator on it. If the key already exists -- just returns the + * iterator on it and sets the .second to false. + */ + template + std::pair emplace(Key k, Args&&... 
args) { + maybe_init_empty_tree(); + + node& n = find_leaf_for(k); + kid_index i = n.index_for(k, _less); + + if (i >= 1 && !_less(n._keys[i - 1].v, k)) { + // Direct hit + return std::pair(iterator(n._kids[i].d, i), false); + } + + data* d = data::create(std::forward(args)...); + auto x = seastar::defer([&d] { data::destroy(*d, default_dispose); }); + n.insert(i, std::move(k), d, _less); + assert(d->attached()); + x.cancel(); + return std::pair(iterator(d, i + 1), true); + } + + template + SEASTAR_CONCEPT(requires Disposer) + iterator erase_and_dispose(const Key& k, Func&& disp) noexcept { + // Nothing to erase from an empty tree; never allocate a root node on this noexcept path + if (empty()) { return end(); } + node& n = find_leaf_for(k); + + data* d; + kid_index i = n.index_for(k, _less); + + if (i == 0) { + return end(); + } + + assert(n._num_keys > 0); + + if (_less(n._keys[i - 1].v, k)) { + return end(); + } + + d = n._kids[i].d; + iterator it(d, i); + it++; + + n.remove(i, _less); + + data::destroy(*d, disp); + return it; + } + + template + SEASTAR_CONCEPT(requires Disposer) + iterator erase_and_dispose(iterator from, iterator to, Func&& disp) noexcept { + /* + * FIXME this is dog slow k*logN algo, need k+logN one + */ + while (from != to) { + from = from.erase_and_dispose(disp, _less); + } + + return to; + } + + template + iterator erase(Args&&... 
args) noexcept { return erase_and_dispose(std::forward(args)..., default_dispose); } + + template + SEASTAR_CONCEPT(requires Disposer) + void clear_and_dispose(Func&& disp) noexcept { + if (_root != nullptr) { + _root->clear( + [this, &disp] (data* d) noexcept { data::destroy(*d, disp); }, + [this] (node* n) noexcept { node::destroy(*n); } + ); + + node::destroy(*_root); + _root = nullptr; + _left = nullptr; + _right = nullptr; + } + } + + void clear() noexcept { clear_and_dispose(default_dispose); } + +private: + void do_set_left(node *n) noexcept { + assert(n->is_leftmost()); + _left = n; + n->_kids[0]._leftmost_tree = this; + } + + void do_set_right(node *n) noexcept { + assert(n->is_rightmost()); + _right = n; + n->_rightmost_tree = this; + } + + void do_set_root(node *n) noexcept { + assert(n->is_root()); + n->_root_tree = this; + _root = n; + } + +public: + /* + * Iterator. Scans the datas in the sorted-by-key order. + * Is not invalidated by emplace/erase-s of other elements. + * Move constructors may turn the _idx invalid, but the + * .revalidate() method makes it good again. + */ + template + class iterator_base { + protected: + using tree_ptr = std::conditional_t; + using data_ptr = std::conditional_t; + using node_ptr = std::conditional_t; + + /* + * When the iterator gets to the end the _data is + * replaced with the _tree obtained from the right + * leaf, and the _idx is set to npos + */ + union { + tree_ptr _tree; + data_ptr _data; + }; + kid_index _idx; // Index in leaf's _kids array pointing to _data + + /* + * Leaf nodes cannot have kids (data nodes) at 0 position, so + * 0 is good for unsigned undefined position. 
+ */ + static constexpr kid_index npos = 0; + + bool is_end() const noexcept { return _idx == npos; } + + explicit iterator_base(tree_ptr t) noexcept : _tree(t), _idx(npos) { } + iterator_base(data_ptr d, kid_index idx) noexcept : _data(d), _idx(idx) { + assert(!is_end()); + } + iterator_base() noexcept : iterator_base(static_cast(nullptr)) {} + + /* + * The routine makes sure the iterator's index is valid + * and returns back the leaf that points to it. + */ + node_ptr revalidate() noexcept { + assert(!is_end()); + + node_ptr leaf = _data->_leaf; + + /* + * The data._leaf pointer is always valid (it's updated + * on insert/remove operations), the datas do not move + * as well, so if the leaf still points at us, it is valid. + */ + if (_idx > leaf->_num_keys || leaf->_kids[_idx].d != _data) { + _idx = leaf->index_for(_data); + } + + return leaf; + } + + public: + using iterator_category = std::bidirectional_iterator_tag; + using value_type = std::conditional_t; + using difference_type = ssize_t; + using pointer = value_type*; + using reference = value_type&; + + reference operator*() const noexcept { return _data->value; } + pointer operator->() const noexcept { return &_data->value; } + + iterator_base& operator++() noexcept { + node_ptr leaf = revalidate(); + if (_idx < leaf->_num_keys) { + _idx++; + } else { + if (leaf->is_rightmost()) { + _idx = npos; + _tree = leaf->_rightmost_tree; + return *this; + } + + leaf = leaf->get_next(); + _idx = 1; + } + _data = leaf->_kids[_idx].d; + return *this; + } + + iterator_base& operator--() noexcept { + if (is_end()) { + node* n = _tree->_right; + assert(n->_num_keys > 0); + _data = n->_kids[n->_num_keys].d; + _idx = n->_num_keys; + return *this; + } + + node_ptr leaf = revalidate(); + if (_idx > 1) { + _idx--; + } else { + leaf = leaf->get_prev(); + _idx = leaf->_num_keys; + } + _data = leaf->_kids[_idx].d; + return *this; + } + + iterator_base operator++(int) noexcept { + iterator_base cur = *this; + operator++(); + 
return cur; + } + + iterator_base operator--(int) noexcept { + iterator_base cur = *this; + operator--(); + return cur; + } + + bool operator==(const iterator_base& o) const noexcept { return is_end() ? o.is_end() : _data == o._data; } + bool operator!=(const iterator_base& o) const noexcept { return !(*this == o); } + }; + + using iterator_base_const = iterator_base; + using iterator_base_nonconst = iterator_base; + + class const_iterator final : public iterator_base_const { + friend class tree; + using super = iterator_base_const; + + explicit const_iterator(const tree* t) noexcept : super(t) {} + const_iterator(const data* d, kid_index idx) noexcept : super(d, idx) {} + + public: + const_iterator() noexcept : super() {} + }; + + class iterator final : public iterator_base_nonconst { + friend class tree; + using super = iterator_base_nonconst; + + explicit iterator(tree* t) noexcept : super(t) {} + iterator(data* d, kid_index idx) noexcept : super(d, idx) {} + + public: + iterator(const const_iterator&& other) noexcept { + if (other.is_end()) { + super::_idx = super::npos; + super::_tree = const_cast(other._tree); + } else { + super::_idx = other._idx; + super::_data = const_cast(other._data); + } + } + + iterator() noexcept : super() {} + + /* + * Special constructor for the case when there's the need for an + * iterator to the given value poiter. In this case we need to + * get three things: + * - pointer on class data: we assume that the value pointer + * is indeed embedded into the data and do the "container_of" + * maneuver + * - index at which the data is seen on the leaf: use the + * standard revalidation. 
Note, that we start with index 1 + * which gives us 1/NodeSize chance of hitting the right index + * right at once :) + * - the tree itself: the worst thing here, creating an iterator + * like this is logN operation + */ + explicit iterator(T* value) noexcept + : super(boost::intrusive::get_parent_from_member(value, &data::value), 1) { + super::revalidate(); + } + + /* + * The key _MUST_ be in order and not exist, + * neither of those is checked + */ + template + iterator emplace_before(KeyFn key, Less less, Args&&... args) { + node* leaf; + kid_index i; + + if (!super::is_end()) { + leaf = super::revalidate(); + i = super::_idx - 1; + + if (i == 0 && !leaf->is_leftmost()) { + /* + * If we're about to insert a key before the 0th one, then + * we must make sure the separation keys from upper layers + * will separate the new key as well. If they won't then we + * should select the left sibling for insertion. + * + * For strict_separation_key (the only implemented mode) the + * upper level separation keys match the current 0th one, so + * we always switch to the left sibling. + * + * If we're already on the left-most leaf -- just insert, as + * there's no separation key above it. + */ + if (!strict_separation_key) { + assert(false && "Not implemented"); + } + leaf = leaf->get_prev(); + i = leaf->_num_keys; + } + } else { + super::_tree->maybe_init_empty_tree(); + leaf = super::_tree->_right; + i = leaf->_num_keys; + } + + assert(i >= 0); + + data* d = data::create(std::forward(args)...); + auto x = seastar::defer([&d] { data::destroy(*d, default_dispose); }); + leaf->insert(i, std::move(key(d)), d, less); + assert(d->attached()); + x.cancel(); + /* + * XXX -- if the node was not split we can ++ its index + * and keep iterator valid :) + */ + return iterator(d, i + 1); + } + + template + iterator emplace_before(Key k, Less less, Args&&... 
args) { + return emplace_before([&k] (data*) -> Key { return std::move(k); }, + less, std::forward(args)...); + } + + template + SEASTAR_CONCEPT(requires CanGetKeyFromValue) + iterator emplace_before(Less less, Args&&... args) { + return emplace_before([] (data* d) -> Key { return d->value.key(); }, + less, std::forward(args)...); + } + + private: + /* + * Prepare a likely valid iterator for the next element. + * Likely means, that unless removal starts rebalancing + * datas the _idx will be for the correct pointer. + * + * This is just like the operator++, with the exception + * that staying on the leaf doesn't increase the _idx, as + * in this case the next element will be shifted left to + * the current position. + */ + iterator next_after_erase(node* leaf) const noexcept { + if (super::_idx < leaf->_num_keys) { + return iterator(leaf->_kids[super::_idx + 1].d, super::_idx); + } + + if (leaf->is_rightmost()) { + return iterator(leaf->_rightmost_tree); + } + + leaf = leaf->get_next(); + return iterator(leaf->_kids[1].d, 1); + } + + public: + template + SEASTAR_CONCEPT(requires Disposer) + iterator erase_and_dispose(Func&& disp, Less less) noexcept { + node* leaf = super::revalidate(); + iterator cur = next_after_erase(leaf); + + leaf->remove(super::_idx, less); + data::destroy(*super::_data, disp); + + return cur; + } + + iterator erase(Less less) { return erase_and_dispose(default_dispose, less); } + + template + void reconstruct(size_t new_payload_size, Args&&... args) { + size_t new_size = super::_data->storage_size(new_payload_size); + + node* leaf = super::revalidate(); + auto ptr = current_allocator().alloc(&get_standard_migrator(), new_size, alignof(data)); + data *dat, *cur = super::_data; + + try { + dat = new (ptr) data(std::forward(args)...); + } catch(...) 
{ + current_allocator().free(ptr, new_size); + throw; + } + + dat->_leaf = leaf; + cur->_leaf = nullptr; + + super::_data = dat; + leaf->_kids[super::_idx].d = dat; + + current_allocator().destroy(cur); + } + }; + + const_iterator begin() const noexcept { + if (empty()) { + return end(); + } + + assert(_left->_num_keys > 0); + // Leaf nodes have data pointers starting from index 1 + return const_iterator(_left->_kids[1].d, 1); + } + const_iterator end() const noexcept { return const_iterator(this); } + + using const_reverse_iterator = std::reverse_iterator; + const_reverse_iterator rbegin() const noexcept { return std::make_reverse_iterator(end()); } + const_reverse_iterator rend() const noexcept { return std::make_reverse_iterator(begin()); } + + iterator begin() noexcept { return iterator(const_cast(this)->begin()); } + iterator end() noexcept { return iterator(this); } + + using reverse_iterator = std::reverse_iterator; + reverse_iterator rbegin() noexcept { return std::make_reverse_iterator(end()); } + reverse_iterator rend() noexcept { return std::make_reverse_iterator(begin()); } + + bool empty() const noexcept { return _root == nullptr || _root->_num_keys == 0; } + + struct stats get_stats() const noexcept { + struct stats st; + + st.nodes = 0; + st.leaves = 0; + st.datas = 0; + + if (_root != nullptr) { + st.nodes_filled.resize(NodeSize + 1); + st.leaves_filled.resize(NodeSize + 1); + _root->fill_stats(st); + } + + return st; + } +}; + +/* + * Algorithms for searching a key in array. + * + * The gt() method accepts sorted array of keys and searches the index of the + * upper-bound element of the given key. 
+ */ + +template +struct searcher { }; + +template +struct searcher { + static size_t gt(const K& k, const maybe_key* keys, size_t nr, Less less) noexcept { + size_t i; + + for (i = 0; i < nr; i++) { + if (less(k, keys[i].v)) { + break; + } + } + + return i; + }; +}; + +template +struct searcher { + static size_t gt(const K& k, const maybe_key* keys, size_t nr, Less less) noexcept { + ssize_t s = 0, e = nr - 1; // signed for below s <= e corner cases + + while (s <= e) { + size_t i = (s + e) / 2; + if (less(k, keys[i].v)) { + e = i - 1; + } else { + s = i + 1; + } + } + + return s; + } +}; + +template +struct searcher { + static size_t gt(const K& k, const maybe_key* keys, size_t nr, Less less) noexcept { + size_t rl = searcher::gt(k, keys, nr, less); + size_t rb = searcher::gt(k, keys, nr, less); + assert(rl == rb); + assert(rl <= nr); + return rl; + } +}; + +/* + * A node describes both, inner and leaf nodes. + */ +template +class node final { + friend class validator; + friend class tree; + friend class data; + + using tree = class tree; + using data = class data; + + class prealloc; + + /* + * The NodeHalf is the level at which the node is considered + * to be underflown and should be re-filled. This slightly + * differs for even and odd sizes. + * + * For odd sizes the node will stand until it contains literally + * more than 1/2 of it's size (e.g. for size 5 keeping 3 keys + * is OK). For even cases this barrier is less than the actual + * half (e.g. for size 4 keeping 2 is still OK). 
+ */ + static constexpr size_t NodeHalf = ((NodeSize - 1) / 2); + static_assert(NodeHalf >= 1); + + union node_or_data_or_tree { + node* n; + data* d; + + tree* _leftmost_tree; // See comment near node::__next about this + }; + + using node_or_data = node_or_data_or_tree; + + friend data::data(data&&); + + [[no_unique_address]] utils::neat_id id; + + unsigned short _num_keys; + unsigned short _flags; + + static const unsigned short NODE_ROOT = 0x1; + static const unsigned short NODE_LEAF = 0x2; + static const unsigned short NODE_LEFTMOST = 0x4; // leaf with smallest keys in the tree + static const unsigned short NODE_RIGHTMOST = 0x8; // leaf with greatest keys in the tree + + bool is_leaf() const noexcept { return _flags & NODE_LEAF; } + bool is_root() const noexcept { return _flags & NODE_ROOT; } + bool is_rightmost() const noexcept { return _flags & NODE_RIGHTMOST; } + bool is_leftmost() const noexcept { return _flags & NODE_LEFTMOST; } + + /* + * separation keys + * non-leaf nodes: + * keys in kids[i] < keys[i] <= keys in kids[i+1], i in [0, NodeSize) + * leaf nodes: + * kids[i + 1] is the data for keys[i] + * kids[0] is unused + * + * In the examples below the leaf nodes will be shown like + * + * keys: [012] + * datas: [-012] + * + * and the non-leaf ones like + * + * keys: [012] + * kids: [A012] + * + * to have digits correspond to different elements and staying + * in its correct positions. And the A kid is this left-most one + * at index 0 for the non-leaf node. + */ + + maybe_key _keys[NodeSize]; + node_or_data _kids[NodeSize + 1]; + + // Type-aliases for code-reading convenience + using key_index = size_t; + using kid_index = size_t; + + /* + * The root node uses this to point to the tree object. This is + * needed to update tree->_root on node move. + */ + union { + node* _parent; + tree* _root_tree; + }; + + /* + * Leaf nodes are linked in a list, since leaf nodes do + * not use the _kids[0] pointer we re-use it. 
Respectively, + * non-leaf nodes don't use the __next one. + * + * Also, leftmost and rightmost respectively have prev and + * next pointing to the tree object itsef. This is done for + * _left/_right update on node move. + */ + union { + node* __next; + tree* _rightmost_tree; + }; + + node* get_next() const noexcept { + assert(is_leaf()); + return __next; + } + + void set_next(node *n) noexcept { + assert(is_leaf()); + __next = n; + } + + node* get_prev() const noexcept { + assert(is_leaf()); + return _kids[0].n; + } + + void set_prev(node* n) noexcept { + assert(is_leaf()); + _kids[0].n = n; + } + + // Links the new node n right after the current one + void link(node& n) noexcept { + if (is_rightmost()) { + _flags &= ~NODE_RIGHTMOST; + n._flags |= node::NODE_RIGHTMOST; + tree* t = _rightmost_tree; + assert(t->_right == this); + t->do_set_right(&n); + } else { + n.set_next(get_next()); + get_next()->set_prev(&n); + } + + n.set_prev(this); + set_next(&n); + } + + void unlink() noexcept { + node* x; + tree* t; + + switch (_flags & (node::NODE_LEFTMOST | node::NODE_RIGHTMOST)) { + case node::NODE_LEFTMOST: + x = get_next(); + _flags &= ~node::NODE_LEFTMOST; + x->_flags |= node::NODE_LEFTMOST; + t = _kids[0]._leftmost_tree; + assert(t->_left == this); + t->do_set_left(x); + break; + case node::NODE_RIGHTMOST: + x = get_prev(); + _flags &= ~node::NODE_RIGHTMOST; + x->_flags |= node::NODE_RIGHTMOST; + t = _rightmost_tree; + assert(t->_right == this); + t->do_set_right(x); + break; + case 0: + get_prev()->set_next(get_next()); + get_next()->set_prev(get_prev()); + break; + default: + /* + * Right- and left-most at the same time can only be root, + * otherwise this would mean we have root with 0 keys. 
+ */ + assert(false); + } + + set_next(this); + set_prev(this); + } + + node(const node& other) = delete; + const node& operator=(const node& other) = delete; + node& operator=(node&& other) = delete; + + /* + * There's no pointer/reference from nodes to the tree, neither + * there is such from data, because otherwise we'd have to update + * all of them inside tree move constructor, which in turn would + * make it toooo slow linear operation. Thus we walk up the nodes + * ._parent chain up to the root node which has the _root_tree. + */ + tree* tree_slow() const noexcept { + const node* cur = this; + + while (!cur->is_root()) { + cur = cur->_parent; + } + + return cur->_root_tree; + } + + /* + * For inner node finds the subtree to which k belongs. + * For leaf node finds the data that should correspond to the key, + * in this case index is not 0 for sure. + * + * In both cases keys[index - 1] <= k < keys[index]. + */ + template + kid_index index_for(const K& k, Less less) const noexcept { + return searcher::gt(k, _keys, _num_keys, less); + } + + kid_index index_for(node *n) const noexcept { + // Keep index on kid (FIXME?) + + kid_index i; + + for (i = 0; i <= _num_keys; i++) { + if (_kids[i].n == n) { + break; + } + } + assert(i <= _num_keys); + return i; + } + + bool need_refill() const noexcept { + return _num_keys <= NodeHalf; + } + + bool can_grab_from() const noexcept { + return _num_keys > NodeHalf + 1; + } + + bool can_push_to() const noexcept { + return _num_keys < NodeSize; + } + + bool can_merge_with(const node& n) const noexcept { + return _num_keys + n._num_keys + (is_leaf() ? 0u : 1u) <= NodeSize; + } + + void shift_right(size_t s) noexcept { + for (size_t i = _num_keys; i > s; i--) { + _keys[i].emplace(std::move(_keys[i - 1])); + _kids[i + 1] = _kids[i]; + } + _num_keys++; + } + + void shift_left(size_t s) noexcept { + // The key at s is expected to be .remove()-d ! 
+ for (size_t i = s + 1; i < _num_keys; i++) { + _keys[i - 1].emplace(std::move(_keys[i])); + _kids[i] = _kids[i + 1]; + } + _num_keys--; + } + + void move_keys_and_kids(size_t foff, node& to, size_t count) noexcept { + size_t toff = to._num_keys; + + for (size_t i = 0; i < count; i++) { + to._keys[toff + i].emplace(std::move(_keys[foff + i])); + to._kids[toff + i + 1] = _kids[foff + i + 1]; + } + _num_keys = foff; + + if (is_leaf()) { + for (size_t i = toff; i < toff + count; i++) { + to._kids[i + 1].d->reattach(&to); + } + } else { + for (size_t i = toff; i < toff + count; i++) { + to._kids[i + 1].n->_parent = &to; + } + } + to._num_keys += count; + } + + void move_to(node& to, size_t off) noexcept { + assert(off <= _num_keys); + to._num_keys = 0; + move_keys_and_kids(off, to, _num_keys - off); + } + + void grab_from_left(node& from, maybe_key& sep) noexcept { + /* + * Grab one element from the left sibling and return + * the new separation key for them. + * + * Leaf: just move the last key (and the last kid) and report + * it as new separation key + * + * keys: [012] -> [56] = [01] [256] 2 is new separation + * datas: [-012] -> [-56] = [-01] [-256] + * + * Non-leaf is trickier. We need the current separation key + * as we're grabbing the last element which has no the right + * boundary on the node. So the parent node tells us one. 
+ * + * keys: [012] -> s [56] = [01] 2 [s56] 2 is new separation + * kids: [A012] -> [B56] = [A01] [2B56] + */ + + assert(from._num_keys > 0); + key_index i = from._num_keys - 1; + + shift_right(0); + from._num_keys--; + + if (is_leaf()) { + _keys[0].emplace(std::move(from._keys[i])); + _kids[1] = from._kids[i + 1]; + _kids[1].d->reattach(this); + sep.replace(copy_key(_keys[0].v)); + } else { + _keys[0].emplace(std::move(sep)); + _kids[1] = _kids[0]; + _kids[0] = from._kids[i + 1]; + _kids[0].n->_parent = this; + sep.emplace(std::move(from._keys[i])); + } + } + + void merge_into(node& t, Key key) noexcept { + /* + * Merge current node into t preparing it for being + * killed. This merge is slightly different for leaves + * and for non-leaves wrt the 0th element. + * + * Non-leaves. For those we need the separation key, which + * is passed to us. The caller "knows" that this and t are + * two siblings and thus the separation key is the one from + * the parent node. For this reason merging two non-leaf + * nodes needs one more slot in the target as compared to + * the leaf-nodes case. + * + * keys: [012] + K + [456] = [012K456] + * kids: [A012] + [B456] = [A012B456] + * + * Leaves. This is simple -- just go ahead and merge. + * + * keys: [012] + [456] = [012456] + * datas: [-012] + [-456] = [-012456] + */ + + if (!t.is_leaf()) { + key_index i = t._num_keys; + t._keys[i].emplace(std::move(key)); + t._kids[i + 1] = _kids[0]; + t._kids[i + 1].n->_parent = &t; + t._num_keys++; + } + + move_keys_and_kids(0, t, _num_keys); + } + + void grab_from_right(node& from, maybe_key& sep) noexcept { + /* + * Grab one element from the right sibling and return + * the new separation key for them. + * + * Leaf: just move the 0th key (and 1st kid) and the + * new separation key is what becomes 0 in the source. + * + * keys: [01] <- [456] = [014] [56] 5 is new separation + * datas: [-01] <- [-456] = [-014] [-56] + * + * Non-leaf is trickier.
We need the current separation + * key as we're grabbing the kids[0] element which has no + * corresponding keys[-1]. So the parent node tells us one. + * + * keys: [01] <- s [456] = [01s] 4 [56] 4 is new separation + * kids: [A01] <- [B456] = [A01B] [456] + */ + + key_index i = _num_keys; + + if (is_leaf()) { + _keys[i].emplace(std::move(from._keys[0])); + _kids[i + 1] = from._kids[1]; + _kids[i + 1].d->reattach(this); + sep.replace(copy_key(from._keys[1].v)); + } else { + _kids[i + 1] = from._kids[0]; + _kids[i + 1].n->_parent = this; + _keys[i].emplace(std::move(sep)); + from._kids[0] = from._kids[1]; + sep.emplace(std::move(from._keys[0])); + } + + _num_keys++; + from.shift_left(0); + } + + /* + * When splitting, the result should be almost equal. The + * "almost" depends on the node-size being odd or even and + * on the node itself being leaf or inner. + */ + bool equally_split(const node& n2) const noexcept { + if (Debug == with_debug::yes) { + return (_num_keys == n2._num_keys) || + (_num_keys == n2._num_keys + 1) || + (_num_keys + 1 == n2._num_keys); + } + return true; + } + + // Helper for assert(). See comment for do_insert for details. 
+ bool left_kid_sorted(const Key& k, Less less) const noexcept { + if (Debug == with_debug::yes && !is_leaf() && _num_keys > 0) { + node* x = _kids[0].n; + if (x != nullptr && less(k, x->_keys[x->_num_keys - 1].v)) { + return false; + } + } + + return true; + } + + template + SEASTAR_CONCEPT(requires Disposer && Disposer) + void clear(DFunc&& ddisp, NFunc&& ndisp) noexcept { + if (is_leaf()) { + _flags &= ~(node::NODE_LEFTMOST | node::NODE_RIGHTMOST); + set_next(this); + set_prev(this); + } else { + node* n = _kids[0].n; + n->clear(ddisp, ndisp); + ndisp(n); + } + + for (key_index i = 0; i < _num_keys; i++) { + _keys[i].reset(); + if (is_leaf()) { + ddisp(_kids[i + 1].d); + } else { + node* n = _kids[i + 1].n; + n->clear(ddisp, ndisp); + ndisp(n); + } + } + + _num_keys = 0; + } + + static node* create() { + return current_allocator().construct(); + } + + static void destroy(node& n) noexcept { + current_allocator().destroy(&n); + } + + void drop() noexcept { + assert(!is_root()); + if (is_leaf()) { + unlink(); + } + destroy(*this); + } + + void insert_into_full(kid_index idx, Key k, node_or_data nd, Less less, prealloc& nodes) noexcept { + if (!is_root()) { + node& p = *_parent; + kid_index i = p.index_for(_keys[0].v, less); + + /* + * Try to push left or right existing keys to the respective + * siblings. Keep in mind two corner cases: + * + * 1. Push to left. In this case the new key should not go + * to the [0] element, otherwise we'd have to update the p's + * separation key one more time. + * + * 2. Push to right. In this case we must make sure the new + * key is not the rightmost itself, otherwise it's _him_ who + * must be pushed there. + * + * Both corner cases are possible to implement though. 
+ */ + if (idx > 1 && i > 0) { + node* left = p._kids[i - 1].n; + if (left->can_push_to()) { + /* + * We've moved the 0th element from this, so the index + * for the new key shifts too + */ + idx--; + left->grab_from_right(*this, p._keys[i - 1]); + } + } + + if (idx < _num_keys && i < p._num_keys) { + node* right = p._kids[i + 1].n; + if (right->can_push_to()) { + right->grab_from_left(*this, p._keys[i]); + } + } + + if (_num_keys < NodeSize) { + do_insert(idx, std::move(k), nd, less); + nodes.drain(); + return; + } + + /* + * We can only get here if both ->can_push_to() checks above + * had failed. In this case -- go ahead and split this. + */ + } + + split_and_insert(idx, std::move(k), nd, less, nodes); + } + + void split_and_insert(kid_index idx, Key k, node_or_data nd, Less less, prealloc& nodes) noexcept { + assert(_num_keys == NodeSize); + + node* nn = nodes.pop(); + maybe_key sep; + + /* + * Insertion with split. + * 1. Existing node (this) is split into two. We try a bit harder + * than we might to make the split equal. + * 2. The new element is added to either of the resulting nodes. + * 3. The new node nn is inserted into parent one with the help + * of a separation key sep + * + * First -- find the position in the current node at which the + * new element should have appeared. + */ + + size_t off = NodeHalf + (idx > NodeHalf ? 1 : 0); + + if (is_leaf()) { + nn->_flags |= NODE_LEAF; + link(*nn); + + /* + * Split of leaves. This is simple -- just copy the needed + * amount of keys and kids from this to nn, then insert the + * new pair into the proper place. When inserting the new + * node into parent the separation key is the one latter + * starts with.
+ * + * keys: [01234] + * datas: [-01234] + * + * if the new key is below 2, then + * keys: -> [01] [234] -> [0n1] [234] -> sep is 2 + * datas: -> [-01] [-234] -> [-0n1] [-234] + * + * if the new key is above 2, then + * keys: -> [012] [34] -> [012] [3n4] -> sep is 3 (or n) + * datas: -> [-012] [-34] -> [-012] [-3n4] + */ + move_to(*nn, off); + + if (idx <= NodeHalf) { + do_insert(idx, std::move(k), nd, less); + } else { + nn->do_insert(idx - off, std::move(k), nd, less); + } + sep.emplace(std::move(copy_key(nn->_keys[0].v))); + } else { + /* + * Node insertion has one special case -- when the new key + * gets directly into the middle. + */ + if (idx == NodeHalf + 1) { + /* + * Split of nodes and the new key is in the middle. In this + * case we need to split the node into two, but take the k as the + * separation key. The corresponding data becomes new node's + * 0 kid. + * + * keys: [012345] -> [012] k [345] (and the k goes up) + * kids: [A012345] -> [A012] [n345] + */ + move_to(*nn, off); + sep.emplace(std::move(k)); + nn->_kids[0] = nd; + nn->_kids[0].n->_parent = nn; + } else { + /* + * Split of nodes and the new key gets into either of the + * halves. This is like leaves split, but we need to carefully + * handle the kids[0] for both. The correspoding key is not + * on the node and "has" an index of -1 and thus becomes the + * separation one for the upper layer.
+ * + * keys: [012345] + * datas: [A012345] + * + * if the new key goes left then + * keys: -> [01] 2 [345] -> [0n1] 2 [345] + * datas: -> [A01] [2345] -> [A0n1] [2345] + * + * if the new key goes right then + * keys: -> [012] 3 [45] -> [012] 3 [4n5] + * datas: -> [A012] [345] -> [-123] [34n5] + */ + move_to(*nn, off + 1); + sep.emplace(std::move(_keys[off])); + nn->_kids[0] = _kids[off + 1]; + nn->_kids[0].n->_parent = nn; + _num_keys--; + + if (idx <= NodeHalf) { + do_insert(idx, std::move(k), nd, less); + } else { + nd.n->_parent = nn; + nn->do_insert(idx - off - 1, std::move(k), nd, less); + } + } + } + + assert(equally_split(*nn)); + + if (is_root()) { + insert_into_root(*nn, std::move(sep.v), nodes); + } else { + insert_into_parent(*nn, std::move(sep.v), less, nodes); + } + sep.reset(); + } + + void do_insert(kid_index i, Key k, node_or_data nd, Less less) noexcept { + assert(_num_keys < NodeSize); + + /* + * The new k:nd pair should be put into the given index and + * shift offenders to the right. However, if it should be + * put left to the non-leaf's left-most node -- it's a BUG, + * as there's no corresponding key here. + * + * Non-leaf nodes get here when their kids are split, and + * when they do, if the kid gets into the left-most sub-tree, + * it's directly put there, and this helper is not called. + * Said that, if we're inserting a new pair, the newbie can + * only get to the right of the left-most kid. + */ + assert(i != 0 || left_kid_sorted(k, less)); + + shift_right(i); + + /* + * The k:nd pair belongs to keys[i-1]:kids[i] subtree, and since + * what's already there is less than this newcomer, the latter goes + * one step right. 
+ */ + _keys[i].emplace(std::move(k)); + _kids[i + 1] = nd; + if (is_leaf()) { + nd.d->attach(*this); + } + } + + void insert_into_parent(node& nn, Key sep, Less less, prealloc& nodes) noexcept { + nn._parent = _parent; + _parent->insert_key(std::move(sep), node_or_data{n: &nn}, less, nodes); + } + + void insert_into_root(node& nn, Key sep, prealloc& nodes) noexcept { + tree* t = _root_tree; + + node* nr = nodes.pop(); + + nr->_num_keys = 1; + nr->_keys[0].emplace(std::move(sep)); + nr->_kids[0].n = this; + nr->_kids[1].n = &nn; + _flags &= ~node::NODE_ROOT; + _parent = nr; + nn._parent = nr; + + nr->_flags |= node::NODE_ROOT; + t->do_set_root(nr); + } + + void insert_key(Key k, node_or_data nd, Less less, prealloc& nodes) noexcept { + kid_index i = index_for(k, less); + insert(i, std::move(k), nd, less, nodes); + } + + void insert(kid_index i, Key k, node_or_data nd, Less less, prealloc& nodes) noexcept { + if (_num_keys == NodeSize) { + insert_into_full(i, std::move(k), nd, less, nodes); + } else { + do_insert(i, std::move(k), nd, less); + } + } + + void insert(kid_index i, Key k, data* d, Less less) { + prealloc nodes; + + /* + * Prepare the nodes for split in advaice, if the node::create will + * start throwing while splitting we'll have troubles "unsplitting" + * the nodes back. 
+ */ + node* cur = this; + while (cur->_num_keys == NodeSize) { + nodes.push(); + if (cur->is_root()) { + nodes.push(); + break; + } + cur = cur->_parent; + } + + insert(i, std::move(k), node_or_data{d: d}, less, nodes); + assert(nodes.empty()); + } + + void remove_from(key_index i, Less less) noexcept { + _keys[i].reset(); + shift_left(i); + + if (!is_root()) { + if (need_refill()) { + refill(less); + } + } else if (_num_keys == 0 && !is_leaf()) { + node* nr; + nr = _kids[0].n; + nr->_flags |= node::NODE_ROOT; + _root_tree->do_set_root(nr); + + _flags &= ~node::NODE_ROOT; + _parent = nullptr; + drop(); + } + } + + void merge_kids(node& t, node& n, key_index sep_idx, Less less) noexcept { + n.merge_into(t, std::move(_keys[sep_idx].v)); + n.drop(); + remove_from(sep_idx, less); + } + + void refill(Less less) noexcept { + node& p = *_parent, *left, *right; + + /* + * We need to locate this node's index at parent array by using + * the 0th key, so make sure it exists. We can go even without + * it, but since we don't let's be on the safe side. + */ + assert(_num_keys > 0); + kid_index i = p.index_for(_keys[0].v, less); + assert(p._kids[i].n == this); + + /* + * The node is "underflown" (see comment near NodeHalf + * about what this means), so we try to refill it at the + * siblings' expense. Many cases possible, but we go with + * only four: + * + * 1. Left sibling exists and it has at least 1 item + * above being the half-full. -> we grab one element + * from it. + * + * 2. Left sibling exists and we can merge current with + * it. "Can" means the resulting node will not overflow + * which, in turn, differs by one for leaf and non-leaf + * nodes. For leaves the merge is possible is the total + * number of the elements fits the maximum. 
For non-leaf + * we'll need room for one more element, here's why: + * + * [012] + [456] -> [012X456] + * [A012] + [B456] -> [A012B456] + * + * The key X in the middle separates B from everything on + * the left and this key was not sitting on either of the + * wannabe merging nodes. This X is the current separation + * of these two nodes taken from their parent. + * + * And two same cases for the right sibling. + */ + + left = i > 0 ? p._kids[i - 1].n : nullptr; + right = i < p._num_keys ? p._kids[i + 1].n : nullptr; + + if (left != nullptr && left->can_grab_from()) { + grab_from_left(*left, p._keys[i - 1]); + return; + } + + if (right != nullptr && right->can_grab_from()) { + grab_from_right(*right, p._keys[i]); + return; + } + + if (left != nullptr && can_merge_with(*left)) { + p.merge_kids(*left, *this, i - 1, less); + return; + } + + if (right != nullptr && can_merge_with(*right)) { + p.merge_kids(*this, *right, i, less); + return; + } + + /* + * Surprisingly, the node in the B+ tree can violate the + * "minimally filled" rule for non roots. It _can_ stay with + * less than half elements on board. The next remove from + * it or either of its siblings will probably refill it. + * + * Keeping 1 key on the non-root node is possible, but needs + * some special care -- if we will remove this last key from + * this node, the code will try to refill one and will not + * be able to find this node's index at parent (the call for + * index_for() above). + */ + assert(_num_keys > 1); + } + + void remove(kid_index ki, Less less) noexcept { + key_index i = ki - 1; + + /* + * Update the matching separation key from above. It + * exists only if we're removing the 0th key, but for + * the left-most child it doesn't exist. + * + * Note, that the latter check is crucial for clear() + * performance, as it's always removes the left-most + * key, without this check each remove() would walk the + * tree upwards in vain.
+ */ + if (strict_separation_key && i == 0 && !is_leftmost()) { + const Key& k = _keys[i].v; + node* p = this; + + while (!p->is_root()) { + p = p->_parent; + kid_index j = p->index_for(k, less); + if (j > 0) { + p->_keys[j - 1].replace(copy_key(_keys[1].v)); + break; + } + } + } + + remove_from(i, less); + } + +public: + explicit node() noexcept : _num_keys(0) , _flags(0) , _parent(nullptr) { } + + ~node() { + assert(_num_keys == 0); + assert(is_root() || !is_leaf() || (get_prev() == this && get_next() == this)); + } + + node(node&& other) noexcept : _flags(other._flags) { + if (is_leaf()) { + if (!is_rightmost()) { + set_next(other.get_next()); + get_next()->set_prev(this); + } else { + other._rightmost_tree->do_set_right(this); + } + + if (!is_leftmost()) { + set_prev(other.get_prev()); + get_prev()->set_next(this); + } else { + other._kids[0]._leftmost_tree->do_set_left(this); + } + + other._flags &= ~(NODE_LEFTMOST | NODE_RIGHTMOST); + other.set_next(&other); + other.set_prev(&other); + } else { + _kids[0].n = other._kids[0].n; + _kids[0].n->_parent = this; + } + + other.move_to(*this, 0); + + if (!is_root()) { + _parent = other._parent; + kid_index i = _parent->index_for(&other); + assert(_parent->_kids[i].n == &other); + _parent->_kids[i].n = this; + } else { + other._root_tree->do_set_root(this); + } + } + + kid_index index_for(const data *d) const noexcept { + /* + * We could look up the data's new index with binary search, + * but we don't have the key at hand + */ + + kid_index i; + + for (i = 1; i <= _num_keys; i++) { + if (_kids[i].d == d) { + break; + } + } + assert(i <= _num_keys); + return i; + } + +private: + class prealloc { + std::vector _nodes; + public: + bool empty() noexcept { return _nodes.empty(); } + + void push() { + _nodes.push_back(node::create()); + } + + node* pop() noexcept { + assert(!_nodes.empty()); + node* ret = _nodes.back(); + _nodes.pop_back(); + return ret; + } + + void drain() noexcept { + while (!empty()) { +
node::destroy(*pop()); + } + } + + ~prealloc() { + drain(); + } + }; + + void fill_stats(struct stats& st) const noexcept { + if (is_leaf()) { + st.leaves_filled[_num_keys]++; + st.leaves++; + st.datas += _num_keys; + } else { + st.nodes_filled[_num_keys]++; + st.nodes++; + for (kid_index i = 0; i <= _num_keys; i++) { + _kids[i].n->fill_stats(st); + } + } + } +}; + +/* + * The data represents data node (the actual data is stored "outside" + * of the tree). The tree::emplace() constructs the payload inside the + * data before inserting it into the tree. + */ +template +class data final { + friend class validator; + template + friend class tree::iterator; + template + friend class tree::iterator_base_const; + template + friend class tree::iterator_base_nonconst; + + using node = class node; + + node* _leaf; + T value; + +public: + template + static data* create(Args&&... args) { + return current_allocator().construct(std::forward(args)...); + } + + template + SEASTAR_CONCEPT(requires Disposer) + static void destroy(data& d, Func&& disp) noexcept { + disp(&d.value); + d._leaf = nullptr; + current_allocator().destroy(&d); + } + + template + data(Args&& ... args) : _leaf(nullptr), value(std::forward(args)...) {} + + data(data&& other) noexcept : _leaf(other._leaf), value(std::move(other.value)) { + if (attached()) { + auto i = _leaf->index_for(&other); + _leaf->_kids[i].d = this; + other._leaf = nullptr; + } + } + + ~data() { assert(!attached()); } + + bool attached() const noexcept { return _leaf != nullptr; } + + void attach(node& to) noexcept { + assert(!attached()); + _leaf = &to; + } + + void reattach(node* to) noexcept { + assert(attached()); + _leaf = to; + } + +private: + // Data node may describe a T without fixed size, e.g. an array that grows on + // demand. So this helper returns the size of the memory chunk that's required + // to carry the node with T of the payload size on board. + // + // The tree::iterator::reconstruct does this growing (or shrinking). 
+ size_t storage_size(size_t payload) const noexcept { + return sizeof(data) - sizeof(T) + payload; + } + + size_t storage_size() const noexcept { + return storage_size(size_for_allocation_strategy(value)); + } + +public: + friend size_t size_for_allocation_strategy(const data& obj) noexcept { + return obj.storage_size(); + } +}; + +} // namespace bplus diff --git a/utils/collection-concepts.hh b/utils/collection-concepts.hh new file mode 100644 index 0000000000..bc98939445 --- /dev/null +++ b/utils/collection-concepts.hh @@ -0,0 +1,53 @@ +/* + * Copyright (C) 2020 ScyllaDB + */ + +/* + * This file is part of Scylla. + * + * Scylla is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * Scylla is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Scylla. If not, see . 
+ */ + +#pragma once +#include +#include + +SEASTAR_CONCEPT( + template + concept Disposer = requires (Func f, T* val) { + { f(val) } noexcept -> std::same_as; + }; +) + +SEASTAR_CONCEPT( + template + concept LessComparable = requires (const Key1& a, const Key2& b, Less less) { + { less(a, b) } -> std::same_as; + { less(b, a) } -> std::same_as; + }; + + template + concept LessNothrowComparable = LessComparable && std::is_nothrow_invocable_v; +) + +SEASTAR_CONCEPT( + template + concept Comparable = requires (const T1& a, const T2& b, Compare cmp) { + // The Comparable is trichotomic comparator that should return + // negative value when a < b + // zero when a == b + // positive value when a > b + { cmp(a, b) } -> std::same_as; + }; +) diff --git a/utils/double-decker.hh b/utils/double-decker.hh new file mode 100644 index 0000000000..c907fd4e65 --- /dev/null +++ b/utils/double-decker.hh @@ -0,0 +1,412 @@ +/* + * Copyright (C) 2020 ScyllaDB + */ + +/* + * This file is part of Scylla. + * + * Scylla is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * Scylla is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Scylla. If not, see . + */ + +#pragma once + +#include +#include +#include "utils/bptree.hh" +#include "utils/intrusive-array.hh" +#include "utils/collection-concepts.hh" +#include + +/* + * The double-decker is the ordered keeper of key:value pairs having + * the pairs sorted by both key and value (key first). 
+ * + * The keys collisions are expected to be rare enough to afford holding + * the values in a sorted array with the help of linear algorithms. + */ + +template +SEASTAR_CONCEPT( requires Comparable && std::is_nothrow_move_constructible_v ) +class double_decker { +public: + using inner_array = intrusive_array; + using outer_tree = bplus::tree; + using outer_iterator = typename outer_tree::iterator; + using outer_const_iterator = typename outer_tree::const_iterator; + +private: + outer_tree _tree; + +public: + template + class iterator_base { + friend class double_decker; + using outer_iterator = std::conditional_t; + + protected: + outer_iterator _bucket; + int _idx; + + public: + iterator_base() = default; + iterator_base(outer_iterator bkt, int idx) noexcept : _bucket(bkt), _idx(idx) {} + + using iterator_category = std::bidirectional_iterator_tag; + using difference_type = ssize_t; + using value_type = std::conditional_t; + using pointer = value_type*; + using reference = value_type&; + + reference operator*() const noexcept { return (*_bucket)[_idx]; } + pointer operator->() const noexcept { return &((*_bucket)[_idx]); } + + iterator_base& operator++() noexcept { + if ((*_bucket)[_idx++].is_tail()) { + _bucket++; + _idx = 0; + } + + return *this; + } + + iterator_base operator++(int) noexcept { + iterator_base cur = *this; + operator++(); + return cur; + } + + iterator_base& operator--() noexcept { + if (_idx-- == 0) { + _bucket--; + _idx = _bucket->index_of(_bucket->end()) - 1; + } + + return *this; + } + + iterator_base operator--(int) noexcept { + iterator_base cur = *this; + operator--(); + return cur; + } + + bool operator==(const iterator_base& o) const noexcept { return _bucket == o._bucket && _idx == o._idx; } + bool operator!=(const iterator_base& o) const noexcept { return !(*this == o); } + }; + + using const_iterator = iterator_base; + + class iterator final : public iterator_base { + friend class double_decker; + using super = iterator_base; + + 
iterator(const const_iterator&& other) noexcept : super(std::move(other._bucket), other._idx) {} + + public: + iterator() noexcept : super() {} + iterator(outer_iterator bkt, int idx) noexcept : super(bkt, idx) {} + + iterator(T* ptr) noexcept { + inner_array& arr = inner_array::from_element(ptr, super::_idx); + super::_bucket = outer_iterator(&arr); + } + + template + SEASTAR_CONCEPT(requires Disposer) + iterator erase_and_dispose(Less less, Func&& disp) noexcept { + disp(&**this); // * to deref this, * to call operator*, & to get addr from ref + + if (super::_bucket->is_single_element()) { + outer_iterator bkt = super::_bucket.erase(less); + return iterator(bkt, 0); + } + + bool tail = (*super::_bucket)[super::_idx].is_tail(); + super::_bucket->erase(super::_idx); + if (tail) { + super::_bucket++; + super::_idx = 0; + } + + return *this; + } + + iterator erase(Less less) noexcept { return erase_and_dispose(less, bplus::default_dispose); } + }; + + /* + * Structure that shed some more light on how the lower_bound + * actually found the bounding elements. + */ + struct bound_hint { + /* + * Set to true if the element fully matched to the key + * according to Compare + */ + bool match; + /* + * Set to true if the bucket for the given key exists + */ + bool key_match; + /* + * Set to true if the given key is more than anything + * on the bucket and iterator was switched to the next + * one (or when the key_match is false) + */ + bool key_tail; + + /* + * This helper says whether the emplace will invalidate (some) + * iterators or not. Emplacing with !key_match will go and create + * new node in B+ which doesn't invalidate iterators. In another + * case some existing B+ data node will be reconstructed, so the + * iterators on those nodes will become invalid. 
+ */ + bool emplace_keeps_iterators() const noexcept { return !key_match; } + }; + + iterator begin() noexcept { return iterator(_tree.begin(), 0); } + const_iterator begin() const noexcept { return const_iterator(_tree.begin(), 0); } + const_iterator cbegin() const noexcept { return const_iterator(_tree.begin(), 0); } + + iterator end() noexcept { return iterator(_tree.end(), 0); } + const_iterator end() const noexcept { return const_iterator(_tree.end(), 0); } + const_iterator cend() const noexcept { return const_iterator(_tree.end(), 0); } + + explicit double_decker(Less less) noexcept : _tree(less) { } + + double_decker(const double_decker& other) = delete; + double_decker(double_decker&& other) noexcept : _tree(std::move(other._tree)) {} + + iterator insert(Key k, T value, Compare cmp) { + std::pair oip = _tree.emplace(std::move(k), std::move(value)); + outer_iterator& bkt = oip.first; + int idx = 0; + + if (!oip.second) { + /* + * Unlikely, but in this case we reconstruct the array. The value + * must not have been moved by emplace() above. + */ + idx = bkt->index_of(bkt->lower_bound(value, cmp)); + size_t new_size = (bkt->size() + 1) * sizeof(T); + bkt.reconstruct(new_size, *bkt, + typename inner_array::grow_tag{idx}, std::move(value)); + } + + return iterator(bkt, idx); + } + + template + iterator emplace_before(iterator i, Key k, const bound_hint& hint, Args&&... args) { + assert(!hint.match); + outer_iterator& bucket = i._bucket; + + if (!hint.key_match) { + /* + * The most expected case -- no key conflict, respectively the + * bucket is not found, and i points to the next one. Just go + * ahead and emplace the new bucket before the i and push the + * 0th element into it. 
+ */ + outer_iterator nb = bucket.emplace_before(std::move(k), _tree.less(), std::forward(args)...); + return iterator(nb, 0); + } + + /* + * Key conflict, need to expand some inner vector, but still there + * are two cases -- whether the bounding element is on k's bucket + * or the bound search overflew and switched to the next one. + */ + + int idx = i._idx; + + if (hint.key_tail) { + /* + * The latter case -- i points to the next one. Need to shift + * back and append the new element to its tail. + */ + bucket--; + idx = bucket->index_of(bucket->end()); + } + + size_t new_size = (bucket->size() + 1) * sizeof(T); + bucket.reconstruct(new_size, *bucket, + typename inner_array::grow_tag{idx}, std::forward(args)...); + return iterator(bucket, idx); + } + + template + SEASTAR_CONCEPT( requires Comparable ) + const_iterator find(const K& key, Compare cmp) const { + outer_const_iterator bkt = _tree.find(key); + int idx = 0; + + if (bkt != _tree.end()) { + bool match = false; + idx = bkt->index_of(bkt->lower_bound(key, cmp, match)); + if (!match) { + bkt = _tree.end(); + idx = 0; + } + } + + return const_iterator(bkt, idx); + } + + template + SEASTAR_CONCEPT( requires Comparable ) + iterator find(const K& k, Compare cmp) { + return iterator(const_cast(this)->find(k, std::move(cmp))); + } + + template + SEASTAR_CONCEPT( requires Comparable ) + const_iterator lower_bound(const K& key, Compare cmp, bound_hint& hint) const { + outer_const_iterator bkt = _tree.lower_bound(key, hint.key_match); + + hint.key_tail = false; + hint.match = false; + + if (bkt == _tree.end() || !hint.key_match) { + return const_iterator(bkt, 0); + } + + int i = bkt->index_of(bkt->lower_bound(key, cmp, hint.match)); + + if (i != 0 && (*bkt)[i - 1].is_tail()) { + /* + * The lower_bound is after the last element -- shift + * to the net bucket's 0'th one. 
+ */ + bkt++; + i = 0; + hint.key_tail = true; + } + + return const_iterator(bkt, i); + } + + template + SEASTAR_CONCEPT( requires Comparable ) + iterator lower_bound(const K& key, Compare cmp, bound_hint& hint) { + return iterator(const_cast(this)->lower_bound(key, std::move(cmp), hint)); + } + + template + SEASTAR_CONCEPT( requires Comparable ) + const_iterator lower_bound(const K& key, Compare cmp) const { + bound_hint hint; + return lower_bound(key, cmp, hint); + } + + template + SEASTAR_CONCEPT( requires Comparable ) + iterator lower_bound(const K& key, Compare cmp) { + return iterator(const_cast(this)->lower_bound(key, std::move(cmp))); + } + + template + SEASTAR_CONCEPT( requires Comparable ) + const_iterator upper_bound(const K& key, Compare cmp) const { + bool key_match; + outer_const_iterator bkt = _tree.lower_bound(key, key_match); + + if (bkt == _tree.end() || !key_match) { + return const_iterator(bkt, 0); + } + + int i = bkt->index_of(bkt->upper_bound(key, cmp)); + + if (i != 0 && (*bkt)[i - 1].is_tail()) { + // Beyond the end() boundary + bkt++; + i = 0; + } + + return const_iterator(bkt, i); + } + + template + SEASTAR_CONCEPT( requires Comparable ) + iterator upper_bound(const K& key, Compare cmp) { + return iterator(const_cast(this)->upper_bound(key, std::move(cmp))); + } + + template + SEASTAR_CONCEPT(requires Disposer) + void clear_and_dispose(Func&& disp) noexcept { + _tree.clear_and_dispose([&disp] (inner_array* arr) noexcept { + arr->for_each(disp); + }); + } + + void clear() noexcept { clear_and_dispose(bplus::default_dispose); } + + template + SEASTAR_CONCEPT(requires Disposer) + iterator erase_and_dispose(iterator begin, iterator end, Func&& disp) noexcept { + bool same_bucket = begin._bucket == end._bucket; + + // Drop the tail of the starting bucket if it's not fully erased + while (begin._idx != 0) { + if (same_bucket) { + if (begin == end) { + return begin; + } + end._idx--; + } + + begin = begin.erase_and_dispose(_tree.less(), disp); + 
} + + // Drop all the buckets in between + outer_iterator nb = _tree.erase_and_dispose(begin._bucket, end._bucket, [&disp] (inner_array* arr) noexcept { + arr->for_each(disp); + }); + + assert(nb == end._bucket); + + /* + * Drop the head of the ending bucket. Every erased element is the 0th + * one, when erased it will shift the rest left and reconstruct the array, + * thus we cannot rely on the end to keep either _bucket or _idx. + * + * Said that -- just erase the required number of elements. A corner case + * when end points to the tree end is handled, _idx is 0 in this case. + */ + iterator next(nb, 0); + while (end._idx-- != 0) { + next = next.erase_and_dispose(_tree.less(), disp); + } + + return next; + } + + iterator erase(iterator begin, iterator end) noexcept { + return erase_and_dispose(begin, end, bplus::default_dispose); + } + + bool empty() const noexcept { return _tree.empty(); } + + static size_t estimated_object_memory_size_in_allocator(allocation_strategy& allocator, const T* obj) noexcept { + /* + * The T-s are merged together in array, so getting any run-time + * value of a pointer would be wrong. So here's some guessing of + * how much memory this thing would occupy + */ + return sizeof(typename outer_tree::data); + } +}; diff --git a/utils/intrusive-array.hh b/utils/intrusive-array.hh new file mode 100644 index 0000000000..d198203b83 --- /dev/null +++ b/utils/intrusive-array.hh @@ -0,0 +1,354 @@ +/* + * Copyright (C) 2020 ScyllaDB + */ + +/* + * This file is part of Scylla. + * + * Scylla is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * Scylla is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+// NOTE(review): the targets of the three #include lines below, as well as
+// the template parameter lists and other angle-bracketed arguments further
+// down, appear to be missing in this copy. Restore them from the upstream
+// file before relying on this text.
+#include
+#include
+#include
+
+#include "utils/allocation_strategy.hh"
+#include "utils/collection-concepts.hh"
+
+// Requirements on the element type: it must expose noexcept getters and
+// setters for three per-element flags -- head, tail and train. The expected
+// return types (std::same_as arguments) are not visible in this copy.
+SEASTAR_CONCEPT(
+    template
+    concept BoundsKeeper = requires (T val, bool bit) {
+        { val.is_head() } noexcept -> std::same_as;
+        { val.set_head(bit) } noexcept -> std::same_as;
+        { val.is_tail() } noexcept -> std::same_as;
+        { val.set_tail(bit) } noexcept -> std::same_as;
+        { val.with_train() } noexcept -> std::same_as;
+        { val.set_train(bit) } noexcept -> std::same_as;
+    };
+)
+
+/*
+ * A plain array of T-s that grows and shrinks by constructing new
+ * instances. Holds at least one element. Has facilities for sorting
+ * the elements and for doing "container_of" by the given element
+ * pointer. LSA-compactable.
+ *
+ * Important feature of the array is zero memory overhead -- it doesn't
+ * keep its size/capacity onboard. The size is calculated each time by
+ * walking the array of T-s and checking which one of them is the tail
+ * element. Respectively, the T must keep head/tail flags on itself.
+ */
+template
+SEASTAR_CONCEPT( requires BoundsKeeper && std::is_nothrow_move_constructible_v )
+class intrusive_array {
+    // Sanity constant to avoid infinite loops searching for tail
+    // NOTE(review): the type argument of std::numeric_limits is missing in
+    // this copy, so the actual value of max_len cannot be told from here.
+    static constexpr int max_len = std::numeric_limits::max();
+
+    // A slot of the array. Empty constructor and destructor: the T inside is
+    // constructed and destroyed explicitly by the array code.
+    union maybe_constructed {
+        maybe_constructed() { }
+        ~maybe_constructed() { }
+        T object;
+
+        /*
+         * Train is 1 or more allocated but unoccupied memory slots after
+         * the tail one. Being unused, this memory keeps the train length.
+         * An array with the train is marked with the respective flag on
+         * the 0th element.  Train is created by the erase() call and can
+         * be up to 65535 elements long
+         * (NOTE(review): erase() asserts train_len < max_len before
+         * incrementing it -- confirm the 65535 figure against max_len)
+         *
+         * Train length is included into the storage_size() to make
+         * allocator and compaction work correctly, but is not included
+         * into the number_of_elements(), so the array behaves just like
+         * there's no train
+         *
+         * Respectively both grow and shrink constructors do not carry
+         * the train (and drop the bit from 0th element) and don't expect
+         * the memory for the new array to include one
+         */
+        unsigned short train_len;
+        // The train length is kept in an unoccupied slot, so it must fit
+        static_assert(sizeof(T) >= sizeof(unsigned short));
+    };
+
+    // Declared as one slot; the code indexes past it, relying on the caller
+    // having provided enough memory (see the constructors' comment below)
+    maybe_constructed _data[1];
+
+    // Walks the slots from the 0th one until an element reports is_tail()
+    // and returns its index plus one. Aborts the process if no tail is met
+    // within max_len slots.
+    size_t number_of_elements() const noexcept {
+        for (int i = 0; i < max_len; i++) {
+            if (_data[i].object.is_tail()) {
+                return i + 1;
+            }
+        }
+
+        std::abort();
+    }
+
+    // Size in bytes of the occupied slots plus, when the 0th element has
+    // the train flag, the train whose length is read from the slot right
+    // after the tail.
+    size_t storage_size() const noexcept {
+        size_t nr = number_of_elements();
+        if (_data[0].object.with_train()) {
+            nr += _data[nr].train_len;
+        }
+        return nr * sizeof(T);
+    }
+
+public:
+    using iterator = T*;
+    using const_iterator = const T*;
+
+    /*
+     * There are 3 constructing options for the array: initial, grow
+     * and shrink.
+     *
+     * * initial just creates a 1-element array
+     * * grow -- makes a new one moving all elements from the original
+     *   array and inserting the one (only one) more element at the given
+     *   position
+     * * shrink -- also makes a new array skipping the not needed
+     *   element while moving them from the original one
+     *
+     * In all cases a big enough memory chunk must be provided by the
+     * caller!
+     *
+     * Note, that none of them calls destructors on T-s, unlike vector.
+     * This is because when the older array is destroyed it has no idea
+     * about whether or not it was grown/shrunk and thus it destroys
+     * T-s itself.
+     */
+
+    // Initial
+    // Constructs the only element from @args in the 0th slot and marks it
+    // as both head and tail.
+    template
+    intrusive_array(Args&&... args) {
+        new (&_data[0].object) T(std::forward(args)...);
+        _data[0].object.set_head(true);
+        _data[0].object.set_tail(true);
+    }
+
+    // Growing
+    struct grow_tag {
+        int add_pos;
+    };
+
+    /*
+     * Moves the elements of @from into this array leaving the slot at
+     * grow.add_pos free, then constructs the new element from @args there.
+     * Once the add_pos slot has been skipped, `off` is 1 and the source
+     * index lags one step behind the destination one.
+     */
+    template
+    intrusive_array(intrusive_array& from, grow_tag grow, Args&&... args) {
+        // The add_pos is strongly _expected_ to be within bounds
+        int i, off = 0;
+        bool tail = false;
+
+        for (i = 0; !tail; i++) {
+            if (i == grow.add_pos) {
+                off = 1;
+                continue;
+            }
+
+            // The flag is read before the source element is moved from
+            tail = from._data[i - off].object.is_tail();
+            new (&_data[i].object) T(std::move(from._data[i - off].object));
+        }
+
+        assert(grow.add_pos <= i && i < max_len);
+
+        new (&_data[grow.add_pos].object) T(std::forward(args)...);
+
+        // Re-establish the boundary flags; the new array has no train
+        _data[0].object.set_head(true);
+        _data[0].object.set_train(false);
+        if (grow.add_pos == 0) {
+            // The old head was moved into slot 1
+            _data[1].object.set_head(false);
+        }
+        _data[i - off].object.set_tail(true);
+        if (off == 0) {
+            // The loop never met add_pos, i.e. the new element went
+            // right after the old tail, which is slot i - 1
+            _data[i - 1].object.set_tail(false);
+        }
+    }
+
+    // Shrinking
+    struct shrink_tag {
+        int del_pos;
+    };
+
+    /*
+     * Moves the elements of @from into this array skipping the one at
+     * shrink.del_pos; elements after it land one slot to the left.
+     * NOTE(review): nothing here guards against @from having one element
+     * only -- presumably the callers never do that, confirm.
+     */
+    intrusive_array(intrusive_array& from, shrink_tag shrink) {
+        int i, off = 0;
+        bool tail = false;
+
+        for (i = 0; !tail; i++) {
+            // The flag is read before the source element is moved from
+            tail = from._data[i].object.is_tail();
+            if (i == shrink.del_pos) {
+                off = 1;
+            } else {
+                new (&_data[i - off].object) T(std::move(from._data[i].object));
+            }
+        }
+
+        // Re-establish the boundary flags; the new array has no train
+        _data[0].object.set_head(true);
+        _data[0].object.set_train(false);
+        _data[i - off - 1].object.set_tail(true);
+    }
+
+    intrusive_array(const intrusive_array& other) = delete;
+    // Move-constructs every element from @other and, if the moved 0th
+    // element carries the train flag, copies the train length kept in the
+    // slot right after the tail.
+    intrusive_array(intrusive_array&& other) noexcept {
+        bool tail = false;
+        int i;
+
+        for (i = 0; !tail; i++) {
+            tail = other._data[i].object.is_tail();
+
+            new (&_data[i].object) T(std::move(other._data[i].object));
+        }
+
+        if (_data[0].object.with_train()) {
+            _data[i].train_len = other._data[i].train_len;
+        }
+    }
+
+    // Destroys the elements from the 0th up to and including the tail one.
+    // The tail flag is read before the element is destroyed.
+    ~intrusive_array() {
+        bool tail = false;
+
+        for (int i = 0; !tail; i++) {
+            tail = _data[i].object.is_tail();
+            _data[i].object.~T();
+        }
+    }
+
+    /*
+     * Drops the element in-place at position @pos and grows the
+     * "train". To be used in places where reconstruction is not
+     * welcome (e.g. because it throws)
+     *
+     * Single-elemented array cannot be erased from, just drop it
+     * altogether if needed
+     */
+    void erase(int pos) noexcept {
+        assert(!is_single_element());
+        assert(pos < max_len);
+
+        // Both flags are sampled before the element is destroyed
+        bool with_train = _data[0].object.with_train();
+        bool tail = _data[pos].object.is_tail();
+        _data[pos].object.~T();
+
+        if (tail) {
+            // Erasing the last element: its predecessor becomes the tail
+            assert(pos > 0);
+            _data[pos - 1].object.set_tail(true);
+        } else {
+            // Shift the rest one slot left. The moved tail element carries
+            // the loop-ending flag; pos ends up on the vacated last slot
+            while (!tail) {
+                new (&_data[pos].object) T(std::move(_data[pos + 1].object));
+                _data[pos + 1].object.~T();
+                tail = _data[pos++].object.is_tail();
+            }
+            _data[0].object.set_head(true);
+        }
+
+        // Here pos is the first unoccupied slot after the new tail. The
+        // old train length, if any, was kept one slot further.
+        _data[0].object.set_train(true);
+        unsigned short train_len = with_train ? _data[pos + 1].train_len : 0;
+        assert(train_len < max_len);
+        _data[pos].train_len = train_len + 1;
+    }
+
+    // Unchecked element access
+    T& operator[](int pos) noexcept { return _data[pos].object; }
+    const T& operator[](int pos) const noexcept { return _data[pos].object; }
+
+    // Iterators are plain pointers to T. Each end() flavour calls
+    // number_of_elements(), i.e. walks the array to find the tail.
+    iterator begin() noexcept { return &_data[0].object; }
+    const_iterator begin() const noexcept { return &_data[0].object; }
+    const_iterator cbegin() const noexcept { return &_data[0].object; }
+    iterator end() noexcept { return &_data[number_of_elements()].object; }
+    const_iterator end() const noexcept { return &_data[number_of_elements()].object; }
+    const_iterator cend() const noexcept { return &_data[number_of_elements()].object; }
+
+    // Distance of the pointed element from the 0th one
+    size_t index_of(iterator i) const noexcept { return i - &_data[0].object; }
+    size_t index_of(const_iterator i) const noexcept { return i - &_data[0].object; }
+    // The 0th element being the tail means there is nothing else
+    bool is_single_element() const noexcept { return _data[0].object.is_tail(); }
+
+    // A helper for keeping the array sorted
+    /*
+     * Linear scan from the 0th slot. Stops on the first element for which
+     * cmp(element, val) is >= 0 and sets @match to whether it was exactly 0.
+     * If there's no such element the pointer right after the tail is
+     * returned and @match is NOT written -- the caller must initialize it.
+     */
+    template
+    SEASTAR_CONCEPT( requires Comparable )
+    const_iterator lower_bound(const K& val, Compare cmp, bool& match) const {
+        int i = 0;
+
+        do {
+            int x = cmp(_data[i].object, val);
+            if (x >= 0) {
+                match = (x == 0);
+                break;
+            }
+        } while (!_data[i++].object.is_tail());
+
+        return &_data[i].object;
+    }
+
+    // Mutable flavour: forwards through a const_cast of `this` (presumably
+    // to the const overload -- the cast targets are not visible here) and
+    // casts the result back.
+    template
+    SEASTAR_CONCEPT( requires Comparable )
+    iterator lower_bound(const K& val, Compare cmp, bool& match) {
+        return const_cast(const_cast(this)->lower_bound(val, std::move(cmp), match));
+    }
+
+    // lower_bound() for callers that don't need the match flag; a local
+    // one, initialized to false, is passed down and dropped.
+    template
+    SEASTAR_CONCEPT( requires Comparable )
+    const_iterator lower_bound(const K& val, Compare cmp) const {
+        bool match = false;
+        return lower_bound(val, cmp, match);
+    }
+
+    // Mutable flavour of the above, same forwarding scheme
+    template
+    SEASTAR_CONCEPT( requires Comparable )
+    iterator lower_bound(const K& val, Compare cmp) {
+        return const_cast(const_cast(this)->lower_bound(val, std::move(cmp)));
+    }
+
+    // And its peer ... just to be used
+    // Linear scan that stops on the first element for which
+    // cmp(element, val) is > 0, or right after the tail if there's none.
+    template
+    SEASTAR_CONCEPT( requires Comparable )
+    const_iterator upper_bound(const K& val, Compare cmp) const {
+        int i = 0;
+
+        do {
+            if (cmp(_data[i].object, val) > 0) {
+                break;
+            }
+        } while (!_data[i++].object.is_tail());
+
+        return &_data[i].object;
+    }
+
+    // Mutable flavour of upper_bound(), same forwarding scheme
+    template
+    SEASTAR_CONCEPT( requires Comparable )
+    iterator upper_bound(const K& val, Compare cmp) {
+        return const_cast(const_cast(this)->upper_bound(val, std::move(cmp)));
+    }
+
+    // Calls @fn with a pointer to every element, from the 0th to the tail
+    // one. The tail flag is read before @fn gets the element.
+    template
+    SEASTAR_CONCEPT(requires Disposer)
+    void for_each(Func&& fn) noexcept {
+        bool tail = false;
+
+        for (int i = 0; !tail; i++) {
+            tail = _data[i].object.is_tail();
+            fn(&_data[i].object);
+        }
+    }
+
+    // Number of elements; found by walking the array, see number_of_elements()
+    size_t size() const noexcept { return number_of_elements(); }
+
+    // Reports storage_size(), i.e. the train is accounted for
+    friend size_t size_for_allocation_strategy(const intrusive_array& obj) noexcept {
+        return obj.storage_size();
+    }
+
+    /*
+     * The "container_of" helper: steps back from @ptr until the element
+     * with the head flag is met and returns the array starting there.
+     * The number of steps made, i.e. the index of the original element,
+     * is put into @idx.
+     */
+    static intrusive_array& from_element(T* ptr, int& idx) noexcept {
+        idx = 0;
+        while (!ptr->is_head()) {
+            assert(idx < max_len); // may the force be with us...
+            idx++;
+            ptr--;
+        }
+
+        // The cast below is only meaningful while the 0th element sits at
+        // the very beginning of the array object
+        static_assert(offsetof(intrusive_array, _data[0].object) == 0);
+        return *reinterpret_cast(ptr);
+    }
+};
diff --git a/utils/neat-object-id.hh b/utils/neat-object-id.hh
new file mode 100644
index 0000000000..ed9ab9a6cc
--- /dev/null
+++ b/utils/neat-object-id.hh
@@ -0,0 +1,53 @@
+/*
+ * Copyright (C) 2020 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+// NOTE(review): the include target is missing in this copy; the code below
+// uses std::atomic.
+#include
+
+namespace utils {
+
+/*
+ * The neat_id class is purely a debugging thing -- when reading
+ * the logs with object IDs in it it's more handy to look at those
+ * consisting of 1-3 digits, rather than 16 hex-digits of a printed
+ * pointer.
+ *
+ * Embed with [[no_unique_address]] tag for memory efficiency
+ */
+// Generic flavour: keeps no state, the ID is derived from the object's own
+// address. The template parameter and the cast target are not visible in
+// this copy.
+template
+struct neat_id {
+    unsigned int operator()() const noexcept { return reinterpret_cast(this); }
+};
+
+// Specialization (its argument is not visible in this copy): the ID is taken
+// at construction time from a function-local atomic counter starting at 1.
+template <>
+struct neat_id {
+    unsigned int _id;
+    static unsigned int _next() noexcept {
+        static std::atomic rover {1};
+        return rover.fetch_add(1);
+    }
+
+    neat_id() noexcept : _id(_next()) {}
+    unsigned int operator()() const noexcept { return _id; }
+};
+
+} // namespace