From 9d38846ed28093f33bd17ae33fbc896863e1c664 Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov <xemul@scylladb.com>
Date: Wed, 6 May 2020 16:39:29 +0300
Subject: [PATCH 01/11] test: Move perf measurement helpers into header

To use the code in new perf tests in next patches.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
---
 test/perf/perf.hh                  | 71 ++++++++++++++++++++++++++++++
 test/perf/perf_row_cache_update.cc | 71 ------------------------------
 2 files changed, 71 insertions(+), 71 deletions(-)
diff --git a/test/perf/perf.hh b/test/perf/perf.hh
index 9de2b25410..e73ac859ae 100644
--- a/test/perf/perf.hh
+++ b/test/perf/perf.hh
@@ -24,7 +24,10 @@
 #include <seastar/core/print.hh>
 #include <seastar/core/future-util.hh>
 #include <seastar/core/distributed.hh>
+#include <seastar/core/weak_ptr.hh>
 #include "seastarx.hh"
+#include "utils/extremum_tracking.hh"
+#include "utils/estimated_histogram.hh"
 
 #include <chrono>
 #include <iosfwd>
@@ -126,3 +129,71 @@ std::vector<double> time_parallel(Func func, unsigned concurrency_per_core, int
     }
     return results;
 }
+
+template<typename Func> // Func: any callable that can be invoked with no arguments
+auto duration_in_seconds(Func&& f) { // invokes f() once; returns the elapsed time as std::chrono::duration<float> (seconds)
+    using clk = std::chrono::steady_clock;
+    auto start = clk::now();
+    f(); // the measured work; its return value (if any) is discarded
+    auto end = clk::now();
+    return std::chrono::duration_cast<std::chrono::duration<float>>(end - start);
+}
+
+class scheduling_latency_measurer : public weakly_referencable<scheduling_latency_measurer> { // samples the delay between consecutive runs of a self-rescheduling task
+    using clk = std::chrono::steady_clock;
+    clk::time_point _last = clk::now(); // time of the previous tick; initially the construction time, not the start() time
+    utils::estimated_histogram _hist{300}; // per-tick latencies as raw clk::duration counts; 300 is presumably the bucket count -- TODO confirm
+    min_max_tracker<clk::duration> _minmax; // smallest and largest latency seen so far
+    bool _stop = false; // once set, tick() no longer reschedules itself
+private:
+    void schedule_tick(); // defined out of class, after the class body
+    void tick() {
+        auto old = _last;
+        _last = clk::now();
+        auto latency = _last - old; // time elapsed since the previous tick (or since construction for the first one)
+        _minmax.update(latency);
+        _hist.add(latency.count());
+        if (!_stop) {
+            schedule_tick();
+        }
+    }
+public:
+    void start() {
+        schedule_tick();
+    }
+    void stop() { // NOTE(review): blocks on a future via .get(); presumably must run inside a seastar thread -- confirm
+        _stop = true;
+        later().get(); // so that the last scheduled tick is counted
+    }
+    const utils::estimated_histogram& histogram() const {
+        return _hist;
+    }
+    clk::duration min() const { return _minmax.min(); }
+    clk::duration max() const { return _minmax.max(); }
+};
+
+inline void scheduling_latency_measurer::schedule_tick() { // 'inline': header-defined, must not violate the ODR when perf.hh is included from several TUs
+    seastar::schedule(make_task(default_scheduling_group(), [self = weak_from_this()] () mutable {
+        if (self) { // skip the tick if the measurer was destroyed before the task ran
+            self->tick();
+        }
+    }));
+}
+
+inline std::ostream& operator<<(std::ostream& out, const scheduling_latency_measurer& slm) { // 'inline': header-defined, keeps the ODR intact across TUs
+    auto to_ms = [] (int64_t nanos) { // assumes the values are nanosecond counts -- TODO confirm steady_clock's period on the target
+        return float(nanos) / 1e6;
+    };
+    return out << sprint("{count: %d, "
+                         //"min: %.6f [ms], "
+                         //"50%%: %.6f [ms], "
+                         //"90%%: %.6f [ms], "
+                         "99%%: %.6f [ms], "
+                         "max: %.6f [ms]}",
+        slm.histogram().count(),
+        //to_ms(slm.min().count()),
+        //to_ms(slm.histogram().percentile(0.5)),
+        //to_ms(slm.histogram().percentile(0.9)),
+        to_ms(slm.histogram().percentile(0.99)),
+        to_ms(slm.max().count()));
+}
diff --git a/test/perf/perf_row_cache_update.cc b/test/perf/perf_row_cache_update.cc
index e4d37fad12..ad4d92115b 100644
--- a/test/perf/perf_row_cache_update.cc
+++ b/test/perf/perf_row_cache_update.cc
@@ -19,16 +19,13 @@
  * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
  */
 
-#include <chrono>
 #include <seastar/core/distributed.hh>
 #include <seastar/core/app-template.hh>
 #include <seastar/core/sstring.hh>
 #include <seastar/core/thread.hh>
-#include <seastar/core/weak_ptr.hh>
 #include <seastar/core/reactor.hh>
 
 #include "utils/managed_bytes.hh"
-#include "utils/extremum_tracking.hh"
 #include "utils/logalloc.hh"
 #include "row_cache.hh"
 #include "log.hh"
@@ -41,74 +38,6 @@ static const int update_iterations = 16;
 static const int cell_size = 128;
 static bool cancelled = false;
 
-template<typename Func>
-auto duration_in_seconds(Func&& f) {
-    using clk = std::chrono::steady_clock;
-    auto start = clk::now();
-    f();
-    auto end = clk::now();
-    return std::chrono::duration_cast<std::chrono::duration<float>>(end - start);
-}
-
-class scheduling_latency_measurer : public weakly_referencable<scheduling_latency_measurer> {
-    using clk = std::chrono::steady_clock;
-    clk::time_point _last = clk::now();
-    utils::estimated_histogram _hist{300};
-    min_max_tracker<clk::duration> _minmax;
-    bool _stop = false;
-private:
-    void schedule_tick();
-    void tick() {
-        auto old = _last;
-        _last = clk::now();
-        auto latency = _last - old;
-        _minmax.update(latency);
-        _hist.add(latency.count());
-        if (!_stop) {
-            schedule_tick();
-        }
-    }
-public:
-    void start() {
-        schedule_tick();
-    }
-    void stop() {
-        _stop = true;
-        later().get(); // so that the last scheduled tick is counted
-    }
-    const utils::estimated_histogram& histogram() const {
-        return _hist;
-    }
-    clk::duration min() const { return _minmax.min(); }
-    clk::duration max() const { return _minmax.max(); }
-};
-
-void scheduling_latency_measurer::schedule_tick() {
-    seastar::schedule(make_task(default_scheduling_group(), [self = weak_from_this()] () mutable {
-        if (self) {
-            self->tick();
-        }
-    }));
-}
-
-std::ostream& operator<<(std::ostream& out, const scheduling_latency_measurer& slm) {
-    auto to_ms = [] (int64_t nanos) {
-        return float(nanos) / 1e6;
-    };
-    return out << sprint("{count: %d, "
-                         //"min: %.6f [ms], "
-                         //"50%%: %.6f [ms], "
-                         //"90%%: %.6f [ms], "
-                         "99%%: %.6f [ms], "
-                         "max: %.6f [ms]}",
-        slm.histogram().count(),
-        //to_ms(slm.min().count()),
-        //to_ms(slm.histogram().percentile(0.5)),
-        //to_ms(slm.histogram().percentile(0.9)),
-        to_ms(slm.histogram().percentile(0.99)),
-        to_ms(slm.max().count()));
-}
-
 template<typename MutationGenerator>
 void run_test(const sstring& name, schema_ptr s, MutationGenerator&& gen) {
     cache_tracker tracker;

From 95f15ea383b97330e158875d0cdfdf21b309212e Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov <xemul@scylladb.com>
Date: Wed, 1 Apr 2020 17:26:20 +0300
Subject: [PATCH 02/11] utils: B+ tree implementation

// The story is at
// https://groups.google.com/forum/#!msg/scylladb-dev/sxqTHM9rSDQ/WqwF1AQDAQAJ

This is the B+ version which satisfies several specific requirements
to be suitable for row-cache usage.

1. Insert/Remove doesn't invalidate iterators
2. Elements should be LSA-compactable
3. Low overhead of data nodes (1 pointer)
4. External less-only comparator
5. As few actions on insert/delete as possible
6. Iterator walks the sorted keys

The design, briefly is:

There are 3 types of nodes: inner, leaf and data. Inner and leaf nodes
keep a built-in array of N keys and N(+1) nodes. Leaf nodes sit in
a doubly linked list. Data nodes live separately from the leaf ones
and keep pointers to them. Tree handler keeps pointers to the root and
the left-most and right-most leaves. Nodes do _not_ keep pointers or
references to the tree (except 3 of them, see below).

changes in v9:

- explicitly marked keys/kids indices with type aliases
- marked the whole erase/clear stuff noexcept
- disposers now accept object pointer instead of reference
- clear tree in destructor
- added more comments
- style/readability review comments fixed

Prior changes

**
- Add noexcepts where possible
- Restrict Less-comparator constraint -- it must be noexcept
- Generalized node_id
- Packed code for begin()/cbegin()

**
- Unsigned indices everywhere
- Cosmetics changes

**
- Const iterators
- C++20 concepts

**
- The index_for() implementation is templatized the other way
  to make it possible for AVX key search specialization (further
  patching)

**
- Insertion tries to push kids to siblings before split

  Before this change insertion into full node resulted into this
  node being split into two equal parts. This behaviour for random
  keys stress gives a tree with ~2/3 of nodes half-filled.

  With this change before splitting the full node try to push one
  element to each of the siblings (if they exist and not full).
  This slows the insertion a bit (but it's still way faster than
  the std::set), but gives 15% less total number of nodes.

- Iterator method to reconstruct the data at the given position

  The helper creates a new data node, emplaces data into it and
  replaces the iterator's one with it. Needed to keep arrays of
  data in tree.

- Milli-optimize erase()
  - Return back an iterator that will likely be not re-validated
  - Do not try to update ancestors separation key for leftmost kid

  This caused clear()-like workloads to perform poorly compared to
  std::set. In particular, the row_cache::invalidate() method does
  exactly this, and this change improves its timing.

- Perf test to measure drain speed
- Helper call to collect tree counters

**
- Fix corner case of iterator.emplace_before()
- Clean heterogeneous lookup API
- Handle exceptions from nodes allocations
- Explicitly mark places where the key is copied (for future)
- Extend the tree.lower_bound() API to report back whether
  the bound hit the key or not
- Addressed style/cleanness review comments

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
---
 configure.py                        |    8 +
 test/boost/bptree_test.cc           |  332 +++++
 test/perf/perf_bptree.cc            |  240 ++++
 test/unit/bptree_compaction_test.cc |  207 +++
 test/unit/bptree_stress_test.cc     |  232 ++++
 test/unit/bptree_validation.hh      |  318 +++++
 test/unit/tree_test_key.hh          |  101 ++
 utils/bptree.hh                     | 1941 +++++++++++++++++++++++++++
 utils/collection-concepts.hh        |   42 +
 utils/neat-object-id.hh             |   53 +
 10 files changed, 3474 insertions(+)
 create mode 100644 test/boost/bptree_test.cc
 create mode 100644 test/perf/perf_bptree.cc
 create mode 100644 test/unit/bptree_compaction_test.cc
 create mode 100644 test/unit/bptree_stress_test.cc
 create mode 100644 test/unit/bptree_validation.hh
 create mode 100644 test/unit/tree_test_key.hh
 create mode 100644 utils/bptree.hh
 create mode 100644 utils/collection-concepts.hh
 create mode 100644 utils/neat-object-id.hh

diff --git a/configure.py b/configure.py
index 80d94466f3..fdd15d99e5 100755
--- a/configure.py
+++ b/configure.py
@@ -388,6 +388,7 @@ scylla_tests = set([
     'test/boost/view_schema_ckey_test',
     'test/boost/vint_serialization_test',
     'test/boost/virtual_reader_test',
+    'test/boost/bptree_test',
     'test/manual/ec2_snitch_test',
     'test/manual/gce_snitch_test',
     'test/manual/gossip',
@@ -404,6 +405,7 @@ scylla_tests = set([
     'test/perf/perf_fast_forward',
     'test/perf/perf_hash',
     'test/perf/perf_mutation',
+    'test/perf/perf_bptree',
     'test/perf/perf_row_cache_update',
     'test/perf/perf_simple_query',
     'test/perf/perf_sstable',
@@ -411,6 +413,8 @@ scylla_tests = set([
     'test/unit/lsa_sync_eviction_test',
     'test/unit/row_cache_alloc_stress_test',
     'test/unit/row_cache_stress_test',
+    'test/unit/bptree_stress_test',
+    'test/unit/bptree_compaction_test',
 ])
 
 perf_tests = set([
@@ -958,6 +962,7 @@ pure_boost_tests = set([
     'test/boost/small_vector_test',
     'test/boost/top_k_test',
     'test/boost/vint_serialization_test',
+    'test/boost/bptree_test',
     'test/manual/streaming_histogram_test',
 ])
 
@@ -971,10 +976,13 @@ tests_not_using_seastar_test_framework = set([
     'test/perf/perf_cql_parser',
     'test/perf/perf_hash',
     'test/perf/perf_mutation',
+    'test/perf/perf_bptree',
     'test/perf/perf_row_cache_update',
     'test/unit/lsa_async_eviction_test',
     'test/unit/lsa_sync_eviction_test',
     'test/unit/row_cache_alloc_stress_test',
+    'test/unit/bptree_stress_test',
+    'test/unit/bptree_compaction_test',
     'test/manual/sstable_scan_footprint_test',
 ]) | pure_boost_tests
 
diff --git a/test/boost/bptree_test.cc b/test/boost/bptree_test.cc
new file mode 100644
index 0000000000..e398cbfe6e
--- /dev/null
+++ b/test/boost/bptree_test.cc
@@ -0,0 +1,332 @@
+
+/*
+ * Copyright (C) 2020 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#define BOOST_TEST_MODULE bptree
+
+#include <boost/test/unit_test.hpp>
+#include <fmt/core.h>
+
+#include "utils/bptree.hh"
+#include "test/unit/tree_test_key.hh"
+
+struct int_compare {
+    bool operator()(const int& a, const int& b) const noexcept { return a < b; }
+};
+
+using namespace bplus;
+using test_key = tree_test_key_base;
+using test_tree = tree<int, unsigned long, int_compare, 4, key_search::both, with_debug::yes>;
+
+BOOST_AUTO_TEST_CASE(test_ops_empty_tree) {
+    /* Sanity checks for no nullptr dereferences */
+    test_tree t(int_compare{});
+    t.erase(1);
+    t.find(1);
+}
+
+BOOST_AUTO_TEST_CASE(test_double_insert) {
+    /* No assertions should happen in ~tree */
+    test_tree t(int_compare{});
+    auto i = t.emplace(1, 1);
+    BOOST_REQUIRE(i.second);
+    i = t.emplace(1, 1);
+    BOOST_REQUIRE(!i.second);
+    t.erase(1);
+}
+
+BOOST_AUTO_TEST_CASE(test_cookie_find) {
+    struct int_to_key_compare {
+        bool operator()(const test_key& a, const int& b) const noexcept { return (int)a < b; }
+        bool operator()(const int& a, const test_key& b) const noexcept { return a < (int)b; }
+        bool operator()(const test_key& a, const test_key& b) const noexcept {
+            test_key_compare cmp;
+            return cmp(a, b);
+        }
+    };
+
+    using test_tree = tree<test_key, int, int_to_key_compare, 4, key_search::both, with_debug::yes>;
+
+    test_tree t(int_to_key_compare{});
+    t.emplace(test_key{1}, 132);
+
+    auto i = t.find(1);
+    BOOST_REQUIRE(*i == 132);
+}
+
+BOOST_AUTO_TEST_CASE(test_double_erase) {
+    test_tree t(int_compare{});
+    t.emplace(1, 1);
+    t.emplace(2, 2);
+    auto i = t.erase(1);
+    BOOST_REQUIRE(*i == 2);
+    i = t.erase(1);
+    BOOST_REQUIRE(i == t.end());
+    i = t.erase(2);
+    BOOST_REQUIRE(i == t.end());
+    t.erase(2);
+}
+
+BOOST_AUTO_TEST_CASE(test_remove_corner_case) {
+    /* Sanity check for erasure to be precise */
+    test_tree t(int_compare{});
+    t.emplace(1, 1);
+    t.emplace(2, 123);
+    t.emplace(3, 3);
+    t.erase(1);
+    t.erase(3);
+    auto f = t.find(2);
+    BOOST_REQUIRE(*f == 123);
+    t.erase(2);
+}
+
+BOOST_AUTO_TEST_CASE(test_end_iterator) {
+    /* Check std::prev(end()) */
+    test_tree t(int_compare{});
+    t.emplace(1, 123);
+    auto i = std::prev(t.end());
+    BOOST_REQUIRE(*i == 123); /* compare, not assign: '=' would overwrite the value and always pass */
+    t.erase(1);
+}
+
+BOOST_AUTO_TEST_CASE(test_next_to_end_iterator) {
+    /* Same, but with "artificial" end iterator */
+    test_tree t(int_compare{});
+    auto i = t.emplace(1, 123).first;
+    i++;
+    BOOST_REQUIRE(i == t.end());
+    i--;
+    BOOST_REQUIRE(*i == 123); /* compare, not assign: '=' would overwrite the value and always pass */
+    t.erase(1);
+}
+
+BOOST_AUTO_TEST_CASE(test_clear) {
+    /* Quick check for tree::clear */
+    test_tree t(int_compare{});
+
+    for (int i = 0; i < 32; i++) {
+        t.emplace(i, i);
+    }
+
+    t.clear();
+}
+
+BOOST_AUTO_TEST_CASE(test_post_clear) {
+    /* Check that tree is work-able after clear */
+    test_tree t(int_compare{});
+
+    t.emplace(1, 1);
+    t.clear();
+    t.emplace(2, 2);
+    t.erase(2);
+}
+
+BOOST_AUTO_TEST_CASE(test_iterator_erase) {
+    /* Check iterator::erase */
+    test_tree t(int_compare{});
+    auto it = t.emplace(2, 2);
+    t.emplace(1, 321);
+    it.first.erase(int_compare{});
+    BOOST_REQUIRE(*t.find(1) == 321);
+    t.erase(1);
+}
+
+BOOST_AUTO_TEST_CASE(test_iterator_equal) {
+    test_tree t(int_compare{});
+    auto i1 = t.emplace(1, 1);
+    auto i2 = t.emplace(2, 2);
+    auto i3 = t.find(1);
+    BOOST_REQUIRE(i1.first == i3);
+    BOOST_REQUIRE(i1.first != i2.first);
+}
+
+BOOST_AUTO_TEST_CASE(test_lower_bound) {
+    test_tree t(int_compare{});
+    t.emplace(1, 11);
+    t.emplace(3, 13);
+
+    bool match;
+    BOOST_REQUIRE(*t.lower_bound(0, match) == 11 && !match);
+    BOOST_REQUIRE(*t.lower_bound(1, match) == 11 && match);
+    BOOST_REQUIRE(*t.lower_bound(2, match) == 13 && !match);
+    BOOST_REQUIRE(*t.lower_bound(3, match) == 13 && match);
+    BOOST_REQUIRE(t.lower_bound(4, match) == t.end() && !match);
+}
+
+BOOST_AUTO_TEST_CASE(test_upper_bound) {
+    test_tree t(int_compare{});
+    t.emplace(1, 11);
+    t.emplace(3, 13);
+
+    BOOST_REQUIRE(*t.upper_bound(0) == 11);
+    BOOST_REQUIRE(*t.upper_bound(1) == 13);
+    BOOST_REQUIRE(*t.upper_bound(2) == 13);
+    BOOST_REQUIRE(t.upper_bound(3) == t.end());
+    BOOST_REQUIRE(t.upper_bound(4) == t.end());
+}
+
+BOOST_AUTO_TEST_CASE(test_insert_iterator_index) {
+    /* Check insertion iterator ++ and duplicate key */
+    test_tree t(int_compare{});
+    t.emplace(1, 10);
+    t.emplace(3, 13);
+    auto i = t.emplace(2, 2).first;
+    i++;
+    BOOST_REQUIRE(*i == 13);
+    auto i2 = t.emplace(2, 2); /* 2nd insert finds the previous */
+    BOOST_REQUIRE(!i2.second);
+    i2.first++;
+    BOOST_REQUIRE(*(i2.first) == 13);
+}
+
+BOOST_AUTO_TEST_CASE(test_insert_before) {
+    /* Check iterator::emplace_before */
+    test_tree t(int_compare{});
+    auto i3 = t.emplace(3, 13).first;
+    auto i2 = i3.emplace_before(2, int_compare{}, 12); /* new element (key 2, value 12) placed right before i3 */
+    BOOST_REQUIRE(++i2 == i3);
+    BOOST_REQUIRE(*i3 == 13);
+    BOOST_REQUIRE(*--i2 == 12);
+    BOOST_REQUIRE(*--i3 == 12);
+}
+
+BOOST_AUTO_TEST_CASE(test_insert_before_end) {
+    /* The same but for end() iterator */
+    test_tree t(int_compare{});
+    auto i = t.emplace(1, 1).first;
+    auto i2 = t.end().emplace_before(2, int_compare{}, 12);
+    BOOST_REQUIRE(++i == i2);
+    BOOST_REQUIRE(++i2 == t.end());
+}
+
+BOOST_AUTO_TEST_CASE(test_insert_before_end_empty) {
+    /* The same, but for empty tree */
+    test_tree t(int_compare{});
+    auto i = t.end().emplace_before(42, int_compare{}, 142);
+    BOOST_REQUIRE(i == t.begin());
+    t.erase(42);
+}
+
+BOOST_AUTO_TEST_CASE(test_iterators) {
+    test_tree t(int_compare{});
+
+    for (auto i = t.rbegin(); i != t.rend(); i++) {
+        BOOST_REQUIRE(false);
+    }
+    for (auto i = t.begin(); i != t.end(); i++) {
+        BOOST_REQUIRE(false);
+    }
+
+    t.emplace(1, 7);
+    t.emplace(2, 9);
+
+    {
+        auto i = t.begin();
+        BOOST_REQUIRE(*(i++) == 7);
+        BOOST_REQUIRE(*(i++) == 9);
+        BOOST_REQUIRE(i == t.end());
+    }
+
+    {
+        auto i = t.rbegin();
+        BOOST_REQUIRE(*(i++) == 9);
+        BOOST_REQUIRE(*(i++) == 7);
+        BOOST_REQUIRE(i == t.rend());
+    }
+}
+
+/*
+ * Special test that makes sure "self-iterator" works OK.
+ * See comment near the bptree::iterator(T* d) constructor
+ * for details.
+ */
+class tree_data {
+    int _key;
+    int _cookie;
+public:
+    explicit tree_data(int cookie) : _key(-1), _cookie(cookie) {}
+    tree_data(int key, int cookie) : _key(key), _cookie(cookie) {}
+    int cookie() const { return _cookie; }
+    int key() const {
+        assert(_key != -1);
+        return _key;
+    }
+};
+
+BOOST_AUTO_TEST_CASE(test_data_self_iterator) {
+    using test_tree = tree<int, tree_data, int_compare, 4, key_search::both, with_debug::yes>;
+
+    test_tree t(int_compare{});
+    auto i = t.emplace(1, 42);
+    BOOST_REQUIRE(i.second);
+
+    tree_data* d = &(*i.first);
+    BOOST_REQUIRE(d->cookie() == 42);
+
+    test_tree::iterator di(d);
+    BOOST_REQUIRE(di->cookie() == 42);
+
+    di.erase(int_compare{});
+    BOOST_REQUIRE(t.find(1) == t.end());
+}
+
+BOOST_AUTO_TEST_CASE(test_insert_before_nokey) {
+    using test_tree = tree<int, tree_data, int_compare, 4, key_search::both, with_debug::yes>;
+
+    test_tree t(int_compare{});
+    auto i = t.emplace(2, 52).first;
+    auto ni = i.emplace_before(int_compare{}, 1, 42);
+    BOOST_REQUIRE(ni->cookie() == 42);
+    ni++;
+    BOOST_REQUIRE(ni == i);
+}
+
+
+BOOST_AUTO_TEST_CASE(test_self_iterator_rover) {
+    test_tree t(int_compare{});
+    auto i = t.emplace(2, 42).first;
+    unsigned long* d = &(*i);
+    test_tree::iterator di(d);
+
+    i = di.emplace_before(1, int_compare{}, 31);
+    BOOST_REQUIRE(*i == 31);
+    BOOST_REQUIRE(*(++i) == 42);
+    BOOST_REQUIRE(++i == t.end());
+    BOOST_REQUIRE(++di == t.end());
+}
+
+BOOST_AUTO_TEST_CASE(test_erase_range) {
+    /* Quick check for tree::erase(from, to) */
+    test_tree t(int_compare{});
+
+    for (int i = 0; i < 32; i++) {
+        t.emplace(i, i);
+    }
+
+    auto b = t.find(8);
+    auto e = t.find(25);
+    t.erase(b, e);
+
+    BOOST_REQUIRE(*t.find(7) == 7);
+    BOOST_REQUIRE(t.find(8) == t.end());
+    BOOST_REQUIRE(t.find(24) == t.end());
+    BOOST_REQUIRE(*t.find(25) == 25);
+}
diff --git a/test/perf/perf_bptree.cc b/test/perf/perf_bptree.cc
new file mode 100644
index 0000000000..51271da2ea
--- /dev/null
+++ b/test/perf/perf_bptree.cc
@@ -0,0 +1,240 @@
+/*
+ * Copyright (C) 2020 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <seastar/core/app-template.hh>
+#include <seastar/core/thread.hh>
+#include <algorithm>
+#include <vector>
+#include <random>
+#include <fmt/core.h>
+#include "perf.hh"
+
+using per_key_t = int64_t;
+
+struct key_compare {
+    bool operator()(const per_key_t& a, const per_key_t& b) const noexcept { return a < b; }
+};
+
+#include "utils/bptree.hh"
+
+using namespace bplus;
+using namespace seastar;
+
+constexpr int TEST_NODE_SIZE = 4;
+
+/* Linear key search; the original note says it works better at node size 32, but TEST_NODE_SIZE above is 4 -- TODO confirm */
+using test_tree = tree<per_key_t, unsigned long, key_compare, TEST_NODE_SIZE, key_search::linear>;
+
+class collection_tester { // common interface that lets main() drive each collection under test the same way
+public:
+    virtual void insert(per_key_t k) = 0; // add key k to the collection
+    virtual void lower_bound(per_key_t k) = 0; // look up k; the implementations assert that the result is not end()
+    virtual void erase(per_key_t k) = 0; // remove key k
+    virtual void drain(int batch) = 0; // erase everything front to back, yielding every `batch` erasures
+    virtual void show_stats() = 0; // print collection-specific statistics (may be a no-op)
+    virtual ~collection_tester() = default;
+};
+
+class bptree_tester : public collection_tester { // drives the B+ tree under test
+    test_tree _t;
+public:
+    bptree_tester() : _t(key_compare{}) {}
+    virtual void insert(per_key_t k) override { _t.emplace(k, 0); }
+    virtual void lower_bound(per_key_t k) override {
+        auto i = _t.lower_bound(k);
+        assert(i != _t.end());
+    }
+    virtual void erase(per_key_t k) override { _t.erase(k); }
+    virtual void drain(int batch) override { // batch must be non-zero: it is used as a modulo divisor
+        int x = 0;
+        auto i = _t.begin();
+        while (i != _t.end()) {
+            i = i.erase(key_compare{}); // the loop continues from the iterator returned by erase()
+            if (++x % batch == 0) {
+                seastar::thread::yield();
+            }
+        }
+    }
+    virtual void show_stats() override {
+        struct bplus::stats st = _t.get_stats();
+        fmt::print("nodes:     {}\n", st.nodes);
+        for (int i = 0; i < (int)st.nodes_filled.size(); i++) { // guard below: a very small tree may report zero inner nodes
+            fmt::print("   {}: {} ({}%)\n", i, st.nodes_filled[i], st.nodes ? st.nodes_filled[i] * 100 / st.nodes : 0);
+        }
+        fmt::print("leaves:    {}\n", st.leaves);
+        for (int i = 0; i < (int)st.leaves_filled.size(); i++) { // same guard against a zero divisor
+            fmt::print("   {}: {} ({}%)\n", i, st.leaves_filled[i], st.leaves ? st.leaves_filled[i] * 100 / st.leaves : 0);
+        }
+        fmt::print("datas:     {}\n", st.datas);
+    }
+    virtual ~bptree_tester() {
+        _t.clear();
+    }
+};
+
+class set_tester : public collection_tester { // std::set baseline for comparison
+    std::set<per_key_t> _s;
+public:
+    virtual void insert(per_key_t k) override { _s.insert(k); }
+    virtual void lower_bound(per_key_t k) override {
+        auto i = _s.lower_bound(k);
+        assert(i != _s.end());
+    }
+    virtual void erase(per_key_t k) override { _s.erase(k); }
+    virtual void drain(int batch) override { // batch must be non-zero: it is used as a modulo divisor
+        int x = 0;
+        auto i = _s.begin();
+        while (i != _s.end()) {
+            i = _s.erase(i);
+            if (++x % batch == 0) {
+                seastar::thread::yield();
+            }
+        }
+    }
+    virtual void show_stats() override { } // nothing to report for std::set
+    virtual ~set_tester() = default;
+};
+
+class map_tester : public collection_tester { // std::map baseline for comparison
+    std::map<per_key_t, unsigned long> _m;
+public:
+    virtual void insert(per_key_t k) override { _m[k] = 0; }
+    virtual void lower_bound(per_key_t k) override {
+        auto i = _m.lower_bound(k);
+        assert(i != _m.end());
+    }
+    virtual void erase(per_key_t k) override { _m.erase(k); }
+    virtual void drain(int batch) override { // batch must be non-zero: it is used as a modulo divisor
+        int x = 0;
+        auto i = _m.begin();
+        while (i != _m.end()) {
+            i = _m.erase(i);
+            if (++x % batch == 0) {
+                seastar::thread::yield();
+            }
+        }
+    }
+    virtual void show_stats() override { } // nothing to report for std::map
+    virtual ~map_tester() = default;
+};
+
+int main(int argc, char **argv) { // perf driver: fills the chosen collection, then times erase / drain / find
+    namespace bpo = boost::program_options;
+    app_template app;
+    app.add_options()
+        ("count", bpo::value<int>()->default_value(5000000), "number of keys to fill the tree with")
+        ("batch", bpo::value<int>()->default_value(50), "number of operations between deferring points")
+        ("iters", bpo::value<int>()->default_value(1), "number of iterations")
+        ("col", bpo::value<std::string>()->default_value("bptree"), "collection to test")
+        ("test", bpo::value<std::string>()->default_value("erase"), "what to test (erase, drain, find)")
+        ("stats", bpo::value<bool>()->default_value(false), "show stats");
+
+    return app.run(argc, argv, [&app] {
+        auto count = app.configuration()["count"].as<int>();
+        auto iters = app.configuration()["iters"].as<int>();
+        auto batch = std::max(1, app.configuration()["batch"].as<int>()); // clamp: batch is a modulo divisor below and in drain(); 0 would be UB
+        auto col = app.configuration()["col"].as<std::string>();
+        auto tst = app.configuration()["test"].as<std::string>();
+        auto stats = app.configuration()["stats"].as<bool>();
+
+        return seastar::async([count, iters, batch, col, tst, stats] { // seastar thread, so thread::yield() can be used below
+            std::unique_ptr<collection_tester> c;
+
+            if (col == "bptree") {
+                c = std::make_unique<bptree_tester>();
+            } else if (col == "set") {
+                c = std::make_unique<set_tester>();
+            } else if (col == "map") {
+                c = std::make_unique<map_tester>();
+            } else {
+                fmt::print("Unknown collection\n");
+                return;
+            }
+
+            std::vector<per_key_t> keys;
+
+            for (per_key_t i = 0; i < count; i++) {
+                keys.push_back(i + 1); // keys are 1..count
+            }
+
+            std::random_device rd;
+            std::mt19937 g(rd());
+
+            fmt::print("Inserting {:d} k:v pairs into {} {:d} times\n", count, col, iters);
+
+            for (auto rep = 0; rep < iters; rep++) {
+                std::shuffle(keys.begin(), keys.end(), g);
+                seastar::thread::yield();
+
+                auto d = duration_in_seconds([&] {
+                    for (int i = 0; i < count; i++) {
+                        c->insert(keys[i]);
+                        if ((i + 1) % batch == 0) {
+                            seastar::thread::yield();
+                        }
+                    }
+                });
+
+                fmt::print("fill: {:.6f} ms\n", d.count() * 1000);
+
+                if (stats) {
+                    c->show_stats();
+                }
+
+                if (tst == "erase") {
+                    std::shuffle(keys.begin(), keys.end(), g); // erase in a different random order than the insertion
+                    seastar::thread::yield();
+
+                    d = duration_in_seconds([&] {
+                        for (int i = 0; i < count; i++) {
+                            c->erase(keys[i]);
+                            if ((i + 1) % batch == 0) {
+                                seastar::thread::yield();
+                            }
+                        }
+                    });
+
+                    fmt::print("erase: {:.6f} ms\n", d.count() * 1000);
+                } else if (tst == "drain") {
+                    d = duration_in_seconds([&] {
+                        c->drain(batch);
+                    });
+
+                    fmt::print("drain: {:.6f} ms\n", d.count() * 1000);
+                } else if (tst == "find") { // keys are not removed here, so later iterations re-insert existing keys
+                    std::shuffle(keys.begin(), keys.end(), g);
+                    seastar::thread::yield();
+
+                    d = duration_in_seconds([&] {
+                        for (int i = 0; i < count; i++) {
+                            c->lower_bound(keys[i]);
+                            if ((i + 1) % batch == 0) {
+                                seastar::thread::yield();
+                            }
+                        }
+                    });
+
+                    fmt::print("find: {:.6f} ms\n", d.count() * 1000);
+                }
+            }
+        });
+    });
+}
diff --git a/test/unit/bptree_compaction_test.cc b/test/unit/bptree_compaction_test.cc
new file mode 100644
index 0000000000..0687e43b08
--- /dev/null
+++ b/test/unit/bptree_compaction_test.cc
@@ -0,0 +1,207 @@
+/*
+ * Copyright (C) 2020 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <seastar/core/app-template.hh>
+#include <seastar/core/thread.hh>
+#include <map>
+#include <vector>
+#include <random>
+#include <string>
+#include <iostream>
+#include <fmt/core.h>
+#include "utils/logalloc.hh"
+
+constexpr int TEST_NODE_SIZE = 7;
+
+#include "tree_test_key.hh"
+#include "utils/bptree.hh"
+#include "bptree_validation.hh"
+
+using namespace bplus;
+using namespace seastar;
+
+using test_key = tree_test_key_base;
+
+// Tree payload derived from its key: it stores key + 10, and match_key() checks that relation.
+class test_data {
+    int _value = 0;
+public:
+    test_data() = default;
+    test_data(test_key& key) : _value(static_cast<int>(key) + 10) {}
+    operator unsigned long() const { return _value; }
+    bool match_key(const test_key& key) const { return static_cast<int>(key) + 10 == _value; }
+};
+using test_tree = tree<test_key, test_data, test_key_compare, TEST_NODE_SIZE, key_search::both, with_debug::yes>;
+using test_validator = validator<test_key, test_data, test_key_compare, TEST_NODE_SIZE>;
+
+// One end of a two-way link: each linked object stores its peer's address and keeps the peer's pointer up to date.
+class reference {
+    reference* _ref = nullptr;
+public:
+    reference() = default;
+    reference(const reference& other) = delete;
+    // Moving takes over the link: the peer is re-pointed at the new object and the source ends up unlinked.
+    reference(reference&& other) noexcept : _ref(other._ref) {
+        if (_ref) {
+            _ref->_ref = this;
+        }
+        other._ref = nullptr;
+    }
+    // Destruction clears the peer's back-pointer, if there is a peer.
+    ~reference() {
+        if (_ref) {
+            _ref->_ref = nullptr;
+        }
+    }
+    // Pairs this end, which must be unlinked, with other.
+    void link(reference& other) {
+        assert(!_ref);
+        _ref = &other;
+        other._ref = this;
+    }
+    reference* get() {
+        assert(_ref);
+        return _ref;
+    }
+};
+
+// Handle to a test_tree created through current_allocator(). The tree is reached via a linked
+// pair of reference objects instead of a raw pointer, so the handle still finds a wrapper that was moved.
+class tree_pointer {
+    reference _ref;
+
+    // Allocator-managed part: the tree itself plus the far end of the link.
+    class tree_wrapper {
+        friend class tree_pointer;
+        test_tree _tree;
+        reference _ref;
+    public:
+        tree_wrapper() : _tree(test_key_compare{}) {}
+    };
+    // Recovers the wrapper from the address of the reference member embedded in it.
+    tree_wrapper* get_wrapper() {
+        reference* peer = _ref.get();
+        return boost::intrusive::get_parent_from_member(peer, &tree_wrapper::_ref);
+    }
+public:
+    tree_pointer(const tree_pointer& other) = delete;
+    tree_pointer(tree_pointer&& other) = delete;
+    // Constructs the wrapper and links the two reference ends together.
+    tree_pointer() {
+        auto* wrapper = current_allocator().construct<tree_wrapper>();
+        _ref.link(wrapper->_ref);
+    }
+
+    test_tree* operator->() {
+        return &get_wrapper()->_tree;
+    }
+
+    test_tree& operator*() {
+        return get_wrapper()->_tree;
+    }
+    // Hands the wrapper back to whichever allocator is current at destruction time.
+    ~tree_pointer() {
+        tree_wrapper* wrapper = get_wrapper();
+        current_allocator().destroy(wrapper);
+    }
+};
+
+int main(int argc, char **argv) { // fills and empties a B+ tree inside a logalloc region, compacting and validating in between
+    namespace bpo = boost::program_options;
+    app_template app;
+    app.add_options()
+        ("count", bpo::value<int>()->default_value(10000), "number of keys to fill the tree with")
+        ("iters", bpo::value<int>()->default_value(13), "number of iterations")
+        ("verb",  bpo::value<bool>()->default_value(false), "be verbose");
+
+    return app.run(argc, argv, [&app] {
+        auto count = app.configuration()["count"].as<int>();
+        auto iter = app.configuration()["iters"].as<int>();
+        auto verb = app.configuration()["verb"].as<bool>();
+        // The test body runs under seastar::async because the loops below call seastar::thread::maybe_yield().
+        return seastar::async([count, iter, verb] {
+            std::vector<int> keys;
+            for (int i = 0; i < count; i++) {
+                keys.push_back(i + 1);
+            }
+
+            std::random_device rd;
+            std::mt19937 g(rd());
+
+            fmt::print("Compacting {:d} k:v pairs {:d} times\n", count, iter);
+
+            test_validator tv;
+
+            logalloc::region mem;
+            // NOTE(review): with_allocator() presumably makes mem's allocator the current_allocator() inside the lambda — confirm.
+            with_allocator(mem.allocator(), [&] {
+                tree_pointer t;
+                // One round: fill in shuffled order, compact, validate, erase in shuffled order, compact, validate.
+                for (auto rep = 0; rep < iter; rep++) {
+                    {
+                        std::shuffle(keys.begin(), keys.end(), g);
+
+                        logalloc::reclaim_lock rl(mem);
+                        // NOTE(review): rl presumably keeps mem from being reclaimed while the tree is modified — confirm.
+                        for (int i = 0; i < count; i++) {
+                            test_key k(keys[i]);
+                            // The tree receives its own copy of the key; k itself is passed along as the data argument.
+                            auto ti = t->emplace(std::move(copy_key(k)), k);
+                            assert(ti.second);
+                            seastar::thread::maybe_yield();
+                        }
+                    }
+                    // Compact the region while the tree holds all keys, then check the tree.
+                    mem.full_compaction();
+
+                    if (verb) {
+                        fmt::print("After fill + compact\n");
+                        tv.print_tree(*t, '|');
+                    }
+
+                    tv.validate(*t);
+
+                    {
+                        std::shuffle(keys.begin(), keys.end(), g);
+
+                        logalloc::reclaim_lock rl(mem);
+
+                        for (int i = 0; i < count; i++) {
+                            test_key k(keys[i]);
+
+                            t->erase(k);
+                            seastar::thread::maybe_yield();
+                        }
+                    }
+                    // Compact again after every inserted key was erased, then check the tree once more.
+                    mem.full_compaction();
+
+                    if (verb) {
+                        fmt::print("After erase + compact\n");
+                        tv.print_tree(*t, '|');
+                    }
+
+                    tv.validate(*t);
+                }
+            });
+        });
+    });
+}
diff --git a/test/unit/bptree_stress_test.cc b/test/unit/bptree_stress_test.cc
new file mode 100644
index 0000000000..50e7b5eeda
--- /dev/null
+++ b/test/unit/bptree_stress_test.cc
@@ -0,0 +1,232 @@
+/*
+ * Copyright (C) 2020 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <seastar/core/app-template.hh>
+#include <seastar/core/thread.hh>
+#include <map>
+#include <vector>
+#include <random>
+#include <string>
+#include <iostream>
+#include <fmt/core.h>
+#include <fmt/ostream.h>
+
+constexpr int TEST_NODE_SIZE = 16;
+
+#include "tree_test_key.hh"
+#include "utils/bptree.hh"
+#include "bptree_validation.hh"
+
+using namespace bplus;
+using namespace seastar;
+
+using test_key = tree_test_key_base;
+
+// Tree payload derived from its key: it stores key + 10, and match_key() checks that relation.
+class test_data {
+    int _value = 0;
+public:
+    test_data() = default;
+    test_data(test_key& key) : _value(static_cast<int>(key) + 10) {}
+    operator unsigned long() const { return _value; }
+    bool match_key(const test_key& key) const { return static_cast<int>(key) + 10 == _value; }
+};
+
+// Streams the payload as the unsigned long it converts to.
+std::ostream& operator<<(std::ostream& os, test_data d) {
+    return os << static_cast<unsigned long>(d);
+}
+
+using test_tree = tree<test_key, test_data, test_key_compare, TEST_NODE_SIZE, key_search::both, with_debug::yes>;
+using test_node = typename test_tree::node;
+using test_validator = validator<test_key, test_data, test_key_compare, TEST_NODE_SIZE>;
+using test_iterator_checker = iterator_checker<test_key, test_data, test_key_compare, TEST_NODE_SIZE>;
+
+int main(int argc, char **argv) { // stress test: repeatedly fills and empties a B+ tree, cross-checking it against a std::map
+    namespace bpo = boost::program_options;
+    app_template app;
+    app.add_options()
+        ("count", bpo::value<int>()->default_value(4132), "number of keys to fill the tree with")
+        ("iters", bpo::value<int>()->default_value(9), "number of iterations")
+        ("keys",  bpo::value<std::string>()->default_value("rand"), "how to generate keys (rand, asc, desc)")
+        ("verb",  bpo::value<bool>()->default_value(false), "be verbose");
+
+    return app.run(argc, argv, [&app] {
+        auto count = app.configuration()["count"].as<int>();
+        auto iters = app.configuration()["iters"].as<int>();
+        auto ks = app.configuration()["keys"].as<std::string>();
+        auto verb = app.configuration()["verb"].as<bool>();
+
+        return seastar::async([count, iters, ks, verb] {
+            auto t = std::make_unique<test_tree>(test_key_compare{});
+            std::map<int, unsigned long> oracle;
+
+            // Each iteration inserts all keys (plain emplace() on even rounds, lower_bound() +
+            // emplace_before() on odd ones), compares the tree with the oracle map, then erases
+            // all keys (by key, or through an iterator on every third round). An iterator checker
+            // keeps walking the tree while it is being modified.
+
+            std::vector<int> keys;
+
+            for (int i = 0; i < count; i++) {
+                keys.push_back(i + 1);
+            }
+
+            std::random_device rd;
+            std::mt19937 g(rd());
+
+            fmt::print("Inserting {:d} k:v pairs {:d} times\n", count, iters);
+
+            test_validator tv;
+
+            if (ks == "desc") {
+                fmt::print("Reversing keys vector\n");
+                std::reverse(keys.begin(), keys.end());
+            }
+
+            bool shuffle = ks == "rand";
+            if (shuffle) {
+                fmt::print("Will shuffle keys each iteration\n");
+            }
+
+
+            for (auto rep = 0; rep < iters; rep++) {
+                if (verb) {
+                    fmt::print("Iteration {:d}\n", rep);
+                }
+                // Owned through unique_ptr so the checker is freed even when a check below throws.
+                auto itc = std::make_unique<test_iterator_checker>(tv, *t);
+
+                if (shuffle) {
+                    std::shuffle(keys.begin(), keys.end(), g);
+                }
+
+                for (int i = 0; i < count; i++) {
+                    test_key k(keys[i]);
+
+                    if (verb) {
+                        fmt::print("+++ {}\n", (int)k);
+                    }
+
+                    if (rep % 2 != 1) {
+                        auto ir = t->emplace(std::move(copy_key(k)), k);
+                        assert(ir.second);
+                    } else {
+                        auto ir = t->lower_bound(k);
+                        ir.emplace_before(std::move(copy_key(k)), test_key_compare{}, k);
+                    }
+                    oracle[keys[i]] = keys[i] + 10;
+
+                    if (verb) {
+                        fmt::print("Validating\n");
+                        tv.print_tree(*t, '|');
+                    }
+
+                    /* Limit validation rate for many keys */
+                    if (i % (i/1000 + 1) == 0) {
+                        tv.validate(*t);
+                    }
+
+                    if (i % 7 == 0) {
+                        if (!itc->step()) {
+                            itc.reset(); // drop the old checker before a fresh one is created
+                            itc = std::make_unique<test_iterator_checker>(tv, *t);
+                        }
+                    }
+
+                    seastar::thread::maybe_yield();
+                }
+
+                auto sz = t->size_slow();
+                if (sz != (size_t)count) {
+                    fmt::print("Size {} != count {}\n", sz, count);
+                    throw "size";
+                }
+
+                auto ti = t->begin();
+                for (const auto& oe : oracle) {
+                    if (*ti != oe.second) {
+                        fmt::print("Data mismatch {} vs {}\n", oe.second, *ti);
+                        throw "oracle";
+                    }
+                    ti++;
+                }
+
+                if (shuffle) {
+                    std::shuffle(keys.begin(), keys.end(), g);
+                }
+
+                for (int i = 0; i < count; i++) {
+                    test_key k(keys[i]);
+
+                    /*
+                     * kill iterator if we're removing what it points to,
+                     * otherwise it's not invalidated
+                     */
+                    if (itc->here(k)) {
+                        // a new checker is created once the key has been erased
+                        itc.reset();
+                    }
+
+                    if (verb) {
+                        fmt::print("--- {}\n", (int)k);
+                    }
+
+                    if (rep % 3 != 2) {
+                        t->erase(k);
+                    } else {
+                        auto ri = t->find(k);
+                        auto ni = ri;
+                        ni++;
+                        auto eni = ri.erase(test_key_compare{});
+                        assert(ni == eni);
+                    }
+
+                    oracle.erase(keys[i]);
+
+                    if (verb) {
+                        fmt::print("Validating\n");
+                        tv.print_tree(*t, '|');
+                    }
+
+                    if ((count-i) % ((count-i)/1000 + 1) == 0) {
+                        tv.validate(*t);
+                    }
+
+                    if (!itc) {
+                        itc = std::make_unique<test_iterator_checker>(tv, *t);
+                    }
+
+                    if (i % 5 == 0) {
+                        if (!itc->step()) {
+                            itc.reset(); // drop the old checker before a fresh one is created
+                            itc = std::make_unique<test_iterator_checker>(tv, *t);
+                        }
+                    }
+
+                    seastar::thread::maybe_yield();
+                }
+
+                // itc goes out of scope here, which frees this iteration's checker
+            }
+        });
+    });
+}
diff --git a/test/unit/bptree_validation.hh b/test/unit/bptree_validation.hh
new file mode 100644
index 0000000000..766b88c8ff
--- /dev/null
+++ b/test/unit/bptree_validation.hh
@@ -0,0 +1,318 @@
+/*
+ * Copyright (C) 2020 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+namespace bplus {
+
+template <typename K, typename T, typename Less, size_t NodeSize>
+class validator {
+    using tree = class tree<K, T, Less, NodeSize, key_search::both, with_debug::yes>;
+    using node = typename tree::node;
+    // Consistency checker and printer for the debug-enabled tree; it reads tree and node internals directly.
+    void validate_node(const tree& t, const node& n, int& prev, int& min, bool is_root);
+    void validate_list(const tree& t);
+
+public:
+    void print_tree(const tree& t, char pfx) const { // prints the leftmost/root/rightmost node ids, then every node
+        fmt::print("/ {} <- | {} | -> {}\n", t._left->id(), t._root->id(), t._right->id());
+        print_node(*t._root, pfx, 2);
+        fmt::print("\\\n");
+    }
+    // Prints n and, recursively, its kids: leaves list their keys on one line, inner nodes interleave kids with separation keys.
+    void print_node(const node& n, char pfx, int indent) const {
+        int i;
+
+        fmt::print("{:<{}c}{:s} {:d} ({:d} keys, {:x} flags):", pfx, indent,
+                n.is_leaf() ? "leaf" : "node", n.id(), n._num_keys, n._flags);
+        if (n.is_leaf()) {
+            for (i = 0; i < n._num_keys; i++) {
+                fmt::print(" {}", (int)n._keys[i].v);
+            }
+            fmt::print("\n");
+
+            return;
+        }
+        fmt::print("\n");
+        // _kids[0] is the only kid that is null-checked before recursing.
+        if (n._kids[0].n != nullptr) {
+            print_node(*n._kids[0].n, pfx, indent + 2);
+        }
+        for (i = 0; i < n._num_keys; i++) {
+            fmt::print("{:<{}c}---{}---\n", pfx, indent, (int)n._keys[i].v);
+            print_node(*n._kids[i + 1].n, pfx, indent + 2);
+        }
+    }
+    // Runs all checks on t; on a failure the tree is printed and the exception is rethrown.
+    void validate(const tree& t);
+};
+
+
+template <typename K, typename T, typename L, size_t NS>
+void validator<K, T, L, NS>::validate_node(const tree& t, const node& n, int& prev_key, int& min_key, bool is_root) {
+    // Recursively checks the subtree rooted at n and throws on the first violation. prev_key is the last leaf
+    // key seen so far (in/out); min_key is set to the first key of the subtree's leftmost leaf, if it has keys.
+    if (n.is_root() != is_root) {
+        fmt::print("node {} needs to {} root, but {}\n", n.id(), is_root ? "be" : "be not", n._flags);
+        throw "root broken";
+    }
+
+    for (int i = 0; i < n._num_keys; i++) {
+        if (!n._keys[i].v.is_alive()) {
+            fmt::print("node {} key {} is not alive\n", n.id(), i);
+            throw "key dead";
+        }
+    }
+    // Leaf: keys must not go below prev_key, and each data element must point back to this leaf and match its key.
+    if (n.is_leaf()) {
+        for (int i = 0; i < n._num_keys; i++) {
+            if (t._less(n._keys[i].v, K(prev_key))) {
+                fmt::print("node misordered @{} (prev {})\n", (int)n._keys[i].v, prev_key);
+                throw "misorder";
+            }
+            if (n._kids[i + 1].d->_leaf != &n) {
+                fmt::print("data mispoint\n");
+                throw "data backlink";
+            }
+
+            prev_key = n._keys[i].v;
+            if (!n._kids[i + 1].d->value.match_key(n._keys[i].v)) {
+                fmt::print("node value corrupted @{:d}.{:d}\n", n.id(), i);
+                throw "data corruption";
+            }
+        }
+
+        if (n._num_keys > 0) {
+            min_key = (int)n._keys[0].v;
+        }
+    } else if (n._num_keys > 0) {
+        node* k = n._kids[0].n;
+        // _kids[0] has no separation key of its own, so only its parent link is checked before descending.
+        if (k->_parent != &n) {
+            fmt::print("node {:d} -parent-> {:d}, expect {:d}\n", k->id(), k->_parent ? k->_parent->id() : -1, n.id());
+            throw "mis-parented node";
+        }
+        validate_node(t, *k, prev_key, min_key, false);
+        for (int i = 0; i < n._num_keys; i++) {
+            k = n._kids[i + 1].n;
+            if (k->_parent != &n) {
+                fmt::print("node {:d} -parent-> {:d}, expect {:d}\n",
+                        k->id(), k->_parent ? k->_parent->id() : -1, n.id());
+                throw "mis-parented node";
+            }
+            if (t._less(k->_keys[0].v, n._keys[i].v)) {
+                fmt::print("node {:d}.{:d}, separation key {}, kid has {}\n", n.id(), k->id(),
+                        (int)n._keys[i].v, (int)k->_keys[0].v);
+                throw "separation key mismatch";
+            }
+            // A separation key below the kid's minimum is tolerated unless strict_separation_key is set; one above it never is.
+            int min = 0;
+            validate_node(t, *k, prev_key, min, false);
+            if (t._less(n._keys[i].v, K(min)) || t._less(K(min), n._keys[i].v)) {
+                fmt::print("node {:d}.[{:d}]{:d}, separation key {}, min {}\n",
+                        n.id(), i, k->id(), (int)n._keys[i].v, min);
+                if (strict_separation_key || t._less(K(min), n._keys[i].v)) {
+                    throw "separation key screw";
+                }
+            }
+        }
+    }
+}
+
+template <typename K, typename T, typename L, size_t NS>
+void validator<K, T, L, NS>::validate_list(const tree& t) {
+    int prev = 0;
+    // Checks the leaf list: end leaves and their flags, next/prev symmetry, key order across leaves, agreement with the parent's kid order.
+    node* lh = t.left_leaf_slow();
+    node* rh = t.right_leaf_slow();
+
+    if (lh != t._left) {
+        fmt::print("left {:d}, slow {:d}\n", t._left->id(), lh->id());
+        throw "list broken";
+    }
+
+    if (!(lh->_flags & node::NODE_LEFTMOST)) {
+        fmt::print("left {:d} is not marked as such {}\n", t._left->id(), t._left->_flags);;
+        throw "list broken";
+    }
+
+    if (rh != t._right) {
+        fmt::print("right {:d}, slow {:d}\n", t._right->id(), rh->id());
+        throw "list broken";
+    }
+
+    if (!(rh->_flags & node::NODE_RIGHTMOST)) {
+        fmt::print("right {:d} is not marked as such {}\n", t._right->id(), t._right->_flags);;
+        throw "list broken";
+    }
+    // Walk the leaves from left to right; the loop ends at the rightmost leaf.
+    node* r = lh;
+    while (1) {
+        node *ln;
+
+        if (!r->is_rightmost()) {
+            ln = r->get_next();
+            if (ln->get_prev() != r) {
+                fmt::print("next leaf {:d} points to {:d}, expect {:d}\n", ln->id(), ln->get_prev()->id(), r->id());
+                throw "list broken";
+            }
+        } else if (r->_rightmost_tree != &t) {
+            fmt::print("right leaf doesn't point to tree\n");
+            throw "list broken";
+        }
+
+        if (!r->is_leftmost()) {
+            ln = r->get_prev();
+            if (ln->get_next() != r) {
+                fmt::print("prev leaf {:d} points to {:d}, expect {:d}\n", ln->id(), ln->get_next()->id(), r->id());
+                throw "list broken";
+            }
+        } else if (r->_kids[0]._leftmost_tree != &t) {
+            fmt::print("left leaf doesn't point to tree\n");
+            throw "list broken";
+        }
+        // prev holds the last key of the previously visited non-empty leaf (0 before the first one).
+        if (r->_num_keys > 0 && t._less(r->_keys[0].v, K(prev))) {
+            fmt::print("list misorder on element {:d}, keys {}..., prev {:d}\n", r->id(), (int)r->_keys[0].v, prev);
+            throw "list broken";
+        }
+        // Cross-check the list neighbours against the adjacent kids of the parent node.
+        if (!r->is_root() && r->_parent != nullptr) {
+            const auto p = r->_parent;
+            int i = p->index_for(r->_keys[0].v, t._less);
+            if (i > 0) {
+                if (p->_kids[i - 1].n != r->get_prev()) {
+                    fmt::print("list misorder on parent check: node {:d}.{:d}, parent prev {:d}, list prev {:d}\n",
+                            p->id(), r->id(), p->_kids[i - 1].n->id(), r->get_prev()->id());
+                    throw "list broken";
+                }
+            }
+            if (i < p->_num_keys - 1) {
+                if (p->_kids[i + 1].n != r->get_next()) {
+                    fmt::print("list misorder on parent check: node {:d}.{:d}, parent next {:d}, list next {:d}\n",
+                            p->id(), r->id(), p->_kids[i + 1].n->id(), r->get_next()->id());
+                    throw "list broken";
+                }
+            }
+        }
+
+        if (r->_num_keys > 0) {
+            prev = (int)r->_keys[r->_num_keys - 1].v;
+        }
+        // Only the tree's two end leaves may carry the LEFTMOST/RIGHTMOST flags.
+        if (r != t._left && r != t._right && (r->_flags & (node::NODE_LEFTMOST | node::NODE_RIGHTMOST))) {
+            fmt::print("middle {:d} is marked as left/right {}\n", r->id(), r->_flags);;
+            throw "list broken";
+        }
+
+        if (r->is_rightmost()) {
+            break;
+        }
+
+        r = r->get_next();
+    }
+}
+
+template <typename K, typename T, typename L, size_t NS>
+void validator<K, T, L, NS>::validate(const tree& t) {
+    // Checks the leaf list, the root back-pointer and then every node; a failure dumps the tree and leaf chain before rethrowing.
+    try {
+        validate_list(t);
+        if (t._root->_root_tree != &t) {
+            fmt::print("root doesn't point to tree\n");
+            throw "root broken";
+        }
+        int last_key = 0;
+        int smallest = 0;
+        validate_node(t, *t._root, last_key, smallest, true);
+    } catch (...) {
+        print_tree(t, '|');
+        fmt::print("[ ");
+        // Print leaf ids from left to right; the rightmost leaf ends the walk.
+        for (node* leaf = t._left; ; leaf = leaf->get_next()) {
+            fmt::print(" {:d}", leaf->id());
+            if (leaf->is_rightmost()) {
+                break;
+            }
+        }
+        fmt::print("]\n");
+        throw;
+    }
+}
+
+template <typename K, typename T, typename Less, size_t NodeSize>
+class iterator_checker {
+    using tree = class tree<K, T, Less, NodeSize, key_search::both, with_debug::yes>;
+    // Moves a forward iterator through the tree one element per step(), checking that ++/-- agree and values never decrease.
+    validator<K, T, Less, NodeSize>& _tv;
+    tree& _t;
+    typename tree::iterator _fwd, _fend;
+    T _fprev;
+
+public:
+    iterator_checker(validator<K, T, Less, NodeSize>& tv, tree& t) : _tv(tv), _t(t),
+            _fwd(t.begin()), _fend(t.end()) {
+    }
+    // Performs one check step; returns false once the end was reached. A failed check prints the tree and rethrows.
+    bool step() {
+        try {
+            return forward_check();
+        } catch(...) {
+            _tv.print_tree(_t, ':');
+            throw;
+        }
+    }
+    // True when the iterator is not at the end and the value it points at matches k.
+    bool here(const K& k) {
+        return _fwd != _fend && _fwd->match_key(k);
+    }
+
+private:
+    bool forward_check() {
+        if (_fwd == _fend) {
+            return false;
+        }
+        _fwd++;
+        if (_fwd == _fend) {
+            return false;
+        }
+        T val = *_fwd;
+        _fwd++;
+        if (_fwd == _fend) {
+            return false;
+        }
+        _fwd--;
+        if (val != *_fwd) {
+            fmt::print("Iterator broken, {} != {}\n", val, *_fwd); // plain {}: T is not necessarily an integer type for fmt
+            throw "iterator";
+        }
+        if (val < _fprev) {
+            fmt::print("Iterator broken, {} < {}\n", val, _fprev); // plain {}: T is not necessarily an integer type for fmt
+            throw "iterator";
+        }
+        _fprev = val;
+
+        return true;
+    }
+};
+
+} // namespace
+
diff --git a/test/unit/tree_test_key.hh b/test/unit/tree_test_key.hh
new file mode 100644
index 0000000000..14ef31df4d
--- /dev/null
+++ b/test/unit/tree_test_key.hh
@@ -0,0 +1,101 @@
+/*
+ * Copyright (C) 2020 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+/*
+ * Helper class that helps to check that tree
+ * - works with keys without default constructor
+ * - moves the keys around properly
+ */
+class tree_test_key_base {
+    int _val;
+    int* _cookie;    // owned; handed over on move, so nullptr marks a moved-from key
+    int* _p_cookie;  // owned; every object, a moved-from one included, keeps its own
+
+public:
+    // Reports (printing the reason) when the key looks moved-from or its cookie is not the expected 0.
+    bool is_alive() const {
+        if (_val == -1) {
+            fmt::print("key value is reset\n");
+            return false;
+        }
+
+        if (_cookie == nullptr) {
+            fmt::print("key cookie is reset\n");
+            return false;
+        }
+
+        const int cookie = *_cookie;
+        if (cookie != 0) {
+            fmt::print("key cookie value is corrupted {}\n", cookie);
+            return false;
+        }
+
+        return true;
+    }
+
+    bool less(const tree_test_key_base& rhs) const noexcept {
+        return _val < rhs._val;
+    }
+
+    // Allocates both cookies; is_alive() later expects the value given as cookie to be 0.
+    explicit tree_test_key_base(int nr, int cookie = 0)
+            : _val(nr), _cookie(new int(cookie)), _p_cookie(new int(1)) {
+    }
+
+    operator int() const noexcept { return _val; }
+
+    tree_test_key_base& operator=(const tree_test_key_base& other) = delete;
+    tree_test_key_base& operator=(tree_test_key_base&& other) = delete;
+
+private:
+    /*
+     * Copying is private on purpose: bptree.hh has to call copy_key()
+     * explicitly in every place where it copies a key
+     */
+    tree_test_key_base(const tree_test_key_base& other)
+            : _val(other._val), _cookie(new int(*other._cookie)), _p_cookie(new int(*other._p_cookie)) {
+    }
+
+    friend tree_test_key_base copy_key(const tree_test_key_base&);
+
+public:
+    // Moving hands over _cookie and marks the source as dead (_val == -1, no cookie).
+    tree_test_key_base(tree_test_key_base&& other) noexcept
+            : _val(other._val), _cookie(other._cookie), _p_cookie(new int(*other._p_cookie)) {
+        other._val = -1;
+        other._cookie = nullptr;
+    }
+
+    // Deleting a null _cookie is a no-op, so moved-from keys need no special case.
+    ~tree_test_key_base() {
+        delete _cookie;
+        assert(_p_cookie != nullptr);
+        delete _p_cookie;
+    }
+};
+
+inline tree_test_key_base copy_key(const tree_test_key_base& other) { return tree_test_key_base(other); } // explicit copy hook for the private copy constructor; inline because it is defined in a header
+
+struct test_key_compare { // ordering functor that forwards to tree_test_key_base::less()
+    bool operator()(const tree_test_key_base& lhs, const tree_test_key_base& rhs) const noexcept { return lhs.less(rhs); }
+};
diff --git a/utils/bptree.hh b/utils/bptree.hh
new file mode 100644
index 0000000000..e43da75308
--- /dev/null
+++ b/utils/bptree.hh
@@ -0,0 +1,1941 @@
+/*
+ * Copyright (C) 2020 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <boost/intrusive/parent_from_member.hpp>
+#include <seastar/util/defer.hh>
+#include <cassert>
+#include "utils/logalloc.hh"
+#include "utils/collection-concepts.hh"
+#include "utils/neat-object-id.hh"
+
+namespace bplus {
+
+enum class with_debug { no, yes };
+
+/*
+ * Linear search in a sorted array of keys slightly beats the
+ * binary one on small sizes. For debugging purposes both methods
+ * should be used (and the result must coincide).
+ */
+enum class key_search { linear, binary, both };
+
+/*
+ * A slot that may or may not hold a live Value. Creating the slot
+ * does not default-construct the value, which suits elements of
+ * static arrays (or containers with .emplace() methods) that stay
+ * empty for a while before being filled.
+ *
+ * Symmetrically, destroying the slot does _not_ destroy the value,
+ * so ~Value() has to be invoked by hand. The .reset() method does
+ * that; two helpers cover the common cases -- std::move-ing the
+ * value out of another slot (.emplace(maybe_key&&)) and building a
+ * new value on top of a live one (.replace(args...)).
+ */
+template <typename Value>
+union maybe_key {
+    Value v;
+    maybe_key() noexcept {}
+    ~maybe_key() {}
+    maybe_key(const maybe_key&) = delete;
+    maybe_key(maybe_key&&) = delete;
+
+    void reset() noexcept { v.~Value(); }
+
+    /*
+     * Builds the value from args in a slot that holds none.
+     */
+    template <typename... Args>
+    void emplace(Args&&... args) noexcept {
+        new (&v) Value(std::forward<Args>(args)...);
+    }
+
+    /*
+     * Takes over the live value of another slot: move-constructs
+     * ours from it, then destroys the moved-from source value.
+     */
+    void emplace(maybe_key&& other) noexcept {
+        emplace(std::move(other.v));
+        other.v.~Value();
+    }
+
+    /*
+     * Like emplace, but for a slot that already holds a value:
+     * the old value is destroyed first, then the new one is built.
+     */
+    template <typename... Args>
+    void replace(Args&&... args) noexcept {
+        v.~Value();
+        new (&v) Value(std::forward<Args>(args)...);
+    }
+
+    void replace(maybe_key&& other) = delete; // not to be called by chance
+};
+
+// No-op disposer; passed as the default one to the .*_and_dispose methods below
+template <typename T>
+void default_dispose(T* value) noexcept { }
+
+/*
+ * Helper that makes copying of a key an explicit, named operation
+ * (it is a plain copy-construction). Check test_key for more information.
+ */
+template <typename Key>
+SEASTAR_CONCEPT(requires std::is_nothrow_copy_constructible_v<Key>)
+Key copy_key(const Key& other) noexcept {
+    return Key(other);
+}
+
+/*
+ * Consider a small 2-level tree like this
+ *
+ *        [ . 5 . ]
+ *          |   |
+ *   +------+   +-----+
+ *   |                |
+ *   [ 1 . 2 . 3 . ]  [ 5 . 6 . 7 . ]
+ *
+ * And we remove key 5 from it. First -- the key is removed
+ * from the leaf entry
+ *
+ *        [ . 5 . ]
+ *          |   |
+ *   +------+   +-----+
+ *   |                |
+ *   [ 1 . 2 . 3 . ]  [ 6 . 7 . ]
+ *
+ * At this point we have a choice -- whether or not to update
+ * the separation key on the parent (root). Strictly speaking,
+ * the whole tree is correct now -- all the keys on the right
+ * are greater-or-equal than their separation key, though the
+ * "equal" never happens.
+ *
+ * This can be problematic if the keys are stored on data nodes
+ * and are referenced from the (non-)leaf nodes. In this case
+ * the separation key must be updated to point to some real key
+ * in its sub-tree.
+ *
+ *        [ . 6 . ]  <--- this key updated
+ *          |   |
+ *   +------+   +-----+
+ *   |                |
+ *   [ 1 . 2 . 3 . ]  [ 6 . 7 . ]
+ *
+ * As this update takes some time, this behaviour is tunable.
+ *
+ */
+constexpr bool strict_separation_key = true;
+
+/*
+ * This is for testing, validator will be everybody's friend
+ * to have rights to check if the tree is internally correct.
+ */
+template <typename Key, typename T, typename Less, size_t NodeSize> class validator;
+template <with_debug Debug> class statistics;
+
+template <typename Key, typename T, typename Less, size_t NodeSize, key_search Search, with_debug Debug> class node;
+template <typename Key, typename T, typename Less, size_t NodeSize, key_search Search, with_debug Debug> class data;
+
+/*
+ * The tree itself.
+ * Equipped with O(1) (with little constant) begin() and end()
+ * and the iterator, that scans through sorted keys and is not
+ * invalidated on insert/remove.
+ *
+ * The NodeSize parameter describes the amount of keys to be
+ * held on each node. Inner nodes will thus have N+1 sub-trees,
+ * leaf nodes will have N data pointers.
+ */
+
+SEASTAR_CONCEPT(
+    template <typename T, typename Key>
+    concept CanGetKeyFromValue = requires (T val) {
+        { val.key() } -> std::same_as<Key>;
+    };
+)
+
+struct stats { // filled by tree::get_stats()
+    unsigned long nodes; // zeroed by tree::get_stats(); NOTE(review): presumably counted in node::fill_stats() -- confirm
+    std::vector<unsigned long> nodes_filled; // resized to NodeSize + 1 by tree::get_stats() when the tree has a root
+    unsigned long leaves; // zeroed by tree::get_stats()
+    std::vector<unsigned long> leaves_filled; // resized to NodeSize + 1 by tree::get_stats() when the tree has a root
+    unsigned long datas; // zeroed by tree::get_stats()
+};
+
+template <typename Key, typename T, typename Less, size_t NodeSize,
+            key_search Search = key_search::binary, with_debug Debug = with_debug::no>
+SEASTAR_CONCEPT( requires LessNothrowComparable<Key, Key, Less> &&
+                        std::is_nothrow_move_constructible_v<Key> &&
+                        std::is_nothrow_move_constructible_v<T>
+)
+class tree {
+public:
+    class iterator;
+    class const_iterator;
+
+    friend class validator<Key, T, Less, NodeSize>;
+    friend class node<Key, T, Less, NodeSize, Search, Debug>;
+
+    // Sanity not to allow slow key-search in non-debug mode
+    static_assert(Debug == with_debug::yes || Search != key_search::both);
+
+    using node = class node<Key, T, Less, NodeSize, Search, Debug>;
+    using data = class data<Key, T, Less, NodeSize, Search, Debug>;
+    using kid_index = node::kid_index;
+
+private:
+
+    node* _root = nullptr;
+    node* _left = nullptr;
+    node* _right = nullptr;
+    [[no_unique_address]] Less _less;
+
+    template <typename K>
+    node& find_leaf_for(const K& k) const noexcept {
+        node* n = _root; // the root must exist, it is dereferenced right away
+        for (;;) {
+            if (n->is_leaf()) {
+                return *n;
+            }
+            kid_index pos = n->index_for(k, _less);
+            n = n->_kids[pos].n;
+        }
+    }
+
+    // Lazily creates the root on first use: a single leaf node that is
+    // at the same time the root, the leftmost and the rightmost leaf
+    void maybe_init_empty_tree() {
+        if (_root == nullptr) {
+            node* leaf = node::create();
+            leaf->_flags |= node::NODE_ROOT | node::NODE_LEAF | node::NODE_LEFTMOST | node::NODE_RIGHTMOST;
+            do_set_root(leaf);
+            do_set_left(leaf);
+            do_set_right(leaf);
+        }
+    }
+
+    // Finds the leftmost leaf by following the kid at index 0 from the root down
+    node* left_leaf_slow() const noexcept {
+        node* n;
+        for (n = _root; !n->is_leaf(); n = n->_kids[0].n) {
+        }
+        return n;
+    }
+
+    // Finds the rightmost leaf by following the last kid (the one at _num_keys) from the root down
+    node* right_leaf_slow() const noexcept {
+        node* n;
+        for (n = _root; !n->is_leaf(); n = n->_kids[n->_num_keys].n) {
+        }
+        return n;
+    }
+
+    template <typename K>
+    SEASTAR_CONCEPT(requires LessNothrowComparable<K, Key, Less>)
+    const_iterator get_bound(const K& k, bool upper, bool& match) const noexcept { // shared by lower_bound() (upper == false) and upper_bound() (upper == true)
+        match = false; // flipped to true only on the exact-hit path below
+        if (empty()) {
+            return end();
+        }
+
+        node& n = find_leaf_for(k);
+        kid_index i = n.index_for(k, _less);
+
+        /*
+         * Element at i (key at i - 1) is less than or equal to k,
+         * the next element is greater. Mind the corner cases.
+         */
+
+        if (i == 0) {
+            assert(n.is_leftmost());
+            return begin();
+        } else if (i <= n._num_keys) {
+            const_iterator cur = const_iterator(n._kids[i].d, i);
+            if (upper || _less(n._keys[i - 1].v, k)) {
+                cur++; // upper bound wanted, or the key is strictly less than k -- take the next element
+            } else {
+                match = true;
+            }
+
+            return cur;
+        } else {
+            assert(n.is_rightmost());
+            return end();
+        }
+    }
+
+    template <typename K>
+    iterator get_bound(const K& k, bool upper, bool& match) noexcept {
+        return iterator(const_cast<const tree*>(this)->get_bound(k, upper, match));
+    }
+
+public:
+
+    tree(const tree& other) = delete;
+    const tree& operator=(const tree& other) = delete;
+    tree& operator=(tree&& other) = delete;
+
+    explicit tree(Less less) noexcept : _less(less) { }
+    ~tree() { clear(); }
+
+    Less less() const noexcept { return _less; }
+
+    // Takes over the nodes of other (if it has any) and makes them point back to *this
+    tree(tree&& other) noexcept : _less(std::move(other._less)) {
+        if (other._root == nullptr) {
+            return;
+        }
+
+        do_set_root(other._root);
+        do_set_left(other._left);
+        do_set_right(other._right);
+        other._root = other._left = other._right = nullptr;
+    }
+
+    // XXX -- linear scan: sums up the key counts of all leaves, from _left to _right
+    size_t size_slow() const noexcept {
+        size_t total = 0;
+
+        if (_root != nullptr) {
+            const node* leaf = _left;
+            for (;;) {
+                assert(leaf->is_leaf());
+                total += leaf->_num_keys;
+                if (leaf == _right) {
+                    break;
+                }
+                // Not the last leaf yet, move on to the next one in the list
+                leaf = leaf->get_next();
+            }
+        }
+
+        return total;
+    }
+
+    // Looks up the element equal to k (neither is less than the other), end() if there's none
+    template <typename K = Key>
+    SEASTAR_CONCEPT(requires LessNothrowComparable<K, Key, Less>)
+    const_iterator find(const K& k) const noexcept {
+        if (!empty()) {
+            node& leaf = find_leaf_for(k);
+            const kid_index pos = leaf.index_for(k, _less);
+            // pos == 0 leaves no candidate key on this leaf; otherwise the
+            // candidate is the key at pos - 1 -- a hit unless it's less than k
+            const bool miss = pos == 0 || _less(leaf._keys[pos - 1].v, k);
+            if (!miss) {
+                return const_iterator(leaf._kids[pos].d, pos);
+            }
+        }
+
+        return end();
+    }
+
+    template <typename K = Key>
+    SEASTAR_CONCEPT(requires LessNothrowComparable<K, Key, Less>)
+    iterator find(const K& k) noexcept {
+        return iterator(const_cast<const tree*>(this)->find(k));
+    }
+
+    // Returns the least x out of those !less(x, k)
+    template <typename K = Key>
+    iterator lower_bound(const K& k) noexcept {
+        bool match;
+        return get_bound(k, false, match);
+    }
+
+    template <typename K = Key>
+    const_iterator lower_bound(const K& k) const noexcept {
+        bool match;
+        return get_bound(k, false, match);
+    }
+
+    template <typename K = Key>
+    iterator lower_bound(const K& k, bool& match) noexcept {
+        return get_bound(k, false, match);
+    }
+
+    template <typename K = Key>
+    const_iterator lower_bound(const K& k, bool& match) const noexcept {
+        return get_bound(k, false, match);
+    }
+
+    // Returns the least x out of those less(k, x)
+    template <typename K = Key>
+    iterator upper_bound(const K& k) noexcept {
+        bool match;
+        return get_bound(k, true, match);
+    }
+
+    template <typename K = Key>
+    const_iterator upper_bound(const K& k) const noexcept {
+        bool match;
+        return get_bound(k, true, match);
+    }
+
+    /*
+     * Constructs the element with key k inside the tree and returns
+     * the iterator to it. If the key already exists -- just returns
+     * the iterator to the existing one and sets the .second to false.
+     */
+    template <typename... Args>
+    std::pair<iterator, bool> emplace(Key k, Args&&... args) {
+        maybe_init_empty_tree();
+
+        node& n = find_leaf_for(k);
+        kid_index i = n.index_for(k, _less);
+
+        if (i >= 1 && !_less(n._keys[i - 1].v, k)) {
+            // Direct hit: the key at i - 1 is not less than k
+            return std::pair(iterator(n._kids[i].d, i), false);
+        }
+
+        data* d = data::create(std::forward<Args>(args)...);
+        auto x = seastar::defer([&d] { data::destroy(*d, default_dispose<T>); }); // deferred cleanup of d, cancelled below once insert() is done
+        n.insert(i, std::move(k), d, _less);
+        assert(d->attached());
+        x.cancel();
+        return std::pair(iterator(d, i + 1), true);
+    }
+
+    // Erases the element equal to k; returns the iterator to its successor, or end() if k is absent
+    template <typename Func>
+    SEASTAR_CONCEPT(requires Disposer<Func, T>)
+    iterator erase_and_dispose(const Key& k, Func&& disp) noexcept {
+        // maybe_init_empty_tree() is not noexcept, and an empty tree has nothing to erase anyway
+        if (empty()) {
+            return end();
+        }
+
+        node& n = find_leaf_for(k);
+        kid_index i = n.index_for(k, _less);
+
+        if (i == 0) {
+            return end();
+        }
+
+        assert(n._num_keys > 0);
+        if (_less(n._keys[i - 1].v, k)) {
+            return end();
+        }
+
+        data* d = n._kids[i].d;
+        iterator it(d, i);
+        it++; // step to the successor before the element is unlinked
+
+        n.remove(i, _less);
+        data::destroy(*d, disp);
+        return it;
+    }
+
+    // Erases the [from, to) range one element at a time and returns to
+    template <typename Func>
+    SEASTAR_CONCEPT(requires Disposer<Func, T>)
+    iterator erase_and_dispose(iterator from, iterator to, Func&& disp) noexcept {
+        /*
+         * FIXME this is dog slow k*logN algo, need k+logN one
+         */
+        for (iterator cur = from; cur != to; ) {
+            cur = cur.erase_and_dispose(disp, _less);
+        }
+        return to;
+    }
+
+    template <typename... Args>
+    iterator erase(Args&&... args) noexcept { return erase_and_dispose(std::forward<Args>(args)..., default_dispose<T>); }
+
+    // Hands every data and node to the destroy callbacks via _root->clear(), then drops the root itself
+    template <typename Func>
+    SEASTAR_CONCEPT(requires Disposer<Func, T>)
+    void clear_and_dispose(Func&& disp) noexcept {
+        if (_root == nullptr) {
+            return;
+        }
+
+        _root->clear(
+            [this, &disp] (data* d) noexcept { data::destroy(*d, disp); },
+            [this] (node* n) noexcept { node::destroy(*n); }
+        );
+        node::destroy(*_root);
+        _root = _left = _right = nullptr;
+    }
+
+    void clear() noexcept { clear_and_dispose(default_dispose<T>); }
+
+private:
+    void do_set_left(node *n) noexcept {
+        assert(n->is_leftmost());
+        _left = n;
+        n->_kids[0]._leftmost_tree = this;
+    }
+
+    void do_set_right(node *n) noexcept {
+        assert(n->is_rightmost());
+        _right = n;
+        n->_rightmost_tree = this;
+    }
+
+    void do_set_root(node *n) noexcept {
+        assert(n->is_root());
+        n->_root_tree = this;
+        _root = n;
+    }
+
+public:
+    /*
+     * Iterator. Scans the datas in the sorted-by-key order.
+     * Is not invalidated by emplace/erase-s of other elements.
+     * Move constructors may turn the _idx invalid, but the
+     * .revalidate() method makes it good again.
+     */
+    template <bool Const>
+    class iterator_base {
+    protected:
+        using tree_ptr = std::conditional_t<Const, const tree*, tree*>;
+        using data_ptr = std::conditional_t<Const, const data*, data*>;
+        using node_ptr = std::conditional_t<Const, const node*, node*>;
+
+        /*
+         * When the iterator gets to the end the _data is
+         * replaced with the _tree obtained from the right
+         * leaf, and the _idx is set to npos
+         */
+        union {
+            tree_ptr    _tree;
+            data_ptr    _data;
+        };
+        kid_index _idx; // Index in leaf's _kids array pointing to _data
+
+        /*
+         * Leaf nodes cannot have kids (data nodes) at 0 position, so
+         * 0 is good for unsigned undefined position.
+         */
+        static constexpr kid_index npos = 0;
+
+        bool is_end() const noexcept { return _idx == npos; }
+
+        explicit iterator_base(tree_ptr t) noexcept : _tree(t), _idx(npos) { }
+        iterator_base(data_ptr d, kid_index idx) noexcept : _data(d), _idx(idx) {
+            assert(!is_end());
+        }
+        iterator_base() noexcept : iterator_base(static_cast<tree_ptr>(nullptr)) {}
+
+        /*
+         * Refreshes _idx if it went stale and returns the leaf
+         * that currently holds the data the iterator points to.
+         */
+        node_ptr revalidate() noexcept {
+            assert(!is_end());
+
+            /*
+             * The data._leaf pointer is always valid (it's updated
+             * on insert/remove operations), the datas do not move
+             * as well, so if the leaf still points at us, it is valid.
+             */
+            node_ptr owner = _data->_leaf;
+            const bool idx_ok = _idx <= owner->_num_keys && owner->_kids[_idx].d == _data;
+            if (!idx_ok) {
+                _idx = owner->index_for(_data);
+            }
+
+            return owner;
+        }
+
+    public:
+        using iterator_category = std::bidirectional_iterator_tag;
+        using value_type = std::conditional_t<Const, const T, T>;
+        using difference_type = ssize_t;
+        using pointer = value_type*;
+        using reference = value_type&;
+
+        reference operator*() const noexcept { return _data->value; }
+        pointer operator->() const noexcept { return &_data->value; }
+
+        // Advances to the next element; stepping past the last one turns the iterator into end()
+        iterator_base& operator++() noexcept {
+            node_ptr leaf = revalidate();
+            if (_idx >= leaf->_num_keys) {
+                if (leaf->is_rightmost()) {
+                    // No more elements: switch the union over to the tree pointer
+                    _idx = npos;
+                    _tree = leaf->_rightmost_tree;
+                    return *this;
+                }
+                leaf = leaf->get_next();
+                _idx = 0; // bumped right below to 1, the first data slot of a leaf
+            }
+            _idx++;
+            _data = leaf->_kids[_idx].d;
+            return *this;
+        }
+
+        iterator_base& operator--() noexcept {
+            if (is_end()) {
+                node* n = _tree->_right;
+                assert(n->_num_keys > 0);
+                _data = n->_kids[n->_num_keys].d;
+                _idx = n->_num_keys;
+                return *this;
+            }
+
+            node_ptr leaf = revalidate();
+            if (_idx > 1) {
+                _idx--;
+            } else {
+                leaf = leaf->get_prev();
+                _idx = leaf->_num_keys;
+            }
+            _data = leaf->_kids[_idx].d;
+            return *this;
+        }
+
+        iterator_base operator++(int) noexcept {
+            iterator_base cur = *this;
+            operator++();
+            return cur;
+        }
+
+        iterator_base operator--(int) noexcept {
+            iterator_base cur = *this;
+            operator--();
+            return cur;
+        }
+
+        bool operator==(const iterator_base& o) const noexcept { return is_end() ? o.is_end() : _data == o._data; }
+        bool operator!=(const iterator_base& o) const noexcept { return !(*this == o); }
+    };
+
+    using iterator_base_const = iterator_base<true>;
+    using iterator_base_nonconst = iterator_base<false>;
+
+    class const_iterator final : public iterator_base_const {
+        friend class tree;
+        using super = iterator_base_const;
+
+        explicit const_iterator(const tree* t) noexcept : super(t) {}
+        const_iterator(const data* d, kid_index idx) noexcept : super(d, idx) {}
+
+    public:
+        const_iterator() noexcept : super() {}
+    };
+
+    class iterator final : public iterator_base_nonconst {
+        friend class tree;
+        using super = iterator_base_nonconst;
+
+        explicit iterator(tree* t) noexcept : super(t) {}
+        iterator(data* d, kid_index idx) noexcept : super(d, idx) {}
+
+    public:
+        iterator(const const_iterator&& other) noexcept {
+            if (other.is_end()) {
+                super::_idx = super::npos;
+                super::_tree = const_cast<tree *>(other._tree);
+            } else {
+                super::_idx = other._idx;
+                super::_data = const_cast<data *>(other._data);
+            }
+        }
+
+        iterator() noexcept : super() {}
+
+        /*
+         * Special constructor for the case when there's the need for an
+         * iterator to the given value pointer. In this case we need to
+         * get three things:
+         *  - pointer on class data: we assume that the value pointer
+         *    is indeed embedded into the data and do the "container_of"
+         *    maneuver
+         *  - index at which the data is seen on the leaf: use the
+         *    standard revalidation. Note, that we start with index 1
+         *    which gives us 1/NodeSize chance of hitting the right index
+         *    right at once :)
+         *  - the tree itself: the worst thing here, creating an iterator
+         *    like this is logN operation
+         */
+        explicit iterator(T* value) noexcept
+                : super(boost::intrusive::get_parent_from_member(value, &data::value), 1) {
+            super::revalidate();
+        }
+
+        /*
+         * Inserts a new element right before the one the iterator points to.
+         * The key _MUST_ keep the order and not exist yet -- neither is checked
+         */
+        template <typename KeyFn, typename... Args>
+        iterator emplace_before(KeyFn key, Less less, Args&&... args) {
+            node* leaf;
+            kid_index i;
+
+            if (!super::is_end()) {
+                leaf = super::revalidate();
+                i = super::_idx - 1;
+
+                if (i == 0 && !leaf->is_leftmost()) {
+                    /*
+                     * If we're about to insert a key before the 0th one, then
+                     * we must make sure the separation keys from upper layers
+                     * will separate the new key as well. If they won't then we
+                     * should select the left sibling for insertion.
+                     *
+                     * With strict_separation_key the solution is simple -- the
+                     * upper level separation keys match the current 0th one, so
+                     * we always switch to the left sibling (!strict is not implemented).
+                     *
+                     * If we're already on the left-most leaf -- just insert, as
+                     * there's no separation key above it.
+                     */
+                    if (!strict_separation_key) {
+                        assert(false && "Not implemented");
+                    }
+                    leaf = leaf->get_prev();
+                    i = leaf->_num_keys;
+                }
+            } else {
+                super::_tree->maybe_init_empty_tree();
+                leaf = super::_tree->_right;
+                i = leaf->_num_keys;
+            }
+
+            assert(i >= 0); // NOTE(review): kid_index is size_t, so this can never fire
+
+            data* d = data::create(std::forward<Args>(args)...);
+            auto x = seastar::defer([&d] { data::destroy(*d, default_dispose<T>); }); // deferred cleanup of d, cancelled below once insert() is done
+            leaf->insert(i, std::move(key(d)), d, less);
+            assert(d->attached());
+            x.cancel();
+            /*
+             * XXX -- if the node was not split we can ++ its index
+             * and keep the iterator valid :)
+             */
+            return iterator(d, i + 1);
+        }
+
+        template <typename... Args>
+        iterator emplace_before(Key k, Less less, Args&&... args) {
+            return emplace_before([&k] (data*) -> Key { return std::move(k); },
+                    less, std::forward<Args>(args)...);
+        }
+
+        template <typename... Args>
+        SEASTAR_CONCEPT(requires CanGetKeyFromValue<T, Key>)
+        iterator emplace_before(Less less, Args&&... args) {
+            return emplace_before([] (data* d) -> Key { return d->value.key(); },
+                    less, std::forward<Args>(args)...);
+        }
+
+    private:
+        /*
+         * Prepare a likely valid iterator for the next element.
+         * Likely means, that unless removal starts rebalancing
+         * datas the _idx will be for the correct pointer.
+         *
+         * This is just like the operator++, with the exception
+         * that staying on the leaf doesn't increase the _idx, as
+         * in this case the next element will be shifted left to
+         * the current position.
+         */
+        iterator next_after_erase(node* leaf) const noexcept {
+            if (super::_idx < leaf->_num_keys) {
+                return iterator(leaf->_kids[super::_idx + 1].d, super::_idx);
+            }
+
+            if (leaf->is_rightmost()) {
+                return iterator(leaf->_rightmost_tree);
+            }
+
+            leaf = leaf->get_next();
+            return iterator(leaf->_kids[1].d, 1);
+        }
+
+    public:
+        template <typename Func>
+        SEASTAR_CONCEPT(requires Disposer<Func, T>)
+        iterator erase_and_dispose(Func&& disp, Less less) noexcept {
+            node* leaf = super::revalidate();
+            iterator cur = next_after_erase(leaf);
+
+            leaf->remove(super::_idx, less);
+            data::destroy(*super::_data, disp);
+
+            return cur;
+        }
+
+        iterator erase(Less less) { return erase_and_dispose(default_dispose<T>, less); }
+
+        template <typename... Args>
+        void reconstruct(size_t new_payload_size, Args&&... args) {
+            size_t new_size = super::_data->storage_size(new_payload_size);
+
+            node* leaf = super::revalidate();
+            auto ptr = current_allocator().alloc(&get_standard_migrator<data>(), new_size, alignof(data));
+            data *dat, *cur = super::_data;
+
+            try {
+                dat = new (ptr) data(std::forward<Args>(args)...);
+            } catch(...) {
+                current_allocator().free(ptr, new_size);
+                throw;
+            }
+
+            dat->_leaf = leaf;
+            cur->_leaf = nullptr;
+
+            super::_data = dat;
+            leaf->_kids[super::_idx].d = dat;
+
+            current_allocator().destroy(cur);
+        }
+    };
+
+    const_iterator begin() const noexcept {
+        if (empty()) {
+            return end();
+        }
+
+        assert(_left->_num_keys > 0);
+        // Leaf nodes have data pointers starting from index 1
+        return const_iterator(_left->_kids[1].d, 1);
+    }
+    const_iterator end() const noexcept { return const_iterator(this); }
+
+    using const_reverse_iterator = std::reverse_iterator<const_iterator>;
+    const_reverse_iterator rbegin() const noexcept { return std::make_reverse_iterator(end()); }
+    const_reverse_iterator rend() const noexcept { return std::make_reverse_iterator(begin()); }
+
+    iterator begin() noexcept { return iterator(const_cast<const tree*>(this)->begin()); }
+    iterator end() noexcept { return iterator(this); }
+
+    using reverse_iterator = std::reverse_iterator<iterator>;
+    reverse_iterator rbegin() noexcept { return std::make_reverse_iterator(end()); }
+    reverse_iterator rend() noexcept { return std::make_reverse_iterator(begin()); }
+
+    bool empty() const noexcept { return _root == nullptr || _root->_num_keys == 0; }
+
+    // Gathers the statistics; without the root the counters stay zero and the vectors empty
+    struct stats get_stats() const noexcept {
+        struct stats st;
+        st.nodes = st.leaves = st.datas = 0;
+
+        if (_root == nullptr) {
+            return st;
+        }
+
+        // NodeSize + 1 slots in each of the vectors, then let the nodes fill it all in
+        st.nodes_filled.resize(NodeSize + 1);
+        st.leaves_filled.resize(NodeSize + 1);
+        _root->fill_stats(st);
+        return st;
+    }
+};
+
+/*
+ * Algorithms for searching a key in array.
+ *
+ * The gt() method accepts sorted array of keys and searches the index of the
+ * upper-bound element of the given key.
+ */
+
+template <typename K, typename Key, typename Less, size_t Size, key_search Search>
+struct searcher { };
+
+// Linear scan: returns the index of the first key that k is less than,
+// or nr if k is not less than any of them
+template <typename K, typename Key, typename Less, size_t Size>
+struct searcher<K, Key, Less, Size, key_search::linear> {
+    static size_t gt(const K& k, const maybe_key<Key>* keys, size_t nr, Less less) noexcept {
+        size_t pos = 0;
+
+        while (pos < nr && !less(k, keys[pos].v)) {
+            pos++;
+        }
+
+        return pos;
+    }
+};
+
+// Binary search for the index of the first key that k is less than (nr if none).
+// Uses the half-open [lo, hi) range, so no index ever has to go below zero.
+template <typename K, typename Key, typename Less, size_t Size>
+struct searcher<K, Key, Less, Size, key_search::binary> {
+    static size_t gt(const K& k, const maybe_key<Key>* keys, size_t nr, Less less) noexcept {
+        size_t lo = 0, hi = nr;
+        while (lo < hi) {
+            size_t mid = lo + (hi - lo - 1) / 2; // probes the same keys as the closed [lo, hi - 1] form
+            if (less(k, keys[mid].v)) {
+                hi = mid;
+            } else {
+                lo = mid + 1;
+            }
+        }
+        return lo;
+    }
+};
+
+template <typename K, typename Key, typename Less, size_t Size>
+struct searcher<K, Key, Less, Size, key_search::both> { // debugging aid: runs both searches and asserts that they agree
+    static size_t gt(const K& k, const maybe_key<Key>* keys, size_t nr, Less less) noexcept {
+        size_t rl = searcher<K, Key, Less, Size, key_search::linear>::gt(k, keys, nr, less); // linear search result
+        size_t rb = searcher<K, Key, Less, Size, key_search::binary>::gt(k, keys, nr, less); // binary search result
+        assert(rl == rb);
+        assert(rl <= nr);
+        return rl;
+    }
+};
+
+/*
+ * A node describes both, inner and leaf nodes.
+ */
+template <typename Key, typename T, typename Less, size_t NodeSize, key_search Search, with_debug Debug>
+class node final {
+    friend class validator<Key, T, Less, NodeSize>;
+    friend class tree<Key, T, Less, NodeSize, Search, Debug>;
+    friend class data<Key, T, Less, NodeSize, Search, Debug>;
+
+    using tree = class tree<Key, T, Less, NodeSize, Search, Debug>;
+    using data = class data<Key, T, Less, NodeSize, Search, Debug>;
+
+    class prealloc;
+
+    /*
+     * The NodeHalf is the level at which the node is considered
+     * to be underflown and should be re-filled. This slightly
+     * differs for even and odd sizes.
+     *
+     * For odd sizes the node will stand until it contains literally
+     * more than 1/2 of its size (e.g. for size 5 keeping 3 keys
+     * is OK). For even cases this barrier is less than the actual
+     * half (e.g. for size 4 keeping 2 is still OK).
+     */
+    static constexpr size_t NodeHalf = ((NodeSize - 1) / 2);
+    static_assert(NodeHalf >= 1);
+
+    union node_or_data_or_tree {
+        node* n;
+        data* d;
+
+        tree* _leftmost_tree; // See comment near node::__next about this
+    };
+
+    using node_or_data = node_or_data_or_tree;
+
+    friend data::data(data&&);
+
+    [[no_unique_address]] utils::neat_id<Debug == with_debug::yes> id;
+
+    unsigned short _num_keys;
+    unsigned short _flags;
+
+    static const unsigned short NODE_ROOT       = 0x1;
+    static const unsigned short NODE_LEAF       = 0x2;
+    static const unsigned short NODE_LEFTMOST   = 0x4; // leaf with smallest keys in the tree
+    static const unsigned short NODE_RIGHTMOST  = 0x8; // leaf with greatest keys in the tree
+
+    bool is_leaf() const noexcept { return _flags & NODE_LEAF; }
+    bool is_root() const noexcept { return _flags & NODE_ROOT; }
+    bool is_rightmost() const noexcept { return _flags & NODE_RIGHTMOST; }
+    bool is_leftmost() const noexcept { return _flags & NODE_LEFTMOST; }
+
+    /*
+     * separation keys
+     *   non-leaf nodes:
+     *     keys in kids[i] < keys[i] <= keys in kids[i+1], i in [0, NodeSize)
+     *   leaf nodes:
+     *     kids[i + 1] is the data for keys[i]
+     *     kids[0] is unused
+     *
+     * In the examples below the leaf nodes will be shown like
+     *
+     *  keys: [012]
+     * datas: [-012]
+     *
+     * and the non-leaf ones like
+     *
+     *  keys: [012]
+     *  kids: [A012]
+     *
+     * so that the digits correspond to distinct elements and stay
+     * in their correct positions. The A kid is the left-most one,
+     * at index 0 of the non-leaf node.
+     */
+
+    maybe_key<Key> _keys[NodeSize]; // only the first _num_keys slots are in use
+    node_or_data _kids[NodeSize + 1]; // one slot more than keys, see the layout comment above
+
+    // Type-aliases for code-reading convenience
+    using key_index = size_t; // index into _keys[]
+    using kid_index = size_t; // index into _kids[]
+
+    /*
+     * The root node uses this to point to the tree object. This is
+     * needed to update tree->_root on node move.
+     */
+    union {
+        node* _parent; // non-root nodes: the node one level up
+        tree* _root_tree; // root node: the tree itself
+    };
+
+    /*
+     * Leaf nodes are linked in a list. Since leaf nodes do not
+     * use the _kids[0] pointer, it is re-used as the prev link.
+     * Respectively, non-leaf nodes don't use the __next one.
+     *
+     * Also, leftmost and rightmost respectively have prev and
+     * next pointing to the tree object itself. This is done for
+     * _left/_right update on node move.
+     */
+    union {
+        node* __next;
+        tree* _rightmost_tree;
+    };
+    // Next leaf in the leaves list, leaves only (on the rightmost leaf the slot aliases _rightmost_tree)
+    node* get_next() const noexcept {
+        assert(is_leaf());
+        return __next;
+    }
+    // Sets the next leaf in the leaves list, leaves only
+    void set_next(node *n) noexcept {
+        assert(is_leaf());
+        __next = n;
+    }
+    // Previous leaf in the leaves list, leaves only; the link is kept in the otherwise unused _kids[0]
+    node* get_prev() const noexcept {
+        assert(is_leaf());
+        return _kids[0].n;
+    }
+    // Sets the previous leaf in the leaves list, leaves only
+    void set_prev(node* n) noexcept {
+        assert(is_leaf());
+        _kids[0].n = n;
+    }
+
+    // Links the new leaf n into the leaves list right after the current one
+    void link(node& n) noexcept {
+        if (is_rightmost()) {
+            _flags &= ~NODE_RIGHTMOST;
+            n._flags |= node::NODE_RIGHTMOST;
+            tree* t = _rightmost_tree;
+            assert(t->_right == this);
+            t->do_set_right(&n); // n takes over as the rightmost leaf of the tree
+        } else {
+            n.set_next(get_next());
+            get_next()->set_prev(&n);
+        }
+
+        n.set_prev(this);
+        set_next(&n);
+    }
+    // Takes this leaf out of the leaves list, passing the leftmost/rightmost role on to the neighbour
+    void unlink() noexcept {
+        node* x;
+        tree* t;
+
+        switch (_flags & (node::NODE_LEFTMOST | node::NODE_RIGHTMOST)) {
+        case node::NODE_LEFTMOST: // the next leaf becomes the leftmost one
+            x = get_next();
+            _flags &= ~node::NODE_LEFTMOST;
+            x->_flags |= node::NODE_LEFTMOST;
+            t = _kids[0]._leftmost_tree;
+            assert(t->_left == this);
+            t->do_set_left(x);
+            break;
+        case node::NODE_RIGHTMOST: // the previous leaf becomes the rightmost one
+            x = get_prev();
+            _flags &= ~node::NODE_RIGHTMOST;
+            x->_flags |= node::NODE_RIGHTMOST;
+            t = _rightmost_tree;
+            assert(t->_right == this);
+            t->do_set_right(x);
+            break;
+        case 0: // a leaf in the middle, just close the gap
+            get_prev()->set_next(get_next());
+            get_next()->set_prev(get_prev());
+            break;
+        default:
+            /*
+             * Right- and left-most at the same time can only be root,
+             * otherwise this would mean we have root with 0 keys.
+             */
+            assert(false);
+        }
+        // Leave the leaf self-linked, this is what ~node() checks for
+        set_next(this);
+        set_prev(this);
+    }
+    // Nodes can only be move-constructed: copying and any assignment are disabled
+    node(const node& other) = delete;
+    const node& operator=(const node& other) = delete;
+    node& operator=(node&& other) = delete;
+
+    /*
+     * Neither nodes nor data elements keep a pointer/reference to
+     * the tree. Otherwise the tree move constructor would have to
+     * update every one of them, turning it into a slow linear
+     * operation. Instead, the ._parent chain is walked up to the
+     * root node, which is the one that carries the _root_tree.
+     */
+    tree* tree_slow() const noexcept {
+        const node* top = this;
+
+        for (; !top->is_root(); top = top->_parent) {
+            // keep climbing
+        }
+
+        return top->_root_tree;
+    }
+
+    /*
+     * For an inner node finds the subtree to which k belongs.
+     * For a leaf node finds the data that should correspond to the
+     * key, in this case the index is not 0 for sure.
+     *
+     * In both cases keys[index - 1] <= k < keys[index].
+     */
+    template <typename K>
+    kid_index index_for(const K& k, Less less) const noexcept {
+        return searcher<K, Key, Less, NodeSize, Search>::gt(k, _keys, _num_keys, less);
+    }
+
+    kid_index index_for(node *n) const noexcept {
+        // Linear scan for the slot holding n, since a kid does not
+        // remember its own position (FIXME? keep the index on the kid)
+        kid_index pos = 0;
+        while (pos <= _num_keys && _kids[pos].n != n) {
+            pos++;
+        }
+
+        // n is expected to be one of the kids, so the scan must have
+        // stopped before running past the last valid slot
+        assert(pos <= _num_keys);
+        return pos;
+    }
+    // True once the node is at or below the NodeHalf level and should be re-filled
+    bool need_refill() const noexcept {
+        return _num_keys <= NodeHalf;
+    }
+    // True if the node can give one key away and still not need a refill itself
+    bool can_grab_from() const noexcept {
+        return _num_keys > NodeHalf + 1;
+    }
+    // True if there is a free slot for one more key
+    bool can_push_to() const noexcept {
+        return _num_keys < NodeSize;
+    }
+    // True if the keys of both nodes fit into one; non-leaves need one slot more for the separation key
+    bool can_merge_with(const node& n) const noexcept {
+        return _num_keys + n._num_keys + (is_leaf() ? 0u : 1u) <= NodeSize;
+    }
+    // Makes room for a key at index s: keys [s, _num_keys) and kids (s, _num_keys] move one slot right
+    void shift_right(size_t s) noexcept {
+        for (size_t i = _num_keys; i > s; i--) {
+            _keys[i].emplace(std::move(_keys[i - 1]));
+            _kids[i + 1] = _kids[i];
+        }
+        _num_keys++;
+    }
+    // Closes the hole left by the key at index s (and the kid at s + 1) by moving the tail one slot left
+    void shift_left(size_t s) noexcept {
+        // The key at s is expected to be reset or moved away by the caller!
+        for (size_t i = s + 1; i < _num_keys; i++) {
+            _keys[i - 1].emplace(std::move(_keys[i]));
+            _kids[i] = _kids[i + 1];
+        }
+        _num_keys--;
+    }
+    // Appends count keys (and their kids) starting at foff to node to, leaving foff keys on this
+    void move_keys_and_kids(size_t foff, node& to, size_t count) noexcept {
+        size_t toff = to._num_keys;
+
+        for (size_t i = 0; i < count; i++) {
+            to._keys[toff + i].emplace(std::move(_keys[foff + i]));
+            to._kids[toff + i + 1] = _kids[foff + i + 1];
+        }
+        _num_keys = foff;
+        // The moved kids must point back to their new owner
+        if (is_leaf()) {
+            for (size_t i = toff; i < toff + count; i++) {
+                to._kids[i + 1].d->reattach(&to);
+            }
+        } else {
+            for (size_t i = toff; i < toff + count; i++) {
+                to._kids[i + 1].n->_parent = &to;
+            }
+        }
+        to._num_keys += count;
+    }
+    // Moves the keys (and kids) starting at off into node to, whose key count is reset to 0 first
+    void move_to(node& to, size_t off) noexcept {
+        assert(off <= _num_keys);
+        to._num_keys = 0;
+        move_keys_and_kids(off, to, _num_keys - off);
+    }
+
+    void grab_from_left(node& from, maybe_key<Key>& sep) noexcept {
+        /*
+         * Grab one element from the left sibling and report the
+         * new separation key for the two nodes via sep.
+         *
+         * Leaf: just move the last key (and the last kid) and report
+         * it as new separation key
+         *
+         *  keys: [012]  -> [56]  = [01]  [256]  2 is new separation
+         * datas: [-012] -> [-56] = [-01] [-256]
+         *
+         * Non-leaf is trickier. We need the current separation key
+         * as we're grabbing the last element which has no right
+         * boundary on the node. So the parent node tells us one.
+         *
+         *  keys: [012]  -> s [56]  = [01]  2 [s56] 2 is new separation
+         *  kids: [A012] ->   [B56] = [A01]   [2B56]
+         */
+
+        assert(from._num_keys > 0);
+        key_index i = from._num_keys - 1;
+
+        shift_right(0); // frees _keys[0] and _kids[1], _kids[0] stays where it is
+        from._num_keys--;
+
+        if (is_leaf()) {
+            _keys[0].emplace(std::move(from._keys[i]));
+            _kids[1] = from._kids[i + 1];
+            _kids[1].d->reattach(this);
+            sep.replace(copy_key(_keys[0].v));
+        } else {
+            _keys[0].emplace(std::move(sep));
+            _kids[1] = _kids[0]; // the old 0th kid now sits right of the old separation key
+            _kids[0] = from._kids[i + 1];
+            _kids[0].n->_parent = this;
+            sep.emplace(std::move(from._keys[i]));
+        }
+    }
+
+    void merge_into(node& t, Key key) noexcept {
+        /*
+         * Merge current node into t preparing it for being
+         * killed. This merge is slightly different for leaves
+         * and for non-leaves wrt the 0th element.
+         *
+         * Non-leaves. For those we need the separation key, which
+         * is passed to us. The caller "knows" that this and t are
+         * two siblings and thus the separation key is the one from
+         * the parent node. For this reason merging two non-leaf
+         * nodes needs one more slot in the target as compared to
+         * the leaf-nodes case.
+         *
+         *   keys: [012]  + K + [456]  = [012K456]
+         *   kids: [A012] +     [B456] = [A012B456]
+         *
+         * Leaves. This is simple -- just go ahead and merge.
+         *
+         *   keys: [012]  + [456]  = [012456]
+         *  datas: [-012] + [-456] = [-012456]
+         */
+
+        if (!t.is_leaf()) {
+            key_index i = t._num_keys;
+            t._keys[i].emplace(std::move(key));
+            t._kids[i + 1] = _kids[0]; // our 0th kid goes right after the separation key
+            t._kids[i + 1].n->_parent = &t;
+            t._num_keys++;
+        }
+
+        move_keys_and_kids(0, t, _num_keys); // leaves this node with 0 keys
+    }
+
+    void grab_from_right(node& from, maybe_key<Key>& sep) noexcept {
+        /*
+         * Grab one element from the right sibling and report the
+         * new separation key for the two nodes via sep.
+         *
+         * Leaf: just move the 0th key (and 1st kid) and the
+         * new separation key is what becomes 0 in the source.
+         *
+         *  keys: [01]  <- [456]  = [014]  [56]  5 is new separation
+         * datas: [-01] <- [-456] = [-014] [-56]
+         *
+         * Non-leaf is trickier. We need the current separation
+         * key as we're grabbing the kids[0] element which has no
+         * corresponding keys[-1]. So the parent node tells us one.
+         *
+         *  keys: [01]  <- s [456]  = [01s]  4 [56] 4 is new separation
+         *  kids: [A01] <-   [B456] = [A01B]   [456]
+         */
+
+        key_index i = _num_keys;
+
+        if (is_leaf()) {
+            _keys[i].emplace(std::move(from._keys[0]));
+            _kids[i + 1] = from._kids[1];
+            _kids[i + 1].d->reattach(this);
+            sep.replace(copy_key(from._keys[1].v)); // [1] turns into [0] after the shift_left() below
+        } else {
+            _kids[i + 1] = from._kids[0];
+            _kids[i + 1].n->_parent = this;
+            _keys[i].emplace(std::move(sep));
+            from._kids[0] = from._kids[1]; // shift_left() doesn't touch the 0th kid
+            sep.emplace(std::move(from._keys[0]));
+        }
+
+        _num_keys++;
+        from.shift_left(0);
+    }
+
+    /*
+     * After a split the two halves should be almost equal. The
+     * "almost" depends on the node-size being odd or even and on
+     * the node being leaf or inner, so a difference of one key is
+     * tolerated. The check is only performed in debug mode.
+     */
+    bool equally_split(const node& n2) const noexcept {
+        if (Debug != with_debug::yes) {
+            return true;
+        }
+        const auto mine = _num_keys, theirs = n2._num_keys;
+        return mine == theirs || mine == theirs + 1 || mine + 1 == theirs;
+    }
+
+    // Helper for assert() in do_insert: debug-only check that k is not less than the last key of kid 0
+    bool left_kid_sorted(const Key& k, Less less) const noexcept {
+        if (Debug == with_debug::yes && !is_leaf() && _num_keys > 0) {
+            node* x = _kids[0].n;
+            if (x != nullptr && less(k, x->_keys[x->_num_keys - 1].v)) {
+                return false;
+            }
+        }
+
+        return true;
+    }
+    // Disposes of everything below this node with the given disposers and leaves the node with 0 keys
+    template <typename DFunc, typename NFunc>
+    SEASTAR_CONCEPT(requires Disposer<DFunc, data> && Disposer<NFunc, node>)
+    void clear(DFunc&& ddisp, NFunc&& ndisp) noexcept {
+        if (is_leaf()) {
+            _flags &= ~(node::NODE_LEFTMOST | node::NODE_RIGHTMOST);
+            set_next(this);
+            set_prev(this);
+        } else {
+            node* n = _kids[0].n; // the kid that has no key of its own
+            n->clear(ddisp, ndisp);
+            ndisp(n);
+        }
+
+        for (key_index i = 0; i < _num_keys; i++) {
+            _keys[i].reset();
+            if (is_leaf()) {
+                ddisp(_kids[i + 1].d);
+            } else {
+                node* n = _kids[i + 1].n;
+                n->clear(ddisp, ndisp);
+                ndisp(n);
+            }
+        }
+
+        _num_keys = 0;
+    }
+    // Constructs a new empty node with the current allocator (not noexcept, unlike destroy())
+    static node* create() {
+        return current_allocator().construct<node>();
+    }
+    // Destroys the node with the current allocator
+    static void destroy(node& n) noexcept {
+        current_allocator().destroy(&n);
+    }
+    // Destroys a non-root node, taking it out of the leaves list first if it is a leaf
+    void drop() noexcept {
+        assert(!is_root());
+        if (is_leaf()) {
+            unlink();
+        }
+        destroy(*this);
+    }
+    // Inserts k:nd at idx into a full node: tries to push a key to a sibling first, splits otherwise
+    void insert_into_full(kid_index idx, Key k, node_or_data nd, Less less, prealloc& nodes) noexcept {
+        if (!is_root()) {
+            node& p = *_parent;
+            kid_index i = p.index_for(_keys[0].v, less);
+
+            /*
+             * Try to push left or right existing keys to the respective
+             * siblings. Keep in mind two corner cases:
+             *
+             * 1. Push to left. In this case the new key should not go
+             * to the [0] element, otherwise we'd have to update the p's
+             * separation key one more time.
+             *
+             * 2. Push to right. In this case we must make sure the new
+             * key is not the rightmost itself, otherwise it is the new
+             * key that must be pushed there.
+             *
+             * Both corner cases are possible to implement though.
+             */
+            if (idx > 1 && i > 0) {
+                node* left = p._kids[i - 1].n;
+                if (left->can_push_to()) {
+                    /*
+                     * We've moved the 0th element from this, so the index
+                     * for the new key shifts too
+                     */
+                    idx--;
+                    left->grab_from_right(*this, p._keys[i - 1]);
+                }
+            }
+
+            if (idx < _num_keys && i < p._num_keys) {
+                node* right = p._kids[i + 1].n;
+                if (right->can_push_to()) {
+                    right->grab_from_left(*this, p._keys[i]);
+                }
+            }
+
+            if (_num_keys < NodeSize) {
+                do_insert(idx, std::move(k), nd, less);
+                nodes.drain(); // no split happens, the preallocated nodes are not needed
+                return;
+            }
+
+            /*
+             * We can only get here if both ->can_push_to() checks above
+             * had failed. In this case -- go ahead and split this.
+             */
+        }
+
+        split_and_insert(idx, std::move(k), nd, less, nodes);
+    }
+    // Splits this full node in two, inserts k:nd into the proper half and hooks the new node one level up
+    void split_and_insert(kid_index idx, Key k, node_or_data nd, Less less, prealloc& nodes) noexcept {
+        assert(_num_keys == NodeSize);
+
+        node* nn = nodes.pop();
+        maybe_key<Key> sep;
+
+        /*
+         * Insertion with split.
+         * 1. Existing node (this) is split into two. We try a bit harder
+         *    than we might to make the split equal.
+         * 2. The new element is added to either of the resulting nodes.
+         * 3. The new node nn is inserted into parent one with the help
+         *    of a separation key sep
+         *
+         * First -- find the position in the current node at which the
+         * new element should have appeared.
+         */
+
+        size_t off = NodeHalf + (idx > NodeHalf ? 1 : 0);
+
+        if (is_leaf()) {
+            nn->_flags |= NODE_LEAF;
+            link(*nn);
+
+            /*
+             * Split of leaves. This is simple -- just copy the needed
+             * amount of keys and kids from this to nn, then insert the
+             * new pair into the proper place. When inserting the new
+             * node into parent the separation key is the one latter
+             * starts with.
+             *
+             *  keys: [01234]
+             * datas: [-01234]
+             *
+             * if the new key is below 2, then
+             *  keys: -> [01]  [234]   -> [0n1]  [234]   -> sep is 2
+             * datas: -> [-01] [-234]  -> [-0n1] [-234]
+             *
+             * if the new key is above 2, then
+             *  keys: -> [012]  [34]   -> [012]  [3n4]   -> sep is 3 (or n)
+             * datas: -> [-012] [-34]  -> [-012] [-3n4]
+             */
+            move_to(*nn, off);
+
+            if (idx <= NodeHalf) {
+                do_insert(idx, std::move(k), nd, less);
+            } else {
+                nn->do_insert(idx - off, std::move(k), nd, less);
+            }
+            sep.emplace(copy_key(nn->_keys[0].v));
+        } else {
+            /*
+             * Node insertion has one special case -- when the new key
+             * gets directly into the middle.
+             */
+            if (idx == NodeHalf + 1) {
+                /*
+                 * Split of nodes and the new key is in the middle. In this
+                 * case we split the node into two, but take the k as the
+                 * separation key. The corresponding data becomes new node's
+                 * 0 kid.
+                 *
+                 *  keys: [012345]  -> [012] k [345]   (and the k goes up)
+                 *  kids: [A012345] -> [A012]  [n345]
+                 */
+                move_to(*nn, off);
+                sep.emplace(std::move(k));
+                nn->_kids[0] = nd;
+                nn->_kids[0].n->_parent = nn;
+            } else {
+                /*
+                 * Split of nodes and the new key gets into either of the
+                 * halves. This is like leaves split, but we need to carefully
+                 * handle the kids[0] for both. The corresponding key is not
+                 * on the node and "has" an index of -1 and thus becomes the
+                 * separation one for the upper layer.
+                 *
+                 *  keys: [012345]
+                 *  kids: [A012345]
+                 *
+                 * if the new key goes left then
+                 *  keys: -> [01] 2 [345]   -> [0n1]  2 [345]
+                 *  kids: -> [A01]  [2345]  -> [A0n1]   [2345]
+                 *
+                 * if the new key goes right then
+                 *  keys: -> [012]  3 [45]   -> [012]  3 [4n5]
+                 *  kids: -> [A012]   [345]  -> [A012]   [34n5]
+                 */
+                move_to(*nn, off + 1);
+                sep.emplace(std::move(_keys[off]));
+                nn->_kids[0] = _kids[off + 1];
+                nn->_kids[0].n->_parent = nn;
+                _num_keys--;
+
+                if (idx <= NodeHalf) {
+                    do_insert(idx, std::move(k), nd, less);
+                } else {
+                    nd.n->_parent = nn;
+                    nn->do_insert(idx - off - 1, std::move(k), nd, less);
+                }
+            }
+        }
+
+        assert(equally_split(*nn));
+
+        if (is_root()) {
+            insert_into_root(*nn, std::move(sep.v), nodes);
+        } else {
+            insert_into_parent(*nn, std::move(sep.v), less, nodes);
+        }
+        sep.reset();
+    }
+    // Puts the k:nd pair at key index i of a node that is known to have a free slot
+    void do_insert(kid_index i, Key k, node_or_data nd, Less less) noexcept {
+        assert(_num_keys < NodeSize);
+
+        /*
+         * The new k:nd pair should be put into the given index and
+         * shift offenders to the right. However, if it should be
+         * put left to the non-leaf's left-most node -- it's a BUG,
+         * as there's no corresponding key here.
+         *
+         * Non-leaf nodes get here when their kids are split, and
+         * when they do, if the kid gets into the left-most sub-tree,
+         * it's directly put there, and this helper is not called.
+         * Said that, if we're inserting a new pair, the newbie can
+         * only get to the right of the left-most kid.
+         */
+        assert(i != 0 || left_kid_sorted(k, less));
+
+        shift_right(i);
+
+        /*
+         * The k:nd pair belongs to keys[i-1]:kids[i] subtree, and since
+         * what's already there is less than this newcomer, the latter goes
+         * one step right.
+         */
+        _keys[i].emplace(std::move(k));
+        _kids[i + 1] = nd;
+        if (is_leaf()) {
+            nd.d->attach(*this); // only data elements track the leaf they sit on here
+        }
+    }
+    // Hooks the split-off sibling nn into this node's parent, with sep as the separation key
+    void insert_into_parent(node& nn, Key sep, Less less, prealloc& nodes) noexcept {
+        nn._parent = _parent;
+        _parent->insert_key(std::move(sep), node_or_data{.n = &nn}, less, nodes);
+    }
+    // Grows the tree by one level: a new root gets this and nn as its two kids, separated by sep
+    void insert_into_root(node& nn, Key sep, prealloc& nodes) noexcept {
+        tree* t = _root_tree;
+
+        node* nr = nodes.pop();
+
+        nr->_num_keys = 1;
+        nr->_keys[0].emplace(std::move(sep));
+        nr->_kids[0].n = this;
+        nr->_kids[1].n = &nn;
+        _flags &= ~node::NODE_ROOT;
+        _parent = nr; // overwrites _root_tree, which is why it was saved in t above
+        nn._parent = nr;
+
+        nr->_flags |= node::NODE_ROOT;
+        t->do_set_root(nr);
+    }
+    // Finds the position for k on this node and inserts the k:nd pair there
+    void insert_key(Key k, node_or_data nd, Less less, prealloc& nodes) noexcept {
+        kid_index i = index_for(k, less);
+        insert(i, std::move(k), nd, less, nodes);
+    }
+    // Inserts the k:nd pair at index i, taking the push-to-sibling/split path if the node is full
+    void insert(kid_index i, Key k, node_or_data nd, Less less, prealloc& nodes) noexcept {
+        if (_num_keys == NodeSize) {
+            insert_into_full(i, std::move(k), nd, less, nodes);
+        } else {
+            do_insert(i, std::move(k), nd, less);
+        }
+    }
+    // Entry point for inserting data d with key k at index i; may throw only while preallocating nodes
+    void insert(kid_index i, Key k, data* d, Less less) {
+        prealloc nodes;
+
+        /*
+         * Prepare the nodes for split in advance, if the node::create will
+         * start throwing while splitting we'll have troubles "unsplitting"
+         * the nodes back.
+         */
+        node* cur = this;
+        while (cur->_num_keys == NodeSize) {
+            nodes.push();
+            if (cur->is_root()) {
+                nodes.push(); // a root split needs one more node for the new root
+                break;
+            }
+            cur = cur->_parent;
+        }
+
+        insert(i, std::move(k), node_or_data{.d = d}, less, nodes);
+        assert(nodes.empty());
+    }
+    // Removes the key at index i (with the kid right of it), then refills the node or shrinks the tree
+    void remove_from(key_index i, Less less) noexcept {
+        _keys[i].reset();
+        shift_left(i);
+
+        if (!is_root()) {
+            if (need_refill()) {
+                refill(less);
+            }
+        } else if (_num_keys == 0 && !is_leaf()) {
+            node* nr; // the only kid left becomes the new root
+            nr = _kids[0].n;
+            nr->_flags |= node::NODE_ROOT;
+            _root_tree->do_set_root(nr);
+
+            _flags &= ~node::NODE_ROOT;
+            _parent = nullptr;
+            drop();
+        }
+    }
+    // Merges kid n into kid t, drops n and removes their separation key (at sep_idx) from this node
+    void merge_kids(node& t, node& n, key_index sep_idx, Less less) noexcept {
+        n.merge_into(t, std::move(_keys[sep_idx].v));
+        n.drop();
+        remove_from(sep_idx, less);
+    }
+    // Refills an underflown non-root node by grabbing a key from a sibling or merging with one
+    void refill(Less less) noexcept {
+        node& p = *_parent, *left, *right;
+
+        /*
+         * We need to locate this node's index at parent array by using
+         * the 0th key, so make sure it exists. We can go even without
+         * it, but since we don't let's be on the safe side.
+         */
+        assert(_num_keys > 0);
+        kid_index i = p.index_for(_keys[0].v, less);
+        assert(p._kids[i].n == this);
+
+        /*
+         * The node is "underflown" (see comment near NodeHalf
+         * about what this means), so we try to refill it at the
+         * siblings' expense. Many cases possible, but we go with
+         * only four:
+         *
+         * 1. Left sibling exists and it has at least 1 item
+         *    above being the half-full. -> we grab one element
+         *    from it.
+         *
+         * 2. Left sibling exists and we can merge current with
+         * it. "Can" means the resulting node will not overflow
+         * which, in turn, differs by one for leaf and non-leaf
+         * nodes. For leaves the merge is possible if the total
+         * number of the elements fits the maximum. For non-leaf
+         * we'll need room for one more element, here's why:
+         *
+         *  [012]  + [456]   ->  [012X456]
+         *  [A012] + [B456]  ->  [A012B456]
+         *
+         * The key X in the middle separates B from everything on
+         * the left and this key was not sitting on either of the
+         * wannabe merging nodes. This X is the current separation
+         * of these two nodes taken from their parent.
+         *
+         * And two same cases for the right sibling.
+         */
+
+        left = i > 0 ? p._kids[i - 1].n : nullptr;
+        right = i < p._num_keys ? p._kids[i + 1].n : nullptr;
+
+        if (left != nullptr && left->can_grab_from()) {
+            grab_from_left(*left, p._keys[i - 1]);
+            return;
+        }
+
+        if (right != nullptr && right->can_grab_from()) {
+            grab_from_right(*right, p._keys[i]);
+            return;
+        }
+
+        if (left != nullptr && can_merge_with(*left)) {
+            p.merge_kids(*left, *this, i - 1, less);
+            return;
+        }
+
+        if (right != nullptr && can_merge_with(*right)) {
+            p.merge_kids(*this, *right, i, less);
+            return;
+        }
+
+        /*
+         * Surprisingly, the node in the B+ tree can violate the
+         * "minimally filled" rule for non roots. It _can_ stay with
+         * less than half elements on board. The next remove from
+         * it or either of its siblings will probably refill it.
+         *
+         * Keeping 1 key on the non-root node is possible, but needs
+         * some special care -- if we will remove this last key from
+         * this node, the code will try to refill one and will not
+         * be able to find this node's index at parent (the call for
+         * index_for() above).
+         */
+        assert(_num_keys > 1);
+    }
+    // Removes the data at kid index ki from this leaf, fixing the separation key above when needed
+    void remove(kid_index ki, Less less) noexcept {
+        key_index i = ki - 1;
+
+        /*
+         * Update the matching separation key from above. It
+         * exists only if we're removing the 0th key, but for
+         * the left-most child it doesn't exist.
+         *
+         * Note, that the latter check is crucial for clear()
+         * performance, as it always removes the left-most
+         * key, without this check each remove() would walk the
+         * tree upwards in vain.
+         */
+        if (strict_separation_key && i == 0 && !is_leftmost()) {
+            const Key& k = _keys[i].v;
+            node* p = this;
+
+            while (!p->is_root()) {
+                p = p->_parent;
+                kid_index j = p->index_for(k, less);
+                if (j > 0) {
+                    p->_keys[j - 1].replace(copy_key(_keys[1].v)); // [1] is the 0th key once [0] is gone
+                    break;
+                }
+            }
+        }
+
+        remove_from(i, less);
+    }
+
+public:
+    explicit node() noexcept : _num_keys(0) , _flags(0) , _parent(nullptr) { } // empty node, neither root nor leaf
+    // A node must be emptied (and a non-root leaf must be self-linked) before it is destroyed
+    ~node() {
+        assert(_num_keys == 0);
+        assert(is_root() || !is_leaf() || (get_prev() == this && get_next() == this));
+    }
+    // Move: takes over other's keys, kids and flags and re-points neighbours, parent or tree to this
+    node(node&& other) noexcept : _flags(other._flags) {
+        if (is_leaf()) {
+            if (!is_rightmost()) {
+                set_next(other.get_next());
+                get_next()->set_prev(this);
+            } else {
+                other._rightmost_tree->do_set_right(this);
+            }
+
+            if (!is_leftmost()) {
+                set_prev(other.get_prev());
+                get_prev()->set_next(this);
+            } else {
+                other._kids[0]._leftmost_tree->do_set_left(this);
+            }
+
+            other._flags &= ~(NODE_LEFTMOST | NODE_RIGHTMOST);
+            other.set_next(&other); // leave other self-linked, as ~node() checks
+            other.set_prev(&other);
+        } else {
+            _kids[0].n = other._kids[0].n; // move_to() below doesn't move the 0th kid
+            _kids[0].n->_parent = this;
+        }
+
+        other.move_to(*this, 0); // also initializes our _num_keys
+
+        if (!is_root()) {
+            _parent = other._parent;
+            kid_index i = _parent->index_for(&other);
+            assert(_parent->_kids[i].n == &other);
+            _parent->_kids[i].n = this;
+        } else {
+            other._root_tree->do_set_root(this);
+        }
+    }
+
+    kid_index index_for(const data *d) const noexcept {
+        /*
+         * The data's index could be found with a binary search, but
+         * there is no key at hand to search with, so scan linearly
+         */
+        kid_index pos = 1;
+
+        // Slot 0 is skipped, the scan starts from the 1st kid
+        while (pos <= _num_keys && _kids[pos].d != d) {
+            pos++;
+        }
+
+        // d is expected to sit on this leaf
+        assert(pos <= _num_keys);
+        return pos;
+    }
+
+private:
+    class prealloc { // nodes allocated ahead of a split, see insert()
+        std::vector<node*> _nodes;
+    public:
+        bool empty() const noexcept { return _nodes.empty(); }
+
+        void push() {
+            // Reserve first, or a throwing push_back() would leak the new node
+            _nodes.reserve(_nodes.size() + 1);
+            _nodes.push_back(node::create());
+        }
+
+        node* pop() noexcept {
+            assert(!_nodes.empty());
+            node* ret = _nodes.back();
+            _nodes.pop_back();
+            return ret;
+        }
+
+        void drain() noexcept {
+            while (!empty()) {
+                node::destroy(*pop());
+            }
+        }
+
+        ~prealloc() { drain(); }
+    };
+    // Accumulates fill statistics of this node and, recursively, of all the nodes below it into st
+    void fill_stats(struct stats& st) const noexcept {
+        if (is_leaf()) {
+            st.leaves_filled[_num_keys]++;
+            st.leaves++;
+            st.datas += _num_keys;
+        } else {
+            st.nodes_filled[_num_keys]++;
+            st.nodes++;
+            for (kid_index i = 0; i <= _num_keys; i++) {
+                _kids[i].n->fill_stats(st);
+            }
+        }
+    }
+};
+
+/*
+ * The data represents a data node (the actual data is stored "outside"
+ * of the tree). The tree::emplace() constructs the payload inside the
+ * data before inserting it into the tree.
+ */
+template <typename K, typename T, typename Less, size_t NS, key_search S, with_debug D>
+class data final {
+    friend class validator<K, T, Less, NS>;
+    template <typename c1, typename c2, typename c3, size_t s1, key_search p1, with_debug p2>
+            friend class tree<c1, c2, c3, s1, p1, p2>::iterator;
+    template <typename c1, typename c2, typename c3, size_t s1, key_search p1, with_debug p2>
+            friend class tree<c1, c2, c3, s1, p1, p2>::iterator_base_const;
+    template <typename c1, typename c2, typename c3, size_t s1, key_search p1, with_debug p2>
+            friend class tree<c1, c2, c3, s1, p1, p2>::iterator_base_nonconst;
+
+    using node = class node<K, T, Less, NS, S, D>;
+
+    node* _leaf; // the leaf this data is attached to, nullptr if not attached
+    T value; // the payload
+
+public:
+    template <typename... Args>
+    static data* create(Args&&... args) {
+        return current_allocator().construct<data>(std::forward<Args>(args)...);
+    }
+
+    template <typename Func>
+    SEASTAR_CONCEPT(requires Disposer<Func, T>)
+    static void destroy(data& d, Func&& disp) noexcept {
+        disp(&d.value);
+        d._leaf = nullptr; // detach, as ~data() asserts on this
+        current_allocator().destroy(&d);
+    }
+
+    template <typename... Args>
+    data(Args&& ... args) : _leaf(nullptr), value(std::forward<Args>(args)...) {}
+
+    data(data&& other) noexcept : _leaf(other._leaf), value(std::move(other.value)) {
+        if (attached()) {
+            auto i = _leaf->index_for(&other); // linear scan over the leaf's kids
+            _leaf->_kids[i].d = this;
+            other._leaf = nullptr;
+        }
+    }
+
+    ~data() { assert(!attached()); }
+
+    bool attached() const noexcept { return _leaf != nullptr; }
+
+    void attach(node& to) noexcept { // for data that is not on any leaf yet
+        assert(!attached());
+        _leaf = &to;
+    }
+
+    void reattach(node* to) noexcept { // for data that moves from one leaf to another
+        assert(attached());
+        _leaf = to;
+    }
+
+private:
+    // Data node may describe a T without fixed size, e.g. an array that grows on
+    // demand. So this helper returns the size of the memory chunk that's required
+    // to carry the node with T of the payload size on board.
+    //
+    // The tree::iterator::reconstruct does this growing (or shrinking).
+    size_t storage_size(size_t payload) const noexcept {
+        return sizeof(data) - sizeof(T) + payload;
+    }
+
+    size_t storage_size() const noexcept {
+        return storage_size(size_for_allocation_strategy(value));
+    }
+
+public:
+    friend size_t size_for_allocation_strategy(const data& obj) noexcept {
+        return obj.storage_size();
+    }
+};
+
+} // namespace bplus
diff --git a/utils/collection-concepts.hh b/utils/collection-concepts.hh
new file mode 100644
index 0000000000..4bf32ce203
--- /dev/null
+++ b/utils/collection-concepts.hh
@@ -0,0 +1,42 @@
+/*
+ * Copyright (C) 2020 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+#include <type_traits>
+#include <seastar/util/concepts.hh>
+
+SEASTAR_CONCEPT(
+    template <typename Func, typename T>
+    concept Disposer = requires (Func f, T* val) { 
+        { f(val) } noexcept -> std::same_as<void>;
+    };
+)
+
+SEASTAR_CONCEPT(
+    template <typename Key1, typename Key2, typename Less>
+    concept LessComparable = requires (const Key1& a, const Key2& b, Less less) {
+        { less(a, b) } -> std::same_as<bool>;
+        { less(b, a) } -> std::same_as<bool>;
+    };
+
+    template <typename Key1, typename Key2, typename Less>
+    concept LessNothrowComparable = LessComparable<Key1, Key2, Less> && std::is_nothrow_invocable_v<Less, Key1, Key2>;
+)
diff --git a/utils/neat-object-id.hh b/utils/neat-object-id.hh
new file mode 100644
index 0000000000..ed9ab9a6cc
--- /dev/null
+++ b/utils/neat-object-id.hh
@@ -0,0 +1,53 @@
+/*
+ * Copyright (C) 2020 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <atomic>
+
+namespace utils {
+
+/*
+ * The neat_id class is purely a debugging thing -- when reading
+ * logs with object IDs in them it's handier to look at IDs
+ * consisting of 1-3 digits, rather than 16 hex-digits of a printed
+ * pointer.
+ *
+ * Embed with [[no_unique_address]] tag for memory efficiency
+ */
+template <bool Debug>
+struct neat_id {
+    unsigned int operator()() const noexcept { return reinterpret_cast<uintptr_t>(this); }
+};
+
+template <>
+struct neat_id<true> {
+    unsigned int _id;
+    static unsigned int _next() noexcept {
+        static std::atomic<unsigned int> rover {1};
+        return rover.fetch_add(1);
+    }
+
+    neat_id() noexcept : _id(_next()) {}
+    unsigned int operator()() const noexcept { return _id; }
+};
+
+} // namespace

From eb70644c1c16fe648178891f69230df490e270bc Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov <xemul@scylladb.com>
Date: Mon, 4 May 2020 12:18:50 +0300
Subject: [PATCH 03/11] intrusive-array: Array with trusted bounds

A plain array of elements that grows and shrinks by
constructing the new instance from an existing one and
moving the elements from it.

Behaves similarly to vector's external array, but has
0-bytes overhead. The array bounds (0-th and N-th
elemements) are determined by checking the flags on the
elements themselves. For this the type must support
getters and setters for the flags.

To remove an element from the array there's also a nothrow
option that drops the requested element from the array,
shifts the elements to its right leftwards and keeps the trailing
unused memory (the so-called "train") until reconstruction
or destruction.

Also comes with the lower_bound() helper that helps keep
the elements sorted and the from_element() one that
returns a reference back to the array in which the element
sits.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
---
 configure.py                       |   1 +
 test/boost/intrusive_array_test.cc | 243 ++++++++++++++++++++
 utils/collection-concepts.hh       |  11 +
 utils/intrusive-array.hh           | 354 +++++++++++++++++++++++++++++
 4 files changed, 609 insertions(+)
 create mode 100644 test/boost/intrusive_array_test.cc
 create mode 100644 utils/intrusive-array.hh

diff --git a/configure.py b/configure.py
index fdd15d99e5..baad29aed6 100755
--- a/configure.py
+++ b/configure.py
@@ -333,6 +333,7 @@ scylla_tests = set([
     'test/boost/estimated_histogram_test',
     'test/boost/logalloc_test',
     'test/boost/managed_vector_test',
+    'test/boost/intrusive_array_test',
     'test/boost/map_difference_test',
     'test/boost/memtable_test',
     'test/boost/meta_test',
diff --git a/test/boost/intrusive_array_test.cc b/test/boost/intrusive_array_test.cc
new file mode 100644
index 0000000000..66297060ca
--- /dev/null
+++ b/test/boost/intrusive_array_test.cc
@@ -0,0 +1,243 @@
+/*
+ * Copyright (C) 2020 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <boost/test/unit_test.hpp>
+#include <seastar/testing/thread_test_case.hh>
+#include <fmt/core.h>
+
+#include "utils/intrusive-array.hh"
+#include "utils/logalloc.hh"
+
+class element {
+    bool _head = false;
+    bool _tail = false;
+    bool _train = false;
+
+    long _data;
+    int *_cookie;
+    int *_cookie2;
+
+public:
+    explicit element(long val) : _data(val), _cookie(new int(0)), _cookie2(new int(0)) { }
+
+    element(const element& other) = delete;
+    element(element&& other) noexcept : _head(other._head), _tail(other._tail), _train(other._train),
+            _data(other._data), _cookie(other._cookie), _cookie2(new int(0)) {
+        other._cookie = nullptr;
+    }
+
+    ~element() {
+        if (_cookie != nullptr) {
+            delete _cookie;
+        }
+
+        delete _cookie2;
+    }
+
+    bool is_head() const noexcept { return _head; }
+    void set_head(bool v) noexcept { _head = v; }
+    bool is_tail() const noexcept { return _tail; }
+    void set_tail(bool v) noexcept { _tail = v; }
+    bool with_train() const noexcept { return _train; }
+    void set_train(bool v) noexcept { _train = v; }
+
+    bool operator==(long v) const { return v == _data; }
+    long operator*() const { return _data; }
+
+    bool bound_check(int idx, int size) {
+        return ((idx == 0) == is_head()) && ((idx == size - 1) == is_tail());
+    }
+};
+
+using test_array = intrusive_array<element>;
+
+static bool size_check(test_array& a, size_t size, unsigned short tlen) { // true if a holds size elements followed by a train of tlen slots
+    return a[size - 1].is_tail() && a.size() == size && // the tail flag sits on the last element
+        size_for_allocation_strategy(a) == (size + tlen) * sizeof(element) && // reported storage covers elements plus train
+        ((tlen != 0) == a[0].with_train()) && // the train flag on element 0 is set iff a train is expected
+        ((tlen == 0) || *reinterpret_cast<unsigned short*>(&a[size]) == tlen); // the slot right after the tail stores the train length
+}
+
+void show(const char *pfx, test_array& a, int sz) {
+    int i;
+
+    fmt::print("{}", pfx);
+    for (i = 0; i < sz; i++) {
+        fmt::print("{}{}{}", a[i].is_head() ? 'H' : ' ', *a[i], a[i].is_tail() ? 'T' : ' ');
+    }
+    if (a[0].with_train()) {
+        fmt::print(" ~{}", *reinterpret_cast<unsigned short *>(&a[i]));
+    }
+    fmt::print("\n");
+}
+
+SEASTAR_THREAD_TEST_CASE(test_basic_construct) {
+    test_array array(12);
+
+    for (auto i = array.begin(); i != array.end(); i++) {
+       BOOST_REQUIRE(*i == 12);
+    }
+}
+
+test_array* grow(test_array& from, size_t nsize, int npos, long ndat) {
+    BOOST_REQUIRE(from.size() + 1 == nsize);
+    auto ptr = current_allocator().alloc(&get_standard_migrator<test_array>(), sizeof(element) * nsize, alignof(test_array));
+    return new (ptr) test_array(from, test_array::grow_tag{npos}, ndat);
+}
+
+test_array* shrink(test_array& from, size_t nsize, int spos) { // move-constructs a new array from `from`, leaving out the element at spos
+    BOOST_REQUIRE(from.size() - 1 == nsize);
+    auto ptr = current_allocator().alloc(&get_standard_migrator<test_array>(), sizeof(element) * nsize, alignof(test_array)); // room for exactly nsize elements
+    return new (ptr) test_array(from, test_array::shrink_tag{spos});
+}
+
+void grow_shrink_and_check(test_array& cur, int size, int depth) { // cur must hold exactly size elements
+    for (int i = 0; i <= size; i++) { // try inserting at every position, including right after the tail
+        long nel = size + 12;
+        test_array* narr = grow(cur, size + 1, i, nel);
+        int idx = 0;
+
+        BOOST_REQUIRE(size_check(*narr, size + 1, 0));
+
+        for (auto ni = narr->begin(); ni != narr->end(); ni++) {
+            if (idx == i) {
+                BOOST_REQUIRE(*ni == nel);
+            } else if (idx < i) {
+                BOOST_REQUIRE(*ni == *cur[idx]); // moved-from elements of cur still keep their _data
+            } else {
+                BOOST_REQUIRE(*ni == *cur[idx - 1]);
+            }
+
+            BOOST_REQUIRE(ni->bound_check(idx, size + 1));
+            idx++;
+        }
+
+        if (size < depth) { // keep growing recursively until the array is deeper than depth
+            grow_shrink_and_check(*narr, size + 1, depth);
+        }
+
+        current_allocator().destroy(narr);
+    }
+
+    if (size > 1) { // shrinking needs at least one element to remain
+        for (int i = 0; i < size; i++) {
+            test_array* narr = shrink(cur, size - 1, i);
+            int idx = 0;
+
+            BOOST_REQUIRE(size_check(*narr, size - 1, 0));
+
+            for (auto ni = narr->begin(); ni != narr->end(); ni++) {
+                // Element i was dropped, so whatever followed it sits one
+                // slot to the left now. Every survivor must get checked.
+                if (idx < i) {
+                    BOOST_REQUIRE(*ni == *cur[idx]);
+                } else {
+                    BOOST_REQUIRE(*ni == *cur[idx + 1]);
+                }
+
+                BOOST_REQUIRE(ni->bound_check(idx, size - 1));
+                idx++;
+            }
+
+            current_allocator().destroy(narr);
+        }
+    }
+}
+
+SEASTAR_THREAD_TEST_CASE(test_grow_shrink_construct) {
+    test_array array(12);
+    grow_shrink_and_check(array, 1, 5);
+}
+
+SEASTAR_THREAD_TEST_CASE(test_erase) {
+    test_array a1(10);
+    test_array *a2 = grow(a1, 2, 1, 20);
+    test_array *a3 = grow(*a2, 3, 2, 30);
+
+    for (int i = 0; i < 4; i++) {
+        for (int j = 0; j < 3; j++) {
+            for (int k = 0; k < 2; k++) {
+                std::vector<int> x({10, 20, 30, 40});
+                test_array *a4 = grow(*a3, 4, 3, 40);
+
+                auto test_fn = [&] (int idx, int sz) {
+                    a4->erase(idx);
+                    x.erase(x.begin() + idx);
+                    BOOST_REQUIRE(size_check(*a4, sz, 4 - sz));
+                    for (int a = 0; a < sz; a++) {
+                        BOOST_REQUIRE(x[a] == *(*a4)[a]);
+                    }
+                };
+
+                test_fn(i, 3);
+                test_fn(j, 2);
+                test_fn(k, 1);
+
+                current_allocator().destroy(a4);
+            }
+        }
+    }
+
+    current_allocator().destroy(a3);
+    current_allocator().destroy(a2);
+}
+
+SEASTAR_THREAD_TEST_CASE(test_lower_bound) {
+    test_array a1(12);
+    struct compare {
+        int operator()(const element& a, const element& b) const { return *a - *b; }
+    };
+
+    test_array *a2 = grow(a1, 2, 1, 14);
+
+    auto i = a2->lower_bound(element(13), compare{});
+    BOOST_REQUIRE(*i == 14 && a2->index_of(i) == 1);
+
+    test_array *a3 = grow(*a2, 3, 2, 17);
+
+    bool match;
+    BOOST_REQUIRE(*a3->lower_bound(element(11), compare{}, match) == 12 && !match);
+    BOOST_REQUIRE(*a3->lower_bound(element(12), compare{}, match) == 12 && match);
+    BOOST_REQUIRE(*a3->lower_bound(element(13), compare{}, match) == 14 && !match);
+    BOOST_REQUIRE(*a3->lower_bound(element(14), compare{}, match) == 14 && match);
+    BOOST_REQUIRE(*a3->lower_bound(element(15), compare{}, match) == 17 && !match);
+    BOOST_REQUIRE(*a3->lower_bound(element(16), compare{}, match) == 17 && !match);
+    BOOST_REQUIRE(*a3->lower_bound(element(17), compare{}, match) == 17 && match);
+    BOOST_REQUIRE(a3->lower_bound(element(18), compare{}, match) == a3->end());
+
+    current_allocator().destroy(a3);
+    current_allocator().destroy(a2);
+}
+
+SEASTAR_THREAD_TEST_CASE(test_from_element) {
+    test_array a1(12);
+    test_array *a2 = grow(a1, 2, 1, 14);
+    test_array *a3 = grow(*a2, 3, 2, 17);
+
+    element* i = &((*a3)[2]);
+    BOOST_REQUIRE(*i == 17);
+    int idx;
+    test_array& x = test_array::from_element(i, idx);
+    BOOST_REQUIRE(&x == a3 && idx == 2);
+
+    current_allocator().destroy(a3);
+    current_allocator().destroy(a2);
+}
diff --git a/utils/collection-concepts.hh b/utils/collection-concepts.hh
index 4bf32ce203..bc98939445 100644
--- a/utils/collection-concepts.hh
+++ b/utils/collection-concepts.hh
@@ -40,3 +40,14 @@ SEASTAR_CONCEPT(
     template <typename Key1, typename Key2, typename Less>
     concept LessNothrowComparable = LessComparable<Key1, Key2, Less> && std::is_nothrow_invocable_v<Less, Key1, Key2>;
 )
+
+SEASTAR_CONCEPT(
+    template <typename T1, typename T2, typename Compare>
+    concept Comparable = requires (const T1& a, const T2& b, Compare cmp) {
+        // The Comparable is trichotomic comparator that should return 
+        //   negative value when a < b
+        //   zero when a == b
+        //   positive value when a > b
+        { cmp(a, b) } -> std::same_as<int>;
+    };
+)
diff --git a/utils/intrusive-array.hh b/utils/intrusive-array.hh
new file mode 100644
index 0000000000..d198203b83
--- /dev/null
+++ b/utils/intrusive-array.hh
@@ -0,0 +1,354 @@
+/*
+ * Copyright (C) 2020 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <array>
+#include <cassert>
+#include <seastar/util/concepts.hh>
+
+#include "utils/allocation_strategy.hh"
+#include "utils/collection-concepts.hh"
+
+SEASTAR_CONCEPT(
+    template <typename T>
+    concept BoundsKeeper = requires (T val, bool bit) {
+        { val.is_head() } noexcept -> std::same_as<bool>;
+        { val.set_head(bit) } noexcept -> std::same_as<void>;
+        { val.is_tail() } noexcept -> std::same_as<bool>;
+        { val.set_tail(bit) } noexcept -> std::same_as<void>;
+        { val.with_train() } noexcept -> std::same_as<bool>;
+        { val.set_train(bit) } noexcept -> std::same_as<void>;
+    };
+)
+
+/*
+ * A plain array of T-s that grows and shrinks by constructing new
+ * instances. Holds at least one element. Has facilities for sorting
+ * the elements and for doing "container_of" by the given element
+ * pointer. LSA-compactible.
+ *
+ * Important feature of the array is zero memory overhead -- it doesn't
+ * keep its size/capacity onboard. The size is calculated each time by
+ * walking the array of T-s and checking which one of them is the tail
+ * element. Respectively, the T must keep head/tail flags on itself.
+ */
+template <typename T>
+SEASTAR_CONCEPT( requires BoundsKeeper<T> && std::is_nothrow_move_constructible_v<T> )
+class intrusive_array {
+    // Sanity constant to avoid infinite loops searching for tail
+    static constexpr int max_len = std::numeric_limits<short int>::max();
+
+    union maybe_constructed {
+        maybe_constructed() { }
+        ~maybe_constructed() { }
+        T object;
+
+        /*
+         * Train is 1 or more allocated but unoccupied memory slots after 
+         * the tail one. Being unused, this memory keeps the train length.
+         * An array with the train is marked with the respective flag on 
+         * the 0th element. Train is created by the erase() call, which
+         * asserts that the train length stays within max_len
+         *
+         * Train length is included into the storage_size() to make 
+         * allocator and compaction work correctly, but is not included 
+         * into the number_of_elements(), so the array behaves just like
+         * there's no train
+         *
+         * Respectively both grow and shrink constructors do not carry 
+         * the train (and drop the bit from 0th element) and don't expect 
+         * the memory for the new array to include one
+         */
+        unsigned short train_len;
+        static_assert(sizeof(T) >= sizeof(unsigned short));
+    };
+
+    maybe_constructed   _data[1];
+
+    size_t number_of_elements() const noexcept {
+        for (int i = 0; i < max_len; i++) { // linear scan for the tail flag
+            if (_data[i].object.is_tail()) {
+                return i + 1; // the tail is the last element, so its index + 1 is the count
+            }
+        }
+
+        std::abort(); // no tail flag seen within max_len slots
+    }
+
+    size_t storage_size() const noexcept {
+        size_t nr = number_of_elements();
+        if (_data[0].object.with_train()) { // the train flag is kept on the 0th element
+            nr += _data[nr].train_len; // the train length is read from the slot right after the tail
+        }
+        return nr * sizeof(T); // bytes taken by the elements plus the train slots, if any
+    }
+
+public:
+    using iterator = T*;
+    using const_iterator = const T*;
+
+    /*
+     * There are 3 constructing options for the array: initial, grow
+     * and shrink.
+     *
+     * * initial just creates a 1-element array
+     * * grow -- makes a new one moving all elements from the original
+     * array and inserting the one (only one) more element at the given
+     * position
+     * * shrink -- also makes a new array skipping the not needed
+     * element while moving them from the original one
+     *
+     * In all cases a big enough memory chunk must be provided by the
+     * caller!
+     *
+     * Note, that none of them calls destructors on T-s, unlike vector.
+     * This is because when the older array is destroyed it has no idea
+     * about whether or not it was grown/shrunk and thus it destroys
+     * T-s itself.
+     */
+
+    // Initial
+    template <typename... Args>
+    intrusive_array(Args&&... args) {
+        new (&_data[0].object) T(std::forward<Args>(args)...);
+        _data[0].object.set_head(true);
+        _data[0].object.set_tail(true);
+    }
+
+    // Growing
+    struct grow_tag {
+        int add_pos;
+    };
+
+    template <typename... Args>
+    intrusive_array(intrusive_array& from, grow_tag grow, Args&&... args) {
+        // The add_pos is strongly _expected_ to be within bounds
+        int i, off = 0; // off turns 1 once the gap for the new element has been skipped
+        bool tail = false;
+
+        for (i = 0; !tail; i++) {
+            if (i == grow.add_pos) {
+                off = 1; // leave slot add_pos unconstructed for now
+                continue;
+            }
+
+            tail = from._data[i - off].object.is_tail(); // stop after the source's tail has been moved
+            new (&_data[i].object) T(std::move(from._data[i - off].object)); // elements past the gap land one slot to the right
+        }
+
+        assert(grow.add_pos <= i && i < max_len);
+
+        new (&_data[grow.add_pos].object) T(std::forward<Args>(args)...); // fill the gap (or the slot after the old tail) with the new element
+
+        _data[0].object.set_head(true);
+        _data[0].object.set_train(false); // the train is not carried over to the new array
+        if (grow.add_pos == 0) {
+            _data[1].object.set_head(false); // the old head was moved to slot 1 and is no longer first
+        }
+        _data[i - off].object.set_tail(true); // the last slot: the moved old tail, or the new element when it was appended
+        if (off == 0) {
+            _data[i - 1].object.set_tail(false); // the gap was never met, i.e. the new element went after the old tail
+        }
+    }
+
+    // Shrinking
+    struct shrink_tag {
+        int del_pos;
+    };
+
+    intrusive_array(intrusive_array& from, shrink_tag shrink) {
+        int i, off = 0; // off turns 1 once the element to drop has been passed
+        bool tail = false;
+
+        for (i = 0; !tail; i++) {
+            tail = from._data[i].object.is_tail(); // read the flag first: the tail itself may be the one to skip
+            if (i == shrink.del_pos) {
+                off = 1; // skip this element, it is not moved into the new array
+            } else {
+                new (&_data[i - off].object) T(std::move(from._data[i].object)); // elements past del_pos land one slot to the left
+            }
+        }
+
+        _data[0].object.set_head(true); // slot 0 may now hold what used to be element 1
+        _data[0].object.set_train(false); // the train is not carried over to the new array
+        _data[i - off - 1].object.set_tail(true); // i is the source size here, so this is the last slot that was filled
+    }
+
+    intrusive_array(const intrusive_array& other) = delete;
+    intrusive_array(intrusive_array&& other) noexcept {
+        bool tail = false;
+        int i;
+
+        for (i = 0; !tail; i++) {
+            tail = other._data[i].object.is_tail();
+
+            new (&_data[i].object) T(std::move(other._data[i].object));
+        }
+
+        if (_data[0].object.with_train()) {
+            _data[i].train_len = other._data[i].train_len;
+        }
+    }
+
+    ~intrusive_array() {
+        bool tail = false;
+
+        for (int i = 0; !tail; i++) {
+            tail = _data[i].object.is_tail();
+            _data[i].object.~T();
+        }
+    }
+
+    /*
+     * Drops the element in-place at position @pos and grows the
+     * "train". To be used in places where reconstruction is not
+     * welcome (e.g. because it throws)
+     *
+     * A single-element array cannot be erased from, just drop it
+     * altogether if needed
+     */
+    void erase(int pos) noexcept {
+        assert(!is_single_element());
+        assert(pos < max_len);
+
+        bool with_train = _data[0].object.with_train(); // read the flag now, slot 0 may get rebuilt below
+        bool tail = _data[pos].object.is_tail();
+        _data[pos].object.~T();
+
+        if (tail) { // the last element goes away, its predecessor becomes the tail
+            assert(pos > 0);
+            _data[pos - 1].object.set_tail(true);
+        } else { // shift every following element one slot to the left
+            while (!tail) {
+                new (&_data[pos].object) T(std::move(_data[pos + 1].object));
+                _data[pos + 1].object.~T();
+                tail = _data[pos++].object.is_tail(); // pos ends up on the slot vacated right after the new tail
+            }
+            _data[0].object.set_head(true); // slot 0 may have been refilled from slot 1, so re-mark it
+        }
+
+        _data[0].object.set_train(true);
+        unsigned short train_len = with_train ? _data[pos + 1].train_len : 0; // an existing train starts right after the slot freed here
+        assert(train_len < max_len);
+        _data[pos].train_len = train_len + 1; // the freed slot becomes the start of the train and keeps its new length
+    }
+
+    T& operator[](int pos) noexcept { return _data[pos].object; }
+    const T& operator[](int pos) const noexcept { return _data[pos].object; }
+
+    iterator begin() noexcept { return &_data[0].object; }
+    const_iterator begin() const noexcept { return &_data[0].object; }
+    const_iterator cbegin() const noexcept { return &_data[0].object; }
+    iterator end() noexcept { return &_data[number_of_elements()].object; }
+    const_iterator end() const noexcept { return &_data[number_of_elements()].object; }
+    const_iterator cend() const noexcept { return &_data[number_of_elements()].object; }
+
+    size_t index_of(iterator i) const noexcept { return i - &_data[0].object; }
+    size_t index_of(const_iterator i) const noexcept { return i - &_data[0].object; }
+    bool is_single_element() const noexcept { return _data[0].object.is_tail(); }
+
+    // A helper for keeping the array sorted: finds the first element that cmp does not order before val
+    template <typename K, typename Compare>
+    SEASTAR_CONCEPT( requires Comparable<K, T, Compare> )
+    const_iterator lower_bound(const K& val, Compare cmp, bool& match) const {
+        int i = 0;
+        match = false; // always define the out-parameter, even when end() is returned
+        do {
+            int x = cmp(_data[i].object, val); // <0: element goes before val, 0: equal, >0: after
+            if (x >= 0) {
+                match = (x == 0);
+                break;
+            }
+        } while (!_data[i++].object.is_tail()); // the post-increment leaves i one past the tail if nothing was found
+
+        return &_data[i].object; // equals end() when every element compares less than val
+    }
+
+    template <typename K, typename Compare>
+    SEASTAR_CONCEPT( requires Comparable<K, T, Compare> )
+    iterator lower_bound(const K& val, Compare cmp, bool& match) {
+        return const_cast<iterator>(const_cast<const intrusive_array*>(this)->lower_bound(val, std::move(cmp), match));
+    }
+
+    template <typename K, typename Compare>
+    SEASTAR_CONCEPT( requires Comparable<K, T, Compare> )
+    const_iterator lower_bound(const K& val, Compare cmp) const {
+        bool match = false;
+        return lower_bound(val, cmp, match);
+    }
+
+    template <typename K, typename Compare>
+    SEASTAR_CONCEPT( requires Comparable<K, T, Compare> )
+    iterator lower_bound(const K& val, Compare cmp) {
+        return const_cast<iterator>(const_cast<const intrusive_array*>(this)->lower_bound(val, std::move(cmp)));
+    }
+
+    // And its peer ... just to be used
+    template <typename K, typename Compare>
+    SEASTAR_CONCEPT( requires Comparable<K, T, Compare> )
+    const_iterator upper_bound(const K& val, Compare cmp) const {
+        int i = 0;
+
+        do {
+            if (cmp(_data[i].object, val) > 0) {
+                break;
+            }
+        } while (!_data[i++].object.is_tail());
+
+        return &_data[i].object;
+    }
+
+    template <typename K, typename Compare>
+    SEASTAR_CONCEPT( requires Comparable<K, T, Compare> )
+    iterator upper_bound(const K& val, Compare cmp) {
+        return const_cast<iterator>(const_cast<const intrusive_array*>(this)->upper_bound(val, std::move(cmp)));
+    }
+
+    template <typename Func>
+    SEASTAR_CONCEPT(requires Disposer<Func, T>)
+    void for_each(Func&& fn) noexcept {
+        bool tail = false;
+
+        for (int i = 0; !tail; i++) {
+            tail = _data[i].object.is_tail();
+            fn(&_data[i].object);
+        }
+    }
+
+    size_t size() const noexcept { return number_of_elements(); }
+
+    friend size_t size_for_allocation_strategy(const intrusive_array& obj) noexcept {
+        return obj.storage_size();
+    }
+
+    static intrusive_array& from_element(T* ptr, int& idx) noexcept { // finds the array that ptr is an element of; idx receives ptr's index in it
+        idx = 0;
+        while (!ptr->is_head()) { // step back one element at a time until the head-flagged one
+            assert(idx < max_len); // may the force be with us...
+            idx++;
+            ptr--;
+        }
+
+        static_assert(offsetof(intrusive_array, _data[0].object) == 0); // the head element and the array share the same address
+        return *reinterpret_cast<intrusive_array*>(ptr);
+    }
+};

From cf1315cde5aaa43ff28354b4e686984d652d929e Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov <xemul@scylladb.com>
Date: Thu, 23 Apr 2020 13:32:49 +0300
Subject: [PATCH 04/11] double-decker: A combination of B+tree with array

The collection is K:V store

   bplus::tree<Key = K, Value = array_trusted_bounds<V>>

It will be used as partitions cache. The outer tree is used to
quickly map token to cache_entry, the inner array -- to resolve
(expected to be rare) hash collisions.

It also must be equipped with two comparators -- less one for
keys and full one for values. The latter is not kept on-board,
but it required on all calls.

The core API consists of just 2 calls

- Heterogeneous lower_bound(search_key) -> iterator : finds the
  element that's greater than or equal to the provided search key

  Other than the iterator the call returns a "hint" object
  that helps the next call.

- emplace_before(iterator, key, hint, ...) : the call constructs
  the element right before the given iterator. The key and hint
  are needed for a more optimal algorithm, but strictly speaking
  are not required.

  Adding an entry to the double_decker may result in growing the
  node's array. Here the B+ iterator's .reconstruct() method
  comes into play. The new array is created, old elements are
  moved into it, then the fresh node replaces the old one.

// TODO: Ideally this should be turned into the
// template <typename OuterCollection, typename InnerCollection>
// but for now the double_decker still has some intimate knowledge
// about what outer and inner collections are.

Insertion into this collection _may_ invalidate iterators, but
may also leave them intact. Invalidation only happens in case of
a hashing conflict, which can be clearly seen from the hint object,
so there's good room for improvement.

The main usage by row_cache (the find_or_create_entry) looks like

   cache_entry find_or_create_entry() {
       bound_hint hint;

       it = lower_bound(decorated_key, &hint);
       if (!hint.found) {
           it = emplace_before(it, decorated_key.token(), hint,
                                 <constructor args>)
       }
       return *it;
  }

Now the hint. It contains 3 booleans, that are

  - match: set to true when the "greater or equal" condition
    evaluated to "equal". This frees the caller from the need
    to manually check whether the entry returned matches the
    search key or the new one should be inserted.

    This is the "!found" check from the above snippet.

To explain the next 2 bools, here's a small example. Consider
the tree containing two elements {token, partition key}:

   { 3, "a" }, { 5, "z" }

As the collection is sorted they go in the order shown. Next,
this is what the lower_bound would return for some cases:

   { 3, "z" } -> { 5, "z" }
   { 4, "a" } -> { 5, "z" }
   { 5, "a" } -> { 5, "z" }

Apparently, the lower bound for those 3 elements are the same,
but the code-flows of emplacing them before one differ drastically.

   { 3, "z" } : need to get previous element from the tree and
                push the element to its vector's back
   { 4, "a" } : need to create new element in the tree and populate
                its empty vector with the single element
   { 5, "a" } : need to put the new element in the found tree
                element right before the found vector position

To make one of the above decisions the .emplace_before would need
to perform another set of comparisons of keys and elements.
Fortunately, the needed information was already known inside the
lower_bound call and can be reported via the hint.

That said,

  - key_match: set to true if tree.lower_bound() found the element
    for the Key (which is token). For above examples this will be
    true for cases 3z and 5a.

  - key_tail: set to true if the tree element was found, but when
    comparing values from array the bounding element turned out
    to belong to the next tree element and the iterator was ++-ed.
    For above examples this would be true for case 3z only.

And last, but not least -- the "erase self" feature. That is,
given only the cache_entry pointer at hand, remove it from the
collection. To make this happen we need to take two steps:

1. get the array the entry sits in
2. get the b+ tree node the array sits in

Both methods are provided by array_trusted_bounds and bplus::tree.
So, when we need to get iterator from the given T pointer, the algo
looks like

- Walk back the T array until hitting the head element
- Call array_trusted_bounds::from_element() getting the array
- Construct b+ iterator from obtained array
- Construct the double_decker iterator from b+ iterator and from
  the number of "steps back" from above
- Call double_decker::iterator.erase()

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
---
 configure.py                     |   1 +
 test/boost/double_decker_test.cc | 397 ++++++++++++++++++++++++++++++
 utils/double-decker.hh           | 403 +++++++++++++++++++++++++++++++
 3 files changed, 801 insertions(+)
 create mode 100644 test/boost/double_decker_test.cc
 create mode 100644 utils/double-decker.hh

diff --git a/configure.py b/configure.py
index baad29aed6..f160a1faf4 100755
--- a/configure.py
+++ b/configure.py
@@ -390,6 +390,7 @@ scylla_tests = set([
     'test/boost/vint_serialization_test',
     'test/boost/virtual_reader_test',
     'test/boost/bptree_test',
+    'test/boost/double_decker_test',
     'test/manual/ec2_snitch_test',
     'test/manual/gce_snitch_test',
     'test/manual/gossip',
diff --git a/test/boost/double_decker_test.cc b/test/boost/double_decker_test.cc
new file mode 100644
index 0000000000..0f18c69dad
--- /dev/null
+++ b/test/boost/double_decker_test.cc
@@ -0,0 +1,397 @@
+
+/*
+ * Copyright (C) 2020 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#define BOOST_TEST_MODULE double_decker
+
+#include <seastar/core/print.hh>
+#include <boost/test/unit_test.hpp>
+#include <fmt/core.h>
+#include <string>
+
+#include "utils/double-decker.hh"
+#include "test/lib/random_utils.hh"
+
+// Test key: an integer key (the b+ tree level) plus a string sub-key (the array level)
+class compound_key {
+public:
+    int key;
+    std::string sub_key;
+
+    // sk is moved (not copied) into place so that the noexcept promise really holds
+    compound_key(int k, std::string sk) noexcept : key(k), sub_key(std::move(sk)) {}
+
+    compound_key(const compound_key& other) = delete;
+    compound_key(compound_key&& other) noexcept : key(other.key), sub_key(std::move(other.sub_key)) {}
+
+    compound_key& operator=(const compound_key& other) = delete;
+    compound_key& operator=(compound_key&& other) noexcept {
+        key = other.key;
+        sub_key = std::move(other.sub_key);
+        return *this;
+    }
+
+    std::string format() const { return seastar::format("{}.{}", key, sub_key); }
+
+    bool operator==(const compound_key& other) const {
+        return key == other.key && sub_key == other.sub_key;
+    }
+    bool operator!=(const compound_key& other) const { return !(*this == other); }
+
+    // Tri-comparator: negative, zero or positive result; use the sign only
+    struct compare {
+        // Not "a - b", which overflows when the operands are far enough apart
+        int operator()(const int& a, const int& b) const { return (a > b) - (a < b); }
+        int operator()(const int& a, const compound_key& b) const { return (*this)(a, b.key); }
+        int operator()(const compound_key& a, const int& b) const { return (*this)(a.key, b); }
+
+        int operator()(const compound_key& a, const compound_key& b) const {
+            if (a.key != b.key) {
+                return (*this)(a.key, b.key);
+            }
+            return a.sub_key.compare(b.sub_key);
+        }
+    };
+
+    // "Less" predicate derived from the tri-comparator
+    struct less_compare {
+        compare cmp;
+        template <typename A, typename B>
+        bool operator()(const A& a, const B& b) const noexcept {
+            return cmp(a, b) < 0;
+        }
+    };
+};
+
+class test_data {
+    compound_key _key;
+    bool _head = false;
+    bool _tail = false;
+    bool _train = false;
+
+    // _cookie is handed over on move while _cookie2 is allocated by every object;
+    // presumably this lets leak checkers spot missed or repeated destructor calls
+    int *_cookie;
+    int *_cookie2;
+public:
+    bool is_head() const noexcept { return _head; }
+    bool is_tail() const noexcept { return _tail; }
+    bool with_train() const noexcept { return _train; }
+    void set_head(bool v) noexcept { _head = v; }
+    void set_tail(bool v) noexcept { _tail = v; }
+    void set_train(bool v) noexcept { _train = v; }
+
+    test_data(int key, std::string sub) : _key(key, std::move(sub)), _cookie(new int(0)), _cookie2(new int(0)) {}
+
+    test_data(const test_data& other) = delete;
+    test_data(test_data&& other) noexcept : _key(std::move(other._key)),
+            _head(other._head), _tail(other._tail), _train(other._train),
+            _cookie(other._cookie), _cookie2(new int(0)) {
+        other._cookie = nullptr;
+    }
+
+    ~test_data() {
+        delete _cookie; // null once moved from, deleting nullptr is a no-op
+        delete _cookie2;
+    }
+
+    bool operator==(const compound_key& k) const { return _key == k; }
+
+    test_data& operator=(const test_data& other) = delete;
+    test_data& operator=(test_data&& other) = delete;
+
+    std::string format() const { return _key.format(); }
+
+    struct compare {
+        compound_key::compare kcmp;
+        int operator()(const int& a, const int& b) const { return kcmp(a, b); }
+        int operator()(const compound_key& a, const int& b) const { return kcmp(a.key, b); }
+        int operator()(const int& a, const compound_key& b) const { return kcmp(a, b.key); }
+        int operator()(const compound_key& a, const compound_key& b) const { return kcmp(a, b); }
+        int operator()(const compound_key& a, const test_data& b) const { return kcmp(a, b._key); }
+        int operator()(const test_data& a, const compound_key& b) const { return kcmp(a._key, b); }
+        int operator()(const test_data& a, const test_data& b) const { return kcmp(a._key, b._key); }
+    };
+};
+
+using collection = double_decker<int, test_data, compound_key::less_compare, test_data::compare, 4,
+                    bplus::key_search::both, bplus::with_debug::yes>;
+using oracle = std::set<compound_key, compound_key::less_compare>;
+
+BOOST_AUTO_TEST_CASE(test_lower_bound) {
+    collection c(compound_key::less_compare{});
+    test_data::compare cmp;
+    // Two buckets: key 3 holds "e", key 5 holds "i" and "o"
+    c.insert(3, test_data(3, "e"), cmp);
+    c.insert(5, test_data(5, "i"), cmp);
+    c.insert(5, test_data(5, "o"), cmp);
+
+    collection::bound_hint h;
+    // Probe before, inside, between and after the buckets; the hint is checked as well
+    BOOST_REQUIRE(*c.lower_bound(compound_key(2, "a"), cmp, h) == compound_key(3, "e") && !h.key_match);
+    BOOST_REQUIRE(*c.lower_bound(compound_key(3, "a"), cmp, h) == compound_key(3, "e") && h.key_match && !h.key_tail && !h.match);
+    BOOST_REQUIRE(*c.lower_bound(compound_key(3, "e"), cmp, h) == compound_key(3, "e") && h.key_match && !h.key_tail && h.match);
+    BOOST_REQUIRE(*c.lower_bound(compound_key(3, "o"), cmp, h) == compound_key(5, "i") && h.key_match && h.key_tail && !h.match);
+    BOOST_REQUIRE(*c.lower_bound(compound_key(4, "i"), cmp, h) == compound_key(5, "i") && !h.key_match);
+    BOOST_REQUIRE(*c.lower_bound(compound_key(5, "a"), cmp, h) == compound_key(5, "i") && h.key_match && !h.key_tail && !h.match);
+    BOOST_REQUIRE(*c.lower_bound(compound_key(5, "i"), cmp, h) == compound_key(5, "i") && h.key_match && !h.key_tail && h.match);
+    BOOST_REQUIRE(*c.lower_bound(compound_key(5, "l"), cmp, h) == compound_key(5, "o") && h.key_match && !h.key_tail && !h.match);
+    BOOST_REQUIRE(*c.lower_bound(compound_key(5, "o"), cmp, h) == compound_key(5, "o") && h.key_match && !h.key_tail && h.match);
+    BOOST_REQUIRE(c.lower_bound(compound_key(5, "q"), cmp, h) == c.end() && h.key_match && h.key_tail);
+    BOOST_REQUIRE(c.lower_bound(compound_key(6, "q"), cmp, h) == c.end() && !h.key_match);
+
+    c.clear();
+}
+
+BOOST_AUTO_TEST_CASE(test_upper_bound) {
+    collection c(compound_key::less_compare{});
+    test_data::compare cmp;
+    // Two buckets: key 3 holds "e", key 5 holds "i" and "o"
+    c.insert(3, test_data(3, "e"), cmp);
+    c.insert(5, test_data(5, "i"), cmp);
+    c.insert(5, test_data(5, "o"), cmp);
+    // The upper bound of a stored key is the element right after it
+    BOOST_REQUIRE(*c.upper_bound(compound_key(2, "a"), cmp) == compound_key(3, "e"));
+    BOOST_REQUIRE(*c.upper_bound(compound_key(3, "a"), cmp) == compound_key(3, "e"));
+    BOOST_REQUIRE(*c.upper_bound(compound_key(3, "e"), cmp) == compound_key(5, "i"));
+    BOOST_REQUIRE(*c.upper_bound(compound_key(3, "o"), cmp) == compound_key(5, "i"));
+    BOOST_REQUIRE(*c.upper_bound(compound_key(4, "i"), cmp) == compound_key(5, "i"));
+    BOOST_REQUIRE(*c.upper_bound(compound_key(5, "a"), cmp) == compound_key(5, "i"));
+    BOOST_REQUIRE(*c.upper_bound(compound_key(5, "i"), cmp) == compound_key(5, "o"));
+    BOOST_REQUIRE(*c.upper_bound(compound_key(5, "l"), cmp) == compound_key(5, "o"));
+    BOOST_REQUIRE(c.upper_bound(compound_key(5, "o"), cmp) == c.end());
+    BOOST_REQUIRE(c.upper_bound(compound_key(5, "q"), cmp) == c.end());
+    BOOST_REQUIRE(c.upper_bound(compound_key(6, "q"), cmp) == c.end());
+
+    c.clear();
+}
+BOOST_AUTO_TEST_CASE(test_self_iterator) {
+    collection c(compound_key::less_compare{});
+    test_data::compare cmp;
+
+    c.insert(1, test_data(1, "a"), cmp);
+    c.insert(1, test_data(1, "b"), cmp);
+    c.insert(2, test_data(2, "c"), cmp);
+    c.insert(3, test_data(3, "d"), cmp);
+    c.insert(3, test_data(3, "e"), cmp);
+    // Erases an element given nothing but the pointer to it
+    auto erase_by_ptr = [&] (int key, std::string sub) {
+        test_data* d = &*c.find(compound_key(key, sub), cmp);
+        collection::iterator di(d);
+        di.erase(compound_key::less_compare{});
+    };
+
+    erase_by_ptr(1, "b");
+    erase_by_ptr(2, "c");
+    erase_by_ptr(3, "d");
+
+    auto i = c.begin();
+    BOOST_REQUIRE(*i++ == compound_key(1, "a"));
+    BOOST_REQUIRE(*i++ == compound_key(3, "e"));
+    BOOST_REQUIRE(i == c.end());
+
+    c.clear();
+}
+
+BOOST_AUTO_TEST_CASE(test_end_iterator) {
+    collection c(compound_key::less_compare{});
+    test_data::compare cmp;
+
+    c.insert(1, test_data(1, "a"), cmp);
+    auto i = std::prev(c.end()); // stepping back from end() must land on the last element
+    BOOST_REQUIRE(*i == compound_key(1, "a"));
+
+    c.clear();
+}
+
+// Requires every element to compare strictly less than its successor
+void validate_sorted(collection& c) {
+    test_data::compare cmp;
+    auto prev = c.begin();
+    if (prev == c.end()) {
+        return; // an empty collection is trivially sorted
+    }
+    // Slide the (prev, next) pair over the collection until next hits the end
+    auto next = prev;
+    next++;
+    while (next != c.end()) {
+        BOOST_REQUIRE(cmp(*prev, *next) < 0);
+        prev = next;
+        next++;
+    }
+}
+
+// Verifies that the collection and the oracle set hold exactly the same keys
+void compare_with_set(collection& c, oracle& s) {
+    test_data::compare cmp;
+    /* All keys must be findable */
+    for (const compound_key& k : s) {
+        auto j = c.find(k, cmp);
+        BOOST_REQUIRE(j != c.end() && *j == k);
+    }
+
+    /* Both iterators must coincide; fail rather than walk past s.end() */
+    auto i = c.begin();
+    auto j = s.begin();
+    for (; i != c.end(); ++i, ++j) {
+        BOOST_REQUIRE(j != s.end());
+        BOOST_REQUIRE(*i == *j);
+    }
+    BOOST_REQUIRE(j == s.end());
+}
+
+BOOST_AUTO_TEST_CASE(test_insert_via_emplace) {
+    collection c(compound_key::less_compare{});
+    test_data::compare cmp;
+    oracle s;
+
+    // Loop until 4000 distinct random keys made it into the collection
+    for (int nr = 0; nr < 4000; ) {
+        compound_key k(tests::random::get_int<int>(900), tests::random::get_sstring(4));
+        collection::bound_hint h;
+        auto pos = c.lower_bound(k, cmp, h);
+        if (pos != c.end() && h.match) {
+            continue; // the key is already there, roll another one
+        }
+
+        auto it = c.emplace_before(pos, k.key, h, k.key, k.sub_key);
+        BOOST_REQUIRE(*it == k);
+        s.insert(std::move(k));
+        nr++;
+    }
+
+    compare_with_set(c, s);
+    c.clear();
+}
+
+BOOST_AUTO_TEST_CASE(test_insert_and_erase) {
+    collection c(compound_key::less_compare{});
+    test_data::compare cmp;
+    int nr = 0;
+
+    while (nr < 500) {
+        compound_key k(tests::random::get_int<int>(100), tests::random::get_sstring(3));
+
+        if (c.find(k, cmp) == c.end()) {
+            auto it = c.insert(k.key, test_data(k.key, k.sub_key), cmp);
+            BOOST_REQUIRE(*it == k);
+            nr++;
+        }
+    }
+
+    validate_sorted(c);
+
+    // Erase randomly picked elements one by one until nothing is left
+    while (nr > 0) {
+        int n = tests::random::get_int<int>() % nr;
+        auto i = c.begin();
+        for (; n > 0; n--) {
+            i++;
+        }
+
+        i.erase(compound_key::less_compare{});
+        nr--;
+
+        validate_sorted(c);
+    }
+    BOOST_REQUIRE(c.empty());
+}
+
+BOOST_AUTO_TEST_CASE(test_compaction) {
+    logalloc::region reg;
+    with_allocator(reg.allocator(), [&] {
+        collection c(compound_key::less_compare{});
+        test_data::compare cmp;
+        oracle s;
+        // Populate with the reclaim lock held for the whole filling scope
+        {
+            logalloc::reclaim_lock rl(reg);
+
+            int nr = 0;
+
+            while (nr < 1500) {
+                compound_key k(tests::random::get_int<int>(400), tests::random::get_sstring(3));
+
+                if (c.find(k, cmp) == c.end()) {
+                    auto it = c.insert(k.key, test_data(k.key, k.sub_key), cmp);
+                    BOOST_REQUIRE(*it == k);
+                    s.insert(std::move(k));
+                    nr++;
+                }
+            }
+        }
+        // Compact the region, then check that no entry was lost or corrupted
+        reg.full_compaction();
+
+        compare_with_set(c, s);
+        c.clear();
+    });
+}
+
+BOOST_AUTO_TEST_CASE(test_range_erase) {
+    std::vector<compound_key> keys;
+    test_data::compare cmp;
+
+    keys.emplace_back(1, "a");
+    keys.emplace_back(1, "b");
+    keys.emplace_back(1, "c");
+    keys.emplace_back(1, "d");
+    keys.emplace_back(2, "a");
+    keys.emplace_back(2, "b");
+    keys.emplace_back(2, "c");
+    keys.emplace_back(2, "d");
+    keys.emplace_back(2, "e");
+    keys.emplace_back(3, "a");
+    keys.emplace_back(3, "b");
+    keys.emplace_back(3, "c");
+    // Erase every possible [f, t) range, each time from a freshly filled collection
+    for (size_t f = 0; f < keys.size(); f++) {
+        for (size_t t = f; t <= keys.size(); t++) {
+            collection c(compound_key::less_compare{});
+
+            for (auto&& k : keys) {
+                c.insert(k.key, test_data(k.key, k.sub_key), cmp);
+            }
+
+            auto iter_at = [&c] (size_t at) -> collection::iterator {
+                auto it = c.begin();
+                for (size_t i = 0; i < at; i++, it++) ;
+                return it;
+            };
+
+            auto n = c.erase(iter_at(f), iter_at(t));
+            // Survivors must be intact and n must point right after the erased range
+            auto r = c.begin();
+            for (size_t i = 0; i < keys.size(); i++) {
+                if (i < f || i >= t) {
+                    if (i == t) {
+                        BOOST_REQUIRE(*n == keys[i]);
+                    }
+                    BOOST_REQUIRE(*(r++) == keys[i]);
+                }
+            }
+            if (t == keys.size()) {
+                BOOST_REQUIRE(n == c.end());
+            }
+            BOOST_REQUIRE(r == c.end());
+        }
+    }
+}
diff --git a/utils/double-decker.hh b/utils/double-decker.hh
new file mode 100644
index 0000000000..5616d44b4d
--- /dev/null
+++ b/utils/double-decker.hh
@@ -0,0 +1,403 @@
+/*
+ * Copyright (C) 2020 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <type_traits>
+#include <seastar/util/concepts.hh>
+#include "utils/bptree.hh"
+#include "utils/intrusive-array.hh"
+#include "utils/collection-concepts.hh"
+#include <fmt/core.h>
+
+/*
+ * The double-decker is the ordered keeper of key:value pairs having
+ * the pairs sorted by both key and value (key first).
+ *
+ * Key collisions are expected to be rare enough to afford holding the
+ * values in a sorted array with the help of linear algorithms.
+ */
+
+template <typename Key, typename T, typename Less, typename Compare, int NodeSize,
+            bplus::key_search Search = bplus::key_search::binary, bplus::with_debug Debug = bplus::with_debug::no>
+SEASTAR_CONCEPT( requires Comparable<T, T, Compare> && std::is_nothrow_move_constructible_v<T> )
+class double_decker {
+public:
+    using inner_array = intrusive_array<T>;
+    using outer_tree = bplus::tree<Key, inner_array, Less, NodeSize, Search, Debug>;
+    using outer_iterator = typename outer_tree::iterator;
+    using outer_const_iterator = typename outer_tree::const_iterator;
+
+private:
+    outer_tree  _tree;
+
+public:
+    template <bool Const>
+    class iterator_base {
+        friend class double_decker;
+        using outer_iterator = std::conditional_t<Const, typename double_decker::outer_const_iterator, typename double_decker::outer_iterator>;
+        // A position is a bucket in the tree plus an index in the bucket's array
+    protected:
+        outer_iterator _bucket;
+        int _idx;
+
+    public:
+        iterator_base() = default;
+        iterator_base(outer_iterator bkt, int idx) noexcept : _bucket(bkt), _idx(idx) {}
+
+        using iterator_category = std::bidirectional_iterator_tag;
+        using difference_type = ssize_t;
+        using value_type = std::conditional_t<Const, const T, T>;
+        using pointer = value_type*;
+        using reference = value_type&;
+
+        reference operator*() const noexcept { return (*_bucket)[_idx]; }
+        pointer operator->() const noexcept { return &((*_bucket)[_idx]); }
+
+        iterator_base& operator++() noexcept {
+            if ((*_bucket)[_idx++].is_tail()) { // stepped over the bucket's tail element
+                _bucket++;
+                _idx = 0;
+            }
+
+            return *this;
+        }
+
+        iterator_base operator++(int) noexcept {
+            iterator_base cur = *this;
+            operator++();
+            return cur;
+        }
+
+        iterator_base& operator--() noexcept {
+            if (_idx-- == 0) { // was at a bucket's head: go to the previous bucket's last element
+                _bucket--;
+                _idx = _bucket->index_of(_bucket->end()) - 1;
+            }
+
+            return *this;
+        }
+
+        iterator_base operator--(int) noexcept {
+            iterator_base cur = *this;
+            operator--();
+            return cur;
+        }
+
+        bool operator==(const iterator_base& o) const noexcept { return _bucket == o._bucket && _idx == o._idx; }
+        bool operator!=(const iterator_base& o) const noexcept { return !(*this == o); }
+    };
+
+    using const_iterator = iterator_base<true>;
+
+    class iterator final : public iterator_base<false> {
+        friend class double_decker;
+        using super = iterator_base<false>;
+        // Drops constness; private, used by the non-const lookups of double_decker
+        iterator(const const_iterator&& other) noexcept : super(std::move(other._bucket), other._idx) {}
+
+    public:
+        iterator() noexcept : super() {}
+        iterator(outer_iterator bkt, int idx) noexcept : super(bkt, idx) {}
+        // Rebuilds the iterator from a bare element pointer (the "erase self" case)
+        iterator(T* ptr) noexcept {
+            inner_array& arr = inner_array::from_element(ptr, super::_idx);
+            super::_bucket = outer_iterator(&arr);
+        }
+        // Disposes of the element, erases it and returns the iterator to what followed it
+        template <typename Func>
+        SEASTAR_CONCEPT(requires Disposer<Func, T>)
+        iterator erase_and_dispose(Less less, Func&& disp) noexcept {
+            disp(&**this); // * to deref this, * to call operator*, & to get addr from ref
+
+            if (super::_bucket->is_single_element()) { // the whole bucket goes away
+                outer_iterator bkt = super::_bucket.erase(less);
+                return iterator(bkt, 0);
+            }
+
+            bool tail = (*super::_bucket)[super::_idx].is_tail();
+            super::_bucket->erase(super::_idx);
+            if (tail) { // erased the bucket's last element, step to the next bucket
+                super::_bucket++;
+                super::_idx = 0;
+            }
+
+            return *this;
+        }
+
+        iterator erase(Less less) noexcept { return erase_and_dispose(less, bplus::default_dispose<T>); }
+    };
+
+    /*
+     * Structure that sheds some more light on how the lower_bound
+     * actually found the bounding element.
+     */
+    struct bound_hint {
+        /*
+         * Set to true if the found element fully matches the key
+         * according to Compare
+         */
+        bool match;
+        /*
+         * Set to true if the bucket for the given key exists
+         */
+        bool key_match;
+        /*
+         * Set to true if the given key is greater than anything
+         * in its bucket and the iterator was switched to the next
+         * one (lower_bound leaves it false when key_match is false)
+         */
+        bool key_tail;
+
+        /*
+         * This helper says whether the emplace will invalidate (some)
+         * iterators or not. Emplacing with !key_match will go and create
+         * a new node in B+ which doesn't invalidate iterators. Otherwise
+         * some existing B+ data node will be reconstructed, so the
+         * iterators on those nodes will become invalid.
+         */
+        bool emplace_keeps_iterators() const noexcept { return !key_match; }
+    };
+
+    iterator begin() noexcept { return iterator(_tree.begin(), 0); }
+    const_iterator begin() const noexcept { return const_iterator(_tree.begin(), 0); }
+    const_iterator cbegin() const noexcept { return const_iterator(_tree.begin(), 0); }
+
+    iterator end() noexcept { return iterator(_tree.end(), 0); }
+    const_iterator end() const noexcept { return const_iterator(_tree.end(), 0); }
+    const_iterator cend() const noexcept { return const_iterator(_tree.end(), 0); }
+
+    explicit double_decker(Less less) noexcept : _tree(less) { }
+
+    double_decker(const double_decker& other) = delete;
+    double_decker(double_decker&& other) noexcept : _tree(std::move(other._tree)) {}
+
+    iterator insert(Key k, T value, Compare cmp) {
+        std::pair<outer_iterator, bool> oip = _tree.emplace(std::move(k), std::move(value));
+        outer_iterator& bkt = oip.first;
+        int idx = 0; // a freshly created bucket holds the value at index 0
+
+        if (!oip.second) {
+            /*
+             * Unlikely: the bucket already exists, so rebuild its array with one
+             * more slot. Relies on the failed emplace() above not moving the value.
+             */
+            idx = bkt->index_of(bkt->lower_bound(value, cmp));
+            size_t new_size = (bkt->size() + 1) * sizeof(T);
+            bkt.reconstruct(new_size, *bkt,
+                    typename inner_array::grow_tag{idx}, std::move(value));
+        }
+
+        return iterator(bkt, idx);
+    }
+
+    template <typename... Args>
+    iterator emplace_before(iterator i, Key k, const bound_hint& hint, Args&&... args) {
+        assert(!hint.match); // an equal element is already there, nothing to emplace
+        outer_iterator& bucket = i._bucket;
+
+        if (!hint.key_match) {
+            /*
+             * The most expected case -- no key conflict, i.e. the bucket
+             * is not found and i points to the next one. Just go ahead
+             * and emplace the new bucket before i and push the 0th
+             * element into it.
+             */
+            outer_iterator nb = bucket.emplace_before(std::move(k), _tree.less(), std::forward<Args>(args)...);
+            return iterator(nb, 0);
+        }
+
+        /*
+         * Key conflict, need to expand some inner array, but there are
+         * still two cases -- either the bounding element is in k's bucket
+         * or the bound search overflowed and switched to the next one.
+         */
+
+        int idx = i._idx;
+
+        if (hint.key_tail) {
+            /*
+             * The latter case -- i points to the next bucket. Need to step
+             * back and append the new element to the previous bucket's tail.
+             */
+            bucket--;
+            idx = bucket->index_of(bucket->end());
+        }
+
+        size_t new_size = (bucket->size() + 1) * sizeof(T);
+        bucket.reconstruct(new_size, *bucket,
+                typename inner_array::grow_tag{idx}, std::forward<Args>(args)...);
+        return iterator(bucket, idx);
+    }
+
+    template <typename K = Key>
+    SEASTAR_CONCEPT( requires Comparable<K, T, Compare> )
+    const_iterator find(const K& key, Compare cmp) const {
+        // First locate the bucket by the key, then the exact element inside it
+        outer_const_iterator bucket = _tree.find(key);
+        if (bucket == _tree.end()) {
+            return const_iterator(bucket, 0);
+        }
+
+        bool exact = false;
+        const int idx = bucket->index_of(bucket->lower_bound(key, cmp, exact));
+        if (!exact) {
+            // The bucket is there, but no element in it equals the key
+            return const_iterator(_tree.end(), 0);
+        }
+        return const_iterator(bucket, idx);
+    }
+
+    template <typename K = Key>
+    SEASTAR_CONCEPT( requires Comparable<K, T, Compare> )
+    iterator find(const K& k, Compare cmp) {
+        return iterator(const_cast<const double_decker*>(this)->find(k, std::move(cmp)));
+    }
+
+    template <typename K = Key>
+    SEASTAR_CONCEPT( requires Comparable<K, T, Compare> )
+    const_iterator lower_bound(const K& key, Compare cmp, bound_hint& hint) const {
+        outer_const_iterator bkt = _tree.lower_bound(key, hint.key_match); // first level: the bucket
+
+        hint.key_tail = false;
+        hint.match = false;
+
+        if (bkt == _tree.end() || !hint.key_match) { // no bucket for this key, nothing to search in
+            return const_iterator(bkt, 0);
+        }
+
+        int i = bkt->index_of(bkt->lower_bound(key, cmp, hint.match)); // second level: the array
+
+        if (i != 0 && (*bkt)[i - 1].is_tail()) {
+            /*
+             * The lower_bound is after the last element -- shift
+             * to the next bucket's 0th one.
+             */
+            bkt++;
+            i = 0;
+            hint.key_tail = true;
+        }
+
+        return const_iterator(bkt, i);
+    }
+
+    template <typename K = Key>
+    SEASTAR_CONCEPT( requires Comparable<K, T, Compare> )
+    iterator lower_bound(const K& key, Compare cmp, bound_hint& hint) {
+        return iterator(const_cast<const double_decker*>(this)->lower_bound(key, std::move(cmp), hint));
+    }
+
+    template <typename K = Key>
+    SEASTAR_CONCEPT( requires Comparable<K, T, Compare> )
+    const_iterator lower_bound(const K& key, Compare cmp) const {
+        bound_hint hint; // filled in by the call below and then thrown away
+        return lower_bound(key, cmp, hint);
+    }
+
+    template <typename K = Key>
+    SEASTAR_CONCEPT( requires Comparable<K, T, Compare> )
+    iterator lower_bound(const K& key, Compare cmp) {
+        return iterator(const_cast<const double_decker*>(this)->lower_bound(key, std::move(cmp)));
+    }
+
+    template <typename K = Key>
+    SEASTAR_CONCEPT( requires Comparable<K, T, Compare> )
+    const_iterator upper_bound(const K& key, Compare cmp) const {
+        bool key_match;
+        outer_const_iterator bkt = _tree.lower_bound(key, key_match); // first level: the bucket
+
+        if (bkt == _tree.end() || !key_match) { // no bucket for this key, nothing to search in
+            return const_iterator(bkt, 0);
+        }
+
+        int i = bkt->index_of(bkt->upper_bound(key, cmp)); // second level: the array
+
+        if (i != 0 && (*bkt)[i - 1].is_tail()) {
+            // Beyond the end() boundary
+            bkt++;
+            i = 0;
+        }
+
+        return const_iterator(bkt, i);
+    }
+
+    template <typename K = Key>
+    SEASTAR_CONCEPT( requires Comparable<K, T, Compare> )
+    iterator upper_bound(const K& key, Compare cmp) {
+        return iterator(const_cast<const double_decker*>(this)->upper_bound(key, std::move(cmp)));
+    }
+
+    template <typename Func>
+    SEASTAR_CONCEPT(requires Disposer<Func, T>)
+    void clear_and_dispose(Func&& disp) noexcept {
+        _tree.clear_and_dispose([&disp] (inner_array* arr) noexcept {
+            arr->for_each(disp); // presumably calls disp on every element of the array
+        });
+    }
+
+    void clear() noexcept { clear_and_dispose(bplus::default_dispose<T>); }
+
+    template <typename Func>
+    SEASTAR_CONCEPT(requires Disposer<Func, T>)
+    iterator erase_and_dispose(iterator begin, iterator end, Func&& disp) noexcept {
+        bool same_bucket = begin._bucket == end._bucket;
+
+        // Drop the tail of the starting bucket if it's not fully erased
+        while (begin._idx != 0) {
+            if (same_bucket) {
+                if (begin == end) {
+                    return begin;
+                }
+                end._idx--; // the erase below removes one element in front of end
+            }
+
+            begin = begin.erase_and_dispose(_tree.less(), disp);
+        }
+
+        // Drop all the buckets in between
+        outer_iterator nb = _tree.erase_and_dispose(begin._bucket, end._bucket, [&disp] (inner_array* arr) noexcept {
+            arr->for_each(disp);
+        });
+
+        assert(nb == end._bucket);
+
+        /*
+         * Drop the head of the ending bucket. Every erased element is the 0th
+         * one; when erased it will shift the rest left and reconstruct the array,
+         * thus we cannot rely on the end to keep either its _bucket or its _idx.
+         *
+         * That said -- just erase the required number of elements. The corner case
+         * of end pointing to the tree end is handled: _idx is 0 in this case.
+         */
+        iterator next(nb, 0);
+        while (end._idx-- != 0) {
+            next = next.erase_and_dispose(_tree.less(), disp);
+        }
+
+        return next;
+    }
+
+    iterator erase(iterator begin, iterator end) noexcept {
+        return erase_and_dispose(begin, end, bplus::default_dispose<T>);
+    }
+
+    bool empty() const noexcept { return _tree.empty(); }
+};

From 1e15c068891bbd3a995d122387c12072d09af542 Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov <xemul@scylladb.com>
Date: Thu, 21 May 2020 20:12:07 +0300
Subject: [PATCH 05/11] dht: Detach ring_position_comparator_for_sstables

Next patches will generalize ring_position_comparator with templates
to replace cache_entry's and memtable_entry's comparators. The overload
of operator() for sstables has its own implementation that differs from
the "generic" one, so for smoother generalization it's better to detach it.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
---
 dht/i_partitioner.cc     |  8 ++------
 dht/i_partitioner.hh     | 11 ++++++++++-
 sstables/index_reader.hh |  2 +-
 3 files changed, 13 insertions(+), 8 deletions(-)

diff --git a/dht/i_partitioner.cc b/dht/i_partitioner.cc
index 665a81415e..1d269dea85 100644
--- a/dht/i_partitioner.cc
+++ b/dht/i_partitioner.cc
@@ -316,11 +316,7 @@ int ring_position_tri_compare(const schema& s, ring_position_view lh, ring_posit
     }
 }
 
-int ring_position_comparator::operator()(ring_position_view lh, ring_position_view rh) const {
-    return ring_position_tri_compare(s, lh, rh);
-}
-
-int ring_position_comparator::operator()(ring_position_view lh, sstables::decorated_key_view rh) const {
+int ring_position_comparator_for_sstables::operator()(ring_position_view lh, sstables::decorated_key_view rh) const {
     auto token_cmp = tri_compare(*lh._token, rh.token());
     if (token_cmp) {
         return token_cmp;
@@ -334,7 +330,7 @@ int ring_position_comparator::operator()(ring_position_view lh, sstables::decora
     return lh._weight;
 }
 
-int ring_position_comparator::operator()(sstables::decorated_key_view a, ring_position_view b) const {
+int ring_position_comparator_for_sstables::operator()(sstables::decorated_key_view a, ring_position_view b) const {
     return -(*this)(b, a);
 }
 
diff --git a/dht/i_partitioner.hh b/dht/i_partitioner.hh
index 115423d92a..23f77400ad 100644
--- a/dht/i_partitioner.hh
+++ b/dht/i_partitioner.hh
@@ -330,6 +330,7 @@ public:
 class ring_position_view {
     friend int ring_position_tri_compare(const schema& s, ring_position_view lh, ring_position_view rh);
     friend class ring_position_comparator;
+    friend class ring_position_comparator_for_sstables;
     friend class ring_position_ext;
 
     // Order is lexicographical on (_token, _key) tuples, where _key part may be missing, and
@@ -570,7 +571,15 @@ int ring_position_tri_compare(const schema& s, ring_position_view lh, ring_posit
 struct ring_position_comparator {
     const schema& s;
     ring_position_comparator(const schema& s_) : s(s_) {}
-    int operator()(ring_position_view, ring_position_view) const;
+
+    int operator()(ring_position_view lh, ring_position_view rh) const {
+        return ring_position_tri_compare(s, lh, rh);
+    }
+};
+
+struct ring_position_comparator_for_sstables {
+    const schema& s;
+    ring_position_comparator_for_sstables(const schema& s_) : s(s_) {}
     int operator()(ring_position_view, sstables::decorated_key_view) const;
     int operator()(sstables::decorated_key_view, ring_position_view) const;
 };
diff --git a/sstables/index_reader.hh b/sstables/index_reader.hh
index 3879d0cdb6..37fb995d00 100644
--- a/sstables/index_reader.hh
+++ b/sstables/index_reader.hh
@@ -299,7 +299,7 @@ public:
 
 // Less-comparator for lookups in the partition index.
 class index_comparator {
-    dht::ring_position_comparator _tri_cmp;
+    dht::ring_position_comparator_for_sstables _tri_cmp;
 public:
     index_comparator(const schema& s) : _tri_cmp(s) {}
 

From 7b2754cf5f0ef16bb5cbf2e36698d8f23cd825ab Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov <xemul@scylladb.com>
Date: Tue, 2 Jun 2020 20:23:11 +0300
Subject: [PATCH 06/11] row-cache: Use ring_position_comparator in some places

The row cache (and memtable) code uses its own comparators built on top
of the ring_position_comparator for collections of partitions. These
collections will be switched from the key less-compare to the pair
of token less-compare + key tri-compare.

Prepare for the switch by generalizing the ring_position_comparator
and by patching all the non-collection usages of less-compare to use
it.

The memtable code doesn't use it outside of collections, but patch it
anyway as a part of preparations.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
---
 dht/i_partitioner.hh | 21 +++++++++++++++++++++
 memtable.hh          |  1 +
 row_cache.cc         | 18 ++++++++++--------
 row_cache.hh         |  3 +++
 4 files changed, 35 insertions(+), 8 deletions(-)

diff --git a/dht/i_partitioner.hh b/dht/i_partitioner.hh
index 23f77400ad..9d8fa99ea9 100644
--- a/dht/i_partitioner.hh
+++ b/dht/i_partitioner.hh
@@ -567,6 +567,12 @@ public:
 
 int ring_position_tri_compare(const schema& s, ring_position_view lh, ring_position_view rh);
 
+template <typename T>
+requires std::is_convertible<T, ring_position_view>::value
+ring_position_view ring_position_view_to_compare(const T& val) {
+    return val;
+}
+
 // Trichotomic comparator for ring order
 struct ring_position_comparator {
     const schema& s;
@@ -575,6 +581,21 @@ struct ring_position_comparator {
     int operator()(ring_position_view lh, ring_position_view rh) const {
         return ring_position_tri_compare(s, lh, rh);
     }
+
+    template <typename T>
+    int operator()(const T& lh, ring_position_view rh) const {
+        return ring_position_tri_compare(s, ring_position_view_to_compare(lh), rh);
+    }
+
+    template <typename T>
+    int operator()(ring_position_view lh, const T& rh) const {
+        return ring_position_tri_compare(s, lh, ring_position_view_to_compare(rh));
+    }
+
+    template <typename T1, typename T2>
+    int operator()(const T1& lh, const T2& rh) const {
+        return ring_position_tri_compare(s, ring_position_view_to_compare(lh), ring_position_view_to_compare(rh));
+    }
 };
 
 struct ring_position_comparator_for_sstables {
diff --git a/memtable.hh b/memtable.hh
index 47361b7237..fc2e15f9a0 100644
--- a/memtable.hh
+++ b/memtable.hh
@@ -117,6 +117,7 @@ public:
         }
     };
 
+    friend dht::ring_position_view ring_position_view_to_compare(const memtable_entry& mt) { return mt._key; }
     friend std::ostream& operator<<(std::ostream&, const memtable_entry&);
 };
 
diff --git a/row_cache.cc b/row_cache.cc
index 873c535338..4ab5cd57f8 100644
--- a/row_cache.cc
+++ b/row_cache.cc
@@ -283,13 +283,14 @@ public:
     // Can be called on invalid cursor, in which case it brings it back to validity.
     // Strong exception guarantees.
     bool advance_to(dht::ring_position_view pos) {
-        auto cmp = cache_entry::compare(_cache.get()._schema);
-        if (cmp(_end_pos, pos)) { // next() may have moved _start_pos past the _end_pos.
+        auto lcmp = cache_entry::compare(_cache.get()._schema);
+        dht::ring_position_comparator cmp(*_cache.get()._schema);
+        if (cmp(_end_pos, pos) < 0) { // next() may have moved _start_pos past the _end_pos.
             _end_pos = pos;
         }
-        _end = _cache.get()._partitions.lower_bound(_end_pos, cmp);
-        _it = _cache.get()._partitions.lower_bound(pos, cmp);
-        auto same = !cmp(pos, _it->position());
+        _end = _cache.get()._partitions.lower_bound(_end_pos, lcmp);
+        _it = _cache.get()._partitions.lower_bound(pos, lcmp);
+        auto same = cmp(pos, _it->position()) >= 0;
         set_position(*_it);
         _last_reclaim_count = _cache.get().get_cache_tracker().allocator().invalidate_counter();
         return same;
@@ -754,10 +755,11 @@ row_cache::make_reader(schema_ptr s,
     if (!ctx->is_range_query() && !fwd_mr) {
         auto mr = _read_section(_tracker.region(), [&] {
             return with_linearized_managed_bytes([&] {
-                cache_entry::compare cmp(_schema);
+                cache_entry::compare lcmp(_schema);
+                dht::ring_position_comparator cmp(*_schema);
                 auto&& pos = ctx->range().start()->value();
-                auto i = _partitions.lower_bound(pos, cmp);
-                if (i != _partitions.end() && !cmp(pos, i->position())) {
+                auto i = _partitions.lower_bound(pos, lcmp);
+                if (i != _partitions.end() && cmp(pos, i->position()) >= 0) {
                     cache_entry& e = *i;
                     upgrade_entry(e);
                     on_partition_hit();
diff --git a/row_cache.hh b/row_cache.hh
index 75b06d15a1..41275b3527 100644
--- a/row_cache.hh
+++ b/row_cache.hh
@@ -137,6 +137,9 @@ public:
         }
         return _key;
     }
+
+    friend dht::ring_position_view ring_position_view_to_compare(const cache_entry& ce) noexcept { return ce.position(); }
+
     const partition_entry& partition() const noexcept { return _pe; }
     partition_entry& partition() { return _pe; }
     const schema_ptr& schema() const noexcept { return _schema; }

From ae28814b1c46f5158673ea818ddb036ae5c86221 Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov <xemul@scylladb.com>
Date: Mon, 18 May 2020 19:05:49 +0300
Subject: [PATCH 07/11] token: Introduce raw() helper and raw comparator

In the next patches the entries carrying a token will be moved onto
B+-tree rails. The int64_t value of the token will be used as the
B+ key, so prepare for this.

One corner case -- the after_all_keys tokens must be resolved to the
int64::max value to appear at the "end" of the tree. This differs from
the "before_all_keys" case, which maps to the int64::min value that is
not allowed for regular tokens; so, unlike int64::min, int64::max can
conflict with a regular token's raw value. For the sake of the B+
switch this is OK, as the conflicts of raw token values are explicitly
resolved in the next patches.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
---
 dht/token.cc |  8 +-------
 dht/token.hh | 41 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 42 insertions(+), 7 deletions(-)

diff --git a/dht/token.cc b/dht/token.cc
index 1056153d33..8b44757d7d 100644
--- a/dht/token.cc
+++ b/dht/token.cc
@@ -59,13 +59,7 @@ int tri_compare(const token& t1, const token& t2) {
     } else if (t1._kind > t2._kind) {
             return 1;
     } else if (t1._kind == token_kind::key) {
-        auto l1 = long_token(t1);
-        auto l2 = long_token(t2);
-        if (l1 == l2) {
-            return 0;
-        } else {
-            return l1 < l2 ? -1 : 1;
-        }
+        return tri_compare_raw(long_token(t1), long_token(t2));
     }
     return 0;
 }
diff --git a/dht/token.hh b/dht/token.hh
index 8ba1b0b53d..de7f4b14f4 100644
--- a/dht/token.hh
+++ b/dht/token.hh
@@ -160,6 +160,47 @@ public:
         return 0;  // hardcoded for now; unlikely to change
     }
 
+    int64_t raw() const noexcept {
+        if (is_minimum()) {
+            return std::numeric_limits<int64_t>::min();
+        }
+        if (is_maximum()) {
+            return std::numeric_limits<int64_t>::max();
+        }
+
+        return _data;
+    }
+};
+
+static inline int tri_compare_raw(const int64_t l1, const int64_t l2) noexcept {
+    if (l1 == l2) {
+        return 0;
+    } else {
+        return l1 < l2 ? -1 : 1;
+    }
+}
+
+template <typename T>
+concept TokenCarrier = requires (const T& v) {
+    { v.token() } -> std::same_as<const token&>;
+};
+
+struct raw_token_less_comparator {
+    bool operator()(const int64_t k1, const int64_t k2) const noexcept {
+        return dht::tri_compare_raw(k1, k2) < 0;
+    }
+
+    template <typename Key>
+    requires TokenCarrier<Key>
+    bool operator()(const Key& k1, const int64_t k2) const noexcept {
+        return dht::tri_compare_raw(k1.token().raw(), k2) < 0;
+    }
+
+    template <typename Key>
+    requires TokenCarrier<Key>
+    bool operator()(const int64_t k1, const Key& k2) const noexcept {
+        return dht::tri_compare_raw(k1, k2.token().raw()) < 0;
+    }
 };
 
 const token& minimum_token() noexcept;

From dff5eb6f252011b6e2a5920e6c92f2f67ddb2af8 Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov <xemul@scylladb.com>
Date: Wed, 13 May 2020 08:38:28 +0300
Subject: [PATCH 08/11] memtable: Count partitions separately

The B+ tree will not have a constant-time .size() call, so count the
partitions by hand.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
---
 memtable.cc  | 13 ++++++++-----
 memtable.hh  |  4 +++-
 row_cache.cc |  6 +++---
 3 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/memtable.cc b/memtable.cc
index 14c9b02e9e..4dab06b9d2 100644
--- a/memtable.cc
+++ b/memtable.cc
@@ -137,11 +137,16 @@ uint64_t memtable::dirty_size() const {
     return occupancy().total_space();
 }
 
+void memtable::evict_entry(memtable_entry& e, mutation_cleaner& cleaner) noexcept {
+    e.partition().evict(cleaner);
+    nr_partitions--;
+}
+
 void memtable::clear() noexcept {
     auto dirty_before = dirty_size();
     with_allocator(allocator(), [this] {
         partitions.clear_and_dispose([this] (memtable_entry* e) {
-            e->partition().evict(_cleaner);
+            evict_entry(*e, _cleaner);
             current_deleter<memtable_entry>()(e);
         });
     });
@@ -154,6 +159,7 @@ future<> memtable::clear_gently() noexcept {
             auto& alloc = allocator();
 
             auto p = std::move(partitions);
+            nr_partitions = 0;
             while (!p.empty()) {
                 auto dirty_before = dirty_size();
                 with_allocator(alloc, [&] () noexcept {
@@ -210,6 +216,7 @@ memtable::find_or_create_partition(const dht::decorated_key& key) {
         memtable_entry* entry = current_allocator().construct<memtable_entry>(
             _schema, dht::decorated_key(key), mutation_partition(_schema));
         partitions.insert_before(i, *entry);
+        ++nr_partitions;
         ++_table_stats.memtable_partition_insertions;
         return entry->partition();
     } else {
@@ -759,10 +766,6 @@ mutation_source memtable::as_data_source() {
     });
 }
 
-size_t memtable::partition_count() const {
-    return partitions.size();
-}
-
 memtable_entry::memtable_entry(memtable_entry&& o) noexcept
     : _link()
     , _schema(std::move(o._schema))
diff --git a/memtable.hh b/memtable.hh
index fc2e15f9a0..7714d48fbf 100644
--- a/memtable.hh
+++ b/memtable.hh
@@ -138,6 +138,7 @@ private:
     logalloc::allocating_section _read_section;
     logalloc::allocating_section _allocating_section;
     partitions_type partitions;
+    size_t nr_partitions = 0;
     db::replay_position _replay_position;
     db::rp_set _rp_set;
     // mutation source to which reads fall-back after mark_flushed()
@@ -204,6 +205,7 @@ public:
     void apply(const mutation& m, db::rp_handle&& = {});
     // The mutation is upgraded to current schema.
     void apply(const frozen_mutation& m, const schema_ptr& m_schema, db::rp_handle&& = {});
+    void evict_entry(memtable_entry& e, mutation_cleaner& cleaner) noexcept;
 
     static memtable& from_region(logalloc::region& r) {
         return static_cast<memtable&>(r);
@@ -237,7 +239,7 @@ public:
         return _memtable_list;
     }
 
-    size_t partition_count() const;
+    size_t partition_count() const { return nr_partitions; }
     logalloc::occupancy_stats occupancy() const;
 
     // Creates a reader of data in this memtable for given partition range.
diff --git a/row_cache.cc b/row_cache.cc
index 4ab5cd57f8..83e543fb9a 100644
--- a/row_cache.cc
+++ b/row_cache.cc
@@ -900,14 +900,14 @@ void row_cache::invalidate_sync(memtable& m) noexcept {
         bool blow_cache = false;
         // Note: clear_and_dispose() ought not to look up any keys, so it doesn't require
         // with_linearized_managed_bytes(), but invalidate() does.
-        m.partitions.clear_and_dispose([this, deleter = current_deleter<memtable_entry>(), &blow_cache] (memtable_entry* entry) {
+        m.partitions.clear_and_dispose([this, &m, deleter = current_deleter<memtable_entry>(), &blow_cache] (memtable_entry* entry) {
             with_linearized_managed_bytes([&] () noexcept {
                 try {
                     invalidate_locked(entry->key());
                 } catch (...) {
                     blow_cache = true;
                 }
-                entry->partition().evict(_tracker.memtable_cleaner());
+                m.evict_entry(*entry, _tracker.memtable_cleaner());
                 deleter(entry);
             });
         });
@@ -986,7 +986,7 @@ future<> row_cache::do_update(external_updater eu, memtable& m, Updater updater)
                                 auto i = m.partitions.begin();
                                 memtable_entry& mem_e = *i;
                                 m.partitions.erase(i);
-                                mem_e.partition().evict(_tracker.memtable_cleaner());
+                                m.evict_entry(mem_e, _tracker.memtable_cleaner());
                                 current_allocator().destroy(&mem_e);
                               });
                             });

From 174b101a4922b76b947813c5b997e6ceca8153ee Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov <xemul@scylladb.com>
Date: Fri, 24 Apr 2020 10:03:41 +0300
Subject: [PATCH 09/11] row_cache: Switch partition tree onto B+ rails

The row_cache::partitions_type is replaced from boost::intrusive::set
to bplus::tree<Key = int64_t, T = array_trusted_bounds<cache_entry>>

Where token is used to quickly locate the partition by its token and
the internal array -- to resolve hashing conflicts.

Summary of changes in cache_entry:

- cache_entry::compare goes away, as the new collection needs a
  tri-compare one, which is provided by ring_position_comparator

- when initialized the dummy entry is added with "after_all_keys" kind,
  not "before_all_keys" as it was by default. This is to make tree
  entries sorted by token

- insertion and removal of cache_entries happens inside double_decker,
  most of the changes in row_cache.cc are about passing constructor args
  from current_allocator.construct into double_decker.emplace_before()

- the _flags is extended to keep array head/tail bits. There's room
  for it, sizeof(cache_entry) remains unchanged

The rest fits smoothly into the double_decker API.

Also, as was mentioned in the previous patch, insertion and removal
_may_ invalidate iterators, but may leave them intact. However,
currently this doesn't seem to be a problem as cache_tracker::insert()
and ::on_partition_erase() do invalidate iterators unconditionally.

Later this can be optimized, as iterators are invalidated by double-decker
only in case of hash conflict, otherwise it doesn't change arrays and
B+ tree doesn't invalidate its iterators.

tests: unit(dev), perf(dev)

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
---
 row_cache.cc                       | 111 +++++++++++++----------------
 row_cache.hh                       |  56 ++++-----------
 test/perf/memory_footprint_test.cc |   1 -
 3 files changed, 65 insertions(+), 103 deletions(-)

diff --git a/row_cache.cc b/row_cache.cc
index 83e543fb9a..0c6603f0d2 100644
--- a/row_cache.cc
+++ b/row_cache.cc
@@ -283,13 +283,12 @@ public:
     // Can be called on invalid cursor, in which case it brings it back to validity.
     // Strong exception guarantees.
     bool advance_to(dht::ring_position_view pos) {
-        auto lcmp = cache_entry::compare(_cache.get()._schema);
         dht::ring_position_comparator cmp(*_cache.get()._schema);
         if (cmp(_end_pos, pos) < 0) { // next() may have moved _start_pos past the _end_pos.
             _end_pos = pos;
         }
-        _end = _cache.get()._partitions.lower_bound(_end_pos, lcmp);
-        _it = _cache.get()._partitions.lower_bound(pos, lcmp);
+        _end = _cache.get()._partitions.lower_bound(_end_pos, cmp);
+        _it = _cache.get()._partitions.lower_bound(pos, cmp);
         auto same = cmp(pos, _it->position()) >= 0;
         set_position(*_it);
         _last_reclaim_count = _cache.get().get_cache_tracker().allocator().invalidate_counter();
@@ -376,13 +375,14 @@ private:
                     _cache._read_section(_cache._tracker.region(), [this] {
                         with_allocator(_cache._tracker.allocator(), [this] {
                             dht::decorated_key dk = _read_context->range().start()->value().as_decorated_key();
-                            _cache.do_find_or_create_entry(dk, nullptr, [&] (auto i) {
+                            _cache.do_find_or_create_entry(dk, nullptr, [&] (auto i, const row_cache::partitions_type::bound_hint& hint) {
                                 mutation_partition mp(_cache._schema);
-                                cache_entry* entry = current_allocator().construct<cache_entry>(
+                                bool cont = i->continuous();
+                                row_cache::partitions_type::iterator entry = _cache._partitions.emplace_before(i, dk.token().raw(), hint,
                                     _cache._schema, std::move(dk), std::move(mp));
                                 _cache._tracker.insert(*entry);
-                                entry->set_continuous(i->continuous());
-                                return _cache._partitions.insert_before(i, *entry);
+                                entry->set_continuous(cont);
+                                return entry;
                             }, [&] (auto i) {
                                 _cache._tracker.on_miss_already_populated();
                             });
@@ -497,7 +497,7 @@ private:
             return;
         }
         if (!_reader.range().end() || !_reader.range().end()->is_inclusive()) {
-            cache_entry::compare cmp(_cache._schema);
+            dht::ring_position_comparator cmp(*_cache._schema);
             auto it = _reader.range().end() ? _cache._partitions.find(_reader.range().end()->value(), cmp)
                                            : std::prev(_cache._partitions.end());
             if (it != _cache._partitions.end()) {
@@ -755,10 +755,9 @@ row_cache::make_reader(schema_ptr s,
     if (!ctx->is_range_query() && !fwd_mr) {
         auto mr = _read_section(_tracker.region(), [&] {
             return with_linearized_managed_bytes([&] {
-                cache_entry::compare lcmp(_schema);
                 dht::ring_position_comparator cmp(*_schema);
                 auto&& pos = ctx->range().start()->value();
-                auto i = _partitions.lower_bound(pos, lcmp);
+                auto i = _partitions.lower_bound(pos, cmp);
                 if (i != _partitions.end() && cmp(pos, i->position()) >= 0) {
                     cache_entry& e = *i;
                     upgrade_entry(e);
@@ -791,22 +790,20 @@ row_cache::make_reader(schema_ptr s,
 
 row_cache::~row_cache() {
     with_allocator(_tracker.allocator(), [this] {
-        _partitions.clear_and_dispose([this, deleter = current_deleter<cache_entry>()] (auto&& p) mutable {
+        _partitions.clear_and_dispose([this] (cache_entry* p) mutable noexcept {
             if (!p->is_dummy_entry()) {
                 _tracker.on_partition_erase();
             }
             p->evict(_tracker);
-            deleter(p);
         });
     });
 }
 
 void row_cache::clear_now() noexcept {
     with_allocator(_tracker.allocator(), [this] {
-        auto it = _partitions.erase_and_dispose(_partitions.begin(), partitions_end(), [this, deleter = current_deleter<cache_entry>()] (auto&& p) mutable {
+        auto it = _partitions.erase_and_dispose(_partitions.begin(), partitions_end(), [this] (cache_entry* p) noexcept {
             _tracker.on_partition_erase();
             p->evict(_tracker);
-            deleter(p);
         });
         _tracker.clear_continuity(*it);
     });
@@ -822,9 +819,11 @@ cache_entry& row_cache::do_find_or_create_entry(const dht::decorated_key& key,
 {
     return with_allocator(_tracker.allocator(), [&] () -> cache_entry& {
             return with_linearized_managed_bytes([&] () -> cache_entry& {
-                auto i = _partitions.lower_bound(key, cache_entry::compare(_schema));
-                if (i == _partitions.end() || !i->key().equal(*_schema, key)) {
-                    i = create_entry(i);
+                partitions_type::bound_hint hint;
+                dht::ring_position_comparator cmp(*_schema);
+                auto i = _partitions.lower_bound(key, cmp, hint);
+                if (i == _partitions.end() || !hint.match) {
+                    i = create_entry(i, hint);
                 } else {
                     visit_entry(i);
                 }
@@ -847,10 +846,11 @@ cache_entry& row_cache::do_find_or_create_entry(const dht::decorated_key& key,
 }
 
 cache_entry& row_cache::find_or_create(const dht::decorated_key& key, tombstone t, row_cache::phase_type phase, const previous_entry_pointer* previous) {
-    return do_find_or_create_entry(key, previous, [&] (auto i) { // create
-        auto entry = current_allocator().construct<cache_entry>(cache_entry::incomplete_tag{}, _schema, key, t);
+    return do_find_or_create_entry(key, previous, [&] (auto i, const partitions_type::bound_hint& hint) { // create
+        partitions_type::iterator entry = _partitions.emplace_before(i, key.token().raw(), hint,
+                cache_entry::incomplete_tag{}, _schema, key, t);
         _tracker.insert(*entry);
-        return _partitions.insert_before(i, *entry);
+        return entry;
     }, [&] (auto i) { // visit
         _tracker.on_miss_already_populated();
         cache_entry& e = *i;
@@ -861,14 +861,13 @@ cache_entry& row_cache::find_or_create(const dht::decorated_key& key, tombstone
 
 void row_cache::populate(const mutation& m, const previous_entry_pointer* previous) {
   _populate_section(_tracker.region(), [&] {
-    do_find_or_create_entry(m.decorated_key(), previous, [&] (auto i) {
-        cache_entry* entry = current_allocator().construct<cache_entry>(
+    do_find_or_create_entry(m.decorated_key(), previous, [&] (auto i, const partitions_type::bound_hint& hint) {
+        partitions_type::iterator entry = _partitions.emplace_before(i, m.decorated_key().token().raw(), hint,
                 m.schema(), m.decorated_key(), m.partition());
         _tracker.insert(*entry);
         entry->set_continuous(i->continuous());
-        i = _partitions.insert_before(i, *entry);
-        upgrade_entry(*i);
-        return i;
+        upgrade_entry(*entry);
+        return entry;
     }, [&] (auto i) {
         throw std::runtime_error(format("cache already contains entry for {}", m.key()));
     });
@@ -900,7 +899,7 @@ void row_cache::invalidate_sync(memtable& m) noexcept {
         bool blow_cache = false;
         // Note: clear_and_dispose() ought not to look up any keys, so it doesn't require
         // with_linearized_managed_bytes(), but invalidate() does.
-        m.partitions.clear_and_dispose([this, &m, deleter = current_deleter<memtable_entry>(), &blow_cache] (memtable_entry* entry) {
+        m.partitions.clear_and_dispose([this, &m, deleter = current_deleter<memtable_entry>(), &blow_cache] (memtable_entry* entry) noexcept {
             with_linearized_managed_bytes([&] () noexcept {
                 try {
                     invalidate_locked(entry->key());
@@ -952,7 +951,7 @@ future<> row_cache::do_update(external_updater eu, memtable& m, Updater updater)
         partition_presence_checker is_present = _prev_snapshot->make_partition_presence_checker();
         while (!m.partitions.empty()) {
             with_allocator(_tracker.allocator(), [&] () {
-                auto cmp = cache_entry::compare(_schema);
+                auto cmp = dht::ring_position_comparator(*_schema);
                 {
                     size_t partition_count = 0;
                     {
@@ -968,8 +967,9 @@ future<> row_cache::do_update(external_updater eu, memtable& m, Updater updater)
                                   with_linearized_managed_bytes([&] {
                                     memtable_entry& mem_e = *m.partitions.begin();
                                     size_entry = mem_e.size_in_allocator_without_rows(_tracker.allocator());
-                                    auto cache_i = _partitions.lower_bound(mem_e.key(), cmp);
-                                    update = updater(_update_section, cache_i, mem_e, is_present, real_dirty_acc);
+                                    partitions_type::bound_hint hint;
+                                    auto cache_i = _partitions.lower_bound(mem_e.key(), cmp, hint);
+                                    update = updater(_update_section, cache_i, mem_e, is_present, real_dirty_acc, hint);
                                   });
                                 });
                             }
@@ -1017,11 +1017,11 @@ future<> row_cache::do_update(external_updater eu, memtable& m, Updater updater)
 future<> row_cache::update(external_updater eu, memtable& m) {
     return do_update(std::move(eu), m, [this] (logalloc::allocating_section& alloc,
             row_cache::partitions_type::iterator cache_i, memtable_entry& mem_e, partition_presence_checker& is_present,
-            real_dirty_memory_accounter& acc) mutable {
+            real_dirty_memory_accounter& acc, const partitions_type::bound_hint& hint) mutable {
         // If cache doesn't contain the entry we cannot insert it because the mutation may be incomplete.
         // FIXME: keep a bitmap indicating which sstables we do cover, so we don't have to
         //        search it.
-        if (cache_i != partitions_end() && cache_i->key().equal(*_schema, mem_e.key())) {
+        if (cache_i != partitions_end() && hint.match) {
             cache_entry& entry = *cache_i;
             upgrade_entry(entry);
             assert(entry._schema == _schema);
@@ -1033,12 +1033,11 @@ future<> row_cache::update(external_updater eu, memtable& m) {
                    || with_allocator(standard_allocator(), [&] { return is_present(mem_e.key()); })
                       == partition_presence_checker_result::definitely_doesnt_exist) {
             // Partition is absent in underlying. First, insert a neutral partition entry.
-            cache_entry* entry = current_allocator().construct<cache_entry>(cache_entry::evictable_tag(),
-                _schema, dht::decorated_key(mem_e.key()),
+            partitions_type::iterator entry = _partitions.emplace_before(cache_i, mem_e.key().token().raw(), hint,
+                cache_entry::evictable_tag(), _schema, dht::decorated_key(mem_e.key()),
                 partition_entry::make_evictable(*_schema, mutation_partition(_schema)));
             entry->set_continuous(cache_i->continuous());
             _tracker.insert(*entry);
-            _partitions.insert_before(cache_i, *entry);
             mem_e.upgrade_schema(_schema, _tracker.memtable_cleaner());
             return entry->partition().apply_to_incomplete(*_schema, std::move(mem_e.partition()), _tracker.memtable_cleaner(),
                 alloc, _tracker.region(), _tracker, _underlying_phase, acc);
@@ -1051,7 +1050,7 @@ future<> row_cache::update(external_updater eu, memtable& m) {
 future<> row_cache::update_invalidating(external_updater eu, memtable& m) {
     return do_update(std::move(eu), m, [this] (logalloc::allocating_section& alloc,
         row_cache::partitions_type::iterator cache_i, memtable_entry& mem_e, partition_presence_checker& is_present,
-        real_dirty_memory_accounter& acc)
+        real_dirty_memory_accounter& acc, const partitions_type::bound_hint&)
     {
         if (cache_i != partitions_end() && cache_i->key().equal(*_schema, mem_e.key())) {
             // FIXME: Invalidate only affected row ranges.
@@ -1074,7 +1073,7 @@ void row_cache::refresh_snapshot() {
 void row_cache::touch(const dht::decorated_key& dk) {
  _read_section(_tracker.region(), [&] {
   with_linearized_managed_bytes([&] {
-    auto i = _partitions.find(dk, cache_entry::compare(_schema));
+    auto i = _partitions.find(dk, dht::ring_position_comparator(*_schema));
     if (i != _partitions.end()) {
         for (partition_version& pv : i->partition().versions_from_oldest()) {
             for (rows_entry& row : pv.partition().clustered_rows()) {
@@ -1089,7 +1088,7 @@ void row_cache::touch(const dht::decorated_key& dk) {
 void row_cache::unlink_from_lru(const dht::decorated_key& dk) {
     _read_section(_tracker.region(), [&] {
         with_linearized_managed_bytes([&] {
-            auto i = _partitions.find(dk, cache_entry::compare(_schema));
+            auto i = _partitions.find(dk, dht::ring_position_comparator(*_schema));
             if (i != _partitions.end()) {
                 for (partition_version& pv : i->partition().versions_from_oldest()) {
                     for (rows_entry& row : pv.partition().clustered_rows()) {
@@ -1102,15 +1101,14 @@ void row_cache::unlink_from_lru(const dht::decorated_key& dk) {
 }
 
 void row_cache::invalidate_locked(const dht::decorated_key& dk) {
-    auto pos = _partitions.lower_bound(dk, cache_entry::compare(_schema));
+    auto pos = _partitions.lower_bound(dk, dht::ring_position_comparator(*_schema));
     if (pos == partitions_end() || !pos->key().equal(*_schema, dk)) {
         _tracker.clear_continuity(*pos);
     } else {
-        auto it = _partitions.erase_and_dispose(pos,
-            [this, &dk, deleter = current_deleter<cache_entry>()](auto&& p) mutable {
+        auto it = pos.erase_and_dispose(dht::raw_token_less_comparator{},
+            [this](cache_entry* p) mutable noexcept {
                 _tracker.on_partition_erase();
                 p->evict(_tracker);
-                deleter(p);
             });
         _tracker.clear_continuity(*it);
     }
@@ -1140,17 +1138,16 @@ future<> row_cache::invalidate(external_updater eu, dht::partition_range_vector&
                 while (true) {
                     auto done = _update_section(_tracker.region(), [&] {
                         return with_linearized_managed_bytes([&] {
-                            auto cmp = cache_entry::compare(_schema);
+                            auto cmp = dht::ring_position_comparator(*_schema);
                             auto it = _partitions.lower_bound(*_prev_snapshot_pos, cmp);
                             auto end = _partitions.lower_bound(dht::ring_position_view::for_range_end(range), cmp);
                             return with_allocator(_tracker.allocator(), [&] {
-                                auto deleter = current_deleter<cache_entry>();
                                 while (it != end) {
-                                    it = _partitions.erase_and_dispose(it, [&] (cache_entry* p) mutable {
-                                        _tracker.on_partition_erase();
-                                        p->evict(_tracker);
-                                        deleter(p);
-                                    });
+                                    it = it.erase_and_dispose(dht::raw_token_less_comparator{},
+                                        [&] (cache_entry* p) mutable noexcept {
+                                            _tracker.on_partition_erase();
+                                            p->evict(_tracker);
+                                        });
                                     // it != end is necessary for correctness. We cannot set _prev_snapshot_pos to end->position()
                                     // because after resuming something may be inserted before "end" which falls into the next range.
                                     if (need_preempt() && it != end) {
@@ -1187,14 +1184,14 @@ void row_cache::evict() {
 row_cache::row_cache(schema_ptr s, snapshot_source src, cache_tracker& tracker, is_continuous cont)
     : _tracker(tracker)
     , _schema(std::move(s))
-    , _partitions(cache_entry::compare(_schema))
+    , _partitions(dht::raw_token_less_comparator{})
     , _underlying(src())
     , _snapshot_source(std::move(src))
 {
     with_allocator(_tracker.allocator(), [this, cont] {
-        cache_entry* entry = current_allocator().construct<cache_entry>(cache_entry::dummy_entry_tag());
-        _partitions.insert_before(_partitions.end(), *entry);
-        entry->set_continuous(bool(cont));
+        cache_entry entry(cache_entry::dummy_entry_tag{});
+        entry.set_continuous(bool(cont));
+        _partitions.insert(entry.position().token().raw(), std::move(entry), dht::ring_position_comparator{*_schema});
     });
 }
 
@@ -1203,13 +1200,7 @@ cache_entry::cache_entry(cache_entry&& o) noexcept
     , _key(std::move(o._key))
     , _pe(std::move(o._pe))
     , _flags(o._flags)
-    , _cache_link()
 {
-    {
-        using container_type = row_cache::partitions_type;
-        container_type::node_algorithms::replace_node(o._cache_link.this_ptr(), _cache_link.this_ptr());
-        container_type::node_algorithms::init(o._cache_link.this_ptr());
-    }
 }
 
 cache_entry::~cache_entry() {
@@ -1224,11 +1215,11 @@ void row_cache::set_schema(schema_ptr new_schema) noexcept {
 }
 
 void cache_entry::on_evicted(cache_tracker& tracker) noexcept {
-    auto it = row_cache::partitions_type::s_iterator_to(*this);
+    row_cache::partitions_type::iterator it(this);
     std::next(it)->set_continuous(false);
     evict(tracker);
-    current_deleter<cache_entry>()(this);
     tracker.on_partition_eviction();
+    it.erase(dht::raw_token_less_comparator{});
 }
 
 void rows_entry::on_evicted(cache_tracker& tracker) noexcept {
diff --git a/row_cache.hh b/row_cache.hh
index 41275b3527..d7502c0985 100644
--- a/row_cache.hh
+++ b/row_cache.hh
@@ -40,6 +40,7 @@
 #include <seastar/core/metrics_registration.hh>
 #include "flat_mutation_reader.hh"
 #include "mutation_cleaner.hh"
+#include "utils/double-decker.hh"
 
 namespace bi = boost::intrusive;
 
@@ -61,11 +62,6 @@ class lsa_manager;
 //
 // TODO: Make memtables use this format too.
 class cache_entry {
-    // We need auto_unlink<> option on the _cache_link because when entry is
-    // evicted from cache via LRU we don't have a reference to the container
-    // and don't want to store it with each entry.
-    using cache_link_type = bi::set_member_hook<bi::link_mode<bi::auto_unlink>>;
-
     schema_ptr _schema;
     dht::decorated_key _key;
     partition_entry _pe;
@@ -73,8 +69,10 @@ class cache_entry {
     struct {
         bool _continuous : 1;
         bool _dummy_entry : 1;
+        bool _head : 1;
+        bool _tail : 1;
+        bool _train : 1;
     } _flags{};
-    cache_link_type _cache_link;
     friend class size_calculator;
 
     flat_mutation_reader do_read(row_cache&, cache::read_context& reader);
@@ -82,6 +80,13 @@ public:
     friend class row_cache;
     friend class cache_tracker;
 
+    bool is_head() const noexcept { return _flags._head; }
+    void set_head(bool v) noexcept { _flags._head = v; }
+    bool is_tail() const noexcept { return _flags._tail; }
+    void set_tail(bool v) noexcept { _flags._tail = v; }
+    bool with_train() const noexcept { return _flags._train; }
+    void set_train(bool v) noexcept { _flags._train = v; }
+
     struct dummy_entry_tag{};
     struct incomplete_tag{};
     struct evictable_tag{};
@@ -151,38 +156,6 @@ public:
 
     bool is_dummy_entry() const noexcept { return _flags._dummy_entry; }
 
-    struct compare {
-        dht::ring_position_less_comparator _c;
-
-        compare(schema_ptr s)
-            : _c(*s)
-        {}
-
-        bool operator()(const dht::decorated_key& k1, const cache_entry& k2) const {
-            return _c(k1, k2.position());
-        }
-
-        bool operator()(dht::ring_position_view k1, const cache_entry& k2) const {
-            return _c(k1, k2.position());
-        }
-
-        bool operator()(const cache_entry& k1, const cache_entry& k2) const {
-            return _c(k1.position(), k2.position());
-        }
-
-        bool operator()(const cache_entry& k1, const dht::decorated_key& k2) const {
-            return _c(k1.position(), k2);
-        }
-
-        bool operator()(const cache_entry& k1, dht::ring_position_view k2) const {
-            return _c(k1.position(), k2);
-        }
-
-        bool operator()(dht::ring_position_view k1, dht::ring_position_view k2) const {
-            return _c(k1, k2);
-        }
-    };
-
     friend std::ostream& operator<<(std::ostream&, cache_entry&);
 };
 
@@ -318,10 +291,9 @@ void cache_tracker::insert(partition_entry& pe) noexcept {
 class row_cache final {
 public:
     using phase_type = utils::phased_barrier::phase_type;
-    using partitions_type = bi::set<cache_entry,
-        bi::member_hook<cache_entry, cache_entry::cache_link_type, &cache_entry::_cache_link>,
-        bi::constant_time_size<false>, // we need this to have bi::auto_unlink on hooks
-        bi::compare<cache_entry::compare>>;
+    using partitions_type = double_decker<int64_t, cache_entry,
+                            dht::raw_token_less_comparator, dht::ring_position_comparator,
+                            16, bplus::key_search::linear>;
     friend class cache::autoupdating_underlying_reader;
     friend class single_partition_populating_reader;
     friend class cache_entry;
diff --git a/test/perf/memory_footprint_test.cc b/test/perf/memory_footprint_test.cc
index fcfa55405a..9f04fd84d2 100644
--- a/test/perf/memory_footprint_test.cc
+++ b/test/perf/memory_footprint_test.cc
@@ -61,7 +61,6 @@ public:
         {
             nest n;
             std::cout << prefix() << "sizeof(decorated_key) = " << sizeof(dht::decorated_key) << "\n";
-            std::cout << prefix() << "sizeof(cache_link_type) = " << sizeof(cache_entry::cache_link_type) << "\n";
             print_mutation_partition_size();
         }
 

From 4d2f5f93a428c192e23dfdfcdd504f47fb897ca8 Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov <xemul@scylladb.com>
Date: Wed, 13 May 2020 09:49:06 +0300
Subject: [PATCH 10/11] memtable: Switch onto B+ rails

The change is the same as with row-cache -- use a B+ tree with the
int64_t token as key and an array of memtable_entry-s inside it.

The changes are:

Similar to those for row_cache:

- compare() goes away, new collection uses ring_position_comparator

- insertion and removal happen with the help of double_decker; most
  of the touched places just adapt to its slightly different semantics

- flags are added to memtable_entry, this makes its size larger than
  it could be, but still smaller than it was before

Memtable-specific:

- when a new entry is inserted into the tree, iterators _might_ get
  invalidated by the double-decker inner array. It is easy to check
  whether that happened, so the invalidation is avoided when possible

- size_in_allocator_without_rows() is now not very precise. This
  is because after the patch memtable_entries are no longer allocated
  individually as they used to be. They can be squashed together with
  those having a token conflict, so asking the allocator for the
  occupied memory slot is not possible. As the closest (lower)
  estimate, the size of the enclosing B+ data node is used

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
---
 memtable.cc            | 54 ++++++++++++++++++++++++------------------
 memtable.hh            | 53 +++++++++++++++--------------------------
 row_cache.cc           | 10 ++++----
 row_cache.hh           |  1 -
 utils/double-decker.hh |  9 +++++++
 5 files changed, 63 insertions(+), 64 deletions(-)

diff --git a/memtable.cc b/memtable.cc
index 4dab06b9d2..95472cb9f4 100644
--- a/memtable.cc
+++ b/memtable.cc
@@ -117,7 +117,7 @@ memtable::memtable(schema_ptr schema, dirty_memory_manager& dmm, table_stats& ta
         , _cleaner(*this, no_cache_tracker, table_stats.memtable_app_stats, compaction_scheduling_group)
         , _memtable_list(memtable_list)
         , _schema(std::move(schema))
-        , partitions(memtable_entry::compare(_schema))
+        , partitions(dht::raw_token_less_comparator{})
         , _table_stats(table_stats) {
 }
 
@@ -145,9 +145,8 @@ void memtable::evict_entry(memtable_entry& e, mutation_cleaner& cleaner) noexcep
 void memtable::clear() noexcept {
     auto dirty_before = dirty_size();
     with_allocator(allocator(), [this] {
-        partitions.clear_and_dispose([this] (memtable_entry* e) {
+        partitions.clear_and_dispose([this] (memtable_entry* e) noexcept {
             evict_entry(*e, _cleaner);
-            current_deleter<memtable_entry>()(e);
         });
     });
     remove_flushed_memory(dirty_before - dirty_size());
@@ -167,9 +166,7 @@ future<> memtable::clear_gently() noexcept {
                         if (p.begin()->clear_gently() == stop_iteration::no) {
                             break;
                         }
-                        p.erase_and_dispose(p.begin(), [&] (auto e) {
-                            alloc.destroy(e);
-                        });
+                        p.begin().erase(dht::raw_token_less_comparator{});
                         if (need_preempt()) {
                             break;
                         }
@@ -178,6 +175,13 @@ future<> memtable::clear_gently() noexcept {
                 remove_flushed_memory(dirty_before - dirty_size());
                 seastar::thread::yield();
             }
+
+            /*
+             * The collection is not guaranteed to free everything
+             * with the last erase. If anything gets freed in destructor,
+             * it will be unaccounted from wrong allocator, so handle it
+             */
+            with_allocator(alloc, [&p] { p.clear(); });
         });
         auto f = t->join();
         return f.then([t = std::move(t)] {});
@@ -211,13 +215,17 @@ memtable::find_or_create_partition(const dht::decorated_key& key) {
     assert(!reclaiming_enabled());
 
     // call lower_bound so we have a hint for the insert, just in case.
-    auto i = partitions.lower_bound(key, memtable_entry::compare(_schema));
-    if (i == partitions.end() || !key.equal(*_schema, i->key())) {
-        memtable_entry* entry = current_allocator().construct<memtable_entry>(
-            _schema, dht::decorated_key(key), mutation_partition(_schema));
-        partitions.insert_before(i, *entry);
+    partitions_type::bound_hint hint;
+    auto i = partitions.lower_bound(key, dht::ring_position_comparator(*_schema), hint);
+    if (i == partitions.end() || !hint.match) {
+        partitions_type::iterator entry = partitions.emplace_before(i,
+                key.token().raw(), hint,
+                _schema, dht::decorated_key(key), mutation_partition(_schema));
         ++nr_partitions;
         ++_table_stats.memtable_partition_insertions;
+        if (!hint.emplace_keeps_iterators()) {
+            current_allocator().invalidate_references();
+        }
         return entry->partition();
     } else {
         ++_table_stats.memtable_partition_hits;
@@ -230,14 +238,14 @@ boost::iterator_range<memtable::partitions_type::const_iterator>
 memtable::slice(const dht::partition_range& range) const {
     if (query::is_single_partition(range)) {
         const query::ring_position& pos = range.start()->value();
-        auto i = partitions.find(pos, memtable_entry::compare(_schema));
+        auto i = partitions.find(pos, dht::ring_position_comparator(*_schema));
         if (i != partitions.end()) {
             return boost::make_iterator_range(i, std::next(i));
         } else {
             return boost::make_iterator_range(i, i);
         }
     } else {
-        auto cmp = memtable_entry::compare(_schema);
+        auto cmp = dht::ring_position_comparator(*_schema);
 
         auto i1 = range.start()
                   ? (range.start()->is_inclusive()
@@ -266,7 +274,7 @@ class iterator_reader {
     size_t _last_partition_count = 0;
 
     memtable::partitions_type::iterator lookup_end() {
-        auto cmp = memtable_entry::compare(_memtable->_schema);
+        auto cmp = dht::ring_position_comparator(*_memtable->_schema);
         return _range->end()
             ? (_range->end()->is_inclusive()
                 ? _memtable->partitions.upper_bound(_range->end()->value(), cmp)
@@ -276,7 +284,7 @@ class iterator_reader {
     void update_iterators() {
         // We must be prepared that iterators may get invalidated during compaction.
         auto current_reclaim_counter = _memtable->reclaim_counter();
-        auto cmp = memtable_entry::compare(_memtable->_schema);
+        auto cmp = dht::ring_position_comparator(*_memtable->_schema);
         if (_last) {
             if (current_reclaim_counter != _last_reclaim_counter ||
                   _last_partition_count != _memtable->partition_count()) {
@@ -659,7 +667,7 @@ memtable::make_flat_reader(schema_ptr s,
         const query::ring_position& pos = range.start()->value();
         auto snp = _read_section(*this, [&] () -> partition_snapshot_ptr {
             managed_bytes::linearization_context_guard lcg;
-            auto i = partitions.find(pos, memtable_entry::compare(_schema));
+            auto i = partitions.find(pos, dht::ring_position_comparator(*_schema));
             if (i != partitions.end()) {
                 upgrade_entry(*i);
                 return i->snapshot(*this);
@@ -767,15 +775,11 @@ mutation_source memtable::as_data_source() {
 }
 
 memtable_entry::memtable_entry(memtable_entry&& o) noexcept
-    : _link()
-    , _schema(std::move(o._schema))
+    : _schema(std::move(o._schema))
     , _key(std::move(o._key))
     , _pe(std::move(o._pe))
-{
-    using container_type = memtable::partitions_type;
-    container_type::node_algorithms::replace_node(o._link.this_ptr(), _link.this_ptr());
-    container_type::node_algorithms::init(o._link.this_ptr());
-}
+    , _flags(o._flags)
+{ }
 
 stop_iteration memtable_entry::clear_gently() noexcept {
     return _pe.clear_gently(no_cache_tracker);
@@ -811,6 +815,10 @@ void memtable::set_schema(schema_ptr new_schema) noexcept {
     _schema = std::move(new_schema);
 }
 
+size_t memtable_entry::object_memory_size(allocation_strategy& allocator) {
+    return memtable::partitions_type::estimated_object_memory_size_in_allocator(allocator, this);
+}
+
 std::ostream& operator<<(std::ostream& out, memtable& mt) {
     logalloc::reclaim_lock rl(mt);
     return out << "{memtable: [" << ::join(",\n", mt.partitions) << "]}";
diff --git a/memtable.hh b/memtable.hh
index 7714d48fbf..5a810c9c66 100644
--- a/memtable.hh
+++ b/memtable.hh
@@ -32,11 +32,11 @@
 #include "db/commitlog/replay_position.hh"
 #include "db/commitlog/rp_set.hh"
 #include "utils/extremum_tracking.hh"
-#include "utils/logalloc.hh"
 #include "partition_version.hh"
 #include "flat_mutation_reader.hh"
 #include "mutation_cleaner.hh"
 #include "sstables/types.hh"
+#include "utils/double-decker.hh"
 
 class frozen_mutation;
 
@@ -44,11 +44,22 @@ class frozen_mutation;
 namespace bi = boost::intrusive;
 
 class memtable_entry {
-    bi::set_member_hook<> _link;
     schema_ptr _schema;
     dht::decorated_key _key;
     partition_entry _pe;
+    struct {
+        bool _head : 1;
+        bool _tail : 1;
+        bool _train : 1;
+    } _flags{};
 public:
+    bool is_head() const noexcept { return _flags._head; }
+    void set_head(bool v) noexcept { _flags._head = v; }
+    bool is_tail() const noexcept { return _flags._tail; }
+    void set_tail(bool v) noexcept { _flags._tail = v; }
+    bool with_train() const noexcept { return _flags._train; }
+    void set_train(bool v) noexcept { _flags._train = v; }
+
     friend class memtable;
 
     memtable_entry(schema_ptr s, dht::decorated_key key, mutation_partition p)
@@ -77,8 +88,10 @@ public:
         return _key.key().external_memory_usage();
     }
 
+    size_t object_memory_size(allocation_strategy& allocator);
+
     size_t size_in_allocator_without_rows(allocation_strategy& allocator) {
-        return allocator.object_memory_size_in_allocator(this) + external_memory_usage_without_rows();
+        return object_memory_size(allocator) + external_memory_usage_without_rows();
     }
 
     size_t size_in_allocator(allocation_strategy& allocator) {
@@ -89,34 +102,6 @@ public:
         return size;
     }
 
-    struct compare {
-        dht::decorated_key::less_comparator _c;
-
-        compare(schema_ptr s)
-            : _c(std::move(s))
-        {}
-
-        bool operator()(const dht::decorated_key& k1, const memtable_entry& k2) const {
-            return _c(k1, k2._key);
-        }
-
-        bool operator()(const memtable_entry& k1, const memtable_entry& k2) const {
-            return _c(k1._key, k2._key);
-        }
-
-        bool operator()(const memtable_entry& k1, const dht::decorated_key& k2) const {
-            return _c(k1._key, k2);
-        }
-
-        bool operator()(const memtable_entry& k1, const dht::ring_position& k2) const {
-            return _c(k1._key, k2);
-        }
-
-        bool operator()(const dht::ring_position& k1, const memtable_entry& k2) const {
-            return _c(k1, k2._key);
-        }
-    };
-
     friend dht::ring_position_view ring_position_view_to_compare(const memtable_entry& mt) { return mt._key; }
     friend std::ostream& operator<<(std::ostream&, const memtable_entry&);
 };
@@ -127,9 +112,9 @@ struct table_stats;
 // Managed by lw_shared_ptr<>.
 class memtable final : public enable_lw_shared_from_this<memtable>, private logalloc::region {
 public:
-    using partitions_type = bi::set<memtable_entry,
-        bi::member_hook<memtable_entry, bi::set_member_hook<>, &memtable_entry::_link>,
-        bi::compare<memtable_entry::compare>>;
+    using partitions_type = double_decker<int64_t, memtable_entry,
+                            dht::raw_token_less_comparator, dht::ring_position_comparator,
+                            16, bplus::key_search::linear>;
 private:
     dirty_memory_manager& _dirty_mgr;
     mutation_cleaner _cleaner;
diff --git a/row_cache.cc b/row_cache.cc
index 0c6603f0d2..5f7bc52295 100644
--- a/row_cache.cc
+++ b/row_cache.cc
@@ -899,7 +899,7 @@ void row_cache::invalidate_sync(memtable& m) noexcept {
         bool blow_cache = false;
         // Note: clear_and_dispose() ought not to look up any keys, so it doesn't require
         // with_linearized_managed_bytes(), but invalidate() does.
-        m.partitions.clear_and_dispose([this, &m, deleter = current_deleter<memtable_entry>(), &blow_cache] (memtable_entry* entry) noexcept {
+        m.partitions.clear_and_dispose([this, &m, &blow_cache] (memtable_entry* entry) noexcept {
             with_linearized_managed_bytes([&] () noexcept {
                 try {
                     invalidate_locked(entry->key());
@@ -907,7 +907,6 @@ void row_cache::invalidate_sync(memtable& m) noexcept {
                     blow_cache = true;
                 }
                 m.evict_entry(*entry, _tracker.memtable_cleaner());
-                deleter(entry);
             });
         });
         if (blow_cache) {
@@ -984,10 +983,9 @@ future<> row_cache::do_update(external_updater eu, memtable& m, Updater updater)
                             _update_section(_tracker.region(), [&] {
                               with_linearized_managed_bytes([&] {
                                 auto i = m.partitions.begin();
-                                memtable_entry& mem_e = *i;
-                                m.partitions.erase(i);
-                                m.evict_entry(mem_e, _tracker.memtable_cleaner());
-                                current_allocator().destroy(&mem_e);
+                                i.erase_and_dispose(dht::raw_token_less_comparator{}, [&] (memtable_entry* e) noexcept {
+                                    m.evict_entry(*e, _tracker.memtable_cleaner());
+                                });
                               });
                             });
                             ++partition_count;
diff --git a/row_cache.hh b/row_cache.hh
index d7502c0985..f80aa20579 100644
--- a/row_cache.hh
+++ b/row_cache.hh
@@ -31,7 +31,6 @@
 
 #include "mutation_reader.hh"
 #include "mutation_partition.hh"
-#include "utils/logalloc.hh"
 #include "utils/phased_barrier.hh"
 #include "utils/histogram.hh"
 #include "partition_version.hh"
diff --git a/utils/double-decker.hh b/utils/double-decker.hh
index 5616d44b4d..c907fd4e65 100644
--- a/utils/double-decker.hh
+++ b/utils/double-decker.hh
@@ -400,4 +400,13 @@ public:
     }
 
     bool empty() const noexcept { return _tree.empty(); }
+
+    static size_t estimated_object_memory_size_in_allocator(allocation_strategy& allocator, const T* obj) noexcept {
+        /*
+         * The T-s are merged together in an array, so getting any run-time
+         * value of a pointer would be wrong. So here's some guessing of
+         * how much memory this thing would occupy
+         */
+        return sizeof(typename outer_tree::data);
+    }
 };

From f8ffc31218f33e0d1acafbc15835b36c6c5c86b6 Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov <xemul@scylladb.com>
Date: Mon, 18 May 2020 16:55:49 +0300
Subject: [PATCH 11/11] test: Print more sizes in memory_footprint_test

The row cache memory footprint changed after the switch to B+
because we no longer have a sole cache_entry allocation, but
also the bplus::data and bplus::node ones. Knowing their sizes
helps to analyze the footprint changes.

Also print the size of memtable_entry, which is now stored
in B+'s data as well.

Signed-off-by: Pavel Emelyanov <xemul@scylladb.com>
---
 test/perf/memory_footprint_test.cc | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/test/perf/memory_footprint_test.cc b/test/perf/memory_footprint_test.cc
index 9f04fd84d2..104e7a5cb0 100644
--- a/test/perf/memory_footprint_test.cc
+++ b/test/perf/memory_footprint_test.cc
@@ -57,6 +57,9 @@ class size_calculator {
 public:
     static void print_cache_entry_size() {
         std::cout << prefix() << "sizeof(cache_entry) = " << sizeof(cache_entry) << "\n";
+        std::cout << prefix() << "sizeof(memtable_entry) = " << sizeof(memtable_entry) << "\n";
+        std::cout << prefix() << "sizeof(bptree::node) = " << sizeof(row_cache::partitions_type::outer_tree::node) << "\n";
+        std::cout << prefix() << "sizeof(bptree::data) = " << sizeof(row_cache::partitions_type::outer_tree::data) << "\n";
 
         {
             nest n;