sstables: Cache partition index pages in LSA and link to LRU

As part of this change, the container for partition index pages was
changed from utils::loading_shared_values to intrusive_btree. This
avoids the reactor stalls which the former induces when holding a large
number of elements (pages), because it uses a hashtable under the hood,
which reallocates its contiguous storage as it grows.
This commit is contained in:
Tomasz Grabiec
2021-04-15 15:38:09 +02:00
parent b3728f7d9b
commit 9f957f1cf9
8 changed files with 559 additions and 56 deletions

View File

@@ -476,6 +476,7 @@ scylla_tests = set([
'test/boost/sstable_3_x_test',
'test/boost/sstable_datafile_test',
'test/boost/sstable_mutation_test',
'test/boost/sstable_partition_index_cache_test',
'test/boost/schema_changes_test',
'test/boost/sstable_conforms_to_mutation_source_test',
'test/boost/sstable_resharding_test',

View File

@@ -1369,11 +1369,11 @@ class scylla_active_sstables(gdb.Command):
def invoke(self, arg, from_tty):
try:
sizeof_index_entry = int(gdb.parse_and_eval('sizeof(sstables::index_entry)'))
sizeof_entry = int(gdb.parse_and_eval('sizeof(sstables::shared_index_lists::entry)'))
sizeof_entry = int(gdb.parse_and_eval('sizeof(sstables::partition_index_cache::entry)'))
def count_index_lists(sst):
index_lists_size = 0
for key, entry in unordered_map(sst['_index_lists']['_lists']):
for key, entry in intrusive_btree(sst['_index_lists']['_entries']):
index_entries = std_vector(entry['list'])
index_lists_size += sizeof_entry
for e in index_entries:

View File

@@ -285,6 +285,10 @@ public:
const managed_ref<promoted_index>& get_promoted_index() const { return _index; }
managed_ref<promoted_index>& get_promoted_index() { return _index; }
uint32_t get_promoted_index_size() const { return _index ? _index->get_promoted_index_size() : 0; }
// External memory owned by this entry: the key's external storage plus
// the promoted index's external storage.
size_t external_memory_usage() const {
    auto key_mem = _key.external_memory_usage();
    auto index_mem = _index.external_memory_usage();
    return key_mem + index_mem;
}
};
// A partition index page.
@@ -301,6 +305,14 @@ public:
bool empty() const { return _entries.empty(); }
size_t size() const { return _entries.size(); }
// External memory owned by the page: the entry vector's external storage
// plus, per entry, the entry object itself and its external allocations.
size_t external_memory_usage() const {
    size_t total = _entries.external_memory_usage();
    for (auto&& ent : _entries) {
        total += sizeof(index_entry) + ent->external_memory_usage();
    }
    return total;
}
};
using index_list = partition_index_page;

View File

@@ -0,0 +1,293 @@
/*
* Copyright (C) 2021 ScyllaDB
*/
/*
* This file is part of Scylla.
*
* Scylla is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Scylla is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
*/
#pragma once
#include "index_entry.hh"
#include <vector>
#include <seastar/core/future.hh>
#include <seastar/core/loop.hh>
#include <seastar/core/coroutine.hh>
#include "utils/loading_shared_values.hh"
#include "utils/chunked_vector.hh"
#include "utils/bptree.hh"
#include "utils/lru.hh"
#include "utils/lsa/weak_ptr.hh"
namespace sstables {
// Associative cache of summary index -> partition_index_page
// Entries stay around as long as there is any live external reference (entry_ptr) to them.
// Supports asynchronous insertion, ensures that only one entry will be loaded.
// Entries without a live entry_ptr are linked in the LRU.
// The instance must be destroyed only after all live_ptr:s are gone.
class partition_index_cache {
public:
    // Cache key — per the class-level comment, a summary index position.
    using key_type = uint64_t;
private:
    // Allocated inside LSA
    class entry : public evictable, public lsa::weakly_referencable<entry> {
    public:
        partition_index_cache* _parent;
        key_type _key;
        // Loading state: a shared_promise<> which concurrent readers wait on.
        // Ready state: the loaded partition_index_page.
        std::variant<shared_promise<>, partition_index_page> _page;
        // Memoized footprint, set by set_page(); see size_in_allocator().
        size_t _size_in_allocator = 0;
    public:
        entry(partition_index_cache* parent, key_type key)
            : _parent(parent)
            , _key(key)
        { }

        // Installs the loaded page, switching the entry to the ready state,
        // and records the entry's memory footprint for stats accounting.
        void set_page(partition_index_page&& page) noexcept {
            with_allocator(_parent->_region.allocator(), [&] {
                _size_in_allocator = sizeof(entry) + page.external_memory_usage();
            });
            _page = std::move(page);
        }

        entry(entry&&) noexcept = default;

        ~entry() {
            // An entry must not be destroyed while an entry_ptr still
            // weakly references it.
            assert(!is_referenced());
        }

        void on_evicted() noexcept override;

        // Returns the amount of memory owned by this entry.
        // Always returns the same value for a given state of _page.
        size_t size_in_allocator() const { return _size_in_allocator; }

        // Valid only while the page is still loading (!ready()).
        shared_promise<>& promise() { return std::get<shared_promise<>>(_page); }

        // True once the page has been loaded; page() is then valid.
        bool ready() const { return std::holds_alternative<partition_index_page>(_page); }

        partition_index_page& page() { return std::get<partition_index_page>(_page); }
        const partition_index_page& page() const { return std::get<partition_index_page>(_page); }
        key_type key() const { return _key; }
    };
public:
    // Per-shard statistics, shared by all partition_index_cache instances.
    static thread_local struct stats {
        uint64_t hits = 0; // Number of times entry was found ready
        uint64_t misses = 0; // Number of times entry was not found
        uint64_t blocks = 0; // Number of times entry was not ready (>= misses)
        uint64_t evictions = 0; // Number of times entry was evicted
        uint64_t populations = 0; // Number of times entry was inserted
        uint64_t used_bytes = 0; // Number of bytes entries occupy in memory
    } _shard_stats;

    struct key_less_comparator {
        bool operator()(key_type lhs, key_type rhs) const noexcept {
            return lhs < rhs;
        }
    };

    // A shared pointer to cached partition_index_page.
    //
    // Prevents page from being evicted.
    // Never invalidated.
    // Can be accessed and destroyed in the standard allocator context.
    //
    // The partition_index_page reference obtained by dereferencing this pointer
    // is invalidated when the owning LSA region invalidates references.
    class entry_ptr {
        // *_ref is kept alive by the means of unlinking from LRU.
        lsa::weak_ptr<entry> _ref;
    private:
        friend class partition_index_cache;
        entry& get_entry() { return *_ref; }
    public:
        using element_type = partition_index_page;

        entry_ptr() = default;

        explicit entry_ptr(lsa::weak_ptr<entry> ref)
            : _ref(std::move(ref))
        {
            // Referenced entries are taken off the LRU so they cannot be evicted.
            _ref->unlink_from_lru();
        }

        ~entry_ptr() { *this = nullptr; }

        entry_ptr(entry_ptr&&) noexcept = default;
        entry_ptr(const entry_ptr&) noexcept = default;

        // Drops the reference. If this was the last entry_ptr to the entry,
        // the entry is linked back into the LRU and becomes evictable again.
        entry_ptr& operator=(std::nullptr_t) noexcept {
            if (_ref) {
                if (_ref.unique()) {
                    _ref->_parent->_lru.add(*_ref);
                }
                _ref = nullptr;
            }
            return *this;
        }

        entry_ptr& operator=(entry_ptr&& o) noexcept {
            if (this != &o) {
                *this = nullptr; // release the current reference first
                _ref = std::move(o._ref);
            }
            return *this;
        }

        entry_ptr& operator=(const entry_ptr& o) noexcept {
            if (this != &o) {
                *this = nullptr; // release the current reference first
                _ref = o._ref;
            }
            return *this;
        }

        explicit operator bool() const noexcept { return bool(_ref); }

        const element_type& operator*() const noexcept { return _ref->page(); }
        const element_type* operator->() const noexcept { return &_ref->page(); }
        element_type& operator*() noexcept { return _ref->page(); }
        element_type* operator->() noexcept { return &_ref->page(); }
    };

    // Creates a shared pointer to cp.
    // Invalidates cp.
    entry_ptr share(entry& cp) {
        auto wptr = cp.weak_from_this(); // may throw
        return entry_ptr(std::move(wptr));
    }

    using list_ptr = entry_ptr; // for compatibility with old code
private:
    // B+-tree rather than a hashtable so insertion never reallocates large
    // contiguous storage (the reactor-stall source this commit removes).
    using cache_type = bplus::tree<key_type, entry, key_less_comparator, 8, bplus::key_search::linear>;
    cache_type _cache;
    logalloc::region& _region;
    logalloc::allocating_section _as;
    lru& _lru;
public:
    // Create a cache with a given LRU attached.
    partition_index_cache(lru& lru_, logalloc::region& r)
        : _cache(key_less_comparator())
        , _region(r)
        , _lru(lru_)
    { }

    ~partition_index_cache() {
        // Dispose of any remaining entries, accounting each as an eviction
        // (on_evicted() updates used_bytes/evictions).
        with_allocator(_region.allocator(), [&] {
            _cache.clear_and_dispose([this] (entry* e) noexcept {
                on_evicted(*e);
            });
        });
    }

    partition_index_cache(partition_index_cache&&) = delete;
    partition_index_cache(const partition_index_cache&) = delete;

    // Returns a future which resolves with a shared pointer to index_list for given key.
    // Always returns a valid pointer if succeeds. The pointer is never invalidated externally.
    //
    // If entry is missing, the loader is invoked. If list is already loading, this invocation
    // will wait for prior loading to complete and use its result when it's done.
    //
    // The loader object does not survive deferring, so the caller must deal with its liveness.
    //
    // The returned future must be waited on before destroying this instance.
    template<typename Loader>
    future<entry_ptr> get_or_load(const key_type& key, Loader&& loader) {
        auto i = _cache.lower_bound(key);
        if (i != _cache.end() && i->_key == key) {
            entry& cp = *i;
            auto ptr = share(cp);
            if (cp.ready()) {
                ++_shard_stats.hits;
                return make_ready_future<entry_ptr>(std::move(ptr));
            } else {
                // Another get_or_load() is already loading this page; wait
                // for its shared_promise instead of loading again.
                ++_shard_stats.blocks;
                return _as(_region, [ptr] () mutable {
                    return ptr.get_entry().promise().get_shared_future();
                }).then([ptr] () mutable {
                    return std::move(ptr);
                });
            }
        }
        ++_shard_stats.misses;
        ++_shard_stats.blocks;
        // Insert a loading placeholder entry and take a reference to it.
        entry_ptr ptr = _as(_region, [&] {
            return with_allocator(_region.allocator(), [&] {
                auto it_and_flag = _cache.emplace(key, this, key);
                entry &cp = *it_and_flag.first;
                assert(it_and_flag.second);
                try {
                    return share(cp);
                } catch (...) {
                    _cache.erase(key);
                    throw;
                }
            });
        });
        // No exceptions before then_wrapped() is installed so that ptr will be eventually populated.
        return futurize_invoke(loader, key).then_wrapped([this, key, ptr] (auto&& f) mutable {
            entry& e = ptr.get_entry();
            try {
                partition_index_page&& page = f.get0();
                // NOTE(review): waiters are signalled before set_page()
                // installs the page; relies on set_value() not running
                // continuations inline — confirm against seastar's
                // shared_promise semantics.
                e.promise().set_value();
                e.set_page(std::move(page));
                _shard_stats.used_bytes += e.size_in_allocator();
                ++_shard_stats.populations;
            } catch (...) {
                // Propagate the failure to waiters and drop the placeholder.
                e.promise().set_exception(std::current_exception());
                // NOTE(review): the entry is erased while ptr still weakly
                // references it — confirm this cannot trip the
                // !is_referenced() assert in ~entry().
                with_allocator(_region.allocator(), [&] {
                    _cache.erase(key);
                });
                throw;
            }
        }).then([ptr] {
            return ptr;
        });
    }

    // Stats bookkeeping for an entry leaving the cache.
    void on_evicted(entry& p) {
        _shard_stats.used_bytes -= p.size_in_allocator();
        ++_shard_stats.evictions;
    }

    static const stats& shard_stats() { return _shard_stats; }

    // Evicts all unreferenced entries.
    future<> evict_gently() {
        auto i = _cache.begin();
        while (i != _cache.end()) {
            with_allocator(_region.allocator(), [&] {
                if (i->is_referenced()) {
                    ++i; // keep entries with live entry_ptr:s
                } else {
                    on_evicted(*i);
                    i = i.erase(key_less_comparator());
                }
            });
            if (need_preempt() && i != _cache.end()) {
                // Yield to the reactor, then re-establish the iterator from
                // the saved key — the tree may change while preempted.
                auto key = i->key();
                co_await make_ready_future<>();
                i = _cache.lower_bound(key);
            }
        }
    }
};
// Eviction hook (evictable override): update the owner's stats, then
// remove — and thereby destroy — this entry from the B+-tree via an
// iterator constructed over ourselves.
inline void partition_index_cache::entry::on_evicted() noexcept {
    _parent->on_evicted(*this); // must run before erase destroys *this
    auto self_it = cache_type::iterator(this);
    self_it.erase(key_less_comparator());
}
using shared_index_lists = partition_index_cache;
}

View File

@@ -21,62 +21,10 @@
#pragma once
#include "index_entry.hh"
#include <vector>
#include <seastar/core/future.hh>
#include "utils/loading_shared_values.hh"
#include "utils/chunked_vector.hh"
#include "partition_index_cache.hh"
namespace sstables {
// Associative cache of summary index -> partition_index_page
// Entries stay around as long as there is any live external reference (list_ptr) to them.
// Supports asynchronous insertion, ensures that only one entry will be loaded.
class partition_index_cache {
public:
using key_type = uint64_t;
static thread_local struct stats {
uint64_t hits = 0; // Number of times entry was found ready
uint64_t misses = 0; // Number of times entry was not found
uint64_t blocks = 0; // Number of times entry was not ready (>= misses)
uint64_t evictions = 0; // Number of times entry was evicted
} _shard_stats;
struct stats_updater {
static void inc_hits() noexcept { ++_shard_stats.hits; }
static void inc_misses() noexcept { ++_shard_stats.misses; }
static void inc_blocks() noexcept { ++_shard_stats.blocks; }
static void inc_evictions() noexcept { ++_shard_stats.evictions; }
};
using loading_shared_lists_type = utils::loading_shared_values<key_type, index_list, std::hash<key_type>, std::equal_to<key_type>, stats_updater>;
// Pointer to index_list
using list_ptr = loading_shared_lists_type::entry_ptr;
private:
loading_shared_lists_type _lists;
public:
// Create a cache with a given LRU attached.
partition_index_cache() = default;
partition_index_cache(partition_index_cache&&) = delete;
partition_index_cache(const partition_index_cache&) = delete;
// Returns a future which resolves with a shared pointer to index_list for given key.
// Always returns a valid pointer if succeeds. The pointer is never invalidated externally.
//
// If entry is missing, the loader is invoked. If list is already loading, this invocation
// will wait for prior loading to complete and use its result when it's done.
//
// The loader object does not survive deferring, so the caller must deal with its liveness.
template<typename Loader>
future<list_ptr> get_or_load(const key_type& key, Loader&& loader) {
return _lists.get_or_load(key, std::forward<Loader>(loader));
}
static const stats& shard_stats() { return _shard_stats; }
};
using shared_index_lists = partition_index_cache;
}

View File

@@ -2848,6 +2848,10 @@ future<> init_metrics() {
sm::description("Index page requests which needed to wait due to page not being loaded yet")),
sm::make_derive("index_page_evictions", [] { return shared_index_lists::shard_stats().evictions; },
sm::description("Index pages which got evicted from memory")),
sm::make_derive("index_page_populations", [] { return shared_index_lists::shard_stats().populations; },
sm::description("Index pages which got populated into memory")),
sm::make_gauge("index_page_used_bytes", [] { return shared_index_lists::shard_stats().used_bytes; },
sm::description("Amount of bytes used by index pages in memory")),
sm::make_derive("index_page_cache_hits", [] { return index_page_cache_metrics.page_hits; },
sm::description("Index page cache requests which were served from cache")),
@@ -2967,13 +2971,13 @@ sstable::sstable(schema_ptr schema,
, _generation(generation)
, _version(v)
, _format(f)
, _index_lists(manager.get_cache_tracker().get_lru(), manager.get_cache_tracker().region())
, _now(now)
, _read_error_handler(error_handler_gen(sstable_read_error))
, _write_error_handler(error_handler_gen(sstable_write_error))
, _large_data_handler(large_data_handler)
, _manager(manager)
{
_index_lists.set_allocator(manager.get_cache_tracker().region().allocator());
tracker.add(*this);
manager.add(this);
}

View File

@@ -0,0 +1,236 @@
/*
* Copyright (C) 2021-present ScyllaDB
*/
/*
* This file is part of Scylla.
*
* Scylla is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Scylla is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
*/
#include <boost/test/unit_test.hpp>
#include <seastar/testing/test_case.hh>
#include <seastar/testing/thread_test_case.hh>
#include "sstables/partition_index_cache.hh"
#include "test/lib/simple_schema.hh"
using namespace sstables;
// Appends to `page` an index entry for `key` located at file offset
// `position`, allocating inside region `r`.
static void add_entry(logalloc::region& r,
                      const schema& s,
                      partition_index_page& page,
                      const partition_key& key,
                      uint64_t position)
{
    logalloc::allocating_section section;
    section(r, [&] {
        auto sst_key = sstables::key::from_partition_key(s, key);
        page._entries.push_back(make_managed<index_entry>(
            managed_bytes(sst_key.get_bytes()),
            position,
            managed_ref<promoted_index>()));
    });
}
// Builds the canonical test page: four entries whose position equals
// their ordinal (0..3).
static partition_index_page make_page0(logalloc::region& r, simple_schema& s) {
    partition_index_page page;
    for (int i = 0; i < 4; ++i) {
        add_entry(r, *s.schema(), page, s.make_pkey(i).key(), i);
    }
    return page;
}
// Asserts that `ptr` points at the page built by make_page0():
// four entries with positions equal to their index (0..3).
static void has_page0(partition_index_cache::list_ptr ptr) {
    BOOST_REQUIRE(!ptr->empty());
    BOOST_REQUIRE_EQUAL(ptr->_entries.size(), 4);
    // Fold the four identical checks into a loop; also drops the stray
    // semicolon that followed the original function definition.
    for (size_t i = 0; i < 4; ++i) {
        BOOST_REQUIRE_EQUAL(ptr->_entries[i]->position(), i);
    }
}
SEASTAR_THREAD_TEST_CASE(test_caching) {
    ::lru lru;
    simple_schema s;
    logalloc::region r;
    partition_index_cache cache(lru, r);

    // Loader that defers (later()) before producing the page, so both
    // get_or_load() calls below start while the page is still loading.
    auto page0_loader = [&] (partition_index_cache::key_type k) {
        return later().then([&] {
            return make_page0(r, s);
        });
    };

    auto old_stats = cache.shard_stats();

    // Two concurrent loads of the same key: one miss, two blocks.
    auto f0 = cache.get_or_load(0, page0_loader);
    auto f1 = cache.get_or_load(0, page0_loader);

    BOOST_REQUIRE_EQUAL(cache.shard_stats().hits, old_stats.hits);
    BOOST_REQUIRE_EQUAL(cache.shard_stats().misses, old_stats.misses + 1);
    BOOST_REQUIRE_EQUAL(cache.shard_stats().blocks, old_stats.blocks + 2);

    // Compaction and eviction while the load is in flight must not break
    // the pending futures.
    r.full_compaction();
    with_allocator(r.allocator(), [&] {
        lru.evict_all();
    });

    partition_index_cache::list_ptr ptr0 = f0.get0();
    partition_index_cache::list_ptr ptr1 = f1.get0();

    r.full_compaction();
    with_allocator(r.allocator(), [&] {
        lru.evict_all();
    });

    // A single population served both loads; live pointers block eviction.
    BOOST_REQUIRE_EQUAL(cache.shard_stats().populations, old_stats.populations + 1);
    BOOST_REQUIRE_EQUAL(cache.shard_stats().evictions, old_stats.evictions);
    BOOST_REQUIRE(cache.shard_stats().used_bytes > 0);

    has_page0(ptr0);
    has_page0(ptr1);
    // Both pointers must refer to the same cached page instance.
    BOOST_REQUIRE(&*ptr0 == &*ptr1);

    {
        auto ptr2 = ptr1;            // copy shares the entry
        auto ptr3 = std::move(ptr2); // move leaves ptr2 null
        BOOST_REQUIRE(!ptr2);
        BOOST_REQUIRE(ptr3);

        ptr0 = nullptr;
        ptr1 = nullptr;
        BOOST_REQUIRE(!ptr1);

        with_allocator(r.allocator(), [&] {
            lru.evict_all();
        });

        // ptr3 prevents page 0 evictions
        BOOST_REQUIRE_EQUAL(cache.shard_stats().evictions, old_stats.evictions);
        has_page0(ptr3);

        // Dropping the last reference makes the entry evictable.
        ptr3 = nullptr;
        with_allocator(r.allocator(), [&] {
            lru.evict_all();
        });
        BOOST_REQUIRE_EQUAL(cache.shard_stats().evictions, old_stats.evictions + 1);
        BOOST_REQUIRE_EQUAL(cache.shard_stats().used_bytes, old_stats.used_bytes);
    }

    {
        // After eviction the key misses again and is re-populated.
        auto ptr4 = cache.get_or_load(0, page0_loader).get0();
        has_page0(ptr4);
        BOOST_REQUIRE_EQUAL(cache.shard_stats().misses, old_stats.misses + 2);
        BOOST_REQUIRE_EQUAL(cache.shard_stats().populations, old_stats.populations + 2);
    }
}
SEASTAR_THREAD_TEST_CASE(test_auto_clear) {
    // Destroying the cache must dispose of all remaining entries and
    // account for them as evictions.
    ::lru lru;
    simple_schema s;
    logalloc::region r;
    partition_index_cache::stats old_stats;
    {
        partition_index_cache cache(lru, r);
        auto loader = [&] (partition_index_cache::key_type) {
            return make_page0(r, s);
        };
        old_stats = cache.shard_stats();
        // Populate three keys; the returned pointers are dropped at once.
        for (partition_index_cache::key_type k : {0, 1, 2}) {
            cache.get_or_load(k, loader).get();
        }
    }
    partition_index_cache cache2(lru, r); // to get stats
    BOOST_REQUIRE_EQUAL(cache2.shard_stats().evictions, old_stats.evictions + 3);
    BOOST_REQUIRE_EQUAL(cache2.shard_stats().used_bytes, old_stats.used_bytes);
    BOOST_REQUIRE_EQUAL(cache2.shard_stats().populations, old_stats.populations + 3);
}
SEASTAR_THREAD_TEST_CASE(test_destroy) {
    // NOTE(review): this body is identical to the first half of
    // test_evict_gently and never destroys the cache while entries are
    // cached — confirm whether it was meant to exercise destruction
    // instead of evict_gently().
    ::lru lru;
    simple_schema s;
    logalloc::region r;
    partition_index_cache::stats old_stats;
    partition_index_cache cache(lru, r);
    // Loader that produces the page without deferring.
    auto page0_loader = [&] (partition_index_cache::key_type k) {
        return make_page0(r, s);
    };
    old_stats = cache.shard_stats();
    // Populate three keys; pointers are dropped immediately, leaving all
    // entries unreferenced.
    cache.get_or_load(0, page0_loader).get();
    cache.get_or_load(1, page0_loader).get();
    cache.get_or_load(2, page0_loader).get();
    cache.evict_gently().get();
    // All three unreferenced entries must have been evicted.
    BOOST_REQUIRE_EQUAL(cache.shard_stats().evictions, old_stats.evictions + 3);
    BOOST_REQUIRE_EQUAL(cache.shard_stats().used_bytes, old_stats.used_bytes);
    BOOST_REQUIRE_EQUAL(cache.shard_stats().populations, old_stats.populations + 3);
}
SEASTAR_THREAD_TEST_CASE(test_evict_gently) {
    ::lru lru;
    simple_schema s;
    logalloc::region r;
    partition_index_cache::stats old_stats;
    partition_index_cache cache(lru, r);
    // Loader that produces the page without deferring.
    auto page0_loader = [&] (partition_index_cache::key_type k) {
        return make_page0(r, s);
    };
    old_stats = cache.shard_stats();
    cache.get_or_load(0, page0_loader).get();
    cache.get_or_load(1, page0_loader).get();
    cache.get_or_load(2, page0_loader).get();
    // All three entries are unreferenced; evict_gently() removes them all.
    cache.evict_gently().get();
    BOOST_REQUIRE_EQUAL(cache.shard_stats().evictions, old_stats.evictions + 3);
    BOOST_REQUIRE_EQUAL(cache.shard_stats().used_bytes, old_stats.used_bytes);
    BOOST_REQUIRE_EQUAL(cache.shard_stats().populations, old_stats.populations + 3);
    // kept alive around evict_gently()
    auto page = cache.get_or_load(1, page0_loader).get();
    BOOST_REQUIRE_EQUAL(cache.shard_stats().populations, old_stats.populations + 4);
    // `page` still references key 1, so the eviction count must not change.
    cache.evict_gently().get();
    BOOST_REQUIRE_EQUAL(cache.shard_stats().evictions, old_stats.evictions + 3);
    // A loader that must never run: key 1 is expected to still be cached.
    auto no_loader = [&] (partition_index_cache::key_type k) -> future<partition_index_page> {
        throw std::runtime_error("should not have been invoked");
    };
    cache.get_or_load(1, no_loader).get(); // page keeps the page alive
    cache.evict_gently().get();
}

View File

@@ -249,4 +249,13 @@ public:
}
return 0;
}
// Returns the amount of external memory used to hold inserted items.
// Takes into account reserved space.
size_t external_memory_usage() const {
    if (is_external()) {
        // The full reserved capacity is charged, not just the used size.
        return sizeof(external) + _capacity * sizeof(T);
    }
    // Inline (internal) storage contributes no external memory.
    return 0;
}
};