mirror of
https://github.com/scylladb/scylladb.git
synced 2026-04-25 19:10:42 +00:00
This applies to small-partition workloads where index pages have a high partition count and the index doesn't fit in the cache. It was observed that the count can be on the order of hundreds. In such a workload, pages undergo constant population, LSA compaction, and LSA eviction, which has a severe impact on CPU utilization.
Refs https://scylladb.atlassian.net/browse/SCYLLADB-620
This PR reduces the impact by several changes:
- reducing memory footprint in the partition index. Assuming partition key size is 16 bytes, the cost dropped from 96 bytes to 36 bytes per partition.
- flattening the object graph and amortizing storage. Storing entries directly in the vector. Storing all key values in a single managed_bytes. Making index_entry a trivial struct.
- index entries and key storage are now trivially moveable, and batched inside vector storage
so LSA migration can use memcpy(), which amortizes the cost per key. This reduces the cost of LSA segment compaction.
- LSA eviction is now pretty much constant time for the whole page
regardless of the number of entries, because elements are trivial and batched inside vectors.
Page eviction cost dropped from 50 us to 1 us.
Performance evaluated with:
scylla perf-simple-query -c1 -m200M --partitions=1000000
Before:
```
7774.96 tps (166.0 allocs/op, 521.7 logallocs/op, 54.0 tasks/op, 802428 insns/op, 430457 cycles/op, 0 errors)
7511.08 tps (166.1 allocs/op, 527.2 logallocs/op, 54.0 tasks/op, 804185 insns/op, 430752 cycles/op, 0 errors)
7740.44 tps (166.3 allocs/op, 526.2 logallocs/op, 54.2 tasks/op, 805347 insns/op, 432117 cycles/op, 0 errors)
7818.72 tps (165.2 allocs/op, 517.6 logallocs/op, 53.7 tasks/op, 794965 insns/op, 427751 cycles/op, 0 errors)
7865.49 tps (165.1 allocs/op, 513.3 logallocs/op, 53.6 tasks/op, 788898 insns/op, 425171 cycles/op, 0 errors)
```
After (+318%):
```
32492.40 tps (130.7 allocs/op, 12.8 logallocs/op, 36.1 tasks/op, 109236 insns/op, 103203 cycles/op, 0 errors)
32591.99 tps (130.4 allocs/op, 12.8 logallocs/op, 36.0 tasks/op, 108947 insns/op, 102889 cycles/op, 0 errors)
32514.52 tps (130.6 allocs/op, 12.8 logallocs/op, 36.0 tasks/op, 109118 insns/op, 103219 cycles/op, 0 errors)
32491.14 tps (130.6 allocs/op, 12.8 logallocs/op, 36.0 tasks/op, 109349 insns/op, 103272 cycles/op, 0 errors)
32582.90 tps (130.5 allocs/op, 12.8 logallocs/op, 36.0 tasks/op, 109269 insns/op, 102872 cycles/op, 0 errors)
32479.43 tps (130.6 allocs/op, 12.8 logallocs/op, 36.0 tasks/op, 109313 insns/op, 103242 cycles/op, 0 errors)
32418.48 tps (130.7 allocs/op, 12.8 logallocs/op, 36.1 tasks/op, 109201 insns/op, 103301 cycles/op, 0 errors)
31394.14 tps (130.7 allocs/op, 12.8 logallocs/op, 36.1 tasks/op, 109267 insns/op, 103301 cycles/op, 0 errors)
32298.55 tps (130.7 allocs/op, 12.8 logallocs/op, 36.1 tasks/op, 109323 insns/op, 103551 cycles/op, 0 errors)
```
When the workload is miss-only, with both row cache and index cache disabled (no cache maintenance cost):
perf-simple-query -c1 -m200M --duration 6000 --partitions=100000 --enable-index-cache=0 --enable-cache=0
Before:
```
9124.57 tps (146.2 allocs/op, 789.0 logallocs/op, 45.3 tasks/op, 889320 insns/op, 357937 cycles/op, 0 errors)
9437.23 tps (146.1 allocs/op, 789.3 logallocs/op, 45.3 tasks/op, 889613 insns/op, 357782 cycles/op, 0 errors)
9455.65 tps (146.0 allocs/op, 787.4 logallocs/op, 45.2 tasks/op, 887606 insns/op, 357167 cycles/op, 0 errors)
9451.22 tps (146.0 allocs/op, 787.4 logallocs/op, 45.3 tasks/op, 887627 insns/op, 357357 cycles/op, 0 errors)
9429.50 tps (146.0 allocs/op, 787.4 logallocs/op, 45.3 tasks/op, 887761 insns/op, 358148 cycles/op, 0 errors)
9430.29 tps (146.1 allocs/op, 788.2 logallocs/op, 45.3 tasks/op, 888501 insns/op, 357679 cycles/op, 0 errors)
9454.08 tps (146.0 allocs/op, 787.3 logallocs/op, 45.3 tasks/op, 887545 insns/op, 357132 cycles/op, 0 errors)
```
After (+55%):
```
14484.84 tps (150.7 allocs/op, 6.5 logallocs/op, 44.7 tasks/op, 396164 insns/op, 229490 cycles/op, 0 errors)
14526.21 tps (150.8 allocs/op, 6.5 logallocs/op, 44.8 tasks/op, 396401 insns/op, 228824 cycles/op, 0 errors)
14567.53 tps (150.7 allocs/op, 6.5 logallocs/op, 44.7 tasks/op, 396319 insns/op, 228701 cycles/op, 0 errors)
14545.63 tps (150.6 allocs/op, 6.5 logallocs/op, 44.7 tasks/op, 395889 insns/op, 228493 cycles/op, 0 errors)
14626.06 tps (150.5 allocs/op, 6.5 logallocs/op, 44.7 tasks/op, 395254 insns/op, 227891 cycles/op, 0 errors)
14593.74 tps (150.5 allocs/op, 6.5 logallocs/op, 44.7 tasks/op, 395480 insns/op, 227993 cycles/op, 0 errors)
14538.10 tps (150.8 allocs/op, 6.5 logallocs/op, 44.8 tasks/op, 397035 insns/op, 228831 cycles/op, 0 errors)
14527.18 tps (150.8 allocs/op, 6.5 logallocs/op, 44.8 tasks/op, 396992 insns/op, 228839 cycles/op, 0 errors)
```
Same as above, but with summary ratio increased from 0.0005 to 0.005 (smaller pages):
Before:
```
33906.70 tps (146.1 allocs/op, 83.6 logallocs/op, 45.1 tasks/op, 170553 insns/op, 98104 cycles/op, 0 errors)
32696.16 tps (146.0 allocs/op, 83.5 logallocs/op, 45.1 tasks/op, 170369 insns/op, 98405 cycles/op, 0 errors)
33889.05 tps (146.1 allocs/op, 83.6 logallocs/op, 45.1 tasks/op, 170551 insns/op, 98135 cycles/op, 0 errors)
33893.24 tps (146.1 allocs/op, 83.5 logallocs/op, 45.1 tasks/op, 170488 insns/op, 98168 cycles/op, 0 errors)
33836.73 tps (146.1 allocs/op, 83.6 logallocs/op, 45.1 tasks/op, 170528 insns/op, 98226 cycles/op, 0 errors)
33897.61 tps (146.0 allocs/op, 83.5 logallocs/op, 45.1 tasks/op, 170428 insns/op, 98081 cycles/op, 0 errors)
33834.73 tps (146.1 allocs/op, 83.5 logallocs/op, 45.1 tasks/op, 170438 insns/op, 98178 cycles/op, 0 errors)
33776.31 tps (146.3 allocs/op, 83.9 logallocs/op, 45.2 tasks/op, 170958 insns/op, 98418 cycles/op, 0 errors)
33808.08 tps (146.3 allocs/op, 83.9 logallocs/op, 45.2 tasks/op, 170940 insns/op, 98388 cycles/op, 0 errors)
```
After (+18%):
```
40081.51 tps (148.2 allocs/op, 4.4 logallocs/op, 45.0 tasks/op, 121047 insns/op, 82231 cycles/op, 0 errors)
40005.85 tps (148.6 allocs/op, 4.4 logallocs/op, 45.2 tasks/op, 121327 insns/op, 82545 cycles/op, 0 errors)
39816.75 tps (148.3 allocs/op, 4.4 logallocs/op, 45.1 tasks/op, 121067 insns/op, 82419 cycles/op, 0 errors)
39953.11 tps (148.1 allocs/op, 4.4 logallocs/op, 45.0 tasks/op, 121027 insns/op, 82258 cycles/op, 0 errors)
40073.96 tps (148.2 allocs/op, 4.4 logallocs/op, 45.0 tasks/op, 121006 insns/op, 82313 cycles/op, 0 errors)
39882.25 tps (148.2 allocs/op, 4.4 logallocs/op, 45.0 tasks/op, 120925 insns/op, 82320 cycles/op, 0 errors)
39916.08 tps (148.3 allocs/op, 4.4 logallocs/op, 45.1 tasks/op, 121054 insns/op, 82393 cycles/op, 0 errors)
39786.30 tps (148.2 allocs/op, 4.4 logallocs/op, 45.0 tasks/op, 121027 insns/op, 82465 cycles/op, 0 errors)
38662.45 tps (148.3 allocs/op, 4.4 logallocs/op, 45.0 tasks/op, 121108 insns/op, 82312 cycles/op, 0 errors)
39849.42 tps (148.3 allocs/op, 4.4 logallocs/op, 45.1 tasks/op, 121098 insns/op, 82447 cycles/op, 0 errors)
```
Closes scylladb/scylladb#28603
* github.com:scylladb/scylladb:
sstables: mx: index_reader: Optimize parsing for no promoted index case
vint: Use std::countl_zero()
test: sstable_partition_index_cache_test: Validate scenario of pages with sparse promoted index placement
sstables: mx: index_reader: Amoritze partition key storage
managed_bytes: Hoist write_fragmented() to common header
utils: managed_vector: Use std::uninitialized_move() to move objects
sstables: mx: index_reader: Keep promoted_index info next to index_entry
sstables: mx: index_reader: Extract partition_index_page::clear_gently()
sstables: mx: index_reader: Shave-off 16 bytes from index_entry by using raw_token
sstables: mx: index_reader: Reduce allocation_section overhead during index page parsing by batching allocation
sstables: mx: index_reader: Keep index_entry directly in the vector
dht: Introduce raw_token
test: perf_simple_query: Add 'sstable-format' command-line option
test: perf_simple_query: Add 'sstable-summary-ratio' command-line option
test: perf-simple-query: Add option to disable index cache
test: cql_test_env: Respect enable-index-cache config
(cherry picked from commit 5e7fb08bf3)
Closes scylladb/scylladb#29136
Closes scylladb/scylladb#29140
297 lines
10 KiB
C++
297 lines
10 KiB
C++
/*
|
|
* Copyright (C) 2021 ScyllaDB
|
|
*/
|
|
|
|
/*
|
|
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
|
|
*/
|
|
|
|
#pragma once
|
|
|
|
#include "index_entry.hh"
|
|
#include <seastar/core/future.hh>
|
|
#include <seastar/core/loop.hh>
|
|
#include <seastar/core/coroutine.hh>
|
|
#include <seastar/core/shared_future.hh>
|
|
#include <seastar/coroutine/maybe_yield.hh>
|
|
#include "utils/assert.hh"
|
|
#include "utils/bptree.hh"
|
|
#include "utils/lru.hh"
|
|
#include "utils/lsa/weak_ptr.hh"
|
|
#include "sstables/partition_index_cache_stats.hh"
|
|
|
|
namespace sstables {
|
|
|
|
// Associative cache of summary index -> partition_index_page
|
|
// Entries stay around as long as there is any live external reference (entry_ptr) to them.
|
|
// Supports asynchronous insertion, ensures that only one entry will be loaded.
|
|
// Entries without a live entry_ptr are linked in the LRU.
|
|
// The instance must be destroyed only after all live_ptr:s are gone.
|
|
class partition_index_cache {
|
|
public:
|
|
using key_type = uint64_t;
|
|
private:
|
|
// Allocated inside LSA
|
|
class entry final : public index_evictable, public lsa::weakly_referencable<entry> {
|
|
public:
|
|
partition_index_cache* _parent;
|
|
key_type _key;
|
|
std::variant<lw_shared_ptr<shared_promise<>>, partition_index_page> _page;
|
|
size_t _size_in_allocator = 0;
|
|
public:
|
|
entry(partition_index_cache* parent, key_type key)
|
|
: _parent(parent)
|
|
, _key(key)
|
|
, _page(make_lw_shared<shared_promise<>>())
|
|
{ }
|
|
|
|
void set_page(partition_index_page&& page) noexcept {
|
|
with_allocator(_parent->_region.allocator(), [&] {
|
|
_size_in_allocator = sizeof(entry) + page.external_memory_usage();
|
|
});
|
|
_page = std::move(page);
|
|
}
|
|
|
|
entry(entry&&) noexcept = default;
|
|
|
|
~entry() {
|
|
if (is_referenced()) {
|
|
// Live entry_ptr should keep the entry alive, except when the entry failed on loading.
|
|
// In that case, entry_ptr holders are not supposed to use the pointer, so it's safe
|
|
// to nullify those entry_ptrs.
|
|
SCYLLA_ASSERT(!ready());
|
|
}
|
|
}
|
|
|
|
void on_evicted() noexcept override;
|
|
|
|
// Returns the amount of memory owned by this entry.
|
|
// Always returns the same value for a given state of _page.
|
|
size_t size_in_allocator() const { return _size_in_allocator; }
|
|
|
|
lw_shared_ptr<shared_promise<>> promise() { return std::get<lw_shared_ptr<shared_promise<>>>(_page); }
|
|
bool ready() const { return std::holds_alternative<partition_index_page>(_page); }
|
|
partition_index_page& page() { return std::get<partition_index_page>(_page); }
|
|
const partition_index_page& page() const { return std::get<partition_index_page>(_page); }
|
|
key_type key() const { return _key; }
|
|
};
|
|
public:
|
|
struct key_less_comparator {
|
|
bool operator()(key_type lhs, key_type rhs) const noexcept {
|
|
return lhs < rhs;
|
|
}
|
|
};
|
|
|
|
// A shared pointer to cached partition_index_page.
|
|
//
|
|
// Prevents page from being evicted.
|
|
// Never invalidated.
|
|
// Can be accessed and destroyed in the standard allocator context.
|
|
//
|
|
// The partition_index_page reference obtained by dereferencing this pointer
|
|
// is invalidated when the owning LSA region invalidates references.
|
|
class entry_ptr {
|
|
// *_ref is kept alive by the means of unlinking from LRU.
|
|
lsa::weak_ptr<entry> _ref;
|
|
private:
|
|
friend class partition_index_cache;
|
|
entry& get_entry() { return *_ref; }
|
|
public:
|
|
using element_type = partition_index_page;
|
|
entry_ptr() = default;
|
|
explicit entry_ptr(lsa::weak_ptr<entry> ref)
|
|
: _ref(std::move(ref))
|
|
{
|
|
if (_ref->is_linked()) {
|
|
_ref->_parent->_lru.remove(*_ref);
|
|
}
|
|
}
|
|
~entry_ptr() { *this = nullptr; }
|
|
entry_ptr(entry_ptr&&) noexcept = default;
|
|
entry_ptr(const entry_ptr&) noexcept = default;
|
|
entry_ptr& operator=(std::nullptr_t) noexcept {
|
|
if (_ref) {
|
|
if (_ref.unique() && _ref->ready()) {
|
|
_ref->_parent->_lru.add(*_ref);
|
|
}
|
|
_ref = nullptr;
|
|
}
|
|
return *this;
|
|
}
|
|
entry_ptr& operator=(entry_ptr&& o) noexcept {
|
|
if (this != &o) {
|
|
*this = nullptr;
|
|
_ref = std::move(o._ref);
|
|
}
|
|
return *this;
|
|
}
|
|
entry_ptr& operator=(const entry_ptr& o) noexcept {
|
|
if (this != &o) {
|
|
*this = nullptr;
|
|
_ref = o._ref;
|
|
}
|
|
return *this;
|
|
}
|
|
explicit operator bool() const noexcept { return bool(_ref); }
|
|
const element_type& operator*() const noexcept { return _ref->page(); }
|
|
const element_type* operator->() const noexcept { return &_ref->page(); }
|
|
element_type& operator*() noexcept { return _ref->page(); }
|
|
element_type* operator->() noexcept { return &_ref->page(); }
|
|
};
|
|
|
|
// Creates a shared pointer to cp.
|
|
// Invalidates cp.
|
|
entry_ptr share(entry& cp) {
|
|
auto wptr = cp.weak_from_this(); // may throw
|
|
return entry_ptr(std::move(wptr));
|
|
}
|
|
|
|
private:
|
|
using cache_type = bplus::tree<key_type, entry, key_less_comparator, 8, bplus::key_search::linear>;
|
|
cache_type _cache;
|
|
logalloc::region& _region;
|
|
logalloc::allocating_section _as;
|
|
lru& _lru;
|
|
partition_index_cache_stats& _stats;
|
|
public:
|
|
|
|
// Create a cache with a given LRU attached.
|
|
partition_index_cache(lru& lru_, logalloc::region& r, partition_index_cache_stats& stats)
|
|
: _cache(key_less_comparator())
|
|
, _region(r)
|
|
, _lru(lru_)
|
|
, _stats(stats)
|
|
{ }
|
|
|
|
~partition_index_cache() {
|
|
with_allocator(_region.allocator(), [&] {
|
|
_cache.clear_and_dispose([this] (entry* e) noexcept {
|
|
_lru.remove(*e);
|
|
on_evicted(*e);
|
|
});
|
|
});
|
|
}
|
|
|
|
partition_index_cache(partition_index_cache&&) = delete;
|
|
partition_index_cache(const partition_index_cache&) = delete;
|
|
|
|
// Returns a future which resolves with a shared pointer to index_list for given key.
|
|
// Always returns a valid pointer if succeeds. The pointer is never invalidated externally.
|
|
//
|
|
// If entry is missing, the loader is invoked. If list is already loading, this invocation
|
|
// will wait for prior loading to complete and use its result when it's done.
|
|
//
|
|
// The loader object does not survive deferring, so the caller must deal with its liveness.
|
|
//
|
|
// The returned future must be waited on before destroying this instance.
|
|
template<typename Loader>
|
|
future<entry_ptr> get_or_load(const key_type& key, Loader&& loader) {
|
|
auto i = _cache.lower_bound(key);
|
|
if (i != _cache.end() && i->_key == key) {
|
|
entry& cp = *i;
|
|
auto ptr = share(cp);
|
|
if (cp.ready()) {
|
|
++_stats.hits;
|
|
return make_ready_future<entry_ptr>(std::move(ptr));
|
|
} else {
|
|
++_stats.blocks;
|
|
return ptr.get_entry().promise()->get_shared_future().then([ptr] () mutable {
|
|
return std::move(ptr);
|
|
});
|
|
}
|
|
}
|
|
|
|
++_stats.misses;
|
|
++_stats.blocks;
|
|
|
|
entry_ptr ptr = _as(_region, [&] {
|
|
return with_allocator(_region.allocator(), [&] {
|
|
auto it_and_flag = _cache.emplace(key, this, key);
|
|
entry &cp = *it_and_flag.first;
|
|
parse_assert(it_and_flag.second);
|
|
try {
|
|
return share(cp);
|
|
} catch (...) {
|
|
_cache.erase(key);
|
|
throw;
|
|
}
|
|
});
|
|
});
|
|
|
|
// No exceptions before then_wrapped() is installed so that ptr will be eventually populated.
|
|
|
|
return futurize_invoke(loader, key).then_wrapped([this, key, ptr = std::move(ptr)] (auto&& f) mutable {
|
|
entry& e = ptr.get_entry();
|
|
try {
|
|
partition_index_page&& page = f.get();
|
|
e.promise()->set_value();
|
|
e.set_page(std::move(page));
|
|
_stats.used_bytes += e.size_in_allocator();
|
|
++_stats.populations;
|
|
return ptr;
|
|
} catch (...) {
|
|
e.promise()->set_exception(std::current_exception());
|
|
ptr = {};
|
|
with_allocator(_region.allocator(), [&] {
|
|
_cache.erase(key);
|
|
});
|
|
throw;
|
|
}
|
|
});
|
|
}
|
|
|
|
void on_evicted(entry& p) {
|
|
_stats.used_bytes -= p.size_in_allocator();
|
|
++_stats.evictions;
|
|
}
|
|
|
|
// Evicts all unreferenced entries.
|
|
future<> evict_gently() {
|
|
auto i = _cache.begin();
|
|
std::optional<partition_index_page> partial_page;
|
|
auto clear_on_exception = defer([&] {
|
|
with_allocator(_region.allocator(), [&] {
|
|
partial_page.reset();
|
|
});
|
|
});
|
|
while (partial_page || i != _cache.end()) {
|
|
if (partial_page) {
|
|
auto preempted = with_allocator(_region.allocator(), [&] {
|
|
while (partial_page->clear_gently() != stop_iteration::yes) {
|
|
return true;
|
|
}
|
|
partial_page.reset();
|
|
return need_preempt();
|
|
});
|
|
if (preempted) {
|
|
auto key = (i != _cache.end()) ? std::optional(i->key()) : std::nullopt;
|
|
co_await coroutine::maybe_yield();
|
|
i = key ? _cache.lower_bound(*key) : _cache.end();
|
|
}
|
|
} else {
|
|
with_allocator(_region.allocator(), [&] {
|
|
if (i->is_referenced()) {
|
|
++i;
|
|
} else {
|
|
_lru.remove(*i);
|
|
on_evicted(*i);
|
|
if (i->ready()) {
|
|
partial_page = std::move(i->page());
|
|
}
|
|
i = i.erase(key_less_comparator());
|
|
}
|
|
});
|
|
}
|
|
}
|
|
}
|
|
};
|
|
|
|
inline
|
|
void partition_index_cache::entry::on_evicted() noexcept {
|
|
_parent->on_evicted(*this);
|
|
cache_type::iterator it(this);
|
|
it.erase(key_less_comparator());
|
|
}
|
|
|
|
}
|