Problem fixed on master since 5ed559c. So branch-4.5 and up aren't affected. Index reader fails to close input streams of promoted index readers when advancing to next summary entry, so Scylla can abort as a result of a stream being destroyed while there were reads in progress. This problem was seen when row cache issued a fast forward, so index reader was asked to advance to next summary entry while the previous one still had reads in progress. By closing the list of index readers when there's only one owner holding it, the problem is safely fixed, because it cannot happen that an index_bound like _lower_bound or _upper_bound will be left with a list that's already closed. Fixes #9049. test: mode(dev, debug). No observable perf regression: BEFORE: read skip time (s) iterations frags frag/s mad f/s max f/s min f/s avg aio aio (KiB) blocked dropped idx hit idx miss idx blk c hit c miss c blk cpu -> 1 0 8.168640 4 100000 12242 108 12262 11982 50032.2 50049 6403116 20707 0 0 8 8 0 0 0 83.3% -> 1 1 22.257916 4 50000 2246 3 2249 2238 150025.0 150025 6454272 100001 0 49999 100000 149999 0 0 0 54.7% -> 1 8 9.384961 4 11112 1184 5 1184 1178 77781.2 77781 1439328 66618 11111 1 33334 44444 0 0 0 44.0% -> 1 16 4.976144 4 5883 1182 6 1184 1173 41180.0 41180 762053 35264 5882 0 17648 23530 0 0 0 44.1% -> 1 32 2.582744 4 3031 1174 4 1175 1167 21216.0 21216 392619 18176 3031 0 9092 12122 0 0 0 43.8% -> 1 64 1.308410 4 1539 1176 2 1178 1173 10772.0 10772 199353 9233 1539 0 4616 6154 0 0 0 44.0% -> 1 256 0.331037 4 390 1178 12 1190 1165 2729.0 2729 50519 2338 390 0 1169 1558 0 0 0 44.0% -> 1 1024 0.085108 4 98 1151 7 1155 1141 685.0 685 12694 587 98 0 293 390 0 0 0 42.9% -> 1 4096 0.024393 6 25 1025 5 1029 1020 174.0 174 3238 149 25 0 74 98 0 0 0 37.4% -> 64 1 8.765446 4 98462 11233 16 11236 11182 54642.0 54648 6405470 23632 1 1538 4615 4615 0 0 0 79.3% -> 64 8 8.456430 4 88896 10512 48 10582 10464 55578.0 55578 6405971 24031 4166 0 5553 5553 0 0 0 77.3% -> 64 16 7.798197 4 80000 
10259 108 10299 10077 51248.0 51248 5922500 22160 4996 0 4998 4998 0 0 0 74.8% -> 64 32 6.605148 4 66688 10096 64 10168 10033 42715.0 42715 4936359 18796 4164 0 4165 4165 0 0 0 75.5% -> 64 64 4.933287 4 50016 10138 28 10189 10111 32039.0 32039 3702428 14106 3124 0 3125 3125 0 0 0 75.3% -> 64 256 1.971701 4 20032 10160 57 10347 10103 12831.0 12831 1482993 5731 1252 0 1250 1250 0 0 0 74.1% -> 64 1024 0.587026 4 5888 10030 84 10277 9946 3770.0 3770 435895 1635 368 0 366 366 0 0 0 74.6% -> 64 4096 0.157401 4 1600 10165 69 10202 9698 1023.0 1023 118449 455 100 0 98 98 0 0 0 73.9% AFTER: read skip time (s) iterations frags frag/s mad f/s max f/s min f/s avg aio aio (KiB) blocked dropped idx hit idx miss idx blk c hit c miss c blk cpu -> 1 0 8.191639 4 100000 12208 46 12279 12161 50031.2 50025 6403108 20243 0 0 0 0 0 0 0 87.0% -> 1 1 22.933121 4 50000 2180 36 2198 2115 150025.0 150025 6454272 100001 0 49999 100000 149999 0 0 0 54.9% -> 1 8 9.471735 4 11112 1173 5 1178 1168 77781.2 77781 1439328 66663 11111 0 33334 44445 0 0 0 44.6% -> 1 16 5.001569 4 5883 1176 2 1176 1170 41180.0 41180 762053 35296 5882 1 17648 23529 0 0 0 44.6% -> 1 32 2.587069 4 3031 1172 1 1173 1164 21216.0 21216 392619 18185 3031 1 9092 12121 0 0 0 44.8% -> 1 64 1.310747 4 1539 1174 3 1177 1171 10772.0 10772 199353 9233 1539 0 4616 6154 0 0 0 44.9% -> 1 256 0.335490 4 390 1162 2 1167 1161 2729.0 2729 50519 2338 390 0 1169 1558 0 0 0 45.7% -> 1 1024 0.081944 4 98 1196 21 1210 1162 685.0 685 12694 585 98 0 293 390 0 0 0 46.2% -> 1 4096 0.022266 6 25 1123 3 1125 1105 174.0 174 3238 149 24 0 74 98 0 0 0 41.9% -> 64 1 8.731741 4 98462 11276 45 11417 11231 54642.0 54640 6405470 23686 0 1538 4615 4615 0 0 0 80.2% -> 64 8 8.396247 4 88896 10588 19 10596 10560 55578.0 55578 6405971 24275 4166 0 5553 5553 0 0 0 77.6% -> 64 16 7.700995 4 80000 10388 88 10405 10221 51248.0 51248 5922500 22100 5000 0 4998 4998 0 0 0 76.4% -> 64 32 6.517276 4 66688 10232 31 10342 10201 42715.0 42715 4936359 19013 4164 0 4165 4165 0 
0 0 75.3% -> 64 64 4.898669 4 50016 10210 60 10291 10150 32039.0 32039 3702428 14110 3124 0 3125 3125 0 0 0 74.4% -> 64 256 1.969972 4 20032 10169 22 10173 10091 12831.0 12831 1482993 5660 1252 0 1250 1250 0 0 0 74.3% -> 64 1024 0.575180 4 5888 10237 84 10316 10028 3770.0 3770 435895 1656 368 0 366 366 0 0 0 74.6% -> 64 4096 0.158503 4 1600 10094 81 10195 10014 1023.0 1023 118449 460 100 0 98 98 0 0 0 73.5% Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com> Message-Id: <20210722180302.64675-1-raphaelsc@scylladb.com> (cherry picked from commit 9dce1e4b2b)
339 lines
12 KiB
C++
339 lines
12 KiB
C++
/*
|
|
* Copyright (C) 2017 ScyllaDB
|
|
*/
|
|
|
|
/*
|
|
* This file is part of Scylla.
|
|
*
|
|
* Scylla is free software: you can redistribute it and/or modify
|
|
* it under the terms of the GNU Affero General Public License as published by
|
|
* the Free Software Foundation, either version 3 of the License, or
|
|
* (at your option) any later version.
|
|
*
|
|
* Scylla is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
* GNU General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU General Public License
|
|
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
|
|
*/
|
|
|
|
#pragma once
|
|
|
|
#include <cassert>
#include <functional>
#include <memory>
#include <optional>
#include <type_traits>
#include <utility>
#include <vector>

#include <seastar/core/shared_future.hh>
#include <seastar/core/shared_ptr.hh>
#include <seastar/core/future.hh>
#include <seastar/core/bitops.hh>

#include <boost/intrusive/unordered_set.hpp>
#include <boost/iterator/transform_iterator.hpp>
#include <boost/lambda/bind.hpp>

#include "seastarx.hh"
|
|
|
|
namespace bi = boost::intrusive;
|
|
|
|
namespace utils {
|
|
|
|
/// Statistics policy whose hooks do nothing.
///
/// It is the default `Stats` argument of loading_shared_values. Each hook is a
/// static, non-throwing function with an empty body.
struct do_nothing_loading_shared_values_stats {
    /// Increase the number of times entry was found ready.
    static void inc_hits() noexcept {
    }

    /// Increase the number of times entry was not found.
    static void inc_misses() noexcept {
    }

    /// Increase the number of times entry was not ready (>= misses).
    static void inc_blocks() noexcept {
    }

    /// Increase the number of times entry was evicted.
    static void inc_evictions() noexcept {
    }
};
|
|
|
|
// Entries stay around as long as there is any live external reference (entry_ptr) to them.
|
|
// Supports asynchronous insertion, ensures that only one entry will be loaded.
|
|
// InitialBucketsCount is required to be greater than zero. Otherwise a constructor will throw an
|
|
// std::invalid_argument exception.
|
|
template<typename Key,
|
|
typename Tp,
|
|
typename Hash = std::hash<Key>,
|
|
typename EqualPred = std::equal_to<Key>,
|
|
typename Stats = do_nothing_loading_shared_values_stats,
|
|
size_t InitialBucketsCount = 16>
|
|
requires requires () {
|
|
Stats::inc_hits();
|
|
Stats::inc_misses();
|
|
Stats::inc_blocks();
|
|
Stats::inc_evictions();
|
|
}
|
|
class loading_shared_values {
|
|
public:
|
|
using key_type = Key;
|
|
using value_type = Tp;
|
|
static constexpr size_t initial_buckets_count = InitialBucketsCount;
|
|
|
|
private:
|
|
class entry : public bi::unordered_set_base_hook<bi::store_hash<true>>, public enable_lw_shared_from_this<entry> {
|
|
private:
|
|
loading_shared_values& _parent;
|
|
key_type _key;
|
|
std::optional<value_type> _val;
|
|
shared_promise<> _loaded;
|
|
|
|
public:
|
|
const key_type& key() const noexcept {
|
|
return _key;
|
|
}
|
|
|
|
const value_type& value() const noexcept {
|
|
return *_val;
|
|
}
|
|
|
|
value_type& value() noexcept {
|
|
return *_val;
|
|
}
|
|
|
|
/// \brief "Release" the object from the contained value.
|
|
/// After this call the state of the value kept inside this object is undefined and it may no longer be used.
|
|
///
|
|
/// \return The r-value reference to the value kept inside this object.
|
|
value_type&& release() {
|
|
return *std::move(_val);
|
|
}
|
|
|
|
void set_value(value_type new_val) {
|
|
_val.emplace(std::move(new_val));
|
|
}
|
|
|
|
shared_promise<>& loaded() {
|
|
return _loaded;
|
|
}
|
|
|
|
bool ready() const noexcept {
|
|
return bool(_val);
|
|
}
|
|
|
|
entry(loading_shared_values& parent, key_type k)
|
|
: _parent(parent), _key(std::move(k)) {}
|
|
|
|
~entry() {
|
|
_parent._set.erase(_parent._set.iterator_to(*this));
|
|
Stats::inc_evictions();
|
|
}
|
|
|
|
friend bool operator==(const entry& a, const entry& b){
|
|
return EqualPred()(a.key(), b.key());
|
|
}
|
|
|
|
friend std::size_t hash_value(const entry& v) {
|
|
return Hash()(v.key());
|
|
}
|
|
};
|
|
|
|
template<typename KeyType, typename KeyEqual>
|
|
struct key_eq {
|
|
bool operator()(const KeyType& k, const entry& c) const {
|
|
return KeyEqual()(k, c.key());
|
|
}
|
|
|
|
bool operator()(const entry& c, const KeyType& k) const {
|
|
return KeyEqual()(c.key(), k);
|
|
}
|
|
};
|
|
|
|
using set_type = bi::unordered_set<entry, bi::power_2_buckets<true>, bi::compare_hash<true>>;
|
|
using bi_set_bucket_traits = typename set_type::bucket_traits;
|
|
using set_iterator = typename set_type::iterator;
|
|
struct value_extractor_fn {
|
|
value_type& operator()(entry& e) const {
|
|
return e.value();
|
|
}
|
|
};
|
|
enum class shrinking_is_allowed { no, yes };
|
|
|
|
public:
|
|
using iterator = boost::transform_iterator<value_extractor_fn, set_iterator>;
|
|
|
|
public:
|
|
// Pointer to entry value
|
|
class entry_ptr {
|
|
lw_shared_ptr<entry> _e;
|
|
public:
|
|
using element_type = value_type;
|
|
entry_ptr() = default;
|
|
explicit entry_ptr(lw_shared_ptr<entry> e) : _e(std::move(e)) {}
|
|
entry_ptr& operator=(std::nullptr_t) noexcept {
|
|
_e = nullptr;
|
|
return *this;
|
|
}
|
|
explicit operator bool() const noexcept { return bool(_e); }
|
|
element_type& operator*() const noexcept { return _e->value(); }
|
|
element_type* operator->() const noexcept { return &_e->value(); }
|
|
|
|
/// \brief Get the wrapped value. Avoid the copy if this is the last reference to this value.
|
|
/// If this is the last reference then the wrapped value is going to be std::move()ed. Otherwise it's going to
|
|
/// be copied.
|
|
/// \return The wrapped value.
|
|
element_type release() {
|
|
auto res = _e.owned() ? _e->release() : _e->value();
|
|
_e = nullptr;
|
|
return res;
|
|
}
|
|
|
|
long use_count() const noexcept {
|
|
return _e ? _e.use_count() : 0;
|
|
}
|
|
|
|
friend class loading_shared_values;
|
|
};
|
|
|
|
private:
|
|
std::vector<typename set_type::bucket_type> _buckets;
|
|
set_type _set;
|
|
value_extractor_fn _value_extractor_fn;
|
|
|
|
public:
|
|
static const key_type& to_key(const entry_ptr& e_ptr) noexcept {
|
|
return e_ptr._e->key();
|
|
}
|
|
|
|
/// \throw std::invalid_argument if InitialBucketsCount is zero
|
|
loading_shared_values()
|
|
: _buckets(InitialBucketsCount)
|
|
, _set(bi_set_bucket_traits(_buckets.data(), _buckets.size()))
|
|
{
|
|
static_assert(noexcept(Stats::inc_evictions()), "Stats::inc_evictions must be non-throwing");
|
|
static_assert(noexcept(Stats::inc_hits()), "Stats::inc_hits must be non-throwing");
|
|
static_assert(noexcept(Stats::inc_misses()), "Stats::inc_misses must be non-throwing");
|
|
static_assert(noexcept(Stats::inc_blocks()), "Stats::inc_blocks must be non-throwing");
|
|
|
|
static_assert(InitialBucketsCount && ((InitialBucketsCount & (InitialBucketsCount - 1)) == 0), "Initial buckets count should be a power of two");
|
|
}
|
|
loading_shared_values(loading_shared_values&&) = default;
|
|
loading_shared_values(const loading_shared_values&) = delete;
|
|
~loading_shared_values() {
|
|
assert(!_set.size());
|
|
}
|
|
|
|
/// \brief
|
|
/// Returns a future which resolves with a shared pointer to the entry for the given key.
|
|
/// Always returns a valid pointer if succeeds.
|
|
///
|
|
/// If entry is missing, the loader is invoked. If entry is already loading, this invocation
|
|
/// will wait for prior loading to complete and use its result when it's done.
|
|
///
|
|
/// The loader object does not survive deferring, so the caller must deal with its liveness.
|
|
template<typename Loader>
|
|
future<entry_ptr> get_or_load(const key_type& key, Loader&& loader) noexcept {
|
|
static_assert(std::is_same<future<value_type>, typename futurize<std::result_of_t<Loader(const key_type&)>>::type>::value, "Bad Loader signature");
|
|
try {
|
|
auto i = _set.find(key, Hash(), key_eq<key_type, EqualPred>());
|
|
lw_shared_ptr<entry> e;
|
|
future<> f = make_ready_future<>();
|
|
if (i != _set.end()) {
|
|
e = i->shared_from_this();
|
|
// take a short cut if the value is ready
|
|
if (e->ready()) {
|
|
Stats::inc_hits();
|
|
return make_ready_future<entry_ptr>(entry_ptr(std::move(e)));
|
|
}
|
|
f = e->loaded().get_shared_future();
|
|
} else {
|
|
Stats::inc_misses();
|
|
e = make_lw_shared<entry>(*this, key);
|
|
rehash_before_insert();
|
|
_set.insert(*e);
|
|
// get_shared_future() may throw, so make sure to call it before invoking the loader(key)
|
|
f = e->loaded().get_shared_future();
|
|
// Future indirectly forwarded to `e`.
|
|
(void)futurize_invoke([&] { return loader(key); }).then_wrapped([e](future<value_type>&& val_fut) mutable {
|
|
if (val_fut.failed()) {
|
|
e->loaded().set_exception(val_fut.get_exception());
|
|
} else {
|
|
e->set_value(val_fut.get0());
|
|
e->loaded().set_value();
|
|
}
|
|
});
|
|
}
|
|
if (!f.available()) {
|
|
Stats::inc_blocks();
|
|
return f.then([e]() mutable {
|
|
return entry_ptr(std::move(e));
|
|
});
|
|
} else if (f.failed()) {
|
|
return make_exception_future<entry_ptr>(std::move(f).get_exception());
|
|
} else {
|
|
Stats::inc_hits();
|
|
return make_ready_future<entry_ptr>(entry_ptr(std::move(e)));
|
|
}
|
|
} catch (...) {
|
|
return make_exception_future<entry_ptr>(std::current_exception());
|
|
}
|
|
}
|
|
|
|
/// \brief Try to rehash the container so that the load factor is between 0.25 and 0.75.
|
|
/// \throw May throw if allocation of a new buckets array throws.
|
|
void rehash() {
|
|
rehash<shrinking_is_allowed::yes>(_set.size());
|
|
}
|
|
|
|
size_t buckets_count() const {
|
|
return _buckets.size();
|
|
}
|
|
|
|
size_t size() const {
|
|
return _set.size();
|
|
}
|
|
|
|
iterator end() {
|
|
return boost::make_transform_iterator(_set.end(), _value_extractor_fn);
|
|
}
|
|
|
|
iterator begin() {
|
|
return boost::make_transform_iterator(_set.begin(), _value_extractor_fn);
|
|
}
|
|
|
|
template<typename KeyType, typename KeyHasher, typename KeyEqual>
|
|
iterator find(const KeyType& key, KeyHasher key_hasher_func, KeyEqual key_equal_func) noexcept {
|
|
set_iterator it = _set.find(key, std::move(key_hasher_func), key_eq<KeyType, KeyEqual>());
|
|
if (it == _set.end() || !it->ready()) {
|
|
return end();
|
|
}
|
|
return boost::make_transform_iterator(it, _value_extractor_fn);
|
|
};
|
|
|
|
// keep the default non-templated overloads to ease on the compiler for specifications
|
|
// that do not require the templated find().
|
|
iterator find(const key_type& key) noexcept {
|
|
return find(key, Hash(), EqualPred());
|
|
}
|
|
|
|
private:
|
|
void rehash_before_insert() noexcept {
|
|
try {
|
|
rehash<shrinking_is_allowed::no>(_set.size() + 1);
|
|
} catch (...) {
|
|
// if rehashing fails - continue with the current buckets array
|
|
}
|
|
}
|
|
|
|
template <shrinking_is_allowed ShrinkingIsAllowed>
|
|
void rehash(size_t new_size) {
|
|
size_t new_buckets_count = 0;
|
|
|
|
// Try to keep the load factor between 0.25 (when shrinking is allowed) and 0.75.
|
|
if (ShrinkingIsAllowed == shrinking_is_allowed::yes && new_size < buckets_count() / 4) {
|
|
if (!new_size) {
|
|
new_buckets_count = 1;
|
|
} else {
|
|
new_buckets_count = size_t(1) << log2floor(new_size * 4);
|
|
}
|
|
} else if (new_size > 3 * buckets_count() / 4) {
|
|
new_buckets_count = buckets_count() * 2;
|
|
}
|
|
|
|
if (new_buckets_count < InitialBucketsCount) {
|
|
return;
|
|
}
|
|
|
|
std::vector<typename set_type::bucket_type> new_buckets(new_buckets_count);
|
|
_set.rehash(bi_set_bucket_traits(new_buckets.data(), new_buckets.size()));
|
|
_buckets = std::move(new_buckets);
|
|
}
|
|
};
|
|
|
|
}
|