Files
scylladb/utils/loading_shared_values.hh
Raphael S. Carvalho cabb7fbd3b sstables: Close promoted index readers when advancing to next summary index
Problem fixed on master since 5ed559c. So branch-4.5 and up aren't affected.

Index reader fails to close input streams of promoted index readers when advancing
to next summary entry, so Scylla can abort as a result of a stream being destroyed
while there were reads in progress. This problem was seen when row cache issued
a fast forward, so index reader was asked to advance to next summary entry while
the previous one still had reads in progress.
By closing the list of index readers when there's only one owner holding it,
the problem is safely fixed, because it cannot happen that an index_bound like
_lower_bound or _upper_bound will be left with a list that's already closed.

Fixes #9049.

test: mode(dev, debug).

No observable perf regression:

BEFORE:

   read    skip      time (s)   iterations     frags     frag/s    mad f/s    max f/s    min f/s    avg aio    aio      (KiB) blocked dropped  idx hit idx miss  idx blk    c hit   c miss    c blk    cpu
-> 1       0         8.168640            4    100000      12242        108      12262      11982    50032.2  50049    6403116   20707       0        0        8        8        0        0        0  83.3%
-> 1       1        22.257916            4     50000       2246          3       2249       2238   150025.0 150025    6454272  100001       0    49999   100000   149999        0        0        0  54.7%
-> 1       8         9.384961            4     11112       1184          5       1184       1178    77781.2  77781    1439328   66618   11111        1    33334    44444        0        0        0  44.0%
-> 1       16        4.976144            4      5883       1182          6       1184       1173    41180.0  41180     762053   35264    5882        0    17648    23530        0        0        0  44.1%
-> 1       32        2.582744            4      3031       1174          4       1175       1167    21216.0  21216     392619   18176    3031        0     9092    12122        0        0        0  43.8%
-> 1       64        1.308410            4      1539       1176          2       1178       1173    10772.0  10772     199353    9233    1539        0     4616     6154        0        0        0  44.0%
-> 1       256       0.331037            4       390       1178         12       1190       1165     2729.0   2729      50519    2338     390        0     1169     1558        0        0        0  44.0%
-> 1       1024      0.085108            4        98       1151          7       1155       1141      685.0    685      12694     587      98        0      293      390        0        0        0  42.9%
-> 1       4096      0.024393            6        25       1025          5       1029       1020      174.0    174       3238     149      25        0       74       98        0        0        0  37.4%
-> 64      1         8.765446            4     98462      11233         16      11236      11182    54642.0  54648    6405470   23632       1     1538     4615     4615        0        0        0  79.3%
-> 64      8         8.456430            4     88896      10512         48      10582      10464    55578.0  55578    6405971   24031    4166        0     5553     5553        0        0        0  77.3%
-> 64      16        7.798197            4     80000      10259        108      10299      10077    51248.0  51248    5922500   22160    4996        0     4998     4998        0        0        0  74.8%
-> 64      32        6.605148            4     66688      10096         64      10168      10033    42715.0  42715    4936359   18796    4164        0     4165     4165        0        0        0  75.5%
-> 64      64        4.933287            4     50016      10138         28      10189      10111    32039.0  32039    3702428   14106    3124        0     3125     3125        0        0        0  75.3%
-> 64      256       1.971701            4     20032      10160         57      10347      10103    12831.0  12831    1482993    5731    1252        0     1250     1250        0        0        0  74.1%
-> 64      1024      0.587026            4      5888      10030         84      10277       9946     3770.0   3770     435895    1635     368        0      366      366        0        0        0  74.6%
-> 64      4096      0.157401            4      1600      10165         69      10202       9698     1023.0   1023     118449     455     100        0       98       98        0        0        0  73.9%

AFTER:

   read    skip      time (s)   iterations     frags     frag/s    mad f/s    max f/s    min f/s    avg aio    aio      (KiB) blocked dropped  idx hit idx miss  idx blk    c hit   c miss    c blk    cpu
-> 1       0         8.191639            4    100000      12208         46      12279      12161    50031.2  50025    6403108   20243       0        0        0        0        0        0        0  87.0%
-> 1       1        22.933121            4     50000       2180         36       2198       2115   150025.0 150025    6454272  100001       0    49999   100000   149999        0        0        0  54.9%
-> 1       8         9.471735            4     11112       1173          5       1178       1168    77781.2  77781    1439328   66663   11111        0    33334    44445        0        0        0  44.6%
-> 1       16        5.001569            4      5883       1176          2       1176       1170    41180.0  41180     762053   35296    5882        1    17648    23529        0        0        0  44.6%
-> 1       32        2.587069            4      3031       1172          1       1173       1164    21216.0  21216     392619   18185    3031        1     9092    12121        0        0        0  44.8%
-> 1       64        1.310747            4      1539       1174          3       1177       1171    10772.0  10772     199353    9233    1539        0     4616     6154        0        0        0  44.9%
-> 1       256       0.335490            4       390       1162          2       1167       1161     2729.0   2729      50519    2338     390        0     1169     1558        0        0        0  45.7%
-> 1       1024      0.081944            4        98       1196         21       1210       1162      685.0    685      12694     585      98        0      293      390        0        0        0  46.2%
-> 1       4096      0.022266            6        25       1123          3       1125       1105      174.0    174       3238     149      24        0       74       98        0        0        0  41.9%
-> 64      1         8.731741            4     98462      11276         45      11417      11231    54642.0  54640    6405470   23686       0     1538     4615     4615        0        0        0  80.2%
-> 64      8         8.396247            4     88896      10588         19      10596      10560    55578.0  55578    6405971   24275    4166        0     5553     5553        0        0        0  77.6%
-> 64      16        7.700995            4     80000      10388         88      10405      10221    51248.0  51248    5922500   22100    5000        0     4998     4998        0        0        0  76.4%
-> 64      32        6.517276            4     66688      10232         31      10342      10201    42715.0  42715    4936359   19013    4164        0     4165     4165        0        0        0  75.3%
-> 64      64        4.898669            4     50016      10210         60      10291      10150    32039.0  32039    3702428   14110    3124        0     3125     3125        0        0        0  74.4%
-> 64      256       1.969972            4     20032      10169         22      10173      10091    12831.0  12831    1482993    5660    1252        0     1250     1250        0        0        0  74.3%
-> 64      1024      0.575180            4      5888      10237         84      10316      10028     3770.0   3770     435895    1656     368        0      366      366        0        0        0  74.6%
-> 64      4096      0.158503            4      1600      10094         81      10195      10014     1023.0   1023     118449     460     100        0       98       98        0        0        0  73.5%

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
Message-Id: <20210722180302.64675-1-raphaelsc@scylladb.com>
(cherry picked from commit 9dce1e4b2b)
2021-07-25 17:24:09 +03:00

339 lines
12 KiB
C++

/*
* Copyright (C) 2017 ScyllaDB
*/
/*
* This file is part of Scylla.
*
* Scylla is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Scylla is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
*/
#pragma once
#include <memory>
#include <type_traits>
#include <vector>
#include <seastar/core/shared_future.hh>
#include <seastar/core/shared_ptr.hh>
#include <seastar/core/future.hh>
#include <seastar/core/bitops.hh>
#include <boost/intrusive/unordered_set.hpp>
#include <boost/iterator/transform_iterator.hpp>
#include <boost/lambda/bind.hpp>
#include "seastarx.hh"
namespace bi = boost::intrusive;
namespace utils {
// Statistics policy whose hooks are all no-ops. It provides the four static,
// non-throwing hooks that loading_shared_values requires from its Stats parameter.
struct do_nothing_loading_shared_values_stats {
    // Increase the number of times entry was found ready
    static void inc_hits() noexcept {}

    // Increase the number of times entry was not found
    static void inc_misses() noexcept {}

    // Increase the number of times entry was not ready (>= misses)
    static void inc_blocks() noexcept {}

    // Increase the number of times entry was evicted
    static void inc_evictions() noexcept {}
};
// Entries stay around as long as there is any live external reference (entry_ptr) to them.
// Supports asynchronous insertion, ensures that only one entry will be loaded.
// InitialBucketsCount is required to be a non-zero power of two. This is enforced at compile
// time by a static_assert in the constructor; nothing is thrown at run time.
template<typename Key,
typename Tp,
typename Hash = std::hash<Key>,
typename EqualPred = std::equal_to<Key>,
typename Stats = do_nothing_loading_shared_values_stats,
size_t InitialBucketsCount = 16>
requires requires () {
Stats::inc_hits();
Stats::inc_misses();
Stats::inc_blocks();
Stats::inc_evictions();
}
class loading_shared_values {
public:
// Type of the lookup key (the Key template argument).
using key_type = Key;
// Type of the stored value (the Tp template argument).
using value_type = Tp;
// Number of buckets the container is constructed with (the InitialBucketsCount template argument).
static constexpr size_t initial_buckets_count = InitialBucketsCount;
private:
// A single slot of the container: the key, the (possibly not yet loaded) value and the
// promise that is resolved once loading finishes. An entry is linked into the parent's
// intrusive set by get_or_load() and unlinks itself from that set when it is destroyed.
class entry : public bi::unordered_set_base_hook<bi::store_hash<true>>, public enable_lw_shared_from_this<entry> {
private:
    loading_shared_values& _parent;   // container whose _set this entry is (to be) linked into
    key_type _key;
    std::optional<value_type> _val;   // engaged once set_value() has been called
    shared_promise<> _loaded;         // resolved (or failed) by the load started in get_or_load()

public:
    /// \return The key this entry was created for.
    const key_type& key() const noexcept {
        return _key;
    }

    /// \return The stored value. Must only be called when ready() is true.
    const value_type& value() const noexcept {
        return *_val;
    }

    /// \return The stored value. Must only be called when ready() is true.
    value_type& value() noexcept {
        return *_val;
    }

    /// \brief "Release" the object from the contained value.
    /// After this call the state of the value kept inside this object is undefined and it may no longer be used.
    ///
    /// \return The r-value reference to the value kept inside this object.
    value_type&& release() {
        return *std::move(_val);
    }

    /// \brief Store the loaded value; ready() returns true afterwards.
    void set_value(value_type new_val) {
        _val.emplace(std::move(new_val));
    }

    /// \return The promise that waiters of an in-progress load are attached to.
    shared_promise<>& loaded() {
        return _loaded;
    }

    /// \return true once a value has been stored with set_value().
    bool ready() const noexcept {
        return bool(_val);
    }

    /// \brief Create a not-yet-loaded, not-yet-linked entry for key \p k owned by \p parent.
    entry(loading_shared_values& parent, key_type k)
        : _parent(parent), _key(std::move(k)) {}

    ~entry() {
        // An entry can be destroyed without ever having been linked into the set (for
        // example if inserting it threw). Calling iterator_to()/erase() on an unlinked
        // node is invalid, so only unlink when the hook reports that it is linked.
        if (this->is_linked()) {
            _parent._set.erase(_parent._set.iterator_to(*this));
        }
        Stats::inc_evictions();
    }

    // Entries compare equal when their keys do, according to EqualPred.
    friend bool operator==(const entry& a, const entry& b){
        return EqualPred()(a.key(), b.key());
    }

    // Hash of an entry is the Hash of its key.
    friend std::size_t hash_value(const entry& v) {
        return Hash()(v.key());
    }
};
template<typename KeyType, typename KeyEqual>
struct key_eq {
bool operator()(const KeyType& k, const entry& c) const {
return KeyEqual()(k, c.key());
}
bool operator()(const entry& c, const KeyType& k) const {
return KeyEqual()(c.key(), k);
}
};
// Intrusive hash set that links the entries; configured with the power_2_buckets and
// compare_hash options (the constructor static_asserts that the initial bucket count is a power of two).
using set_type = bi::unordered_set<entry, bi::power_2_buckets<true>, bi::compare_hash<true>>;
// Bucket-traits type used to hand a buckets array (pointer + length) to the set.
using bi_set_bucket_traits = typename set_type::bucket_traits;
// Iterator over entries (as opposed to the public iterator, which yields values).
using set_iterator = typename set_type::iterator;
// Functor mapping an entry to a reference to its value; used as the transformation
// of the public iterator type.
struct value_extractor_fn {
    value_type& operator()(entry& item) const {
        value_type& payload = item.value();
        return payload;
    }
};
// Compile-time switch telling the private rehash<>() whether it may reduce the number of buckets.
enum class shrinking_is_allowed { no, yes };
public:
// Iterator over the stored values (not the entries): a set iterator wrapped so that
// dereferencing applies value_extractor_fn.
using iterator = boost::transform_iterator<value_extractor_fn, set_iterator>;
public:
// Shared handle to the value of an entry. A default-constructed (or nulled) handle is
// empty and must not be dereferenced.
class entry_ptr {
    lw_shared_ptr<entry> _e;

public:
    using element_type = value_type;

    entry_ptr() = default;

    explicit entry_ptr(lw_shared_ptr<entry> e) : _e(std::move(e)) {}

    // Drop this handle's reference, leaving it empty.
    entry_ptr& operator=(std::nullptr_t) noexcept {
        _e = nullptr;
        return *this;
    }

    // True when the handle refers to an entry.
    explicit operator bool() const noexcept {
        return static_cast<bool>(_e);
    }

    element_type& operator*() const noexcept {
        return _e->value();
    }

    element_type* operator->() const noexcept {
        return &_e->value();
    }

    /// \brief Get the wrapped value. Avoid the copy if this is the last reference to this value.
    /// If this is the last reference then the wrapped value is going to be std::move()ed. Otherwise it's going to
    /// be copied. The handle is empty after the call.
    /// \return The wrapped value.
    element_type release() {
        if (_e.owned()) {
            // Sole owner: steal the value out of the entry.
            auto res = _e->release();
            _e = nullptr;
            return res;
        }
        // Other owners exist: they keep the original, we return a copy.
        auto res = _e->value();
        _e = nullptr;
        return res;
    }

    // Number of references to the underlying entry; 0 for an empty handle.
    long use_count() const noexcept {
        if (!_e) {
            return 0;
        }
        return _e.use_count();
    }

    friend class loading_shared_values;
};
private:
// Buckets array backing _set. The constructor initializes _set from this array's storage,
// and rehash<>() replaces it after re-pointing _set at the new array.
std::vector<typename set_type::bucket_type> _buckets;
// Intrusive set linking every live entry (loaded or still loading).
set_type _set;
// Functor instance handed to the transform iterators returned by begin()/end()/find().
value_extractor_fn _value_extractor_fn;
public:
// Returns the key of the entry that e_ptr refers to. e_ptr must not be empty.
static const key_type& to_key(const entry_ptr& e_ptr) noexcept {
    const entry& referenced = *e_ptr._e;
    return referenced.key();
}
/// Constructs an empty container with InitialBucketsCount buckets.
///
/// InitialBucketsCount must be a non-zero power of two and every Stats hook must be
/// noexcept. Both requirements are checked at compile time by the static_asserts below;
/// a bad InitialBucketsCount is a compilation error, not a run-time exception.
loading_shared_values()
: _buckets(InitialBucketsCount)
, _set(bi_set_bucket_traits(_buckets.data(), _buckets.size()))
{
static_assert(noexcept(Stats::inc_evictions()), "Stats::inc_evictions must be non-throwing");
static_assert(noexcept(Stats::inc_hits()), "Stats::inc_hits must be non-throwing");
static_assert(noexcept(Stats::inc_misses()), "Stats::inc_misses must be non-throwing");
static_assert(noexcept(Stats::inc_blocks()), "Stats::inc_blocks must be non-throwing");
static_assert(InitialBucketsCount && ((InitialBucketsCount & (InitialBucketsCount - 1)) == 0), "Initial buckets count should be a power of two");
}
// Movable, not copyable.
// NOTE(review): every entry keeps a reference to its parent container (entry::_parent) and
// uses it in its destructor, so moving a container that still holds entries looks unsafe —
// confirm that containers are only moved while empty.
loading_shared_values(loading_shared_values&&) = default;
loading_shared_values(const loading_shared_values&) = delete;
// The container must be empty when destroyed: all entries must already have been released.
~loading_shared_values() {
assert(!_set.size());
}
/// \brief
/// Returns a future which resolves with a shared pointer to the entry for the given key.
/// Always returns a valid pointer if succeeds.
///
/// If entry is missing, the loader is invoked. If entry is already loading, this invocation
/// will wait for prior loading to complete and use its result when it's done.
///
/// The loader object does not survive deferring, so the caller must deal with its liveness.
///
/// \param key key to look up; on a miss it is copied into the new entry and passed to the loader
/// \param loader callable invoked as loader(key); its futurized result type must be future<value_type>
/// \return a future resolving to a non-empty entry_ptr, or failing with the exception produced
///         by the load (or by setting the load up) - this function itself never throws
template<typename Loader>
future<entry_ptr> get_or_load(const key_type& key, Loader&& loader) noexcept {
    // std::result_of was removed in C++20; std::invoke_result_t is its direct replacement.
    static_assert(std::is_same<future<value_type>, typename futurize<std::invoke_result_t<Loader, const key_type&>>::type>::value, "Bad Loader signature");
    try {
        auto i = _set.find(key, Hash(), key_eq<key_type, EqualPred>());
        lw_shared_ptr<entry> e;
        future<> f = make_ready_future<>();
        if (i != _set.end()) {
            e = i->shared_from_this();
            // take a short cut if the value is ready
            if (e->ready()) {
                Stats::inc_hits();
                return make_ready_future<entry_ptr>(entry_ptr(std::move(e)));
            }
            // Entry exists but is still loading: wait for the load already in progress.
            f = e->loaded().get_shared_future();
        } else {
            Stats::inc_misses();
            e = make_lw_shared<entry>(*this, key);
            rehash_before_insert();
            _set.insert(*e);
            // get_shared_future() may throw, so make sure to call it before invoking the loader(key)
            f = e->loaded().get_shared_future();
            // Future indirectly forwarded to `e`.
            // The continuation holds a reference to `e`, keeping the entry alive until the load completes.
            (void)futurize_invoke([&] { return loader(key); }).then_wrapped([e](future<value_type>&& val_fut) mutable {
                if (val_fut.failed()) {
                    e->loaded().set_exception(val_fut.get_exception());
                } else {
                    e->set_value(val_fut.get0());
                    e->loaded().set_value();
                }
            });
        }
        if (!f.available()) {
            // The value is not there yet: the caller has to wait for the load.
            Stats::inc_blocks();
            return f.then([e]() mutable {
                return entry_ptr(std::move(e));
            });
        } else if (f.failed()) {
            return make_exception_future<entry_ptr>(std::move(f).get_exception());
        } else {
            // The load completed synchronously.
            Stats::inc_hits();
            return make_ready_future<entry_ptr>(entry_ptr(std::move(e)));
        }
    } catch (...) {
        // Keep the noexcept promise: report any setup failure through the returned future.
        return make_exception_future<entry_ptr>(std::current_exception());
    }
}
/// \brief Try to rehash the container, based on its current number of entries, so that
/// the load factor is between 0.25 and 0.75. Shrinking is allowed.
/// \throw May throw if allocation of a new buckets array throws.
void rehash() {
    const size_t current_entries = _set.size();
    rehash<shrinking_is_allowed::yes>(current_entries);
}
size_t buckets_count() const {
return _buckets.size();
}
// Number of entries currently linked into the set.
size_t size() const {
    const size_t linked_entries = _set.size();
    return linked_entries;
}
// Past-the-end value iterator.
iterator end() {
    return iterator(_set.end(), _value_extractor_fn);
}
// Value iterator positioned at the first entry of the set.
iterator begin() {
    return iterator(_set.begin(), _value_extractor_fn);
}
/// \brief Look up a loaded value by a key of arbitrary type.
///
/// \param key the key to look for
/// \param key_hasher_func hasher applied to \p key
/// \param key_equal_func predicate comparing \p key with a stored key_type key; it is
///        invoked with the arguments in either order
/// \return an iterator to the value, or end() if there is no entry for the key or the
///         entry's value has not been loaded yet
template<typename KeyType, typename KeyHasher, typename KeyEqual>
iterator find(const KeyType& key, KeyHasher key_hasher_func, KeyEqual key_equal_func) noexcept {
    // Adapter comparing a bare key against an entry's key through the predicate instance
    // supplied by the caller (previously the argument was ignored and a
    // default-constructed KeyEqual was used instead).
    struct bound_key_eq {
        KeyEqual& eq;
        bool operator()(const KeyType& k, const entry& c) const {
            return eq(k, c.key());
        }
        bool operator()(const entry& c, const KeyType& k) const {
            return eq(c.key(), k);
        }
    };
    set_iterator it = _set.find(key, std::move(key_hasher_func), bound_key_eq{key_equal_func});
    // Entries that are still loading are treated as absent.
    if (it == _set.end() || !it->ready()) {
        return end();
    }
    return boost::make_transform_iterator(it, _value_extractor_fn);
}
// Non-templated convenience overload using the container's own Hash and EqualPred.
// It is kept so that instantiations which never need the templated find() with
// custom hasher/predicate types stay cheap for the compiler.
iterator find(const key_type& key) noexcept {
    return find<key_type, Hash, EqualPred>(key, Hash(), EqualPred());
}
private:
// Best-effort growth of the buckets array ahead of inserting one more entry.
// Never shrinks and never throws.
void rehash_before_insert() noexcept {
    try {
        const size_t size_after_insert = _set.size() + 1;
        rehash<shrinking_is_allowed::no>(size_after_insert);
    } catch (...) {
        // if rehashing fails - continue with the current buckets array
    }
}
template <shrinking_is_allowed ShrinkingIsAllowed>
void rehash(size_t new_size) {
size_t new_buckets_count = 0;
// Try to keep the load factor between 0.25 (when shrinking is allowed) and 0.75.
if (ShrinkingIsAllowed == shrinking_is_allowed::yes && new_size < buckets_count() / 4) {
if (!new_size) {
new_buckets_count = 1;
} else {
new_buckets_count = size_t(1) << log2floor(new_size * 4);
}
} else if (new_size > 3 * buckets_count() / 4) {
new_buckets_count = buckets_count() * 2;
}
if (new_buckets_count < InitialBucketsCount) {
return;
}
std::vector<typename set_type::bucket_type> new_buckets(new_buckets_count);
_set.rehash(bi_set_bucket_traits(new_buckets.data(), new_buckets.size()));
_buckets = std::move(new_buckets);
}
};
}