mirror of
https://github.com/scylladb/scylladb.git
synced 2026-04-26 19:35:12 +00:00
Compare commits
86 Commits
scylla-2.0
...
next-2.0
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
f19fbc3058 | ||
|
|
8eddb28954 | ||
|
|
5aaa8031a2 | ||
|
|
3d50e7077a | ||
|
|
4063e92f57 | ||
|
|
b6de30bb87 | ||
|
|
c23e3a1eda | ||
|
|
2732b6cf1d | ||
|
|
49722e74da | ||
|
|
ba7623ac55 | ||
|
|
9db2ff36f2 | ||
|
|
378029b8da | ||
|
|
2b7644dc36 | ||
|
|
4bd931ba59 | ||
|
|
78eebe74c7 | ||
|
|
30e21afb13 | ||
|
|
e8616b10e5 | ||
|
|
0cb842dde1 | ||
|
|
7945f5edda | ||
|
|
9c2a328000 | ||
|
|
98498c679b | ||
|
|
b147b5854b | ||
|
|
226095f4db | ||
|
|
3dd1f68590 | ||
|
|
e08e4c75d7 | ||
|
|
8bcb4e7439 | ||
|
|
97369adb1c | ||
|
|
c89ead5e55 | ||
|
|
46fd96d877 | ||
|
|
19806fc056 | ||
|
|
0b314a745f | ||
|
|
73870751d9 | ||
|
|
c8983034c0 | ||
|
|
77d14a6256 | ||
|
|
21259bcfb3 | ||
|
|
9f02b44537 | ||
|
|
9dc7a63014 | ||
|
|
5dcef25f6f | ||
|
|
f763bf7f0d | ||
|
|
9af9ca0d60 | ||
|
|
fbc30221b5 | ||
|
|
d17aa3cd1c | ||
|
|
f7e79322f1 | ||
|
|
e31331bdb2 | ||
|
|
6873e26060 | ||
|
|
24bee2c887 | ||
|
|
8bba15a709 | ||
|
|
ad68d3ecfd | ||
|
|
707ac9242e | ||
|
|
dae0563ff8 | ||
|
|
7ba50b87f1 | ||
|
|
1da277c78e | ||
|
|
36dfd4b990 | ||
|
|
c5ce2765dc | ||
|
|
25ffdf527b | ||
|
|
dbc6d9fe01 | ||
|
|
915683bddd | ||
|
|
7ca8988d0e | ||
|
|
383d7e6c91 | ||
|
|
7bef696ee5 | ||
|
|
a603111a85 | ||
|
|
d5884d3c7c | ||
|
|
e6cb685178 | ||
|
|
cd19e5885a | ||
|
|
f367031016 | ||
|
|
7ae67331ad | ||
|
|
0e6561169b | ||
|
|
5b3aa8e90d | ||
|
|
db9d502f82 | ||
|
|
cde39bffd0 | ||
|
|
0fbcc852a5 | ||
|
|
16d5f68886 | ||
|
|
91540c8181 | ||
|
|
eaa8ed929f | ||
|
|
5ba1621716 | ||
|
|
b4f515035a | ||
|
|
d55e3f6a7f | ||
|
|
07b039feab | ||
|
|
35b7353efd | ||
|
|
200e01cc31 | ||
|
|
b5c4cf2d87 | ||
|
|
f96cb361aa | ||
|
|
bd59d7c968 | ||
|
|
9d923a61e1 | ||
|
|
0b23bcbe29 | ||
|
|
b1899f000a |
@@ -1,6 +1,6 @@
|
||||
#!/bin/sh
|
||||
|
||||
VERSION=2.0.1
|
||||
VERSION=2.0.4
|
||||
|
||||
if test -f version
|
||||
then
|
||||
|
||||
@@ -114,7 +114,7 @@ struct hash<auth::authenticated_user> {
|
||||
|
||||
class auth::auth::permissions_cache {
|
||||
public:
|
||||
typedef utils::loading_cache<std::pair<authenticated_user, data_resource>, permission_set, utils::tuple_hash> cache_type;
|
||||
typedef utils::loading_cache<std::pair<authenticated_user, data_resource>, permission_set, utils::loading_cache_reload_enabled::yes, utils::simple_entry_size<permission_set>, utils::tuple_hash> cache_type;
|
||||
typedef typename cache_type::key_type key_type;
|
||||
|
||||
permissions_cache()
|
||||
|
||||
@@ -70,7 +70,7 @@ public:
|
||||
{
|
||||
if (!with_static_row) {
|
||||
if (_current == _end) {
|
||||
_current_start = _current_end = position_in_partition_view::after_all_clustered_rows();
|
||||
_current_start = position_in_partition_view::before_all_clustered_rows();
|
||||
} else {
|
||||
_current_start = position_in_partition_view::for_range_start(*_current);
|
||||
_current_end = position_in_partition_view::for_range_end(*_current);
|
||||
|
||||
@@ -241,7 +241,7 @@ public:
|
||||
using component_view = std::pair<bytes_view, eoc>;
|
||||
private:
|
||||
template<typename Value, typename = std::enable_if_t<!std::is_same<const data_value, std::decay_t<Value>>::value>>
|
||||
static size_t size(Value& val) {
|
||||
static size_t size(const Value& val) {
|
||||
return val.size();
|
||||
}
|
||||
static size_t size(const data_value& val) {
|
||||
@@ -445,17 +445,16 @@ public:
|
||||
return _is_compound;
|
||||
}
|
||||
|
||||
// The following factory functions assume this composite is a compound value.
|
||||
template <typename ClusteringElement>
|
||||
static composite from_clustering_element(const schema& s, const ClusteringElement& ce) {
|
||||
return serialize_value(ce.components(s));
|
||||
return serialize_value(ce.components(s), s.is_compound());
|
||||
}
|
||||
|
||||
static composite from_exploded(const std::vector<bytes_view>& v, eoc marker = eoc::none) {
|
||||
static composite from_exploded(const std::vector<bytes_view>& v, bool is_compound, eoc marker = eoc::none) {
|
||||
if (v.size() == 0) {
|
||||
return composite(bytes(size_t(1), bytes::value_type(marker)));
|
||||
return composite(bytes(size_t(1), bytes::value_type(marker)), is_compound);
|
||||
}
|
||||
return serialize_value(v, true, marker);
|
||||
return serialize_value(v, is_compound, marker);
|
||||
}
|
||||
|
||||
static composite static_prefix(const schema& s) {
|
||||
|
||||
20
configure.py
20
configure.py
@@ -238,6 +238,7 @@ scylla_tests = [
|
||||
'tests/view_schema_test',
|
||||
'tests/counter_test',
|
||||
'tests/cell_locker_test',
|
||||
'tests/loading_cache_test',
|
||||
]
|
||||
|
||||
apps = [
|
||||
@@ -730,6 +731,9 @@ if not try_compile(compiler=args.cxx, source='''\
|
||||
print('Installed boost version too old. Please update {}.'.format(pkgname("boost-devel")))
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
has_sanitize_address_use_after_scope = try_compile(compiler=args.cxx, flags=['-fsanitize-address-use-after-scope'], source='int f() {}')
|
||||
|
||||
defines = ' '.join(['-D' + d for d in defines])
|
||||
|
||||
globals().update(vars(args))
|
||||
@@ -863,7 +867,7 @@ with open(buildfile, 'w') as f:
|
||||
f.write(textwrap.dedent('''\
|
||||
cxxflags_{mode} = -I. -I $builddir/{mode}/gen -I seastar -I seastar/build/{mode}/gen
|
||||
rule cxx.{mode}
|
||||
command = $cxx -MD -MT $out -MF $out.d {seastar_cflags} $cxxflags $cxxflags_{mode} -c -o $out $in
|
||||
command = $cxx -MD -MT $out -MF $out.d {seastar_cflags} $cxxflags $cxxflags_{mode} $obj_cxxflags -c -o $out $in
|
||||
description = CXX $out
|
||||
depfile = $out.d
|
||||
rule link.{mode}
|
||||
@@ -881,7 +885,16 @@ with open(buildfile, 'w') as f:
|
||||
command = thrift -gen cpp:cob_style -out $builddir/{mode}/gen $in
|
||||
description = THRIFT $in
|
||||
rule antlr3.{mode}
|
||||
command = sed -e '/^#if 0/,/^#endif/d' $in > $builddir/{mode}/gen/$in && antlr3 $builddir/{mode}/gen/$in && sed -i 's/^\\( *\)\\(ImplTraits::CommonTokenType\\* [a-zA-Z0-9_]* = NULL;\\)$$/\\1const \\2/' build/{mode}/gen/${{stem}}Parser.cpp
|
||||
# We replace many local `ExceptionBaseType* ex` variables with a single function-scope one.
|
||||
# Because we add such a variable to every function, and because `ExceptionBaseType` is not a global
|
||||
# name, we also add a global typedef to avoid compilation errors.
|
||||
command = sed -e '/^#if 0/,/^#endif/d' $in > $builddir/{mode}/gen/$in $
|
||||
&& antlr3 $builddir/{mode}/gen/$in $
|
||||
&& sed -i -e 's/^\\( *\)\\(ImplTraits::CommonTokenType\\* [a-zA-Z0-9_]* = NULL;\\)$$/\\1const \\2/' $
|
||||
-e '1i using ExceptionBaseType = int;' $
|
||||
-e 's/^{{/{{ ExceptionBaseType\* ex = nullptr;/; $
|
||||
s/ExceptionBaseType\* ex = new/ex = new/' $
|
||||
build/{mode}/gen/${{stem}}Parser.cpp
|
||||
description = ANTLR3 $in
|
||||
''').format(mode = mode, **modeval))
|
||||
f.write('build {mode}: phony {artifacts}\n'.format(mode = mode,
|
||||
@@ -998,6 +1011,9 @@ with open(buildfile, 'w') as f:
|
||||
for cc in grammar.sources('$builddir/{}/gen'.format(mode)):
|
||||
obj = cc.replace('.cpp', '.o')
|
||||
f.write('build {}: cxx.{} {} || {}\n'.format(obj, mode, cc, ' '.join(serializers)))
|
||||
if cc.endswith('Parser.cpp') and has_sanitize_address_use_after_scope:
|
||||
# Parsers end up using huge amounts of stack space and overflowing their stack
|
||||
f.write(' obj_cxxflags = -fno-sanitize-address-use-after-scope\n')
|
||||
f.write('build seastar/build/{mode}/libseastar.a seastar/build/{mode}/apps/iotune/iotune seastar/build/{mode}/gen/http/request_parser.hh seastar/build/{mode}/gen/http/http_response_parser.hh: ninja {seastar_deps}\n'
|
||||
.format(**locals()))
|
||||
f.write(' pool = seastar_pool\n')
|
||||
|
||||
171
cql3/prepared_statements_cache.hh
Normal file
171
cql3/prepared_statements_cache.hh
Normal file
@@ -0,0 +1,171 @@
|
||||
/*
|
||||
* Copyright (C) 2017 ScyllaDB
|
||||
*
|
||||
* Modified by ScyllaDB
|
||||
*/
|
||||
|
||||
/*
|
||||
* This file is part of Scylla.
|
||||
*
|
||||
* Scylla is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Affero General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* Scylla is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "utils/loading_cache.hh"
|
||||
#include "cql3/statements/prepared_statement.hh"
|
||||
|
||||
namespace cql3 {
|
||||
|
||||
using prepared_cache_entry = std::unique_ptr<statements::prepared_statement>;
|
||||
|
||||
struct prepared_cache_entry_size {
|
||||
size_t operator()(const prepared_cache_entry& val) {
|
||||
// TODO: improve the size approximation
|
||||
return 10000;
|
||||
}
|
||||
};
|
||||
|
||||
// Identifier types of prepared statements, one per client protocol.
using cql_prepared_id_type = bytes;
using thrift_prepared_id_type = int32_t;
|
||||
|
||||
/// \brief The key of the prepared statements cache
|
||||
///
|
||||
/// We are going to store the CQL and Thrift prepared statements in the same cache, therefore we need to generate a key
|
||||
/// that is going to be unique in both cases. Thrift uses an int32_t as a prepared statement ID, CQL uses an MD5 digest.
|
||||
///
|
||||
/// We are going to use an std::pair<CQL_PREP_ID_TYPE, int64_t> as a key. For CQL statements we will use {CQL_PREP_ID, std::numeric_limits<int64_t>::max()} as a key
|
||||
/// and for Thrift - {CQL_PREP_ID_TYPE(), THRIFT_PREP_ID} (a default-constructed CQL ID). This way CQL and Thrift keys' values will never collide.
|
||||
class prepared_cache_key_type {
|
||||
public:
|
||||
using cache_key_type = std::pair<cql_prepared_id_type, int64_t>;
|
||||
|
||||
private:
|
||||
cache_key_type _key;
|
||||
|
||||
public:
|
||||
prepared_cache_key_type() = default;
|
||||
explicit prepared_cache_key_type(cql_prepared_id_type cql_id) : _key(std::move(cql_id), std::numeric_limits<int64_t>::max()) {}
|
||||
explicit prepared_cache_key_type(thrift_prepared_id_type thrift_id) : _key(cql_prepared_id_type(), thrift_id) {}
|
||||
|
||||
cache_key_type& key() { return _key; }
|
||||
const cache_key_type& key() const { return _key; }
|
||||
|
||||
static const cql_prepared_id_type& cql_id(const prepared_cache_key_type& key) {
|
||||
return key.key().first;
|
||||
}
|
||||
static thrift_prepared_id_type thrift_id(const prepared_cache_key_type& key) {
|
||||
return key.key().second;
|
||||
}
|
||||
};
|
||||
|
||||
class prepared_statements_cache {
|
||||
public:
|
||||
struct stats {
|
||||
uint64_t prepared_cache_evictions = 0;
|
||||
};
|
||||
|
||||
static stats& shard_stats() {
|
||||
static thread_local stats _stats;
|
||||
return _stats;
|
||||
}
|
||||
|
||||
struct prepared_cache_stats_updater {
|
||||
static void inc_hits() noexcept {}
|
||||
static void inc_misses() noexcept {}
|
||||
static void inc_blocks() noexcept {}
|
||||
static void inc_evictions() noexcept {
|
||||
++shard_stats().prepared_cache_evictions;
|
||||
}
|
||||
};
|
||||
|
||||
private:
|
||||
using cache_key_type = typename prepared_cache_key_type::cache_key_type;
|
||||
using cache_type = utils::loading_cache<cache_key_type, prepared_cache_entry, utils::loading_cache_reload_enabled::no, prepared_cache_entry_size, utils::tuple_hash, std::equal_to<cache_key_type>, prepared_cache_stats_updater>;
|
||||
using cache_value_ptr = typename cache_type::value_ptr;
|
||||
using cache_iterator = typename cache_type::iterator;
|
||||
using checked_weak_ptr = typename statements::prepared_statement::checked_weak_ptr;
|
||||
struct value_extractor_fn {
|
||||
checked_weak_ptr operator()(prepared_cache_entry& e) const {
|
||||
return e->checked_weak_from_this();
|
||||
}
|
||||
};
|
||||
|
||||
static const std::chrono::minutes entry_expiry;
|
||||
|
||||
public:
|
||||
using key_type = prepared_cache_key_type;
|
||||
using value_type = checked_weak_ptr;
|
||||
using statement_is_too_big = typename cache_type::entry_is_too_big;
|
||||
/// \note both iterator::reference and iterator::value_type are checked_weak_ptr
|
||||
using iterator = boost::transform_iterator<value_extractor_fn, cache_iterator>;
|
||||
|
||||
private:
|
||||
cache_type _cache;
|
||||
value_extractor_fn _value_extractor_fn;
|
||||
|
||||
public:
|
||||
prepared_statements_cache(logging::logger& logger)
|
||||
: _cache(memory::stats().total_memory() / 256, entry_expiry, logger)
|
||||
{}
|
||||
|
||||
template <typename LoadFunc>
|
||||
future<value_type> get(const key_type& key, LoadFunc&& load) {
|
||||
return _cache.get_ptr(key.key(), [load = std::forward<LoadFunc>(load)] (const cache_key_type&) { return load(); }).then([] (cache_value_ptr v_ptr) {
|
||||
return make_ready_future<value_type>((*v_ptr)->checked_weak_from_this());
|
||||
});
|
||||
}
|
||||
|
||||
iterator find(const key_type& key) {
|
||||
return boost::make_transform_iterator(_cache.find(key.key()), _value_extractor_fn);
|
||||
}
|
||||
|
||||
iterator end() {
|
||||
return boost::make_transform_iterator(_cache.end(), _value_extractor_fn);
|
||||
}
|
||||
|
||||
iterator begin() {
|
||||
return boost::make_transform_iterator(_cache.begin(), _value_extractor_fn);
|
||||
}
|
||||
|
||||
template <typename Pred>
|
||||
void remove_if(Pred&& pred) {
|
||||
static_assert(std::is_same<bool, std::result_of_t<Pred(::shared_ptr<cql_statement>)>>::value, "Bad Pred signature");
|
||||
|
||||
_cache.remove_if([&pred] (const prepared_cache_entry& e) {
|
||||
return pred(e->statement);
|
||||
});
|
||||
}
|
||||
|
||||
size_t size() const {
|
||||
return _cache.size();
|
||||
}
|
||||
|
||||
size_t memory_footprint() const {
|
||||
return _cache.memory_footprint();
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
namespace std { // for prepared_statements_cache log printouts
|
||||
inline std::ostream& operator<<(std::ostream& os, const typename cql3::prepared_cache_key_type::cache_key_type& p) {
|
||||
os << "{cql_id: " << p.first << ", thrift_id: " << p.second << "}";
|
||||
return os;
|
||||
}
|
||||
|
||||
inline std::ostream& operator<<(std::ostream& os, const cql3::prepared_cache_key_type& p) {
|
||||
os << p.key();
|
||||
return os;
|
||||
}
|
||||
}
|
||||
@@ -57,11 +57,14 @@ using namespace statements;
|
||||
using namespace cql_transport::messages;
|
||||
|
||||
logging::logger log("query_processor");
|
||||
logging::logger prep_cache_log("prepared_statements_cache");
|
||||
|
||||
distributed<query_processor> _the_query_processor;
|
||||
|
||||
const sstring query_processor::CQL_VERSION = "3.3.1";
|
||||
|
||||
const std::chrono::minutes prepared_statements_cache::entry_expiry = std::chrono::minutes(60);
|
||||
|
||||
class query_processor::internal_state {
|
||||
service::query_state _qs;
|
||||
public:
|
||||
@@ -95,6 +98,7 @@ query_processor::query_processor(distributed<service::storage_proxy>& proxy,
|
||||
, _proxy(proxy)
|
||||
, _db(db)
|
||||
, _internal_state(new internal_state())
|
||||
, _prepared_cache(prep_cache_log)
|
||||
{
|
||||
namespace sm = seastar::metrics;
|
||||
|
||||
@@ -130,6 +134,15 @@ query_processor::query_processor(distributed<service::storage_proxy>& proxy,
|
||||
|
||||
sm::make_derive("batches_unlogged_from_logged", _cql_stats.batches_unlogged_from_logged,
|
||||
sm::description("Counts a total number of LOGGED batches that were executed as UNLOGGED batches.")),
|
||||
|
||||
sm::make_derive("prepared_cache_evictions", [] { return prepared_statements_cache::shard_stats().prepared_cache_evictions; },
|
||||
sm::description("Counts a number of prepared statements cache entries evictions.")),
|
||||
|
||||
sm::make_gauge("prepared_cache_size", [this] { return _prepared_cache.size(); },
|
||||
sm::description("A number of entries in the prepared statements cache.")),
|
||||
|
||||
sm::make_gauge("prepared_cache_memory_footprint", [this] { return _prepared_cache.memory_footprint(); },
|
||||
sm::description("Size (in bytes) of the prepared statements cache.")),
|
||||
});
|
||||
|
||||
service::get_local_migration_manager().register_listener(_migration_subscriber.get());
|
||||
@@ -197,31 +210,21 @@ query_processor::process_statement(::shared_ptr<cql_statement> statement,
|
||||
}
|
||||
|
||||
future<::shared_ptr<cql_transport::messages::result_message::prepared>>
|
||||
query_processor::prepare(const std::experimental::string_view& query_string, service::query_state& query_state)
|
||||
query_processor::prepare(sstring query_string, service::query_state& query_state)
|
||||
{
|
||||
auto& client_state = query_state.get_client_state();
|
||||
return prepare(query_string, client_state, client_state.is_thrift());
|
||||
return prepare(std::move(query_string), client_state, client_state.is_thrift());
|
||||
}
|
||||
|
||||
future<::shared_ptr<cql_transport::messages::result_message::prepared>>
|
||||
query_processor::prepare(const std::experimental::string_view& query_string,
|
||||
const service::client_state& client_state,
|
||||
bool for_thrift)
|
||||
query_processor::prepare(sstring query_string, const service::client_state& client_state, bool for_thrift)
|
||||
{
|
||||
auto existing = get_stored_prepared_statement(query_string, client_state.get_raw_keyspace(), for_thrift);
|
||||
if (existing) {
|
||||
return make_ready_future<::shared_ptr<cql_transport::messages::result_message::prepared>>(existing);
|
||||
using namespace cql_transport::messages;
|
||||
if (for_thrift) {
|
||||
return prepare_one<result_message::prepared::thrift>(std::move(query_string), client_state, compute_thrift_id, prepared_cache_key_type::thrift_id);
|
||||
} else {
|
||||
return prepare_one<result_message::prepared::cql>(std::move(query_string), client_state, compute_id, prepared_cache_key_type::cql_id);
|
||||
}
|
||||
|
||||
return futurize<::shared_ptr<cql_transport::messages::result_message::prepared>>::apply([this, &query_string, &client_state, for_thrift] {
|
||||
auto prepared = get_statement(query_string, client_state);
|
||||
auto bound_terms = prepared->statement->get_bound_terms();
|
||||
if (bound_terms > std::numeric_limits<uint16_t>::max()) {
|
||||
throw exceptions::invalid_request_exception(sprint("Too many markers(?). %d markers exceed the allowed maximum of %d", bound_terms, std::numeric_limits<uint16_t>::max()));
|
||||
}
|
||||
assert(bound_terms == prepared->bound_names.size());
|
||||
return store_prepared_statement(query_string, client_state.get_raw_keyspace(), std::move(prepared), for_thrift);
|
||||
});
|
||||
}
|
||||
|
||||
::shared_ptr<cql_transport::messages::result_message::prepared>
|
||||
@@ -229,50 +232,11 @@ query_processor::get_stored_prepared_statement(const std::experimental::string_v
|
||||
const sstring& keyspace,
|
||||
bool for_thrift)
|
||||
{
|
||||
using namespace cql_transport::messages;
|
||||
if (for_thrift) {
|
||||
auto statement_id = compute_thrift_id(query_string, keyspace);
|
||||
auto it = _thrift_prepared_statements.find(statement_id);
|
||||
if (it == _thrift_prepared_statements.end()) {
|
||||
return ::shared_ptr<result_message::prepared>();
|
||||
}
|
||||
return ::make_shared<result_message::prepared::thrift>(statement_id, it->second->checked_weak_from_this());
|
||||
return get_stored_prepared_statement_one<result_message::prepared::thrift>(query_string, keyspace, compute_thrift_id, prepared_cache_key_type::thrift_id);
|
||||
} else {
|
||||
auto statement_id = compute_id(query_string, keyspace);
|
||||
auto it = _prepared_statements.find(statement_id);
|
||||
if (it == _prepared_statements.end()) {
|
||||
return ::shared_ptr<result_message::prepared>();
|
||||
}
|
||||
return ::make_shared<result_message::prepared::cql>(statement_id, it->second->checked_weak_from_this());
|
||||
}
|
||||
}
|
||||
|
||||
future<::shared_ptr<cql_transport::messages::result_message::prepared>>
|
||||
query_processor::store_prepared_statement(const std::experimental::string_view& query_string,
|
||||
const sstring& keyspace,
|
||||
std::unique_ptr<statements::prepared_statement> prepared,
|
||||
bool for_thrift)
|
||||
{
|
||||
#if 0
|
||||
// Concatenate the current keyspace so we don't mix prepared statements between keyspace (#5352).
|
||||
// (if the keyspace is null, queryString has to have a fully-qualified keyspace so it's fine.
|
||||
long statementSize = measure(prepared.statement);
|
||||
// don't execute the statement if it's bigger than the allowed threshold
|
||||
if (statementSize > MAX_CACHE_PREPARED_MEMORY)
|
||||
throw new InvalidRequestException(String.format("Prepared statement of size %d bytes is larger than allowed maximum of %d bytes.",
|
||||
statementSize,
|
||||
MAX_CACHE_PREPARED_MEMORY));
|
||||
#endif
|
||||
prepared->raw_cql_statement = query_string.data();
|
||||
if (for_thrift) {
|
||||
auto statement_id = compute_thrift_id(query_string, keyspace);
|
||||
auto msg = ::make_shared<result_message::prepared::thrift>(statement_id, prepared->checked_weak_from_this());
|
||||
_thrift_prepared_statements.emplace(statement_id, std::move(prepared));
|
||||
return make_ready_future<::shared_ptr<result_message::prepared>>(std::move(msg));
|
||||
} else {
|
||||
auto statement_id = compute_id(query_string, keyspace);
|
||||
auto msg = ::make_shared<result_message::prepared::cql>(statement_id, prepared->checked_weak_from_this());
|
||||
_prepared_statements.emplace(statement_id, std::move(prepared));
|
||||
return make_ready_future<::shared_ptr<result_message::prepared>>(std::move(msg));
|
||||
return get_stored_prepared_statement_one<result_message::prepared::cql>(query_string, keyspace, compute_id, prepared_cache_key_type::cql_id);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -289,19 +253,19 @@ static sstring hash_target(const std::experimental::string_view& query_string, c
|
||||
return keyspace + query_string.to_string();
|
||||
}
|
||||
|
||||
bytes query_processor::compute_id(const std::experimental::string_view& query_string, const sstring& keyspace)
|
||||
prepared_cache_key_type query_processor::compute_id(const std::experimental::string_view& query_string, const sstring& keyspace)
|
||||
{
|
||||
return md5_calculate(hash_target(query_string, keyspace));
|
||||
return prepared_cache_key_type(md5_calculate(hash_target(query_string, keyspace)));
|
||||
}
|
||||
|
||||
int32_t query_processor::compute_thrift_id(const std::experimental::string_view& query_string, const sstring& keyspace)
|
||||
prepared_cache_key_type query_processor::compute_thrift_id(const std::experimental::string_view& query_string, const sstring& keyspace)
|
||||
{
|
||||
auto target = hash_target(query_string, keyspace);
|
||||
uint32_t h = 0;
|
||||
for (auto&& c : hash_target(query_string, keyspace)) {
|
||||
h = 31*h + c;
|
||||
}
|
||||
return static_cast<int32_t>(h);
|
||||
return prepared_cache_key_type(static_cast<int32_t>(h));
|
||||
}
|
||||
|
||||
std::unique_ptr<prepared_statement>
|
||||
@@ -527,7 +491,7 @@ void query_processor::migration_subscriber::on_drop_view(const sstring& ks_name,
|
||||
|
||||
void query_processor::migration_subscriber::remove_invalid_prepared_statements(sstring ks_name, std::experimental::optional<sstring> cf_name)
|
||||
{
|
||||
_qp->invalidate_prepared_statements([&] (::shared_ptr<cql_statement> stmt) {
|
||||
_qp->_prepared_cache.remove_if([&] (::shared_ptr<cql_statement> stmt) {
|
||||
return this->should_invalidate(ks_name, cf_name, stmt);
|
||||
});
|
||||
}
|
||||
|
||||
@@ -57,6 +57,7 @@
|
||||
#include "statements/prepared_statement.hh"
|
||||
#include "transport/messages/result_message.hh"
|
||||
#include "untyped_result_set.hh"
|
||||
#include "prepared_statements_cache.hh"
|
||||
|
||||
namespace cql3 {
|
||||
|
||||
@@ -64,9 +65,32 @@ namespace statements {
|
||||
class batch_statement;
|
||||
}
|
||||
|
||||
class prepared_statement_is_too_big : public std::exception {
|
||||
public:
|
||||
static constexpr int max_query_prefix = 100;
|
||||
|
||||
private:
|
||||
sstring _msg;
|
||||
|
||||
public:
|
||||
prepared_statement_is_too_big(const sstring& query_string)
|
||||
: _msg(seastar::format("Prepared statement is too big: {}", query_string.substr(0, max_query_prefix)))
|
||||
{
|
||||
// mark that we clipped the query string
|
||||
if (query_string.size() > max_query_prefix) {
|
||||
_msg += "...";
|
||||
}
|
||||
}
|
||||
|
||||
virtual const char* what() const noexcept override {
|
||||
return _msg.c_str();
|
||||
}
|
||||
};
|
||||
|
||||
class query_processor {
|
||||
public:
|
||||
class migration_subscriber;
|
||||
|
||||
private:
|
||||
std::unique_ptr<migration_subscriber> _migration_subscriber;
|
||||
distributed<service::storage_proxy>& _proxy;
|
||||
@@ -127,9 +151,7 @@ private:
|
||||
}
|
||||
};
|
||||
#endif
|
||||
|
||||
std::unordered_map<bytes, std::unique_ptr<statements::prepared_statement>> _prepared_statements;
|
||||
std::unordered_map<int32_t, std::unique_ptr<statements::prepared_statement>> _thrift_prepared_statements;
|
||||
prepared_statements_cache _prepared_cache;
|
||||
std::unordered_map<sstring, std::unique_ptr<statements::prepared_statement>> _internal_statements;
|
||||
#if 0
|
||||
|
||||
@@ -221,21 +243,14 @@ private:
|
||||
}
|
||||
#endif
|
||||
public:
|
||||
statements::prepared_statement::checked_weak_ptr get_prepared(const bytes& id) {
|
||||
auto it = _prepared_statements.find(id);
|
||||
if (it == _prepared_statements.end()) {
|
||||
statements::prepared_statement::checked_weak_ptr get_prepared(const prepared_cache_key_type& key) {
|
||||
auto it = _prepared_cache.find(key);
|
||||
if (it == _prepared_cache.end()) {
|
||||
return statements::prepared_statement::checked_weak_ptr();
|
||||
}
|
||||
return it->second->checked_weak_from_this();
|
||||
return *it;
|
||||
}
|
||||
|
||||
statements::prepared_statement::checked_weak_ptr get_prepared_for_thrift(int32_t id) {
|
||||
auto it = _thrift_prepared_statements.find(id);
|
||||
if (it == _thrift_prepared_statements.end()) {
|
||||
return statements::prepared_statement::checked_weak_ptr();
|
||||
}
|
||||
return it->second->checked_weak_from_this();
|
||||
}
|
||||
#if 0
|
||||
public static void validateKey(ByteBuffer key) throws InvalidRequestException
|
||||
{
|
||||
@@ -435,42 +450,61 @@ public:
|
||||
#endif
|
||||
|
||||
future<::shared_ptr<cql_transport::messages::result_message::prepared>>
|
||||
prepare(const std::experimental::string_view& query_string, service::query_state& query_state);
|
||||
prepare(sstring query_string, service::query_state& query_state);
|
||||
|
||||
future<::shared_ptr<cql_transport::messages::result_message::prepared>>
|
||||
prepare(const std::experimental::string_view& query_string, const service::client_state& client_state, bool for_thrift);
|
||||
prepare(sstring query_string, const service::client_state& client_state, bool for_thrift);
|
||||
|
||||
static bytes compute_id(const std::experimental::string_view& query_string, const sstring& keyspace);
|
||||
static int32_t compute_thrift_id(const std::experimental::string_view& query_string, const sstring& keyspace);
|
||||
static prepared_cache_key_type compute_id(const std::experimental::string_view& query_string, const sstring& keyspace);
|
||||
static prepared_cache_key_type compute_thrift_id(const std::experimental::string_view& query_string, const sstring& keyspace);
|
||||
|
||||
private:
|
||||
///
|
||||
/// \tparam ResultMsgType type of the returned result message (CQL or Thrift)
|
||||
/// \tparam PreparedKeyGenerator a function that generates the prepared statement cache key for given query and keyspace
|
||||
/// \tparam IdGetter a function that returns the corresponding prepared statement ID (CQL or Thrift) for a given prepared statement cache key
|
||||
/// \param query_string
|
||||
/// \param client_state
|
||||
/// \param id_gen prepared ID generator, called before the first deferring
|
||||
/// \param id_getter prepared ID getter, passed to deferred context by reference. The caller must ensure its liveness.
|
||||
/// \return
|
||||
template <typename ResultMsgType, typename PreparedKeyGenerator, typename IdGetter>
|
||||
future<::shared_ptr<cql_transport::messages::result_message::prepared>>
|
||||
prepare_one(sstring query_string, const service::client_state& client_state, PreparedKeyGenerator&& id_gen, IdGetter&& id_getter) {
|
||||
return do_with(id_gen(query_string, client_state.get_raw_keyspace()), std::move(query_string), [this, &client_state, &id_getter] (const prepared_cache_key_type& key, const sstring& query_string) {
|
||||
return _prepared_cache.get(key, [this, &query_string, &client_state] {
|
||||
auto prepared = get_statement(query_string, client_state);
|
||||
auto bound_terms = prepared->statement->get_bound_terms();
|
||||
if (bound_terms > std::numeric_limits<uint16_t>::max()) {
|
||||
throw exceptions::invalid_request_exception(sprint("Too many markers(?). %d markers exceed the allowed maximum of %d", bound_terms, std::numeric_limits<uint16_t>::max()));
|
||||
}
|
||||
assert(bound_terms == prepared->bound_names.size());
|
||||
prepared->raw_cql_statement = query_string;
|
||||
return make_ready_future<std::unique_ptr<statements::prepared_statement>>(std::move(prepared));
|
||||
}).then([&key, &id_getter] (auto prep_ptr) {
|
||||
return make_ready_future<::shared_ptr<cql_transport::messages::result_message::prepared>>(::make_shared<ResultMsgType>(id_getter(key), std::move(prep_ptr)));
|
||||
}).handle_exception_type([&query_string] (typename prepared_statements_cache::statement_is_too_big&) {
|
||||
return make_exception_future<::shared_ptr<cql_transport::messages::result_message::prepared>>(prepared_statement_is_too_big(query_string));
|
||||
});
|
||||
});
|
||||
};
|
||||
|
||||
template <typename ResultMsgType, typename KeyGenerator, typename IdGetter>
|
||||
::shared_ptr<cql_transport::messages::result_message::prepared>
|
||||
get_stored_prepared_statement_one(const std::experimental::string_view& query_string, const sstring& keyspace, KeyGenerator&& key_gen, IdGetter&& id_getter)
|
||||
{
|
||||
auto cache_key = key_gen(query_string, keyspace);
|
||||
auto it = _prepared_cache.find(cache_key);
|
||||
if (it == _prepared_cache.end()) {
|
||||
return ::shared_ptr<cql_transport::messages::result_message::prepared>();
|
||||
}
|
||||
|
||||
return ::make_shared<ResultMsgType>(id_getter(cache_key), *it);
|
||||
}
|
||||
|
||||
::shared_ptr<cql_transport::messages::result_message::prepared>
|
||||
get_stored_prepared_statement(const std::experimental::string_view& query_string, const sstring& keyspace, bool for_thrift);
|
||||
|
||||
future<::shared_ptr<cql_transport::messages::result_message::prepared>>
|
||||
store_prepared_statement(const std::experimental::string_view& query_string, const sstring& keyspace, std::unique_ptr<statements::prepared_statement> prepared, bool for_thrift);
|
||||
|
||||
// Erases the statements for which filter returns true.
|
||||
template <typename Pred>
|
||||
void invalidate_prepared_statements(Pred filter) {
|
||||
static_assert(std::is_same<bool, std::result_of_t<Pred(::shared_ptr<cql_statement>)>>::value,
|
||||
"bad Pred signature");
|
||||
for (auto it = _prepared_statements.begin(); it != _prepared_statements.end(); ) {
|
||||
if (filter(it->second->statement)) {
|
||||
it = _prepared_statements.erase(it);
|
||||
} else {
|
||||
++it;
|
||||
}
|
||||
}
|
||||
for (auto it = _thrift_prepared_statements.begin(); it != _thrift_prepared_statements.end(); ) {
|
||||
if (filter(it->second->statement)) {
|
||||
it = _thrift_prepared_statements.erase(it);
|
||||
} else {
|
||||
++it;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#if 0
|
||||
public ResultMessage processPrepared(CQLStatement statement, QueryState queryState, QueryOptions options)
|
||||
throws RequestExecutionException, RequestValidationException
|
||||
|
||||
@@ -101,6 +101,10 @@ public:
|
||||
return boost::algorithm::all_of(_restrictions->restrictions(), [b] (auto&& r) { return r.second->has_bound(b); });
|
||||
}
|
||||
|
||||
// True only when every entry of _restrictions->restrictions() reports bound `b` as inclusive.
virtual bool is_inclusive(statements::bound b) const override {
    auto inclusive_for_bound = [b] (auto&& entry) {
        return entry.second->is_inclusive(b);
    };
    return boost::algorithm::all_of(_restrictions->restrictions(), inclusive_for_bound);
}
|
||||
|
||||
// Reports whether the wrapped restrictions use the given function; the answer is
// delegated unchanged to _restrictions->uses_function().
virtual bool uses_function(const sstring& ks_name, const sstring& function_name) const override {
|
||||
return _restrictions->uses_function(ks_name, function_name);
|
||||
}
|
||||
|
||||
@@ -75,7 +75,7 @@ cql3::statements::create_user_statement::execute(distributed<service::storage_pr
|
||||
throw exceptions::invalid_request_exception(sprint("User %s already exists", _username));
|
||||
}
|
||||
if (exists && _if_not_exists) {
|
||||
make_ready_future<::shared_ptr<cql_transport::messages::result_message>>();
|
||||
return make_ready_future<::shared_ptr<cql_transport::messages::result_message>>();
|
||||
}
|
||||
return auth::authenticator::get().create(_username, _opts->options()).then([this] {
|
||||
return auth::auth::insert_user(_username, _superuser).then([] {
|
||||
|
||||
@@ -106,6 +106,9 @@ delete_statement::prepare_internal(database& db, schema_ptr schema, shared_ptr<v
|
||||
|| !stmt->restrictions()->get_clustering_columns_restrictions()->has_bound(bound::END)) {
|
||||
throw exceptions::invalid_request_exception("A range deletion operation needs to specify both bounds");
|
||||
}
|
||||
if (!schema->is_compound() && stmt->restrictions()->get_clustering_columns_restrictions()->is_slice()) {
|
||||
throw exceptions::invalid_request_exception("Range deletions on \"compact storage\" schemas are not supported");
|
||||
}
|
||||
return stmt;
|
||||
}
|
||||
|
||||
|
||||
@@ -886,7 +886,8 @@ column_family::seal_active_streaming_memtable_immediate() {
|
||||
return old->clear_gently();
|
||||
}
|
||||
});
|
||||
}).handle_exception([old] (auto ep) {
|
||||
}).handle_exception([old, newtab] (auto ep) {
|
||||
newtab->mark_for_deletion();
|
||||
dblog.error("failed to write streamed sstable: {}", ep);
|
||||
return make_exception_future<>(ep);
|
||||
});
|
||||
@@ -924,7 +925,8 @@ future<> column_family::seal_active_streaming_memtable_big(streaming_memtable_bi
|
||||
auto&& priority = service::get_local_streaming_write_priority();
|
||||
return write_memtable_to_sstable(*old, newtab, incremental_backups_enabled(), priority, true, _config.background_writer_scheduling_group).then([this, newtab, old, &smb] {
|
||||
smb.sstables.emplace_back(newtab);
|
||||
}).handle_exception([] (auto ep) {
|
||||
}).handle_exception([newtab] (auto ep) {
|
||||
newtab->mark_for_deletion();
|
||||
dblog.error("failed to write streamed sstable: {}", ep);
|
||||
return make_exception_future<>(ep);
|
||||
});
|
||||
|
||||
@@ -64,8 +64,11 @@
|
||||
#include "db/config.hh"
|
||||
#include "md5_hasher.hh"
|
||||
|
||||
#include <seastar/util/noncopyable_function.hh>
|
||||
|
||||
#include <boost/algorithm/string/predicate.hpp>
|
||||
#include <boost/range/algorithm/copy.hpp>
|
||||
#include <boost/range/algorithm/transform.hpp>
|
||||
#include <boost/range/adaptor/map.hpp>
|
||||
#include <boost/range/join.hpp>
|
||||
|
||||
@@ -126,7 +129,11 @@ static void merge_tables_and_views(distributed<service::storage_proxy>& proxy,
|
||||
std::map<qualified_name, schema_mutations>&& views_before,
|
||||
std::map<qualified_name, schema_mutations>&& views_after);
|
||||
|
||||
static void merge_types(distributed<service::storage_proxy>& proxy,
|
||||
// Result of merge_types(): carries the deferred "drop user types" step so that the
// caller decides when it runs (do_merge_schema invokes drop() after
// merge_tables_and_views()).
struct user_types_to_drop final {
|
||||
// Callable that performs the deferred drop when invoked.
seastar::noncopyable_function<void()> drop;
|
||||
};
|
||||
|
||||
static user_types_to_drop merge_types(distributed<service::storage_proxy>& proxy,
|
||||
schema_result&& before,
|
||||
schema_result&& after);
|
||||
|
||||
@@ -832,7 +839,7 @@ static future<> do_merge_schema(distributed<service::storage_proxy>& proxy, std:
|
||||
#endif
|
||||
|
||||
std::set<sstring> keyspaces_to_drop = merge_keyspaces(proxy, std::move(old_keyspaces), std::move(new_keyspaces)).get0();
|
||||
merge_types(proxy, std::move(old_types), std::move(new_types));
|
||||
auto types_to_drop = merge_types(proxy, std::move(old_types), std::move(new_types));
|
||||
merge_tables_and_views(proxy,
|
||||
std::move(old_column_families), std::move(new_column_families),
|
||||
std::move(old_views), std::move(new_views));
|
||||
@@ -840,6 +847,8 @@ static future<> do_merge_schema(distributed<service::storage_proxy>& proxy, std:
|
||||
mergeFunctions(oldFunctions, newFunctions);
|
||||
mergeAggregates(oldAggregates, newAggregates);
|
||||
#endif
|
||||
types_to_drop.drop();
|
||||
|
||||
proxy.local().get_db().invoke_on_all([keyspaces_to_drop = std::move(keyspaces_to_drop)] (database& db) {
|
||||
// it is safe to drop a keyspace only when all nested ColumnFamilies were deleted
|
||||
return do_for_each(keyspaces_to_drop, [&db] (auto keyspace_to_drop) {
|
||||
@@ -996,30 +1005,37 @@ static void merge_tables_and_views(distributed<service::storage_proxy>& proxy,
|
||||
}).get();
|
||||
}
|
||||
|
||||
static inline void collect_types(std::set<sstring>& keys, schema_result& result, std::vector<user_type>& to)
|
||||
// Plain string description of a user type: the keyspace it belongs to plus the
// name that is later fed to parse_type() to rebuild the actual type object.
struct naked_user_type {
|
||||
// Keyspace the user type belongs to.
const sstring keyspace;
|
||||
// Type name as returned by user_type_impl::name(); passed to parse_type().
const sstring qualified_name;
|
||||
};
|
||||
|
||||
static inline void collect_types(std::set<sstring>& keys, schema_result& result, std::vector<naked_user_type>& to)
|
||||
{
|
||||
for (auto&& key : keys) {
|
||||
auto&& value = result[key];
|
||||
auto types = create_types_from_schema_partition(schema_result_value_type{key, std::move(value)});
|
||||
std::move(types.begin(), types.end(), std::back_inserter(to));
|
||||
boost::transform(types, std::back_inserter(to), [] (user_type type) {
|
||||
return naked_user_type{std::move(type->_keyspace), std::move(type->name())};
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
// see the comments for merge_keyspaces()
|
||||
static void merge_types(distributed<service::storage_proxy>& proxy, schema_result&& before, schema_result&& after)
|
||||
// see the comments for merge_keyspaces()
|
||||
static user_types_to_drop merge_types(distributed<service::storage_proxy>& proxy, schema_result&& before, schema_result&& after)
|
||||
{
|
||||
std::vector<user_type> created, altered, dropped;
|
||||
std::vector<naked_user_type> created, altered, dropped;
|
||||
|
||||
auto diff = difference(before, after, indirect_equal_to<lw_shared_ptr<query::result_set>>());
|
||||
|
||||
collect_types(diff.entries_only_on_left, before, dropped); // Keyspaces with no more types
|
||||
collect_types(diff.entries_only_on_right, after, created); // New keyspaces with types
|
||||
|
||||
for (auto&& key : diff.entries_differing) {
|
||||
for (auto&& keyspace : diff.entries_differing) {
|
||||
// The user types of this keyspace differ, so diff the current types with the updated ones
|
||||
auto current_types = proxy.local().get_db().local().find_keyspace(key).metadata()->user_types()->get_all_types();
|
||||
auto current_types = proxy.local().get_db().local().find_keyspace(keyspace).metadata()->user_types()->get_all_types();
|
||||
decltype(current_types) updated_types;
|
||||
auto ts = create_types_from_schema_partition(schema_result_value_type{key, std::move(after[key])});
|
||||
auto ts = create_types_from_schema_partition(schema_result_value_type{keyspace, std::move(after[keyspace])});
|
||||
updated_types.reserve(ts.size());
|
||||
for (auto&& type : ts) {
|
||||
updated_types[type->_name] = std::move(type);
|
||||
@@ -1027,36 +1043,46 @@ static void merge_types(distributed<service::storage_proxy>& proxy, schema_resul
|
||||
|
||||
auto delta = difference(current_types, updated_types, indirect_equal_to<user_type>());
|
||||
|
||||
for (auto&& key : delta.entries_only_on_left) {
|
||||
dropped.emplace_back(current_types[key]);
|
||||
for (auto&& type_name : delta.entries_only_on_left) {
|
||||
dropped.emplace_back(naked_user_type{keyspace, current_types[type_name]->name()});
|
||||
}
|
||||
for (auto&& key : delta.entries_only_on_right) {
|
||||
created.emplace_back(std::move(updated_types[key]));
|
||||
for (auto&& type_name : delta.entries_only_on_right) {
|
||||
created.emplace_back(naked_user_type{keyspace, updated_types[type_name]->name()});
|
||||
}
|
||||
for (auto&& key : delta.entries_differing) {
|
||||
altered.emplace_back(std::move(updated_types[key]));
|
||||
for (auto&& type_name : delta.entries_differing) {
|
||||
altered.emplace_back(naked_user_type{keyspace, updated_types[type_name]->name()});
|
||||
}
|
||||
}
|
||||
|
||||
proxy.local().get_db().invoke_on_all([&created, &dropped, &altered] (database& db) {
|
||||
// Create and update user types before any tables/views are created that potentially
|
||||
// use those types. Similarly, defer dropping until after tables/views that may use
|
||||
// some of these user types are dropped.
|
||||
|
||||
proxy.local().get_db().invoke_on_all([&created, &altered] (database& db) {
|
||||
return seastar::async([&] {
|
||||
for (auto&& type : created) {
|
||||
auto user_type = dynamic_pointer_cast<const user_type_impl>(parse_type(type->name()));
|
||||
auto user_type = dynamic_pointer_cast<const user_type_impl>(parse_type(type.qualified_name));
|
||||
db.find_keyspace(user_type->_keyspace).add_user_type(user_type);
|
||||
service::get_local_migration_manager().notify_create_user_type(user_type).get();
|
||||
}
|
||||
for (auto&& type : dropped) {
|
||||
auto user_type = dynamic_pointer_cast<const user_type_impl>(parse_type(type->name()));
|
||||
db.find_keyspace(user_type->_keyspace).remove_user_type(user_type);
|
||||
service::get_local_migration_manager().notify_drop_user_type(user_type).get();
|
||||
}
|
||||
for (auto&& type : altered) {
|
||||
auto user_type = dynamic_pointer_cast<const user_type_impl>(parse_type(type->name()));
|
||||
auto user_type = dynamic_pointer_cast<const user_type_impl>(parse_type(type.qualified_name));
|
||||
db.find_keyspace(user_type->_keyspace).add_user_type(user_type);
|
||||
service::get_local_migration_manager().notify_update_user_type(user_type).get();
|
||||
}
|
||||
});
|
||||
}).get();
|
||||
|
||||
return user_types_to_drop{[&proxy, dropped = std::move(dropped)] {
|
||||
proxy.local().get_db().invoke_on_all([dropped = std::move(dropped)](database& db) {
|
||||
return do_for_each(dropped, [&db](auto& user_type_to_drop) {
|
||||
auto user_type = dynamic_pointer_cast<const user_type_impl>(
|
||||
parse_type(std::move(user_type_to_drop.qualified_name)));
|
||||
db.find_keyspace(user_type->_keyspace).remove_user_type(user_type);
|
||||
return service::get_local_migration_manager().notify_drop_user_type(user_type);
|
||||
});
|
||||
}).get();
|
||||
}};
|
||||
}
|
||||
|
||||
#if 0
|
||||
|
||||
@@ -194,13 +194,13 @@ public:
|
||||
: _view(std::move(view))
|
||||
, _view_info(*_view->view_info())
|
||||
, _base(std::move(base))
|
||||
, _updates(8, partition_key::hashing(*_base), partition_key::equality(*_base)) {
|
||||
, _updates(8, partition_key::hashing(*_view), partition_key::equality(*_view)) {
|
||||
}
|
||||
|
||||
void move_to(std::vector<mutation>& mutations) && {
|
||||
auto& partitioner = dht::global_partitioner();
|
||||
std::transform(_updates.begin(), _updates.end(), std::back_inserter(mutations), [&, this] (auto&& m) {
|
||||
return mutation(_view, partitioner.decorate_key(*_base, std::move(m.first)), std::move(m.second));
|
||||
return mutation(_view, partitioner.decorate_key(*_view, std::move(m.first)), std::move(m.second));
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
@@ -59,14 +59,11 @@ future<> boot_strapper::bootstrap() {
|
||||
streamer->add_ranges(keyspace_name, ranges);
|
||||
}
|
||||
|
||||
return streamer->fetch_async().then_wrapped([streamer] (auto&& f) {
|
||||
try {
|
||||
auto state = f.get0();
|
||||
} catch (...) {
|
||||
throw std::runtime_error(sprint("Error during boostrap: %s", std::current_exception()));
|
||||
}
|
||||
return streamer->stream_async().then([streamer] () {
|
||||
service::get_local_storage_service().finish_bootstrapping();
|
||||
return make_ready_future<>();
|
||||
}).handle_exception([streamer] (std::exception_ptr eptr) {
|
||||
blogger.warn("Eror during bootstrap: {}", eptr);
|
||||
return make_exception_future<>(std::move(eptr));
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
@@ -210,7 +210,36 @@ bool range_streamer::use_strict_sources_for_ranges(const sstring& keyspace_name)
|
||||
&& _metadata.get_all_endpoints().size() != strat.get_replication_factor();
|
||||
}
|
||||
|
||||
void range_streamer::add_tx_ranges(const sstring& keyspace_name, std::unordered_map<inet_address, dht::token_range_vector> ranges_per_endpoint, std::vector<sstring> column_families) {
|
||||
if (_nr_rx_added) {
|
||||
throw std::runtime_error("Mixed sending and receiving is not supported");
|
||||
}
|
||||
_nr_tx_added++;
|
||||
_to_stream.emplace(keyspace_name, std::move(ranges_per_endpoint));
|
||||
auto inserted = _column_families.emplace(keyspace_name, std::move(column_families)).second;
|
||||
if (!inserted) {
|
||||
throw std::runtime_error("Can not add column_families for the same keyspace more than once");
|
||||
}
|
||||
}
|
||||
|
||||
void range_streamer::add_rx_ranges(const sstring& keyspace_name, std::unordered_map<inet_address, dht::token_range_vector> ranges_per_endpoint, std::vector<sstring> column_families) {
|
||||
if (_nr_tx_added) {
|
||||
throw std::runtime_error("Mixed sending and receiving is not supported");
|
||||
}
|
||||
_nr_rx_added++;
|
||||
_to_stream.emplace(keyspace_name, std::move(ranges_per_endpoint));
|
||||
auto inserted = _column_families.emplace(keyspace_name, std::move(column_families)).second;
|
||||
if (!inserted) {
|
||||
throw std::runtime_error("Can not add column_families for the same keyspace more than once");
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: This is the legacy range_streamer interface; like add_rx_ranges, it adds ranges to be received (rx).
|
||||
void range_streamer::add_ranges(const sstring& keyspace_name, dht::token_range_vector ranges) {
|
||||
if (_nr_tx_added) {
|
||||
throw std::runtime_error("Mixed sending and receiving is not supported");
|
||||
}
|
||||
_nr_rx_added++;
|
||||
auto ranges_for_keyspace = use_strict_sources_for_ranges(keyspace_name)
|
||||
? get_all_ranges_with_strict_sources_for(keyspace_name, ranges)
|
||||
: get_all_ranges_with_sources_for(keyspace_name, ranges);
|
||||
@@ -231,26 +260,114 @@ void range_streamer::add_ranges(const sstring& keyspace_name, dht::token_range_v
|
||||
logger.debug("{} : range {} from source {} for keyspace {}", _description, x.second, x.first, keyspace_name);
|
||||
}
|
||||
}
|
||||
_to_fetch.emplace(keyspace_name, std::move(range_fetch_map));
|
||||
_to_stream.emplace(keyspace_name, std::move(range_fetch_map));
|
||||
}
|
||||
|
||||
future<streaming::stream_state> range_streamer::fetch_async() {
|
||||
for (auto& fetch : _to_fetch) {
|
||||
const auto& keyspace = fetch.first;
|
||||
for (auto& x : fetch.second) {
|
||||
auto& source = x.first;
|
||||
auto& ranges = x.second;
|
||||
/* Send messages to respective folks to stream data over to me */
|
||||
if (logger.is_enabled(logging::log_level::debug)) {
|
||||
logger.debug("{}ing from {} ranges {}", _description, source, ranges);
|
||||
future<> range_streamer::stream_async() {
|
||||
return seastar::async([this] {
|
||||
int sleep_time = 60;
|
||||
for (;;) {
|
||||
try {
|
||||
do_stream_async().get();
|
||||
break;
|
||||
} catch (...) {
|
||||
logger.warn("{} failed to stream. Will retry in {} seconds ...", _description, sleep_time);
|
||||
sleep_abortable(std::chrono::seconds(sleep_time)).get();
|
||||
sleep_time *= 1.5;
|
||||
if (++_nr_retried >= _nr_max_retry) {
|
||||
throw;
|
||||
}
|
||||
}
|
||||
_stream_plan.request_ranges(source, keyspace, ranges);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
// Runs one streaming attempt over everything registered in _to_stream.
//
// Keyspaces are handled one after another; within a keyspace all peers are handled
// in parallel, each in its own seastar thread. For every peer the ranges are moved
// out of the peer's range vector in batches (about a tenth of the peer's ranges per
// stream plan) and streamed. Ranges of a batch that fails are put back into the
// peer's vector, so what is left in _to_stream afterwards is exactly what still has
// to be streamed. The returned future fails if any peer failed.
future<> range_streamer::do_stream_async() {
    auto remaining_at_start = nr_ranges_to_stream();
    logger.info("{} starts, nr_ranges_remaining={}", _description, remaining_at_start);
    auto start = lowres_clock::now();
    return do_for_each(_to_stream, [this, start, description = _description] (auto& stream) {
        const auto& keyspace = stream.first;
        auto& ranges_by_peer = stream.second;
        // Fetch from or send to the peer nodes in parallel
        return parallel_for_each(ranges_by_peer, [this, description, keyspace] (auto& peer_entry) {
            auto& source = peer_entry.first;
            auto& range_vec = peer_entry.second;
            return seastar::async([this, description, keyspace, source, &range_vec] () mutable {
                // TODO: A fiber would be better than a thread here, because a thread
                // per peer can consume a noticeable amount of memory in a large cluster.
                auto start_time = lowres_clock::now();
                unsigned plan_index = 0;
                unsigned nr_ranges_streamed = 0;
                size_t nr_ranges_total = range_vec.size();
                size_t batch_size = nr_ranges_total / 10;
                dht::token_range_vector batch;

                // Streams the current batch with a fresh stream plan, then empties it.
                auto stream_batch = [&] {
                    auto sp = stream_plan(sprint("%s-%s-index-%d", description, keyspace, plan_index++));
                    logger.info("{} with {} for keyspace={}, {} out of {} ranges: ranges = {}",
                            description, source, keyspace, nr_ranges_streamed, nr_ranges_total, batch.size());
                    if (_nr_rx_added) {
                        sp.request_ranges(source, keyspace, batch, _column_families[keyspace]);
                    } else if (_nr_tx_added) {
                        sp.transfer_ranges(source, keyspace, batch, _column_families[keyspace]);
                    }
                    sp.execute().discard_result().get();
                    batch.clear();
                };

                try {
                    // Always consume the first pending range; it leaves range_vec
                    // as soon as it joins the batch.
                    while (!range_vec.empty()) {
                        batch.push_back(range_vec.front());
                        range_vec.erase(range_vec.begin());
                        nr_ranges_streamed++;
                        if (batch.size() >= batch_size) {
                            stream_batch();
                        }
                    }
                    if (!batch.empty()) {
                        stream_batch();
                    }
                } catch (...) {
                    // Return the unstreamed batch so a later attempt can retry it.
                    for (auto& pending : batch) {
                        range_vec.push_back(pending);
                    }
                    auto t = std::chrono::duration_cast<std::chrono::seconds>(lowres_clock::now() - start_time).count();
                    logger.warn("{} with {} for keyspace={} failed, took {} seconds: {}", description, source, keyspace, t, std::current_exception());
                    throw;
                }
                auto t = std::chrono::duration_cast<std::chrono::seconds>(lowres_clock::now() - start_time).count();
                logger.info("{} with {} for keyspace={} succeeded, took {} seconds", description, source, keyspace, t);
            });
        });
    }).finally([this, start] {
        auto t = std::chrono::duration_cast<std::chrono::seconds>(lowres_clock::now() - start).count();
        auto still_remaining = nr_ranges_to_stream();
        if (still_remaining) {
            logger.warn("{} failed, took {} seconds, nr_ranges_remaining={}", _description, t, still_remaining);
        } else {
            logger.info("{} succeeded, took {} seconds, nr_ranges_remaining={}", _description, t, still_remaining);
        }
    });
}
|
||||
|
||||
size_t range_streamer::nr_ranges_to_stream() {
|
||||
size_t nr_ranges_remaining = 0;
|
||||
for (auto& fetch : _to_stream) {
|
||||
const auto& keyspace = fetch.first;
|
||||
auto& ip_range_vec = fetch.second;
|
||||
for (auto& ip_range : ip_range_vec) {
|
||||
auto& source = ip_range.first;
|
||||
auto& range_vec = ip_range.second;
|
||||
nr_ranges_remaining += range_vec.size();
|
||||
logger.debug("Remaining: keyspace={}, source={}, ranges={}", keyspace, source, range_vec);
|
||||
}
|
||||
}
|
||||
|
||||
return _stream_plan.execute();
|
||||
return nr_ranges_remaining;
|
||||
}
|
||||
|
||||
|
||||
std::unordered_multimap<inet_address, dht::token_range>
|
||||
range_streamer::get_work_map(const std::unordered_multimap<dht::token_range, inet_address>& ranges_with_source_target,
|
||||
const sstring& keyspace) {
|
||||
|
||||
@@ -119,6 +119,8 @@ public:
|
||||
}
|
||||
|
||||
void add_ranges(const sstring& keyspace_name, dht::token_range_vector ranges);
|
||||
void add_tx_ranges(const sstring& keyspace_name, std::unordered_map<inet_address, dht::token_range_vector> ranges_per_endpoint, std::vector<sstring> column_families = {});
|
||||
void add_rx_ranges(const sstring& keyspace_name, std::unordered_map<inet_address, dht::token_range_vector> ranges_per_endpoint, std::vector<sstring> column_families = {});
|
||||
private:
|
||||
bool use_strict_sources_for_ranges(const sstring& keyspace_name);
|
||||
/**
|
||||
@@ -159,16 +161,25 @@ public:
|
||||
}
|
||||
#endif
|
||||
public:
|
||||
future<streaming::stream_state> fetch_async();
|
||||
future<> stream_async();
|
||||
future<> do_stream_async();
|
||||
size_t nr_ranges_to_stream();
|
||||
private:
|
||||
distributed<database>& _db;
|
||||
token_metadata& _metadata;
|
||||
std::unordered_set<token> _tokens;
|
||||
inet_address _address;
|
||||
sstring _description;
|
||||
std::unordered_multimap<sstring, std::unordered_map<inet_address, dht::token_range_vector>> _to_fetch;
|
||||
std::unordered_multimap<sstring, std::unordered_map<inet_address, dht::token_range_vector>> _to_stream;
|
||||
std::unordered_set<std::unique_ptr<i_source_filter>> _source_filters;
|
||||
stream_plan _stream_plan;
|
||||
std::unordered_map<sstring, std::vector<sstring>> _column_families;
|
||||
// Retry the stream plan _nr_max_retry times
|
||||
unsigned _nr_retried = 0;
|
||||
unsigned _nr_max_retry = 5;
|
||||
// Number of tx and rx ranges added
|
||||
unsigned _nr_tx_added = 0;
|
||||
unsigned _nr_rx_added = 0;
|
||||
};
|
||||
|
||||
} // dht
|
||||
|
||||
2
dist/ami/files/scylla-ami
vendored
2
dist/ami/files/scylla-ami
vendored
Submodule dist/ami/files/scylla-ami updated: be90a3fb9f...fa2461de01
1
dist/common/modprobe.d/scylla-raid0.conf
vendored
1
dist/common/modprobe.d/scylla-raid0.conf
vendored
@@ -1 +0,0 @@
|
||||
options raid0 devices_discard_performance=Y
|
||||
@@ -6,7 +6,7 @@ After=network.target
|
||||
Type=simple
|
||||
User=scylla
|
||||
Group=scylla
|
||||
ExecStart=/usr/lib/scylla/scylla-housekeeping --uuid-file /var/lib/scylla-housekeeping/housekeeping.uuid --repo-files '/etc/yum.repos.d/scylla*.repo' -q -c /etc/scylla.d/housekeeping.cfg version --mode d
|
||||
ExecStart=/usr/lib/scylla/scylla-housekeeping --uuid-file /var/lib/scylla-housekeeping/housekeeping.uuid -q -c /etc/scylla.d/housekeeping.cfg --repo-files @@REPOFILES@@ version --mode d
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
@@ -6,7 +6,7 @@ After=network.target
|
||||
Type=simple
|
||||
User=scylla
|
||||
Group=scylla
|
||||
ExecStart=/usr/lib/scylla/scylla-housekeeping --uuid-file /var/lib/scylla-housekeeping/housekeeping.uuid -q --repo-files '/etc/yum.repos.d/scylla*.repo' -c /etc/scylla.d/housekeeping.cfg version --mode r
|
||||
ExecStart=/usr/lib/scylla/scylla-housekeeping --uuid-file /var/lib/scylla-housekeeping/housekeeping.uuid -q -c /etc/scylla.d/housekeeping.cfg --repo-files @@REPOFILES@@ version --mode r
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
6
dist/debian/build_deb.sh
vendored
6
dist/debian/build_deb.sh
vendored
@@ -196,8 +196,10 @@ else
|
||||
fi
|
||||
cp dist/common/systemd/scylla-server.service.in debian/scylla-server.service
|
||||
sed -i -e "s#@@SYSCONFDIR@@#/etc/default#g" debian/scylla-server.service
|
||||
cp dist/common/systemd/scylla-housekeeping-daily.service debian/scylla-server.scylla-housekeeping-daily.service
|
||||
cp dist/common/systemd/scylla-housekeeping-restart.service debian/scylla-server.scylla-housekeeping-restart.service
|
||||
cp dist/common/systemd/scylla-housekeeping-daily.service.in debian/scylla-server.scylla-housekeeping-daily.service
|
||||
sed -i -e "s#@@REPOFILES@@#'/etc/apt/sources.list.d/scylla*.list'#g" debian/scylla-server.scylla-housekeeping-daily.service
|
||||
cp dist/common/systemd/scylla-housekeeping-restart.service.in debian/scylla-server.scylla-housekeeping-restart.service
|
||||
sed -i -e "s#@@REPOFILES@@#'/etc/apt/sources.list.d/scylla*.list'#g" debian/scylla-server.scylla-housekeeping-restart.service
|
||||
cp dist/common/systemd/node-exporter.service debian/scylla-server.node-exporter.service
|
||||
|
||||
if [ $REBUILD -eq 1 ]; then
|
||||
|
||||
2
dist/debian/control.in
vendored
2
dist/debian/control.in
vendored
@@ -40,7 +40,7 @@ Description: Scylla kernel tuning configuration
|
||||
Package: scylla
|
||||
Section: metapackages
|
||||
Architecture: any
|
||||
Depends: scylla-server, scylla-jmx, scylla-tools, scylla-kernel-conf
|
||||
Depends: scylla-server, scylla-jmx, scylla-tools, scylla-tools-core, scylla-kernel-conf
|
||||
Description: Scylla database metapackage
|
||||
Scylla is a highly scalable, eventually consistent, distributed,
|
||||
partitioned row DB.
|
||||
|
||||
6
dist/redhat/build_rpm.sh
vendored
6
dist/redhat/build_rpm.sh
vendored
@@ -104,9 +104,9 @@ fi
|
||||
|
||||
|
||||
if [ $JOBS -gt 0 ]; then
|
||||
SRPM_OPTS="$SRPM_OPTS --define='_smp_mflags -j$JOBS'"
|
||||
RPM_JOBS_OPTS=(--define="_smp_mflags -j$JOBS")
|
||||
fi
|
||||
sudo mock --buildsrpm --root=$TARGET --resultdir=`pwd`/build/srpms --spec=build/scylla.spec --sources=build/scylla-$VERSION.tar $SRPM_OPTS
|
||||
sudo mock --buildsrpm --root=$TARGET --resultdir=`pwd`/build/srpms --spec=build/scylla.spec --sources=build/scylla-$VERSION.tar $SRPM_OPTS "${RPM_JOBS_OPTS[@]}"
|
||||
if [ "$TARGET" = "epel-7-x86_64" ] && [ $REBUILD = 1 ]; then
|
||||
./dist/redhat/centos_dep/build_dependency.sh
|
||||
sudo mock --init --root=$TARGET
|
||||
@@ -116,4 +116,4 @@ elif [ "$TARGET" = "epel-7-x86_64" ] && [ $REBUILD = 0 ]; then
|
||||
TARGET=scylla-$TARGET
|
||||
RPM_OPTS="$RPM_OPTS --configdir=dist/redhat/mock"
|
||||
fi
|
||||
sudo mock --rebuild --root=$TARGET --resultdir=`pwd`/build/rpms $RPM_OPTS build/srpms/scylla-$VERSION*.src.rpm
|
||||
sudo mock --rebuild --root=$TARGET --resultdir=`pwd`/build/rpms $RPM_OPTS "${RPM_JOBS_OPTS[@]}" build/srpms/scylla-$VERSION*.src.rpm
|
||||
|
||||
14
dist/redhat/centos_dep/binutils.diff
vendored
14
dist/redhat/centos_dep/binutils.diff
vendored
@@ -33,8 +33,8 @@
|
||||
Requires(post): coreutils
|
||||
-Requires(post): %{_sbindir}/alternatives
|
||||
-Requires(preun): %{_sbindir}/alternatives
|
||||
+Requires(post): /sbin/alternatives
|
||||
+Requires(preun): /sbin/alternatives
|
||||
+Requires(post): /usr/sbin/alternatives
|
||||
+Requires(preun): /usr/sbin/alternatives
|
||||
%endif
|
||||
|
||||
# On ARM EABI systems, we do want -gnueabi to be part of the
|
||||
@@ -58,13 +58,13 @@
|
||||
%if "%{build_gold}" == "both"
|
||||
%__rm -f %{_bindir}/%{?cross}ld
|
||||
-%{_sbindir}/alternatives --install %{_bindir}/%{?cross}ld %{?cross}ld \
|
||||
+/sbin/alternatives --install %{_bindir}/%{?cross}ld %{?cross}ld \
|
||||
+/usr/sbin/alternatives --install %{_bindir}/%{?cross}ld %{?cross}ld \
|
||||
%{_bindir}/%{?cross}ld.bfd %{ld_bfd_priority}
|
||||
-%{_sbindir}/alternatives --install %{_bindir}/%{?cross}ld %{?cross}ld \
|
||||
+/sbin/alternatives --install %{_bindir}/%{?cross}ld %{?cross}ld \
|
||||
+/usr/sbin/alternatives --install %{_bindir}/%{?cross}ld %{?cross}ld \
|
||||
%{_bindir}/%{?cross}ld.gold %{ld_gold_priority}
|
||||
-%{_sbindir}/alternatives --auto %{?cross}ld
|
||||
+/sbin/alternatives --auto %{?cross}ld
|
||||
+/usr/sbin/alternatives --auto %{?cross}ld
|
||||
%endif
|
||||
%if %{isnative}
|
||||
/sbin/ldconfig
|
||||
@@ -74,8 +74,8 @@
|
||||
if [ $1 = 0 ]; then
|
||||
- %{_sbindir}/alternatives --remove %{?cross}ld %{_bindir}/%{?cross}ld.bfd
|
||||
- %{_sbindir}/alternatives --remove %{?cross}ld %{_bindir}/%{?cross}ld.gold
|
||||
+ /sbin/alternatives --remove %{?cross}ld %{_bindir}/%{?cross}ld.bfd
|
||||
+ /sbin/alternatives --remove %{?cross}ld %{_bindir}/%{?cross}ld.gold
|
||||
+ /usr/sbin/alternatives --remove %{?cross}ld %{_bindir}/%{?cross}ld.bfd
|
||||
+ /usr/sbin/alternatives --remove %{?cross}ld %{_bindir}/%{?cross}ld.gold
|
||||
fi
|
||||
%endif
|
||||
%if %{isnative}
|
||||
|
||||
23
dist/redhat/scylla.spec.in
vendored
23
dist/redhat/scylla.spec.in
vendored
@@ -7,14 +7,14 @@ Group: Applications/Databases
|
||||
License: AGPLv3
|
||||
URL: http://www.scylladb.com/
|
||||
Source0: %{name}-@@VERSION@@-@@RELEASE@@.tar
|
||||
Requires: scylla-server = @@VERSION@@ scylla-jmx = @@VERSION@@ scylla-tools = @@VERSION@@ scylla-kernel-conf = @@VERSION@@
|
||||
Requires: scylla-server = @@VERSION@@ scylla-jmx = @@VERSION@@ scylla-tools = @@VERSION@@ scylla-tools-core = @@VERSION@@ scylla-kernel-conf = @@VERSION@@
|
||||
Obsoletes: scylla-server < 1.1
|
||||
|
||||
%description
|
||||
Scylla is a highly scalable, eventually consistent, distributed,
|
||||
partitioned row DB.
|
||||
This package installs all required packages for ScyllaDB, including
|
||||
scylla-server, scylla-jmx, scylla-tools.
|
||||
scylla-server, scylla-jmx, scylla-tools, scylla-tools-core.
|
||||
|
||||
# this is needed to prevent python compilation error on CentOS (#2235)
|
||||
%if 0%{?rhel}
|
||||
@@ -78,6 +78,10 @@ python3.4 ./configure.py --enable-dpdk --mode=release --static-stdc++ --static-b
|
||||
ninja-build %{?_smp_mflags} build/release/scylla build/release/iotune
|
||||
cp dist/common/systemd/scylla-server.service.in build/scylla-server.service
|
||||
sed -i -e "s#@@SYSCONFDIR@@#/etc/sysconfig#g" build/scylla-server.service
|
||||
cp dist/common/systemd/scylla-housekeeping-restart.service.in build/scylla-housekeeping-restart.service
|
||||
sed -i -e "s#@@REPOFILES@@#'/etc/yum.repos.d/scylla*.repo'#g" build/scylla-housekeeping-restart.service
|
||||
cp dist/common/systemd/scylla-housekeeping-daily.service.in build/scylla-housekeeping-daily.service
|
||||
sed -i -e "s#@@REPOFILES@@#'/etc/yum.repos.d/scylla*.repo'#g" build/scylla-housekeeping-daily.service
|
||||
|
||||
%install
|
||||
rm -rf $RPM_BUILD_ROOT
|
||||
@@ -88,9 +92,6 @@ mkdir -p $RPM_BUILD_ROOT%{_sysconfdir}/security/limits.d/
|
||||
mkdir -p $RPM_BUILD_ROOT%{_sysconfdir}/collectd.d/
|
||||
mkdir -p $RPM_BUILD_ROOT%{_sysconfdir}/scylla/
|
||||
mkdir -p $RPM_BUILD_ROOT%{_sysconfdir}/scylla.d/
|
||||
%if 0%{?rhel}
|
||||
mkdir -p $RPM_BUILD_ROOT%{_sysconfdir}/modprobe.d/
|
||||
%endif
|
||||
mkdir -p $RPM_BUILD_ROOT%{_sysctldir}/
|
||||
mkdir -p $RPM_BUILD_ROOT%{_docdir}/scylla/
|
||||
mkdir -p $RPM_BUILD_ROOT%{_unitdir}
|
||||
@@ -101,9 +102,6 @@ install -m644 dist/common/limits.d/scylla.conf $RPM_BUILD_ROOT%{_sysconfdir}/sec
|
||||
install -m644 dist/common/collectd.d/scylla.conf $RPM_BUILD_ROOT%{_sysconfdir}/collectd.d/
|
||||
install -m644 dist/common/scylla.d/*.conf $RPM_BUILD_ROOT%{_sysconfdir}/scylla.d/
|
||||
install -m644 dist/common/sysctl.d/*.conf $RPM_BUILD_ROOT%{_sysctldir}/
|
||||
%if 0%{?rhel}
|
||||
install -m644 dist/common/modprobe.d/*.conf $RPM_BUILD_ROOT%{_sysconfdir}/modprobe.d/
|
||||
%endif
|
||||
install -d -m755 $RPM_BUILD_ROOT%{_sysconfdir}/scylla
|
||||
install -m644 conf/scylla.yaml $RPM_BUILD_ROOT%{_sysconfdir}/scylla/
|
||||
install -m644 conf/cassandra-rackdc.properties $RPM_BUILD_ROOT%{_sysconfdir}/scylla/
|
||||
@@ -267,18 +265,9 @@ if Scylla is the main application on your server and you wish to optimize its la
|
||||
# We cannot use the sysctl_apply rpm macro because it is not present in 7.0
|
||||
# following is a "manual" expansion
|
||||
/usr/lib/systemd/systemd-sysctl 99-scylla-sched.conf >/dev/null 2>&1 || :
|
||||
# Write modprobe.d params when module already loaded
|
||||
%if 0%{?rhel}
|
||||
if [ -e /sys/module/raid0/parameters/devices_discard_performance ]; then
|
||||
echo Y > /sys/module/raid0/parameters/devices_discard_performance
|
||||
fi
|
||||
%endif
|
||||
|
||||
%files kernel-conf
|
||||
%defattr(-,root,root)
|
||||
%if 0%{?rhel}
|
||||
%config(noreplace) %{_sysconfdir}/modprobe.d/*.conf
|
||||
%endif
|
||||
%{_sysctldir}/*.conf
|
||||
|
||||
%changelog
|
||||
|
||||
@@ -461,7 +461,8 @@ future<> gossiper::apply_state_locally(std::map<inet_address, endpoint_state> ma
|
||||
int local_generation = local_ep_state_ptr.get_heart_beat_state().get_generation();
|
||||
int remote_generation = remote_state.get_heart_beat_state().get_generation();
|
||||
logger.trace("{} local generation {}, remote generation {}", ep, local_generation, remote_generation);
|
||||
if (local_generation != 0 && remote_generation > local_generation + MAX_GENERATION_DIFFERENCE) {
|
||||
// A node that was removed with nodetool removenode can have a generation of 2
|
||||
if (local_generation > 2 && remote_generation > local_generation + MAX_GENERATION_DIFFERENCE) {
|
||||
// assume some peer has corrupted memory and is broadcasting an unbelievable generation about another peer (or itself)
|
||||
logger.warn("received an invalid gossip generation for peer {}; local generation = {}, received generation = {}",
|
||||
ep, local_generation, remote_generation);
|
||||
@@ -832,6 +833,7 @@ int gossiper::get_max_endpoint_state_version(endpoint_state state) {
|
||||
|
||||
// Runs inside seastar::async context
|
||||
void gossiper::evict_from_membership(inet_address endpoint) {
|
||||
auto permit = lock_endpoint(endpoint).get0();
|
||||
_unreachable_endpoints.erase(endpoint);
|
||||
container().invoke_on_all([endpoint] (auto& g) {
|
||||
g.endpoint_state_map.erase(endpoint);
|
||||
@@ -982,7 +984,7 @@ future<> gossiper::assassinate_endpoint(sstring address) {
|
||||
logger.warn("Assassinating {} via gossip", endpoint);
|
||||
if (es) {
|
||||
auto& ss = service::get_local_storage_service();
|
||||
auto tokens = ss.get_token_metadata().get_tokens(endpoint);
|
||||
tokens = ss.get_token_metadata().get_tokens(endpoint);
|
||||
if (tokens.empty()) {
|
||||
logger.warn("Unable to calculate tokens for {}. Will use a random one", address);
|
||||
throw std::runtime_error(sprint("Unable to calculate tokens for %s", endpoint));
|
||||
|
||||
@@ -100,7 +100,6 @@ future<> ec2_multi_region_snitch::gossiper_starting() {
|
||||
// Note: currently gossiper "main" instance always runs on CPU0 therefore
|
||||
// this function will be executed on CPU0 only.
|
||||
//
|
||||
ec2_snitch::gossiper_starting();
|
||||
|
||||
using namespace gms;
|
||||
auto& g = get_local_gossiper();
|
||||
|
||||
@@ -110,7 +110,11 @@ void token_metadata::update_normal_tokens(std::unordered_map<inet_address, std::
|
||||
inet_address endpoint = i.first;
|
||||
std::unordered_set<token>& tokens = i.second;
|
||||
|
||||
assert(!tokens.empty());
|
||||
if (tokens.empty()) {
|
||||
auto msg = sprint("tokens is empty in update_normal_tokens");
|
||||
tlogger.error("{}", msg);
|
||||
throw std::runtime_error(msg);
|
||||
}
|
||||
|
||||
for(auto it = _token_to_endpoint_map.begin(), ite = _token_to_endpoint_map.end(); it != ite;) {
|
||||
if(it->second == endpoint) {
|
||||
@@ -141,7 +145,11 @@ void token_metadata::update_normal_tokens(std::unordered_map<inet_address, std::
|
||||
}
|
||||
|
||||
size_t token_metadata::first_token_index(const token& start) const {
|
||||
assert(_sorted_tokens.size() > 0);
|
||||
if (_sorted_tokens.empty()) {
|
||||
auto msg = sprint("sorted_tokens is empty in first_token_index!");
|
||||
tlogger.error("{}", msg);
|
||||
throw std::runtime_error(msg);
|
||||
}
|
||||
auto it = std::lower_bound(_sorted_tokens.begin(), _sorted_tokens.end(), start);
|
||||
if (it == _sorted_tokens.end()) {
|
||||
return 0;
|
||||
@@ -292,7 +300,11 @@ void token_metadata::add_bootstrap_tokens(std::unordered_set<token> tokens, inet
|
||||
}
|
||||
|
||||
void token_metadata::remove_bootstrap_tokens(std::unordered_set<token> tokens) {
|
||||
assert(!tokens.empty());
|
||||
if (tokens.empty()) {
|
||||
auto msg = sprint("tokens is empty in remove_bootstrap_tokens!");
|
||||
tlogger.error("{}", msg);
|
||||
throw std::runtime_error(msg);
|
||||
}
|
||||
for (auto t : tokens) {
|
||||
_bootstrap_tokens.erase(t);
|
||||
}
|
||||
@@ -320,7 +332,11 @@ void token_metadata::remove_from_moving(inet_address endpoint) {
|
||||
token token_metadata::get_predecessor(token t) {
|
||||
auto& tokens = sorted_tokens();
|
||||
auto it = std::lower_bound(tokens.begin(), tokens.end(), t);
|
||||
assert(it != tokens.end() && *it == t);
|
||||
if (it == tokens.end() || *it != t) {
|
||||
auto msg = sprint("token error in get_predecessor!");
|
||||
tlogger.error("{}", msg);
|
||||
throw std::runtime_error(msg);
|
||||
}
|
||||
if (it == tokens.begin()) {
|
||||
// If the token is the first element, its predecessor is the last element
|
||||
return tokens.back();
|
||||
|
||||
@@ -514,7 +514,6 @@ shared_ptr<messaging_service::rpc_protocol_client_wrapper> messaging_service::ge
|
||||
}();
|
||||
|
||||
auto remote_addr = ipv4_addr(get_preferred_ip(id.addr).raw_addr(), must_encrypt ? _ssl_port : _port);
|
||||
auto local_addr = ipv4_addr{_listen_address.raw_addr(), 0};
|
||||
|
||||
rpc::client_options opts;
|
||||
// send keepalive messages each minute if connection is idle, drop connection after 10 failures
|
||||
@@ -526,9 +525,9 @@ shared_ptr<messaging_service::rpc_protocol_client_wrapper> messaging_service::ge
|
||||
|
||||
auto client = must_encrypt ?
|
||||
::make_shared<rpc_protocol_client_wrapper>(*_rpc, std::move(opts),
|
||||
remote_addr, local_addr, _credentials) :
|
||||
remote_addr, ipv4_addr(), _credentials) :
|
||||
::make_shared<rpc_protocol_client_wrapper>(*_rpc, std::move(opts),
|
||||
remote_addr, local_addr);
|
||||
remote_addr);
|
||||
|
||||
it = _clients[idx].emplace(id, shard_info(std::move(client))).first;
|
||||
uint32_t src_cpu_id = engine().cpu_id();
|
||||
@@ -640,59 +639,6 @@ auto send_message_timeout(messaging_service* ms, messaging_verb verb, msg_addr i
|
||||
});
|
||||
}
|
||||
|
||||
template <typename MsgIn, typename... MsgOut>
|
||||
auto send_message_timeout_and_retry(messaging_service* ms, messaging_verb verb, msg_addr id,
|
||||
std::chrono::seconds timeout, int nr_retry, std::chrono::seconds wait, MsgOut... msg) {
|
||||
using MsgInTuple = typename futurize_t<MsgIn>::value_type;
|
||||
return do_with(int(nr_retry), std::move(msg)..., [ms, verb, id, timeout, wait, nr_retry] (auto& retry, const auto&... messages) {
|
||||
return repeat_until_value([ms, verb, id, timeout, wait, nr_retry, &retry, &messages...] {
|
||||
return send_message_timeout<MsgIn>(ms, verb, id, timeout, messages...).then_wrapped(
|
||||
[ms, verb, id, timeout, wait, nr_retry, &retry] (auto&& f) mutable {
|
||||
auto vb = int(verb);
|
||||
try {
|
||||
MsgInTuple ret = f.get();
|
||||
if (retry != nr_retry) {
|
||||
mlogger.info("Retry verb={} to {}, retry={}: OK", vb, id, retry);
|
||||
}
|
||||
return make_ready_future<stdx::optional<MsgInTuple>>(std::move(ret));
|
||||
} catch (rpc::timeout_error) {
|
||||
mlogger.info("Retry verb={} to {}, retry={}: timeout in {} seconds", vb, id, retry, timeout.count());
|
||||
throw;
|
||||
} catch (rpc::closed_error) {
|
||||
mlogger.info("Retry verb={} to {}, retry={}: {}", vb, id, retry, std::current_exception());
|
||||
// Stop retrying if retry reaches 0 or message service is shutdown
|
||||
// or the remote node is removed from gossip (on_remove())
|
||||
retry--;
|
||||
if (retry == 0) {
|
||||
mlogger.debug("Retry verb={} to {}, retry={}: stop retrying: retry == 0", vb, id, retry);
|
||||
throw;
|
||||
}
|
||||
if (ms->is_stopping()) {
|
||||
mlogger.debug("Retry verb={} to {}, retry={}: stop retrying: messaging_service is stopped",
|
||||
vb, id, retry);
|
||||
throw;
|
||||
}
|
||||
if (!gms::get_local_gossiper().is_known_endpoint(id.addr)) {
|
||||
mlogger.debug("Retry verb={} to {}, retry={}: stop retrying: node is removed from the cluster",
|
||||
vb, id, retry);
|
||||
throw;
|
||||
}
|
||||
return sleep_abortable(wait).then([] {
|
||||
return make_ready_future<stdx::optional<MsgInTuple>>(stdx::nullopt);
|
||||
}).handle_exception([vb, id, retry] (std::exception_ptr ep) {
|
||||
mlogger.debug("Retry verb={} to {}, retry={}: stop retrying: {}", vb, id, retry, ep);
|
||||
return make_exception_future<stdx::optional<MsgInTuple>>(ep);
|
||||
});
|
||||
} catch (...) {
|
||||
throw;
|
||||
}
|
||||
});
|
||||
}).then([ms = ms->shared_from_this()] (MsgInTuple result) {
|
||||
return futurize<MsgIn>::from_tuple(std::move(result));
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
// Send one way message for verb
|
||||
template <typename... MsgOut>
|
||||
auto send_message_oneway(messaging_service* ms, messaging_verb verb, msg_addr id, MsgOut&&... msg) {
|
||||
@@ -707,13 +653,6 @@ auto send_message_oneway_timeout(messaging_service* ms, Timeout timeout, messagi
|
||||
|
||||
// Wrappers for verbs
|
||||
|
||||
// Retransmission parameters for streaming verbs.
|
||||
// A stream plan gives up retrying in 10*30 + 10*60 seconds (15 minutes) at
|
||||
// most, 10*30 seconds (5 minutes) at least.
|
||||
static constexpr int streaming_nr_retry = 10;
|
||||
static constexpr std::chrono::seconds streaming_timeout{10*60};
|
||||
static constexpr std::chrono::seconds streaming_wait_before_retry{30};
|
||||
|
||||
// PREPARE_MESSAGE
|
||||
void messaging_service::register_prepare_message(std::function<future<streaming::prepare_message> (const rpc::client_info& cinfo,
|
||||
streaming::prepare_message msg, UUID plan_id, sstring description)>&& func) {
|
||||
@@ -721,8 +660,7 @@ void messaging_service::register_prepare_message(std::function<future<streaming:
|
||||
}
|
||||
future<streaming::prepare_message> messaging_service::send_prepare_message(msg_addr id, streaming::prepare_message msg, UUID plan_id,
|
||||
sstring description) {
|
||||
return send_message_timeout_and_retry<streaming::prepare_message>(this, messaging_verb::PREPARE_MESSAGE, id,
|
||||
streaming_timeout, streaming_nr_retry, streaming_wait_before_retry,
|
||||
return send_message<streaming::prepare_message>(this, messaging_verb::PREPARE_MESSAGE, id,
|
||||
std::move(msg), plan_id, std::move(description));
|
||||
}
|
||||
|
||||
@@ -731,8 +669,7 @@ void messaging_service::register_prepare_done_message(std::function<future<> (co
|
||||
register_handler(this, messaging_verb::PREPARE_DONE_MESSAGE, std::move(func));
|
||||
}
|
||||
future<> messaging_service::send_prepare_done_message(msg_addr id, UUID plan_id, unsigned dst_cpu_id) {
|
||||
return send_message_timeout_and_retry<void>(this, messaging_verb::PREPARE_DONE_MESSAGE, id,
|
||||
streaming_timeout, streaming_nr_retry, streaming_wait_before_retry,
|
||||
return send_message<void>(this, messaging_verb::PREPARE_DONE_MESSAGE, id,
|
||||
plan_id, dst_cpu_id);
|
||||
}
|
||||
|
||||
@@ -741,8 +678,7 @@ void messaging_service::register_stream_mutation(std::function<future<> (const r
|
||||
register_handler(this, messaging_verb::STREAM_MUTATION, std::move(func));
|
||||
}
|
||||
future<> messaging_service::send_stream_mutation(msg_addr id, UUID plan_id, frozen_mutation fm, unsigned dst_cpu_id, bool fragmented) {
|
||||
return send_message_timeout_and_retry<void>(this, messaging_verb::STREAM_MUTATION, id,
|
||||
streaming_timeout, streaming_nr_retry, streaming_wait_before_retry,
|
||||
return send_message<void>(this, messaging_verb::STREAM_MUTATION, id,
|
||||
plan_id, std::move(fm), dst_cpu_id, fragmented);
|
||||
}
|
||||
|
||||
@@ -757,19 +693,17 @@ void messaging_service::register_stream_mutation_done(std::function<future<> (co
|
||||
});
|
||||
}
|
||||
future<> messaging_service::send_stream_mutation_done(msg_addr id, UUID plan_id, dht::token_range_vector ranges, UUID cf_id, unsigned dst_cpu_id) {
|
||||
return send_message_timeout_and_retry<void>(this, messaging_verb::STREAM_MUTATION_DONE, id,
|
||||
streaming_timeout, streaming_nr_retry, streaming_wait_before_retry,
|
||||
return send_message<void>(this, messaging_verb::STREAM_MUTATION_DONE, id,
|
||||
plan_id, std::move(ranges), cf_id, dst_cpu_id);
|
||||
}
|
||||
|
||||
// COMPLETE_MESSAGE
|
||||
void messaging_service::register_complete_message(std::function<future<> (const rpc::client_info& cinfo, UUID plan_id, unsigned dst_cpu_id)>&& func) {
|
||||
void messaging_service::register_complete_message(std::function<future<> (const rpc::client_info& cinfo, UUID plan_id, unsigned dst_cpu_id, rpc::optional<bool> failed)>&& func) {
|
||||
register_handler(this, messaging_verb::COMPLETE_MESSAGE, std::move(func));
|
||||
}
|
||||
future<> messaging_service::send_complete_message(msg_addr id, UUID plan_id, unsigned dst_cpu_id) {
|
||||
return send_message_timeout_and_retry<void>(this, messaging_verb::COMPLETE_MESSAGE, id,
|
||||
streaming_timeout, streaming_nr_retry, streaming_wait_before_retry,
|
||||
plan_id, dst_cpu_id);
|
||||
future<> messaging_service::send_complete_message(msg_addr id, UUID plan_id, unsigned dst_cpu_id, bool failed) {
|
||||
return send_message<void>(this, messaging_verb::COMPLETE_MESSAGE, id,
|
||||
plan_id, dst_cpu_id, failed);
|
||||
}
|
||||
|
||||
void messaging_service::register_gossip_echo(std::function<future<> ()>&& func) {
|
||||
|
||||
@@ -249,8 +249,8 @@ public:
|
||||
void register_stream_mutation_done(std::function<future<> (const rpc::client_info& cinfo, UUID plan_id, dht::token_range_vector ranges, UUID cf_id, unsigned dst_cpu_id)>&& func);
|
||||
future<> send_stream_mutation_done(msg_addr id, UUID plan_id, dht::token_range_vector ranges, UUID cf_id, unsigned dst_cpu_id);
|
||||
|
||||
void register_complete_message(std::function<future<> (const rpc::client_info& cinfo, UUID plan_id, unsigned dst_cpu_id)>&& func);
|
||||
future<> send_complete_message(msg_addr id, UUID plan_id, unsigned dst_cpu_id);
|
||||
void register_complete_message(std::function<future<> (const rpc::client_info& cinfo, UUID plan_id, unsigned dst_cpu_id, rpc::optional<bool> failed)>&& func);
|
||||
future<> send_complete_message(msg_addr id, UUID plan_id, unsigned dst_cpu_id, bool failed = false);
|
||||
|
||||
// Wrapper for REPAIR_CHECKSUM_RANGE verb
|
||||
void register_repair_checksum_range(std::function<future<partition_checksum> (sstring keyspace, sstring cf, dht::token_range range, rpc::optional<repair_checksum> hash_version)>&& func);
|
||||
|
||||
@@ -545,13 +545,19 @@ lw_shared_ptr<partition_snapshot> partition_entry::read(schema_ptr entry_schema,
|
||||
std::vector<range_tombstone>
|
||||
partition_snapshot::range_tombstones(const schema& s, position_in_partition_view start, position_in_partition_view end)
|
||||
{
|
||||
partition_version* v = &*version();
|
||||
if (!v->next()) {
|
||||
return boost::copy_range<std::vector<range_tombstone>>(
|
||||
v->partition().row_tombstones().slice(s, start, end));
|
||||
}
|
||||
range_tombstone_list list(s);
|
||||
for (auto&& v : versions()) {
|
||||
for (auto&& rt : v.partition().row_tombstones().slice(s, start, end)) {
|
||||
while (v) {
|
||||
for (auto&& rt : v->partition().row_tombstones().slice(s, start, end)) {
|
||||
list.apply(s, rt);
|
||||
}
|
||||
v = v->next();
|
||||
}
|
||||
return boost::copy_range<std::vector<range_tombstone>>(list);
|
||||
return boost::copy_range<std::vector<range_tombstone>>(list.slice(s, start, end));
|
||||
}
|
||||
|
||||
std::ostream& operator<<(std::ostream& out, partition_entry& e) {
|
||||
|
||||
@@ -124,6 +124,7 @@ void range_tombstone_list::insert_from(const schema& s,
|
||||
if (less(end_bound, it->end_bound())) {
|
||||
end = it->end;
|
||||
end_kind = it->end_kind;
|
||||
end_bound = bound_view(end, end_kind);
|
||||
}
|
||||
it = rev.erase(it);
|
||||
} else if (c > 0) {
|
||||
|
||||
2
seastar
2
seastar
Submodule seastar updated: 04896556c6...da2e1afaa8
@@ -87,6 +87,7 @@ static const sstring COUNTERS_FEATURE = "COUNTERS";
|
||||
static const sstring INDEXES_FEATURE = "INDEXES";
|
||||
static const sstring CORRECT_COUNTER_ORDER_FEATURE = "CORRECT_COUNTER_ORDER";
|
||||
static const sstring SCHEMA_TABLES_V3 = "SCHEMA_TABLES_V3";
|
||||
static const sstring CORRECT_NON_COMPOUND_RANGE_TOMBSTONES = "CORRECT_NON_COMPOUND_RANGE_TOMBSTONES";
|
||||
|
||||
distributed<storage_service> _the_storage_service;
|
||||
|
||||
@@ -129,7 +130,8 @@ sstring storage_service::get_config_supported_features() {
|
||||
LARGE_PARTITIONS_FEATURE,
|
||||
COUNTERS_FEATURE,
|
||||
CORRECT_COUNTER_ORDER_FEATURE,
|
||||
SCHEMA_TABLES_V3
|
||||
SCHEMA_TABLES_V3,
|
||||
CORRECT_NON_COMPOUND_RANGE_TOMBSTONES,
|
||||
};
|
||||
if (service::get_local_storage_service()._db.local().get_config().experimental()) {
|
||||
features.push_back(MATERIALIZED_VIEWS_FEATURE);
|
||||
@@ -339,6 +341,7 @@ void storage_service::register_features() {
|
||||
_counters_feature = gms::feature(COUNTERS_FEATURE);
|
||||
_correct_counter_order_feature = gms::feature(CORRECT_COUNTER_ORDER_FEATURE);
|
||||
_schema_tables_v3 = gms::feature(SCHEMA_TABLES_V3);
|
||||
_correct_non_compound_range_tombstones = gms::feature(CORRECT_NON_COMPOUND_RANGE_TOMBSTONES);
|
||||
|
||||
if (_db.local().get_config().experimental()) {
|
||||
_materialized_views_feature = gms::feature(MATERIALIZED_VIEWS_FEATURE);
|
||||
@@ -926,7 +929,17 @@ void storage_service::handle_state_removing(inet_address endpoint, std::vector<s
|
||||
slogger.warn("{}", err);
|
||||
throw std::runtime_error(err);
|
||||
}
|
||||
restore_replica_count(endpoint, ep.value()).get();
|
||||
// Kick off streaming commands. No need to wait for
|
||||
// restore_replica_count to complete which can take a long time,
|
||||
// since when it completes, this node will send notification to
|
||||
// tell the removal_coordinator with IP address notify_endpoint
|
||||
// that the restore process is finished on this node. This node
|
||||
// will be removed from _replicating_nodes on the
|
||||
// removal_coordinator.
|
||||
auto notify_endpoint = ep.value();
|
||||
restore_replica_count(endpoint, notify_endpoint).handle_exception([endpoint, notify_endpoint] (auto ep) {
|
||||
slogger.info("Failed to restore_replica_count for node {}, notify_endpoint={} : {}", endpoint, notify_endpoint, ep);
|
||||
});
|
||||
}
|
||||
} else { // now that the gossiper has told us about this nonexistent member, notify the gossiper to remove it
|
||||
if (sstring(gms::versioned_value::REMOVED_TOKEN) == pieces[0]) {
|
||||
@@ -978,6 +991,7 @@ void storage_service::on_change(inet_address endpoint, application_state state,
|
||||
boost::split(pieces, value.value, boost::is_any_of(sstring(versioned_value::DELIMITER_STR)));
|
||||
if (pieces.empty()) {
|
||||
slogger.warn("Fail to split status in on_change: endpoint={}, app_state={}, value={}", endpoint, state, value);
|
||||
return;
|
||||
}
|
||||
sstring move_name = pieces[0];
|
||||
if (move_name == sstring(versioned_value::STATUS_BOOTSTRAPPING)) {
|
||||
@@ -1026,8 +1040,8 @@ void storage_service::on_remove(gms::inet_address endpoint) {
|
||||
|
||||
void storage_service::on_dead(gms::inet_address endpoint, gms::endpoint_state state) {
|
||||
slogger.debug("endpoint={} on_dead", endpoint);
|
||||
netw::get_local_messaging_service().remove_rpc_client(netw::msg_addr{endpoint, 0});
|
||||
get_storage_service().invoke_on_all([endpoint] (auto&& ss) {
|
||||
netw::get_local_messaging_service().remove_rpc_client(netw::msg_addr{endpoint, 0});
|
||||
for (auto&& subscriber : ss._lifecycle_subscribers) {
|
||||
try {
|
||||
subscriber->on_down(endpoint);
|
||||
@@ -2345,15 +2359,12 @@ future<> storage_service::rebuild(sstring source_dc) {
|
||||
for (const auto& keyspace_name : ss._db.local().get_non_system_keyspaces()) {
|
||||
streamer->add_ranges(keyspace_name, ss.get_local_ranges(keyspace_name));
|
||||
}
|
||||
return streamer->fetch_async().then_wrapped([streamer] (auto&& f) {
|
||||
try {
|
||||
auto state = f.get0();
|
||||
} catch (...) {
|
||||
// This is used exclusively through JMX, so log the full trace but only throw a simple RTE
|
||||
slogger.error("Error while rebuilding node: {}", std::current_exception());
|
||||
throw std::runtime_error(sprint("Error while rebuilding node: %s", std::current_exception()));
|
||||
}
|
||||
return make_ready_future<>();
|
||||
return streamer->stream_async().then([streamer] {
|
||||
slogger.info("Streaming for rebuild successful");
|
||||
}).handle_exception([] (auto ep) {
|
||||
// This is used exclusively through JMX, so log the full trace but only throw a simple RTE
|
||||
slogger.warn("Error while rebuilding node: {}", std::current_exception());
|
||||
return make_exception_future<>(std::move(ep));
|
||||
});
|
||||
});
|
||||
}
|
||||
@@ -2480,10 +2491,8 @@ void storage_service::unbootstrap() {
|
||||
}
|
||||
|
||||
future<> storage_service::restore_replica_count(inet_address endpoint, inet_address notify_endpoint) {
|
||||
std::unordered_multimap<sstring, std::unordered_map<inet_address, dht::token_range_vector>> ranges_to_fetch;
|
||||
|
||||
auto streamer = make_lw_shared<dht::range_streamer>(_db, get_token_metadata(), get_broadcast_address(), "Restore_replica_count");
|
||||
auto my_address = get_broadcast_address();
|
||||
|
||||
auto non_system_keyspaces = _db.local().get_non_system_keyspaces();
|
||||
for (const auto& keyspace_name : non_system_keyspaces) {
|
||||
std::unordered_multimap<dht::token_range, inet_address> changed_ranges = get_changed_ranges_for_leaving(keyspace_name, endpoint);
|
||||
@@ -2494,26 +2503,15 @@ future<> storage_service::restore_replica_count(inet_address endpoint, inet_addr
|
||||
}
|
||||
}
|
||||
std::unordered_multimap<inet_address, dht::token_range> source_ranges = get_new_source_ranges(keyspace_name, my_new_ranges);
|
||||
std::unordered_map<inet_address, dht::token_range_vector> tmp;
|
||||
std::unordered_map<inet_address, dht::token_range_vector> ranges_per_endpoint;
|
||||
for (auto& x : source_ranges) {
|
||||
tmp[x.first].emplace_back(x.second);
|
||||
ranges_per_endpoint[x.first].emplace_back(x.second);
|
||||
}
|
||||
ranges_to_fetch.emplace(keyspace_name, std::move(tmp));
|
||||
streamer->add_rx_ranges(keyspace_name, std::move(ranges_per_endpoint));
|
||||
}
|
||||
auto sp = make_lw_shared<streaming::stream_plan>("Restore replica count");
|
||||
for (auto& x: ranges_to_fetch) {
|
||||
const sstring& keyspace_name = x.first;
|
||||
std::unordered_map<inet_address, dht::token_range_vector>& maps = x.second;
|
||||
for (auto& m : maps) {
|
||||
auto source = m.first;
|
||||
auto ranges = m.second;
|
||||
slogger.debug("Requesting from {} ranges {}", source, ranges);
|
||||
sp->request_ranges(source, keyspace_name, ranges);
|
||||
}
|
||||
}
|
||||
return sp->execute().then_wrapped([this, sp, notify_endpoint] (auto&& f) {
|
||||
return streamer->stream_async().then_wrapped([this, streamer, notify_endpoint] (auto&& f) {
|
||||
try {
|
||||
auto state = f.get0();
|
||||
f.get();
|
||||
return this->send_replication_notification(notify_endpoint);
|
||||
} catch (...) {
|
||||
slogger.warn("Streaming to restore replica count failed: {}", std::current_exception());
|
||||
@@ -2605,8 +2603,7 @@ void storage_service::leave_ring() {
|
||||
|
||||
future<>
|
||||
storage_service::stream_ranges(std::unordered_map<sstring, std::unordered_multimap<dht::token_range, inet_address>> ranges_to_stream_by_keyspace) {
|
||||
// First, we build a list of ranges to stream to each host, per table
|
||||
std::unordered_map<sstring, std::unordered_map<inet_address, dht::token_range_vector>> sessions_to_stream_by_keyspace;
|
||||
auto streamer = make_lw_shared<dht::range_streamer>(_db, get_token_metadata(), get_broadcast_address(), "Unbootstrap");
|
||||
for (auto& entry : ranges_to_stream_by_keyspace) {
|
||||
const auto& keyspace = entry.first;
|
||||
auto& ranges_with_endpoints = entry.second;
|
||||
@@ -2621,26 +2618,13 @@ storage_service::stream_ranges(std::unordered_map<sstring, std::unordered_multim
|
||||
inet_address endpoint = end_point_entry.second;
|
||||
ranges_per_endpoint[endpoint].emplace_back(r);
|
||||
}
|
||||
sessions_to_stream_by_keyspace.emplace(keyspace, std::move(ranges_per_endpoint));
|
||||
streamer->add_tx_ranges(keyspace, std::move(ranges_per_endpoint));
|
||||
}
|
||||
auto sp = make_lw_shared<streaming::stream_plan>("Unbootstrap");
|
||||
for (auto& entry : sessions_to_stream_by_keyspace) {
|
||||
const auto& keyspace_name = entry.first;
|
||||
// TODO: we can move to avoid copy of std::vector
|
||||
auto& ranges_per_endpoint = entry.second;
|
||||
|
||||
for (auto& ranges_entry : ranges_per_endpoint) {
|
||||
auto& ranges = ranges_entry.second;
|
||||
auto new_endpoint = ranges_entry.first;
|
||||
// TODO each call to transferRanges re-flushes, this is potentially a lot of waste
|
||||
sp->transfer_ranges(new_endpoint, keyspace_name, ranges);
|
||||
}
|
||||
}
|
||||
return sp->execute().discard_result().then([sp] {
|
||||
return streamer->stream_async().then([streamer] {
|
||||
slogger.info("stream_ranges successful");
|
||||
}).handle_exception([] (auto ep) {
|
||||
slogger.info("stream_ranges failed: {}", ep);
|
||||
return make_exception_future(std::runtime_error("stream_ranges failed"));
|
||||
slogger.warn("stream_ranges failed: {}", ep);
|
||||
return make_exception_future<>(std::move(ep));
|
||||
});
|
||||
}
|
||||
|
||||
@@ -2674,16 +2658,18 @@ future<> storage_service::stream_hints() {
|
||||
// stream all hints -- range list will be a singleton of "the entire ring"
|
||||
dht::token_range_vector ranges = {dht::token_range::make_open_ended_both_sides()};
|
||||
slogger.debug("stream_hints: ranges={}", ranges);
|
||||
std::unordered_map<inet_address, dht::token_range_vector> ranges_per_endpoint;
|
||||
ranges_per_endpoint[hints_destination_host] = std::move(ranges);
|
||||
|
||||
auto sp = make_lw_shared<streaming::stream_plan>("Hints");
|
||||
std::vector<sstring> column_families = { db::system_keyspace::HINTS };
|
||||
auto streamer = make_lw_shared<dht::range_streamer>(_db, get_token_metadata(), get_broadcast_address(), "Hints");
|
||||
auto keyspace = db::system_keyspace::NAME;
|
||||
sp->transfer_ranges(hints_destination_host, keyspace, ranges, column_families);
|
||||
return sp->execute().discard_result().then([sp] {
|
||||
std::vector<sstring> column_families = { db::system_keyspace::HINTS };
|
||||
streamer->add_tx_ranges(keyspace, std::move(ranges_per_endpoint), column_families);
|
||||
return streamer->stream_async().then([streamer] {
|
||||
slogger.info("stream_hints successful");
|
||||
}).handle_exception([] (auto ep) {
|
||||
slogger.info("stream_hints failed: {}", ep);
|
||||
return make_exception_future(std::runtime_error("stream_hints failed"));
|
||||
slogger.warn("stream_hints failed: {}", ep);
|
||||
return make_exception_future<>(std::move(ep));
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -266,6 +266,7 @@ private:
|
||||
gms::feature _indexes_feature;
|
||||
gms::feature _correct_counter_order_feature;
|
||||
gms::feature _schema_tables_v3;
|
||||
gms::feature _correct_non_compound_range_tombstones;
|
||||
public:
|
||||
void enable_all_features() {
|
||||
_range_tombstones_feature.enable();
|
||||
@@ -275,6 +276,7 @@ public:
|
||||
_indexes_feature.enable();
|
||||
_correct_counter_order_feature.enable();
|
||||
_schema_tables_v3.enable();
|
||||
_correct_non_compound_range_tombstones.enable();
|
||||
}
|
||||
|
||||
void finish_bootstrapping() {
|
||||
@@ -2236,6 +2238,10 @@ public:
|
||||
const gms::feature& cluster_supports_schema_tables_v3() const {
|
||||
return _schema_tables_v3;
|
||||
}
|
||||
|
||||
bool cluster_supports_reading_correctly_serialized_range_tombstones() const {
|
||||
return bool(_correct_non_compound_range_tombstones);
|
||||
}
|
||||
};
|
||||
|
||||
inline future<> init_storage_service(distributed<database>& db) {
|
||||
|
||||
@@ -24,6 +24,7 @@
|
||||
#include "consumer.hh"
|
||||
#include "downsampling.hh"
|
||||
#include "sstables/shared_index_lists.hh"
|
||||
#include <seastar/util/bool_class.hh>
|
||||
|
||||
namespace sstables {
|
||||
|
||||
@@ -47,12 +48,16 @@ public:
|
||||
}
|
||||
};
|
||||
|
||||
// See #2993
|
||||
class trust_promoted_index_tag;
|
||||
using trust_promoted_index = bool_class<trust_promoted_index_tag>;
|
||||
|
||||
// IndexConsumer is a concept that implements:
|
||||
//
|
||||
// bool should_continue();
|
||||
// void consume_entry(index_entry&& ie, uint64_t offset);
|
||||
template <class IndexConsumer>
|
||||
class index_consume_entry_context: public data_consumer::continuous_data_consumer<index_consume_entry_context<IndexConsumer>> {
|
||||
class index_consume_entry_context : public data_consumer::continuous_data_consumer<index_consume_entry_context<IndexConsumer>> {
|
||||
using proceed = data_consumer::proceed;
|
||||
using continuous_data_consumer = data_consumer::continuous_data_consumer<index_consume_entry_context<IndexConsumer>>;
|
||||
private:
|
||||
@@ -72,6 +77,8 @@ private:
|
||||
temporary_buffer<char> _key;
|
||||
temporary_buffer<char> _promoted;
|
||||
|
||||
trust_promoted_index _trust_pi;
|
||||
|
||||
public:
|
||||
void verify_end_state() {
|
||||
}
|
||||
@@ -117,6 +124,9 @@ public:
|
||||
}
|
||||
case state::CONSUME_ENTRY: {
|
||||
auto len = (_key.size() + _promoted.size() + 14);
|
||||
if (_trust_pi == trust_promoted_index::no) {
|
||||
_promoted = temporary_buffer<char>();
|
||||
}
|
||||
_consumer.consume_entry(index_entry(std::move(_key), this->_u64, std::move(_promoted)), _entry_offset);
|
||||
_entry_offset += len;
|
||||
_state = state::START;
|
||||
@@ -128,10 +138,10 @@ public:
|
||||
return proceed::yes;
|
||||
}
|
||||
|
||||
index_consume_entry_context(IndexConsumer& consumer,
|
||||
index_consume_entry_context(IndexConsumer& consumer, trust_promoted_index trust_pi,
|
||||
input_stream<char>&& input, uint64_t start, uint64_t maxlen)
|
||||
: continuous_data_consumer(std::move(input), start, maxlen)
|
||||
, _consumer(consumer), _entry_offset(start)
|
||||
, _consumer(consumer), _entry_offset(start), _trust_pi(trust_pi)
|
||||
{}
|
||||
|
||||
void reset(uint64_t offset) {
|
||||
@@ -196,7 +206,9 @@ class index_reader {
|
||||
|
||||
reader(shared_sstable sst, const io_priority_class& pc, uint64_t begin, uint64_t end, uint64_t quantity)
|
||||
: _consumer(quantity)
|
||||
, _context(_consumer, create_file_input_stream(sst, pc, begin, end), begin, end - begin)
|
||||
, _context(_consumer,
|
||||
trust_promoted_index(sst->has_correct_promoted_index_entries()),
|
||||
create_file_input_stream(sst, pc, begin, end), begin, end - begin)
|
||||
{ }
|
||||
};
|
||||
|
||||
@@ -514,6 +526,7 @@ public:
|
||||
}
|
||||
if (_current_index_idx + 1 < _current_list->size()) {
|
||||
++_current_index_idx;
|
||||
_current_pi_idx = 0;
|
||||
_data_file_position = (*_current_list)[_current_index_idx].position();
|
||||
_element = indexable_element::partition;
|
||||
return make_ready_future<>();
|
||||
|
||||
@@ -97,6 +97,9 @@ private:
|
||||
// _range_tombstones holds only tombstones which are relevant for current ranges.
|
||||
range_tombstone_stream _range_tombstones;
|
||||
bool _first_row_encountered = false;
|
||||
|
||||
// See #2986
|
||||
bool _treat_non_compound_rt_as_compound;
|
||||
public:
|
||||
void set_streamed_mutation(sstable_streamed_mutation* sm) {
|
||||
_sm = sm;
|
||||
@@ -306,18 +309,21 @@ public:
|
||||
mp_row_consumer(const schema_ptr schema,
|
||||
const query::partition_slice& slice,
|
||||
const io_priority_class& pc,
|
||||
streamed_mutation::forwarding fwd)
|
||||
streamed_mutation::forwarding fwd,
|
||||
const shared_sstable& sst)
|
||||
: _schema(schema)
|
||||
, _pc(pc)
|
||||
, _slice(slice)
|
||||
, _fwd(fwd)
|
||||
, _range_tombstones(*_schema)
|
||||
, _treat_non_compound_rt_as_compound(!sst->has_correct_non_compound_range_tombstones())
|
||||
{ }
|
||||
|
||||
mp_row_consumer(const schema_ptr schema,
|
||||
const io_priority_class& pc,
|
||||
streamed_mutation::forwarding fwd)
|
||||
: mp_row_consumer(schema, query::full_slice, pc, fwd) { }
|
||||
streamed_mutation::forwarding fwd,
|
||||
const shared_sstable& sst)
|
||||
: mp_row_consumer(schema, query::full_slice, pc, fwd, sst) { }
|
||||
|
||||
virtual proceed consume_row_start(sstables::key_view key, sstables::deletion_time deltime) override {
|
||||
if (!_is_mutation_end) {
|
||||
@@ -621,7 +627,8 @@ public:
|
||||
return proceed::yes;
|
||||
}
|
||||
|
||||
auto start = composite_view(column::fix_static_name(*_schema, start_col)).explode();
|
||||
auto compound = _schema->is_compound() || _treat_non_compound_rt_as_compound;
|
||||
auto start = composite_view(column::fix_static_name(*_schema, start_col), compound).explode();
|
||||
|
||||
// Note how this is slightly different from the check in is_collection. Collection tombstones
|
||||
// do not have extra data.
|
||||
@@ -630,9 +637,9 @@ public:
|
||||
// won't have a full clustering prefix (otherwise it isn't a range)
|
||||
if (start.size() <= _schema->clustering_key_size()) {
|
||||
auto start_ck = clustering_key_prefix::from_exploded_view(start);
|
||||
auto start_kind = start_marker_to_bound_kind(start_col);
|
||||
auto end = clustering_key_prefix::from_exploded_view(composite_view(column::fix_static_name(*_schema, end_col)).explode());
|
||||
auto end_kind = end_marker_to_bound_kind(end_col);
|
||||
auto start_kind = compound ? start_marker_to_bound_kind(start_col) : bound_kind::incl_start;
|
||||
auto end = clustering_key_prefix::from_exploded_view(composite_view(column::fix_static_name(*_schema, end_col), compound).explode());
|
||||
auto end_kind = compound ? end_marker_to_bound_kind(end_col) : bound_kind::incl_end;
|
||||
if (range_tombstone::is_single_clustering_row_tombstone(*_schema, start_ck, start_kind, end, end_kind)) {
|
||||
auto ret = flush_if_needed(std::move(start_ck));
|
||||
if (!_skip_in_progress) {
|
||||
@@ -1050,7 +1057,7 @@ public:
|
||||
const io_priority_class &pc,
|
||||
streamed_mutation::forwarding fwd)
|
||||
: _get_data_source([this, sst = std::move(sst), s = std::move(schema), toread, last_end, &pc, fwd] {
|
||||
auto consumer = mp_row_consumer(s, query::full_slice, pc, fwd);
|
||||
auto consumer = mp_row_consumer(s, query::full_slice, pc, fwd, sst);
|
||||
auto ds = make_lw_shared<sstable_data_source>(std::move(s), std::move(sst), std::move(consumer), std::move(toread), last_end);
|
||||
return make_ready_future<lw_shared_ptr<sstable_data_source>>(std::move(ds));
|
||||
}) { }
|
||||
@@ -1058,7 +1065,7 @@ public:
|
||||
const io_priority_class &pc,
|
||||
streamed_mutation::forwarding fwd)
|
||||
: _get_data_source([this, sst = std::move(sst), s = std::move(schema), &pc, fwd] {
|
||||
auto consumer = mp_row_consumer(s, query::full_slice, pc, fwd);
|
||||
auto consumer = mp_row_consumer(s, query::full_slice, pc, fwd, sst);
|
||||
auto ds = make_lw_shared<sstable_data_source>(std::move(s), std::move(sst), std::move(consumer));
|
||||
return make_ready_future<lw_shared_ptr<sstable_data_source>>(std::move(ds));
|
||||
}) { }
|
||||
@@ -1076,7 +1083,7 @@ public:
|
||||
return f.then([this, lh_index = std::move(lh_index), rh_index = std::move(rh_index), sst = std::move(sst), s = std::move(s), &pc, &slice, fwd, fwd_mr] () mutable {
|
||||
sstable::disk_read_range drr{lh_index->data_file_position(),
|
||||
rh_index->data_file_position()};
|
||||
auto consumer = mp_row_consumer(s, slice, pc, fwd);
|
||||
auto consumer = mp_row_consumer(s, slice, pc, fwd, sst);
|
||||
auto ds = make_lw_shared<sstable_data_source>(std::move(s), std::move(sst), std::move(consumer), drr, (fwd_mr ? sst->data_size() : drr.end), std::move(lh_index), std::move(rh_index));
|
||||
ds->_index_in_current_partition = true;
|
||||
ds->_will_likely_slice = sstable_data_source::will_likely_slice(slice);
|
||||
@@ -1271,7 +1278,7 @@ sstables::sstable::read_row(schema_ptr schema,
|
||||
auto rh_index = std::make_unique<index_reader>(*lh_index);
|
||||
auto f = advance_to_upper_bound(*rh_index, *_schema, slice, key);
|
||||
return f.then([this, &slice, &pc, fwd, lh_index = std::move(lh_index), rh_index = std::move(rh_index), s = std::move(s)] () mutable {
|
||||
auto consumer = mp_row_consumer(s, slice, pc, fwd);
|
||||
auto consumer = mp_row_consumer(s, slice, pc, fwd, shared_from_this());
|
||||
auto ds = make_lw_shared<sstable_data_source>(sstable_data_source::single_partition_tag(), std::move(s),
|
||||
shared_from_this(), std::move(consumer), std::move(lh_index), std::move(rh_index));
|
||||
ds->_will_likely_slice = sstable_data_source::will_likely_slice(slice);
|
||||
|
||||
@@ -21,10 +21,9 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
#include <seastar/core/shared_future.hh>
|
||||
#include <seastar/core/future.hh>
|
||||
#include "utils/loading_shared_values.hh"
|
||||
|
||||
namespace sstables {
|
||||
|
||||
@@ -36,50 +35,26 @@ using index_list = std::vector<index_entry>;
|
||||
class shared_index_lists {
|
||||
public:
|
||||
using key_type = uint64_t;
|
||||
struct stats {
|
||||
static thread_local struct stats {
|
||||
uint64_t hits = 0; // Number of times entry was found ready
|
||||
uint64_t misses = 0; // Number of times entry was not found
|
||||
uint64_t blocks = 0; // Number of times entry was not ready (>= misses)
|
||||
};
|
||||
private:
|
||||
class entry : public enable_lw_shared_from_this<entry> {
|
||||
public:
|
||||
key_type key;
|
||||
index_list list;
|
||||
shared_promise<> loaded;
|
||||
shared_index_lists& parent;
|
||||
} _shard_stats;
|
||||
|
||||
entry(shared_index_lists& parent, key_type key)
|
||||
: key(key), parent(parent)
|
||||
{ }
|
||||
~entry() {
|
||||
parent._lists.erase(key);
|
||||
}
|
||||
bool operator==(const entry& e) const { return key == e.key; }
|
||||
bool operator!=(const entry& e) const { return key != e.key; }
|
||||
struct stats_updater {
|
||||
static void inc_hits() noexcept { ++_shard_stats.hits; }
|
||||
static void inc_misses() noexcept { ++_shard_stats.misses; }
|
||||
static void inc_blocks() noexcept { ++_shard_stats.blocks; }
|
||||
static void inc_evictions() noexcept {}
|
||||
};
|
||||
std::unordered_map<key_type, entry*> _lists;
|
||||
static thread_local stats _shard_stats;
|
||||
public:
|
||||
|
||||
using loading_shared_lists_type = utils::loading_shared_values<key_type, index_list, std::hash<key_type>, std::equal_to<key_type>, stats_updater>;
|
||||
// Pointer to index_list
|
||||
class list_ptr {
|
||||
lw_shared_ptr<entry> _e;
|
||||
public:
|
||||
using element_type = index_list;
|
||||
list_ptr() = default;
|
||||
explicit list_ptr(lw_shared_ptr<entry> e) : _e(std::move(e)) {}
|
||||
explicit operator bool() const { return static_cast<bool>(_e); }
|
||||
index_list& operator*() { return _e->list; }
|
||||
const index_list& operator*() const { return _e->list; }
|
||||
index_list* operator->() { return &_e->list; }
|
||||
const index_list* operator->() const { return &_e->list; }
|
||||
using list_ptr = loading_shared_lists_type::entry_ptr;
|
||||
private:
|
||||
|
||||
index_list release() {
|
||||
auto res = _e.owned() ? index_list(std::move(_e->list)) : index_list(_e->list);
|
||||
_e = {};
|
||||
return std::move(res);
|
||||
}
|
||||
};
|
||||
loading_shared_lists_type _lists;
|
||||
public:
|
||||
|
||||
shared_index_lists() = default;
|
||||
shared_index_lists(shared_index_lists&&) = delete;
|
||||
@@ -93,41 +68,8 @@ public:
|
||||
//
|
||||
// The loader object does not survive deferring, so the caller must deal with its liveness.
|
||||
template<typename Loader>
|
||||
future<list_ptr> get_or_load(key_type key, Loader&& loader) {
|
||||
auto i = _lists.find(key);
|
||||
lw_shared_ptr<entry> e;
|
||||
auto f = [&] {
|
||||
if (i != _lists.end()) {
|
||||
e = i->second->shared_from_this();
|
||||
return e->loaded.get_shared_future();
|
||||
} else {
|
||||
++_shard_stats.misses;
|
||||
e = make_lw_shared<entry>(*this, key);
|
||||
auto f = e->loaded.get_shared_future();
|
||||
auto res = _lists.emplace(key, e.get());
|
||||
assert(res.second);
|
||||
futurize_apply(loader, key).then_wrapped([e](future<index_list>&& f) mutable {
|
||||
if (f.failed()) {
|
||||
e->loaded.set_exception(f.get_exception());
|
||||
} else {
|
||||
e->list = f.get0();
|
||||
e->loaded.set_value();
|
||||
}
|
||||
});
|
||||
return f;
|
||||
}
|
||||
}();
|
||||
if (!f.available()) {
|
||||
++_shard_stats.blocks;
|
||||
return f.then([e]() mutable {
|
||||
return list_ptr(std::move(e));
|
||||
});
|
||||
} else if (f.failed()) {
|
||||
return make_exception_future<list_ptr>(std::move(f).get_exception());
|
||||
} else {
|
||||
++_shard_stats.hits;
|
||||
return make_ready_future<list_ptr>(list_ptr(std::move(e)));
|
||||
}
|
||||
future<list_ptr> get_or_load(const key_type& key, Loader&& loader) {
|
||||
return _lists.get_or_load(key, std::forward<Loader>(loader));
|
||||
}
|
||||
|
||||
static const stats& shard_stats() { return _shard_stats; }
|
||||
|
||||
@@ -156,6 +156,12 @@ public:
|
||||
}
|
||||
};
|
||||
|
||||
shared_sstable
|
||||
make_sstable(schema_ptr schema, sstring dir, int64_t generation, sstable::version_types v, sstable::format_types f, gc_clock::time_point now,
|
||||
io_error_handler_gen error_handler_gen, size_t buffer_size) {
|
||||
return make_lw_shared<sstable>(std::move(schema), std::move(dir), generation, v, f, now, std::move(error_handler_gen), buffer_size);
|
||||
}
|
||||
|
||||
std::unordered_map<sstable::version_types, sstring, enum_hash<sstable::version_types>> sstable::_version_string = {
|
||||
{ sstable::version_types::ka , "ka" },
|
||||
{ sstable::version_types::la , "la" }
|
||||
@@ -1279,6 +1285,110 @@ static composite::eoc bound_kind_to_end_marker(bound_kind end_kind) {
|
||||
: composite::eoc::end;
|
||||
}
|
||||
|
||||
class bytes_writer_for_column_name {
|
||||
bytes _buf;
|
||||
bytes::iterator _pos;
|
||||
public:
|
||||
void prepare(size_t size) {
|
||||
_buf = bytes(bytes::initialized_later(), size);
|
||||
_pos = _buf.begin();
|
||||
}
|
||||
|
||||
template<typename... Args>
|
||||
void write(Args&&... args) {
|
||||
auto write_one = [this] (bytes_view data) {
|
||||
_pos = std::copy(data.begin(), data.end(), _pos);
|
||||
};
|
||||
auto ignore = { (write_one(bytes_view(args)), 0)... };
|
||||
(void)ignore;
|
||||
}
|
||||
|
||||
bytes&& release() && {
|
||||
return std::move(_buf);
|
||||
}
|
||||
};
|
||||
|
||||
class file_writer_for_column_name {
|
||||
file_writer& _fw;
|
||||
public:
|
||||
file_writer_for_column_name(file_writer& fw) : _fw(fw) { }
|
||||
|
||||
void prepare(uint16_t size) {
|
||||
sstables::write(_fw, size);
|
||||
}
|
||||
|
||||
template<typename... Args>
|
||||
void write(Args&&... args) {
|
||||
sstables::write(_fw, std::forward<Args>(args)...);
|
||||
}
|
||||
};
|
||||
|
||||
template<typename Writer>
|
||||
static void write_compound_non_dense_column_name(Writer& out, const composite& clustering_key, const std::vector<bytes_view>& column_names, composite::eoc marker = composite::eoc::none) {
|
||||
// was defined in the schema, for example.
|
||||
auto c = composite::from_exploded(column_names, true, marker);
|
||||
auto ck_bview = bytes_view(clustering_key);
|
||||
|
||||
// The marker is not a component, so if the last component is empty (IOW,
|
||||
// only serializes to the marker), then we just replace the key's last byte
|
||||
// with the marker. If the component however it is not empty, then the
|
||||
// marker should be in the end of it, and we just join them together as we
|
||||
// do for any normal component
|
||||
if (c.size() == 1) {
|
||||
ck_bview.remove_suffix(1);
|
||||
}
|
||||
size_t sz = ck_bview.size() + c.size();
|
||||
if (sz > std::numeric_limits<uint16_t>::max()) {
|
||||
throw std::runtime_error(sprint("Column name too large (%d > %d)", sz, std::numeric_limits<uint16_t>::max()));
|
||||
}
|
||||
out.prepare(uint16_t(sz));
|
||||
out.write(ck_bview, c);
|
||||
}
|
||||
|
||||
static void write_compound_non_dense_column_name(file_writer& out, const composite& clustering_key, const std::vector<bytes_view>& column_names, composite::eoc marker = composite::eoc::none) {
|
||||
auto w = file_writer_for_column_name(out);
|
||||
write_compound_non_dense_column_name(w, clustering_key, column_names, marker);
|
||||
}
|
||||
|
||||
template<typename Writer>
|
||||
static void write_column_name(Writer& out, bytes_view column_names) {
|
||||
size_t sz = column_names.size();
|
||||
if (sz > std::numeric_limits<uint16_t>::max()) {
|
||||
throw std::runtime_error(sprint("Column name too large (%d > %d)", sz, std::numeric_limits<uint16_t>::max()));
|
||||
}
|
||||
out.prepare(uint16_t(sz));
|
||||
out.write(column_names);
|
||||
}
|
||||
|
||||
static void write_column_name(file_writer& out, bytes_view column_names) {
|
||||
auto w = file_writer_for_column_name(out);
|
||||
write_column_name(w, column_names);
|
||||
}
|
||||
|
||||
template<typename Writer>
|
||||
static void write_column_name(Writer& out, const schema& s, const composite& clustering_element, const std::vector<bytes_view>& column_names, composite::eoc marker = composite::eoc::none) {
|
||||
if (s.is_dense()) {
|
||||
write_column_name(out, bytes_view(clustering_element));
|
||||
} else if (s.is_compound()) {
|
||||
write_compound_non_dense_column_name(out, clustering_element, column_names, marker);
|
||||
} else {
|
||||
write_column_name(out, column_names[0]);
|
||||
}
|
||||
}
|
||||
|
||||
void sstable::write_range_tombstone_bound(file_writer& out,
|
||||
const schema& s,
|
||||
const composite& clustering_element,
|
||||
const std::vector<bytes_view>& column_names,
|
||||
composite::eoc marker) {
|
||||
if (!_correctly_serialize_non_compound_range_tombstones && !clustering_element.is_compound()) {
|
||||
auto vals = clustering_element.values();
|
||||
write_compound_non_dense_column_name(out, composite::serialize_value(vals, true), column_names, marker);
|
||||
} else {
|
||||
write_column_name(out, s, clustering_element, column_names, marker);
|
||||
}
|
||||
}
|
||||
|
||||
static void output_promoted_index_entry(bytes_ostream& promoted_index,
|
||||
const bytes& first_col,
|
||||
const bytes& last_col,
|
||||
@@ -1297,29 +1407,6 @@ static void output_promoted_index_entry(bytes_ostream& promoted_index,
|
||||
promoted_index.write(q, 8);
|
||||
}
|
||||
|
||||
// FIXME: use this in write_column_name() instead of repeating the code
|
||||
static bytes serialize_colname(const composite& clustering_key,
|
||||
const std::vector<bytes_view>& column_names, composite::eoc marker) {
|
||||
auto c = composite::from_exploded(column_names, marker);
|
||||
auto ck_bview = bytes_view(clustering_key);
|
||||
// The marker is not a component, so if the last component is empty (IOW,
|
||||
// only serializes to the marker), then we just replace the key's last byte
|
||||
// with the marker. If the component however it is not empty, then the
|
||||
// marker should be in the end of it, and we just join them together as we
|
||||
// do for any normal component
|
||||
if (c.size() == 1) {
|
||||
ck_bview.remove_suffix(1);
|
||||
}
|
||||
size_t sz = ck_bview.size() + c.size();
|
||||
if (sz > std::numeric_limits<uint16_t>::max()) {
|
||||
throw std::runtime_error(sprint("Column name too large (%d > %d)", sz, std::numeric_limits<uint16_t>::max()));
|
||||
}
|
||||
bytes colname(bytes::initialized_later(), sz);
|
||||
std::copy(ck_bview.begin(), ck_bview.end(), colname.begin());
|
||||
std::copy(c.get_bytes().begin(), c.get_bytes().end(), colname.begin() + ck_bview.size());
|
||||
return colname;
|
||||
}
|
||||
|
||||
// Call maybe_flush_pi_block() before writing the given sstable atom to the
|
||||
// output. This may start a new promoted-index block depending on how much
|
||||
// data we've already written since the start of the current block. Starting
|
||||
@@ -1337,7 +1424,18 @@ void sstable::maybe_flush_pi_block(file_writer& out,
|
||||
const composite& clustering_key,
|
||||
const std::vector<bytes_view>& column_names,
|
||||
composite::eoc marker) {
|
||||
bytes colname = serialize_colname(clustering_key, column_names, marker);
|
||||
if (!_schema->clustering_key_size()) {
|
||||
return;
|
||||
}
|
||||
bytes_writer_for_column_name w;
|
||||
write_column_name(w, *_schema, clustering_key, column_names, marker);
|
||||
maybe_flush_pi_block(out, clustering_key, std::move(w).release());
|
||||
}
|
||||
|
||||
// Overload can only be called if the schema has clustering keys.
|
||||
void sstable::maybe_flush_pi_block(file_writer& out,
|
||||
const composite& clustering_key,
|
||||
bytes colname) {
|
||||
if (_pi_write.block_first_colname.empty()) {
|
||||
// This is the first column in the partition, or first column since we
|
||||
// closed a promoted-index block. Remember its name and position -
|
||||
@@ -1362,17 +1460,15 @@ void sstable::maybe_flush_pi_block(file_writer& out,
|
||||
// block includes them), but we set block_next_start_offset after - so
|
||||
// even if we wrote a lot of open tombstones, we still get a full
|
||||
// block size of new data.
|
||||
if (!clustering_key.empty()) {
|
||||
auto& rts = _pi_write.tombstone_accumulator->range_tombstones_for_row(
|
||||
clustering_key_prefix::from_range(clustering_key.values()));
|
||||
for (const auto& rt : rts) {
|
||||
auto start = composite::from_clustering_element(*_pi_write.schemap, rt.start);
|
||||
auto end = composite::from_clustering_element(*_pi_write.schemap, rt.end);
|
||||
write_range_tombstone(out,
|
||||
start, bound_kind_to_start_marker(rt.start_kind),
|
||||
end, bound_kind_to_end_marker(rt.end_kind),
|
||||
{}, rt.tomb);
|
||||
}
|
||||
auto& rts = _pi_write.tombstone_accumulator->range_tombstones_for_row(
|
||||
clustering_key_prefix::from_range(clustering_key.values()));
|
||||
for (const auto& rt : rts) {
|
||||
auto start = composite::from_clustering_element(*_pi_write.schemap, rt.start);
|
||||
auto end = composite::from_clustering_element(*_pi_write.schemap, rt.end);
|
||||
write_range_tombstone(out,
|
||||
start, bound_kind_to_start_marker(rt.start_kind),
|
||||
end, bound_kind_to_end_marker(rt.end_kind),
|
||||
{}, rt.tomb);
|
||||
}
|
||||
_pi_write.block_next_start_offset = out.offset() + _pi_write.desired_block_size;
|
||||
_pi_write.block_first_colname = colname;
|
||||
@@ -1384,37 +1480,6 @@ void sstable::maybe_flush_pi_block(file_writer& out,
|
||||
}
|
||||
}
|
||||
|
||||
void sstable::write_column_name(file_writer& out, const composite& clustering_key, const std::vector<bytes_view>& column_names, composite::eoc marker) {
|
||||
// was defined in the schema, for example.
|
||||
auto c = composite::from_exploded(column_names, marker);
|
||||
auto ck_bview = bytes_view(clustering_key);
|
||||
|
||||
// The marker is not a component, so if the last component is empty (IOW,
|
||||
// only serializes to the marker), then we just replace the key's last byte
|
||||
// with the marker. If the component however it is not empty, then the
|
||||
// marker should be in the end of it, and we just join them together as we
|
||||
// do for any normal component
|
||||
if (c.size() == 1) {
|
||||
ck_bview.remove_suffix(1);
|
||||
}
|
||||
size_t sz = ck_bview.size() + c.size();
|
||||
if (sz > std::numeric_limits<uint16_t>::max()) {
|
||||
throw std::runtime_error(sprint("Column name too large (%d > %d)", sz, std::numeric_limits<uint16_t>::max()));
|
||||
}
|
||||
uint16_t sz16 = sz;
|
||||
write(out, sz16, ck_bview, c);
|
||||
}
|
||||
|
||||
void sstable::write_column_name(file_writer& out, bytes_view column_names) {
|
||||
size_t sz = column_names.size();
|
||||
if (sz > std::numeric_limits<uint16_t>::max()) {
|
||||
throw std::runtime_error(sprint("Column name too large (%d > %d)", sz, std::numeric_limits<uint16_t>::max()));
|
||||
}
|
||||
uint16_t sz16 = sz;
|
||||
write(out, sz16, column_names);
|
||||
}
|
||||
|
||||
|
||||
static inline void update_cell_stats(column_stats& c_stats, uint64_t timestamp) {
|
||||
c_stats.update_min_timestamp(timestamp);
|
||||
c_stats.update_max_timestamp(timestamp);
|
||||
@@ -1496,13 +1561,12 @@ void sstable::write_cell(file_writer& out, atomic_cell_view cell, const column_d
|
||||
}
|
||||
}
|
||||
|
||||
void sstable::write_row_marker(file_writer& out, const row_marker& marker, const composite& clustering_key) {
|
||||
if (marker.is_missing()) {
|
||||
void sstable::maybe_write_row_marker(file_writer& out, const schema& schema, const row_marker& marker, const composite& clustering_key) {
|
||||
if (!schema.is_compound() || schema.is_dense() || marker.is_missing()) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Write row mark cell to the beginning of clustered row.
|
||||
write_column_name(out, clustering_key, { bytes_view() });
|
||||
index_and_write_column_name(out, clustering_key, { bytes_view() });
|
||||
uint64_t timestamp = marker.timestamp();
|
||||
uint32_t value_length = 0;
|
||||
|
||||
@@ -1538,21 +1602,25 @@ void sstable::write_deletion_time(file_writer& out, const tombstone t) {
|
||||
write(out, deletion_time, timestamp);
|
||||
}
|
||||
|
||||
void sstable::write_row_tombstone(file_writer& out, const composite& key, const row_tombstone t) {
|
||||
void sstable::index_tombstone(file_writer& out, const composite& key, range_tombstone&& rt, composite::eoc marker) {
|
||||
maybe_flush_pi_block(out, key, {}, marker);
|
||||
// Remember the range tombstone so when we need to open a new promoted
|
||||
// index block, we can figure out which ranges are still open and need
|
||||
// to be repeated in the data file. Note that apply() also drops ranges
|
||||
// already closed by rt.start, so the accumulator doesn't grow boundless.
|
||||
_pi_write.tombstone_accumulator->apply(std::move(rt));
|
||||
}
|
||||
|
||||
void sstable::maybe_write_row_tombstone(file_writer& out, const composite& key, const clustering_row& clustered_row) {
|
||||
auto t = clustered_row.tomb();
|
||||
if (!t) {
|
||||
return;
|
||||
}
|
||||
|
||||
auto write_tombstone = [&] (tombstone t, column_mask mask) {
|
||||
write_column_name(out, key, {}, composite::eoc::start);
|
||||
write(out, mask);
|
||||
write_column_name(out, key, {}, composite::eoc::end);
|
||||
write_deletion_time(out, t);
|
||||
};
|
||||
|
||||
write_tombstone(t.regular(), column_mask::range_tombstone);
|
||||
auto rt = range_tombstone(clustered_row.key(), bound_kind::incl_start, clustered_row.key(), bound_kind::incl_end, t.tomb());
|
||||
index_tombstone(out, key, std::move(rt), composite::eoc::none);
|
||||
write_range_tombstone(out, key, composite::eoc::start, key, composite::eoc::end, {}, t.regular());
|
||||
if (t.is_shadowable()) {
|
||||
write_tombstone(t.shadowable().tomb(), column_mask::shadowable);
|
||||
write_range_tombstone(out, key, composite::eoc::start, key, composite::eoc::end, {}, t.shadowable().tomb(), column_mask::shadowable);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1562,27 +1630,26 @@ void sstable::write_range_tombstone(file_writer& out,
|
||||
const composite& end,
|
||||
composite::eoc end_marker,
|
||||
std::vector<bytes_view> suffix,
|
||||
const tombstone t) {
|
||||
if (!t) {
|
||||
return;
|
||||
const tombstone t,
|
||||
column_mask mask) {
|
||||
if (!_schema->is_compound() && (start_marker == composite::eoc::end || end_marker == composite::eoc::start)) {
|
||||
throw std::logic_error(sprint("Cannot represent marker type in range tombstone for non-compound schemas"));
|
||||
}
|
||||
|
||||
write_column_name(out, start, suffix, start_marker);
|
||||
column_mask mask = column_mask::range_tombstone;
|
||||
write_range_tombstone_bound(out, *_schema, start, suffix, start_marker);
|
||||
write(out, mask);
|
||||
write_column_name(out, end, suffix, end_marker);
|
||||
write_range_tombstone_bound(out, *_schema, end, suffix, end_marker);
|
||||
write_deletion_time(out, t);
|
||||
}
|
||||
|
||||
void sstable::write_collection(file_writer& out, const composite& clustering_key, const column_definition& cdef, collection_mutation_view collection) {
|
||||
|
||||
auto t = static_pointer_cast<const collection_type_impl>(cdef.type);
|
||||
auto mview = t->deserialize_mutation_form(collection);
|
||||
const bytes& column_name = cdef.name();
|
||||
write_range_tombstone(out, clustering_key, clustering_key, { bytes_view(column_name) }, mview.tomb);
|
||||
if (mview.tomb) {
|
||||
write_range_tombstone(out, clustering_key, composite::eoc::start, clustering_key, composite::eoc::end, { column_name }, mview.tomb);
|
||||
}
|
||||
for (auto& cp: mview.cells) {
|
||||
maybe_flush_pi_block(out, clustering_key, { column_name, cp.first });
|
||||
write_column_name(out, clustering_key, { column_name, cp.first });
|
||||
index_and_write_column_name(out, clustering_key, { column_name, cp.first });
|
||||
write_cell(out, cp.second, cdef);
|
||||
}
|
||||
}
|
||||
@@ -1592,24 +1659,8 @@ void sstable::write_collection(file_writer& out, const composite& clustering_key
|
||||
void sstable::write_clustered_row(file_writer& out, const schema& schema, const clustering_row& clustered_row) {
|
||||
auto clustering_key = composite::from_clustering_element(schema, clustered_row.key());
|
||||
|
||||
if (schema.is_compound() && !schema.is_dense()) {
|
||||
maybe_flush_pi_block(out, clustering_key, { bytes_view() });
|
||||
write_row_marker(out, clustered_row.marker(), clustering_key);
|
||||
}
|
||||
// Before writing cells, range tombstone must be written if the row has any (deletable_row::t).
|
||||
if (clustered_row.tomb()) {
|
||||
maybe_flush_pi_block(out, clustering_key, {});
|
||||
write_row_tombstone(out, clustering_key, clustered_row.tomb());
|
||||
// Because we currently may break a partition to promoted-index blocks
|
||||
// in the middle of a clustered row, we also need to track the current
|
||||
// row's tombstone - not just range tombstones - which may effect the
|
||||
// beginning of a new block.
|
||||
// TODO: consider starting a new block only between rows, so the
|
||||
// following code can be dropped:
|
||||
_pi_write.tombstone_accumulator->apply(range_tombstone(
|
||||
clustered_row.key(), bound_kind::incl_start,
|
||||
clustered_row.key(), bound_kind::incl_end, clustered_row.tomb().tomb()));
|
||||
}
|
||||
maybe_write_row_marker(out, schema, clustered_row.marker(), clustering_key);
|
||||
maybe_write_row_tombstone(out, clustering_key, clustered_row);
|
||||
|
||||
if (schema.clustering_key_size()) {
|
||||
column_name_helper::min_max_components(schema, _collector.min_column_names(), _collector.max_column_names(),
|
||||
@@ -1627,30 +1678,14 @@ void sstable::write_clustered_row(file_writer& out, const schema& schema, const
|
||||
}
|
||||
assert(column_definition.is_regular());
|
||||
atomic_cell_view cell = c.as_atomic_cell();
|
||||
const bytes& column_name = column_definition.name();
|
||||
|
||||
if (schema.is_compound()) {
|
||||
if (schema.is_dense()) {
|
||||
maybe_flush_pi_block(out, composite(), { bytes_view(clustering_key) });
|
||||
write_column_name(out, bytes_view(clustering_key));
|
||||
} else {
|
||||
maybe_flush_pi_block(out, clustering_key, { bytes_view(column_name) });
|
||||
write_column_name(out, clustering_key, { bytes_view(column_name) });
|
||||
}
|
||||
} else {
|
||||
if (schema.is_dense()) {
|
||||
maybe_flush_pi_block(out, composite(), { bytes_view(clustered_row.key().get_component(schema, 0)) });
|
||||
write_column_name(out, bytes_view(clustered_row.key().get_component(schema, 0)));
|
||||
} else {
|
||||
maybe_flush_pi_block(out, composite(), { bytes_view(column_name) });
|
||||
write_column_name(out, bytes_view(column_name));
|
||||
}
|
||||
}
|
||||
std::vector<bytes_view> column_name = { column_definition.name() };
|
||||
index_and_write_column_name(out, clustering_key, column_name);
|
||||
write_cell(out, cell, column_definition);
|
||||
});
|
||||
}
|
||||
|
||||
void sstable::write_static_row(file_writer& out, const schema& schema, const row& static_row) {
|
||||
assert(schema.is_compound());
|
||||
static_row.for_each_cell([&] (column_id id, const atomic_cell_or_collection& c) {
|
||||
auto&& column_definition = schema.static_column_at(id);
|
||||
if (!column_definition.is_atomic()) {
|
||||
@@ -1660,20 +1695,28 @@ void sstable::write_static_row(file_writer& out, const schema& schema, const row
|
||||
}
|
||||
assert(column_definition.is_static());
|
||||
const auto& column_name = column_definition.name();
|
||||
if (schema.is_compound()) {
|
||||
auto sp = composite::static_prefix(schema);
|
||||
maybe_flush_pi_block(out, sp, { bytes_view(column_name) });
|
||||
write_column_name(out, sp, { bytes_view(column_name) });
|
||||
} else {
|
||||
assert(!schema.is_dense());
|
||||
maybe_flush_pi_block(out, composite(), { bytes_view(column_name) });
|
||||
write_column_name(out, bytes_view(column_name));
|
||||
}
|
||||
auto sp = composite::static_prefix(schema);
|
||||
index_and_write_column_name(out, sp, { bytes_view(column_name) });
|
||||
atomic_cell_view cell = c.as_atomic_cell();
|
||||
write_cell(out, cell, column_definition);
|
||||
});
|
||||
}
|
||||
|
||||
void sstable::index_and_write_column_name(file_writer& out,
|
||||
const composite& clustering_element,
|
||||
const std::vector<bytes_view>& column_names,
|
||||
composite::eoc marker) {
|
||||
if (_schema->clustering_key_size()) {
|
||||
bytes_writer_for_column_name w;
|
||||
write_column_name(w, *_schema, clustering_element, column_names, marker);
|
||||
auto&& colname = std::move(w).release();
|
||||
maybe_flush_pi_block(out, clustering_element, colname);
|
||||
write_column_name(out, colname);
|
||||
} else {
|
||||
write_column_name(out, *_schema, clustering_element, column_names, marker);
|
||||
}
|
||||
}
|
||||
|
||||
static void write_index_header(file_writer& out, disk_string_view<uint16_t>& key, uint64_t pos) {
|
||||
write(out, key, pos);
|
||||
}
|
||||
@@ -1855,6 +1898,7 @@ components_writer::components_writer(sstable& sst, const schema& s, file_writer&
|
||||
{
|
||||
_sst._components->filter = utils::i_filter::get_filter(estimated_partitions, _schema.bloom_filter_fp_chance());
|
||||
_sst._pi_write.desired_block_size = cfg.promoted_index_block_size.value_or(get_config().column_index_size_in_kb() * 1024);
|
||||
_sst._correctly_serialize_non_compound_range_tombstones = cfg.correctly_serialize_non_compound_range_tombstones;
|
||||
|
||||
prepare_summary(_sst._components->summary, estimated_partitions, _schema.min_index_interval());
|
||||
|
||||
@@ -1929,17 +1973,13 @@ stop_iteration components_writer::consume(clustering_row&& cr) {
|
||||
|
||||
stop_iteration components_writer::consume(range_tombstone&& rt) {
|
||||
ensure_tombstone_is_written();
|
||||
// Remember the range tombstone so when we need to open a new promoted
|
||||
// index block, we can figure out which ranges are still open and need
|
||||
// to be repeated in the data file. Note that apply() also drops ranges
|
||||
// already closed by rt.start, so the accumulator doesn't grow boundless.
|
||||
_sst._pi_write.tombstone_accumulator->apply(rt);
|
||||
auto start = composite::from_clustering_element(_schema, std::move(rt.start));
|
||||
auto start = composite::from_clustering_element(_schema, rt.start);
|
||||
auto start_marker = bound_kind_to_start_marker(rt.start_kind);
|
||||
auto end = composite::from_clustering_element(_schema, std::move(rt.end));
|
||||
auto end = composite::from_clustering_element(_schema, rt.end);
|
||||
auto end_marker = bound_kind_to_end_marker(rt.end_kind);
|
||||
_sst.maybe_flush_pi_block(_out, start, {}, start_marker);
|
||||
_sst.write_range_tombstone(_out, std::move(start), start_marker, std::move(end), end_marker, {}, rt.tomb);
|
||||
auto tomb = rt.tomb;
|
||||
_sst.index_tombstone(_out, start, std::move(rt), start_marker);
|
||||
_sst.write_range_tombstone(_out, std::move(start), start_marker, std::move(end), end_marker, {}, tomb);
|
||||
return stop_iteration::no;
|
||||
}
|
||||
|
||||
@@ -2018,12 +2058,13 @@ sstable::read_scylla_metadata(const io_priority_class& pc) {
|
||||
}
|
||||
|
||||
void
|
||||
sstable::write_scylla_metadata(const io_priority_class& pc, shard_id shard) {
|
||||
sstable::write_scylla_metadata(const io_priority_class& pc, shard_id shard, sstable_enabled_features features) {
|
||||
auto&& first_key = get_first_decorated_key();
|
||||
auto&& last_key = get_last_decorated_key();
|
||||
auto sm = create_sharding_metadata(_schema, first_key, last_key, shard);
|
||||
_components->scylla_metadata.emplace();
|
||||
_components->scylla_metadata->data.set<scylla_metadata_type::Sharding>(std::move(sm));
|
||||
_components->scylla_metadata->data.set<scylla_metadata_type::Features>(std::move(features));
|
||||
|
||||
write_simple<component_type::Scylla>(*_components->scylla_metadata, pc);
|
||||
}
|
||||
@@ -2075,6 +2116,7 @@ sstable_writer::sstable_writer(sstable& sst, const schema& s, uint64_t estimated
|
||||
, _backup(cfg.backup)
|
||||
, _leave_unsealed(cfg.leave_unsealed)
|
||||
, _shard(shard)
|
||||
, _correctly_serialize_non_compound_range_tombstones(cfg.correctly_serialize_non_compound_range_tombstones)
|
||||
{
|
||||
_sst.generate_toc(_schema.get_compressor_params().get_compressor(), _schema.bloom_filter_fp_chance());
|
||||
_sst.write_toc(_pc);
|
||||
@@ -2084,6 +2126,10 @@ sstable_writer::sstable_writer(sstable& sst, const schema& s, uint64_t estimated
|
||||
_components_writer.emplace(_sst, _schema, *_writer, estimated_partitions, cfg, _pc);
|
||||
}
|
||||
|
||||
// Builds a feature set in which every known sstable_feature is enabled,
// i.e. all bits in [0, sstable_feature::End) are set.
static sstable_enabled_features all_features() {
    // Shift a 64-bit one rather than an int: enabled_features is a uint64_t,
    // and an int shift would overflow once End reaches 31.
    return sstable_enabled_features{(uint64_t(1) << sstable_feature::End) - 1};
}
|
||||
|
||||
void sstable_writer::consume_end_of_stream()
|
||||
{
|
||||
_components_writer->consume_end_of_stream();
|
||||
@@ -2093,7 +2139,11 @@ void sstable_writer::consume_end_of_stream()
|
||||
_sst.write_filter(_pc);
|
||||
_sst.write_statistics(_pc);
|
||||
_sst.write_compression(_pc);
|
||||
_sst.write_scylla_metadata(_pc, _shard);
|
||||
auto features = all_features();
|
||||
if (!_correctly_serialize_non_compound_range_tombstones) {
|
||||
features.disable(sstable_feature::NonCompoundRangeTombstones);
|
||||
}
|
||||
_sst.write_scylla_metadata(_pc, _shard, std::move(features));
|
||||
|
||||
if (!_leave_unsealed) {
|
||||
_sst.seal_sstable(_backup).get();
|
||||
@@ -2169,7 +2219,8 @@ future<> sstable::generate_summary(const io_priority_class& pc) {
|
||||
options.io_priority_class = pc;
|
||||
auto stream = make_file_input_stream(index_file, 0, size, std::move(options));
|
||||
return do_with(summary_generator(_components->summary), [this, &pc, stream = std::move(stream), size] (summary_generator& s) mutable {
|
||||
auto ctx = make_lw_shared<index_consume_entry_context<summary_generator>>(s, std::move(stream), 0, size);
|
||||
auto ctx = make_lw_shared<index_consume_entry_context<summary_generator>>(
|
||||
s, trust_promoted_index::yes, std::move(stream), 0, size);
|
||||
return ctx->consume_input(*ctx).finally([ctx] {
|
||||
return ctx->close();
|
||||
}).then([this, ctx, &s] {
|
||||
@@ -2872,5 +2923,8 @@ mutation_source sstable::as_mutation_source() {
|
||||
});
|
||||
}
|
||||
|
||||
// Returns whatever the local storage service reports from
// cluster_supports_reading_correctly_serialized_range_tombstones().
// Used as the default value of
// sstable_writer_config::correctly_serialize_non_compound_range_tombstones.
bool supports_correct_non_compound_range_tombstones() {
|
||||
return service::get_local_storage_service().cluster_supports_reading_correctly_serialized_range_tombstones();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -130,6 +130,8 @@ struct sstable_open_info;
|
||||
|
||||
class index_reader;
|
||||
|
||||
bool supports_correct_non_compound_range_tombstones();
|
||||
|
||||
struct sstable_writer_config {
|
||||
std::experimental::optional<size_t> promoted_index_block_size;
|
||||
uint64_t max_sstable_size = std::numeric_limits<uint64_t>::max();
|
||||
@@ -137,6 +139,7 @@ struct sstable_writer_config {
|
||||
bool leave_unsealed = false;
|
||||
stdx::optional<db::replay_position> replay_position;
|
||||
seastar::thread_scheduling_group* thread_scheduling_group = nullptr;
|
||||
bool correctly_serialize_non_compound_range_tombstones = supports_correct_non_compound_range_tombstones();
|
||||
};
|
||||
|
||||
class sstable : public enable_lw_shared_from_this<sstable> {
|
||||
@@ -479,6 +482,10 @@ private:
|
||||
lw_shared_ptr<file_input_stream_history> _single_partition_history = make_lw_shared<file_input_stream_history>();
|
||||
lw_shared_ptr<file_input_stream_history> _partition_range_history = make_lw_shared<file_input_stream_history>();
|
||||
|
||||
//FIXME: Set by sstable_writer to influence sstable writing behavior.
|
||||
// Remove when doing #3012
|
||||
bool _correctly_serialize_non_compound_range_tombstones;
|
||||
|
||||
// _pi_write is used temporarily for building the promoted
|
||||
// index (column sample) of one partition when writing a new sstable.
|
||||
struct {
|
||||
@@ -501,6 +508,10 @@ private:
|
||||
const std::vector<bytes_view>& column_names,
|
||||
composite::eoc marker = composite::eoc::none);
|
||||
|
||||
void maybe_flush_pi_block(file_writer& out,
|
||||
const composite& clustering_key,
|
||||
bytes colname);
|
||||
|
||||
schema_ptr _schema;
|
||||
sstring _dir;
|
||||
unsigned long _generation = 0;
|
||||
@@ -534,7 +545,7 @@ private:
|
||||
void write_compression(const io_priority_class& pc);
|
||||
|
||||
future<> read_scylla_metadata(const io_priority_class& pc);
|
||||
void write_scylla_metadata(const io_priority_class& pc, shard_id shard = engine().cpu_id());
|
||||
void write_scylla_metadata(const io_priority_class& pc, shard_id shard, sstable_enabled_features features);
|
||||
|
||||
future<> read_filter(const io_priority_class& pc);
|
||||
|
||||
@@ -598,20 +609,23 @@ private:
|
||||
bool filter_has_key(const schema& s, const dht::decorated_key& dk) { return filter_has_key(key::from_partition_key(s, dk._key)); }
|
||||
|
||||
// NOTE: functions used to generate sstable components.
|
||||
void write_row_marker(file_writer& out, const row_marker& marker, const composite& clustering_key);
|
||||
void maybe_write_row_marker(file_writer& out, const schema& schema, const row_marker& marker, const composite& clustering_key);
|
||||
void write_clustered_row(file_writer& out, const schema& schema, const clustering_row& clustered_row);
|
||||
void write_static_row(file_writer& out, const schema& schema, const row& static_row);
|
||||
void write_cell(file_writer& out, atomic_cell_view cell, const column_definition& cdef);
|
||||
void write_column_name(file_writer& out, const composite& clustering_key, const std::vector<bytes_view>& column_names, composite::eoc marker = composite::eoc::none);
|
||||
void write_column_name(file_writer& out, bytes_view column_names);
|
||||
void write_range_tombstone(file_writer& out, const composite& start, composite::eoc start_marker, const composite& end, composite::eoc end_marker, std::vector<bytes_view> suffix, const tombstone t);
|
||||
void write_range_tombstone(file_writer& out, const composite& start, const composite& end, std::vector<bytes_view> suffix, const tombstone t) {
|
||||
write_range_tombstone(out, start, composite::eoc::start, end, composite::eoc::end, std::move(suffix), std::move(t));
|
||||
}
|
||||
void write_range_tombstone(file_writer& out, const composite& start, composite::eoc start_marker, const composite& end, composite::eoc end_marker,
|
||||
std::vector<bytes_view> suffix, const tombstone t, const column_mask = column_mask::range_tombstone);
|
||||
void write_range_tombstone_bound(file_writer& out, const schema& s, const composite& clustering_element, const std::vector<bytes_view>& column_names, composite::eoc marker = composite::eoc::none);
|
||||
void index_tombstone(file_writer& out, const composite& key, range_tombstone&& rt, composite::eoc marker);
|
||||
void write_collection(file_writer& out, const composite& clustering_key, const column_definition& cdef, collection_mutation_view collection);
|
||||
void write_row_tombstone(file_writer& out, const composite& key, const row_tombstone t);
|
||||
void maybe_write_row_tombstone(file_writer& out, const composite& key, const clustering_row& clustered_row);
|
||||
void write_deletion_time(file_writer& out, const tombstone t);
|
||||
|
||||
void index_and_write_column_name(file_writer& out,
|
||||
const composite& clustering,
|
||||
const std::vector<bytes_view>& column_names,
|
||||
composite::eoc marker = composite::eoc::none);
|
||||
|
||||
stdx::optional<std::pair<uint64_t, uint64_t>> get_sample_indexes_for_range(const dht::token_range& range);
|
||||
public:
|
||||
std::unique_ptr<index_reader> get_index_reader(const io_priority_class& pc);
|
||||
@@ -622,6 +636,14 @@ public:
|
||||
return has_component(component_type::Scylla);
|
||||
}
|
||||
|
||||
// True when any of the following holds: the schema is compound, the sstable
// has no Scylla component, or its Scylla metadata has the
// NonCompoundPIEntries feature enabled.
// The checks short-circuit, so scylla_metadata is only dereferenced when the
// Scylla component is present.
bool has_correct_promoted_index_entries() const {
|
||||
return _schema->is_compound() || !has_scylla_component() || _components->scylla_metadata->has_feature(sstable_feature::NonCompoundPIEntries);
|
||||
}
|
||||
|
||||
// True when any of the following holds: the schema is compound, the sstable
// has no Scylla component, or its Scylla metadata has the
// NonCompoundRangeTombstones feature enabled.
// The checks short-circuit, so scylla_metadata is only dereferenced when the
// Scylla component is present.
bool has_correct_non_compound_range_tombstones() const {
|
||||
return _schema->is_compound() || !has_scylla_component() || _components->scylla_metadata->has_feature(sstable_feature::NonCompoundRangeTombstones);
|
||||
}
|
||||
|
||||
bool filter_has_key(const key& key) {
|
||||
return _components->filter->is_present(bytes_view(key));
|
||||
}
|
||||
@@ -724,6 +746,11 @@ public:
|
||||
using shared_sstable = lw_shared_ptr<sstable>;
|
||||
using sstable_list = std::unordered_set<shared_sstable>;
|
||||
|
||||
shared_sstable make_sstable(schema_ptr schema, sstring dir, int64_t generation, sstable::version_types v, sstable::format_types f, gc_clock::time_point now = gc_clock::now(),
|
||||
io_error_handler_gen error_handler_gen = default_io_error_handler_gen(), size_t buffer_size = 128*1024);
|
||||
|
||||
|
||||
|
||||
struct entry_descriptor {
|
||||
sstring ks;
|
||||
sstring cf;
|
||||
@@ -819,6 +846,7 @@ class sstable_writer {
|
||||
std::unique_ptr<file_writer> _writer;
|
||||
stdx::optional<components_writer> _components_writer;
|
||||
shard_id _shard; // Specifies which shard new sstable will belong to.
|
||||
bool _correctly_serialize_non_compound_range_tombstones;
|
||||
private:
|
||||
void prepare_file_writer();
|
||||
void finish_file_writer();
|
||||
@@ -828,7 +856,8 @@ public:
|
||||
~sstable_writer();
|
||||
sstable_writer(sstable_writer&& o) : _sst(o._sst), _schema(o._schema), _pc(o._pc), _backup(o._backup),
|
||||
_leave_unsealed(o._leave_unsealed), _compression_enabled(o._compression_enabled), _writer(std::move(o._writer)),
|
||||
_components_writer(std::move(o._components_writer)), _shard(o._shard) {}
|
||||
_components_writer(std::move(o._components_writer)), _shard(o._shard),
|
||||
_correctly_serialize_non_compound_range_tombstones(o._correctly_serialize_non_compound_range_tombstones) { }
|
||||
void consume_new_partition(const dht::decorated_key& dk) { return _components_writer->consume_new_partition(dk); }
|
||||
void consume(tombstone t) { _components_writer->consume(t); }
|
||||
stop_iteration consume(static_row&& sr) { return _components_writer->consume(std::move(sr)); }
|
||||
|
||||
@@ -358,6 +358,28 @@ struct sharding_metadata {
|
||||
auto describe_type(Describer f) { return f(token_ranges); }
|
||||
};
|
||||
|
||||
// Scylla-specific list of features an sstable supports.
// Each enumerator is the bit position of that feature inside
// sstable_enabled_features::enabled_features; End is one past the last
// defined feature.
enum sstable_feature : uint8_t {
    NonCompoundPIEntries = 0,       // See #2993
    NonCompoundRangeTombstones = 1, // See #2986
    End = 2
};

// Scylla-specific features enabled for a particular sstable.
// The set is a plain 64-bit mask indexed by sstable_feature.
struct sstable_enabled_features {
    uint64_t enabled_features;

    // Returns true if the bit for feature f is set.
    bool is_enabled(sstable_feature f) const {
        // Build the mask in 64 bits: an int-typed `1 << f` sign-extends for
        // bit 31 and is undefined for higher bit positions.
        return (enabled_features & (uint64_t(1) << f)) != 0;
    }

    // Clears the bit for feature f, leaving all other bits untouched.
    void disable(sstable_feature f) {
        enabled_features &= ~(uint64_t(1) << f);
    }

    // Exposes the mask to a describer callable and returns its result.
    template <typename Describer>
    auto describe_type(Describer f) { return f(enabled_features); }
};
|
||||
|
||||
// Numbers are found on disk, so they do matter. Also, setting their sizes to
|
||||
// that of a uint32_t is a bit wasteful, but it simplifies the code a lot
|
||||
@@ -369,16 +391,22 @@ enum class metadata_type : uint32_t {
|
||||
Stats = 2,
|
||||
};
|
||||
|
||||
|
||||
// Tags identifying the members of the tagged-union set held in
// scylla_metadata::data (Sharding -> sharding_metadata,
// Features -> sstable_enabled_features).
// NOTE(review): these values are presumably persisted in the Scylla component,
// so existing numbers should not be changed -- confirm.
enum class scylla_metadata_type : uint32_t {
|
||||
Sharding = 1,
|
||||
Features = 2,
|
||||
};
|
||||
|
||||
struct scylla_metadata {
|
||||
disk_set_of_tagged_union<scylla_metadata_type,
|
||||
disk_tagged_union_member<scylla_metadata_type, scylla_metadata_type::Sharding, sharding_metadata>
|
||||
disk_tagged_union_member<scylla_metadata_type, scylla_metadata_type::Sharding, sharding_metadata>,
|
||||
disk_tagged_union_member<scylla_metadata_type, scylla_metadata_type::Features, sstable_enabled_features>
|
||||
> data;
|
||||
|
||||
bool has_feature(sstable_feature f) const {
|
||||
auto features = data.get<scylla_metadata_type::Features, sstable_enabled_features>();
|
||||
return features && features->is_enabled(f);
|
||||
}
|
||||
|
||||
template <typename Describer>
|
||||
auto describe_type(Describer f) { return f(data); }
|
||||
};
|
||||
|
||||
@@ -100,7 +100,7 @@ void stream_coordinator::connect_all_stream_sessions() {
|
||||
for (auto& x : _peer_sessions) {
|
||||
auto& session = x.second;
|
||||
session->start();
|
||||
sslog.info("[Stream #{}] Beginning stream session with {}", session->plan_id(), session->peer);
|
||||
sslog.debug("[Stream #{}] Beginning stream session with {}", session->plan_id(), session->peer);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -291,4 +291,15 @@ void stream_manager::on_restart(inet_address endpoint, endpoint_state ep_state)
|
||||
}
|
||||
}
|
||||
|
||||
// Endpoint-state callback invoked when `endpoint` is reported dead.
// If we have stream sessions with that peer and its state says it shut down,
// fail those sessions on every stream_manager instance. A failure to do so
// is only logged.
void stream_manager::on_dead(inet_address endpoint, endpoint_state ep_state) {
    // Nothing to do for peers we are not streaming with, or for peers whose
    // state does not report a shutdown.
    if (!has_peer(endpoint) || !ep_state.is_shutdown()) {
        return;
    }

    sslog.info("stream_manager: Close all stream_session with peer = {} in on_dead", endpoint);

    get_stream_manager().invoke_on_all([endpoint] (auto& manager) {
        manager.fail_sessions(endpoint);
    }).handle_exception([endpoint] (auto eptr) {
        sslog.warn("stream_manager: Fail to close sessions peer = {} in on_dead", endpoint);
    });
}
|
||||
|
||||
} // namespace streaming
|
||||
|
||||
@@ -156,7 +156,7 @@ public:
|
||||
virtual void before_change(inet_address endpoint, endpoint_state current_state, application_state new_state_key, const versioned_value& new_value) override {}
|
||||
virtual void on_change(inet_address endpoint, application_state state, const versioned_value& value) override {}
|
||||
virtual void on_alive(inet_address endpoint, endpoint_state state) override {}
|
||||
virtual void on_dead(inet_address endpoint, endpoint_state state) override {}
|
||||
virtual void on_dead(inet_address endpoint, endpoint_state state) override;
|
||||
virtual void on_remove(inet_address endpoint) override;
|
||||
virtual void on_restart(inet_address endpoint, endpoint_state ep_state) override;
|
||||
|
||||
|
||||
@@ -54,7 +54,7 @@ future<stream_state> stream_result_future::init_sending_side(UUID plan_id_, sstr
|
||||
sr->add_event_listener(listener);
|
||||
}
|
||||
|
||||
sslog.info("[Stream #{}] Executing streaming plan for {}", plan_id_, description_);
|
||||
sslog.info("[Stream #{}] Executing streaming plan for {} with peers={}, master", plan_id_, description_, coordinator_->get_peers());
|
||||
|
||||
// Initialize and start all sessions
|
||||
for (auto& session : coordinator_->get_all_stream_sessions()) {
|
||||
@@ -74,7 +74,7 @@ shared_ptr<stream_result_future> stream_result_future::init_receiving_side(UUID
|
||||
sslog.warn(err.c_str());
|
||||
throw std::runtime_error(err);
|
||||
}
|
||||
sslog.info("[Stream #{}] Creating new streaming plan for {}, with {}", plan_id, description, from);
|
||||
sslog.info("[Stream #{}] Executing streaming plan for {} with peers={}, slave", plan_id, description, from);
|
||||
bool is_receiving = true;
|
||||
sr = make_shared<stream_result_future>(plan_id, description, is_receiving);
|
||||
sm.register_receiving(sr);
|
||||
@@ -83,7 +83,7 @@ shared_ptr<stream_result_future> stream_result_future::init_receiving_side(UUID
|
||||
|
||||
void stream_result_future::handle_session_prepared(shared_ptr<stream_session> session) {
|
||||
auto si = session->make_session_info();
|
||||
sslog.info("[Stream #{}] Prepare completed with {}. Receiving {}, sending {}",
|
||||
sslog.debug("[Stream #{}] Prepare completed with {}. Receiving {}, sending {}",
|
||||
session->plan_id(),
|
||||
session->peer,
|
||||
si.get_total_files_to_receive(),
|
||||
@@ -94,7 +94,7 @@ void stream_result_future::handle_session_prepared(shared_ptr<stream_session> se
|
||||
}
|
||||
|
||||
void stream_result_future::handle_session_complete(shared_ptr<stream_session> session) {
|
||||
sslog.info("[Stream #{}] Session with {} is complete, state={}", session->plan_id(), session->peer, session->get_state());
|
||||
sslog.debug("[Stream #{}] Session with {} is complete, state={}", session->plan_id(), session->peer, session->get_state());
|
||||
auto event = session_complete_event(session);
|
||||
fire_stream_event(std::move(event));
|
||||
auto si = session->make_session_info();
|
||||
@@ -120,25 +120,25 @@ void stream_result_future::maybe_complete() {
|
||||
sm.show_streams();
|
||||
}
|
||||
auto duration = std::chrono::duration_cast<std::chrono::duration<float>>(lowres_clock::now() - _start_time).count();
|
||||
sm.get_progress_on_all_shards(plan_id).then([plan_id, duration] (auto sbytes) {
|
||||
auto tx_bw = sstring("+inf");
|
||||
auto rx_bw = sstring("+inf");
|
||||
auto stats = make_lw_shared<sstring>("");
|
||||
sm.get_progress_on_all_shards(plan_id).then([plan_id, duration, stats] (auto sbytes) {
|
||||
auto tx_bw = sstring("0");
|
||||
auto rx_bw = sstring("0");
|
||||
if (std::fabs(duration) > FLT_EPSILON) {
|
||||
tx_bw = sprint("%.3f", sbytes.bytes_sent / duration / (1024 * 1024));
|
||||
rx_bw = sprint("%.3f", sbytes.bytes_received / duration / (1024 * 1024));
|
||||
tx_bw = sprint("%.2f", sbytes.bytes_sent / duration / 1024);
|
||||
rx_bw = sprint("%.2f", sbytes.bytes_received / duration / 1024);
|
||||
}
|
||||
sslog.info("[Stream #{}] bytes_sent = {}, bytes_received = {}, tx_bandwidth = {} MiB/s, rx_bandwidth = {} MiB/s",
|
||||
plan_id, sbytes.bytes_sent, sbytes.bytes_received, tx_bw, rx_bw);
|
||||
*stats = sprint("tx=%ld KiB, %s KiB/s, rx=%ld KiB, %s KiB/s", sbytes.bytes_sent / 1024, tx_bw, sbytes.bytes_received / 1024, rx_bw);
|
||||
}).handle_exception([plan_id] (auto ep) {
|
||||
sslog.warn("[Stream #{}] Fail to get progess on all shards: {}", plan_id, ep);
|
||||
}).finally([this, plan_id, &sm] {
|
||||
}).finally([this, plan_id, stats, &sm] () {
|
||||
sm.remove_stream(plan_id);
|
||||
auto final_state = get_current_state();
|
||||
if (final_state.has_failed_session()) {
|
||||
sslog.warn("[Stream #{}] Stream failed for streaming plan {}, peers={}", plan_id, description, _coordinator->get_peers());
|
||||
sslog.warn("[Stream #{}] Streaming plan for {} failed, peers={}, {}", plan_id, description, _coordinator->get_peers(), *stats);
|
||||
_done.set_exception(stream_exception(final_state, "Stream failed"));
|
||||
} else {
|
||||
sslog.info("[Stream #{}] All sessions completed for streaming plan {}, peers={}", plan_id, description, _coordinator->get_peers());
|
||||
sslog.info("[Stream #{}] Streaming plan for {} succeeded, peers={}, {}", plan_id, description, _coordinator->get_peers(), *stats);
|
||||
_done.set_value(final_state);
|
||||
}
|
||||
});
|
||||
|
||||
@@ -176,11 +176,20 @@ void stream_session::init_messaging_service_handler() {
|
||||
});
|
||||
});
|
||||
});
|
||||
ms().register_complete_message([] (const rpc::client_info& cinfo, UUID plan_id, unsigned dst_cpu_id) {
|
||||
ms().register_complete_message([] (const rpc::client_info& cinfo, UUID plan_id, unsigned dst_cpu_id, rpc::optional<bool> failed) {
|
||||
const auto& from = cinfo.retrieve_auxiliary<gms::inet_address>("baddr");
|
||||
// Be compatible with old version. Do nothing but return a ready future.
|
||||
sslog.debug("[Stream #{}] COMPLETE_MESSAGE from {} dst_cpu_id={}", plan_id, from, dst_cpu_id);
|
||||
return make_ready_future<>();
|
||||
if (failed && *failed) {
|
||||
return smp::submit_to(dst_cpu_id, [plan_id, from, dst_cpu_id] () {
|
||||
auto session = get_session(plan_id, from, "COMPLETE_MESSAGE");
|
||||
sslog.debug("[Stream #{}] COMPLETE_MESSAGE with error flag from {} dst_cpu_id={}", plan_id, from, dst_cpu_id);
|
||||
session->received_failed_complete_message();
|
||||
return make_ready_future<>();
|
||||
});
|
||||
} else {
|
||||
// Be compatible with old version. Do nothing but return a ready future.
|
||||
sslog.debug("[Stream #{}] COMPLETE_MESSAGE from {} dst_cpu_id={}", plan_id, from, dst_cpu_id);
|
||||
return make_ready_future<>();
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
@@ -227,7 +236,9 @@ future<> stream_session::on_initialization_complete() {
|
||||
for (auto& summary : msg.summaries) {
|
||||
this->prepare_receiving(summary);
|
||||
}
|
||||
_stream_result->handle_session_prepared(this->shared_from_this());
|
||||
if (_stream_result) {
|
||||
_stream_result->handle_session_prepared(this->shared_from_this());
|
||||
}
|
||||
} catch (...) {
|
||||
sslog.warn("[Stream #{}] Fail to send PREPARE_MESSAGE to {}, {}", this->plan_id(), id, std::current_exception());
|
||||
throw;
|
||||
@@ -248,9 +259,19 @@ future<> stream_session::on_initialization_complete() {
|
||||
});
|
||||
}
|
||||
|
||||
// Handles a COMPLETE_MESSAGE from the peer that carries the "failed" flag.
// Logs the event, records that the peer already reported the failure and
// closes this session as FAILED.
void stream_session::received_failed_complete_message() {
|
||||
sslog.info("[Stream #{}] Received failed complete message, peer={}", plan_id(), peer);
|
||||
// Set before close_session(): send_failed_complete_message() checks this
// flag and skips sending a failure back to the peer.
_received_failed_complete_message = true;
|
||||
close_session(stream_session_state::FAILED);
|
||||
}
|
||||
|
||||
// Aborts this session: logs the session pointer, the peer and whether the
// session was initialized, then closes the session as FAILED.
void stream_session::abort() {
|
||||
sslog.info("[Stream #{}] Aborted stream session={}, peer={}, is_initialized={}", plan_id(), this, peer, is_initialized());
|
||||
close_session(stream_session_state::FAILED);
|
||||
}
|
||||
|
||||
void stream_session::on_error() {
|
||||
sslog.warn("[Stream #{}] Streaming error occurred", plan_id());
|
||||
// fail session
|
||||
sslog.warn("[Stream #{}] Streaming error occurred, peer={}", plan_id(), peer);
|
||||
close_session(stream_session_state::FAILED);
|
||||
}
|
||||
|
||||
@@ -300,7 +321,9 @@ future<prepare_message> stream_session::prepare(std::vector<stream_request> requ
|
||||
}
|
||||
}
|
||||
prepare.dst_cpu_id = engine().cpu_id();;
|
||||
_stream_result->handle_session_prepared(shared_from_this());
|
||||
if (_stream_result) {
|
||||
_stream_result->handle_session_prepared(shared_from_this());
|
||||
}
|
||||
return make_ready_future<prepare_message>(std::move(prepare));
|
||||
}
|
||||
|
||||
@@ -309,10 +332,6 @@ void stream_session::follower_start_sent() {
|
||||
this->start_streaming_files();
|
||||
}
|
||||
|
||||
void stream_session::session_failed() {
|
||||
close_session(stream_session_state::FAILED);
|
||||
}
|
||||
|
||||
session_info stream_session::make_session_info() {
|
||||
std::vector<stream_summary> receiving_summaries;
|
||||
for (auto& receiver : _receivers) {
|
||||
@@ -339,28 +358,41 @@ void stream_session::transfer_task_completed(UUID cf_id) {
|
||||
maybe_completed();
|
||||
}
|
||||
|
||||
void stream_session::send_complete_message() {
|
||||
// Marks every outgoing transfer task as finished at once: drops all entries
// from _transfers, logs the remaining task counts and then lets
// maybe_completed() decide whether the whole session is complete.
void stream_session::transfer_task_completed_all() {
|
||||
_transfers.clear();
|
||||
// _transfers was just cleared, so the stream_transfer_task.size value logged
// here is always 0.
sslog.debug("[Stream #{}] transfer task_completed: all done, stream_receive_task.size={} stream_transfer_task.size={}",
|
||||
plan_id(), _receivers.size(), _transfers.size());
|
||||
maybe_completed();
|
||||
}
|
||||
|
||||
void stream_session::send_failed_complete_message() {
|
||||
if (!is_initialized()) {
|
||||
return;
|
||||
}
|
||||
auto plan_id = this->plan_id();
|
||||
if (_received_failed_complete_message) {
|
||||
sslog.debug("[Stream #{}] Skip sending failed message back to peer", plan_id);
|
||||
return;
|
||||
}
|
||||
if (!_complete_sent) {
|
||||
_complete_sent = true;
|
||||
} else {
|
||||
return;
|
||||
}
|
||||
auto id = msg_addr{this->peer, this->dst_cpu_id};
|
||||
auto plan_id = this->plan_id();
|
||||
sslog.debug("[Stream #{}] SEND COMPLETE_MESSAGE to {}", plan_id, id);
|
||||
auto session = shared_from_this();
|
||||
this->ms().send_complete_message(id, plan_id, this->dst_cpu_id).then([session, id, plan_id] {
|
||||
bool failed = true;
|
||||
this->ms().send_complete_message(id, plan_id, this->dst_cpu_id, failed).then([session, id, plan_id] {
|
||||
sslog.debug("[Stream #{}] GOT COMPLETE_MESSAGE Reply from {}", plan_id, id.addr);
|
||||
}).handle_exception([session, id, plan_id] (auto ep) {
|
||||
sslog.warn("[Stream #{}] COMPLETE_MESSAGE for {} has failed: {}", plan_id, id.addr, ep);
|
||||
session->on_error();
|
||||
sslog.debug("[Stream #{}] COMPLETE_MESSAGE for {} has failed: {}", plan_id, id.addr, ep);
|
||||
});
|
||||
}
|
||||
|
||||
bool stream_session::maybe_completed() {
|
||||
bool completed = _receivers.empty() && _transfers.empty();
|
||||
if (completed) {
|
||||
send_complete_message();
|
||||
sslog.debug("[Stream #{}] maybe_completed: {} -> COMPLETE: session={}, peer={}", plan_id(), _state, this, peer);
|
||||
close_session(stream_session_state::COMPLETE);
|
||||
}
|
||||
@@ -379,11 +411,15 @@ void stream_session::start_streaming_files() {
|
||||
if (!_transfers.empty()) {
|
||||
set_state(stream_session_state::STREAMING);
|
||||
}
|
||||
for (auto it = _transfers.begin(); it != _transfers.end();) {
|
||||
stream_transfer_task& task = it->second;
|
||||
it++;
|
||||
task.start();
|
||||
}
|
||||
do_for_each(_transfers.begin(), _transfers.end(), [this] (auto& item) {
|
||||
sslog.debug("[Stream #{}] Start to send cf_id={}", this->plan_id(), item.first);
|
||||
return item.second.execute();
|
||||
}).then([this] {
|
||||
this->transfer_task_completed_all();
|
||||
}).handle_exception([this] (auto ep) {
|
||||
sslog.warn("[Stream #{}] Failed to send: {}", this->plan_id(), ep);
|
||||
this->on_error();
|
||||
});
|
||||
}
|
||||
|
||||
std::vector<column_family*> stream_session::get_column_family_stores(const sstring& keyspace, const std::vector<sstring>& column_families) {
|
||||
@@ -460,12 +496,15 @@ void stream_session::close_session(stream_session_state final_state) {
|
||||
receiving_failed(x.first);
|
||||
task.abort();
|
||||
}
|
||||
send_failed_complete_message();
|
||||
}
|
||||
|
||||
// Note that we shouldn't block on this close because this method is called on the handler
|
||||
// incoming thread (so we would deadlock).
|
||||
//handler.close();
|
||||
_stream_result->handle_session_complete(shared_from_this());
|
||||
if (_stream_result) {
|
||||
_stream_result->handle_session_complete(shared_from_this());
|
||||
}
|
||||
|
||||
sslog.debug("[Stream #{}] close_session session={}, state={}, cancel keep_alive timer", plan_id(), this, final_state);
|
||||
_keep_alive.cancel();
|
||||
@@ -480,15 +519,19 @@ void stream_session::start() {
|
||||
}
|
||||
auto connecting = netw::get_local_messaging_service().get_preferred_ip(peer);
|
||||
if (peer == connecting) {
|
||||
sslog.info("[Stream #{}] Starting streaming to {}", plan_id(), peer);
|
||||
sslog.debug("[Stream #{}] Starting streaming to {}", plan_id(), peer);
|
||||
} else {
|
||||
sslog.info("[Stream #{}] Starting streaming to {} through {}", plan_id(), peer, connecting);
|
||||
sslog.debug("[Stream #{}] Starting streaming to {} through {}", plan_id(), peer, connecting);
|
||||
}
|
||||
on_initialization_complete().handle_exception([this] (auto ep) {
|
||||
this->on_error();
|
||||
});
|
||||
}
|
||||
|
||||
// Returns true when _stream_result is set (non-null), which is what init()
// assigns.
bool stream_session::is_initialized() const {
|
||||
return bool(_stream_result);
|
||||
}
|
||||
|
||||
void stream_session::init(shared_ptr<stream_result_future> stream_result_) {
|
||||
_stream_result = stream_result_;
|
||||
_keep_alive.set_callback([this] {
|
||||
|
||||
@@ -151,7 +151,7 @@ public:
|
||||
* Each {@code StreamSession} is identified by this InetAddress which is broadcast address of the node streaming.
|
||||
*/
|
||||
inet_address peer;
|
||||
unsigned dst_cpu_id;
|
||||
unsigned dst_cpu_id = 0;
|
||||
private:
|
||||
// should not be null when session is started
|
||||
shared_ptr<stream_result_future> _stream_result;
|
||||
@@ -174,11 +174,12 @@ private:
|
||||
|
||||
stream_session_state _state = stream_session_state::INITIALIZED;
|
||||
bool _complete_sent = false;
|
||||
bool _received_failed_complete_message = false;
|
||||
|
||||
// If the session is idle for 300 minutes, close the session
|
||||
std::chrono::seconds _keep_alive_timeout{60 * 300};
|
||||
// Check every 10 minutes
|
||||
std::chrono::seconds _keep_alive_interval{60 * 10};
|
||||
// If the session is idle for 10 minutes, close the session
|
||||
std::chrono::seconds _keep_alive_timeout{60 * 10};
|
||||
// Check every 1 minute
|
||||
std::chrono::seconds _keep_alive_interval{60};
|
||||
timer<lowres_clock> _keep_alive;
|
||||
stream_bytes _last_stream_bytes;
|
||||
lowres_clock::time_point _last_stream_progress;
|
||||
@@ -231,6 +232,8 @@ public:
|
||||
|
||||
void start();
|
||||
|
||||
bool is_initialized() const;
|
||||
|
||||
/**
|
||||
* Request data fetch task to this session.
|
||||
*
|
||||
@@ -299,6 +302,10 @@ public:
|
||||
*/
|
||||
void on_error();
|
||||
|
||||
void abort();
|
||||
|
||||
void received_failed_complete_message();
|
||||
|
||||
/**
|
||||
* Prepare this session for sending/receiving files.
|
||||
*/
|
||||
@@ -311,11 +318,6 @@ public:
|
||||
*/
|
||||
void complete();
|
||||
|
||||
/**
|
||||
* Call back on receiving {@code StreamMessage.Type.SESSION_FAILED} message.
|
||||
*/
|
||||
void session_failed();
|
||||
|
||||
/**
|
||||
* @return Current snapshot of this session info.
|
||||
*/
|
||||
@@ -333,8 +335,9 @@ public:
|
||||
|
||||
void receive_task_completed(UUID cf_id);
|
||||
void transfer_task_completed(UUID cf_id);
|
||||
void transfer_task_completed_all();
|
||||
private:
|
||||
void send_complete_message();
|
||||
void send_failed_complete_message();
|
||||
bool maybe_completed();
|
||||
void prepare_receiving(stream_summary& summary);
|
||||
void start_streaming_files();
|
||||
|
||||
@@ -134,7 +134,7 @@ future<> send_mutations(lw_shared_ptr<send_info> si) {
|
||||
});
|
||||
}
|
||||
|
||||
void stream_transfer_task::start() {
|
||||
future<> stream_transfer_task::execute() {
|
||||
auto plan_id = session->plan_id();
|
||||
auto cf_id = this->cf_id;
|
||||
auto dst_cpu_id = session->dst_cpu_id;
|
||||
@@ -143,7 +143,7 @@ void stream_transfer_task::start() {
|
||||
sslog.debug("[Stream #{}] stream_transfer_task: cf_id={}", plan_id, cf_id);
|
||||
sort_and_merge_ranges();
|
||||
_shard_ranges = dht::split_ranges_to_shards(_ranges, *schema);
|
||||
parallel_for_each(_shard_ranges, [this, dst_cpu_id, plan_id, cf_id, id] (auto& item) {
|
||||
return parallel_for_each(_shard_ranges, [this, dst_cpu_id, plan_id, cf_id, id] (auto& item) {
|
||||
auto& shard = item.first;
|
||||
auto& prs = item.second;
|
||||
return session->get_db().invoke_on(shard, [plan_id, cf_id, id, dst_cpu_id, prs = std::move(prs)] (database& db) mutable {
|
||||
@@ -160,10 +160,9 @@ void stream_transfer_task::start() {
|
||||
}).then([this, id, plan_id, cf_id] {
|
||||
sslog.debug("[Stream #{}] GOT STREAM_MUTATION_DONE Reply from {}", plan_id, id.addr);
|
||||
session->start_keep_alive_timer();
|
||||
session->transfer_task_completed(cf_id);
|
||||
}).handle_exception([this, plan_id, id] (auto ep){
|
||||
sslog.warn("[Stream #{}] stream_transfer_task: Fail to send to {}: {}", plan_id, id, ep);
|
||||
this->session->on_error();
|
||||
std::rethrow_exception(ep);
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
@@ -78,7 +78,7 @@ public:
|
||||
return _total_size;
|
||||
}
|
||||
|
||||
void start();
|
||||
future<> execute();
|
||||
|
||||
void append_ranges(const dht::token_range_vector& ranges);
|
||||
void sort_and_merge_ranges();
|
||||
|
||||
10
test.py
10
test.py
@@ -81,7 +81,7 @@ boost_tests = [
|
||||
'virtual_reader_test',
|
||||
'counter_test',
|
||||
'cell_locker_test',
|
||||
'clustering_ranges_walker_test',
|
||||
'view_schema_test',
|
||||
]
|
||||
|
||||
other_tests = [
|
||||
@@ -128,7 +128,6 @@ if __name__ == "__main__":
|
||||
help='Verbose reporting')
|
||||
args = parser.parse_args()
|
||||
|
||||
black_hole = open('/dev/null', 'w')
|
||||
print_status = print_status_verbose if args.verbose else print_status_short
|
||||
|
||||
test_to_run = []
|
||||
@@ -136,9 +135,9 @@ if __name__ == "__main__":
|
||||
for mode in modes_to_run:
|
||||
prefix = os.path.join('build', mode, 'tests')
|
||||
for test in other_tests:
|
||||
test_to_run.append((os.path.join(prefix, test), 'other'))
|
||||
test_to_run.append((os.path.join(prefix, test), 'other', '-c2 -m4G'.split()))
|
||||
for test in boost_tests:
|
||||
test_to_run.append((os.path.join(prefix, test), 'boost'))
|
||||
test_to_run.append((os.path.join(prefix, test), 'boost', '-c2 -m4G'.split()))
|
||||
|
||||
if 'release' in modes_to_run:
|
||||
test_to_run.append(('build/release/tests/lsa_async_eviction_test', 'other',
|
||||
@@ -152,11 +151,9 @@ if __name__ == "__main__":
|
||||
test_to_run.append(('build/release/tests/row_cache_alloc_stress', 'other',
|
||||
'-c1 -m1G'.split()))
|
||||
test_to_run.append(('build/release/tests/sstable_test', 'boost', ['-c1']))
|
||||
test_to_run.append(('build/release/tests/view_schema_test', 'boost', ['-c1']))
|
||||
test_to_run.append(('build/release/tests/row_cache_stress_test', 'other', '-c1 -m1G --seconds 10'.split()))
|
||||
if 'debug' in modes_to_run:
|
||||
test_to_run.append(('build/debug/tests/sstable_test', 'boost', ['-c1']))
|
||||
test_to_run.append(('build/debug/tests/view_schema_test', 'boost', ['-c1']))
|
||||
|
||||
if args.name:
|
||||
test_to_run = [t for t in test_to_run if args.name in t[0]]
|
||||
@@ -168,6 +165,7 @@ if __name__ == "__main__":
|
||||
# disable false positive due to new (with_alignment(...)) ...
|
||||
env['ASAN_OPTIONS'] = 'alloc_dealloc_mismatch=0'
|
||||
env['UBSAN_OPTIONS'] = 'print_stacktrace=1'
|
||||
env['BOOST_TEST_CATCH_SYSTEM_ERRORS'] = 'no'
|
||||
for n, test in enumerate(test_to_run):
|
||||
path = test[0]
|
||||
exec_args = test[2] if len(test) >= 3 else []
|
||||
|
||||
@@ -291,7 +291,7 @@ BOOST_AUTO_TEST_CASE(test_composite_serialize_value) {
|
||||
|
||||
BOOST_AUTO_TEST_CASE(test_composite_from_exploded) {
|
||||
using components = std::vector<composite::component>;
|
||||
BOOST_REQUIRE_EQUAL(composite::from_exploded({bytes_view(bytes({'e', 'l', '1'}))}, composite::eoc::start).components(),
|
||||
BOOST_REQUIRE_EQUAL(composite::from_exploded({bytes_view(bytes({'e', 'l', '1'}))}, true, composite::eoc::start).components(),
|
||||
components({std::make_pair(bytes("el1"), composite::eoc::start)}));
|
||||
}
|
||||
|
||||
|
||||
@@ -753,6 +753,40 @@ SEASTAR_TEST_CASE(test_range_deletion_scenarios) {
|
||||
});
|
||||
}
|
||||
|
||||
// Checks that every DELETE restricted by a clustering-key range is rejected
// (i.e. execute_cql() fails with an exception) on a table created
// WITH COMPACT STORAGE.
SEASTAR_TEST_CASE(test_range_deletion_scenarios_with_compact_storage) {
    return do_with_cql_env_thread([] (auto& e) {
        e.execute_cql("create table cf (p int, c int, v text, primary key (p, c)) with compact storage;").get();
        for (auto i = 0; i < 10; ++i) {
            e.execute_cql(sprint("insert into cf (p, c, v) values (1, %d, 'abc');", i)).get();
        }

        // Runs the statement and requires that it throws. The check is made
        // outside of the catch-all handler on purpose: a BOOST_FAIL placed
        // inside the try block aborts the test case by throwing, and that
        // abort would itself be swallowed by `catch (...)`.
        auto require_rejected = [&e] (const char* query) {
            bool rejected = false;
            try {
                e.execute_cql(query).get();
            } catch (...) {
                rejected = true;
            }
            BOOST_REQUIRE_MESSAGE(rejected, "should've thrown: " << query);
        };

        // Every combination of open/closed lower and upper clustering bounds.
        const char* const range_deletes[] = {
            "delete from cf where p = 1 and c <= 3",
            "delete from cf where p = 1 and c >= 0",
            "delete from cf where p = 1 and c > 0 and c <= 3",
            "delete from cf where p = 1 and c >= 0 and c < 3",
            "delete from cf where p = 1 and c > 0 and c < 3",
            "delete from cf where p = 1 and c >= 0 and c <= 3",
        };
        for (const char* query : range_deletes) {
            require_rejected(query);
        }
    });
}
|
||||
|
||||
SEASTAR_TEST_CASE(test_map_insert_update) {
|
||||
return do_with_cql_env([] (auto& e) {
|
||||
auto make_my_map_type = [] { return map_type_impl::get_instance(int32_type, int32_type, true); };
|
||||
|
||||
@@ -120,7 +120,7 @@ public:
|
||||
});
|
||||
}
|
||||
|
||||
virtual future<bytes> prepare(sstring query) override {
|
||||
virtual future<cql3::prepared_cache_key_type> prepare(sstring query) override {
|
||||
return qp().invoke_on_all([query, this] (auto& local_qp) {
|
||||
auto qs = this->make_query_state();
|
||||
return local_qp.prepare(query, *qs).finally([qs] {}).discard_result();
|
||||
@@ -130,7 +130,7 @@ public:
|
||||
}
|
||||
|
||||
virtual future<::shared_ptr<cql_transport::messages::result_message>> execute_prepared(
|
||||
bytes id,
|
||||
cql3::prepared_cache_key_type id,
|
||||
std::vector<cql3::raw_value> values) override
|
||||
{
|
||||
auto prepared = local_qp().get_prepared(id);
|
||||
|
||||
@@ -32,6 +32,7 @@
|
||||
#include "transport/messages/result_message_base.hh"
|
||||
#include "cql3/query_options_fwd.hh"
|
||||
#include "cql3/values.hh"
|
||||
#include "cql3/prepared_statements_cache.hh"
|
||||
#include "bytes.hh"
|
||||
#include "schema.hh"
|
||||
|
||||
@@ -43,7 +44,7 @@ namespace cql3 {
|
||||
|
||||
class not_prepared_exception : public std::runtime_error {
|
||||
public:
|
||||
not_prepared_exception(const bytes& id) : std::runtime_error(sprint("Not prepared: %s", id)) {}
|
||||
not_prepared_exception(const cql3::prepared_cache_key_type& id) : std::runtime_error(sprint("Not prepared: %s", id)) {}
|
||||
};
|
||||
|
||||
namespace db {
|
||||
@@ -59,10 +60,10 @@ public:
|
||||
virtual future<::shared_ptr<cql_transport::messages::result_message>> execute_cql(
|
||||
const sstring& text, std::unique_ptr<cql3::query_options> qo) = 0;
|
||||
|
||||
virtual future<bytes> prepare(sstring query) = 0;
|
||||
virtual future<cql3::prepared_cache_key_type> prepare(sstring query) = 0;
|
||||
|
||||
virtual future<::shared_ptr<cql_transport::messages::result_message>> execute_prepared(
|
||||
bytes id, std::vector<cql3::raw_value> values) = 0;
|
||||
cql3::prepared_cache_key_type id, std::vector<cql3::raw_value> values) = 0;
|
||||
|
||||
virtual future<> create_table(std::function<schema(const sstring&)> schema_maker) = 0;
|
||||
|
||||
|
||||
321
tests/loading_cache_test.cc
Normal file
321
tests/loading_cache_test.cc
Normal file
@@ -0,0 +1,321 @@
|
||||
/*
|
||||
* Copyright (C) 2017 ScyllaDB
|
||||
*/
|
||||
|
||||
/*
|
||||
* This file is part of Scylla.
|
||||
*
|
||||
* Scylla is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Affero General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* Scylla is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#include <boost/test/unit_test.hpp>
|
||||
#include "utils/loading_shared_values.hh"
|
||||
#include "utils/loading_cache.hh"
|
||||
#include <seastar/core/file.hh>
|
||||
#include <seastar/core/thread.hh>
|
||||
#include <seastar/core/sstring.hh>
|
||||
#include <seastar/core/reactor.hh>
|
||||
#include <seastar/core/sleep.hh>
|
||||
|
||||
|
||||
#include "seastarx.hh"
|
||||
|
||||
#include "tests/test-utils.hh"
|
||||
#include "tmpdir.hh"
|
||||
#include "log.hh"
|
||||
|
||||
#include <vector>
|
||||
#include <numeric>
|
||||
#include <random>
|
||||
|
||||
/// Get a random integer in the [0, max) range.
///
/// The random-number engine is seeded once per thread on first use and then
/// reused, so repeated calls neither query std::random_device again nor
/// rebuild the Mersenne-Twister state.
///
/// \param max exclusive upper bound of the random value range; must be positive
/// \return The uniformly distributed random integer from the [0, \p max) range.
static int rand_int(int max) {
    // seeded exactly once per thread (Mersenne-Twister engine)
    static thread_local std::mt19937 rng{std::random_device{}()};
    std::uniform_int_distribution<int> uni(0, max - 1); // guaranteed unbiased
    return uni(rng);
}
|
||||
|
||||
|
||||
#include "disk-error-handler.hh"
|
||||
|
||||
// Definitions of the disk-error signals; presumably required by
// "disk-error-handler.hh" for the test binary to link - TODO confirm.
thread_local disk_error_signal_type general_disk_error;
|
||||
thread_local disk_error_signal_type commit_error;
|
||||
|
||||
// Name of the file (inside the per-thread tmpdir) that loader() reads.
static const sstring test_file_name = "loading_cache_test.txt";
|
||||
// Contents written by prepare() and expected back by loader().
static const sstring test_string = "1";
|
||||
// Set by prepare() once the test file has been written, so later calls are no-ops.
static bool file_prepared = false;
|
||||
// Number of keys / concurrent load requests used by the tests.
static constexpr int num_loaders = 1000;
|
||||
|
||||
// Logger handed to the utils::loading_cache instances under test.
static logging::logger test_logger("loading_cache_test");
|
||||
|
||||
// Number of completed loader() invocations; reset by each test that checks it.
static thread_local int load_count;
|
||||
/// Returns the calling thread's temporary directory object, which is
/// constructed lazily on the first call from that thread.
static const tmpdir& get_tmpdir() {
    static thread_local tmpdir scratch_dir;
    const tmpdir& dir = scratch_dir;
    return dir;
}
|
||||
|
||||
/// Writes the test file (test_string plus its terminating NUL) into the
/// per-thread temporary directory.
///
/// Does the work only once: subsequent calls return immediately after
/// file_prepared has been set. The file is closed explicitly once the write
/// has finished (successfully or not) instead of being left for the `file`
/// destructor.
///
/// \return a future that resolves when the file has been written and closed
static future<> prepare() {
    if (file_prepared) {
        return make_ready_future<>();
    }

    auto path = boost::filesystem::path(get_tmpdir().path) / test_file_name.c_str();
    return open_file_dma(path.c_str(), open_flags::create | open_flags::wo).then([] (file f) {
        // do_with() keeps the file object alive until both the write and the close complete
        return do_with(std::move(f), [] (file& f) {
            return f.dma_write(0, test_string.c_str(), test_string.size() + 1).then([] (size_t s) {
                BOOST_REQUIRE_EQUAL(s, test_string.size() + 1);
                file_prepared = true;
            }).finally([&f] {
                return f.close();
            });
        });
    });
}
|
||||
|
||||
/// Loader callback used by all tests: reads the test file back, verifies its
/// contents and bumps load_count.
///
/// The file is closed explicitly once the read has finished (successfully or
/// not) instead of being left for the `file` destructor.
///
/// \param k the key being loaded; ignored, every key maps to the same value
/// \return a future holding the string read from the file (equal to test_string)
static future<sstring> loader(const int& k) {
    auto path = boost::filesystem::path(get_tmpdir().path) / test_file_name.c_str();
    return open_file_dma(path.c_str(), open_flags::ro).then([] (file f) -> future<sstring> {
        // do_with() keeps the file object alive until both the read and the close complete
        return do_with(std::move(f), [] (file& f) -> future<sstring> {
            return f.dma_read_exactly<char>(0, test_string.size() + 1).then([] (auto buf) {
                sstring str(buf.get());
                BOOST_REQUIRE_EQUAL(str, test_string);
                ++load_count;
                return make_ready_future<sstring>(std::move(str));
            }).finally([&f] {
                return f.close();
            });
        });
    });
}
|
||||
|
||||
// num_loaders concurrent get_or_load() requests for one and the same key
// must result in a single loader() call and a single stored entry.
SEASTAR_TEST_CASE(test_loading_shared_values_parallel_loading_same_key) {
    return seastar::async([] {
        using shared_values_type = utils::loading_shared_values<int, sstring>;

        // every request asks for key 0
        std::vector<int> keys(num_loaders, 0);
        load_count = 0;
        shared_values_type values;
        // keeps the returned entries referenced until the checks are done
        std::list<shared_values_type::entry_ptr> anchors;

        prepare().get();

        auto request = parallel_for_each(keys, [&values, &anchors] (int& key) {
            return values.get_or_load(key, loader).then([&anchors] (auto ptr) {
                anchors.emplace_back(std::move(ptr));
            });
        });
        request.get();

        // "loader" must be called exactly once
        BOOST_REQUIRE_EQUAL(load_count, 1);
        BOOST_REQUIRE_EQUAL(values.size(), 1);
        anchors.clear();
    });
}
|
||||
|
||||
// num_loaders concurrent get_or_load() requests, each for a distinct key,
// must result in one loader() call and one stored entry per key.
SEASTAR_TEST_CASE(test_loading_shared_values_parallel_loading_different_keys) {
    return seastar::async([] {
        using shared_values_type = utils::loading_shared_values<int, sstring>;

        // keys 0 .. num_loaders - 1
        std::vector<int> keys(num_loaders);
        std::iota(keys.begin(), keys.end(), 0);
        load_count = 0;
        shared_values_type values;
        // keeps the returned entries referenced until the checks are done
        std::list<shared_values_type::entry_ptr> anchors;

        prepare().get();

        auto request = parallel_for_each(keys, [&values, &anchors] (int& key) {
            return values.get_or_load(key, loader).then([&anchors] (auto ptr) {
                anchors.emplace_back(std::move(ptr));
            });
        });
        request.get();

        // "loader" must be called once for each key
        BOOST_REQUIRE_EQUAL(load_count, num_loaders);
        BOOST_REQUIRE_EQUAL(values.size(), num_loaders);
        anchors.clear();
    });
}
|
||||
|
||||
// Checks the relation between the number of stored entries and the number of
// buckets while the container grows one key at a time and then shrinks again.
SEASTAR_TEST_CASE(test_loading_shared_values_rehash) {
    return seastar::async([] {
        load_count = 0;
        utils::loading_shared_values<int, sstring> shared_values;
        // keeps the loaded entries referenced; popping an anchor releases its entry
        std::list<typename utils::loading_shared_values<int, sstring>::entry_ptr> anchors_list;

        prepare().get();

        // growing: after every insertion size() must not exceed 3/4 of buckets_count()
        for (int k = 0; k < num_loaders; ++k) {
            shared_values.get_or_load(k, loader).then([&] (auto entry_ptr) {
                anchors_list.emplace_back(std::move(entry_ptr));
            }).get();
            BOOST_REQUIRE_LE(shared_values.size(), 3 * shared_values.buckets_count() / 4);
        }

        BOOST_REQUIRE_GE(shared_values.size(), shared_values.buckets_count() / 4);

        // shrinking: after every explicit rehash() size() must be at least 1/4 of buckets_count().
        // minimum buckets count (by default) is 16, so don't check for less than 4 elements
        for (int k = 0; k < num_loaders - 4; ++k) {
            anchors_list.pop_back();
            shared_values.rehash();
            BOOST_REQUIRE_GE(shared_values.size(), shared_values.buckets_count() / 4);
        }

        anchors_list.clear();
    });
}
|
||||
|
||||
// Loads num_loaders distinct keys in parallel, then drops the only anchor of
// one randomly chosen key and requires that key to be gone from the container.
SEASTAR_TEST_CASE(test_loading_shared_values_parallel_loading_explicit_eviction) {
    return seastar::async([] {
        using shared_values_type = utils::loading_shared_values<int, sstring>;

        // keys 0 .. num_loaders - 1
        std::vector<int> keys(num_loaders);
        std::iota(keys.begin(), keys.end(), 0);
        load_count = 0;
        shared_values_type values;
        // anchors[k] holds the entry loaded for key k
        std::vector<shared_values_type::entry_ptr> anchors(num_loaders);

        prepare().get();

        auto request = parallel_for_each(keys, [&] (int& key) {
            return values.get_or_load(key, loader).then([&] (auto ptr) {
                anchors[key] = std::move(ptr);
            });
        });
        request.get();

        int rand_key = rand_int(num_loaders);
        const bool present_before = values.find(rand_key) != values.end();
        BOOST_REQUIRE(present_before);

        // release the only reference held by the test
        anchors[rand_key] = nullptr;
        BOOST_REQUIRE_MESSAGE(values.find(rand_key) == values.end(), format("explicit removal for key {} failed", rand_key));
        anchors.clear();
    });
}
|
||||
|
||||
// num_loaders concurrent get_ptr() requests for one and the same key must
// result in a single loader() call and a single cached entry.
SEASTAR_TEST_CASE(test_loading_cache_loading_same_key) {
    return seastar::async([] {
        using namespace std::chrono;

        // every request asks for key 0
        std::vector<int> keys(num_loaders, 0);
        load_count = 0;
        utils::loading_cache<int, sstring> cache(num_loaders, 1s, test_logger);

        prepare().get();

        auto request = parallel_for_each(keys, [&cache] (int& key) {
            return cache.get_ptr(key, loader).discard_result();
        });
        request.get();

        // "loader" must be called exactly once
        BOOST_REQUIRE_EQUAL(load_count, 1);
        BOOST_REQUIRE_EQUAL(cache.size(), 1);
        cache.stop().get();
    });
}
|
||||
|
||||
// num_loaders concurrent get_ptr() requests, each for a distinct key, must
// result in one loader() call and one cached entry per key.
SEASTAR_TEST_CASE(test_loading_cache_loading_different_keys) {
    return seastar::async([] {
        using namespace std::chrono;

        // keys 0 .. num_loaders - 1
        std::vector<int> keys(num_loaders);
        std::iota(keys.begin(), keys.end(), 0);
        load_count = 0;
        utils::loading_cache<int, sstring> cache(num_loaders, 1s, test_logger);

        prepare().get();

        auto request = parallel_for_each(keys, [&cache] (int& key) {
            return cache.get_ptr(key, loader).discard_result();
        });
        request.get();

        BOOST_REQUIRE_EQUAL(load_count, num_loaders);
        BOOST_REQUIRE_EQUAL(cache.size(), num_loaders);
        cache.stop().get();
    });
}
|
||||
|
||||
// Loads a single key into a cache constructed with a 20ms period and requires
// the key to disappear from the cache within a bounded number of 40ms sleeps.
SEASTAR_TEST_CASE(test_loading_cache_loading_expiry_eviction) {
    return seastar::async([] {
        using namespace std::chrono;
        utils::loading_cache<int, sstring> cache(num_loaders, 20ms, test_logger);

        prepare().get();

        cache.get_ptr(0, loader).discard_result().get();

        BOOST_REQUIRE(cache.find(0) != cache.end());

        // timers get delayed sometimes (especially in a debug mode)
        constexpr int max_retry = 10;
        int attempt = 0;
        auto evicted_or_out_of_retries = [&] {
            if (attempt++ > max_retry) {
                return true;
            }
            return cache.find(0) == cache.end();
        };
        auto wait_a_bit = [] {
            return sleep(40ms);
        };
        do_until(evicted_or_out_of_retries, wait_a_bit).get();

        BOOST_REQUIRE(cache.find(0) == cache.end());
        cache.stop().get();
    });
}
|
||||
|
||||
// With reloading enabled (constructor periods of 100ms and 20ms), a key that
// was loaded once must have been loaded at least twice after 60ms.
SEASTAR_TEST_CASE(test_loading_cache_loading_reloading) {
    return seastar::async([] {
        using namespace std::chrono;
        using reloading_cache_type = utils::loading_cache<int, sstring, utils::loading_cache_reload_enabled::yes>;

        load_count = 0;
        reloading_cache_type cache(num_loaders, 100ms, 20ms, test_logger, loader);
        prepare().get();

        // the initial, explicit load
        auto first_load = cache.get_ptr(0, loader).discard_result();
        first_load.get();

        // leave time for at least one reload
        sleep(60ms).get();
        BOOST_REQUIRE_MESSAGE(load_count >= 2, format("load_count is {}", load_count));
        cache.stop().get();
    });
}
|
||||
|
||||
// Alternates between two keys on a cache constructed with a size of 1: every
// request must call loader() and the cache must end up with one entry.
SEASTAR_TEST_CASE(test_loading_cache_max_size_eviction) {
    return seastar::async([] {
        using namespace std::chrono;

        load_count = 0;
        utils::loading_cache<int, sstring> cache(1, 1s, test_logger);

        prepare().get();

        // requests go 0, 1, 0, 1, ... strictly one after another
        int round = 0;
        while (round < num_loaders) {
            const int key = round % 2;
            cache.get_ptr(key, loader).discard_result().get();
            ++round;
        }

        BOOST_REQUIRE_EQUAL(load_count, num_loaders);
        BOOST_REQUIRE_EQUAL(cache.size(), 1);
        cache.stop().get();
    });
}
|
||||
|
||||
// Keeps requesting two alternating keys for more than a second on a
// reload-enabled cache constructed with a size of 1, then requires the cache
// to hold exactly one entry.
SEASTAR_TEST_CASE(test_loading_cache_reload_during_eviction) {
    return seastar::async([] {
        using namespace std::chrono;
        using reloading_cache_type = utils::loading_cache<int, sstring, utils::loading_cache_reload_enabled::yes>;

        load_count = 0;
        reloading_cache_type cache(1, 100ms, 10ms, test_logger, loader);

        prepare().get();

        const auto deadline = lowres_clock::now() + 1s;
        int request_no = 0;

        auto time_is_up = [&deadline] {
            return lowres_clock::now() > deadline;
        };
        auto request_next_key = [&cache, &request_no] {
            return cache.get_ptr(request_no++ % 2).discard_result();
        };

        // this will cause reloading when values are being actively evicted due to the limited cache size
        do_until(time_is_up, request_next_key).get();

        BOOST_REQUIRE_EQUAL(cache.size(), 1);
        cache.stop().get();
    });
}
|
||||
@@ -1194,3 +1194,39 @@ SEASTAR_TEST_CASE(test_reclaiming_runs_as_long_as_there_is_soft_pressure) {
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
// Fills LSA with a bit more than two zones worth of objects, frees most of
// them and then requires that reclaiming one segment does not increase the
// amount of non-LSA memory in use.
SEASTAR_TEST_CASE(test_zone_reclaiming_preserves_free_size) {
    return seastar::async([] {
        region r;
        with_allocator(r.allocator(), [&] {
            chunked_fifo<managed_bytes> objs;
            auto& lsa = logalloc::shard_tracker();

            // logs the current non-LSA usage and the region occupancy
            auto report_usage = [&lsa] {
                BOOST_TEST_MESSAGE(lsa.non_lsa_used_space());
                BOOST_TEST_MESSAGE(lsa.region_occupancy());
            };

            auto zone_size = max_zone_segments * segment_size;
            auto fill_target = zone_size * 2 + zone_size / 4;

            // We need to generate 3 zones, so that at least one zone (not last) can be released fully. The first
            // zone would not due to emergency reserve.
            while (lsa.region_occupancy().used_space() < fill_target) {
                objs.emplace_back(managed_bytes(managed_bytes::initialized_later(), 1024));
            }

            report_usage();

            // free objects until less than two segments worth of data is in use
            while (lsa.region_occupancy().used_space() >= logalloc::segment_size * 2) {
                objs.pop_front();
            }

            report_usage();

            auto before = lsa.non_lsa_used_space();
            lsa.reclaim(logalloc::segment_size);
            auto after = lsa.non_lsa_used_space();

            report_usage();

            BOOST_REQUIRE(after <= before);
        });
    });
}
|
||||
|
||||
@@ -26,6 +26,7 @@
|
||||
#include "tests/test-utils.hh"
|
||||
#include "tests/mutation_assertions.hh"
|
||||
#include "tests/mutation_reader_assertions.hh"
|
||||
#include "tests/test_services.hh"
|
||||
|
||||
#include "mutation_reader.hh"
|
||||
#include "core/do_with.hh"
|
||||
|
||||
@@ -259,8 +259,9 @@ static void test_fast_forwarding_across_partitions_to_empty_range(populate_fn po
|
||||
|
||||
mutation_source ms = populate(s, partitions);
|
||||
|
||||
auto pr = dht::partition_range::make({keys[0]}, {keys[1]});
|
||||
mutation_reader rd = ms(s,
|
||||
dht::partition_range::make({keys[0]}, {keys[1]}),
|
||||
pr,
|
||||
query::full_slice,
|
||||
default_priority_class(),
|
||||
nullptr,
|
||||
@@ -280,14 +281,16 @@ static void test_fast_forwarding_across_partitions_to_empty_range(populate_fn po
|
||||
// ...don't finish consumption to leave the reader in the middle of partition
|
||||
}
|
||||
|
||||
rd.fast_forward_to(dht::partition_range::make({missing_key}, {missing_key})).get();
|
||||
pr = dht::partition_range::make({missing_key}, {missing_key});
|
||||
rd.fast_forward_to(pr).get();
|
||||
|
||||
{
|
||||
streamed_mutation_opt smo = rd().get0();
|
||||
BOOST_REQUIRE(!smo);
|
||||
}
|
||||
|
||||
rd.fast_forward_to(dht::partition_range::make({keys[3]}, {keys[3]})).get();
|
||||
pr = dht::partition_range::make({keys[3]}, {keys[3]});
|
||||
rd.fast_forward_to(pr).get();
|
||||
|
||||
{
|
||||
streamed_mutation_opt smo = rd().get0();
|
||||
@@ -303,7 +306,8 @@ static void test_fast_forwarding_across_partitions_to_empty_range(populate_fn po
|
||||
BOOST_REQUIRE(!smo);
|
||||
}
|
||||
|
||||
rd.fast_forward_to(dht::partition_range::make_starting_with({keys[keys.size() - 1]})).get();
|
||||
pr = dht::partition_range::make_starting_with({keys[keys.size() - 1]});
|
||||
rd.fast_forward_to(pr).get();
|
||||
|
||||
{
|
||||
streamed_mutation_opt smo = rd().get0();
|
||||
@@ -314,7 +318,8 @@ static void test_fast_forwarding_across_partitions_to_empty_range(populate_fn po
|
||||
// ...don't finish consumption to leave the reader in the middle of partition
|
||||
}
|
||||
|
||||
rd.fast_forward_to(dht::partition_range::make({key_after_all}, {key_after_all})).get();
|
||||
pr = dht::partition_range::make({key_after_all}, {key_after_all});
|
||||
rd.fast_forward_to(pr).get();
|
||||
|
||||
{
|
||||
streamed_mutation_opt smo = rd().get0();
|
||||
@@ -1274,7 +1279,7 @@ public:
|
||||
set_random_cells(row.cells(), column_kind::regular_column);
|
||||
row.marker() = random_row_marker();
|
||||
} else {
|
||||
m.partition().clustered_row(*_schema, ckey, is_dummy::yes, continuous);
|
||||
m.partition().clustered_row(*_schema, position_in_partition::after_all_clustered_rows(), is_dummy::yes, continuous);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -44,6 +44,7 @@
|
||||
#include "tests/mutation_assertions.hh"
|
||||
#include "tests/mutation_reader_assertions.hh"
|
||||
#include "tests/result_set_assertions.hh"
|
||||
#include "tests/test_services.hh"
|
||||
#include "mutation_source_test.hh"
|
||||
#include "cell_locking.hh"
|
||||
|
||||
@@ -279,6 +280,7 @@ SEASTAR_TEST_CASE(test_list_mutations) {
|
||||
|
||||
SEASTAR_TEST_CASE(test_multiple_memtables_one_partition) {
|
||||
return seastar::async([] {
|
||||
storage_service_for_tests ssft;
|
||||
auto s = make_lw_shared(schema({}, some_keyspace, some_column_family,
|
||||
{{"p1", utf8_type}}, {{"c1", int32_type}}, {{"r1", int32_type}}, {}, utf8_type));
|
||||
|
||||
@@ -343,6 +345,7 @@ SEASTAR_TEST_CASE(test_flush_in_the_middle_of_a_scan) {
|
||||
|
||||
return with_column_family(s, cfg, [s](column_family& cf) {
|
||||
return seastar::async([s, &cf] {
|
||||
storage_service_for_tests ssft;
|
||||
// populate
|
||||
auto new_key = [&] {
|
||||
static thread_local int next = 0;
|
||||
@@ -406,6 +409,7 @@ SEASTAR_TEST_CASE(test_flush_in_the_middle_of_a_scan) {
|
||||
}
|
||||
|
||||
SEASTAR_TEST_CASE(test_multiple_memtables_multiple_partitions) {
|
||||
return seastar::async([] {
|
||||
auto s = make_lw_shared(schema({}, some_keyspace, some_column_family,
|
||||
{{"p1", int32_type}}, {{"c1", int32_type}}, {{"r1", int32_type}}, {}, utf8_type));
|
||||
|
||||
@@ -416,7 +420,7 @@ SEASTAR_TEST_CASE(test_multiple_memtables_multiple_partitions) {
|
||||
cfg.enable_disk_writes = false;
|
||||
cfg.enable_incremental_backups = false;
|
||||
cfg.cf_stats = &*cf_stats;
|
||||
return with_column_family(s, cfg, [s] (auto& cf) mutable {
|
||||
with_column_family(s, cfg, [s] (auto& cf) mutable {
|
||||
std::map<int32_t, std::map<int32_t, int32_t>> shadow, result;
|
||||
|
||||
const column_definition& r1_col = *s->get_column_definition("r1");
|
||||
@@ -456,7 +460,8 @@ SEASTAR_TEST_CASE(test_multiple_memtables_multiple_partitions) {
|
||||
BOOST_REQUIRE(shadow == result);
|
||||
});
|
||||
});
|
||||
}).then([cf_stats] {});
|
||||
}).then([cf_stats] {}).get();
|
||||
});
|
||||
}
|
||||
|
||||
SEASTAR_TEST_CASE(test_cell_ordering) {
|
||||
|
||||
@@ -592,6 +592,24 @@ BOOST_AUTO_TEST_CASE(test_add_overlapping_range_to_range_with_empty_end) {
|
||||
BOOST_REQUIRE(it == l.end());
|
||||
}
|
||||
|
||||
// Reproduces https://github.com/scylladb/scylla/issues/3083
|
||||
BOOST_AUTO_TEST_CASE(test_coalescing_with_end_bound_inclusiveness_change_with_prefix_bound) {
|
||||
range_tombstone_list l(*s);
|
||||
|
||||
auto rt1 = rtie(4, 8, 4);
|
||||
auto rt2 = range_tombstone(key({8, 1}), bound_kind::incl_start, key({10}), bound_kind::excl_end, {1, gc_now});
|
||||
|
||||
l.apply(*s, rt1);
|
||||
l.apply(*s, rt2);
|
||||
|
||||
l.apply(*s, rt(1, 5, 4));
|
||||
|
||||
auto it = l.begin();
|
||||
assert_rt(rtie(1, 8, 4), *it++);
|
||||
assert_rt(rt2, *it++);
|
||||
BOOST_REQUIRE(it == l.end());
|
||||
}
|
||||
|
||||
BOOST_AUTO_TEST_CASE(test_search_with_empty_start) {
|
||||
range_tombstone_list l(*s);
|
||||
|
||||
|
||||
@@ -1886,3 +1886,47 @@ SEASTAR_TEST_CASE(test_concurrent_population_before_latest_version_iterator) {
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
// Applies two mutations with overlapping range tombstones to the cache while
// a streamed mutation opened in between is still alive, then requires a fresh
// reader to produce the merged result (m1 + m2).
SEASTAR_TEST_CASE(test_tombstone_merging_of_overlapping_tombstones_in_many_versions) {
    return seastar::async([] {
        simple_schema s;
        cache_tracker tracker;
        memtable_snapshot_source underlying(s.schema());

        auto pk = s.make_pkey(0);
        auto pr = dht::partition_range::make_singular(pk);

        mutation m1(pk, s.schema());
        {
            auto older_rt = s.make_range_tombstone(s.make_ckey_range(2, 107), s.new_tombstone());
            m1.partition().apply_delete(*s.schema(), std::move(older_rt));
        }
        s.add_row(m1, s.make_ckey(5), "val");

        // What is important here is that it contains a newer range tombstone
        // which trims [2, 107] from m1 into (100, 107], which starts after ck=5.
        mutation m2(pk, s.schema());
        {
            auto newer_rt = s.make_range_tombstone(s.make_ckey_range(1, 100), s.new_tombstone());
            m2.partition().apply_delete(*s.schema(), std::move(newer_rt));
        }

        row_cache cache(s.schema(), snapshot_source([&] { return underlying(); }), tracker);

        // Opens the first partition produced by a fresh cache reader and
        // limits its buffer size to 1.
        auto make_sm = [&] {
            auto reader = cache.make_reader(s.schema());
            auto first_partition = reader().get0();
            BOOST_REQUIRE(first_partition);
            first_partition->set_max_buffer_size(1);
            return std::move(*first_partition);
        };

        apply(cache, underlying, m1);
        populate_range(cache, pr, s.make_ckey_range(0, 3));

        // Never read from; it only has to stay alive while m2 is applied
        // (presumably so that the cache ends up with more than one version
        // of the partition - TODO confirm).
        auto sm1 = make_sm();

        apply(cache, underlying, m2);

        auto merged = m1 + m2;
        assert_that(cache.make_reader(s.schema()))
            .produces(merged)
            .produces_end_of_stream();
    });
}
|
||||
|
||||
@@ -408,7 +408,7 @@ SEASTAR_TEST_CASE(test_prepared_statement_is_invalidated_by_schema_change) {
|
||||
logging::logger_registry().set_logger_level("query_processor", logging::log_level::debug);
|
||||
e.execute_cql("create keyspace tests with replication = { 'class' : 'SimpleStrategy', 'replication_factor' : 1 };").get();
|
||||
e.execute_cql("create table tests.table1 (pk int primary key, c1 int, c2 int);").get();
|
||||
bytes id = e.prepare("select * from tests.table1;").get0();
|
||||
auto id = e.prepare("select * from tests.table1;").get0();
|
||||
|
||||
e.execute_cql("alter table tests.table1 add s1 int;").get();
|
||||
|
||||
|
||||
@@ -43,12 +43,16 @@ public:
|
||||
api::timestamp_type new_timestamp() {
|
||||
return _timestamp++;
|
||||
}
|
||||
tombstone new_tombstone() {
|
||||
return {new_timestamp(), gc_clock::now()};
|
||||
}
|
||||
public:
|
||||
simple_schema()
|
||||
using with_static = bool_class<class static_tag>;
|
||||
simple_schema(with_static ws = with_static::yes)
|
||||
: _s(schema_builder("ks", "cf")
|
||||
.with_column("pk", utf8_type, column_kind::partition_key)
|
||||
.with_column("ck", utf8_type, column_kind::clustering_key)
|
||||
.with_column("s1", utf8_type, column_kind::static_column)
|
||||
.with_column("s1", utf8_type, ws ? column_kind::static_column : column_kind::regular_column)
|
||||
.with_column("v", utf8_type)
|
||||
.build())
|
||||
, _v_def(*_s->get_column_definition(to_bytes("v")))
|
||||
|
||||
@@ -52,7 +52,7 @@ public:
|
||||
auto& prev = pi->entries[0];
|
||||
for (size_t i = 1; i < pi->entries.size(); ++i) {
|
||||
auto& cur = pi->entries[i];
|
||||
if (!pos_cmp(prev.end, cur.start)) {
|
||||
if (pos_cmp(cur.start, prev.end)) {
|
||||
std::cout << "promoted index:\n";
|
||||
for (auto& e : pi->entries) {
|
||||
std::cout << " " << e.start << "-" << e.end << ": +" << e.offset << " len=" << e.width << std::endl;
|
||||
@@ -66,6 +66,16 @@ public:
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
|
||||
// Walks over all partition entries of the index and requires that none of
// them has a promoted index. Returns *this to allow chaining.
index_reader_assertions& is_empty(const schema& s) {
    for (_r->read_partition_data().get(); !_r->eof(); _r->advance_to_next_partition().get()) {
        auto* promoted = _r->current_partition_entry().get_promoted_index(s);
        BOOST_REQUIRE(promoted == nullptr);
    }
    return *this;
}
|
||||
};
|
||||
|
||||
inline
|
||||
|
||||
@@ -47,6 +47,7 @@
|
||||
#include "cell_locking.hh"
|
||||
#include "simple_schema.hh"
|
||||
#include "memtable-sstable.hh"
|
||||
#include "tests/sstable_assertions.hh"
|
||||
|
||||
#include <stdio.h>
|
||||
#include <ftw.h>
|
||||
@@ -1009,6 +1010,8 @@ static ::mutation_reader sstable_reader(shared_sstable sst, schema_ptr s, const
|
||||
}
|
||||
|
||||
SEASTAR_TEST_CASE(compaction_manager_test) {
|
||||
return seastar::async([] {
|
||||
storage_service_for_tests ssft;
|
||||
BOOST_REQUIRE(smp::count == 1);
|
||||
auto s = make_lw_shared(schema({}, some_keyspace, some_column_family,
|
||||
{{"p1", utf8_type}}, {{"c1", utf8_type}}, {{"r1", int32_type}}, {}, utf8_type));
|
||||
@@ -1030,7 +1033,7 @@ SEASTAR_TEST_CASE(compaction_manager_test) {
|
||||
|
||||
auto generations = make_lw_shared<std::vector<unsigned long>>({1, 2, 3, 4});
|
||||
|
||||
return do_for_each(*generations, [generations, cf, cm, s, tmp] (unsigned long generation) {
|
||||
do_for_each(*generations, [generations, cf, cm, s, tmp] (unsigned long generation) {
|
||||
// create 4 sstables of similar size to be compacted later on.
|
||||
|
||||
auto mt = make_lw_shared<memtable>(s);
|
||||
@@ -1083,7 +1086,8 @@ SEASTAR_TEST_CASE(compaction_manager_test) {
|
||||
});
|
||||
}).finally([s, cm, tmp, cl_stats] {
|
||||
return cm->stop().then([cm] {});
|
||||
});
|
||||
}).get();
|
||||
});
|
||||
}
|
||||
|
||||
SEASTAR_TEST_CASE(compact) {
|
||||
@@ -1650,8 +1654,6 @@ SEASTAR_TEST_CASE(datafile_generation_47) {
|
||||
SEASTAR_TEST_CASE(test_counter_write) {
|
||||
return test_setup::do_with_test_directory([] {
|
||||
return seastar::async([] {
|
||||
storage_service_for_tests ssft;
|
||||
|
||||
auto s = schema_builder(some_keyspace, some_column_family)
|
||||
.with_column("p1", utf8_type, column_kind::partition_key)
|
||||
.with_column("c1", utf8_type, column_kind::clustering_key)
|
||||
@@ -2280,6 +2282,7 @@ static shared_sstable make_sstable_containing(std::function<shared_sstable()> ss
|
||||
SEASTAR_TEST_CASE(tombstone_purge_test) {
|
||||
BOOST_REQUIRE(smp::count == 1);
|
||||
return seastar::async([] {
|
||||
storage_service_for_tests ssft;
|
||||
cell_locker_stats cl_stats;
|
||||
|
||||
// In a column family with gc_grace_seconds set to 0, check that a tombstone
|
||||
@@ -3175,6 +3178,7 @@ static void test_min_max_clustering_key(schema_ptr s, std::vector<bytes> explode
|
||||
|
||||
SEASTAR_TEST_CASE(min_max_clustering_key_test) {
|
||||
return seastar::async([] {
|
||||
storage_service_for_tests ssft;
|
||||
{
|
||||
auto s = schema_builder("ks", "cf")
|
||||
.with_column("pk", utf8_type, column_kind::partition_key)
|
||||
@@ -3222,6 +3226,7 @@ SEASTAR_TEST_CASE(min_max_clustering_key_test) {
|
||||
|
||||
SEASTAR_TEST_CASE(min_max_clustering_key_test_2) {
|
||||
return seastar::async([] {
|
||||
storage_service_for_tests ssft;
|
||||
auto s = schema_builder("ks", "cf")
|
||||
.with_column("pk", utf8_type, column_kind::partition_key)
|
||||
.with_column("ck1", utf8_type, column_kind::clustering_key)
|
||||
@@ -3270,6 +3275,7 @@ SEASTAR_TEST_CASE(min_max_clustering_key_test_2) {
|
||||
|
||||
SEASTAR_TEST_CASE(sstable_tombstone_metadata_check) {
|
||||
return seastar::async([] {
|
||||
storage_service_for_tests ssft;
|
||||
auto s = schema_builder("ks", "cf")
|
||||
.with_column("pk", utf8_type, column_kind::partition_key)
|
||||
.with_column("ck1", utf8_type, column_kind::clustering_key)
|
||||
@@ -3439,6 +3445,7 @@ shared_sstable make_sstable(sstring path, streamed_mutation sm, sstable_writer_c
|
||||
|
||||
SEASTAR_TEST_CASE(test_repeated_tombstone_skipping) {
|
||||
return seastar::async([] {
|
||||
storage_service_for_tests ssft;
|
||||
simple_schema table;
|
||||
|
||||
std::vector<mutation_fragment> fragments;
|
||||
@@ -3506,6 +3513,7 @@ uint64_t consume_all(streamed_mutation& sm) {
|
||||
|
||||
SEASTAR_TEST_CASE(test_skipping_using_index) {
|
||||
return seastar::async([] {
|
||||
storage_service_for_tests ssft;
|
||||
simple_schema table;
|
||||
|
||||
const unsigned rows_per_part = 10;
|
||||
@@ -3843,3 +3851,63 @@ SEASTAR_TEST_CASE(test_wrong_counter_shard_order) {
|
||||
BOOST_REQUIRE(!reader().get0());
|
||||
});
|
||||
}
|
||||
|
||||
// Loads a pre-built "ka" sstable fixture (named as having a broken non-compound
// promoted index) and checks that its index reader is reported as empty.
//
// Fixture origin:
//   create table ks.test (pk int, ck int, v int, primary key(pk, ck)) with compact storage;
//
// Populated with:
//   insert into ks.test (pk, ck, v) values (1, 1, 1);
//   insert into ks.test (pk, ck, v) values (1, 2, 1);
//   insert into ks.test (pk, ck, v) values (1, 3, 1);
//   delete from ks.test where pk = 1 and ck = 2;
SEASTAR_TEST_CASE(test_broken_promoted_index_is_skipped) {
    return seastar::async([] {
        // Schema mirroring the CQL table the fixture was generated from.
        schema_builder builder("ks", "test");
        builder.with_column("pk", int32_type, column_kind::partition_key);
        builder.with_column("ck", int32_type, column_kind::clustering_key);
        builder.with_column("v", int32_type);
        auto table_schema = builder.build(schema_builder::compact_storage::yes);

        auto fixture = sstables::make_sstable(table_schema, "tests/sstables/broken_non_compound_pi_and_range_tombstone", 1, sstables::sstable::version_types::ka, big);
        fixture->load().get0();

        assert_that(fixture->get_index_reader(default_priority_class())).is_empty(*table_schema);
    });
}
|
||||
|
||||
// Reads the row ck = 2 of partition pk = 1 from a pre-built "ka" sstable fixture
// and checks that the result equals a mutation holding both the live cell and
// the row deletion for that clustering key.
//
// Fixture origin:
//   create table ks.test (pk int, ck int, v int, primary key(pk, ck)) with compact storage;
//
// Populated with:
//   insert into ks.test (pk, ck, v) values (1, 1, 1);
//   insert into ks.test (pk, ck, v) values (1, 2, 1);
//   insert into ks.test (pk, ck, v) values (1, 3, 1);
//   delete from ks.test where pk = 1 and ck = 2;
SEASTAR_TEST_CASE(test_old_format_non_compound_range_tombstone_is_read) {
    return seastar::async([] {
        schema_builder builder("ks", "test");
        builder.with_column("pk", int32_type, column_kind::partition_key);
        builder.with_column("ck", int32_type, column_kind::clustering_key);
        builder.with_column("v", int32_type);
        auto table_schema = builder.build(schema_builder::compact_storage::yes);

        auto fixture = sstables::make_sstable(table_schema, "tests/sstables/broken_non_compound_pi_and_range_tombstone", 1, sstables::sstable::version_types::ka, big);
        fixture->load().get0();

        // Expected content for (pk = 1, ck = 2): the inserted cell plus the later delete.
        auto raw_pk = partition_key::from_exploded(*table_schema, { int32_type->decompose(1) });
        auto decorated_pk = dht::global_partitioner().decorate_key(*table_schema, raw_pk);
        auto deleted_ck = clustering_key::from_exploded(*table_schema, {int32_type->decompose(2)});
        mutation expected(decorated_pk, table_schema);
        const auto& v_def = *table_schema->get_column_definition("v");
        expected.set_clustered_cell(deleted_ck, v_def, atomic_cell::make_live(1511270919978349, int32_type->decompose(1), { }));
        expected.partition().apply_delete(*table_schema, deleted_ck, {1511270943827278, gc_clock::from_time_t(1511270943)});

        // Restrict the read to exactly the deleted clustering key.
        auto slice = partition_slice_builder(*table_schema).with_range(query::clustering_range::make_singular({deleted_ck})).build();
        assert_that(fixture->as_mutation_source()(table_schema, dht::partition_range::make_singular(decorated_pk), slice))
                .produces(expected)
                .produces_end_of_stream();
    });
}
|
||||
|
||||
@@ -32,10 +32,12 @@
|
||||
#include "mutation_reader.hh"
|
||||
#include "mutation_reader_assertions.hh"
|
||||
#include "mutation_source_test.hh"
|
||||
#include "partition_slice_builder.hh"
|
||||
#include "tmpdir.hh"
|
||||
#include "memtable-sstable.hh"
|
||||
#include "disk-error-handler.hh"
|
||||
#include "tests/sstable_assertions.hh"
|
||||
#include "tests/test_services.hh"
|
||||
|
||||
thread_local disk_error_signal_type commit_error;
|
||||
thread_local disk_error_signal_type general_disk_error;
|
||||
@@ -386,6 +388,7 @@ void test_mutation_source(sstable_writer_config cfg, sstables::sstable::version_
|
||||
|
||||
SEASTAR_TEST_CASE(test_sstable_conforms_to_mutation_source) {
|
||||
return seastar::async([] {
|
||||
storage_service_for_tests ssft;
|
||||
for (auto version : {sstables::sstable::version_types::ka, sstables::sstable::version_types::la}) {
|
||||
for (auto index_block_size : {1, 128, 64*1024}) {
|
||||
sstable_writer_config cfg;
|
||||
@@ -398,6 +401,7 @@ SEASTAR_TEST_CASE(test_sstable_conforms_to_mutation_source) {
|
||||
|
||||
SEASTAR_TEST_CASE(test_sstable_can_write_and_read_range_tombstone) {
|
||||
return seastar::async([] {
|
||||
storage_service_for_tests ssft;
|
||||
auto dir = make_lw_shared<tmpdir>();
|
||||
auto s = make_lw_shared(schema({}, "ks", "cf",
|
||||
{{"p1", utf8_type}}, {{"c1", int32_type}}, {{"r1", int32_type}}, {}, utf8_type));
|
||||
@@ -772,6 +776,7 @@ SEASTAR_TEST_CASE(tombstone_in_tombstone2) {
|
||||
|
||||
SEASTAR_TEST_CASE(test_non_compound_table_row_is_not_marked_as_static) {
|
||||
return seastar::async([] {
|
||||
storage_service_for_tests ssft;
|
||||
auto dir = make_lw_shared<tmpdir>();
|
||||
schema_builder builder("ks", "cf");
|
||||
builder.with_column("p", utf8_type, column_kind::partition_key);
|
||||
@@ -805,6 +810,7 @@ SEASTAR_TEST_CASE(test_non_compound_table_row_is_not_marked_as_static) {
|
||||
|
||||
SEASTAR_TEST_CASE(test_promoted_index_blocks_are_monotonic) {
|
||||
return seastar::async([] {
|
||||
storage_service_for_tests ssft;
|
||||
auto dir = make_lw_shared<tmpdir>();
|
||||
schema_builder builder("ks", "cf");
|
||||
builder.with_column("p", utf8_type, column_kind::partition_key);
|
||||
@@ -851,3 +857,283 @@ SEASTAR_TEST_CASE(test_promoted_index_blocks_are_monotonic) {
|
||||
assert_that(sst->get_index_reader(default_priority_class())).has_monotonic_positions(*s);
|
||||
});
|
||||
}
|
||||
|
||||
// Writes one partition of a compact-storage table with a two-column clustering
// key (four rows plus a range tombstone) to a "ka" sstable using
// promoted_index_block_size = 1, then checks that
//   * the index reader reports monotonic positions, and
//   * reading back from ck1 onwards yields the original mutation.
SEASTAR_TEST_CASE(test_promoted_index_blocks_are_monotonic_compound_dense) {
    return seastar::async([] {
        storage_service_for_tests ssft;
        auto dir = make_lw_shared<tmpdir>();
        schema_builder builder("ks", "cf");
        builder.with_column("p", utf8_type, column_kind::partition_key);
        builder.with_column("c1", int32_type, column_kind::clustering_key);
        builder.with_column("c2", int32_type, column_kind::clustering_key);
        builder.with_column("v", int32_type);
        auto s = builder.build(schema_builder::compact_storage::yes);

        auto dk = dht::global_partitioner().decorate_key(*s, partition_key::from_exploded(*s, {to_bytes("key1")}));
        auto cell = atomic_cell::make_live(1, int32_type->decompose(88), { });
        mutation m(dk, s);

        auto ck1 = clustering_key::from_exploded(*s, {int32_type->decompose(1), int32_type->decompose(2)});
        m.set_clustered_cell(ck1, *s->get_column_definition("v"), cell);

        auto ck2 = clustering_key::from_exploded(*s, {int32_type->decompose(1), int32_type->decompose(4)});
        m.set_clustered_cell(ck2, *s->get_column_definition("v"), cell);

        auto ck3 = clustering_key::from_exploded(*s, {int32_type->decompose(1), int32_type->decompose(6)});
        m.set_clustered_cell(ck3, *s->get_column_definition("v"), cell);

        auto ck4 = clustering_key::from_exploded(*s, {int32_type->decompose(3), int32_type->decompose(9)});
        m.set_clustered_cell(ck4, *s->get_column_definition("v"), cell);

        // Range tombstone over the c1 prefixes [1, 2], both bounds inclusive.
        m.partition().apply_row_tombstone(*s, range_tombstone(
                clustering_key_prefix::from_exploded(*s, {int32_type->decompose(1)}),
                bound_kind::incl_start,
                clustering_key_prefix::from_exploded(*s, {int32_type->decompose(2)}),
                bound_kind::incl_end,
                {1, gc_clock::now()}));

        auto mt = make_lw_shared<memtable>(s);
        // `m` is also the expected value passed to produces() below, so it is
        // handed over by reference (as the sibling tests do) instead of being
        // std::move'd and then read again.
        mt->apply(m);

        auto sst = sstables::make_sstable(s,
                                          dir->path,
                                          1 /* generation */,
                                          sstables::sstable::version_types::ka,
                                          sstables::sstable::format_types::big);
        sstable_writer_config cfg;
        // NOTE(review): a block size of 1 presumably makes the writer emit as
        // many promoted-index blocks as possible -- confirm against the writer.
        cfg.promoted_index_block_size = 1;
        sst->write_components(mt->make_reader(s), 1, s, cfg).get();
        sst->load().get();

        {
            assert_that(sst->get_index_reader(default_priority_class())).has_monotonic_positions(*s);
        }

        {
            auto slice = partition_slice_builder(*s).with_range(query::clustering_range::make_starting_with({ck1})).build();
            assert_that(sst->as_mutation_source()(s, dht::partition_range::make_singular(dk), slice))
                    .produces(m)
                    .produces_end_of_stream();
        }
    });
}
|
||||
|
||||
// Writes one partition of a compact-storage table with a single-column
// clustering key (three rows plus a range tombstone) to a "ka" sstable using
// promoted_index_block_size = 1, then checks that
//   * the index reader reports monotonic positions, and
//   * reading back from ck1 onwards yields the original mutation.
SEASTAR_TEST_CASE(test_promoted_index_blocks_are_monotonic_non_compound_dense) {
    return seastar::async([] {
        storage_service_for_tests ssft;
        auto dir = make_lw_shared<tmpdir>();
        schema_builder builder("ks", "cf");
        builder.with_column("p", utf8_type, column_kind::partition_key);
        builder.with_column("c1", int32_type, column_kind::clustering_key);
        builder.with_column("v", int32_type);
        auto s = builder.build(schema_builder::compact_storage::yes);

        auto dk = dht::global_partitioner().decorate_key(*s, partition_key::from_exploded(*s, {to_bytes("key1")}));
        auto cell = atomic_cell::make_live(1, int32_type->decompose(88), { });
        mutation m(dk, s);

        auto ck1 = clustering_key::from_exploded(*s, {int32_type->decompose(1)});
        m.set_clustered_cell(ck1, *s->get_column_definition("v"), cell);

        auto ck2 = clustering_key::from_exploded(*s, {int32_type->decompose(2)});
        m.set_clustered_cell(ck2, *s->get_column_definition("v"), cell);

        auto ck3 = clustering_key::from_exploded(*s, {int32_type->decompose(3)});
        m.set_clustered_cell(ck3, *s->get_column_definition("v"), cell);

        // Range tombstone over clustering keys [1, 2], both bounds inclusive.
        m.partition().apply_row_tombstone(*s, range_tombstone(
                clustering_key_prefix::from_exploded(*s, {int32_type->decompose(1)}),
                bound_kind::incl_start,
                clustering_key_prefix::from_exploded(*s, {int32_type->decompose(2)}),
                bound_kind::incl_end,
                {1, gc_clock::now()}));

        auto mt = make_lw_shared<memtable>(s);
        // `m` is also the expected value passed to produces() below, so it is
        // handed over by reference (as the sibling tests do) instead of being
        // std::move'd and then read again.
        mt->apply(m);

        auto sst = sstables::make_sstable(s,
                                          dir->path,
                                          1 /* generation */,
                                          sstables::sstable::version_types::ka,
                                          sstables::sstable::format_types::big);
        sstable_writer_config cfg;
        // NOTE(review): a block size of 1 presumably makes the writer emit as
        // many promoted-index blocks as possible -- confirm against the writer.
        cfg.promoted_index_block_size = 1;
        sst->write_components(mt->make_reader(s), 1, s, cfg).get();
        sst->load().get();

        {
            assert_that(sst->get_index_reader(default_priority_class())).has_monotonic_positions(*s);
        }

        {
            auto slice = partition_slice_builder(*s).with_range(query::clustering_range::make_starting_with({ck1})).build();
            assert_that(sst->as_mutation_source()(s, dht::partition_range::make_singular(dk), slice))
                    .produces(m)
                    .produces_end_of_stream();
        }
    });
}
|
||||
|
||||
// For both a regular and a compact-storage schema: writes a partition holding a
// range tombstone ["ck1", "ck5"] and one row at "ck3" to a "ka" sstable with
// promoted_index_block_size = 1, then reads starting at "ck3" and expects the
// full original mutation back.
SEASTAR_TEST_CASE(test_promoted_index_repeats_open_tombstones) {
    return seastar::async([] {
        storage_service_for_tests ssft;
        auto scratch = make_lw_shared<tmpdir>();
        int next_cf_id = 0;
        for (auto& storage_mode : { schema_builder::compact_storage::no, schema_builder::compact_storage::yes }) {
            // A distinct table name per iteration (cf0, cf1).
            schema_builder builder("ks", sprint("cf%d", next_cf_id++));
            builder.with_column("p", utf8_type, column_kind::partition_key);
            builder.with_column("c1", bytes_type, column_kind::clustering_key);
            builder.with_column("v", int32_type);
            auto table_schema = builder.build(storage_mode);

            auto raw_pk = partition_key::from_exploded(*table_schema, {to_bytes("key1")});
            auto decorated_pk = dht::global_partitioner().decorate_key(*table_schema, raw_pk);
            auto cell = atomic_cell::make_live(1, int32_type->decompose(88), { });
            mutation expected(decorated_pk, table_schema);

            expected.partition().apply_row_tombstone(*table_schema, range_tombstone(
                    clustering_key_prefix::from_exploded(*table_schema, {bytes_type->decompose(data_value(to_bytes("ck1")))}),
                    bound_kind::incl_start,
                    clustering_key_prefix::from_exploded(*table_schema, {bytes_type->decompose(data_value(to_bytes("ck5")))}),
                    bound_kind::incl_end,
                    {1, gc_clock::now()}));

            // A single row inside the tombstone's range.
            auto row_key = clustering_key::from_exploded(*table_schema, {bytes_type->decompose(data_value(to_bytes("ck3")))});
            const auto& v_def = *table_schema->get_column_definition("v");
            expected.set_clustered_cell(row_key, v_def, cell);

            auto mem = make_lw_shared<memtable>(table_schema);
            mem->apply(expected);

            auto written = sstables::make_sstable(table_schema,
                                                  scratch->path,
                                                  1 /* generation */,
                                                  sstables::sstable::version_types::ka,
                                                  sstables::sstable::format_types::big);
            sstable_writer_config writer_cfg;
            writer_cfg.promoted_index_block_size = 1;
            written->write_components(mem->make_reader(table_schema), 1, table_schema, writer_cfg).get();
            written->load().get();

            auto slice = partition_slice_builder(*table_schema).with_range(query::clustering_range::make_starting_with({row_key})).build();
            assert_that(written->as_mutation_source()(table_schema, dht::partition_range::make_singular(decorated_pk), slice))
                    .produces(expected)
                    .produces_end_of_stream();
        }
    });
}
|
||||
|
||||
// Round-trips a partition that contains only a range tombstone [1, 2] through a
// "ka" sstable for a compact-storage table with a single clustering column,
// using the default writer configuration, and expects the same mutation back.
SEASTAR_TEST_CASE(test_range_tombstones_are_correctly_seralized_for_non_compound_dense_schemas) {
    return seastar::async([] {
        storage_service_for_tests ssft;
        auto scratch = make_lw_shared<tmpdir>();
        schema_builder builder("ks", "cf");
        builder.with_column("p", utf8_type, column_kind::partition_key);
        builder.with_column("c", int32_type, column_kind::clustering_key);
        builder.with_column("v", int32_type);
        auto table_schema = builder.build(schema_builder::compact_storage::yes);

        auto raw_pk = partition_key::from_exploded(*table_schema, {to_bytes("key1")});
        auto decorated_pk = dht::global_partitioner().decorate_key(*table_schema, raw_pk);
        mutation expected(decorated_pk, table_schema);

        expected.partition().apply_row_tombstone(*table_schema, range_tombstone(
                clustering_key_prefix::from_exploded(*table_schema, {int32_type->decompose(1)}),
                bound_kind::incl_start,
                clustering_key_prefix::from_exploded(*table_schema, {int32_type->decompose(2)}),
                bound_kind::incl_end,
                {1, gc_clock::now()}));

        auto mem = make_lw_shared<memtable>(table_schema);
        mem->apply(expected);

        auto written = sstables::make_sstable(table_schema,
                                              scratch->path,
                                              1 /* generation */,
                                              sstables::sstable::version_types::ka,
                                              sstables::sstable::format_types::big);
        sstable_writer_config writer_cfg;
        written->write_components(mem->make_reader(table_schema), 1, table_schema, writer_cfg).get();
        written->load().get();

        // Unrestricted slice: read the whole partition back.
        auto slice = partition_slice_builder(*table_schema).build();
        assert_that(written->as_mutation_source()(table_schema, dht::partition_range::make_singular(decorated_pk), slice))
                .produces(expected)
                .produces_end_of_stream();
    });
}
|
||||
|
||||
// Writes a partition of a compact-storage table that has no clustering columns
// to a "ka" sstable with promoted_index_block_size = 1 and checks that the
// index reader is reported as empty.
SEASTAR_TEST_CASE(test_promoted_index_is_absent_for_schemas_without_clustering_key) {
    return seastar::async([] {
        storage_service_for_tests ssft;
        auto scratch = make_lw_shared<tmpdir>();
        schema_builder builder("ks", "cf");
        builder.with_column("p", utf8_type, column_kind::partition_key);
        builder.with_column("v", int32_type);
        auto table_schema = builder.build(schema_builder::compact_storage::yes);

        auto raw_pk = partition_key::from_exploded(*table_schema, {to_bytes("key1")});
        auto decorated_pk = dht::global_partitioner().decorate_key(*table_schema, raw_pk);
        mutation content(decorated_pk, table_schema);
        // The same cell (empty clustering key, column "v") is set four times,
        // always with timestamp 1 and with values 1..4.
        for (auto&& value : { 1, 2, 3, 4 }) {
            auto cell = atomic_cell::make_live(1, int32_type->decompose(value), { });
            content.set_clustered_cell(clustering_key_prefix::make_empty(), *table_schema->get_column_definition("v"), cell);
        }

        auto mem = make_lw_shared<memtable>(table_schema);
        mem->apply(content);

        auto written = sstables::make_sstable(table_schema,
                                              scratch->path,
                                              1 /* generation */,
                                              sstables::sstable::version_types::ka,
                                              sstables::sstable::format_types::big);
        sstable_writer_config writer_cfg;
        writer_cfg.promoted_index_block_size = 1;
        written->write_components(mem->make_reader(table_schema), 1, table_schema, writer_cfg).get();
        written->load().get();

        assert_that(written->get_index_reader(default_priority_class())).is_empty(*table_schema);
    });
}
|
||||
|
||||
// Round-trips a partition that contains only a range tombstone [1, 2] through a
// "ka" sstable for a compact-storage table with a single clustering column,
// with correctly_serialize_non_compound_range_tombstones switched off in the
// writer configuration, and expects the same mutation back.
SEASTAR_TEST_CASE(test_can_write_and_read_non_compound_range_tombstone_as_compound) {
    return seastar::async([] {
        storage_service_for_tests ssft;
        auto scratch = make_lw_shared<tmpdir>();
        schema_builder builder("ks", "cf");
        builder.with_column("p", utf8_type, column_kind::partition_key);
        builder.with_column("c", int32_type, column_kind::clustering_key);
        builder.with_column("v", int32_type);
        auto table_schema = builder.build(schema_builder::compact_storage::yes);

        auto raw_pk = partition_key::from_exploded(*table_schema, {to_bytes("key1")});
        auto decorated_pk = dht::global_partitioner().decorate_key(*table_schema, raw_pk);
        mutation expected(decorated_pk, table_schema);

        expected.partition().apply_row_tombstone(*table_schema, range_tombstone(
                clustering_key_prefix::from_exploded(*table_schema, {int32_type->decompose(1)}),
                bound_kind::incl_start,
                clustering_key_prefix::from_exploded(*table_schema, {int32_type->decompose(2)}),
                bound_kind::incl_end,
                {1, gc_clock::now()}));

        auto mem = make_lw_shared<memtable>(table_schema);
        mem->apply(expected);

        auto written = sstables::make_sstable(table_schema,
                                              scratch->path,
                                              1 /* generation */,
                                              sstables::sstable::version_types::ka,
                                              sstables::sstable::format_types::big);
        sstable_writer_config writer_cfg;
        // Exercise the non-default serialization path for the tombstone.
        writer_cfg.correctly_serialize_non_compound_range_tombstones = false;
        written->write_components(mem->make_reader(table_schema), 1, table_schema, writer_cfg).get();
        written->load().get();

        // Unrestricted slice: read the whole partition back.
        auto slice = partition_slice_builder(*table_schema).build();
        assert_that(written->as_mutation_source()(table_schema, dht::partition_range::make_singular(decorated_pk), slice))
                .produces(expected)
                .produces_end_of_stream();
    });
}
|
||||
@@ -29,6 +29,7 @@
|
||||
#include "schema.hh"
|
||||
#include "schema_builder.hh"
|
||||
#include "core/thread.hh"
|
||||
#include "tests/test_services.hh"
|
||||
|
||||
static auto la = sstables::sstable::version_types::la;
|
||||
static auto big = sstables::sstable::format_types::big;
|
||||
@@ -597,12 +598,12 @@ public:
|
||||
}
|
||||
|
||||
static future<> do_with_test_directory(std::function<future<> ()>&& fut, sstring p = path()) {
|
||||
return test_setup::create_empty_test_dir(p).then([fut = std::move(fut), p] () mutable {
|
||||
return fut();
|
||||
}).finally([p] {
|
||||
return test_setup::empty_test_dir(p).then([p] {
|
||||
return engine().remove_file(p);
|
||||
});
|
||||
return seastar::async([p, fut = std::move(fut)] {
|
||||
storage_service_for_tests ssft;
|
||||
test_setup::create_empty_test_dir(p).get();
|
||||
fut().get();
|
||||
test_setup::empty_test_dir(p).get();
|
||||
engine().remove_file(p).get();
|
||||
});
|
||||
}
|
||||
};
|
||||
|
||||
Binary file not shown.
Binary file not shown.
@@ -0,0 +1 @@
|
||||
2104758772
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,9 @@
|
||||
Scylla.db
|
||||
CompressionInfo.db
|
||||
Filter.db
|
||||
Statistics.db
|
||||
TOC.txt
|
||||
Digest.sha1
|
||||
Index.db
|
||||
Summary.db
|
||||
Data.db
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1002,7 +1002,7 @@ public:
|
||||
|
||||
void execute_prepared_cql3_query(tcxx::function<void(CqlResult const& _return)> cob, tcxx::function<void(::apache::thrift::TDelayedException* _throw)> exn_cob, const int32_t itemId, const std::vector<std::string> & values, const ConsistencyLevel::type consistency) {
|
||||
with_exn_cob(std::move(exn_cob), [&] {
|
||||
auto prepared = _query_processor.local().get_prepared_for_thrift(itemId);
|
||||
auto prepared = _query_processor.local().get_prepared(cql3::prepared_cache_key_type(itemId));
|
||||
if (!prepared) {
|
||||
throw make_exception<InvalidRequestException>("Prepared query with id %d not found", itemId);
|
||||
}
|
||||
|
||||
103
thrift/server.cc
103
thrift/server.cc
@@ -50,6 +50,8 @@ using namespace apache::thrift::protocol;
|
||||
using namespace apache::thrift::async;
|
||||
using namespace ::cassandra;
|
||||
|
||||
using namespace std::chrono_literals;
|
||||
|
||||
class thrift_stats {
|
||||
seastar::metrics::metric_groups _metrics;
|
||||
public:
|
||||
@@ -68,8 +70,10 @@ thrift_server::~thrift_server() {
|
||||
}
|
||||
|
||||
future<> thrift_server::stop() {
|
||||
auto f = _stop_gate.close();
|
||||
std::for_each(_listeners.begin(), _listeners.end(), std::mem_fn(&server_socket::abort_accept));
|
||||
std::for_each(_connections_list.begin(), _connections_list.end(), std::mem_fn(&connection::shutdown));
|
||||
return make_ready_future<>();
|
||||
return f;
|
||||
}
|
||||
|
||||
struct handler_deleter {
|
||||
@@ -101,8 +105,27 @@ thrift_server::connection::connection(thrift_server& server, connected_socket&&
|
||||
}
|
||||
|
||||
thrift_server::connection::~connection() {
|
||||
--_server._current_connections;
|
||||
_server._connections_list.erase(_server._connections_list.iterator_to(*this));
|
||||
if (is_linked()) {
|
||||
--_server._current_connections;
|
||||
_server._connections_list.erase(_server._connections_list.iterator_to(*this));
|
||||
}
|
||||
}
|
||||
|
||||
// Move constructor. Every member is taken over from `other` (the server
// reference is re-bound to the same server). If `other` is currently linked
// into an intrusive list, this object's list node is initialised and swapped
// with `other`'s node, so the new object takes `other`'s place in that list.
// NOTE(review): the list is presumably _server._connections_list -- confirm.
thrift_server::connection::connection(connection&& other)
    : _server(other._server)
    , _fd(std::move(other._fd))
    , _read_buf(std::move(other._read_buf))
    , _write_buf(std::move(other._write_buf))
    , _transport(std::move(other._transport))
    , _input(std::move(other._input))
    , _output(std::move(other._output))
    , _in_proto(std::move(other._in_proto))
    , _out_proto(std::move(other._out_proto))
    , _processor(std::move(other._processor)) {
    using node_algos = boost::intrusive::list<connection>::node_algorithms;
    if (!other.is_linked()) {
        // Nothing to take over: `other` is not a member of any list.
        return;
    }
    node_algos::init(this_ptr());
    node_algos::swap_nodes(other.this_ptr(), this_ptr());
}
|
||||
|
||||
future<>
|
||||
@@ -190,29 +213,65 @@ thrift_server::listen(ipv4_addr addr, bool keepalive) {
|
||||
|
||||
void
|
||||
thrift_server::do_accepts(int which, bool keepalive) {
|
||||
_listeners[which].accept().then([this, which, keepalive] (connected_socket fd, socket_address addr) mutable {
|
||||
fd.set_nodelay(true);
|
||||
fd.set_keepalive(keepalive);
|
||||
auto conn = new connection(*this, std::move(fd), addr);
|
||||
conn->process().then_wrapped([this, conn] (future<> f) {
|
||||
conn->shutdown();
|
||||
delete conn;
|
||||
try {
|
||||
f.get();
|
||||
} catch (std::exception& ex) {
|
||||
tlogger.debug("request error {}", ex.what());
|
||||
}
|
||||
if (_stop_gate.is_closed()) {
|
||||
return;
|
||||
}
|
||||
with_gate(_stop_gate, [&, this] {
|
||||
return _listeners[which].accept().then([this, which, keepalive] (connected_socket fd, socket_address addr) {
|
||||
fd.set_nodelay(true);
|
||||
fd.set_keepalive(keepalive);
|
||||
with_gate(_stop_gate, [&, this] {
|
||||
return do_with(connection(*this, std::move(fd), addr), [this] (auto& conn) {
|
||||
return conn.process().then_wrapped([this, &conn] (future<> f) {
|
||||
conn.shutdown();
|
||||
try {
|
||||
f.get();
|
||||
} catch (std::exception& ex) {
|
||||
tlogger.debug("request error {}", ex.what());
|
||||
}
|
||||
});
|
||||
});
|
||||
});
|
||||
do_accepts(which, keepalive);
|
||||
}).handle_exception([this, which, keepalive] (auto ex) {
|
||||
tlogger.debug("accept failed {}", ex);
|
||||
this->maybe_retry_accept(which, keepalive, std::move(ex));
|
||||
});
|
||||
do_accepts(which, keepalive);
|
||||
}).then_wrapped([] (future<> f) {
|
||||
try {
|
||||
f.get();
|
||||
} catch (std::exception& ex) {
|
||||
std::cout << "accept failed: " << ex.what() << "\n";
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
void thrift_server::maybe_retry_accept(int which, bool keepalive, std::exception_ptr ex) {
|
||||
auto retry = [this, which, keepalive] {
|
||||
tlogger.debug("retrying accept after failure");
|
||||
do_accepts(which, keepalive);
|
||||
};
|
||||
auto retry_with_backoff = [&] {
|
||||
// FIXME: Consider using exponential backoff
|
||||
sleep(1ms).then([retry = std::move(retry)] { retry(); });
|
||||
};
|
||||
try {
|
||||
std::rethrow_exception(std::move(ex));
|
||||
} catch (const std::system_error& e) {
|
||||
switch (e.code().value()) {
|
||||
// FIXME: Don't retry for other fatal errors
|
||||
case EBADF:
|
||||
break;
|
||||
case ENFILE:
|
||||
case EMFILE:
|
||||
case ENOMEM:
|
||||
retry_with_backoff();
|
||||
default:
|
||||
retry();
|
||||
}
|
||||
} catch (const std::bad_alloc&) {
|
||||
retry_with_backoff();
|
||||
} catch (const seastar::gate_closed_exception&) {
|
||||
return;
|
||||
} catch (...) {
|
||||
retry();
|
||||
}
|
||||
}
|
||||
|
||||
uint64_t
|
||||
thrift_server::total_connections() const {
|
||||
return _total_connections;
|
||||
|
||||
@@ -25,6 +25,7 @@
|
||||
#include "core/reactor.hh"
|
||||
#include "core/distributed.hh"
|
||||
#include "cql3/query_processor.hh"
|
||||
#include <seastar/core/gate.hh>
|
||||
#include <memory>
|
||||
#include <cstdint>
|
||||
#include <boost/intrusive/list.hpp>
|
||||
@@ -79,6 +80,7 @@ class thrift_server {
|
||||
public:
|
||||
connection(thrift_server& server, connected_socket&& fd, socket_address addr);
|
||||
~connection();
|
||||
connection(connection&&);
|
||||
future<> process();
|
||||
future<> read();
|
||||
future<> write();
|
||||
@@ -96,6 +98,7 @@ private:
|
||||
uint64_t _current_connections = 0;
|
||||
uint64_t _requests_served = 0;
|
||||
boost::intrusive::list<connection> _connections_list;
|
||||
seastar::gate _stop_gate;
|
||||
public:
|
||||
thrift_server(distributed<database>& db, distributed<cql3::query_processor>& qp);
|
||||
~thrift_server();
|
||||
@@ -105,6 +108,9 @@ public:
|
||||
uint64_t total_connections() const;
|
||||
uint64_t current_connections() const;
|
||||
uint64_t requests_served() const;
|
||||
|
||||
private:
|
||||
void maybe_retry_accept(int which, bool keepalive, std::exception_ptr ex);
|
||||
};
|
||||
|
||||
#endif /* APPS_SEASTAR_THRIFT_SERVER_HH_ */
|
||||
|
||||
@@ -66,12 +66,12 @@ void cql_server::event_notifier::on_create_keyspace(const sstring& ks_name)
|
||||
{
|
||||
for (auto&& conn : _schema_change_listeners) {
|
||||
using namespace cql_transport;
|
||||
with_gate(conn->_pending_requests_gate, [&] {
|
||||
return conn->write_response(conn->make_schema_change_event(event::schema_change{
|
||||
if (!conn->_pending_requests_gate.is_closed()) {
|
||||
conn->write_response(conn->make_schema_change_event(event::schema_change{
|
||||
event::schema_change::change_type::CREATED,
|
||||
ks_name
|
||||
}));
|
||||
});
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
@@ -79,14 +79,14 @@ void cql_server::event_notifier::on_create_column_family(const sstring& ks_name,
|
||||
{
|
||||
for (auto&& conn : _schema_change_listeners) {
|
||||
using namespace cql_transport;
|
||||
with_gate(conn->_pending_requests_gate, [&] {
|
||||
return conn->write_response(conn->make_schema_change_event(event::schema_change{
|
||||
if (!conn->_pending_requests_gate.is_closed()) {
|
||||
conn->write_response(conn->make_schema_change_event(event::schema_change{
|
||||
event::schema_change::change_type::CREATED,
|
||||
event::schema_change::target_type::TABLE,
|
||||
ks_name,
|
||||
cf_name
|
||||
}));
|
||||
});
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
@@ -94,14 +94,14 @@ void cql_server::event_notifier::on_create_user_type(const sstring& ks_name, con
|
||||
{
|
||||
for (auto&& conn : _schema_change_listeners) {
|
||||
using namespace cql_transport;
|
||||
with_gate(conn->_pending_requests_gate, [&] {
|
||||
return conn->write_response(conn->make_schema_change_event(event::schema_change{
|
||||
if (!conn->_pending_requests_gate.is_closed()) {
|
||||
conn->write_response(conn->make_schema_change_event(event::schema_change{
|
||||
event::schema_change::change_type::CREATED,
|
||||
event::schema_change::target_type::TYPE,
|
||||
ks_name,
|
||||
type_name
|
||||
}));
|
||||
});
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
@@ -124,12 +124,12 @@ void cql_server::event_notifier::on_update_keyspace(const sstring& ks_name)
|
||||
{
|
||||
for (auto&& conn : _schema_change_listeners) {
|
||||
using namespace cql_transport;
|
||||
with_gate(conn->_pending_requests_gate, [&] {
|
||||
return conn->write_response(conn->make_schema_change_event(event::schema_change{
|
||||
if (!conn->_pending_requests_gate.is_closed()) {
|
||||
conn->write_response(conn->make_schema_change_event(event::schema_change{
|
||||
event::schema_change::change_type::UPDATED,
|
||||
ks_name
|
||||
}));
|
||||
});
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
@@ -137,14 +137,14 @@ void cql_server::event_notifier::on_update_column_family(const sstring& ks_name,
|
||||
{
|
||||
for (auto&& conn : _schema_change_listeners) {
|
||||
using namespace cql_transport;
|
||||
with_gate(conn->_pending_requests_gate, [&] {
|
||||
return conn->write_response(conn->make_schema_change_event(event::schema_change{
|
||||
if (!conn->_pending_requests_gate.is_closed()) {
|
||||
conn->write_response(conn->make_schema_change_event(event::schema_change{
|
||||
event::schema_change::change_type::UPDATED,
|
||||
event::schema_change::target_type::TABLE,
|
||||
ks_name,
|
||||
cf_name
|
||||
}));
|
||||
});
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
@@ -152,14 +152,14 @@ void cql_server::event_notifier::on_update_user_type(const sstring& ks_name, con
|
||||
{
|
||||
for (auto&& conn : _schema_change_listeners) {
|
||||
using namespace cql_transport;
|
||||
with_gate(conn->_pending_requests_gate, [&] {
|
||||
return conn->write_response(conn->make_schema_change_event(event::schema_change{
|
||||
if (!conn->_pending_requests_gate.is_closed()) {
|
||||
conn->write_response(conn->make_schema_change_event(event::schema_change{
|
||||
event::schema_change::change_type::UPDATED,
|
||||
event::schema_change::target_type::TYPE,
|
||||
ks_name,
|
||||
type_name
|
||||
}));
|
||||
});
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
@@ -182,12 +182,12 @@ void cql_server::event_notifier::on_drop_keyspace(const sstring& ks_name)
|
||||
{
|
||||
for (auto&& conn : _schema_change_listeners) {
|
||||
using namespace cql_transport;
|
||||
with_gate(conn->_pending_requests_gate, [&] {
|
||||
return conn->write_response(conn->make_schema_change_event(event::schema_change{
|
||||
if (!conn->_pending_requests_gate.is_closed()) {
|
||||
conn->write_response(conn->make_schema_change_event(event::schema_change{
|
||||
event::schema_change::change_type::DROPPED,
|
||||
ks_name
|
||||
}));
|
||||
});
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
@@ -195,14 +195,14 @@ void cql_server::event_notifier::on_drop_column_family(const sstring& ks_name, c
|
||||
{
|
||||
for (auto&& conn : _schema_change_listeners) {
|
||||
using namespace cql_transport;
|
||||
with_gate(conn->_pending_requests_gate, [&] {
|
||||
return conn->write_response(conn->make_schema_change_event(event::schema_change{
|
||||
if (!conn->_pending_requests_gate.is_closed()) {
|
||||
conn->write_response(conn->make_schema_change_event(event::schema_change{
|
||||
event::schema_change::change_type::DROPPED,
|
||||
event::schema_change::target_type::TABLE,
|
||||
ks_name,
|
||||
cf_name
|
||||
}));
|
||||
});
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
@@ -210,14 +210,14 @@ void cql_server::event_notifier::on_drop_user_type(const sstring& ks_name, const
|
||||
{
|
||||
for (auto&& conn : _schema_change_listeners) {
|
||||
using namespace cql_transport;
|
||||
with_gate(conn->_pending_requests_gate, [&] {
|
||||
return conn->write_response(conn->make_schema_change_event(event::schema_change{
|
||||
if (!conn->_pending_requests_gate.is_closed()) {
|
||||
conn->write_response(conn->make_schema_change_event(event::schema_change{
|
||||
event::schema_change::change_type::DROPPED,
|
||||
event::schema_change::target_type::TYPE,
|
||||
ks_name,
|
||||
type_name
|
||||
}));
|
||||
});
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
@@ -240,9 +240,9 @@ void cql_server::event_notifier::on_join_cluster(const gms::inet_address& endpoi
|
||||
{
|
||||
for (auto&& conn : _topology_change_listeners) {
|
||||
using namespace cql_transport;
|
||||
with_gate(conn->_pending_requests_gate, [&] {
|
||||
return conn->write_response(conn->make_topology_change_event(event::topology_change::new_node(endpoint, conn->_server_addr.port)));
|
||||
});
|
||||
if (!conn->_pending_requests_gate.is_closed()) {
|
||||
conn->write_response(conn->make_topology_change_event(event::topology_change::new_node(endpoint, conn->_server_addr.port)));
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
@@ -250,9 +250,9 @@ void cql_server::event_notifier::on_leave_cluster(const gms::inet_address& endpo
|
||||
{
|
||||
for (auto&& conn : _topology_change_listeners) {
|
||||
using namespace cql_transport;
|
||||
with_gate(conn->_pending_requests_gate, [&] {
|
||||
return conn->write_response(conn->make_topology_change_event(event::topology_change::removed_node(endpoint, conn->_server_addr.port)));
|
||||
});
|
||||
if (!conn->_pending_requests_gate.is_closed()) {
|
||||
conn->write_response(conn->make_topology_change_event(event::topology_change::removed_node(endpoint, conn->_server_addr.port)));
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
@@ -260,9 +260,9 @@ void cql_server::event_notifier::on_move(const gms::inet_address& endpoint)
|
||||
{
|
||||
for (auto&& conn : _topology_change_listeners) {
|
||||
using namespace cql_transport;
|
||||
with_gate(conn->_pending_requests_gate, [&] {
|
||||
return conn->write_response(conn->make_topology_change_event(event::topology_change::moved_node(endpoint, conn->_server_addr.port)));
|
||||
});
|
||||
if (!conn->_pending_requests_gate.is_closed()) {
|
||||
conn->write_response(conn->make_topology_change_event(event::topology_change::moved_node(endpoint, conn->_server_addr.port)));
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
@@ -273,9 +273,9 @@ void cql_server::event_notifier::on_up(const gms::inet_address& endpoint)
|
||||
if (!was_up) {
|
||||
for (auto&& conn : _status_change_listeners) {
|
||||
using namespace cql_transport;
|
||||
with_gate(conn->_pending_requests_gate, [&] {
|
||||
return conn->write_response(conn->make_status_change_event(event::status_change::node_up(endpoint, conn->_server_addr.port)));
|
||||
});
|
||||
if (!conn->_pending_requests_gate.is_closed()) {
|
||||
conn->write_response(conn->make_status_change_event(event::status_change::node_up(endpoint, conn->_server_addr.port)));
|
||||
};
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -287,9 +287,9 @@ void cql_server::event_notifier::on_down(const gms::inet_address& endpoint)
|
||||
if (!was_down) {
|
||||
for (auto&& conn : _status_change_listeners) {
|
||||
using namespace cql_transport;
|
||||
with_gate(conn->_pending_requests_gate, [&] {
|
||||
return conn->write_response(conn->make_status_change_event(event::status_change::node_down(endpoint, conn->_server_addr.port)));
|
||||
});
|
||||
if (!conn->_pending_requests_gate.is_closed()) {
|
||||
conn->write_response(conn->make_status_change_event(event::status_change::node_down(endpoint, conn->_server_addr.port)));
|
||||
};
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -590,8 +590,8 @@ future<> cql_server::connection::process()
|
||||
return write_response(make_error(0, exceptions::exception_code::SERVER_ERROR, "unknown error", tracing::trace_state_ptr()));
|
||||
}
|
||||
}).finally([this] {
|
||||
_server._notifier->unregister_connection(this);
|
||||
return _pending_requests_gate.close().then([this] {
|
||||
_server._notifier->unregister_connection(this);
|
||||
return _ready_to_respond.finally([this] {
|
||||
return _write_buf.close();
|
||||
});
|
||||
@@ -826,15 +826,14 @@ future<response_type> cql_server::connection::process_prepare(uint16_t stream, b
|
||||
return parallel_for_each(cpus.begin(), cpus.end(), [this, query, cpu_id, &cs] (unsigned int c) mutable {
|
||||
if (c != cpu_id) {
|
||||
return smp::submit_to(c, [this, query, &cs] () mutable {
|
||||
_server._query_processor.local().prepare(query, cs, false);
|
||||
// FIXME: error handling
|
||||
return _server._query_processor.local().prepare(std::move(query), cs, false).discard_result();
|
||||
});
|
||||
} else {
|
||||
return make_ready_future<>();
|
||||
}
|
||||
}).then([this, query, stream, &cs] {
|
||||
}).then([this, query, stream, &cs] () mutable {
|
||||
tracing::trace(cs.get_trace_state(), "Done preparing on remote shards");
|
||||
return _server._query_processor.local().prepare(query, cs, false).then([this, stream, &cs] (auto msg) {
|
||||
return _server._query_processor.local().prepare(std::move(query), cs, false).then([this, stream, &cs] (auto msg) {
|
||||
tracing::trace(cs.get_trace_state(), "Done preparing on a local shard - preparing a result. ID is [{}]", seastar::value_of([&msg] {
|
||||
return messages::result_message::prepared::cql::get_id(msg);
|
||||
}));
|
||||
@@ -848,8 +847,9 @@ future<response_type> cql_server::connection::process_prepare(uint16_t stream, b
|
||||
|
||||
future<response_type> cql_server::connection::process_execute(uint16_t stream, bytes_view buf, service::client_state client_state)
|
||||
{
|
||||
auto id = read_short_bytes(buf);
|
||||
auto prepared = _server._query_processor.local().get_prepared(id);
|
||||
cql3::prepared_cache_key_type cache_key(read_short_bytes(buf));
|
||||
auto& id = cql3::prepared_cache_key_type::cql_id(cache_key);
|
||||
auto prepared = _server._query_processor.local().get_prepared(cache_key);
|
||||
if (!prepared) {
|
||||
throw exceptions::prepared_query_not_found_exception(id);
|
||||
}
|
||||
@@ -925,8 +925,9 @@ cql_server::connection::process_batch(uint16_t stream, bytes_view buf, service::
|
||||
break;
|
||||
}
|
||||
case 1: {
|
||||
auto id = read_short_bytes(buf);
|
||||
ps = _server._query_processor.local().get_prepared(id);
|
||||
cql3::prepared_cache_key_type cache_key(read_short_bytes(buf));
|
||||
auto& id = cql3::prepared_cache_key_type::cql_id(cache_key);
|
||||
ps = _server._query_processor.local().get_prepared(cache_key);
|
||||
if (!ps) {
|
||||
throw exceptions::prepared_query_not_found_exception(id);
|
||||
}
|
||||
|
||||
9
types.cc
9
types.cc
@@ -1963,8 +1963,7 @@ map_type_impl::to_string(const bytes& b) const {
|
||||
|
||||
size_t
|
||||
map_type_impl::hash(bytes_view v) const {
|
||||
// FIXME:
|
||||
abort();
|
||||
return std::hash<bytes_view>()(v);
|
||||
}
|
||||
|
||||
bytes
|
||||
@@ -2448,8 +2447,7 @@ set_type_impl::to_string(const bytes& b) const {
|
||||
|
||||
size_t
|
||||
set_type_impl::hash(bytes_view v) const {
|
||||
// FIXME:
|
||||
abort();
|
||||
return std::hash<bytes_view>()(v);
|
||||
}
|
||||
|
||||
bytes
|
||||
@@ -2637,8 +2635,7 @@ list_type_impl::to_string(const bytes& b) const {
|
||||
|
||||
size_t
|
||||
list_type_impl::hash(bytes_view v) const {
|
||||
// FIXME:
|
||||
abort();
|
||||
return std::hash<bytes_view>()(v);
|
||||
}
|
||||
|
||||
bytes
|
||||
|
||||
@@ -29,77 +29,54 @@
|
||||
#include <seastar/core/timer.hh>
|
||||
#include <seastar/core/gate.hh>
|
||||
|
||||
#include "utils/exceptions.hh"
|
||||
#include "exceptions/exceptions.hh"
|
||||
#include "utils/loading_shared_values.hh"
|
||||
#include "log.hh"
|
||||
|
||||
namespace bi = boost::intrusive;
|
||||
|
||||
namespace utils {
|
||||
// Simple variant of the "LoadingCache" used for permissions in origin.
|
||||
|
||||
typedef lowres_clock loading_cache_clock_type;
|
||||
typedef bi::list_base_hook<bi::link_mode<bi::auto_unlink>> auto_unlink_list_hook;
|
||||
using loading_cache_clock_type = seastar::lowres_clock;
|
||||
using auto_unlink_list_hook = bi::list_base_hook<bi::link_mode<bi::auto_unlink>>;
|
||||
|
||||
template<typename Tp, typename Key, typename Hash, typename EqualPred>
|
||||
class timestamped_val : public auto_unlink_list_hook, public bi::unordered_set_base_hook<bi::store_hash<true>> {
|
||||
template<typename Tp, typename Key, typename EntrySize , typename Hash, typename EqualPred, typename LoadingSharedValuesStats>
|
||||
class timestamped_val {
|
||||
public:
|
||||
typedef bi::list<timestamped_val, bi::constant_time_size<false>> lru_list_type;
|
||||
typedef Key key_type;
|
||||
typedef Tp value_type;
|
||||
using value_type = Tp;
|
||||
using loading_values_type = typename utils::loading_shared_values<Key, timestamped_val, Hash, EqualPred, LoadingSharedValuesStats, 256>;
|
||||
class lru_entry;
|
||||
class value_ptr;
|
||||
|
||||
private:
|
||||
std::experimental::optional<Tp> _opt_value;
|
||||
value_type _value;
|
||||
loading_cache_clock_type::time_point _loaded;
|
||||
loading_cache_clock_type::time_point _last_read;
|
||||
lru_list_type& _lru_list; /// MRU item is at the front, LRU - at the back
|
||||
Key _key;
|
||||
lru_entry* _lru_entry_ptr = nullptr; /// MRU item is at the front, LRU - at the back
|
||||
size_t _size = 0;
|
||||
|
||||
public:
|
||||
struct key_eq {
|
||||
bool operator()(const Key& k, const timestamped_val& c) const {
|
||||
return EqualPred()(k, c.key());
|
||||
}
|
||||
|
||||
bool operator()(const timestamped_val& c, const Key& k) const {
|
||||
return EqualPred()(c.key(), k);
|
||||
}
|
||||
};
|
||||
|
||||
timestamped_val(lru_list_type& lru_list, const Key& key)
|
||||
: _loaded(loading_cache_clock_type::now())
|
||||
timestamped_val(value_type val)
|
||||
: _value(std::move(val))
|
||||
, _loaded(loading_cache_clock_type::now())
|
||||
, _last_read(_loaded)
|
||||
, _lru_list(lru_list)
|
||||
, _key(key) {}
|
||||
|
||||
timestamped_val(lru_list_type& lru_list, Key&& key)
|
||||
: _loaded(loading_cache_clock_type::now())
|
||||
, _last_read(_loaded)
|
||||
, _lru_list(lru_list)
|
||||
, _key(std::move(key)) {}
|
||||
|
||||
timestamped_val(const timestamped_val&) = default;
|
||||
, _size(EntrySize()(_value))
|
||||
{}
|
||||
timestamped_val(timestamped_val&&) = default;
|
||||
|
||||
// Make sure copy/move-assignments don't go through the template below
|
||||
timestamped_val& operator=(const timestamped_val&) = default;
|
||||
timestamped_val& operator=(timestamped_val&) = default;
|
||||
timestamped_val& operator=(timestamped_val&&) = default;
|
||||
timestamped_val& operator=(value_type new_val) {
|
||||
assert(_lru_entry_ptr);
|
||||
|
||||
template <typename U>
|
||||
timestamped_val& operator=(U&& new_val) {
|
||||
_opt_value = std::forward<U>(new_val);
|
||||
_value = std::move(new_val);
|
||||
_loaded = loading_cache_clock_type::now();
|
||||
_lru_entry_ptr->cache_size() -= _size;
|
||||
_size = EntrySize()(_value);
|
||||
_lru_entry_ptr->cache_size() += _size;
|
||||
return *this;
|
||||
}
|
||||
|
||||
const Tp& value() {
|
||||
_last_read = loading_cache_clock_type::now();
|
||||
touch();
|
||||
return _opt_value.value();
|
||||
}
|
||||
|
||||
explicit operator bool() const noexcept {
|
||||
return bool(_opt_value);
|
||||
}
|
||||
value_type& value() noexcept { return _value; }
|
||||
const value_type& value() const noexcept { return _value; }
|
||||
|
||||
loading_cache_clock_type::time_point last_read() const noexcept {
|
||||
return _last_read;
|
||||
@@ -109,163 +86,353 @@ public:
|
||||
return _loaded;
|
||||
}
|
||||
|
||||
const Key& key() const {
|
||||
return _key;
|
||||
size_t size() const {
|
||||
return _size;
|
||||
}
|
||||
|
||||
friend bool operator==(const timestamped_val& a, const timestamped_val& b){
|
||||
return EqualPred()(a.key(), b.key());
|
||||
}
|
||||
|
||||
friend std::size_t hash_value(const timestamped_val& v) {
|
||||
return Hash()(v.key());
|
||||
bool ready() const noexcept {
|
||||
return _lru_entry_ptr;
|
||||
}
|
||||
|
||||
private:
|
||||
void touch() noexcept {
|
||||
assert(_lru_entry_ptr);
|
||||
_last_read = loading_cache_clock_type::now();
|
||||
_lru_entry_ptr->touch();
|
||||
}
|
||||
|
||||
void set_anchor_back_reference(lru_entry* lru_entry_ptr) noexcept {
|
||||
_lru_entry_ptr = lru_entry_ptr;
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Tp>
|
||||
struct simple_entry_size {
|
||||
size_t operator()(const Tp& val) {
|
||||
return 1;
|
||||
}
|
||||
};
|
||||
|
||||
template<typename Tp, typename Key, typename EntrySize , typename Hash, typename EqualPred, typename LoadingSharedValuesStats>
|
||||
class timestamped_val<Tp, Key, EntrySize, Hash, EqualPred, LoadingSharedValuesStats>::value_ptr {
|
||||
private:
|
||||
using ts_value_type = timestamped_val<Tp, Key, EntrySize, Hash, EqualPred, LoadingSharedValuesStats>;
|
||||
using loading_values_type = typename ts_value_type::loading_values_type;
|
||||
|
||||
public:
|
||||
using timestamped_val_ptr = typename loading_values_type::entry_ptr;
|
||||
using value_type = Tp;
|
||||
|
||||
private:
|
||||
timestamped_val_ptr _ts_val_ptr;
|
||||
|
||||
public:
|
||||
value_ptr(timestamped_val_ptr ts_val_ptr) : _ts_val_ptr(std::move(ts_val_ptr)) { _ts_val_ptr->touch(); }
|
||||
explicit operator bool() const noexcept { return bool(_ts_val_ptr); }
|
||||
value_type& operator*() const noexcept { return _ts_val_ptr->value(); }
|
||||
value_type* operator->() const noexcept { return &_ts_val_ptr->value(); }
|
||||
};
|
||||
|
||||
/// \brief This is an LRU list entry which is also an anchor for a loading_cache value.
|
||||
template<typename Tp, typename Key, typename EntrySize , typename Hash, typename EqualPred, typename LoadingSharedValuesStats>
|
||||
class timestamped_val<Tp, Key, EntrySize, Hash, EqualPred, LoadingSharedValuesStats>::lru_entry : public auto_unlink_list_hook {
|
||||
private:
|
||||
using ts_value_type = timestamped_val<Tp, Key, EntrySize, Hash, EqualPred, LoadingSharedValuesStats>;
|
||||
using loading_values_type = typename ts_value_type::loading_values_type;
|
||||
|
||||
public:
|
||||
using lru_list_type = bi::list<lru_entry, bi::constant_time_size<false>>;
|
||||
using timestamped_val_ptr = typename loading_values_type::entry_ptr;
|
||||
|
||||
private:
|
||||
timestamped_val_ptr _ts_val_ptr;
|
||||
lru_list_type& _lru_list;
|
||||
size_t& _cache_size;
|
||||
|
||||
public:
|
||||
lru_entry(timestamped_val_ptr ts_val, lru_list_type& lru_list, size_t& cache_size)
|
||||
: _ts_val_ptr(std::move(ts_val))
|
||||
, _lru_list(lru_list)
|
||||
, _cache_size(cache_size)
|
||||
{
|
||||
_ts_val_ptr->set_anchor_back_reference(this);
|
||||
_cache_size += _ts_val_ptr->size();
|
||||
}
|
||||
|
||||
~lru_entry() {
|
||||
_cache_size -= _ts_val_ptr->size();
|
||||
_ts_val_ptr->set_anchor_back_reference(nullptr);
|
||||
}
|
||||
|
||||
size_t& cache_size() noexcept {
|
||||
return _cache_size;
|
||||
}
|
||||
|
||||
/// Set this item as the most recently used item.
|
||||
/// The MRU item is going to be at the front of the _lru_list, the LRU item - at the back.
|
||||
void touch() noexcept {
|
||||
auto_unlink_list_hook::unlink();
|
||||
_lru_list.push_front(*this);
|
||||
}
|
||||
};
|
||||
|
||||
class shared_mutex {
|
||||
private:
|
||||
lw_shared_ptr<semaphore> _mutex_ptr;
|
||||
|
||||
public:
|
||||
shared_mutex() : _mutex_ptr(make_lw_shared<semaphore>(1)) {}
|
||||
semaphore& get() const noexcept {
|
||||
return *_mutex_ptr;
|
||||
const Key& key() const noexcept {
|
||||
return loading_values_type::to_key(_ts_val_ptr);
|
||||
}
|
||||
|
||||
timestamped_val& timestamped_value() noexcept { return *_ts_val_ptr; }
|
||||
const timestamped_val& timestamped_value() const noexcept { return *_ts_val_ptr; }
|
||||
timestamped_val_ptr timestamped_value_ptr() noexcept { return _ts_val_ptr; }
|
||||
};
|
||||
|
||||
enum class loading_cache_reload_enabled { no, yes };
|
||||
|
||||
/// \brief Loading cache is a cache that loads the value into the cache using the given asynchronous callback.
|
||||
///
|
||||
/// Each cached value if reloading is enabled (\tparam ReloadEnabled == loading_cache_reload_enabled::yes) is reloaded after
|
||||
/// the "refresh" time period since it was loaded for the last time.
|
||||
///
|
||||
/// The values are going to be evicted from the cache if they are not accessed during the "expiration" period or haven't
|
||||
/// been reloaded even once during the same period.
|
||||
///
|
||||
/// If "expiration" is set to zero - the caching is going to be disabled and get_XXX(...) is going to call the "loader" callback
|
||||
/// every time in order to get the requested value.
|
||||
///
|
||||
/// \note In order to avoid the eviction of cached entries due to "aging" of the contained value the user has to choose
|
||||
/// the "expiration" to be at least ("refresh" + "max load latency"). This way the value is going to stay in the cache and is going to be
|
||||
/// read in a non-blocking way as long as it's frequently accessed. Note however that since reloading is an asynchronous
|
||||
/// procedure it may get delayed by other running task. Therefore choosing the "expiration" too close to the ("refresh" + "max load latency")
|
||||
/// value one risks to have his/her cache values evicted when the system is heavily loaded.
|
||||
///
|
||||
/// The cache is also limited in size and if adding the next value is going
|
||||
/// to exceed the cache size limit the least recently used value(s) is(are) going to be evicted until the size of the cache
|
||||
/// becomes such that adding the new value is not going to break the size limit. If the new entry's size is greater than
|
||||
/// the cache size then the get_XXX(...) method is going to return a future with the loading_cache::entry_is_too_big exception.
|
||||
///
|
||||
/// The size of the cache is defined as a sum of sizes of all cached entries.
|
||||
/// The size of each entry is defined by the value returned by the \tparam EntrySize predicate applied on it.
|
||||
///
|
||||
/// The get(key) or get_ptr(key) methods ensure that the "loader" callback is called only once for each cached entry regardless of how many
|
||||
/// callers are calling for the get_XXX(key) for the same "key" at the same time. Only after the value is evicted from the cache
|
||||
/// it's going to be "loaded" in the context of get_XXX(key). As long as the value is cached get_XXX(key) is going to return the
|
||||
/// cached value immediately and reload it in the background every "refresh" time period as described above.
|
||||
///
|
||||
/// \tparam Key type of the cache key
|
||||
/// \tparam Tp type of the cached value
|
||||
/// \tparam ReloadEnabled if loading_cache_reload_enabled::yes allow reloading the values otherwise don't reload
|
||||
/// \tparam EntrySize predicate to calculate the entry size
|
||||
/// \tparam Hash hash function
|
||||
/// \tparam EqualPred equality predicate
|
||||
/// \tparam LoadingSharedValuesStats statistics incrementing class (see utils::loading_shared_values)
|
||||
/// \tparam Alloc elements allocator
|
||||
template<typename Key,
|
||||
typename Tp,
|
||||
loading_cache_reload_enabled ReloadEnabled = loading_cache_reload_enabled::no,
|
||||
typename EntrySize = simple_entry_size<Tp>,
|
||||
typename Hash = std::hash<Key>,
|
||||
typename EqualPred = std::equal_to<Key>,
|
||||
typename Alloc = std::allocator<timestamped_val<Tp, Key, Hash, EqualPred>>,
|
||||
typename SharedMutexMapAlloc = std::allocator<std::pair<const Key, shared_mutex>>>
|
||||
typename LoadingSharedValuesStats = utils::do_nothing_loading_shared_values_stats,
|
||||
typename Alloc = std::allocator<typename timestamped_val<Tp, Key, EntrySize, Hash, EqualPred, LoadingSharedValuesStats>::lru_entry>>
|
||||
class loading_cache {
|
||||
private:
|
||||
typedef timestamped_val<Tp, Key, Hash, EqualPred> ts_value_type;
|
||||
typedef bi::unordered_set<ts_value_type, bi::power_2_buckets<true>, bi::compare_hash<true>> set_type;
|
||||
typedef std::unordered_map<Key, shared_mutex, Hash, EqualPred, SharedMutexMapAlloc> write_mutex_map_type;
|
||||
typedef typename ts_value_type::lru_list_type lru_list_type;
|
||||
typedef typename set_type::bucket_traits bi_set_bucket_traits;
|
||||
|
||||
static constexpr int initial_num_buckets = 256;
|
||||
static constexpr int max_num_buckets = 1024 * 1024;
|
||||
using ts_value_type = timestamped_val<Tp, Key, EntrySize, Hash, EqualPred, LoadingSharedValuesStats>;
|
||||
using loading_values_type = typename ts_value_type::loading_values_type;
|
||||
using timestamped_val_ptr = typename loading_values_type::entry_ptr;
|
||||
using ts_value_lru_entry = typename ts_value_type::lru_entry;
|
||||
using set_iterator = typename loading_values_type::iterator;
|
||||
using lru_list_type = typename ts_value_lru_entry::lru_list_type;
|
||||
struct value_extractor_fn {
|
||||
Tp& operator()(ts_value_type& tv) const {
|
||||
return tv.value();
|
||||
}
|
||||
};
|
||||
|
||||
public:
|
||||
typedef Tp value_type;
|
||||
typedef Key key_type;
|
||||
typedef typename set_type::iterator iterator;
|
||||
using value_type = Tp;
|
||||
using key_type = Key;
|
||||
using value_ptr = typename ts_value_type::value_ptr;
|
||||
|
||||
class entry_is_too_big : public std::exception {};
|
||||
using iterator = boost::transform_iterator<value_extractor_fn, set_iterator>;
|
||||
|
||||
private:
|
||||
loading_cache(size_t max_size, std::chrono::milliseconds expiry, std::chrono::milliseconds refresh, logging::logger& logger)
|
||||
: _max_size(max_size)
|
||||
, _expiry(expiry)
|
||||
, _refresh(refresh)
|
||||
, _logger(logger)
|
||||
, _timer([this] { on_timer(); })
|
||||
{
|
||||
// Sanity check: if expiration period is given then non-zero refresh period and maximal size are required
|
||||
if (caching_enabled() && (_refresh == std::chrono::milliseconds(0) || _max_size == 0)) {
|
||||
throw exceptions::configuration_exception("loading_cache: caching is enabled but refresh period and/or max_size are zero");
|
||||
}
|
||||
}
|
||||
|
||||
public:
|
||||
template<typename Func>
|
||||
loading_cache(size_t max_size, std::chrono::milliseconds expiry, std::chrono::milliseconds refresh, logging::logger& logger, Func&& load)
|
||||
: _buckets(initial_num_buckets)
|
||||
, _set(bi_set_bucket_traits(_buckets.data(), _buckets.size()))
|
||||
, _max_size(max_size)
|
||||
, _expiry(expiry)
|
||||
, _refresh(refresh)
|
||||
, _logger(logger)
|
||||
, _load(std::forward<Func>(load)) {
|
||||
: loading_cache(max_size, expiry, refresh, logger)
|
||||
{
|
||||
static_assert(ReloadEnabled == loading_cache_reload_enabled::yes, "This constructor should only be invoked when ReloadEnabled == loading_cache_reload_enabled::yes");
|
||||
static_assert(std::is_same<future<value_type>, std::result_of_t<Func(const key_type&)>>::value, "Bad Func signature");
|
||||
|
||||
_load = std::forward<Func>(load);
|
||||
|
||||
// If expiration period is zero - caching is disabled
|
||||
if (!caching_enabled()) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Sanity check: if expiration period is given then non-zero refresh period and maximal size are required
|
||||
if (_refresh == std::chrono::milliseconds(0) || _max_size == 0) {
|
||||
throw exceptions::configuration_exception("loading_cache: caching is enabled but refresh period and/or max_size are zero");
|
||||
_timer_period = std::min(_expiry, _refresh);
|
||||
_timer.arm(_timer_period);
|
||||
}
|
||||
|
||||
loading_cache(size_t max_size, std::chrono::milliseconds expiry, logging::logger& logger)
|
||||
: loading_cache(max_size, expiry, loading_cache_clock_type::time_point::max().time_since_epoch(), logger)
|
||||
{
|
||||
static_assert(ReloadEnabled == loading_cache_reload_enabled::no, "This constructor should only be invoked when ReloadEnabled == loading_cache_reload_enabled::no");
|
||||
|
||||
// If expiration period is zero - caching is disabled
|
||||
if (!caching_enabled()) {
|
||||
return;
|
||||
}
|
||||
|
||||
_timer.set_callback([this] { on_timer(); });
|
||||
_timer.arm(_refresh);
|
||||
_timer_period = _expiry;
|
||||
_timer.arm(_timer_period);
|
||||
}
|
||||
|
||||
~loading_cache() {
|
||||
_set.clear_and_dispose([] (ts_value_type* ptr) { loading_cache::destroy_ts_value(ptr); });
|
||||
_lru_list.erase_and_dispose(_lru_list.begin(), _lru_list.end(), [] (ts_value_lru_entry* ptr) { loading_cache::destroy_ts_value(ptr); });
|
||||
}
|
||||
|
||||
template <typename LoadFunc>
|
||||
future<value_ptr> get_ptr(const Key& k, LoadFunc&& load) {
|
||||
static_assert(std::is_same<future<value_type>, std::result_of_t<LoadFunc(const key_type&)>>::value, "Bad LoadFunc signature");
|
||||
// We shouldn't be here if caching is disabled
|
||||
assert(caching_enabled());
|
||||
|
||||
return _loading_values.get_or_load(k, [this, load = std::forward<LoadFunc>(load)] (const Key& k) mutable {
|
||||
return load(k).then([this] (value_type val) {
|
||||
return ts_value_type(std::move(val));
|
||||
});
|
||||
}).then([this, k] (timestamped_val_ptr ts_val_ptr) {
|
||||
// check again since it could have already been inserted and initialized
|
||||
if (!ts_val_ptr->ready()) {
|
||||
_logger.trace("{}: storing the value for the first time", k);
|
||||
|
||||
if (ts_val_ptr->size() > _max_size) {
|
||||
return make_exception_future<value_ptr>(entry_is_too_big());
|
||||
}
|
||||
|
||||
ts_value_lru_entry* new_lru_entry = Alloc().allocate(1);
|
||||
new(new_lru_entry) ts_value_lru_entry(std::move(ts_val_ptr), _lru_list, _current_size);
|
||||
|
||||
// This will "touch" the entry and add it to the LRU list - we must do this before the shrink() call.
|
||||
value_ptr vp(new_lru_entry->timestamped_value_ptr());
|
||||
|
||||
// Remove the least recently used items if map is too big.
|
||||
shrink();
|
||||
|
||||
return make_ready_future<value_ptr>(std::move(vp));
|
||||
}
|
||||
|
||||
return make_ready_future<value_ptr>(std::move(ts_val_ptr));
|
||||
});
|
||||
}
|
||||
|
||||
future<value_ptr> get_ptr(const Key& k) {
|
||||
static_assert(ReloadEnabled == loading_cache_reload_enabled::yes, "reload must be enabled");
|
||||
return get_ptr(k, _load);
|
||||
}
|
||||
|
||||
future<Tp> get(const Key& k) {
|
||||
static_assert(ReloadEnabled == loading_cache_reload_enabled::yes, "reload must be enabled");
|
||||
|
||||
// If caching is disabled - always load in the foreground
|
||||
if (!caching_enabled()) {
|
||||
return _load(k);
|
||||
return _load(k).then([] (Tp val) {
|
||||
return make_ready_future<Tp>(std::move(val));
|
||||
});
|
||||
}
|
||||
|
||||
// If the key is not in the cache yet, then find_or_create() is going to
|
||||
// create a new uninitialized value in the map. If the value is already
|
||||
// in the cache (the fast path) simply return the value. Otherwise, take
|
||||
// the mutex and try to load the value (the slow path).
|
||||
iterator ts_value_it = find_or_create(k);
|
||||
if (*ts_value_it) {
|
||||
return make_ready_future<Tp>(ts_value_it->value());
|
||||
} else {
|
||||
return slow_load(k);
|
||||
}
|
||||
return get_ptr(k).then([] (value_ptr v_ptr) {
|
||||
return make_ready_future<Tp>(*v_ptr);
|
||||
});
|
||||
}
|
||||
|
||||
future<> stop() {
|
||||
return _timer_reads_gate.close().finally([this] { _timer.cancel(); });
|
||||
}
|
||||
|
||||
iterator find(const Key& k) noexcept {
|
||||
return boost::make_transform_iterator(set_find(k), _value_extractor_fn);
|
||||
}
|
||||
|
||||
iterator end() {
|
||||
return boost::make_transform_iterator(_loading_values.end(), _value_extractor_fn);
|
||||
}
|
||||
|
||||
iterator begin() {
|
||||
return boost::make_transform_iterator(_loading_values.begin(), _value_extractor_fn);
|
||||
}
|
||||
|
||||
template <typename Pred>
|
||||
void remove_if(Pred&& pred) {
|
||||
static_assert(std::is_same<bool, std::result_of_t<Pred(const value_type&)>>::value, "Bad Pred signature");
|
||||
|
||||
_lru_list.remove_and_dispose_if([this, &pred] (const ts_value_lru_entry& v) {
|
||||
return pred(v.timestamped_value().value());
|
||||
}, [this] (ts_value_lru_entry* p) {
|
||||
loading_cache::destroy_ts_value(p);
|
||||
});
|
||||
}
|
||||
|
||||
size_t size() const {
|
||||
return _loading_values.size();
|
||||
}
|
||||
|
||||
/// \brief returns the memory size the currently cached entries occupy according to the EntrySize predicate.
|
||||
size_t memory_footprint() const {
|
||||
return _current_size;
|
||||
}
|
||||
|
||||
private:
|
||||
set_iterator set_find(const Key& k) noexcept {
|
||||
set_iterator it = _loading_values.find(k);
|
||||
set_iterator end_it = set_end();
|
||||
|
||||
if (it == end_it || !it->ready()) {
|
||||
return end_it;
|
||||
}
|
||||
return it;
|
||||
}
|
||||
|
||||
set_iterator set_end() noexcept {
|
||||
return _loading_values.end();
|
||||
}
|
||||
|
||||
set_iterator set_begin() noexcept {
|
||||
return _loading_values.begin();
|
||||
}
|
||||
|
||||
bool caching_enabled() const {
|
||||
return _expiry != std::chrono::milliseconds(0);
|
||||
}
|
||||
|
||||
/// Look for the entry with the given key. If it doesn't exist - create a new one and add it to the _set.
|
||||
///
|
||||
/// \param k The key to look for
|
||||
///
|
||||
/// \return An iterator to the value with the given key (always different from _set.end())
|
||||
template <typename KeyType>
|
||||
iterator find_or_create(KeyType&& k) {
|
||||
iterator i = _set.find(k, Hash(), typename ts_value_type::key_eq());
|
||||
if (i == _set.end()) {
|
||||
ts_value_type* new_ts_val = Alloc().allocate(1);
|
||||
new(new_ts_val) ts_value_type(_lru_list, std::forward<KeyType>(k));
|
||||
auto p = _set.insert(*new_ts_val);
|
||||
i = p.first;
|
||||
}
|
||||
|
||||
return i;
|
||||
}
|
||||
|
||||
static void destroy_ts_value(ts_value_type* val) {
|
||||
val->~ts_value_type();
|
||||
static void destroy_ts_value(ts_value_lru_entry* val) {
|
||||
val->~ts_value_lru_entry();
|
||||
Alloc().deallocate(val, 1);
|
||||
}
|
||||
|
||||
future<Tp> slow_load(const Key& k) {
|
||||
// If the key is not in the cache yet, then _write_mutex_map[k] is going
|
||||
// to create a new value with the initialized mutex. The mutex is going
|
||||
// to serialize the producers and only the first one is going to
|
||||
// actually issue a load operation and initialize the value with the
|
||||
// received result. The rest are going to see (and read) the initialized
|
||||
// value when they enter the critical section.
|
||||
shared_mutex sm = _write_mutex_map[k];
|
||||
return with_semaphore(sm.get(), 1, [this, k] {
|
||||
iterator ts_value_it = find_or_create(k);
|
||||
if (*ts_value_it) {
|
||||
return make_ready_future<Tp>(ts_value_it->value());
|
||||
future<> reload(ts_value_lru_entry& lru_entry) {
|
||||
return _load(lru_entry.key()).then_wrapped([this, key = lru_entry.key()] (auto&& f) mutable {
|
||||
// if the entry has been evicted by now - simply end here
|
||||
set_iterator it = this->set_find(key);
|
||||
if (it == this->set_end()) {
|
||||
this->_logger.trace("{}: entry was dropped during the reload", key);
|
||||
return make_ready_future<>();
|
||||
}
|
||||
_logger.trace("{}: storing the value for the first time", k);
|
||||
return _load(k).then([this, k] (Tp t) {
|
||||
// we have to "re-read" the _set here because the value may have been evicted by now
|
||||
iterator ts_value_it = find_or_create(std::move(k));
|
||||
*ts_value_it = std::move(t);
|
||||
return make_ready_future<Tp>(ts_value_it->value());
|
||||
});
|
||||
}).finally([sm] {});
|
||||
}
|
||||
|
||||
future<> reload(ts_value_type& ts_val) {
|
||||
return _load(ts_val.key()).then_wrapped([this, &ts_val] (auto&& f) {
|
||||
// The exceptions are related to the load operation itself.
|
||||
// We should ignore them for the background reads - if
|
||||
// they persist the value will age and will be reloaded in
|
||||
@@ -273,120 +440,97 @@ private:
|
||||
// will be propagated up to the user and will fail the
|
||||
// corresponding query.
|
||||
try {
|
||||
ts_val = f.get0();
|
||||
*it = f.get0();
|
||||
} catch (std::exception& e) {
|
||||
_logger.debug("{}: reload failed: {}", ts_val.key(), e.what());
|
||||
this->_logger.debug("{}: reload failed: {}", key, e.what());
|
||||
} catch (...) {
|
||||
_logger.debug("{}: reload failed: unknown error", ts_val.key());
|
||||
this->_logger.debug("{}: reload failed: unknown error", key);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
void erase(iterator it) {
|
||||
_set.erase_and_dispose(it, [] (ts_value_type* ptr) { loading_cache::destroy_ts_value(ptr); });
|
||||
// no need to delete the item from _lru_list - it's auto-deleted
|
||||
return make_ready_future<>();
|
||||
});
|
||||
}
|
||||
|
||||
void drop_expired() {
|
||||
auto now = loading_cache_clock_type::now();
|
||||
_lru_list.remove_and_dispose_if([now, this] (const ts_value_type& v) {
|
||||
_lru_list.remove_and_dispose_if([now, this] (const ts_value_lru_entry& lru_entry) {
|
||||
using namespace std::chrono;
|
||||
// An entry should be discarded if it hasn't been reloaded for too long or nobody cares about it anymore
|
||||
const ts_value_type& v = lru_entry.timestamped_value();
|
||||
auto since_last_read = now - v.last_read();
|
||||
auto since_loaded = now - v.loaded();
|
||||
if (_expiry < since_last_read || _expiry < since_loaded) {
|
||||
_logger.trace("drop_expired(): {}: dropping the entry: _expiry {}, ms passed since: loaded {} last_read {}", v.key(), _expiry.count(), duration_cast<milliseconds>(since_loaded).count(), duration_cast<milliseconds>(since_last_read).count());
|
||||
if (_expiry < since_last_read || (ReloadEnabled == loading_cache_reload_enabled::yes && _expiry < since_loaded)) {
|
||||
_logger.trace("drop_expired(): {}: dropping the entry: _expiry {}, ms passed since: loaded {} last_read {}", lru_entry.key(), _expiry.count(), duration_cast<milliseconds>(since_loaded).count(), duration_cast<milliseconds>(since_last_read).count());
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}, [this] (ts_value_type* p) {
|
||||
erase(_set.iterator_to(*p));
|
||||
}, [this] (ts_value_lru_entry* p) {
|
||||
loading_cache::destroy_ts_value(p);
|
||||
});
|
||||
}
|
||||
|
||||
// Shrink the cache to the _max_size discarding the least recently used items
|
||||
void shrink() {
|
||||
if (_set.size() > _max_size) {
|
||||
auto num_items_to_erase = _set.size() - _max_size;
|
||||
for (size_t i = 0; i < num_items_to_erase; ++i) {
|
||||
using namespace std::chrono;
|
||||
ts_value_type& ts_val = *_lru_list.rbegin();
|
||||
_logger.trace("shrink(): {}: dropping the entry: ms since last_read {}", ts_val.key(), duration_cast<milliseconds>(loading_cache_clock_type::now() - ts_val.last_read()).count());
|
||||
erase(_set.iterator_to(ts_val));
|
||||
}
|
||||
while (_current_size > _max_size) {
|
||||
using namespace std::chrono;
|
||||
ts_value_lru_entry& lru_entry = *_lru_list.rbegin();
|
||||
_logger.trace("shrink(): {}: dropping the entry: ms since last_read {}", lru_entry.key(), duration_cast<milliseconds>(loading_cache_clock_type::now() - lru_entry.timestamped_value().last_read()).count());
|
||||
loading_cache::destroy_ts_value(&lru_entry);
|
||||
}
|
||||
}
|
||||
|
||||
void rehash() {
|
||||
size_t new_buckets_count = 0;
|
||||
|
||||
// Don't grow or shrink too fast even if there is a steep drop/growth in the number of elements in the set.
|
||||
// Exponential growth/backoff should be good enough.
|
||||
//
|
||||
// Try to keep the load factor between 0.25 and 1.0.
|
||||
if (_set.size() < _current_buckets_count / 4) {
|
||||
new_buckets_count = _current_buckets_count / 4;
|
||||
} else if (_set.size() > _current_buckets_count) {
|
||||
new_buckets_count = _current_buckets_count * 2;
|
||||
// Try to bring the load factors of the _loading_values into a known range.
|
||||
void periodic_rehash() noexcept {
|
||||
try {
|
||||
_loading_values.rehash();
|
||||
} catch (...) {
|
||||
// if rehashing fails - continue with the current buckets array
|
||||
}
|
||||
|
||||
if (new_buckets_count < initial_num_buckets || new_buckets_count > max_num_buckets) {
|
||||
return;
|
||||
}
|
||||
|
||||
std::vector<typename set_type::bucket_type> new_buckets(new_buckets_count);
|
||||
_set.rehash(bi_set_bucket_traits(new_buckets.data(), new_buckets.size()));
|
||||
_logger.trace("rehash(): buckets count changed: {} -> {}", _current_buckets_count, new_buckets_count);
|
||||
|
||||
_buckets.swap(new_buckets);
|
||||
_current_buckets_count = new_buckets_count;
|
||||
}
|
||||
|
||||
void on_timer() {
|
||||
_logger.trace("on_timer(): start");
|
||||
|
||||
auto timer_start_tp = loading_cache_clock_type::now();
|
||||
|
||||
// Clear all cached mutexes
|
||||
_write_mutex_map.clear();
|
||||
|
||||
// Clean up items that were not touched for the whole _expiry period.
|
||||
drop_expired();
|
||||
|
||||
// Remove the least recently used items if map is too big.
|
||||
shrink();
|
||||
|
||||
// check if rehashing is needed and do it if it is.
|
||||
rehash();
|
||||
periodic_rehash();
|
||||
|
||||
if (ReloadEnabled == loading_cache_reload_enabled::no) {
|
||||
_logger.trace("on_timer(): rearming");
|
||||
_timer.arm(loading_cache_clock_type::now() + _timer_period);
|
||||
return;
|
||||
}
|
||||
|
||||
// Reload all those whose value needs to be reloaded.
|
||||
with_gate(_timer_reads_gate, [this, timer_start_tp] {
|
||||
return parallel_for_each(_set.begin(), _set.end(), [this, curr_time = timer_start_tp] (auto& ts_val) {
|
||||
_logger.trace("on_timer(): {}: checking the value age", ts_val.key());
|
||||
if (ts_val && ts_val.loaded() + _refresh < curr_time) {
|
||||
_logger.trace("on_timer(): {}: reloading the value", ts_val.key());
|
||||
return this->reload(ts_val);
|
||||
with_gate(_timer_reads_gate, [this] {
|
||||
return parallel_for_each(_lru_list.begin(), _lru_list.end(), [this] (ts_value_lru_entry& lru_entry) {
|
||||
_logger.trace("on_timer(): {}: checking the value age", lru_entry.key());
|
||||
if (lru_entry.timestamped_value().loaded() + _refresh < loading_cache_clock_type::now()) {
|
||||
_logger.trace("on_timer(): {}: reloading the value", lru_entry.key());
|
||||
return this->reload(lru_entry);
|
||||
}
|
||||
return now();
|
||||
}).finally([this, timer_start_tp] {
|
||||
}).finally([this] {
|
||||
_logger.trace("on_timer(): rearming");
|
||||
_timer.arm(timer_start_tp + _refresh);
|
||||
_timer.arm(loading_cache_clock_type::now() + _timer_period);
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
std::vector<typename set_type::bucket_type> _buckets;
|
||||
size_t _current_buckets_count = initial_num_buckets;
|
||||
set_type _set;
|
||||
write_mutex_map_type _write_mutex_map;
|
||||
loading_values_type _loading_values;
|
||||
lru_list_type _lru_list;
|
||||
size_t _max_size;
|
||||
size_t _current_size = 0;
|
||||
size_t _max_size = 0;
|
||||
std::chrono::milliseconds _expiry;
|
||||
std::chrono::milliseconds _refresh;
|
||||
loading_cache_clock_type::duration _timer_period;
|
||||
logging::logger& _logger;
|
||||
std::function<future<Tp>(const Key&)> _load;
|
||||
timer<lowres_clock> _timer;
|
||||
seastar::gate _timer_reads_gate;
|
||||
value_extractor_fn _value_extractor_fn;
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
@@ -137,7 +137,11 @@ private:
|
||||
using set_type = bi::unordered_set<entry, bi::power_2_buckets<true>, bi::compare_hash<true>>;
|
||||
using bi_set_bucket_traits = typename set_type::bucket_traits;
|
||||
using set_iterator = typename set_type::iterator;
|
||||
using value_extractor_fn = std::function<value_type& (entry&)>;
|
||||
struct value_extractor_fn {
|
||||
value_type& operator()(entry& e) const {
|
||||
return e.value();
|
||||
}
|
||||
};
|
||||
enum class shrinking_is_allowed { no, yes };
|
||||
|
||||
public:
|
||||
@@ -186,7 +190,6 @@ public:
|
||||
loading_shared_values()
|
||||
: _buckets(InitialBucketsCount)
|
||||
, _set(bi_set_bucket_traits(_buckets.data(), _buckets.size()))
|
||||
, _value_extractor_fn([] (entry& e) -> value_type& { return e.value(); })
|
||||
{
|
||||
static_assert(noexcept(Stats::inc_evictions()), "Stats::inc_evictions must be non-throwing");
|
||||
static_assert(noexcept(Stats::inc_hits()), "Stats::inc_hits must be non-throwing");
|
||||
|
||||
@@ -117,6 +117,7 @@ public:
|
||||
void reclaim_all_free_segments();
|
||||
occupancy_stats region_occupancy();
|
||||
occupancy_stats occupancy();
|
||||
size_t non_lsa_used_space();
|
||||
void set_reclamation_step(size_t step_in_segments) { _reclamation_step = step_in_segments; }
|
||||
size_t reclamation_step() const { return _reclamation_step; }
|
||||
void enable_abort_on_bad_alloc() { _abort_on_bad_alloc = true; }
|
||||
@@ -153,6 +154,10 @@ occupancy_stats tracker::occupancy() {
|
||||
return _impl->occupancy();
|
||||
}
|
||||
|
||||
size_t tracker::non_lsa_used_space() const {
|
||||
return _impl->non_lsa_used_space();
|
||||
}
|
||||
|
||||
void tracker::full_compaction() {
|
||||
return _impl->full_compaction();
|
||||
}
|
||||
@@ -291,7 +296,7 @@ static inline bool can_allocate_more_memory(size_t size)
|
||||
class segment_zone : public bi::set_base_hook<>, public bi::slist_base_hook<> {
|
||||
struct free_segment : public bi::slist_base_hook<> { };
|
||||
|
||||
static constexpr size_t maximum_size = 256;
|
||||
static constexpr size_t maximum_size = max_zone_segments;
|
||||
static constexpr size_t minimum_size = 16;
|
||||
static thread_local size_t next_attempt_size;
|
||||
|
||||
@@ -574,10 +579,8 @@ size_t segment_pool::reclaim_segments(size_t target) {
|
||||
bi::slist<segment_zone> zones_to_remove;
|
||||
for (auto& zone : _all_zones | boost::adaptors::reversed) {
|
||||
if (zone.empty()) {
|
||||
if (reclaimed_segments < target || !zone.free_segment_count()) {
|
||||
reclaimed_segments += zone.free_segment_count();
|
||||
zones_to_remove.push_front(zone);
|
||||
}
|
||||
reclaimed_segments += zone.free_segment_count();
|
||||
zones_to_remove.push_front(zone);
|
||||
} else if (zone.free_segment_count()) {
|
||||
_free_segments_in_zones += zone.free_segment_count();
|
||||
zone.rebuild_free_segments_list();
|
||||
@@ -1681,6 +1684,11 @@ occupancy_stats tracker::impl::occupancy() {
|
||||
return occ;
|
||||
}
|
||||
|
||||
size_t tracker::impl::non_lsa_used_space() {
|
||||
auto free_space_in_zones = shard_segment_pool.free_segments_in_zones() * segment_size;
|
||||
return memory::stats().allocated_memory() - region_occupancy().total_space() - free_space_in_zones;
|
||||
}
|
||||
|
||||
void tracker::impl::reclaim_all_free_segments()
|
||||
{
|
||||
llogger.debug("Reclaiming all free segments");
|
||||
@@ -2013,11 +2021,8 @@ tracker::impl::impl() {
|
||||
sm::make_gauge("large_objects_total_space_bytes", [this] { return shard_segment_pool.non_lsa_memory_in_use(); },
|
||||
sm::description("Holds a current size of allocated non-LSA memory.")),
|
||||
|
||||
sm::make_gauge("non_lsa_used_space_bytes",
|
||||
[this] {
|
||||
auto free_space_in_zones = shard_segment_pool.free_segments_in_zones() * segment_size;
|
||||
return memory::stats().allocated_memory() - region_occupancy().total_space() - free_space_in_zones;
|
||||
}, sm::description("Holds a current amount of used non-LSA memory.")),
|
||||
sm::make_gauge("non_lsa_used_space_bytes", [this] { return non_lsa_used_space(); },
|
||||
sm::description("Holds a current amount of used non-LSA memory.")),
|
||||
|
||||
sm::make_gauge("free_space_in_zones", [this] { return shard_segment_pool.free_segments_in_zones() * segment_size; },
|
||||
sm::description("Holds a current amount of free memory in zones.")),
|
||||
|
||||
@@ -43,6 +43,7 @@ class allocating_section;
|
||||
|
||||
constexpr int segment_size_shift = 18; // 256K; see #151, #152
|
||||
constexpr size_t segment_size = 1 << segment_size_shift;
|
||||
constexpr size_t max_zone_segments = 256;
|
||||
|
||||
//
|
||||
// Frees some amount of objects from the region to which it's attached.
|
||||
@@ -455,6 +456,9 @@ public:
|
||||
// Returns statistics for all segments allocated by LSA on this shard.
|
||||
occupancy_stats occupancy();
|
||||
|
||||
// Returns amount of allocated memory not managed by LSA
|
||||
size_t non_lsa_used_space() const;
|
||||
|
||||
impl& get_impl() { return *_impl; }
|
||||
|
||||
// Set the minimum number of segments reclaimed during single reclamation cycle.
|
||||
|
||||
Reference in New Issue
Block a user