Compare commits
112 Commits
master
...
scylla-3.0
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
6b011fbe0a | ||
|
|
9dd4e1b01f | ||
|
|
e91c741ef5 | ||
|
|
b18e9e115d | ||
|
|
0b86ab0d2a | ||
|
|
97cd9108d6 | ||
|
|
f81fe96b0b | ||
|
|
91ce3a7957 | ||
|
|
af7e58f4c5 | ||
|
|
bd3373b511 | ||
|
|
4820130abe | ||
|
|
9b299241e5 | ||
|
|
745a98e151 | ||
|
|
b9c99af18b | ||
|
|
cded9c7ac7 | ||
|
|
4acfc5ed8f | ||
|
|
cb9199bc7f | ||
|
|
695ff5383f | ||
|
|
730e48bf60 | ||
|
|
af6d4f40e1 | ||
|
|
9d8507de09 | ||
|
|
07c980845d | ||
|
|
c52b8239d0 | ||
|
|
5a07a4fac8 | ||
|
|
b9c046b17b | ||
|
|
979cb636b8 | ||
|
|
59cf9d9070 | ||
|
|
c9ec9d4087 | ||
|
|
2e8fefbc5a | ||
|
|
6be0635029 | ||
|
|
04a544c0a2 | ||
|
|
028f9b95d1 | ||
|
|
54258ca8eb | ||
|
|
c9a030f1f0 | ||
|
|
1c7daef554 | ||
|
|
f8195a77b0 | ||
|
|
5b724c80ab | ||
|
|
4a7ae81b3f | ||
|
|
3cf26a60a2 | ||
|
|
2103d0d52b | ||
|
|
16ee3b3ebe | ||
|
|
b0a9c40ab1 | ||
|
|
53924e5c7f | ||
|
|
befe0012f5 | ||
|
|
1953c5fa61 | ||
|
|
b72a94b53e | ||
|
|
3f82b697f2 | ||
|
|
ee1ef853e5 | ||
|
|
6e7e7f3822 | ||
|
|
82a36edc9d | ||
|
|
d4efa3c9b2 | ||
|
|
324dae3e12 | ||
|
|
c0ffc9a2b7 | ||
|
|
f81fa5f75c | ||
|
|
6fd1cfcfce | ||
|
|
9d458ffea9 | ||
|
|
9776a048e7 | ||
|
|
10cf97375e | ||
|
|
e6355a9a01 | ||
|
|
e57907a1d5 | ||
|
|
f94b46e7e0 | ||
|
|
6847c12668 | ||
|
|
80b86def1f | ||
|
|
c6de9ea39b | ||
|
|
94bed81c1d | ||
|
|
0f3a21f0bb | ||
|
|
976db7e9e0 | ||
|
|
996b86b804 | ||
|
|
b7b217cc43 | ||
|
|
c274430933 | ||
|
|
893a18a7c4 | ||
|
|
39b39058fc | ||
|
|
6bf4a73d88 | ||
|
|
ca4846dd63 | ||
|
|
2663ff7bc1 | ||
|
|
043a575fcd | ||
|
|
00dc400993 | ||
|
|
522a48a244 | ||
|
|
5faa28ce45 | ||
|
|
52be02558e | ||
|
|
a7cbfbe63f | ||
|
|
28fd2044d2 | ||
|
|
76ff2e5c3d | ||
|
|
7b34d54a96 | ||
|
|
26c31f6798 | ||
|
|
28fa66591a | ||
|
|
0fee1d9e43 | ||
|
|
76e72e28f4 | ||
|
|
f969e80965 | ||
|
|
2029134063 | ||
|
|
f30fe7bd17 | ||
|
|
aeb418af9e | ||
|
|
714e6d741f | ||
|
|
95c5872450 | ||
|
|
87f8968553 | ||
|
|
2895428d44 | ||
|
|
e18f182cfc | ||
|
|
cf8cdbf87d | ||
|
|
eb2814067d | ||
|
|
0c722d4547 | ||
|
|
54cf463430 | ||
|
|
d2a0622edd | ||
|
|
60edaec757 | ||
|
|
5802532cb3 | ||
|
|
83ea91055e | ||
|
|
e7863d3d54 | ||
|
|
57f124b905 | ||
|
|
40d8de5784 | ||
|
|
1468ec62de | ||
|
|
c6ef56ae1e | ||
|
|
ad62313b86 | ||
|
|
de87f798e1 |
5
.gitmodules
vendored
5
.gitmodules
vendored
@@ -1,6 +1,6 @@
|
||||
[submodule "seastar"]
|
||||
path = seastar
|
||||
url = ../seastar
|
||||
url = ../scylla-seastar
|
||||
ignore = dirty
|
||||
[submodule "swagger-ui"]
|
||||
path = swagger-ui
|
||||
@@ -9,3 +9,6 @@
|
||||
[submodule "xxHash"]
|
||||
path = xxHash
|
||||
url = ../xxHash
|
||||
[submodule "libdeflate"]
|
||||
path = libdeflate
|
||||
url = ../libdeflate
|
||||
|
||||
@@ -138,4 +138,5 @@ target_include_directories(scylla PUBLIC
|
||||
${SEASTAR_INCLUDE_DIRS}
|
||||
${Boost_INCLUDE_DIRS}
|
||||
xxhash
|
||||
libdeflate
|
||||
build/release/gen)
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
#!/bin/sh
|
||||
|
||||
VERSION=666.development
|
||||
VERSION=3.0.rc2
|
||||
|
||||
if test -f version
|
||||
then
|
||||
|
||||
@@ -2228,11 +2228,11 @@
|
||||
"description":"The column family"
|
||||
},
|
||||
"total":{
|
||||
"type":"int",
|
||||
"type":"long",
|
||||
"description":"The total snapshot size"
|
||||
},
|
||||
"live":{
|
||||
"type":"int",
|
||||
"type":"long",
|
||||
"description":"The live snapshot size"
|
||||
}
|
||||
}
|
||||
|
||||
@@ -87,11 +87,17 @@ future<> create_metadata_table_if_missing(
|
||||
return mm.announce_new_column_family(b.build(), false);
|
||||
}
|
||||
|
||||
future<> wait_for_schema_agreement(::service::migration_manager& mm, const database& db) {
|
||||
future<> wait_for_schema_agreement(::service::migration_manager& mm, const database& db, seastar::abort_source& as) {
|
||||
static const auto pause = [] { return sleep(std::chrono::milliseconds(500)); };
|
||||
|
||||
return do_until([&db] { return db.get_version() != database::empty_version; }, pause).then([&mm] {
|
||||
return do_until([&mm] { return mm.have_schema_agreement(); }, pause);
|
||||
return do_until([&db, &as] {
|
||||
as.check();
|
||||
return db.get_version() != database::empty_version;
|
||||
}, pause).then([&mm, &as] {
|
||||
return do_until([&mm, &as] {
|
||||
as.check();
|
||||
return mm.have_schema_agreement();
|
||||
}, pause);
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
@@ -81,7 +81,7 @@ future<> create_metadata_table_if_missing(
|
||||
stdx::string_view cql,
|
||||
::service::migration_manager&);
|
||||
|
||||
future<> wait_for_schema_agreement(::service::migration_manager&, const database&);
|
||||
future<> wait_for_schema_agreement(::service::migration_manager&, const database&, seastar::abort_source&);
|
||||
|
||||
///
|
||||
/// Time-outs for internal, non-local CQL queries.
|
||||
|
||||
@@ -160,7 +160,7 @@ future<> default_authorizer::start() {
|
||||
_migration_manager).then([this] {
|
||||
_finished = do_after_system_ready(_as, [this] {
|
||||
return async([this] {
|
||||
wait_for_schema_agreement(_migration_manager, _qp.db().local()).get0();
|
||||
wait_for_schema_agreement(_migration_manager, _qp.db().local(), _as).get0();
|
||||
|
||||
if (legacy_metadata_exists()) {
|
||||
if (!any_granted().get0()) {
|
||||
@@ -178,7 +178,7 @@ future<> default_authorizer::start() {
|
||||
|
||||
future<> default_authorizer::stop() {
|
||||
_as.request_abort();
|
||||
return _finished.handle_exception_type([](const sleep_aborted&) {});
|
||||
return _finished.handle_exception_type([](const sleep_aborted&) {}).handle_exception_type([](const abort_requested_exception&) {});
|
||||
}
|
||||
|
||||
future<permission_set>
|
||||
|
||||
@@ -157,7 +157,7 @@ future<> password_authenticator::start() {
|
||||
|
||||
_stopped = do_after_system_ready(_as, [this] {
|
||||
return async([this] {
|
||||
wait_for_schema_agreement(_migration_manager, _qp.db().local()).get0();
|
||||
wait_for_schema_agreement(_migration_manager, _qp.db().local(), _as).get0();
|
||||
|
||||
if (any_nondefault_role_row_satisfies(_qp, &has_salted_hash).get0()) {
|
||||
if (legacy_metadata_exists()) {
|
||||
@@ -182,7 +182,7 @@ future<> password_authenticator::start() {
|
||||
|
||||
future<> password_authenticator::stop() {
|
||||
_as.request_abort();
|
||||
return _stopped.handle_exception_type([] (const sleep_aborted&) { });
|
||||
return _stopped.handle_exception_type([] (const sleep_aborted&) { }).handle_exception_type([](const abort_requested_exception&) {});
|
||||
}
|
||||
|
||||
db::consistency_level password_authenticator::consistency_for_user(stdx::string_view role_name) {
|
||||
|
||||
@@ -227,7 +227,7 @@ future<> standard_role_manager::start() {
|
||||
return this->create_metadata_tables_if_missing().then([this] {
|
||||
_stopped = auth::do_after_system_ready(_as, [this] {
|
||||
return seastar::async([this] {
|
||||
wait_for_schema_agreement(_migration_manager, _qp.db().local()).get0();
|
||||
wait_for_schema_agreement(_migration_manager, _qp.db().local(), _as).get0();
|
||||
|
||||
if (any_nondefault_role_row_satisfies(_qp, &has_can_login).get0()) {
|
||||
if (this->legacy_metadata_exists()) {
|
||||
@@ -251,7 +251,7 @@ future<> standard_role_manager::start() {
|
||||
|
||||
future<> standard_role_manager::stop() {
|
||||
_as.request_abort();
|
||||
return _stopped.handle_exception_type([] (const sleep_aborted&) { });
|
||||
return _stopped.handle_exception_type([] (const sleep_aborted&) { }).handle_exception_type([](const abort_requested_exception&) {});;
|
||||
}
|
||||
|
||||
future<> standard_role_manager::create_or_replace(stdx::string_view role_name, const role_config& c) const {
|
||||
|
||||
@@ -200,8 +200,9 @@ public:
|
||||
return _current_start;
|
||||
}
|
||||
|
||||
position_in_partition_view upper_bound() const {
|
||||
return _current_end;
|
||||
// Returns the upper bound of the last range in provided ranges set
|
||||
position_in_partition_view uppermost_bound() const {
|
||||
return position_in_partition_view::for_range_end(_ranges.back());
|
||||
}
|
||||
|
||||
// When lower_bound() changes, this also does
|
||||
|
||||
@@ -112,7 +112,7 @@ const sstring compression_parameters::CHUNK_LENGTH_KB = "chunk_length_kb";
|
||||
const sstring compression_parameters::CRC_CHECK_CHANCE = "crc_check_chance";
|
||||
|
||||
compression_parameters::compression_parameters()
|
||||
: compression_parameters(nullptr)
|
||||
: compression_parameters(compressor::lz4)
|
||||
{}
|
||||
|
||||
compression_parameters::~compression_parameters()
|
||||
|
||||
@@ -118,6 +118,10 @@ public:
|
||||
std::map<sstring, sstring> get_options() const;
|
||||
bool operator==(const compression_parameters& other) const;
|
||||
bool operator!=(const compression_parameters& other) const;
|
||||
|
||||
static compression_parameters no_compression() {
|
||||
return compression_parameters(nullptr);
|
||||
}
|
||||
private:
|
||||
void validate_options(const std::map<sstring, sstring>&);
|
||||
};
|
||||
|
||||
31
configure.py
31
configure.py
@@ -197,7 +197,9 @@ class Thrift(object):
|
||||
|
||||
def default_target_arch():
|
||||
if platform.machine() in ['i386', 'i686', 'x86_64']:
|
||||
return 'nehalem'
|
||||
return 'westmere' # support PCLMUL
|
||||
elif platform.machine() == 'aarch64':
|
||||
return 'armv8-a+crc+crypto'
|
||||
else:
|
||||
return ''
|
||||
|
||||
@@ -271,6 +273,7 @@ scylla_tests = [
|
||||
'tests/perf/perf_sstable',
|
||||
'tests/cql_query_test',
|
||||
'tests/secondary_index_test',
|
||||
'tests/filtering_test',
|
||||
'tests/storage_proxy_test',
|
||||
'tests/schema_change_test',
|
||||
'tests/mutation_reader_test',
|
||||
@@ -306,6 +309,7 @@ scylla_tests = [
|
||||
'tests/log_heap_test',
|
||||
'tests/managed_vector_test',
|
||||
'tests/crc_test',
|
||||
'tests/checksum_utils_test',
|
||||
'tests/flush_queue_test',
|
||||
'tests/dynamic_bitset_test',
|
||||
'tests/auth_test',
|
||||
@@ -356,6 +360,7 @@ scylla_tests = [
|
||||
|
||||
perf_tests = [
|
||||
'tests/perf/perf_mutation_readers',
|
||||
'tests/perf/perf_checksum',
|
||||
'tests/perf/perf_mutation_fragment',
|
||||
'tests/perf/perf_idl',
|
||||
]
|
||||
@@ -431,6 +436,7 @@ extra_cxxflags = {}
|
||||
cassandra_interface = Thrift(source='interface/cassandra.thrift', service='Cassandra')
|
||||
|
||||
scylla_core = (['database.cc',
|
||||
'table.cc',
|
||||
'atomic_cell.cc',
|
||||
'schema.cc',
|
||||
'frozen_schema.cc',
|
||||
@@ -579,6 +585,7 @@ scylla_core = (['database.cc',
|
||||
'db/marshal/type_parser.cc',
|
||||
'db/batchlog_manager.cc',
|
||||
'db/view/view.cc',
|
||||
'db/view/view_update_from_staging_generator.cc',
|
||||
'db/view/row_locking.cc',
|
||||
'index/secondary_index_manager.cc',
|
||||
'index/secondary_index.cc',
|
||||
@@ -592,6 +599,7 @@ scylla_core = (['database.cc',
|
||||
'utils/managed_bytes.cc',
|
||||
'utils/exceptions.cc',
|
||||
'utils/config_file.cc',
|
||||
'utils/gz/crc_combine.cc',
|
||||
'gms/version_generator.cc',
|
||||
'gms/versioned_value.cc',
|
||||
'gms/gossiper.cc',
|
||||
@@ -682,6 +690,7 @@ scylla_core = (['database.cc',
|
||||
'data/cell.cc',
|
||||
'multishard_writer.cc',
|
||||
'multishard_mutation_query.cc',
|
||||
'reader_concurrency_semaphore.cc',
|
||||
] + [Antlr3Grammar('cql3/Cql.g')] + [Thrift('interface/cassandra.thrift', 'Cassandra')]
|
||||
)
|
||||
|
||||
@@ -773,6 +782,7 @@ pure_boost_tests = set([
|
||||
'tests/test-serialization',
|
||||
'tests/range_test',
|
||||
'tests/crc_test',
|
||||
'tests/checksum_utils_test',
|
||||
'tests/managed_vector_test',
|
||||
'tests/dynamic_bitset_test',
|
||||
'tests/idl_test',
|
||||
@@ -1001,6 +1011,8 @@ seastar_ldflags = args.user_ldflags
|
||||
seastar_flags += ['--compiler', args.cxx, '--c-compiler', args.cc, '--cflags=%s' % (seastar_cflags), '--ldflags=%s' % (seastar_ldflags),
|
||||
'--c++-dialect=gnu++1z', '--optflags=%s' % (modes['release']['opt']), ]
|
||||
|
||||
libdeflate_cflags = seastar_cflags
|
||||
|
||||
status = subprocess.call([args.python, './configure.py'] + seastar_flags, cwd='seastar')
|
||||
|
||||
if status != 0:
|
||||
@@ -1100,6 +1112,9 @@ with open(buildfile, 'w') as f:
|
||||
command = {ninja} -C $subdir $target
|
||||
restat = 1
|
||||
description = NINJA $out
|
||||
rule run
|
||||
command = $in > $out
|
||||
description = GEN $out
|
||||
rule copy
|
||||
command = cp $in $out
|
||||
description = COPY $out
|
||||
@@ -1172,6 +1187,10 @@ with open(buildfile, 'w') as f:
|
||||
if binary.endswith('.a'):
|
||||
f.write('build $builddir/{}/{}: ar.{} {}\n'.format(mode, binary, mode, str.join(' ', objs)))
|
||||
else:
|
||||
objs.extend(['$builddir/' + mode + '/' + artifact for artifact in [
|
||||
'libdeflate/libdeflate.a'
|
||||
]])
|
||||
objs.append('$builddir/' + mode + '/gen/utils/gz/crc_combine_table.o')
|
||||
if binary.startswith('tests/'):
|
||||
local_libs = '$libs'
|
||||
if binary not in tests_not_using_seastar_test_framework or binary in pure_boost_tests:
|
||||
@@ -1213,6 +1232,12 @@ with open(buildfile, 'w') as f:
|
||||
antlr3_grammars.add(src)
|
||||
else:
|
||||
raise Exception('No rule for ' + src)
|
||||
compiles['$builddir/' + mode + '/gen/utils/gz/crc_combine_table.o'] = '$builddir/' + mode + '/gen/utils/gz/crc_combine_table.cc'
|
||||
compiles['$builddir/' + mode + '/utils/gz/gen_crc_combine_table.o'] = 'utils/gz/gen_crc_combine_table.cc'
|
||||
f.write('build {}: run {}\n'.format('$builddir/' + mode + '/gen/utils/gz/crc_combine_table.cc',
|
||||
'$builddir/' + mode + '/utils/gz/gen_crc_combine_table'))
|
||||
f.write('build {}: link.{} {}\n'.format('$builddir/' + mode + '/utils/gz/gen_crc_combine_table', mode,
|
||||
'$builddir/' + mode + '/utils/gz/gen_crc_combine_table.o'))
|
||||
for obj in compiles:
|
||||
src = compiles[obj]
|
||||
gen_headers = list(ragels.keys())
|
||||
@@ -1262,6 +1287,10 @@ with open(buildfile, 'w') as f:
|
||||
''').format(**locals()))
|
||||
f.write('build build/$mode/scylla-package.tar: package build/{mode}/scylla build/{mode}/iotune\n'.format(**locals()))
|
||||
f.write(' mode = {mode}\n'.format(**locals()))
|
||||
f.write('rule libdeflate.{mode}\n'.format(**locals()))
|
||||
f.write(' command = make -C libdeflate BUILD_DIR=../build/{mode}/libdeflate/ CFLAGS="{libdeflate_cflags}" CC={args.cc}\n'.format(**locals()))
|
||||
f.write('build build/{mode}/libdeflate/libdeflate.a: libdeflate.{mode}\n'.format(**locals()))
|
||||
|
||||
f.write('build {}: phony\n'.format(seastar_deps))
|
||||
f.write(textwrap.dedent('''\
|
||||
rule configure
|
||||
|
||||
@@ -67,6 +67,12 @@ class error_collector : public error_listener<RecognizerType, ExceptionBaseType>
|
||||
*/
|
||||
const sstring_view _query;
|
||||
|
||||
/**
|
||||
* An empty bitset to be used as a workaround for AntLR null dereference
|
||||
* bug.
|
||||
*/
|
||||
static typename ExceptionBaseType::BitsetListType _empty_bit_list;
|
||||
|
||||
public:
|
||||
|
||||
/**
|
||||
@@ -144,6 +150,14 @@ private:
|
||||
break;
|
||||
}
|
||||
default:
|
||||
// AntLR Exception class has a bug of dereferencing a null
|
||||
// pointer in the displayRecognitionError. The following
|
||||
// if statement makes sure it will not be null before the
|
||||
// call to that function (displayRecognitionError).
|
||||
// bug reference: https://github.com/antlr/antlr3/issues/191
|
||||
if (!ex->get_expectingSet()) {
|
||||
ex->set_expectingSet(&_empty_bit_list);
|
||||
}
|
||||
ex->displayRecognitionError(token_names, msg);
|
||||
}
|
||||
return msg.str();
|
||||
@@ -345,4 +359,8 @@ private:
|
||||
#endif
|
||||
};
|
||||
|
||||
template<typename RecognizerType, typename TokenType, typename ExceptionBaseType>
|
||||
typename ExceptionBaseType::BitsetListType
|
||||
error_collector<RecognizerType,TokenType,ExceptionBaseType>::_empty_bit_list = typename ExceptionBaseType::BitsetListType();
|
||||
|
||||
}
|
||||
|
||||
@@ -106,6 +106,11 @@ public:
|
||||
virtual size_t prefix_size() const {
|
||||
return 0;
|
||||
}
|
||||
|
||||
size_t prefix_size(const schema_ptr schema) const {
|
||||
return 0;
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
template<>
|
||||
@@ -129,5 +134,23 @@ inline bool primary_key_restrictions<clustering_key>::needs_filtering(const sche
|
||||
return false;
|
||||
}
|
||||
|
||||
template<>
|
||||
inline size_t primary_key_restrictions<clustering_key>::prefix_size(const schema_ptr schema) const {
|
||||
size_t count = 0;
|
||||
if (schema->clustering_key_columns().empty()) {
|
||||
return count;
|
||||
}
|
||||
auto column_defs = get_column_defs();
|
||||
column_id expected_column_id = schema->clustering_key_columns().begin()->id;
|
||||
for (auto&& cdef : column_defs) {
|
||||
if (schema->position(*cdef) != expected_column_id) {
|
||||
return count;
|
||||
}
|
||||
expected_column_id++;
|
||||
count++;
|
||||
}
|
||||
return count;
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@@ -166,19 +166,7 @@ public:
|
||||
}
|
||||
|
||||
virtual size_t prefix_size() const override {
|
||||
size_t count = 0;
|
||||
if (_schema->clustering_key_columns().empty()) {
|
||||
return count;
|
||||
}
|
||||
column_id expected_column_id = _schema->clustering_key_columns().begin()->id;
|
||||
for (const auto& restriction_entry : _restrictions->restrictions()) {
|
||||
if (_schema->position(*restriction_entry.first) != expected_column_id) {
|
||||
return count;
|
||||
}
|
||||
expected_column_id++;
|
||||
count++;
|
||||
}
|
||||
return count;
|
||||
return primary_key_restrictions<ValueType>::prefix_size(_schema);
|
||||
}
|
||||
|
||||
::shared_ptr<single_column_primary_key_restrictions<clustering_key>> get_longest_prefix_restrictions() {
|
||||
|
||||
@@ -337,6 +337,52 @@ const std::vector<::shared_ptr<restrictions>>& statement_restrictions::index_res
|
||||
return _index_restrictions;
|
||||
}
|
||||
|
||||
std::optional<secondary_index::index> statement_restrictions::find_idx(secondary_index::secondary_index_manager& sim) const {
|
||||
for (::shared_ptr<cql3::restrictions::restrictions> restriction : index_restrictions()) {
|
||||
for (const auto& cdef : restriction->get_column_defs()) {
|
||||
for (auto index : sim.list_indexes()) {
|
||||
if (index.depends_on(*cdef)) {
|
||||
return std::make_optional<secondary_index::index>(std::move(index));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
std::vector<const column_definition*> statement_restrictions::get_column_defs_for_filtering(database& db) const {
|
||||
std::vector<const column_definition*> column_defs_for_filtering;
|
||||
if (need_filtering()) {
|
||||
auto& sim = db.find_column_family(_schema).get_index_manager();
|
||||
std::optional<secondary_index::index> opt_idx = find_idx(sim);
|
||||
auto column_uses_indexing = [&opt_idx] (const column_definition* cdef) {
|
||||
return opt_idx && opt_idx->depends_on(*cdef);
|
||||
};
|
||||
if (_partition_key_restrictions->needs_filtering(*_schema)) {
|
||||
for (auto&& cdef : _partition_key_restrictions->get_column_defs()) {
|
||||
if (!column_uses_indexing(cdef)) {
|
||||
column_defs_for_filtering.emplace_back(cdef);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (_clustering_columns_restrictions->needs_filtering(*_schema)) {
|
||||
column_id first_non_prefix_id = _schema->clustering_key_columns().begin()->id +
|
||||
_clustering_columns_restrictions->prefix_size(_schema);
|
||||
for (auto&& cdef : _clustering_columns_restrictions->get_column_defs()) {
|
||||
if ((cdef->id >= first_non_prefix_id) && (!column_uses_indexing(cdef))) {
|
||||
column_defs_for_filtering.emplace_back(cdef);
|
||||
}
|
||||
}
|
||||
}
|
||||
for (auto&& cdef : _nonprimary_key_restrictions->get_column_defs()) {
|
||||
if (!column_uses_indexing(cdef)) {
|
||||
column_defs_for_filtering.emplace_back(cdef);
|
||||
}
|
||||
}
|
||||
}
|
||||
return column_defs_for_filtering;
|
||||
}
|
||||
|
||||
void statement_restrictions::process_partition_key_restrictions(bool has_queriable_index, bool for_view, bool allow_filtering) {
|
||||
// If there is a queriable index, no special conditions are required on the other restrictions.
|
||||
// But we still need to know 2 things:
|
||||
|
||||
@@ -163,6 +163,20 @@ public:
|
||||
return _clustering_columns_restrictions;
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds a possibly empty collection of column definitions that will be used for filtering
|
||||
* @param db - the database context
|
||||
* @return A list with the column definitions needed for filtering.
|
||||
*/
|
||||
std::vector<const column_definition*> get_column_defs_for_filtering(database& db) const;
|
||||
|
||||
/**
|
||||
* Determines the index to be used with the restriction.
|
||||
* @param sim - the secondary index manager whose indexes are searched
|
||||
* @return If an index can be used, an optional containing this index, otherwise an empty optional.
|
||||
*/
|
||||
std::optional<secondary_index::index> find_idx(secondary_index::secondary_index_manager& sim) const;
|
||||
|
||||
/**
|
||||
* Checks if the partition key has some unrestricted components.
|
||||
* @return <code>true</code> if the partition key has some unrestricted components, <code>false</code> otherwise.
|
||||
|
||||
@@ -156,9 +156,9 @@ public:
|
||||
return _factories->uses_function(ks_name, function_name);
|
||||
}
|
||||
|
||||
virtual uint32_t add_column_for_ordering(const column_definition& c) override {
|
||||
uint32_t index = selection::add_column_for_ordering(c);
|
||||
_factories->add_selector_for_ordering(c, index);
|
||||
virtual uint32_t add_column_for_post_processing(const column_definition& c) override {
|
||||
uint32_t index = selection::add_column_for_post_processing(c);
|
||||
_factories->add_selector_for_post_processing(c, index);
|
||||
return index;
|
||||
}
|
||||
|
||||
@@ -227,7 +227,7 @@ protected:
|
||||
return simple_selection::make(schema, std::move(columns), false);
|
||||
}
|
||||
|
||||
uint32_t selection::add_column_for_ordering(const column_definition& c) {
|
||||
uint32_t selection::add_column_for_post_processing(const column_definition& c) {
|
||||
_columns.push_back(&c);
|
||||
_metadata->add_non_serialized_column(c.column_specification);
|
||||
return _columns.size() - 1;
|
||||
@@ -339,7 +339,7 @@ std::unique_ptr<result_set> result_set_builder::build() {
|
||||
return std::move(_result_set);
|
||||
}
|
||||
|
||||
bool result_set_builder::restrictions_filter::operator()(const selection& selection,
|
||||
bool result_set_builder::restrictions_filter::do_filter(const selection& selection,
|
||||
const std::vector<bytes>& partition_key,
|
||||
const std::vector<bytes>& clustering_key,
|
||||
const query::result_row_view& static_row,
|
||||
@@ -427,6 +427,18 @@ bool result_set_builder::restrictions_filter::operator()(const selection& select
|
||||
return true;
|
||||
}
|
||||
|
||||
bool result_set_builder::restrictions_filter::operator()(const selection& selection,
|
||||
const std::vector<bytes>& partition_key,
|
||||
const std::vector<bytes>& clustering_key,
|
||||
const query::result_row_view& static_row,
|
||||
const query::result_row_view& row) const {
|
||||
bool accepted = do_filter(selection, partition_key, clustering_key, static_row, row);
|
||||
if (!accepted) {
|
||||
++_rows_dropped;
|
||||
}
|
||||
return accepted;
|
||||
}
|
||||
|
||||
api::timestamp_type result_set_builder::timestamp_of(size_t idx) {
|
||||
return _timestamps[idx];
|
||||
}
|
||||
|
||||
@@ -176,7 +176,7 @@ public:
|
||||
static ::shared_ptr<selection> wildcard(schema_ptr schema);
|
||||
static ::shared_ptr<selection> for_columns(schema_ptr schema, std::vector<const column_definition*> columns);
|
||||
|
||||
virtual uint32_t add_column_for_ordering(const column_definition& c);
|
||||
virtual uint32_t add_column_for_post_processing(const column_definition& c);
|
||||
|
||||
virtual bool uses_function(const sstring &ks_name, const sstring& function_name) const {
|
||||
return false;
|
||||
@@ -259,12 +259,16 @@ public:
|
||||
}
|
||||
void reset() {
|
||||
}
|
||||
uint32_t get_rows_dropped() const {
|
||||
return 0;
|
||||
}
|
||||
};
|
||||
class restrictions_filter {
|
||||
::shared_ptr<restrictions::statement_restrictions> _restrictions;
|
||||
const query_options& _options;
|
||||
mutable bool _current_partition_key_does_not_match = false;
|
||||
mutable bool _current_static_row_does_not_match = false;
|
||||
mutable uint32_t _rows_dropped = 0;
|
||||
public:
|
||||
restrictions_filter() = default;
|
||||
explicit restrictions_filter(::shared_ptr<restrictions::statement_restrictions> restrictions, const query_options& options) : _restrictions(restrictions), _options(options) {}
|
||||
@@ -272,7 +276,13 @@ public:
|
||||
void reset() {
|
||||
_current_partition_key_does_not_match = false;
|
||||
_current_static_row_does_not_match = false;
|
||||
_rows_dropped = 0;
|
||||
}
|
||||
uint32_t get_rows_dropped() const {
|
||||
return _rows_dropped;
|
||||
}
|
||||
private:
|
||||
bool do_filter(const selection& selection, const std::vector<bytes>& pk, const std::vector<bytes>& ck, const query::result_row_view& static_row, const query::result_row_view& row) const;
|
||||
};
|
||||
|
||||
result_set_builder(const selection& s, gc_clock::time_point now, cql_serialization_format sf);
|
||||
@@ -372,7 +382,7 @@ public:
|
||||
}
|
||||
}
|
||||
|
||||
void accept_partition_end(const query::result_row_view& static_row) {
|
||||
uint32_t accept_partition_end(const query::result_row_view& static_row) {
|
||||
if (_row_count == 0) {
|
||||
_builder.new_row();
|
||||
auto static_row_iterator = static_row.iterator();
|
||||
@@ -386,6 +396,7 @@ public:
|
||||
}
|
||||
}
|
||||
}
|
||||
return _filter.get_rows_dropped();
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -53,6 +53,7 @@ selector_factories::selector_factories(std::vector<::shared_ptr<selectable>> sel
|
||||
: _contains_write_time_factory(false)
|
||||
, _contains_ttl_factory(false)
|
||||
, _number_of_aggregate_factories(0)
|
||||
, _number_of_factories_for_post_processing(0)
|
||||
{
|
||||
_factories.reserve(selectables.size());
|
||||
|
||||
@@ -76,8 +77,9 @@ bool selector_factories::uses_function(const sstring& ks_name, const sstring& fu
|
||||
return false;
|
||||
}
|
||||
|
||||
void selector_factories::add_selector_for_ordering(const column_definition& def, uint32_t index) {
|
||||
void selector_factories::add_selector_for_post_processing(const column_definition& def, uint32_t index) {
|
||||
_factories.emplace_back(simple_selector::new_factory(def.name_as_text(), index, def.type));
|
||||
++_number_of_factories_for_post_processing;
|
||||
}
|
||||
|
||||
std::vector<::shared_ptr<selector>> selector_factories::new_instances() const {
|
||||
|
||||
@@ -74,6 +74,11 @@ private:
|
||||
*/
|
||||
uint32_t _number_of_aggregate_factories;
|
||||
|
||||
/**
|
||||
* The number of factories that are only for post processing.
|
||||
*/
|
||||
uint32_t _number_of_factories_for_post_processing;
|
||||
|
||||
public:
|
||||
/**
|
||||
* Creates a new <code>SelectorFactories</code> instance and collect the column definitions.
|
||||
@@ -97,11 +102,12 @@ public:
|
||||
bool uses_function(const sstring& ks_name, const sstring& function_name) const;
|
||||
|
||||
/**
|
||||
* Adds a new <code>Selector.Factory</code> for a column that is needed only for ORDER BY purposes.
|
||||
* Adds a new <code>Selector.Factory</code> for a column that is needed only for ORDER BY or post
|
||||
* processing purposes.
|
||||
* @param def the column that is needed for ordering or post processing
|
||||
* @param index the index of the column definition in the Selection's list of columns
|
||||
*/
|
||||
void add_selector_for_ordering(const column_definition& def, uint32_t index);
|
||||
void add_selector_for_post_processing(const column_definition& def, uint32_t index);
|
||||
|
||||
/**
|
||||
* Checks if this <code>SelectorFactories</code> contains only factories for aggregates.
|
||||
@@ -111,7 +117,7 @@ public:
|
||||
*/
|
||||
bool contains_only_aggregate_functions() const {
|
||||
auto size = _factories.size();
|
||||
return size != 0 && _number_of_aggregate_factories == size;
|
||||
return size != 0 && _number_of_aggregate_factories == (size - _number_of_factories_for_post_processing);
|
||||
}
|
||||
|
||||
/**
|
||||
|
||||
@@ -137,10 +137,15 @@ create_index_statement::validate(service::storage_proxy& proxy, const service::c
|
||||
|
||||
bool is_map = dynamic_cast<const collection_type_impl *>(cd->type.get()) != nullptr
|
||||
&& dynamic_cast<const collection_type_impl *>(cd->type.get())->is_map();
|
||||
bool is_frozen_collection = cd->type->is_collection() && !cd->type->is_multi_cell();
|
||||
bool is_collection = cd->type->is_collection();
|
||||
bool is_frozen_collection = is_collection && !cd->type->is_multi_cell();
|
||||
|
||||
if (is_frozen_collection) {
|
||||
validate_for_frozen_collection(target);
|
||||
} else if (is_collection) {
|
||||
// NOTICE(sarna): should be lifted after #2962 (indexes on non-frozen collections) is implemented
|
||||
throw exceptions::invalid_request_exception(
|
||||
sprint("Cannot create secondary index on non-frozen collection column %s", cd->name_as_text()));
|
||||
} else {
|
||||
validate_not_full_index(target);
|
||||
validate_is_values_index_if_target_column_not_collection(cd, target);
|
||||
|
||||
@@ -84,7 +84,6 @@ create_view_statement::create_view_statement(
|
||||
, _clustering_keys{clustering_keys}
|
||||
, _if_not_exists{if_not_exists}
|
||||
{
|
||||
service::get_local_storage_proxy().get_db().local().get_config().check_experimental("Creating materialized views");
|
||||
if (!service::get_local_storage_service().cluster_supports_materialized_views()) {
|
||||
throw exceptions::invalid_request_exception("Can't create materialized views until the whole cluster has been upgraded");
|
||||
}
|
||||
@@ -315,6 +314,27 @@ future<shared_ptr<cql_transport::event::schema_change>> create_view_statement::a
|
||||
throw exceptions::invalid_request_exception(sprint("No columns are defined for Materialized View other than primary key"));
|
||||
}
|
||||
|
||||
// The unique feature of a filter by a non-key column is that the
|
||||
// value of such column can be updated - and also be expired with TTL
|
||||
// and cause the view row to appear and disappear. We don't currently
|
||||
// support this case - see issue #3430, and neither does
|
||||
// Cassandra - see CASSANDRA-13798 and CASSANDRA-13832.
|
||||
// Actually, as CASSANDRA-13798 explains, the problem is "the liveness of
|
||||
// view row is now depending on multiple base columns (multiple filtered
|
||||
// non-pk base column + base column used in view pk)". When the filtered
|
||||
// column *is* the base column added to the view pk, we don't have this
|
||||
// problem. And this case actually works correctly.
|
||||
auto non_pk_restrictions = restrictions->get_non_pk_restriction();
|
||||
if (non_pk_restrictions.size() == 1 && has_non_pk_column &&
|
||||
std::find(target_primary_keys.begin(), target_primary_keys.end(), non_pk_restrictions.cbegin()->first) != target_primary_keys.end()) {
|
||||
// This case (filter by new PK column of the view) works, as explained above
|
||||
} else if (!non_pk_restrictions.empty()) {
|
||||
auto column_names = ::join(", ", non_pk_restrictions | boost::adaptors::map_keys | boost::adaptors::transformed(std::mem_fn(&column_definition::name_as_text)));
|
||||
throw exceptions::invalid_request_exception(sprint(
|
||||
"Non-primary key columns cannot be restricted in the SELECT statement used for materialized view %s creation (got restrictions on: %s)",
|
||||
column_family(), column_names));
|
||||
}
|
||||
|
||||
schema_builder builder{keyspace(), column_family()};
|
||||
auto add_columns = [this, &builder] (std::vector<const column_definition*>& defs, column_kind kind) mutable {
|
||||
for (auto* def : defs) {
|
||||
|
||||
@@ -49,7 +49,7 @@ void cql3::statements::index_prop_defs::validate() {
|
||||
property_definitions::validate(keywords);
|
||||
|
||||
if (is_custom && !custom_class) {
|
||||
throw exceptions::invalid_request_exception("CUSTOM index requires specifiying the index class");
|
||||
throw exceptions::invalid_request_exception("CUSTOM index requires specifying the index class");
|
||||
}
|
||||
|
||||
if (!is_custom && custom_class) {
|
||||
@@ -64,6 +64,16 @@ void cql3::statements::index_prop_defs::validate() {
|
||||
sprint("Cannot specify %s as a CUSTOM option",
|
||||
db::index::secondary_index::custom_index_option_name));
|
||||
}
|
||||
|
||||
// Currently, Scylla does not support *any* class of custom index
|
||||
// implementation. If in the future we do (e.g., SASI, or something
|
||||
// new), we'll need to check for valid values here.
|
||||
if (is_custom && custom_class) {
|
||||
throw exceptions::invalid_request_exception(
|
||||
format("Unsupported CUSTOM INDEX class {}. Note that currently, Scylla does not support SASI or any other CUSTOM INDEX class.",
|
||||
*custom_class));
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
index_options_map
|
||||
|
||||
@@ -141,6 +141,10 @@ private:
|
||||
/** If ALLOW FILTERING was not specified, this verifies that it is not needed */
|
||||
void check_needs_filtering(::shared_ptr<restrictions::statement_restrictions> restrictions);
|
||||
|
||||
void ensure_filtering_columns_retrieval(database& db,
|
||||
::shared_ptr<selection::selection> selection,
|
||||
::shared_ptr<restrictions::statement_restrictions> restrictions);
|
||||
|
||||
bool contains_alias(::shared_ptr<column_identifier> name);
|
||||
|
||||
::shared_ptr<column_specification> limit_receiver();
|
||||
|
||||
@@ -383,8 +383,9 @@ select_statement::do_execute(service::storage_proxy& proxy,
|
||||
int32_t limit = get_limit(options);
|
||||
auto now = gc_clock::now();
|
||||
|
||||
const bool restrictions_need_filtering = _restrictions->need_filtering();
|
||||
++_stats.reads;
|
||||
_stats.filtered_reads += _restrictions->need_filtering();
|
||||
_stats.filtered_reads += restrictions_need_filtering;
|
||||
|
||||
auto command = ::make_lw_shared<query::read_command>(_schema->id(), _schema->version(),
|
||||
make_partition_slice(options), limit, now, tracing::make_trace_info(state.get_trace_state()), query::max_partitions, utils::UUID(), options.get_timestamp(state));
|
||||
@@ -396,37 +397,42 @@ select_statement::do_execute(service::storage_proxy& proxy,
|
||||
// An aggregation query will never be paged for the user, but we always page it internally to avoid OOM.
|
||||
// If the user provided a page_size we'll use that to page internally (because why not), otherwise we use our default
|
||||
// Note that if there are some nodes in the cluster with a version less than 2.0, we can't use paging (CASSANDRA-6707).
|
||||
auto aggregate = _selection->is_aggregate();
|
||||
if (aggregate && page_size <= 0) {
|
||||
const bool aggregate = _selection->is_aggregate();
|
||||
const bool nonpaged_filtering = restrictions_need_filtering && page_size <= 0;
|
||||
if (aggregate || nonpaged_filtering) {
|
||||
page_size = DEFAULT_COUNT_PAGE_SIZE;
|
||||
}
|
||||
|
||||
auto key_ranges = _restrictions->get_partition_key_ranges(options);
|
||||
|
||||
if (!aggregate && (page_size <= 0
|
||||
if (!aggregate && !restrictions_need_filtering && (page_size <= 0
|
||||
|| !service::pager::query_pagers::may_need_paging(*_schema, page_size,
|
||||
*command, key_ranges))) {
|
||||
return execute(proxy, command, std::move(key_ranges), state, options, now);
|
||||
}
|
||||
|
||||
command->slice.options.set<query::partition_slice::option::allow_short_read>();
|
||||
auto timeout = db::timeout_clock::now() + options.get_timeout_config().*get_timeout_config_selector();
|
||||
auto timeout_duration = options.get_timeout_config().*get_timeout_config_selector();
|
||||
auto p = service::pager::query_pagers::pager(_schema, _selection,
|
||||
state, options, command, std::move(key_ranges), _stats, _restrictions->need_filtering() ? _restrictions : nullptr);
|
||||
state, options, command, std::move(key_ranges), _stats, restrictions_need_filtering ? _restrictions : nullptr);
|
||||
|
||||
if (aggregate) {
|
||||
if (aggregate || nonpaged_filtering) {
|
||||
return do_with(
|
||||
cql3::selection::result_set_builder(*_selection, now,
|
||||
options.get_cql_serialization_format()),
|
||||
[this, p, page_size, now, timeout](auto& builder) {
|
||||
[this, p, page_size, now, timeout_duration, restrictions_need_filtering, limit](auto& builder) {
|
||||
return do_until([p] {return p->is_exhausted();},
|
||||
[p, &builder, page_size, now, timeout] {
|
||||
[p, &builder, page_size, now, timeout_duration] {
|
||||
auto timeout = db::timeout_clock::now() + timeout_duration;
|
||||
return p->fetch_page(builder, page_size, now, timeout);
|
||||
}
|
||||
).then([this, &builder] {
|
||||
).then([this, &builder, restrictions_need_filtering, limit] {
|
||||
auto rs = builder.build();
|
||||
if (restrictions_need_filtering) {
|
||||
rs->trim(limit);
|
||||
_stats.filtered_rows_matched_total += rs->size();
|
||||
}
|
||||
update_stats_rows_read(rs->size());
|
||||
_stats.filtered_rows_matched_total += _restrictions->need_filtering() ? rs->size() : 0;
|
||||
auto msg = ::make_shared<cql_transport::messages::result_message::rows>(result(std::move(rs)));
|
||||
return make_ready_future<shared_ptr<cql_transport::messages::result_message>>(std::move(msg));
|
||||
});
|
||||
@@ -439,7 +445,8 @@ select_statement::do_execute(service::storage_proxy& proxy,
|
||||
" you must either remove the ORDER BY or the IN and sort client side, or disable paging for this query");
|
||||
}
|
||||
|
||||
if (_selection->is_trivial() && !_restrictions->need_filtering()) {
|
||||
auto timeout = db::timeout_clock::now() + timeout_duration;
|
||||
if (_selection->is_trivial() && !restrictions_need_filtering) {
|
||||
return p->fetch_page_generator(page_size, now, timeout, _stats).then([this, p, limit] (result_generator generator) {
|
||||
auto meta = [&] () -> shared_ptr<const cql3::metadata> {
|
||||
if (!p->is_exhausted()) {
|
||||
@@ -458,14 +465,17 @@ select_statement::do_execute(service::storage_proxy& proxy,
|
||||
}
|
||||
|
||||
return p->fetch_page(page_size, now, timeout).then(
|
||||
[this, p, &options, limit, now](std::unique_ptr<cql3::result_set> rs) {
|
||||
[this, p, &options, limit, now, restrictions_need_filtering](std::unique_ptr<cql3::result_set> rs) {
|
||||
|
||||
if (!p->is_exhausted()) {
|
||||
rs->get_metadata().set_paging_state(p->state());
|
||||
}
|
||||
|
||||
if (restrictions_need_filtering) {
|
||||
rs->trim(limit);
|
||||
_stats.filtered_rows_matched_total += rs->size();
|
||||
}
|
||||
update_stats_rows_read(rs->size());
|
||||
_stats.filtered_rows_matched_total += _restrictions->need_filtering() ? rs->size() : 0;
|
||||
auto msg = ::make_shared<cql_transport::messages::result_message::rows>(result(std::move(rs)));
|
||||
return make_ready_future<shared_ptr<cql_transport::messages::result_message>>(std::move(msg));
|
||||
});
|
||||
@@ -492,15 +502,9 @@ generate_base_key_from_index_pk(const partition_key& index_pk, const clustering_
|
||||
return KeyType::from_range(exploded_base_key);
|
||||
}
|
||||
|
||||
future<shared_ptr<cql_transport::messages::result_message>>
|
||||
indexed_table_select_statement::execute_base_query(
|
||||
service::storage_proxy& proxy,
|
||||
dht::partition_range_vector&& partition_ranges,
|
||||
service::query_state& state,
|
||||
const query_options& options,
|
||||
gc_clock::time_point now,
|
||||
::shared_ptr<const service::pager::paging_state> paging_state) {
|
||||
auto cmd = ::make_lw_shared<query::read_command>(
|
||||
lw_shared_ptr<query::read_command>
|
||||
indexed_table_select_statement::prepare_command_for_base_query(const query_options& options, service::query_state& state, gc_clock::time_point now, bool use_paging) {
|
||||
lw_shared_ptr<query::read_command> cmd = ::make_lw_shared<query::read_command>(
|
||||
_schema->id(),
|
||||
_schema->version(),
|
||||
make_partition_slice(options),
|
||||
@@ -510,9 +514,25 @@ indexed_table_select_statement::execute_base_query(
|
||||
query::max_partitions,
|
||||
utils::UUID(),
|
||||
options.get_timestamp(state));
|
||||
if (options.get_page_size() > 0) {
|
||||
if (use_paging) {
|
||||
cmd->slice.options.set<query::partition_slice::option::allow_short_read>();
|
||||
cmd->slice.options.set<query::partition_slice::option::send_partition_key>();
|
||||
if (_schema->clustering_key_size() > 0) {
|
||||
cmd->slice.options.set<query::partition_slice::option::send_clustering_key>();
|
||||
}
|
||||
}
|
||||
return cmd;
|
||||
}
|
||||
|
||||
future<shared_ptr<cql_transport::messages::result_message>>
|
||||
indexed_table_select_statement::execute_base_query(
|
||||
service::storage_proxy& proxy,
|
||||
dht::partition_range_vector&& partition_ranges,
|
||||
service::query_state& state,
|
||||
const query_options& options,
|
||||
gc_clock::time_point now,
|
||||
::shared_ptr<const service::pager::paging_state> paging_state) {
|
||||
auto cmd = prepare_command_for_base_query(options, state, now, bool(paging_state));
|
||||
auto timeout = db::timeout_clock::now() + options.get_timeout_config().*get_timeout_config_selector();
|
||||
dht::partition_range_vector per_vnode_ranges;
|
||||
per_vnode_ranges.reserve(partition_ranges.size());
|
||||
@@ -586,19 +606,7 @@ indexed_table_select_statement::execute_base_query(
|
||||
const query_options& options,
|
||||
gc_clock::time_point now,
|
||||
::shared_ptr<const service::pager::paging_state> paging_state) {
|
||||
auto cmd = make_lw_shared<query::read_command>(
|
||||
_schema->id(),
|
||||
_schema->version(),
|
||||
make_partition_slice(options),
|
||||
get_limit(options),
|
||||
now,
|
||||
tracing::make_trace_info(state.get_trace_state()),
|
||||
query::max_partitions,
|
||||
utils::UUID(),
|
||||
options.get_timestamp(state));
|
||||
if (options.get_page_size() > 0) {
|
||||
cmd->slice.options.set<query::partition_slice::option::allow_short_read>();
|
||||
}
|
||||
auto cmd = prepare_command_for_base_query(options, state, now, bool(paging_state));
|
||||
auto timeout = db::timeout_clock::now() + options.get_timeout_config().*get_timeout_config_selector();
|
||||
|
||||
struct base_query_state {
|
||||
@@ -714,7 +722,8 @@ select_statement::process_results(foreign_ptr<lw_shared_ptr<query::result>> resu
|
||||
const query_options& options,
|
||||
gc_clock::time_point now)
|
||||
{
|
||||
bool fast_path = !needs_post_query_ordering() && _selection->is_trivial() && !_restrictions->need_filtering();
|
||||
const bool restrictions_need_filtering = _restrictions->need_filtering();
|
||||
const bool fast_path = !needs_post_query_ordering() && _selection->is_trivial() && !restrictions_need_filtering;
|
||||
if (fast_path) {
|
||||
return make_shared<cql_transport::messages::result_message::rows>(result(
|
||||
result_generator(_schema, std::move(results), std::move(cmd), _selection, _stats),
|
||||
@@ -724,7 +733,7 @@ select_statement::process_results(foreign_ptr<lw_shared_ptr<query::result>> resu
|
||||
|
||||
cql3::selection::result_set_builder builder(*_selection, now,
|
||||
options.get_cql_serialization_format());
|
||||
if (_restrictions->need_filtering()) {
|
||||
if (restrictions_need_filtering) {
|
||||
results->ensure_counts();
|
||||
_stats.filtered_rows_read_total += *results->row_count();
|
||||
query::result_view::consume(*results, cmd->slice,
|
||||
@@ -743,9 +752,11 @@ select_statement::process_results(foreign_ptr<lw_shared_ptr<query::result>> resu
|
||||
rs->reverse();
|
||||
}
|
||||
rs->trim(cmd->row_limit);
|
||||
} else if (restrictions_need_filtering) {
|
||||
rs->trim(cmd->row_limit);
|
||||
}
|
||||
update_stats_rows_read(rs->size());
|
||||
_stats.filtered_rows_matched_total += _restrictions->need_filtering() ? rs->size() : 0;
|
||||
_stats.filtered_rows_matched_total += restrictions_need_filtering ? rs->size() : 0;
|
||||
return ::make_shared<cql_transport::messages::result_message::rows>(result(std::move(rs)));
|
||||
}
|
||||
|
||||
@@ -774,7 +785,8 @@ indexed_table_select_statement::prepare(database& db,
|
||||
ordering_comparator_type ordering_comparator,
|
||||
::shared_ptr<term> limit, cql_stats &stats)
|
||||
{
|
||||
auto index_opt = find_idx(db, schema, restrictions);
|
||||
auto& sim = db.find_column_family(schema).get_index_manager();
|
||||
auto index_opt = restrictions->find_idx(sim);
|
||||
if (!index_opt) {
|
||||
throw std::runtime_error("No index found.");
|
||||
}
|
||||
@@ -798,24 +810,6 @@ indexed_table_select_statement::prepare(database& db,
|
||||
|
||||
}
|
||||
|
||||
|
||||
stdx::optional<secondary_index::index> indexed_table_select_statement::find_idx(database& db,
|
||||
schema_ptr schema,
|
||||
::shared_ptr<restrictions::statement_restrictions> restrictions)
|
||||
{
|
||||
auto& sim = db.find_column_family(schema).get_index_manager();
|
||||
for (::shared_ptr<cql3::restrictions::restrictions> restriction : restrictions->index_restrictions()) {
|
||||
for (const auto& cdef : restriction->get_column_defs()) {
|
||||
for (auto index : sim.list_indexes()) {
|
||||
if (index.depends_on(*cdef)) {
|
||||
return stdx::make_optional<secondary_index::index>(std::move(index));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return stdx::nullopt;
|
||||
}
|
||||
|
||||
indexed_table_select_statement::indexed_table_select_statement(schema_ptr schema, uint32_t bound_terms,
|
||||
::shared_ptr<parameters> parameters,
|
||||
::shared_ptr<selection::selection> selection,
|
||||
@@ -1219,6 +1213,7 @@ std::unique_ptr<prepared_statement> select_statement::prepare(database& db, cql_
|
||||
}
|
||||
|
||||
check_needs_filtering(restrictions);
|
||||
ensure_filtering_columns_retrieval(db, selection, restrictions);
|
||||
|
||||
::shared_ptr<cql3::statements::select_statement> stmt;
|
||||
if (restrictions->uses_secondary_indexing()) {
|
||||
@@ -1357,7 +1352,7 @@ select_statement::get_ordering_comparator(schema_ptr schema,
|
||||
}
|
||||
auto index = selection->index_of(*def);
|
||||
if (index < 0) {
|
||||
index = selection->add_column_for_ordering(*def);
|
||||
index = selection->add_column_for_post_processing(*def);
|
||||
}
|
||||
|
||||
sorters.emplace_back(index, def->type);
|
||||
@@ -1444,6 +1439,23 @@ void select_statement::check_needs_filtering(::shared_ptr<restrictions::statemen
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds columns that are needed for the purpose of filtering to the selection.
|
||||
* The columns that are added to the selection are columns that
|
||||
* are needed for filtering on the coordinator but are not part of the selection.
|
||||
* The columns are added with metadata indicating they are not to be returned
|
||||
* to the user.
|
||||
*/
|
||||
void select_statement::ensure_filtering_columns_retrieval(database& db,
|
||||
::shared_ptr<selection::selection> selection,
|
||||
::shared_ptr<restrictions::statement_restrictions> restrictions) {
|
||||
for (auto&& cdef : restrictions->get_column_defs_for_filtering(db)) {
|
||||
if (!selection->has_column(*cdef)) {
|
||||
selection->add_column_for_post_processing(*cdef);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
bool select_statement::contains_alias(::shared_ptr<column_identifier> name) {
|
||||
return std::any_of(_select_clause.begin(), _select_clause.end(), [name] (auto raw) {
|
||||
return raw->alias && *name == *raw->alias;
|
||||
|
||||
@@ -186,10 +186,6 @@ public:
|
||||
schema_ptr view_schema);
|
||||
|
||||
private:
|
||||
static stdx::optional<secondary_index::index> find_idx(database& db,
|
||||
schema_ptr schema,
|
||||
::shared_ptr<restrictions::statement_restrictions> restrictions);
|
||||
|
||||
virtual future<::shared_ptr<cql_transport::messages::result_message>> do_execute(service::storage_proxy& proxy,
|
||||
service::query_state& state, const query_options& options) override;
|
||||
|
||||
@@ -214,6 +210,9 @@ private:
|
||||
gc_clock::time_point now,
|
||||
::shared_ptr<const service::pager::paging_state> paging_state);
|
||||
|
||||
lw_shared_ptr<query::read_command>
|
||||
prepare_command_for_base_query(const query_options& options, service::query_state& state, gc_clock::time_point now, bool use_paging);
|
||||
|
||||
future<shared_ptr<cql_transport::messages::result_message>>
|
||||
execute_base_query(
|
||||
service::storage_proxy& proxy,
|
||||
|
||||
110
database.cc
110
database.cc
@@ -76,6 +76,8 @@
|
||||
#include "sstables/compaction_manager.hh"
|
||||
#include "sstables/compaction_backlog_manager.hh"
|
||||
#include "sstables/progress_monitor.hh"
|
||||
#include "auth/common.hh"
|
||||
#include "tracing/trace_keyspace_helper.hh"
|
||||
|
||||
#include "checked-file-impl.hh"
|
||||
#include "disk-error-handler.hh"
|
||||
@@ -178,6 +180,18 @@ bool is_system_keyspace(const sstring& name) {
|
||||
return system_keyspaces.find(name) != system_keyspaces.end();
|
||||
}
|
||||
|
||||
static const std::unordered_set<sstring> internal_keyspaces = {
|
||||
db::system_distributed_keyspace::NAME,
|
||||
db::system_keyspace::NAME,
|
||||
db::schema_tables::NAME,
|
||||
auth::meta::AUTH_KS,
|
||||
tracing::trace_keyspace_helper::KEYSPACE_NAME
|
||||
};
|
||||
|
||||
bool is_internal_keyspace(const sstring& name) {
|
||||
return internal_keyspaces.find(name) != internal_keyspaces.end();
|
||||
}
|
||||
|
||||
// Used for tests where the CF exists without a database object. We need to pass a valid
|
||||
// dirty_memory manager in that case.
|
||||
thread_local dirty_memory_manager default_dirty_memory_manager;
|
||||
@@ -684,9 +698,11 @@ table::make_reader(schema_ptr s,
|
||||
return make_combined_reader(s, std::move(readers), fwd, fwd_mr);
|
||||
}
|
||||
|
||||
sstables::shared_sstable
|
||||
table::make_streaming_sstable_for_write() {
|
||||
sstables::shared_sstable table::make_streaming_sstable_for_write(std::optional<sstring> subdir) {
|
||||
sstring dir = _config.datadir;
|
||||
if (subdir) {
|
||||
dir += "/" + *subdir;
|
||||
}
|
||||
auto newtab = sstables::make_sstable(_schema,
|
||||
dir, calculate_generation_for_new_table(),
|
||||
get_highest_supported_format(),
|
||||
@@ -826,7 +842,11 @@ void table::add_sstable(sstables::shared_sstable sstable, const std::vector<unsi
|
||||
new_sstables->insert(sstable);
|
||||
_sstables = std::move(new_sstables);
|
||||
update_stats_for_new_sstable(sstable->bytes_on_disk(), shards_for_the_sstable);
|
||||
_compaction_strategy.get_backlog_tracker().add_sstable(sstable);
|
||||
if (sstable->is_staging()) {
|
||||
_sstables_staging.emplace(sstable->generation(), sstable);
|
||||
} else {
|
||||
_compaction_strategy.get_backlog_tracker().add_sstable(sstable);
|
||||
}
|
||||
}
|
||||
|
||||
future<>
|
||||
@@ -1613,7 +1633,9 @@ std::vector<sstables::shared_sstable> table::select_sstables(const dht::partitio
|
||||
|
||||
std::vector<sstables::shared_sstable> table::candidates_for_compaction() const {
|
||||
return boost::copy_range<std::vector<sstables::shared_sstable>>(*get_sstables()
|
||||
| boost::adaptors::filtered([this] (auto& sst) { return !_sstables_need_rewrite.count(sst->generation()); }));
|
||||
| boost::adaptors::filtered([this] (auto& sst) {
|
||||
return !_sstables_need_rewrite.count(sst->generation()) && !_sstables_staging.count(sst->generation());
|
||||
}));
|
||||
}
|
||||
|
||||
std::vector<sstables::shared_sstable> table::sstables_need_rewrite() const {
|
||||
@@ -1671,9 +1693,9 @@ future<> distributed_loader::open_sstable(distributed<database>& db, sstables::e
|
||||
// to distribute evenly the resource usage among all shards.
|
||||
|
||||
return db.invoke_on(column_family::calculate_shard_from_sstable_generation(comps.generation),
|
||||
[&db, comps = std::move(comps), func = std::move(func), pc] (database& local) {
|
||||
[&db, comps = std::move(comps), func = std::move(func), &pc] (database& local) {
|
||||
|
||||
return with_semaphore(local.sstable_load_concurrency_sem(), 1, [&db, &local, comps = std::move(comps), func = std::move(func), pc] {
|
||||
return with_semaphore(local.sstable_load_concurrency_sem(), 1, [&db, &local, comps = std::move(comps), func = std::move(func), &pc] {
|
||||
auto& cf = local.find_column_family(comps.ks, comps.cf);
|
||||
|
||||
auto f = sstables::sstable::load_shared_components(cf.schema(), comps.sstdir, comps.generation, comps.version, comps.format, pc);
|
||||
@@ -1969,6 +1991,12 @@ future<sstables::entry_descriptor> distributed_loader::probe_file(distributed<da
|
||||
}
|
||||
auto cf_sstable_open = [sstdir, comps, fname] (column_family& cf, sstables::foreign_sstable_open_info info) {
|
||||
cf.update_sstables_known_generation(comps.generation);
|
||||
if (shared_sstable sst = cf.get_staging_sstable(comps.generation)) {
|
||||
dblog.warn("SSTable {} is already present in staging/ directory. Moving from staging will be retried.", sst->get_filename());
|
||||
return seastar::async([sst = std::move(sst), comps = std::move(comps)] () {
|
||||
sst->move_to_new_dir_in_thread(comps.sstdir, comps.generation);
|
||||
});
|
||||
}
|
||||
{
|
||||
auto i = boost::range::find_if(*cf._sstables->all(), [gen = comps.generation] (sstables::shared_sstable sst) { return sst->generation() == gen; });
|
||||
if (i != cf._sstables->all()->end()) {
|
||||
@@ -2154,9 +2182,6 @@ database::database(const db::config& cfg, database_config dbcfg)
|
||||
[this] {
|
||||
++_stats->sstable_read_queue_overloaded;
|
||||
return std::make_exception_ptr(std::runtime_error("sstable inactive read queue overloaded"));
|
||||
},
|
||||
[this] {
|
||||
return _querier_cache.evict_one();
|
||||
})
|
||||
// No timeouts or queue length limits - a failure here can kill an entire repair.
|
||||
// Trust the caller to limit concurrency.
|
||||
@@ -2168,7 +2193,7 @@ database::database(const db::config& cfg, database_config dbcfg)
|
||||
, _version(empty_version)
|
||||
, _compaction_manager(make_compaction_manager(*_cfg, dbcfg))
|
||||
, _enable_incremental_backups(cfg.incremental_backups())
|
||||
, _querier_cache(dbcfg.available_memory * 0.04)
|
||||
, _querier_cache(_read_concurrency_sem, dbcfg.available_memory * 0.04)
|
||||
, _large_partition_handler(std::make_unique<db::cql_table_large_partition_handler>(_cfg->compaction_large_partition_warning_threshold_mb()*1024*1024))
|
||||
, _result_memory_limiter(dbcfg.available_memory / 10)
|
||||
{
|
||||
@@ -2420,6 +2445,9 @@ database::setup_metrics() {
|
||||
}
|
||||
|
||||
database::~database() {
|
||||
_read_concurrency_sem.clear_inactive_reads();
|
||||
_streaming_concurrency_sem.clear_inactive_reads();
|
||||
_system_read_concurrency_sem.clear_inactive_reads();
|
||||
}
|
||||
|
||||
void database::update_version(const utils::UUID& version) {
|
||||
@@ -2450,6 +2478,8 @@ future<> distributed_loader::populate_keyspace(distributed<database>& db, sstrin
|
||||
auto sstdir = ks.column_family_directory(ksdir, cfname, uuid);
|
||||
dblog.info("Keyspace {}: Reading CF {} id={} version={}", ks_name, cfname, uuid, s->version());
|
||||
return ks.make_directory_for_column_family(cfname, uuid).then([&db, sstdir, uuid, ks_name, cfname] {
|
||||
return distributed_loader::populate_column_family(db, sstdir + "/staging", ks_name, cfname);
|
||||
}).then([&db, sstdir, uuid, ks_name, cfname] {
|
||||
return distributed_loader::populate_column_family(db, sstdir, ks_name, cfname);
|
||||
}).handle_exception([ks_name, cfname, sstdir](std::exception_ptr eptr) {
|
||||
std::string msg =
|
||||
@@ -2930,6 +2960,7 @@ keyspace::make_directory_for_column_family(const sstring& name, utils::UUID uuid
|
||||
io_check(recursive_touch_directory, cfdir).get();
|
||||
}
|
||||
io_check(touch_directory, cfdirs[0] + "/upload").get();
|
||||
io_check(touch_directory, cfdirs[0] + "/staging").get();
|
||||
});
|
||||
}
|
||||
|
||||
@@ -4239,6 +4270,7 @@ future<> table::fail_streaming_mutations(utils::UUID plan_id) {
|
||||
_streaming_memtables_big.erase(it);
|
||||
return entry->flush_in_progress.close().then([this, entry] {
|
||||
for (auto&& sst : entry->sstables) {
|
||||
sst.monitor->write_failed();
|
||||
sst.sstable->mark_for_deletion();
|
||||
}
|
||||
});
|
||||
@@ -4447,64 +4479,6 @@ future<> table::generate_and_propagate_view_updates(const schema_ptr& base,
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* Given an update for the base table, calculates the set of potentially affected views,
|
||||
* generates the relevant updates, and sends them to the paired view replicas.
|
||||
*/
|
||||
future<row_locker::lock_holder> table::push_view_replica_updates(const schema_ptr& s, const frozen_mutation& fm, db::timeout_clock::time_point timeout) const {
|
||||
//FIXME: Avoid unfreezing here.
|
||||
auto m = fm.unfreeze(s);
|
||||
auto& base = schema();
|
||||
m.upgrade(base);
|
||||
auto views = affected_views(base, m);
|
||||
if (views.empty()) {
|
||||
return make_ready_future<row_locker::lock_holder>();
|
||||
}
|
||||
auto cr_ranges = db::view::calculate_affected_clustering_ranges(*base, m.decorated_key(), m.partition(), views);
|
||||
if (cr_ranges.empty()) {
|
||||
return generate_and_propagate_view_updates(base, std::move(views), std::move(m), { }, timeout).then([] {
|
||||
// In this case we are not doing a read-before-write, just a
|
||||
// write, so no lock is needed.
|
||||
return make_ready_future<row_locker::lock_holder>();
|
||||
});
|
||||
}
|
||||
// We read the whole set of regular columns in case the update now causes a base row to pass
|
||||
// a view's filters, and a view happens to include columns that have no value in this update.
|
||||
// Also, one of those columns can determine the lifetime of the base row, if it has a TTL.
|
||||
auto columns = boost::copy_range<std::vector<column_id>>(
|
||||
base->regular_columns() | boost::adaptors::transformed(std::mem_fn(&column_definition::id)));
|
||||
query::partition_slice::option_set opts;
|
||||
opts.set(query::partition_slice::option::send_partition_key);
|
||||
opts.set(query::partition_slice::option::send_clustering_key);
|
||||
opts.set(query::partition_slice::option::send_timestamp);
|
||||
opts.set(query::partition_slice::option::send_ttl);
|
||||
auto slice = query::partition_slice(
|
||||
std::move(cr_ranges), { }, std::move(columns), std::move(opts), { }, cql_serialization_format::internal(), query::max_rows);
|
||||
// Take the shard-local lock on the base-table row or partition as needed.
|
||||
// We'll return this lock to the caller, which will release it after
|
||||
// writing the base-table update.
|
||||
future<row_locker::lock_holder> lockf = local_base_lock(base, m.decorated_key(), slice.default_row_ranges(), timeout);
|
||||
return lockf.then([m = std::move(m), slice = std::move(slice), views = std::move(views), base, this, timeout] (row_locker::lock_holder lock) {
|
||||
return do_with(
|
||||
dht::partition_range::make_singular(m.decorated_key()),
|
||||
std::move(slice),
|
||||
std::move(m),
|
||||
[base, views = std::move(views), lock = std::move(lock), this, timeout] (auto& pk, auto& slice, auto& m) mutable {
|
||||
auto reader = this->make_reader(
|
||||
base,
|
||||
pk,
|
||||
slice,
|
||||
service::get_local_sstable_query_read_priority());
|
||||
return this->generate_and_propagate_view_updates(base, std::move(views), std::move(m), std::move(reader), timeout).then([lock = std::move(lock)] () mutable {
|
||||
// return the local partition/row lock we have taken so it
|
||||
// remains locked until the caller is done modifying this
|
||||
// partition/row and destroys the lock object.
|
||||
return std::move(lock);
|
||||
});
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* Shard-local locking of clustering rows or entire partitions of the base
|
||||
* table during a Materialized-View read-modify-update:
|
||||
|
||||
37
database.hh
37
database.hh
@@ -298,6 +298,8 @@ public:
|
||||
class table;
|
||||
using column_family = table;
|
||||
|
||||
class database_sstable_write_monitor;
|
||||
|
||||
class table : public enable_lw_shared_from_this<table> {
|
||||
public:
|
||||
struct config {
|
||||
@@ -395,7 +397,7 @@ private:
|
||||
// plan memtables and the resulting sstables are not made visible until
|
||||
// the streaming is complete.
|
||||
struct monitored_sstable {
|
||||
std::unique_ptr<sstables::write_monitor> monitor;
|
||||
std::unique_ptr<database_sstable_write_monitor> monitor;
|
||||
sstables::shared_sstable sstable;
|
||||
};
|
||||
|
||||
@@ -432,6 +434,9 @@ private:
|
||||
// but for correct compaction we need to start the compaction only after
|
||||
// reading all sstables.
|
||||
std::unordered_map<uint64_t, sstables::shared_sstable> _sstables_need_rewrite;
|
||||
// sstables that should not be compacted (e.g. because they need to be used
|
||||
// to generate view updates later)
|
||||
std::unordered_map<uint64_t, sstables::shared_sstable> _sstables_staging;
|
||||
// Control background fibers waiting for sstables to be deleted
|
||||
seastar::gate _sstable_deletion_gate;
|
||||
// There are situations in which we need to stop writing sstables. Flushers will take
|
||||
@@ -485,6 +490,11 @@ private:
|
||||
utils::phased_barrier _pending_reads_phaser;
|
||||
public:
|
||||
future<> add_sstable_and_update_cache(sstables::shared_sstable sst);
|
||||
void move_sstable_from_staging_in_thread(sstables::shared_sstable sst);
|
||||
sstables::shared_sstable get_staging_sstable(uint64_t generation) {
|
||||
auto it = _sstables_staging.find(generation);
|
||||
return it != _sstables_staging.end() ? it->second : nullptr;
|
||||
}
|
||||
private:
|
||||
void update_stats_for_new_sstable(uint64_t disk_space_used_by_sstable, const std::vector<unsigned>& shards_for_the_sstable) noexcept;
|
||||
// Adds new sstable to the set of sstables
|
||||
@@ -618,6 +628,14 @@ public:
|
||||
tracing::trace_state_ptr trace_state = nullptr,
|
||||
streamed_mutation::forwarding fwd = streamed_mutation::forwarding::no,
|
||||
mutation_reader::forwarding fwd_mr = mutation_reader::forwarding::yes) const;
|
||||
flat_mutation_reader make_reader_excluding_sstable(schema_ptr schema,
|
||||
sstables::shared_sstable sst,
|
||||
const dht::partition_range& range,
|
||||
const query::partition_slice& slice,
|
||||
const io_priority_class& pc = default_priority_class(),
|
||||
tracing::trace_state_ptr trace_state = nullptr,
|
||||
streamed_mutation::forwarding fwd = streamed_mutation::forwarding::no,
|
||||
mutation_reader::forwarding fwd_mr = mutation_reader::forwarding::yes) const;
|
||||
|
||||
flat_mutation_reader make_reader(schema_ptr schema, const dht::partition_range& range = query::full_partition_range) const {
|
||||
auto& full_slice = schema->full_slice();
|
||||
@@ -632,9 +650,13 @@ public:
|
||||
flat_mutation_reader make_streaming_reader(schema_ptr schema,
|
||||
const dht::partition_range_vector& ranges) const;
|
||||
|
||||
sstables::shared_sstable make_streaming_sstable_for_write();
|
||||
sstables::shared_sstable make_streaming_sstable_for_write(std::optional<sstring> subdir = {});
|
||||
sstables::shared_sstable make_streaming_staging_sstable() {
|
||||
return make_streaming_sstable_for_write("staging");
|
||||
}
|
||||
|
||||
mutation_source as_mutation_source() const;
|
||||
mutation_source as_mutation_source_excluding(sstables::shared_sstable sst) const;
|
||||
|
||||
void set_virtual_reader(mutation_source virtual_reader) {
|
||||
_virtual_reader = std::move(virtual_reader);
|
||||
@@ -842,6 +864,8 @@ public:
|
||||
void clear_views();
|
||||
const std::vector<view_ptr>& views() const;
|
||||
future<row_locker::lock_holder> push_view_replica_updates(const schema_ptr& s, const frozen_mutation& fm, db::timeout_clock::time_point timeout) const;
|
||||
future<row_locker::lock_holder> push_view_replica_updates(const schema_ptr& s, mutation&& m, db::timeout_clock::time_point timeout) const;
|
||||
future<row_locker::lock_holder> stream_view_replica_updates(const schema_ptr& s, mutation&& m, db::timeout_clock::time_point timeout, sstables::shared_sstable excluded_sstable) const;
|
||||
void add_coordinator_read_latency(utils::estimated_histogram::duration latency);
|
||||
std::chrono::milliseconds get_coordinator_read_latency_percentile(double percentile);
|
||||
|
||||
@@ -860,6 +884,7 @@ public:
|
||||
flat_mutation_reader&&);
|
||||
|
||||
private:
|
||||
future<row_locker::lock_holder> do_push_view_replica_updates(const schema_ptr& s, mutation&& m, db::timeout_clock::time_point timeout, mutation_source&& source) const;
|
||||
std::vector<view_ptr> affected_views(const schema_ptr& base, const mutation& update) const;
|
||||
future<> generate_and_propagate_view_updates(const schema_ptr& base,
|
||||
std::vector<view_ptr>&& views,
|
||||
@@ -1399,6 +1424,12 @@ public:
|
||||
std::unordered_set<sstring> get_initial_tokens();
|
||||
std::experimental::optional<gms::inet_address> get_replace_address();
|
||||
bool is_replacing();
|
||||
reader_concurrency_semaphore& user_read_concurrency_sem() {
|
||||
return _read_concurrency_sem;
|
||||
}
|
||||
reader_concurrency_semaphore& streaming_read_concurrency_sem() {
|
||||
return _streaming_concurrency_sem;
|
||||
}
|
||||
reader_concurrency_semaphore& system_keyspace_read_concurrency_sem() {
|
||||
return _system_read_concurrency_sem;
|
||||
}
|
||||
@@ -1428,6 +1459,8 @@ public:
|
||||
|
||||
future<> update_schema_version_and_announce(distributed<service::storage_proxy>& proxy);
|
||||
|
||||
bool is_internal_keyspace(const sstring& name);
|
||||
|
||||
class distributed_loader {
|
||||
public:
|
||||
static void reshard(distributed<database>& db, sstring ks_name, sstring cf_name);
|
||||
|
||||
@@ -1673,14 +1673,14 @@ const db::commitlog::config& db::commitlog::active_config() const {
|
||||
// No commit_io_check needed in the log reader since the database will fail
|
||||
// on error at startup if required
|
||||
future<std::unique_ptr<subscription<temporary_buffer<char>, db::replay_position>>>
|
||||
db::commitlog::read_log_file(const sstring& filename, commit_load_reader_func next, position_type off, const db::extensions* exts) {
|
||||
db::commitlog::read_log_file(const sstring& filename, seastar::io_priority_class read_io_prio_class, commit_load_reader_func next, position_type off, const db::extensions* exts) {
|
||||
struct work {
|
||||
private:
|
||||
file_input_stream_options make_file_input_stream_options() {
|
||||
file_input_stream_options make_file_input_stream_options(seastar::io_priority_class read_io_prio_class) {
|
||||
file_input_stream_options fo;
|
||||
fo.buffer_size = db::commitlog::segment::default_size;
|
||||
fo.read_ahead = 10;
|
||||
fo.io_priority_class = service::get_local_commitlog_priority();
|
||||
fo.io_priority_class = read_io_prio_class;
|
||||
return fo;
|
||||
}
|
||||
public:
|
||||
@@ -1699,8 +1699,8 @@ db::commitlog::read_log_file(const sstring& filename, commit_load_reader_func ne
|
||||
bool header = true;
|
||||
bool failed = false;
|
||||
|
||||
work(file f, position_type o = 0)
|
||||
: f(f), fin(make_file_input_stream(f, 0, make_file_input_stream_options())), start_off(o) {
|
||||
work(file f, seastar::io_priority_class read_io_prio_class, position_type o = 0)
|
||||
: f(f), fin(make_file_input_stream(f, 0, make_file_input_stream_options(read_io_prio_class))), start_off(o) {
|
||||
}
|
||||
work(work&&) = default;
|
||||
|
||||
@@ -1918,9 +1918,9 @@ db::commitlog::read_log_file(const sstring& filename, commit_load_reader_func ne
|
||||
return fut;
|
||||
});
|
||||
|
||||
return fut.then([off, next](file f) {
|
||||
return fut.then([off, next, read_io_prio_class] (file f) {
|
||||
f = make_checked_file(commit_error_handler, std::move(f));
|
||||
auto w = make_lw_shared<work>(std::move(f), off);
|
||||
auto w = make_lw_shared<work>(std::move(f), read_io_prio_class, off);
|
||||
auto ret = w->s.listen(next);
|
||||
|
||||
w->s.started().then(std::bind(&work::read_file, w.get())).then([w] {
|
||||
|
||||
@@ -355,7 +355,7 @@ public:
|
||||
};
|
||||
|
||||
static future<std::unique_ptr<subscription<temporary_buffer<char>, replay_position>>> read_log_file(
|
||||
const sstring&, commit_load_reader_func, position_type = 0, const db::extensions* = nullptr);
|
||||
const sstring&, seastar::io_priority_class read_io_prio_class, commit_load_reader_func, position_type = 0, const db::extensions* = nullptr);
|
||||
private:
|
||||
commitlog(config);
|
||||
|
||||
|
||||
@@ -34,7 +34,8 @@ public:
|
||||
commitlog_entry(stdx::optional<column_mapping> mapping, frozen_mutation&& mutation)
|
||||
: _mapping(std::move(mapping)), _mutation(std::move(mutation)) { }
|
||||
const stdx::optional<column_mapping>& mapping() const { return _mapping; }
|
||||
const frozen_mutation& mutation() const { return _mutation; }
|
||||
const frozen_mutation& mutation() const & { return _mutation; }
|
||||
frozen_mutation&& mutation() && { return std::move(_mutation); }
|
||||
};
|
||||
|
||||
class commitlog_entry_writer {
|
||||
@@ -80,5 +81,6 @@ public:
|
||||
commitlog_entry_reader(const temporary_buffer<char>& buffer);
|
||||
|
||||
const stdx::optional<column_mapping>& get_column_mapping() const { return _ce.mapping(); }
|
||||
const frozen_mutation& mutation() const { return _ce.mutation(); }
|
||||
const frozen_mutation& mutation() const & { return _ce.mutation(); }
|
||||
frozen_mutation&& mutation() && { return std::move(_ce).mutation(); }
|
||||
};
|
||||
|
||||
@@ -58,6 +58,7 @@
|
||||
#include "converting_mutation_partition_applier.hh"
|
||||
#include "schema_registry.hh"
|
||||
#include "commitlog_entry.hh"
|
||||
#include "service/priority_manager.hh"
|
||||
|
||||
static logging::logger rlogger("commitlog_replayer");
|
||||
|
||||
@@ -223,7 +224,7 @@ db::commitlog_replayer::impl::recover(sstring file, const sstring& fname_prefix)
|
||||
auto s = make_lw_shared<stats>();
|
||||
auto& exts = _qp.local().db().local().get_config().extensions();
|
||||
|
||||
return db::commitlog::read_log_file(file,
|
||||
return db::commitlog::read_log_file(file, service::get_local_commitlog_priority(),
|
||||
std::bind(&impl::process, this, s.get(), std::placeholders::_1,
|
||||
std::placeholders::_2), p, &exts).then([](auto s) {
|
||||
auto f = s->done();
|
||||
|
||||
@@ -453,7 +453,7 @@ public:
|
||||
"The maximum number of tombstones a query can scan before aborting." \
|
||||
) \
|
||||
/* Network timeout settings */ \
|
||||
val(range_request_timeout_in_ms, uint32_t, 10000, Unused, \
|
||||
val(range_request_timeout_in_ms, uint32_t, 10000, Used, \
|
||||
"The time in milliseconds that the coordinator waits for sequential or index scans to complete." \
|
||||
) \
|
||||
val(read_request_timeout_in_ms, uint32_t, 5000, Used, \
|
||||
@@ -472,7 +472,7 @@ public:
|
||||
"The time in milliseconds that the coordinator waits for write operations to complete.\n" \
|
||||
"Related information: About hinted handoff writes" \
|
||||
) \
|
||||
val(request_timeout_in_ms, uint32_t, 10000, Unused, \
|
||||
val(request_timeout_in_ms, uint32_t, 10000, Used, \
|
||||
"The default timeout for other, miscellaneous operations.\n" \
|
||||
"Related information: About hinted handoff writes" \
|
||||
) \
|
||||
@@ -578,7 +578,7 @@ public:
|
||||
val(dynamic_snitch_update_interval_in_ms, uint32_t, 100, Unused, \
|
||||
"The time interval for how often the snitch calculates node scores. Because score calculation is CPU intensive, be careful when reducing this interval." \
|
||||
) \
|
||||
val(hinted_handoff_enabled, sstring, "false", Used, \
|
||||
val(hinted_handoff_enabled, sstring, "true", Used, \
|
||||
"Enable or disable hinted handoff. To enable per data center, add data center list. For example: hinted_handoff_enabled: DC1,DC2. A hint indicates that the write needs to be replayed to an unavailable node. " \
|
||||
"Related information: About hinted handoff writes" \
|
||||
) \
|
||||
@@ -621,7 +621,7 @@ public:
|
||||
val(thrift_framed_transport_size_in_mb, uint32_t, 15, Unused, \
|
||||
"Frame size (maximum field length) for Thrift. The frame is the row or part of the row the application is inserting." \
|
||||
) \
|
||||
val(thrift_max_message_length_in_mb, uint32_t, 16, Unused, \
|
||||
val(thrift_max_message_length_in_mb, uint32_t, 16, Used, \
|
||||
"The maximum length of a Thrift message in megabytes, including all fields and internal Thrift overhead (1 byte of overhead for each frame). Message length is usually used in conjunction with batches. A frame length greater than or equal to 24 accommodates a batch with four inserts, each of which is 24 bytes. The required message length is greater than or equal to 24+24+24+24+4 (number of frames)." \
|
||||
) \
|
||||
/* Security properties */ \
|
||||
|
||||
@@ -35,6 +35,7 @@
|
||||
#include "disk-error-handler.hh"
|
||||
#include "lister.hh"
|
||||
#include "db/timeout_clock.hh"
|
||||
#include "service/priority_manager.hh"
|
||||
|
||||
using namespace std::literals::chrono_literals;
|
||||
|
||||
@@ -95,6 +96,7 @@ future<> manager::start(shared_ptr<service::storage_proxy> proxy_ptr, shared_ptr
|
||||
return compute_hints_dir_device_id();
|
||||
}).then([this] {
|
||||
_strorage_service_anchor->register_subscriber(this);
|
||||
set_started();
|
||||
});
|
||||
}
|
||||
|
||||
@@ -105,7 +107,7 @@ future<> manager::stop() {
|
||||
_strorage_service_anchor->unregister_subscriber(this);
|
||||
}
|
||||
|
||||
_stopping = true;
|
||||
set_stopping();
|
||||
|
||||
return _draining_eps_gate.close().finally([this] {
|
||||
return parallel_for_each(_ep_managers, [] (auto& pair) {
|
||||
@@ -277,7 +279,7 @@ inline bool manager::have_ep_manager(ep_key_type ep) const noexcept {
|
||||
}
|
||||
|
||||
bool manager::store_hint(ep_key_type ep, schema_ptr s, lw_shared_ptr<const frozen_mutation> fm, tracing::trace_state_ptr tr_state) noexcept {
|
||||
if (_stopping || !can_hint_for(ep)) {
|
||||
if (stopping() || !started() || !can_hint_for(ep)) {
|
||||
manager_logger.trace("Can't store a hint to {}", ep);
|
||||
++_stats.dropped;
|
||||
return false;
|
||||
@@ -380,7 +382,7 @@ future<timespec> manager::end_point_hints_manager::sender::get_last_file_modific
|
||||
});
|
||||
}
|
||||
|
||||
future<> manager::end_point_hints_manager::sender::do_send_one_mutation(mutation m, const std::vector<gms::inet_address>& natural_endpoints) noexcept {
|
||||
future<> manager::end_point_hints_manager::sender::do_send_one_mutation(frozen_mutation_and_schema m, const std::vector<gms::inet_address>& natural_endpoints) noexcept {
|
||||
return futurize_apply([this, m = std::move(m), &natural_endpoints] () mutable -> future<> {
|
||||
// The fact that we send with CL::ALL in both cases below ensures that new hints are not going
|
||||
// to be generated as a result of hints sending.
|
||||
@@ -392,7 +394,8 @@ future<> manager::end_point_hints_manager::sender::do_send_one_mutation(mutation
|
||||
// FIXME: using 1h as infinite timeout. If a node is down, we should get an
|
||||
// unavailable exception.
|
||||
auto timeout = db::timeout_clock::now() + 1h;
|
||||
return _proxy.mutate({std::move(m)}, consistency_level::ALL, timeout, nullptr);
|
||||
//FIXME: Add required frozen_mutation overloads
|
||||
return _proxy.mutate({m.fm.unfreeze(m.s)}, consistency_level::ALL, timeout, nullptr);
|
||||
}
|
||||
});
|
||||
}
|
||||
@@ -418,21 +421,19 @@ bool manager::end_point_hints_manager::sender::can_send() noexcept {
|
||||
}
|
||||
}
|
||||
|
||||
mutation manager::end_point_hints_manager::sender::get_mutation(lw_shared_ptr<send_one_file_ctx> ctx_ptr, temporary_buffer<char>& buf) {
|
||||
frozen_mutation_and_schema manager::end_point_hints_manager::sender::get_mutation(lw_shared_ptr<send_one_file_ctx> ctx_ptr, temporary_buffer<char>& buf) {
|
||||
hint_entry_reader hr(buf);
|
||||
auto& fm = hr.mutation();
|
||||
auto& cm = get_column_mapping(std::move(ctx_ptr), fm, hr);
|
||||
auto& cf = _db.find_column_family(fm.column_family_id());
|
||||
auto schema = _db.find_schema(fm.column_family_id());
|
||||
|
||||
if (cf.schema()->version() != fm.schema_version()) {
|
||||
mutation m(cf.schema(), fm.decorated_key(*cf.schema()));
|
||||
converting_mutation_partition_applier v(cm, *cf.schema(), m.partition());
|
||||
if (schema->version() != fm.schema_version()) {
|
||||
mutation m(schema, fm.decorated_key(*schema));
|
||||
converting_mutation_partition_applier v(cm, *schema, m.partition());
|
||||
fm.partition().accept(cm, v);
|
||||
|
||||
return std::move(m);
|
||||
} else {
|
||||
return fm.unfreeze(cf.schema());
|
||||
return {freeze(m), std::move(schema)};
|
||||
}
|
||||
return {std::move(hr).mutation(), std::move(schema)};
|
||||
}
|
||||
|
||||
const column_mapping& manager::end_point_hints_manager::sender::get_column_mapping(lw_shared_ptr<send_one_file_ctx> ctx_ptr, const frozen_mutation& fm, const hint_entry_reader& hr) {
|
||||
@@ -502,7 +503,7 @@ bool manager::check_dc_for(ep_key_type ep) const noexcept {
|
||||
}
|
||||
|
||||
void manager::drain_for(gms::inet_address endpoint) {
|
||||
if (_stopping) {
|
||||
if (stopping()) {
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -543,6 +544,7 @@ manager::end_point_hints_manager::sender::sender(end_point_hints_manager& parent
|
||||
, _resource_manager(_shard_manager._resource_manager)
|
||||
, _proxy(local_storage_proxy)
|
||||
, _db(local_db)
|
||||
, _hints_cpu_sched_group(_db.get_streaming_scheduling_group())
|
||||
, _gossiper(local_gossiper)
|
||||
, _file_update_mutex(_ep_manager.file_update_mutex())
|
||||
{}
|
||||
@@ -555,6 +557,7 @@ manager::end_point_hints_manager::sender::sender(const sender& other, end_point_
|
||||
, _resource_manager(_shard_manager._resource_manager)
|
||||
, _proxy(other._proxy)
|
||||
, _db(other._db)
|
||||
, _hints_cpu_sched_group(other._hints_cpu_sched_group)
|
||||
, _gossiper(other._gossiper)
|
||||
, _file_update_mutex(_ep_manager.file_update_mutex())
|
||||
{}
|
||||
@@ -610,7 +613,10 @@ manager::end_point_hints_manager::sender::clock::duration manager::end_point_hin
|
||||
}
|
||||
|
||||
void manager::end_point_hints_manager::sender::start() {
|
||||
_stopped = seastar::async([this] {
|
||||
seastar::thread_attributes attr;
|
||||
|
||||
attr.sched_group = _hints_cpu_sched_group;
|
||||
_stopped = seastar::async(std::move(attr), [this] {
|
||||
manager_logger.trace("ep_manager({})::sender: started", end_point_key());
|
||||
while (!stopping()) {
|
||||
try {
|
||||
@@ -630,10 +636,11 @@ void manager::end_point_hints_manager::sender::start() {
|
||||
});
|
||||
}
|
||||
|
||||
future<> manager::end_point_hints_manager::sender::send_one_mutation(mutation m) {
|
||||
keyspace& ks = _db.find_keyspace(m.schema()->ks_name());
|
||||
future<> manager::end_point_hints_manager::sender::send_one_mutation(frozen_mutation_and_schema m) {
|
||||
keyspace& ks = _db.find_keyspace(m.s->ks_name());
|
||||
auto& rs = ks.get_replication_strategy();
|
||||
std::vector<gms::inet_address> natural_endpoints = rs.get_natural_endpoints(m.token());
|
||||
auto token = dht::global_partitioner().get_token(*m.s, m.fm.key(*m.s));
|
||||
std::vector<gms::inet_address> natural_endpoints = rs.get_natural_endpoints(std::move(token));
|
||||
|
||||
return do_send_one_mutation(std::move(m), natural_endpoints);
|
||||
}
|
||||
@@ -651,8 +658,8 @@ future<> manager::end_point_hints_manager::sender::send_one_hint(lw_shared_ptr<s
|
||||
return make_ready_future<>();
|
||||
}
|
||||
|
||||
mutation m = this->get_mutation(ctx_ptr, buf);
|
||||
gc_clock::duration gc_grace_sec = m.schema()->gc_grace_seconds();
|
||||
auto m = this->get_mutation(ctx_ptr, buf);
|
||||
gc_clock::duration gc_grace_sec = m.s->gc_grace_seconds();
|
||||
|
||||
// The hint is too old - drop it.
|
||||
//
|
||||
@@ -693,7 +700,7 @@ bool manager::end_point_hints_manager::sender::send_one_file(const sstring& fnam
|
||||
lw_shared_ptr<send_one_file_ctx> ctx_ptr = make_lw_shared<send_one_file_ctx>();
|
||||
|
||||
try {
|
||||
auto s = commitlog::read_log_file(fname, [this, secs_since_file_mod, &fname, ctx_ptr] (temporary_buffer<char> buf, db::replay_position rp) mutable {
|
||||
auto s = commitlog::read_log_file(fname, service::get_local_streaming_read_priority(), [this, secs_since_file_mod, &fname, ctx_ptr] (temporary_buffer<char> buf, db::replay_position rp) mutable {
|
||||
// Check that we can still send the next hint. Don't try to send it if the destination host
|
||||
// is DOWN or if we have already failed to send some of the previous hints.
|
||||
if (!draining() && ctx_ptr->state.contains(send_state::segment_replay_failed)) {
|
||||
@@ -759,7 +766,7 @@ void manager::end_point_hints_manager::sender::send_hints_maybe() noexcept {
|
||||
int replayed_segments_count = 0;
|
||||
|
||||
try {
|
||||
while (have_segments()) {
|
||||
while (replay_allowed() && have_segments()) {
|
||||
if (!send_one_file(*_segments_to_replay.begin())) {
|
||||
break;
|
||||
}
|
||||
@@ -784,14 +791,24 @@ void manager::end_point_hints_manager::sender::send_hints_maybe() noexcept {
|
||||
manager_logger.trace("send_hints(): we handled {} segments", replayed_segments_count);
|
||||
}
|
||||
|
||||
template<typename Func>
|
||||
static future<> scan_for_hints_dirs(const sstring& hints_directory, Func&& f) {
|
||||
return lister::scan_dir(hints_directory, { directory_entry_type::directory }, [f = std::forward<Func>(f)] (lister::path dir, directory_entry de) {
|
||||
try {
|
||||
return f(std::move(dir), std::move(de), std::stoi(de.name.c_str()));
|
||||
} catch (std::invalid_argument& ex) {
|
||||
manager_logger.debug("Ignore invalid directory {}", de.name);
|
||||
return make_ready_future<>();
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
// runs in seastar::async context
|
||||
manager::hints_segments_map manager::get_current_hints_segments(const sstring& hints_directory) {
|
||||
hints_segments_map current_hints_segments;
|
||||
|
||||
// shards level
|
||||
lister::scan_dir(hints_directory, { directory_entry_type::directory }, [&current_hints_segments] (lister::path dir, directory_entry de) {
|
||||
unsigned shard_id = std::stoi(de.name.c_str());
|
||||
|
||||
scan_for_hints_dirs(hints_directory, [&current_hints_segments] (lister::path dir, directory_entry de, unsigned shard_id) {
|
||||
manager_logger.trace("shard_id = {}", shard_id);
|
||||
// IPs level
|
||||
return lister::scan_dir(dir / de.name.c_str(), { directory_entry_type::directory }, [&current_hints_segments, shard_id] (lister::path dir, directory_entry de) {
|
||||
@@ -908,9 +925,7 @@ void manager::rebalance_segments_for(
|
||||
// runs in seastar::async context
|
||||
void manager::remove_irrelevant_shards_directories(const sstring& hints_directory) {
|
||||
// shards level
|
||||
lister::scan_dir(hints_directory, { directory_entry_type::directory }, [] (lister::path dir, directory_entry de) {
|
||||
unsigned shard_id = std::stoi(de.name.c_str());
|
||||
|
||||
scan_for_hints_dirs(hints_directory, [] (lister::path dir, directory_entry de, unsigned shard_id) {
|
||||
if (shard_id >= smp::count) {
|
||||
// IPs level
|
||||
return lister::scan_dir(dir / de.name.c_str(), { directory_entry_type::directory, directory_entry_type::regular }, lister::show_hidden::yes, [] (lister::path dir, directory_entry de) {
|
||||
@@ -936,5 +951,15 @@ future<> manager::rebalance(sstring hints_directory) {
|
||||
});
|
||||
}
|
||||
|
||||
void manager::update_backlog(size_t backlog, size_t max_backlog) {
|
||||
_backlog_size = backlog;
|
||||
_max_backlog_size = max_backlog;
|
||||
if (backlog < max_backlog) {
|
||||
allow_hints();
|
||||
} else {
|
||||
forbid_hints_for_eps_with_pending_hints();
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@@ -69,6 +69,8 @@ private:
|
||||
class drain_tag {};
|
||||
using drain = seastar::bool_class<drain_tag>;
|
||||
|
||||
friend class space_watchdog;
|
||||
|
||||
public:
|
||||
class end_point_hints_manager {
|
||||
public:
|
||||
@@ -119,6 +121,7 @@ public:
|
||||
resource_manager& _resource_manager;
|
||||
service::storage_proxy& _proxy;
|
||||
database& _db;
|
||||
seastar::scheduling_group _hints_cpu_sched_group;
|
||||
gms::gossiper& _gossiper;
|
||||
seastar::shared_mutex& _file_update_mutex;
|
||||
|
||||
@@ -179,6 +182,10 @@ public:
|
||||
return _state.contains(state::stopping);
|
||||
}
|
||||
|
||||
bool replay_allowed() const noexcept {
|
||||
return _ep_manager.replay_allowed();
|
||||
}
|
||||
|
||||
/// \brief Try to send one hint read from the file.
|
||||
/// - Limit the maximum memory size of hints "in the air" and the maximum total number of hints "in the air".
|
||||
/// - Discard the hints that are older than the grace seconds value of the corresponding table.
|
||||
@@ -210,7 +217,7 @@ public:
|
||||
/// \param ctx_ptr pointer to the send context
|
||||
/// \param buf hints file entry
|
||||
/// \return The mutation object representing the original mutation stored in the hints file.
|
||||
mutation get_mutation(lw_shared_ptr<send_one_file_ctx> ctx_ptr, temporary_buffer<char>& buf);
|
||||
frozen_mutation_and_schema get_mutation(lw_shared_ptr<send_one_file_ctx> ctx_ptr, temporary_buffer<char>& buf);
|
||||
|
||||
/// \brief Get a reference to the column_mapping object for a given frozen mutation.
|
||||
/// \param ctx_ptr pointer to the send context
|
||||
@@ -227,13 +234,13 @@ public:
|
||||
/// \param m mutation to send
|
||||
/// \param natural_endpoints current replicas for the given mutation
|
||||
/// \return future that resolves when the operation is complete
|
||||
future<> do_send_one_mutation(mutation m, const std::vector<gms::inet_address>& natural_endpoints) noexcept;
|
||||
future<> do_send_one_mutation(frozen_mutation_and_schema m, const std::vector<gms::inet_address>& natural_endpoints) noexcept;
|
||||
|
||||
/// \brief Send one mutation out.
|
||||
///
|
||||
/// \param m mutation to send
|
||||
/// \return future that resolves when the mutation sending processing is complete.
|
||||
future<> send_one_mutation(mutation m);
|
||||
future<> send_one_mutation(frozen_mutation_and_schema m);
|
||||
|
||||
/// \brief Get the last modification time stamp for a given file.
|
||||
/// \param fname File name
|
||||
@@ -328,6 +335,10 @@ public:
|
||||
return _hints_in_progress;
|
||||
}
|
||||
|
||||
bool replay_allowed() const noexcept {
|
||||
return _shard_manager.replay_allowed();
|
||||
}
|
||||
|
||||
bool can_hint() const noexcept {
|
||||
return _state.contains(state::can_hint);
|
||||
}
|
||||
@@ -393,6 +404,17 @@ public:
|
||||
}
|
||||
};
|
||||
|
||||
enum class state {
|
||||
started, // hinting is currently allowed (start() call is complete)
|
||||
replay_allowed, // replaying (hints sending) is allowed
|
||||
stopping // hinting is not allowed - stopping is in progress (stop() method has been called)
|
||||
};
|
||||
|
||||
using state_set = enum_set<super_enum<state,
|
||||
state::started,
|
||||
state::replay_allowed,
|
||||
state::stopping>>;
|
||||
|
||||
private:
|
||||
using ep_key_type = typename end_point_hints_manager::key_type;
|
||||
using ep_managers_map_type = std::unordered_map<ep_key_type, end_point_hints_manager>;
|
||||
@@ -403,6 +425,7 @@ public:
|
||||
static const std::chrono::seconds hint_file_write_timeout;
|
||||
|
||||
private:
|
||||
state_set _state;
|
||||
const boost::filesystem::path _hints_dir;
|
||||
dev_t _hints_dir_device_id = 0;
|
||||
|
||||
@@ -414,7 +437,7 @@ private:
|
||||
locator::snitch_ptr& _local_snitch_ptr;
|
||||
int64_t _max_hint_window_us = 0;
|
||||
database& _local_db;
|
||||
bool _stopping = false;
|
||||
|
||||
seastar::gate _draining_eps_gate; // gate used to control the progress of ep_managers stopping not in the context of manager::stop() call
|
||||
|
||||
resource_manager& _resource_manager;
|
||||
@@ -424,9 +447,14 @@ private:
|
||||
seastar::metrics::metric_groups _metrics;
|
||||
std::unordered_set<ep_key_type> _eps_with_pending_hints;
|
||||
|
||||
size_t _max_backlog_size;
|
||||
size_t _backlog_size;
|
||||
|
||||
public:
|
||||
manager(sstring hints_directory, std::vector<sstring> hinted_dcs, int64_t max_hint_window_ms, resource_manager&res_manager, distributed<database>& db);
|
||||
virtual ~manager();
|
||||
manager(manager&&) = delete;
|
||||
manager& operator=(manager&&) = delete;
|
||||
void register_metrics(const sstring& group_name);
|
||||
future<> start(shared_ptr<service::storage_proxy> proxy_ptr, shared_ptr<gms::gossiper> gossiper_ptr, shared_ptr<service::storage_service> ss_ptr);
|
||||
future<> stop();
|
||||
@@ -503,6 +531,18 @@ public:
|
||||
void forbid_hints();
|
||||
void forbid_hints_for_eps_with_pending_hints();
|
||||
|
||||
size_t max_backlog_size() const {
|
||||
return _max_backlog_size;
|
||||
}
|
||||
|
||||
size_t backlog_size() const {
|
||||
return _backlog_size;
|
||||
}
|
||||
|
||||
void allow_replaying() noexcept {
|
||||
_state.set(state::replay_allowed);
|
||||
}
|
||||
|
||||
/// \brief Rebalance hints segments among all present shards.
|
||||
///
|
||||
/// The difference between the number of segments on every two shards will not be greater than 1 after the
|
||||
@@ -616,6 +656,28 @@ private:
|
||||
/// \param endpoint node that left the cluster
|
||||
void drain_for(gms::inet_address endpoint);
|
||||
|
||||
void update_backlog(size_t backlog, size_t max_backlog);
|
||||
|
||||
bool stopping() const noexcept {
|
||||
return _state.contains(state::stopping);
|
||||
}
|
||||
|
||||
void set_stopping() noexcept {
|
||||
_state.set(state::stopping);
|
||||
}
|
||||
|
||||
bool started() const noexcept {
|
||||
return _state.contains(state::started);
|
||||
}
|
||||
|
||||
void set_started() noexcept {
|
||||
_state.set(state::started);
|
||||
}
|
||||
|
||||
bool replay_allowed() const noexcept {
|
||||
return _state.contains(state::replay_allowed);
|
||||
}
|
||||
|
||||
public:
|
||||
ep_managers_map_type::iterator find_ep_manager(ep_key_type ep_key) noexcept {
|
||||
return _ep_managers.find(ep_key);
|
||||
|
||||
@@ -27,6 +27,7 @@
|
||||
#include "lister.hh"
|
||||
#include "disk-error-handler.hh"
|
||||
#include "seastarx.hh"
|
||||
#include <seastar/core/sleep.hh>
|
||||
|
||||
namespace db {
|
||||
namespace hints {
|
||||
@@ -65,19 +66,28 @@ const std::chrono::seconds space_watchdog::_watchdog_period = std::chrono::secon
|
||||
space_watchdog::space_watchdog(shard_managers_set& managers, per_device_limits_map& per_device_limits_map)
|
||||
: _shard_managers(managers)
|
||||
, _per_device_limits_map(per_device_limits_map)
|
||||
, _timer([this] { on_timer(); })
|
||||
{}
|
||||
|
||||
void space_watchdog::start() {
|
||||
_timer.arm(timer_clock_type::now());
|
||||
_started = seastar::async([this] {
|
||||
while (!_as.abort_requested()) {
|
||||
try {
|
||||
on_timer();
|
||||
} catch (...) {
|
||||
resource_manager_logger.trace("space_watchdog: unexpected exception - stop all hints generators");
|
||||
// Stop all hint generators if space_watchdog callback failed
|
||||
for (manager& shard_manager : _shard_managers) {
|
||||
shard_manager.forbid_hints();
|
||||
}
|
||||
}
|
||||
seastar::sleep_abortable(_watchdog_period, _as).get();
|
||||
}
|
||||
}).handle_exception_type([] (const seastar::sleep_aborted& ignored) { });
|
||||
}
|
||||
|
||||
future<> space_watchdog::stop() noexcept {
|
||||
try {
|
||||
return _gate.close().finally([this] { _timer.cancel(); });
|
||||
} catch (...) {
|
||||
return make_exception_future<>(std::current_exception());
|
||||
}
|
||||
_as.request_abort();
|
||||
return std::move(_started);
|
||||
}
|
||||
|
||||
future<> space_watchdog::scan_one_ep_dir(boost::filesystem::path path, manager& shard_manager, ep_key_type ep_key) {
|
||||
@@ -94,83 +104,62 @@ future<> space_watchdog::scan_one_ep_dir(boost::filesystem::path path, manager&
|
||||
});
|
||||
}
|
||||
|
||||
// Called from the context of a seastar::thread.
|
||||
void space_watchdog::on_timer() {
|
||||
with_gate(_gate, [this] {
|
||||
return futurize_apply([this] {
|
||||
_total_size = 0;
|
||||
// The hints directories are organized as follows:
|
||||
// <hints root>
|
||||
// |- <shard1 ID>
|
||||
// | |- <EP1 address>
|
||||
// | |- <hints file1>
|
||||
// | |- <hints file2>
|
||||
// | |- ...
|
||||
// | |- <EP2 address>
|
||||
// | |- ...
|
||||
// | |-...
|
||||
// |- <shard2 ID>
|
||||
// | |- ...
|
||||
// ...
|
||||
// |- <shardN ID>
|
||||
// | |- ...
|
||||
//
|
||||
|
||||
return do_for_each(_shard_managers, [this] (manager& shard_manager) {
|
||||
shard_manager.clear_eps_with_pending_hints();
|
||||
|
||||
// The hints directories are organized as follows:
|
||||
// <hints root>
|
||||
// |- <shard1 ID>
|
||||
// | |- <EP1 address>
|
||||
// | |- <hints file1>
|
||||
// | |- <hints file2>
|
||||
// | |- ...
|
||||
// | |- <EP2 address>
|
||||
// | |- ...
|
||||
// | |-...
|
||||
// |- <shard2 ID>
|
||||
// | |- ...
|
||||
// ...
|
||||
// |- <shardN ID>
|
||||
// | |- ...
|
||||
for (auto& per_device_limits : _per_device_limits_map | boost::adaptors::map_values) {
|
||||
_total_size = 0;
|
||||
for (manager& shard_manager : per_device_limits.managers) {
|
||||
shard_manager.clear_eps_with_pending_hints();
|
||||
lister::scan_dir(shard_manager.hints_dir(), {directory_entry_type::directory}, [this, &shard_manager] (lister::path dir, directory_entry de) {
|
||||
_files_count = 0;
|
||||
// Let's scan per-end-point directories and enumerate hints files...
|
||||
//
|
||||
return lister::scan_dir(shard_manager.hints_dir(), {directory_entry_type::directory}, [this, &shard_manager] (lister::path dir, directory_entry de) {
|
||||
_files_count = 0;
|
||||
// Let's scan per-end-point directories and enumerate hints files...
|
||||
//
|
||||
// Let's check if there is a corresponding end point manager (may not exist if the corresponding DC is
|
||||
// not hintable).
|
||||
// If exists - let's take a file update lock so that files are not changed under our feet. Otherwise, simply
|
||||
// continue to enumeration - there is no one to change them.
|
||||
auto it = shard_manager.find_ep_manager(de.name);
|
||||
if (it != shard_manager.ep_managers_end()) {
|
||||
return with_lock(it->second.file_update_mutex(), [this, &shard_manager, dir = std::move(dir), ep_name = std::move(de.name)]() mutable {
|
||||
return scan_one_ep_dir(dir / ep_name.c_str(), shard_manager, ep_key_type(ep_name));
|
||||
});
|
||||
} else {
|
||||
return scan_one_ep_dir(dir / de.name.c_str(), shard_manager, ep_key_type(de.name));
|
||||
}
|
||||
});
|
||||
}).then([this] {
|
||||
return do_for_each(_per_device_limits_map, [this](per_device_limits_map::value_type& per_device_limits_entry) {
|
||||
space_watchdog::per_device_limits& per_device_limits = per_device_limits_entry.second;
|
||||
|
||||
size_t adjusted_quota = 0;
|
||||
size_t delta = boost::accumulate(per_device_limits.managers, 0, [] (size_t sum, manager& shard_manager) {
|
||||
return sum + shard_manager.ep_managers_size() * resource_manager::hint_segment_size_in_mb * 1024 * 1024;
|
||||
// Let's check if there is a corresponding end point manager (may not exist if the corresponding DC is
|
||||
// not hintable).
|
||||
// If exists - let's take a file update lock so that files are not changed under our feet. Otherwise, simply
|
||||
// continue to enumeration - there is no one to change them.
|
||||
auto it = shard_manager.find_ep_manager(de.name);
|
||||
if (it != shard_manager.ep_managers_end()) {
|
||||
return with_lock(it->second.file_update_mutex(), [this, &shard_manager, dir = std::move(dir), ep_name = std::move(de.name)]() mutable {
|
||||
return scan_one_ep_dir(dir / ep_name.c_str(), shard_manager, ep_key_type(ep_name));
|
||||
});
|
||||
if (per_device_limits.max_shard_disk_space_size > delta) {
|
||||
adjusted_quota = per_device_limits.max_shard_disk_space_size - delta;
|
||||
}
|
||||
} else {
|
||||
return scan_one_ep_dir(dir / de.name.c_str(), shard_manager, ep_key_type(de.name));
|
||||
}
|
||||
}).get();
|
||||
}
|
||||
|
||||
bool can_hint = _total_size < adjusted_quota;
|
||||
resource_manager_logger.trace("space_watchdog: total_size ({}) {} max_shard_disk_space_size ({})", _total_size, can_hint ? "<" : ">=", adjusted_quota);
|
||||
|
||||
if (!can_hint) {
|
||||
for (manager& shard_manager : per_device_limits.managers) {
|
||||
shard_manager.forbid_hints_for_eps_with_pending_hints();
|
||||
}
|
||||
} else {
|
||||
for (manager& shard_manager : per_device_limits.managers) {
|
||||
shard_manager.allow_hints();
|
||||
}
|
||||
}
|
||||
});
|
||||
});
|
||||
}).handle_exception([this] (auto eptr) {
|
||||
resource_manager_logger.trace("space_watchdog: unexpected exception - stop all hints generators");
|
||||
// Stop all hint generators if space_watchdog callback failed
|
||||
for (manager& shard_manager : _shard_managers) {
|
||||
shard_manager.forbid_hints();
|
||||
}
|
||||
}).finally([this] {
|
||||
_timer.arm(_watchdog_period);
|
||||
// Adjust the quota to take into account the space we guarantee to every end point manager
|
||||
size_t adjusted_quota = 0;
|
||||
size_t delta = boost::accumulate(per_device_limits.managers, 0, [] (size_t sum, manager& shard_manager) {
|
||||
return sum + shard_manager.ep_managers_size() * resource_manager::hint_segment_size_in_mb * 1024 * 1024;
|
||||
});
|
||||
});
|
||||
if (per_device_limits.max_shard_disk_space_size > delta) {
|
||||
adjusted_quota = per_device_limits.max_shard_disk_space_size - delta;
|
||||
}
|
||||
|
||||
resource_manager_logger.trace("space_watchdog: consuming {}/{} bytes", _total_size, adjusted_quota);
|
||||
for (manager& shard_manager : per_device_limits.managers) {
|
||||
shard_manager.update_backlog(_total_size, adjusted_quota);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
future<> resource_manager::start(shared_ptr<service::storage_proxy> proxy_ptr, shared_ptr<gms::gossiper> gossiper_ptr, shared_ptr<service::storage_service> ss_ptr) {
|
||||
@@ -183,6 +172,10 @@ future<> resource_manager::start(shared_ptr<service::storage_proxy> proxy_ptr, s
|
||||
});
|
||||
}
|
||||
|
||||
void resource_manager::allow_replaying() noexcept {
|
||||
boost::for_each(_shard_managers, [] (manager& m) { m.allow_replaying(); });
|
||||
}
|
||||
|
||||
future<> resource_manager::stop() noexcept {
|
||||
return parallel_for_each(_shard_managers, [](manager& m) {
|
||||
return m.stop();
|
||||
@@ -201,14 +194,18 @@ future<> resource_manager::prepare_per_device_limits() {
|
||||
auto it = _per_device_limits_map.find(device_id);
|
||||
if (it == _per_device_limits_map.end()) {
|
||||
return is_mountpoint(shard_manager.hints_dir().parent_path()).then([this, device_id, &shard_manager](bool is_mountpoint) {
|
||||
// By default, give each group of managers 10% of the available disk space. Give each shard an equal share of the available space.
|
||||
size_t max_size = boost::filesystem::space(shard_manager.hints_dir().c_str()).capacity / (10 * smp::count);
|
||||
// If hints directory is a mountpoint, we assume it's on dedicated (i.e. not shared with data/commitlog/etc) storage.
|
||||
// Then, reserve 90% of all space instead of 10% above.
|
||||
if (is_mountpoint) {
|
||||
max_size *= 9;
|
||||
auto [it, inserted] = _per_device_limits_map.emplace(device_id, space_watchdog::per_device_limits{});
|
||||
// Since we possibly deferred, we need to recheck the _per_device_limits_map.
|
||||
if (inserted) {
|
||||
// By default, give each group of managers 10% of the available disk space. Give each shard an equal share of the available space.
|
||||
it->second.max_shard_disk_space_size = boost::filesystem::space(shard_manager.hints_dir().c_str()).capacity / (10 * smp::count);
|
||||
// If hints directory is a mountpoint, we assume it's on dedicated (i.e. not shared with data/commitlog/etc) storage.
|
||||
// Then, reserve 90% of all space instead of 10% above.
|
||||
if (is_mountpoint) {
|
||||
it->second.max_shard_disk_space_size *= 9;
|
||||
}
|
||||
}
|
||||
_per_device_limits_map.emplace(device_id, space_watchdog::per_device_limits{{std::ref(shard_manager)}, max_size});
|
||||
it->second.managers.emplace_back(std::ref(shard_manager));
|
||||
});
|
||||
} else {
|
||||
it->second.managers.emplace_back(std::ref(shard_manager));
|
||||
|
||||
@@ -22,6 +22,7 @@
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <seastar/core/abort_source.hh>
|
||||
#include <seastar/core/semaphore.hh>
|
||||
#include <seastar/core/gate.hh>
|
||||
#include <seastar/core/memory.hh>
|
||||
@@ -78,8 +79,8 @@ private:
|
||||
shard_managers_set& _shard_managers;
|
||||
per_device_limits_map& _per_device_limits_map;
|
||||
|
||||
seastar::gate _gate;
|
||||
seastar::timer<timer_clock_type> _timer;
|
||||
future<> _started = make_ready_future<>();
|
||||
seastar::abort_source _as;
|
||||
int _files_count = 0;
|
||||
|
||||
public:
|
||||
@@ -137,6 +138,9 @@ public:
|
||||
, _space_watchdog(_shard_managers, _per_device_limits_map)
|
||||
{}
|
||||
|
||||
resource_manager(resource_manager&&) = delete;
|
||||
resource_manager& operator=(resource_manager&&) = delete;
|
||||
|
||||
future<semaphore_units<semaphore_default_exception_factory>> get_send_units_for(size_t buf_size);
|
||||
|
||||
bool too_many_hints_in_progress() const {
|
||||
@@ -156,6 +160,7 @@ public:
|
||||
}
|
||||
|
||||
future<> start(shared_ptr<service::storage_proxy> proxy_ptr, shared_ptr<gms::gossiper> gossiper_ptr, shared_ptr<service::storage_service> ss_ptr);
|
||||
void allow_replaying() noexcept;
|
||||
future<> stop() noexcept;
|
||||
void register_manager(manager& m);
|
||||
future<> prepare_per_device_limits();
|
||||
|
||||
@@ -87,7 +87,7 @@ future<> system_distributed_keyspace::start() {
|
||||
return do_with(all_tables(), [this] (std::vector<schema_ptr>& tables) {
|
||||
return do_for_each(tables, [this] (schema_ptr table) {
|
||||
return ignore_existing([this, table = std::move(table)] {
|
||||
return _mm.announce_new_column_family(std::move(table), false);
|
||||
return _mm.announce_new_column_family(std::move(table), api::min_timestamp, false);
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
@@ -931,7 +931,7 @@ future<> mutate_MV(const dht::token& base_token, std::vector<mutation> mutations
|
||||
auto fs = std::make_unique<std::vector<future<>>>();
|
||||
for (auto& mut : mutations) {
|
||||
auto view_token = mut.token();
|
||||
auto keyspace_name = mut.schema()->ks_name();
|
||||
auto& keyspace_name = mut.schema()->ks_name();
|
||||
auto paired_endpoint = get_view_natural_endpoint(keyspace_name, base_token, view_token);
|
||||
auto pending_endpoints = service::get_local_storage_service().get_token_metadata().pending_endpoints_for(view_token, keyspace_name);
|
||||
if (paired_endpoint) {
|
||||
@@ -951,10 +951,19 @@ future<> mutate_MV(const dht::token& base_token, std::vector<mutation> mutations
|
||||
// do not wait for it to complete.
|
||||
// Note also that mutate_locally(mut) copies mut (in
|
||||
// frozen form) so don't need to increase its lifetime.
|
||||
fs->push_back(service::get_local_storage_proxy().mutate_locally(mut).handle_exception([&stats] (auto ep) {
|
||||
vlogger.error("Error applying local view update: {}", ep);
|
||||
stats.view_updates_failed_local++;
|
||||
return make_exception_future<>(std::move(ep));
|
||||
// send_to_endpoint() below updates statistics on pending
|
||||
// writes but mutate_locally() doesn't, so we need to do that here.
|
||||
++stats.writes;
|
||||
fs->push_back(service::get_local_storage_proxy().mutate_locally(mut).then_wrapped([&stats] (auto&& fut) {
|
||||
--stats.writes;
|
||||
if (fut.failed()) {
|
||||
auto ep = fut.get_exception();
|
||||
vlogger.error("Error applying local view update: {}", ep);
|
||||
++stats.view_updates_failed_local;
|
||||
return make_exception_future<>(std::move(ep));
|
||||
} else {
|
||||
return make_ready_future<>();
|
||||
}
|
||||
}));
|
||||
} else {
|
||||
vlogger.debug("Sending view update to endpoint {}, with pending endpoints = {}", *paired_endpoint, pending_endpoints);
|
||||
@@ -1226,6 +1235,20 @@ future<> view_builder::calculate_shard_build_step(
|
||||
}
|
||||
}
|
||||
|
||||
// All shards need to arrive at the same decisions on whether or not to
|
||||
// restart a view build at some common token (reshard), and which token
|
||||
// to restart at. So we need to wait until all shards have read the view
|
||||
// build statuses before they can all proceed to make the (same) decision.
|
||||
// If we don't synchronize here, a fast shard may make a decision, start
|
||||
// building and finish a build step - before the slowest shard even read
|
||||
// the view build information.
|
||||
container().invoke_on(0, [] (view_builder& builder) {
|
||||
if (++builder._shards_finished_read == smp::count) {
|
||||
builder._shards_finished_read_promise.set_value();
|
||||
}
|
||||
return builder._shards_finished_read_promise.get_shared_future();
|
||||
}).get();
|
||||
|
||||
std::unordered_set<utils::UUID> loaded_views;
|
||||
if (view_build_status_per_shard.size() != smp::count) {
|
||||
reshard(std::move(view_build_status_per_shard), loaded_views);
|
||||
@@ -1591,10 +1614,10 @@ future<> view_builder::maybe_mark_view_as_built(view_ptr view, dht::token next_t
|
||||
});
|
||||
}
|
||||
|
||||
future<> view_builder::wait_until_built(const sstring& ks_name, const sstring& view_name, lowres_clock::time_point timeout) {
|
||||
return container().invoke_on(0, [ks_name, view_name, timeout] (view_builder& builder) {
|
||||
future<> view_builder::wait_until_built(const sstring& ks_name, const sstring& view_name) {
|
||||
return container().invoke_on(0, [ks_name, view_name] (view_builder& builder) {
|
||||
auto v = std::pair(std::move(ks_name), std::move(view_name));
|
||||
return builder._build_notifiers[std::move(v)].get_shared_future(timeout);
|
||||
return builder._build_notifiers[std::move(v)].get_shared_future();
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
@@ -151,6 +151,10 @@ class view_builder final : public service::migration_listener::only_view_notific
|
||||
future<> _started = make_ready_future<>();
|
||||
// Used to coordinate between shards the conclusion of the build process for a particular view.
|
||||
std::unordered_set<utils::UUID> _built_views;
|
||||
// Counter and promise (both on shard 0 only!) allowing to wait for all
|
||||
// shards to have read the view build statuses
|
||||
unsigned _shards_finished_read = 0;
|
||||
seastar::shared_promise<> _shards_finished_read_promise;
|
||||
// Used for testing.
|
||||
std::unordered_map<std::pair<sstring, sstring>, seastar::shared_promise<>, utils::tuple_hash> _build_notifiers;
|
||||
|
||||
@@ -178,7 +182,7 @@ public:
|
||||
virtual void on_drop_view(const sstring& ks_name, const sstring& view_name) override;
|
||||
|
||||
// For tests
|
||||
future<> wait_until_built(const sstring& ks_name, const sstring& view_name, lowres_clock::time_point timeout);
|
||||
future<> wait_until_built(const sstring& ks_name, const sstring& view_name);
|
||||
|
||||
private:
|
||||
build_step& get_or_create_build_step(utils::UUID);
|
||||
|
||||
66
db/view/view_update_from_staging_generator.cc
Normal file
66
db/view/view_update_from_staging_generator.cc
Normal file
@@ -0,0 +1,66 @@
|
||||
/*
|
||||
* Copyright (C) 2018 ScyllaDB
|
||||
*/
|
||||
|
||||
/*
|
||||
* This file is part of Scylla.
|
||||
*
|
||||
* Scylla is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Affero General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* Scylla is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#include "view_update_from_staging_generator.hh"
|
||||
|
||||
namespace db::view {
|
||||
|
||||
|
||||
// Spawns a seastar::async thread that drains _sstables_with_tables from the
// front and stores the thread's future in _started. For each queued entry the
// staging sstable is read and fed to a view_updating_consumer; when the
// consumer reports stop_iteration::no the sstable is moved out of staging, one
// unit is returned to _registration_sem and the entry is popped.
// The thread exits once the queue is empty or an abort has been requested.
// The future returned to the caller is already resolved.
future<> view_update_from_staging_generator::start() {
    _started = seastar::async([this] {
        for (;;) {
            if (_sstables_with_tables.empty()) {
                return;
            }
            auto& pending = _sstables_with_tables.front();
            schema_ptr base_schema = pending.t->schema();
            if (_as.abort_requested()) {
                return;
            }
            flat_mutation_reader reader = pending.sst->read_rows_flat(base_schema);
            auto outcome = reader.consume_in_thread(
                    view_updating_consumer(base_schema, _proxy, pending.sst, _as), db::no_timeout);
            if (outcome == stop_iteration::no) {
                // Fully consumed: publish the sstable and free a registration slot.
                pending.t->move_sstable_from_staging_in_thread(pending.sst);
                _registration_sem.signal();
                _sstables_with_tables.pop_front();
            }
            // Otherwise the entry stays queued and the loop conditions are re-evaluated.
        }
    });
    return make_ready_future<>();
}
|
||||
|
||||
// Requests an abort through _as and hands the worker's future (_started) over
// to the caller, so the caller can wait for the worker to wind down.
future<> view_update_from_staging_generator::stop() {
    _as.request_abort();
    future<> worker_done = std::move(_started);
    return worker_done;
}
|
||||
|
||||
// Queues a staging sstable together with its table for view update generation.
// If an abort was already requested the entry is still queued, but the call
// returns a resolved future immediately. Otherwise the worker is (re)started
// when its previous future (_started) has resolved, and the returned future
// completes once one unit could be taken from _registration_sem.
future<> view_update_from_staging_generator::register_staging_sstable(sstables::shared_sstable sst, lw_shared_ptr<table> table) {
    _sstables_with_tables.emplace_back(std::move(sst), std::move(table));
    if (_as.abort_requested()) {
        return make_ready_future<>();
    }
    // start() is only invoked when no worker is currently in flight.
    future<> worker_ready = _started.available() ? start() : make_ready_future<>();
    return worker_ready.then([this] {
        return _registration_sem.wait(1);
    });
}
|
||||
|
||||
}
|
||||
51
db/view/view_update_from_staging_generator.hh
Normal file
51
db/view/view_update_from_staging_generator.hh
Normal file
@@ -0,0 +1,51 @@
|
||||
/*
|
||||
* Copyright (C) 2018 ScyllaDB
|
||||
*/
|
||||
|
||||
/*
|
||||
* This file is part of Scylla.
|
||||
*
|
||||
* Scylla is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Affero General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* Scylla is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "database.hh"
|
||||
#include "sstables/sstables.hh"
|
||||
#include "db/view/view_updating_consumer.hh"
|
||||
|
||||
namespace db::view {
|
||||
|
||||
class view_update_from_staging_generator {
|
||||
static constexpr size_t registration_queue_size = 5;
|
||||
database& _db;
|
||||
service::storage_proxy& _proxy;
|
||||
seastar::abort_source _as;
|
||||
future<> _started = make_ready_future<>();
|
||||
semaphore _registration_sem{registration_queue_size};
|
||||
struct sstable_with_table {
|
||||
sstables::shared_sstable sst;
|
||||
lw_shared_ptr<table> t;
|
||||
sstable_with_table(sstables::shared_sstable sst, lw_shared_ptr<table> t) : sst(sst), t(t) { }
|
||||
};
|
||||
std::deque<sstable_with_table> _sstables_with_tables;
|
||||
public:
|
||||
view_update_from_staging_generator(database& db, service::storage_proxy& proxy) : _db(db), _proxy(proxy) { }
|
||||
|
||||
future<> start();
|
||||
future<> stop();
|
||||
future<> register_staging_sstable(sstables::shared_sstable sst, lw_shared_ptr<table> table);
|
||||
};
|
||||
|
||||
}
|
||||
92
db/view/view_updating_consumer.hh
Normal file
92
db/view/view_updating_consumer.hh
Normal file
@@ -0,0 +1,92 @@
|
||||
/*
|
||||
* Copyright (C) 2018 ScyllaDB
|
||||
*/
|
||||
|
||||
/*
|
||||
* This file is part of Scylla.
|
||||
*
|
||||
* Scylla is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Affero General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* Scylla is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "service/storage_proxy.hh"
|
||||
#include "dht/i_partitioner.hh"
|
||||
#include "schema.hh"
|
||||
#include "mutation_fragment.hh"
|
||||
#include "sstables/shared_sstable.hh"
|
||||
|
||||
namespace db::view {
|
||||
|
||||
/*
|
||||
* A consumer that pushes materialized view updates for each consumed mutation.
|
||||
* It is expected to be run in seastar::async threaded context through consume_in_thread()
|
||||
*/
|
||||
class view_updating_consumer {
|
||||
schema_ptr _schema;
|
||||
lw_shared_ptr<table> _table;
|
||||
sstables::shared_sstable _excluded_sstable;
|
||||
const seastar::abort_source& _as;
|
||||
std::optional<mutation> _m;
|
||||
public:
|
||||
view_updating_consumer(schema_ptr schema, service::storage_proxy& proxy, sstables::shared_sstable excluded_sstable, const seastar::abort_source& as)
|
||||
: _schema(std::move(schema))
|
||||
, _table(proxy.get_db().local().find_column_family(_schema->id()).shared_from_this())
|
||||
, _excluded_sstable(excluded_sstable)
|
||||
, _as(as)
|
||||
, _m()
|
||||
{ }
|
||||
|
||||
void consume_new_partition(const dht::decorated_key& dk) {
|
||||
_m = mutation(_schema, dk, mutation_partition(_schema));
|
||||
}
|
||||
|
||||
void consume(tombstone t) {
|
||||
_m->partition().apply(std::move(t));
|
||||
}
|
||||
|
||||
stop_iteration consume(static_row&& sr) {
|
||||
if (_as.abort_requested()) {
|
||||
return stop_iteration::yes;
|
||||
}
|
||||
_m->partition().apply(*_schema, std::move(sr));
|
||||
return stop_iteration::no;
|
||||
}
|
||||
|
||||
stop_iteration consume(clustering_row&& cr) {
|
||||
if (_as.abort_requested()) {
|
||||
return stop_iteration::yes;
|
||||
}
|
||||
_m->partition().apply(*_schema, std::move(cr));
|
||||
return stop_iteration::no;
|
||||
}
|
||||
|
||||
stop_iteration consume(range_tombstone&& rt) {
|
||||
if (_as.abort_requested()) {
|
||||
return stop_iteration::yes;
|
||||
}
|
||||
_m->partition().apply(*_schema, std::move(rt));
|
||||
return stop_iteration::no;
|
||||
}
|
||||
|
||||
// Expected to be run in seastar::async threaded context (consume_in_thread())
|
||||
stop_iteration consume_end_of_partition();
|
||||
|
||||
stop_iteration consume_end_of_stream() {
|
||||
return stop_iteration(_as.abort_requested());
|
||||
}
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
@@ -49,7 +49,7 @@ namespace dht {
|
||||
future<> boot_strapper::bootstrap() {
|
||||
blogger.debug("Beginning bootstrap process: sorted_tokens={}", _token_metadata.sorted_tokens());
|
||||
|
||||
auto streamer = make_lw_shared<range_streamer>(_db, _token_metadata, _tokens, _address, "Bootstrap");
|
||||
auto streamer = make_lw_shared<range_streamer>(_db, _token_metadata, _tokens, _address, "Bootstrap", streaming::stream_reason::bootstrap);
|
||||
streamer->add_source_filter(std::make_unique<range_streamer::failure_detector_source_filter>(gms::get_local_failure_detector()));
|
||||
for (const auto& keyspace_name : _db.local().get_non_system_keyspaces()) {
|
||||
auto& ks = _db.local().find_keyspace(keyspace_name);
|
||||
|
||||
@@ -294,7 +294,7 @@ future<> range_streamer::do_stream_async() {
|
||||
size_t nr_ranges_per_stream_plan = nr_ranges_total / 10;
|
||||
dht::token_range_vector ranges_to_stream;
|
||||
auto do_streaming = [&] {
|
||||
auto sp = stream_plan(sprint("%s-%s-index-%d", description, keyspace, sp_index++));
|
||||
auto sp = stream_plan(sprint("%s-%s-index-%d", description, keyspace, sp_index++), _reason);
|
||||
logger.info("{} with {} for keyspace={}, {} out of {} ranges: ranges = {}",
|
||||
description, source, keyspace, nr_ranges_streamed, nr_ranges_total, ranges_to_stream.size());
|
||||
if (_nr_rx_added) {
|
||||
|
||||
@@ -42,6 +42,7 @@
|
||||
#include "locator/snitch_base.hh"
|
||||
#include "streaming/stream_plan.hh"
|
||||
#include "streaming/stream_state.hh"
|
||||
#include "streaming/stream_reason.hh"
|
||||
#include "gms/inet_address.hh"
|
||||
#include "gms/i_failure_detector.hh"
|
||||
#include "range.hh"
|
||||
@@ -101,17 +102,18 @@ public:
|
||||
}
|
||||
};
|
||||
|
||||
range_streamer(distributed<database>& db, token_metadata& tm, std::unordered_set<token> tokens, inet_address address, sstring description)
|
||||
range_streamer(distributed<database>& db, token_metadata& tm, std::unordered_set<token> tokens, inet_address address, sstring description, streaming::stream_reason reason)
|
||||
: _db(db)
|
||||
, _metadata(tm)
|
||||
, _tokens(std::move(tokens))
|
||||
, _address(address)
|
||||
, _description(std::move(description))
|
||||
, _reason(reason)
|
||||
, _stream_plan(_description) {
|
||||
}
|
||||
|
||||
range_streamer(distributed<database>& db, token_metadata& tm, inet_address address, sstring description)
|
||||
: range_streamer(db, tm, std::unordered_set<token>(), address, description) {
|
||||
range_streamer(distributed<database>& db, token_metadata& tm, inet_address address, sstring description, streaming::stream_reason reason)
|
||||
: range_streamer(db, tm, std::unordered_set<token>(), address, description, reason) {
|
||||
}
|
||||
|
||||
void add_source_filter(std::unique_ptr<i_source_filter> filter) {
|
||||
@@ -166,6 +168,7 @@ private:
|
||||
std::unordered_set<token> _tokens;
|
||||
inet_address _address;
|
||||
sstring _description;
|
||||
streaming::stream_reason _reason;
|
||||
std::unordered_multimap<sstring, std::unordered_map<inet_address, dht::token_range_vector>> _to_stream;
|
||||
std::unordered_set<std::unique_ptr<i_source_filter>> _source_filters;
|
||||
stream_plan _stream_plan;
|
||||
|
||||
7
dist/common/scripts/scylla_prepare
vendored
7
dist/common/scripts/scylla_prepare
vendored
@@ -62,10 +62,9 @@ if __name__ == '__main__':
|
||||
run('hugeadm --create-mounts')
|
||||
fi
|
||||
else:
|
||||
set_nic = cfg.get('SET_NIC')
|
||||
set_nic_and_disks = get_set_nic_and_disks_config_value(cfg)
|
||||
ifname = cfg.get('IFNAME')
|
||||
if set_nic == 'yes':
|
||||
if set_nic_and_disks == 'yes':
|
||||
create_perftune_conf(ifname)
|
||||
run('/usr/lib/scylla/posix_net_conf.sh {IFNAME} --options-file /etc/scylla.d/perftune.yaml'.format(IFNAME=ifname))
|
||||
run("{} --options-file /etc/scylla.d/perftune.yaml".format(perftune_base_command()))
|
||||
|
||||
run('/usr/lib/scylla/scylla-blocktune')
|
||||
|
||||
12
dist/common/scripts/scylla_setup
vendored
12
dist/common/scripts/scylla_setup
vendored
@@ -122,8 +122,8 @@ if __name__ == '__main__':
|
||||
help='specify NTP domain')
|
||||
parser.add_argument('--ami', action='store_true', default=False,
|
||||
help='setup AMI instance')
|
||||
parser.add_argument('--setup-nic', action='store_true', default=False,
|
||||
help='optimize NIC queue')
|
||||
parser.add_argument('--setup-nic-and-disks', action='store_true', default=False,
|
||||
help='optimize NIC and disks')
|
||||
parser.add_argument('--developer-mode', action='store_true', default=False,
|
||||
help='enable developer mode')
|
||||
parser.add_argument('--no-ec2-check', action='store_true', default=False,
|
||||
@@ -173,7 +173,7 @@ if __name__ == '__main__':
|
||||
|
||||
disks = args.disks
|
||||
nic = args.nic
|
||||
set_nic = args.setup_nic
|
||||
set_nic_and_disks = args.setup_nic_and_disks
|
||||
ec2_check = not args.no_ec2_check
|
||||
kernel_check = not args.no_kernel_check
|
||||
verify_package = not args.no_verify_package
|
||||
@@ -336,11 +336,11 @@ if __name__ == '__main__':
|
||||
if interactive:
|
||||
sysconfig_setup = interactive_ask_service('Do you want to setup a system-wide customized configuration for Scylla?', 'Yes - setup the sysconfig file. No - skips this step.', 'yes')
|
||||
if sysconfig_setup:
|
||||
nic = interactive_choose_nic()
|
||||
if interactive:
|
||||
set_nic = interactive_ask_service('Do you want to enable Network Interface Card (NIC) optimization?', 'Yes - optimize the NIC queue settings. Selecting Yes greatly improves performance. No - skip this step.', 'yes')
|
||||
nic = interactive_choose_nic()
|
||||
set_nic_and_disks = interactive_ask_service('Do you want to enable Network Interface Card (NIC) and disk(s) optimization?', 'Yes - optimize the NIC queue and disks settings. Selecting Yes greatly improves performance. No - skip this step.', 'yes')
|
||||
if sysconfig_setup:
|
||||
setup_args = '--setup-nic' if set_nic else ''
|
||||
setup_args = '--setup-nic-and-disks' if set_nic_and_disks else ''
|
||||
run_setup_script('NIC queue', '/usr/lib/scylla/scylla_sysconfig_setup --nic {nic} {setup_args}'.format(nic=nic, setup_args=setup_args))
|
||||
|
||||
if interactive:
|
||||
|
||||
19
dist/common/scripts/scylla_sysconfig_setup
vendored
19
dist/common/scripts/scylla_sysconfig_setup
vendored
@@ -40,7 +40,7 @@ if __name__ == '__main__':
|
||||
cfg = sysconfig_parser('/etc/sysconfig/scylla-server')
|
||||
else:
|
||||
cfg = sysconfig_parser('/etc/default/scylla-server')
|
||||
set_nic = str2bool(cfg.get('SET_NIC'))
|
||||
set_nic_and_disks = str2bool(get_set_nic_and_disks_config_value(cfg))
|
||||
ami = str2bool(cfg.get('AMI'))
|
||||
|
||||
parser = argparse.ArgumentParser(description='Setting parameters on Scylla sysconfig file.')
|
||||
@@ -58,8 +58,8 @@ if __name__ == '__main__':
|
||||
help='scylla home directory')
|
||||
parser.add_argument('--confdir',
|
||||
help='scylla config directory')
|
||||
parser.add_argument('--setup-nic', action='store_true', default=set_nic,
|
||||
help='setup NIC\'s interrupts, RPS, XPS')
|
||||
parser.add_argument('--setup-nic-and-disks', action='store_true', default=set_nic_and_disks,
|
||||
help='setup NIC\'s and disks\' interrupts, RPS, XPS, nomerges and I/O scheduler')
|
||||
parser.add_argument('--ami', action='store_true', default=ami,
|
||||
help='AMI instance mode')
|
||||
args = parser.parse_args()
|
||||
@@ -71,8 +71,8 @@ if __name__ == '__main__':
|
||||
ifname = args.nic if args.nic else cfg.get('IFNAME')
|
||||
network_mode = args.mode if args.mode else cfg.get('NETWORK_MODE')
|
||||
|
||||
if args.setup_nic:
|
||||
rps_cpus = out('/usr/lib/scylla/posix_net_conf.sh --cpu-mask {}'.format(ifname))
|
||||
if args.setup_nic_and_disks:
|
||||
rps_cpus = out('{} --tune net --nic {} --get-cpu-mask'.format(perftune_base_command(), ifname))
|
||||
if len(rps_cpus) > 0:
|
||||
cpuset = hex2list(rps_cpus)
|
||||
run('/usr/lib/scylla/scylla_cpuset_setup --cpuset {}'.format(cpuset))
|
||||
@@ -104,8 +104,13 @@ if __name__ == '__main__':
|
||||
cfg.set('SCYLLA_HOME', args.homedir)
|
||||
if args.confdir:
|
||||
cfg.set('SCYLLA_CONF', args.confdir)
|
||||
if str2bool(cfg.get('SET_NIC')) != args.setup_nic:
|
||||
cfg.set('SET_NIC', bool2str(args.setup_nic))
|
||||
|
||||
if str2bool(get_set_nic_and_disks_config_value(cfg)) != args.setup_nic_and_disks:
|
||||
if cfg.has_option('SET_NIC'):
|
||||
cfg.set('SET_NIC', bool2str(args.setup_nic_and_disks))
|
||||
else:
|
||||
cfg.set('SET_NIC_AND_DISKS', bool2str(args.setup_nic_and_disks))
|
||||
|
||||
if str2bool(cfg.get('AMI')) != args.ami:
|
||||
cfg.set('AMI', bool2str(args.ami))
|
||||
cfg.commit()
|
||||
|
||||
54
dist/common/scripts/scylla_util.py
vendored
54
dist/common/scripts/scylla_util.py
vendored
@@ -28,6 +28,7 @@ import time
|
||||
import urllib.error
|
||||
import urllib.parse
|
||||
import urllib.request
|
||||
import yaml
|
||||
|
||||
|
||||
def curl(url, byte=False):
|
||||
@@ -384,6 +385,35 @@ def get_mode_cpuset(nic, mode):
|
||||
except subprocess.CalledProcessError:
|
||||
return '-1'
|
||||
|
||||
def get_scylla_dirs():
    """
    Returns a list of scylla directories configured in /etc/scylla/scylla.yaml.
    Verifies that mandatory parameters are set.

    :return list made of the data file directories, the commitlog directory and,
            when it is configured, the hints directory (None entries are dropped)
    :except Exception if 'data_file_directories' or 'commitlog_directory' is missing or empty
    """
    scylla_yaml_name = '/etc/scylla/scylla.yaml'
    # safe_load() only builds plain Python objects and, unlike a bare load(),
    # needs no explicit Loader; the with-block makes sure the file is closed.
    with open(scylla_yaml_name) as f:
        y = yaml.safe_load(f)
    # An empty (or non-mapping) document is treated as "nothing configured" so
    # that the mandatory-field checks below report it instead of a TypeError.
    if not isinstance(y, dict):
        y = {}

    # Check that mandatory fields are set
    data_dirs = [d for d in (y.get('data_file_directories') or []) if d is not None]
    if not any(str(d).strip() for d in data_dirs):
        raise Exception("{}: at least one directory has to be set in 'data_file_directories'".format(scylla_yaml_name))
    if not y.get('commitlog_directory'):
        raise Exception("{}: 'commitlog_directory' has to be set".format(scylla_yaml_name))

    dirs = list(data_dirs)
    dirs.append(y['commitlog_directory'])

    # The hints directory is optional.
    if y.get('hints_directory'):
        dirs.append(y['hints_directory'])

    return dirs
|
||||
|
||||
def perftune_base_command():
    """
    Build the perftune.py command prefix that tunes the disks backing every
    directory returned by get_scylla_dirs().

    :return command line string: the perftune.py path, '--tune disks' and one '--dir' per directory
    """
    dir_flags = ["--dir {}".format(path) for path in get_scylla_dirs()]
    tune_args = "--tune disks " + " ".join(dir_flags)
    return '/usr/lib/scylla/perftune.py {}'.format(tune_args)
|
||||
|
||||
def get_cur_cpuset():
|
||||
cfg = sysconfig_parser('/etc/scylla.d/cpuset.conf')
|
||||
@@ -419,6 +449,25 @@ def create_perftune_conf(nic='eth0'):
|
||||
def is_valid_nic(nic):
|
||||
return os.path.exists('/sys/class/net/{}'.format(nic))
|
||||
|
||||
# Remove this when we do not support SET_NIC configuration value anymore
|
||||
def get_set_nic_and_disks_config_value(cfg):
    """
    Get the SET_NIC_AND_DISKS configuration value.
    Return the SET_NIC configuration value if SET_NIC_AND_DISKS is not found (old releases case).
    :param cfg: sysconfig_parser object
    :return configuration value
    :except If both options are present, or if neither configuration value is found
    """
    has_new = cfg.has_option('SET_NIC_AND_DISKS')
    has_old = cfg.has_option('SET_NIC')

    # Sanity check
    if has_new and has_old:
        raise Exception("Only one of 'SET_NIC_AND_DISKS' and 'SET_NIC' is allowed to be present")

    # Decide by presence instead of a bare "except:", which would also swallow
    # KeyboardInterrupt/SystemExit and hide unrelated failures of cfg.get().
    if has_new:
        return cfg.get('SET_NIC_AND_DISKS')

    # For backwards compatibility
    return cfg.get('SET_NIC')
|
||||
|
||||
class SystemdException(Exception):
|
||||
pass
|
||||
@@ -483,8 +532,11 @@ class sysconfig_parser:
|
||||
def get(self, key):
|
||||
return self._cfg.get('global', key).strip('"')
|
||||
|
||||
def has_option(self, key):
|
||||
return self._cfg.has_option('global', key)
|
||||
|
||||
def set(self, key, val):
|
||||
if not self._cfg.has_option('global', key):
|
||||
if not self.has_option(key):
|
||||
return self.__add(key, val)
|
||||
self._data = re.sub('^{}=[^\n]*$'.format(key), '{}="{}"'.format(key, self.__escape(val)), self._data, flags=re.MULTILINE)
|
||||
self.__load()
|
||||
|
||||
4
dist/common/sysconfig/scylla-server
vendored
4
dist/common/sysconfig/scylla-server
vendored
@@ -10,8 +10,8 @@ BRIDGE=virbr0
|
||||
# ethernet device name
|
||||
IFNAME=eth0
|
||||
|
||||
# setup NIC's interrupts, RPS, XPS (posix)
|
||||
SET_NIC=no
|
||||
# setup NIC's and disks' interrupts, RPS, XPS, nomerges and I/O scheduler (posix)
|
||||
SET_NIC_AND_DISKS=no
|
||||
|
||||
# ethernet device driver (dpdk)
|
||||
ETHDRV=
|
||||
|
||||
2
dist/common/sysctl.d/99-scylla-aio.conf
vendored
Normal file
2
dist/common/sysctl.d/99-scylla-aio.conf
vendored
Normal file
@@ -0,0 +1,2 @@
|
||||
# Raise max AIO events
|
||||
fs.aio-max-nr = 1048576
|
||||
@@ -6,7 +6,12 @@ After=network.target
|
||||
Type=simple
|
||||
User=scylla
|
||||
Group=scylla
|
||||
{{#debian}}
|
||||
ExecStart=/usr/lib/scylla/scylla-housekeeping --uuid-file /var/lib/scylla-housekeeping/housekeeping.uuid -q -c /etc/scylla.d/housekeeping.cfg --repo-files '/etc/apt/sources.list.d/scylla*.list' version --mode r
|
||||
{{/debian}}
|
||||
{{#redhat}}
|
||||
ExecStart=/usr/lib/scylla/scylla-housekeeping --uuid-file /var/lib/scylla-housekeeping/housekeeping.uuid -q -c /etc/scylla.d/housekeeping.cfg --repo-files '/etc/yum.repos.d/scylla*.repo' version --mode r
|
||||
{{/redhat}}
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
|
||||
@@ -1 +1,2 @@
|
||||
dist/common/sysctl.d/99-scylla-sched.conf /etc/sysctl.d
|
||||
dist/common/sysctl.d/99-scylla-aio.conf /etc/sysctl.d
|
||||
|
||||
@@ -9,6 +9,7 @@ if [[ $KVER =~ 3\.13\.0\-([0-9]+)-generic ]]; then
|
||||
else
|
||||
# expect failures in virtualized environments
|
||||
sysctl -p/etc/sysctl.d/99-scylla-sched.conf || :
|
||||
sysctl -p/etc/sysctl.d/99-scylla-aio.conf || :
|
||||
fi
|
||||
|
||||
#DEBHELPER#
|
||||
|
||||
2
dist/debian/rules.mustache
vendored
2
dist/debian/rules.mustache
vendored
@@ -4,7 +4,7 @@ export PYBUILD_DISABLE=1
|
||||
jobs := $(shell echo $$DEB_BUILD_OPTIONS | sed -r "s/.*parallel=([0-9]+).*/-j\1/")
|
||||
|
||||
override_dh_auto_configure:
|
||||
./configure.py --with=scylla --with=iotune --enable-dpdk --mode=release --static-thrift --static-boost --static-yaml-cpp --compiler=/opt/scylladb/bin/g++-7 --cflags="-I/opt/scylladb/include -L/opt/scylladb/lib/x86-linux-gnu/" --ldflags="-Wl,-rpath=/opt/scylladb/lib"
|
||||
./configure.py --with=scylla --with=iotune --enable-dpdk --mode=release --static-thrift --static-boost --static-yaml-cpp --compiler=/opt/scylladb/bin/g++-7 --c-compiler=/opt/scylladb/bin/gcc-7 --cflags="-I/opt/scylladb/include -L/opt/scylladb/lib/x86-linux-gnu/" --ldflags="-Wl,-rpath=/opt/scylladb/lib"
|
||||
|
||||
override_dh_auto_build:
|
||||
PATH="/opt/scylladb/bin:$$PATH" ninja $(jobs)
|
||||
|
||||
1
dist/debian/scylla-server.install.mustache
vendored
1
dist/debian/scylla-server.install.mustache
vendored
@@ -1,7 +1,6 @@
|
||||
dist/common/limits.d/scylla.conf etc/security/limits.d
|
||||
dist/common/scylla.d/*.conf etc/scylla.d
|
||||
seastar/dpdk/usertools/dpdk-devbind.py usr/lib/scylla
|
||||
seastar/scripts/posix_net_conf.sh usr/lib/scylla
|
||||
seastar/scripts/perftune.py usr/lib/scylla
|
||||
dist/common/scripts/* usr/lib/scylla
|
||||
scylla-housekeeping usr/lib/scylla
|
||||
|
||||
2
dist/docker/redhat/Dockerfile
vendored
2
dist/docker/redhat/Dockerfile
vendored
@@ -26,7 +26,7 @@ ADD commandlineparser.py /commandlineparser.py
|
||||
ADD docker-entrypoint.py /docker-entrypoint.py
|
||||
|
||||
# Install Scylla:
|
||||
RUN curl http://downloads.scylladb.com/rpm/unstable/centos/master/latest/scylla.repo -o /etc/yum.repos.d/scylla.repo && \
|
||||
RUN curl http://downloads.scylladb.com/rpm/centos/scylla-3.0.repo -o /etc/yum.repos.d/scylla.repo && \
|
||||
yum -y install epel-release && \
|
||||
yum -y clean expire-cache && \
|
||||
yum -y update && \
|
||||
|
||||
@@ -10,8 +10,8 @@ BRIDGE=virbr0
|
||||
# ethernet device name
|
||||
IFNAME=eth0
|
||||
|
||||
# setup NIC's interrupts, RPS, XPS (posix)
|
||||
SET_NIC=no
|
||||
# setup NIC's and disks' interrupts, RPS, XPS, nomerges and I/O scheduler (posix)
|
||||
SET_NIC_AND_DISKS=no
|
||||
|
||||
# ethernet device driver (dpdk)
|
||||
ETHDRV=
|
||||
|
||||
@@ -91,7 +91,27 @@ mkdir -p build/offline_installer
|
||||
cp dist/offline_installer/redhat/header build/offline_installer
|
||||
sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve scylla
|
||||
# XXX: resolve option doesn't fetch some dependencies, need to manually fetch them
|
||||
sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve sudo.x86_64
|
||||
sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve ntp.x86_64
|
||||
sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve libedit.x86_64
|
||||
sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve ntpdate.x86_64
|
||||
sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve net-tools.x86_64
|
||||
sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve kernel
|
||||
sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve grubby.x86_64
|
||||
sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve linux-firmware
|
||||
sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve initscripts.x86_64
|
||||
sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve iproute.x86_64
|
||||
sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve iptables.x86_64
|
||||
sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve libnfnetlink.x86_64
|
||||
sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve libnetfilter_conntrack.x86_64
|
||||
sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve libmnl.x86_64
|
||||
sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve sysvinit-tools.x86_64
|
||||
sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve yajl.x86_64
|
||||
sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve mdadm.x86_64
|
||||
sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve libreport-filesystem.x86_64
|
||||
sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve xfsprogs.x86_64
|
||||
sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve PyYAML.x86_64
|
||||
sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve libyaml.x86_64
|
||||
sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve libjpeg-turbo.x86_64
|
||||
sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve libaio.x86_64
|
||||
sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve snappy.x86_64
|
||||
|
||||
4
dist/redhat/scylla.spec.mustache
vendored
4
dist/redhat/scylla.spec.mustache
vendored
@@ -97,7 +97,7 @@ cflags="--cflags=${defines[*]}"
|
||||
%endif
|
||||
%if 0%{?rhel}
|
||||
. /etc/profile.d/scylla.sh
|
||||
python3.4 ./configure.py %{?configure_opt} --with=scylla --with=iotune --mode=release "$cflags" --static-boost --static-yaml-cpp --compiler=/opt/scylladb/bin/g++-7.3 --python python3.4 --ldflag=-Wl,-rpath=/opt/scylladb/lib64
|
||||
python3.4 ./configure.py %{?configure_opt} --with=scylla --with=iotune --mode=release "$cflags" --static-boost --static-yaml-cpp --compiler=/opt/scylladb/bin/g++-7.3 --c-compiler=/opt/scylladb/bin/gcc-7.3 --python python3.4 --ldflag=-Wl,-rpath=/opt/scylladb/lib64
|
||||
%endif
|
||||
ninja-build %{?_smp_mflags} build/release/scylla build/release/iotune
|
||||
|
||||
@@ -193,7 +193,6 @@ rm -rf $RPM_BUILD_ROOT
|
||||
%{_prefix}/lib/scylla/scylla_cpuscaling_setup
|
||||
%{_prefix}/lib/scylla/scylla_fstrim
|
||||
%{_prefix}/lib/scylla/scylla_fstrim_setup
|
||||
%{_prefix}/lib/scylla/posix_net_conf.sh
|
||||
%{_prefix}/lib/scylla/perftune.py
|
||||
%{_prefix}/lib/scylla/dpdk-devbind.py
|
||||
%{_prefix}/lib/scylla/hex2list.py
|
||||
@@ -283,6 +282,7 @@ if Scylla is the main application on your server and you wish to optimize its la
|
||||
# We cannot use the sysctl_apply rpm macro because it is not present in 7.0
|
||||
# following is a "manual" expansion
|
||||
/usr/lib/systemd/systemd-sysctl 99-scylla-sched.conf >/dev/null 2>&1 || :
|
||||
/usr/lib/systemd/systemd-sysctl 99-scylla-aio.conf >/dev/null 2>&1 || :
|
||||
|
||||
%files kernel-conf
|
||||
%defattr(-,root,root)
|
||||
|
||||
@@ -78,6 +78,11 @@ public:
|
||||
|
||||
frozen_mutation freeze(const mutation& m);
|
||||
|
||||
struct frozen_mutation_and_schema {
|
||||
frozen_mutation fm;
|
||||
schema_ptr s;
|
||||
};
|
||||
|
||||
// Can receive streamed_mutation in reversed order.
|
||||
class streamed_mutation_freezer {
|
||||
const schema& _schema;
|
||||
|
||||
@@ -129,26 +129,8 @@ public:
|
||||
update_is_normal();
|
||||
}
|
||||
|
||||
void apply_application_state(application_state key, versioned_value&& value) {
|
||||
auto&& e = _application_state[key];
|
||||
if (e.version < value.version) {
|
||||
e = std::move(value);
|
||||
}
|
||||
update_is_normal();
|
||||
}
|
||||
|
||||
void apply_application_state(application_state key, const versioned_value& value) {
|
||||
auto&& e = _application_state[key];
|
||||
if (e.version < value.version) {
|
||||
e = value;
|
||||
}
|
||||
update_is_normal();
|
||||
}
|
||||
|
||||
void apply_application_state(const endpoint_state& es) {
|
||||
for (auto&& e : es._application_state) {
|
||||
apply_application_state(e.first, e.second);
|
||||
}
|
||||
void add_application_state(const endpoint_state& es) {
|
||||
_application_state = es._application_state;
|
||||
update_is_normal();
|
||||
}
|
||||
|
||||
|
||||
@@ -930,7 +930,7 @@ void gossiper::make_random_gossip_digest(utils::chunked_vector<gossip_digest>& g
|
||||
future<> gossiper::replicate(inet_address ep, const endpoint_state& es) {
|
||||
return container().invoke_on_all([ep, es, orig = engine().cpu_id(), self = shared_from_this()] (gossiper& g) {
|
||||
if (engine().cpu_id() != orig) {
|
||||
g.endpoint_state_map[ep].apply_application_state(es);
|
||||
g.endpoint_state_map[ep].add_application_state(es);
|
||||
}
|
||||
});
|
||||
}
|
||||
@@ -939,7 +939,7 @@ future<> gossiper::replicate(inet_address ep, const std::map<application_state,
|
||||
return container().invoke_on_all([ep, &src, &changed, orig = engine().cpu_id(), self = shared_from_this()] (gossiper& g) {
|
||||
if (engine().cpu_id() != orig) {
|
||||
for (auto&& key : changed) {
|
||||
g.endpoint_state_map[ep].apply_application_state(key, src.at(key));
|
||||
g.endpoint_state_map[ep].add_application_state(key, src.at(key));
|
||||
}
|
||||
}
|
||||
});
|
||||
@@ -948,7 +948,7 @@ future<> gossiper::replicate(inet_address ep, const std::map<application_state,
|
||||
future<> gossiper::replicate(inet_address ep, application_state key, const versioned_value& value) {
|
||||
return container().invoke_on_all([ep, key, &value, orig = engine().cpu_id(), self = shared_from_this()] (gossiper& g) {
|
||||
if (engine().cpu_id() != orig) {
|
||||
g.endpoint_state_map[ep].apply_application_state(key, value);
|
||||
g.endpoint_state_map[ep].add_application_state(key, value);
|
||||
}
|
||||
});
|
||||
}
|
||||
@@ -1175,11 +1175,13 @@ stdx::optional<endpoint_state> gossiper::get_endpoint_state_for_endpoint(inet_ad
|
||||
}
|
||||
}
|
||||
|
||||
void gossiper::reset_endpoint_state_map() {
|
||||
endpoint_state_map.clear();
|
||||
future<> gossiper::reset_endpoint_state_map() {
|
||||
_unreachable_endpoints.clear();
|
||||
_live_endpoints.clear();
|
||||
_live_endpoints_just_added.clear();
|
||||
return container().invoke_on_all([] (gossiper& g) {
|
||||
g.endpoint_state_map.clear();
|
||||
});
|
||||
}
|
||||
|
||||
std::unordered_map<inet_address, endpoint_state>& gms::gossiper::get_endpoint_states() {
|
||||
@@ -1298,6 +1300,14 @@ void gossiper::mark_alive(inet_address addr, endpoint_state& local_state) {
|
||||
// Runs inside seastar::async context
|
||||
void gossiper::real_mark_alive(inet_address addr, endpoint_state& local_state) {
|
||||
logger.trace("marking as alive {}", addr);
|
||||
|
||||
// Do not mark a node with status shutdown as UP.
|
||||
auto status = get_gossip_status(local_state);
|
||||
if (status == sstring(versioned_value::SHUTDOWN)) {
|
||||
logger.warn("Skip marking node {} with status = {} as UP", addr, status);
|
||||
return;
|
||||
}
|
||||
|
||||
local_state.mark_alive();
|
||||
local_state.update_timestamp(); // prevents do_status_check from racing us and evicting if it was down > A_VERY_LONG_TIME
|
||||
|
||||
@@ -1319,7 +1329,7 @@ void gossiper::real_mark_alive(inet_address addr, endpoint_state& local_state) {
|
||||
}
|
||||
|
||||
if (!_in_shadow_round) {
|
||||
logger.info("InetAddress {} is now UP, status = {}", addr, get_gossip_status(local_state));
|
||||
logger.info("InetAddress {} is now UP, status = {}", addr, status);
|
||||
}
|
||||
|
||||
_subscribers.for_each([addr, local_state] (auto& subscriber) {
|
||||
@@ -1662,6 +1672,7 @@ void gossiper::maybe_initialize_local_state(int generation_nbr) {
|
||||
}
|
||||
}
|
||||
|
||||
// Runs inside seastar::async context
|
||||
void gossiper::add_saved_endpoint(inet_address ep) {
|
||||
if (ep == get_broadcast_address()) {
|
||||
logger.debug("Attempt to add self as saved endpoint");
|
||||
@@ -1687,6 +1698,7 @@ void gossiper::add_saved_endpoint(inet_address ep) {
|
||||
}
|
||||
ep_state.mark_dead();
|
||||
endpoint_state_map[ep] = ep_state;
|
||||
replicate(ep, ep_state).get();
|
||||
_unreachable_endpoints[ep] = now();
|
||||
logger.trace("Adding saved endpoint {} {}", ep, ep_state.get_heart_beat_state().get_generation());
|
||||
}
|
||||
@@ -1924,6 +1936,7 @@ void gossiper::mark_as_shutdown(const inet_address& endpoint) {
|
||||
auto& ep_state = *es;
|
||||
ep_state.add_application_state(application_state::STATUS, storage_service_value_factory().shutdown(true));
|
||||
ep_state.get_heart_beat_state().force_highest_possible_version_unsafe();
|
||||
replicate(endpoint, ep_state).get();
|
||||
mark_dead(endpoint, ep_state);
|
||||
get_local_failure_detector().force_conviction(endpoint);
|
||||
}
|
||||
|
||||
@@ -417,7 +417,7 @@ public:
|
||||
stdx::optional<endpoint_state> get_endpoint_state_for_endpoint(inet_address ep) const;
|
||||
|
||||
// removes ALL endpoint states; should only be called after shadow gossip
|
||||
void reset_endpoint_state_map();
|
||||
future<> reset_endpoint_state_map();
|
||||
|
||||
std::unordered_map<inet_address, endpoint_state>& get_endpoint_states();
|
||||
|
||||
|
||||
@@ -42,4 +42,13 @@ class prepare_message {
|
||||
uint32_t dst_cpu_id;
|
||||
};
|
||||
|
||||
enum class stream_reason : uint8_t {
|
||||
unspecified,
|
||||
bootstrap,
|
||||
decommission,
|
||||
removenode,
|
||||
rebuild,
|
||||
repair,
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
@@ -93,7 +93,6 @@ install -m644 build/*.service -Dt "$rprefix"/lib/systemd/system
|
||||
install -m644 dist/common/systemd/*.service -Dt "$rprefix"/lib/systemd/system
|
||||
install -m644 dist/common/systemd/*.timer -Dt "$rprefix"/lib/systemd/system
|
||||
install -m755 dist/common/scripts/* -Dt "$rprefix"/lib/scylla/
|
||||
install -m755 seastar/scripts/posix_net_conf.sh "$rprefix"/lib/scylla/
|
||||
install -m755 seastar/scripts/perftune.py -Dt "$rprefix"/lib/scylla/
|
||||
install -m755 seastar/dpdk/usertools/dpdk-devbind.py -Dt "$rprefix"/lib/scylla/
|
||||
install -m755 build/release/scylla -Dt "$rprefix/bin"
|
||||
|
||||
1
libdeflate
Submodule
1
libdeflate
Submodule
Submodule libdeflate added at 17ec6c94d8
21
licenses/libdeflate-license.txt
Normal file
21
licenses/libdeflate-license.txt
Normal file
@@ -0,0 +1,21 @@
|
||||
Copyright 2016 Eric Biggers
|
||||
|
||||
Permission is hereby granted, free of charge, to any person
|
||||
obtaining a copy of this software and associated documentation files
|
||||
(the "Software"), to deal in the Software without restriction,
|
||||
including without limitation the rights to use, copy, modify, merge,
|
||||
publish, distribute, sublicense, and/or sell copies of the Software,
|
||||
and to permit persons to whom the Software is furnished to do so,
|
||||
subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be
|
||||
included in all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
|
||||
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
||||
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
@@ -119,9 +119,17 @@ insert_token_range_to_sorted_container_while_unwrapping(
|
||||
const dht::token& tok,
|
||||
dht::token_range_vector& ret) {
|
||||
if (prev_tok < tok) {
|
||||
ret.emplace_back(
|
||||
dht::token_range::bound(prev_tok, false),
|
||||
dht::token_range::bound(tok, true));
|
||||
auto pos = ret.end();
|
||||
if (!ret.empty() && !std::prev(pos)->end()) {
|
||||
// We inserted a wrapped range (a, b] previously as
|
||||
// (-inf, b], (a, +inf). So now we insert in the next-to-last
|
||||
// position to keep the last range (a, +inf) at the end.
|
||||
pos = std::prev(pos);
|
||||
}
|
||||
ret.insert(pos,
|
||||
dht::token_range{
|
||||
dht::token_range::bound(prev_tok, false),
|
||||
dht::token_range::bound(tok, true)});
|
||||
} else {
|
||||
ret.emplace_back(
|
||||
dht::token_range::bound(prev_tok, false),
|
||||
|
||||
49
main.cc
49
main.cc
@@ -62,6 +62,7 @@
|
||||
#include "service/cache_hitrate_calculator.hh"
|
||||
#include "sstables/compaction_manager.hh"
|
||||
#include "sstables/sstables.hh"
|
||||
#include <db/view/view_update_from_staging_generator.hh>
|
||||
|
||||
seastar::metrics::metric_groups app_metrics;
|
||||
|
||||
@@ -647,6 +648,21 @@ int main(int ac, char** av) {
|
||||
|
||||
supervisor::notify("loading sstables");
|
||||
distributed_loader::init_non_system_keyspaces(db, proxy).get();
|
||||
|
||||
static sharded<db::view::view_update_from_staging_generator> view_update_from_staging_generator;
|
||||
view_update_from_staging_generator.start(std::ref(db), std::ref(proxy)).get();
|
||||
supervisor::notify("discovering staging sstables");
|
||||
db.invoke_on_all([] (database& db) {
|
||||
for (auto& x : db.get_column_families()) {
|
||||
table& t = *(x.second);
|
||||
for (sstables::shared_sstable sst : *t.get_sstables()) {
|
||||
if (sst->is_staging()) {
|
||||
view_update_from_staging_generator.local().register_staging_sstable(std::move(sst), t.shared_from_this());
|
||||
}
|
||||
}
|
||||
}
|
||||
}).get();
|
||||
|
||||
// register connection drop notification to update cf's cache hit rate data
|
||||
db.invoke_on_all([] (database& db) {
|
||||
db.register_connection_drop_notifier(netw::get_local_messaging_service());
|
||||
@@ -700,9 +716,21 @@ int main(int ac, char** av) {
|
||||
proxy.invoke_on_all([] (service::storage_proxy& p) {
|
||||
p.init_messaging_service();
|
||||
}).get();
|
||||
|
||||
supervisor::notify("starting streaming service");
|
||||
streaming::stream_session::init_streaming_service(db).get();
|
||||
streaming::stream_session::init_streaming_service(db, sys_dist_ks, view_update_from_staging_generator).get();
|
||||
api::set_server_stream_manager(ctx).get();
|
||||
|
||||
supervisor::notify("starting hinted handoff manager");
|
||||
if (hinted_handoff_enabled) {
|
||||
db::hints::manager::rebalance(cfg->hints_directory()).get();
|
||||
}
|
||||
db::hints::manager::rebalance(cfg->data_file_directories()[0] + "/view_pending_updates").get();
|
||||
|
||||
proxy.invoke_on_all([] (service::storage_proxy& local_proxy) {
|
||||
local_proxy.start_hints_manager(gms::get_local_gossiper().shared_from_this(), service::get_local_storage_service().shared_from_this());
|
||||
}).get();
|
||||
|
||||
supervisor::notify("starting messaging service");
|
||||
// Start handling REPAIR_CHECKSUM_RANGE messages
|
||||
netw::get_messaging_service().invoke_on_all([&db] (auto& ms) {
|
||||
@@ -739,16 +767,16 @@ int main(int ac, char** av) {
|
||||
gms::get_local_gossiper().wait_for_gossip_to_settle().get();
|
||||
api::set_server_gossip_settle(ctx).get();
|
||||
|
||||
supervisor::notify("starting hinted handoff manager");
|
||||
if (hinted_handoff_enabled) {
|
||||
db::hints::manager::rebalance(cfg->hints_directory()).get();
|
||||
}
|
||||
db::hints::manager::rebalance(cfg->data_file_directories()[0] + "/view_pending_updates").get();
|
||||
|
||||
supervisor::notify("allow replaying hints");
|
||||
proxy.invoke_on_all([] (service::storage_proxy& local_proxy) {
|
||||
local_proxy.start_hints_manager(gms::get_local_gossiper().shared_from_this(), service::get_local_storage_service().shared_from_this());
|
||||
local_proxy.allow_replaying_hints();
|
||||
}).get();
|
||||
|
||||
if (cfg->view_building()) {
|
||||
supervisor::notify("Launching generate_mv_updates for non system tables");
|
||||
view_update_from_staging_generator.invoke_on_all(&db::view::view_update_from_staging_generator::start).get();
|
||||
}
|
||||
|
||||
static sharded<db::view::view_builder> view_builder;
|
||||
if (cfg->view_building()) {
|
||||
supervisor::notify("starting the view builder");
|
||||
@@ -786,6 +814,11 @@ int main(int ac, char** av) {
|
||||
engine().at_exit([] {
|
||||
return repair_shutdown(service::get_local_storage_service().db());
|
||||
});
|
||||
|
||||
engine().at_exit([] {
|
||||
return view_update_from_staging_generator.stop();
|
||||
});
|
||||
|
||||
engine().at_exit([] {
|
||||
return service::get_local_storage_service().drain_on_shutdown();
|
||||
});
|
||||
|
||||
@@ -214,7 +214,9 @@ private:
|
||||
|
||||
void update(const schema& s, const deletable_row& dr) {
|
||||
update(dr.marker());
|
||||
update(dr.deleted_at().tomb());
|
||||
row_tombstone row_tomb = dr.deleted_at();
|
||||
update(row_tomb.regular());
|
||||
update(row_tomb.tomb());
|
||||
update(s, dr.cells(), column_kind::regular_column);
|
||||
}
|
||||
|
||||
|
||||
@@ -135,12 +135,14 @@ struct messaging_service::rpc_protocol_wrapper : public rpc_protocol { using rpc
|
||||
// This should be integrated into messaging_service proper.
|
||||
class messaging_service::rpc_protocol_client_wrapper {
|
||||
std::unique_ptr<rpc_protocol::client> _p;
|
||||
::shared_ptr<seastar::tls::server_credentials> _credentials;
|
||||
public:
|
||||
rpc_protocol_client_wrapper(rpc_protocol& proto, rpc::client_options opts, ipv4_addr addr, ipv4_addr local = ipv4_addr())
|
||||
: _p(std::make_unique<rpc_protocol::client>(proto, std::move(opts), addr, local)) {
|
||||
}
|
||||
rpc_protocol_client_wrapper(rpc_protocol& proto, rpc::client_options opts, ipv4_addr addr, ipv4_addr local, ::shared_ptr<seastar::tls::server_credentials> c)
|
||||
: _p(std::make_unique<rpc_protocol::client>(proto, std::move(opts), seastar::tls::socket(c), addr, local))
|
||||
, _credentials(c)
|
||||
{}
|
||||
auto get_stats() const { return _p->get_stats(); }
|
||||
future<> stop() { return _p->stop(); }
|
||||
@@ -148,6 +150,19 @@ public:
|
||||
return _p->error();
|
||||
}
|
||||
operator rpc_protocol::client&() { return *_p; }
|
||||
|
||||
/**
|
||||
* #3787 Must ensure we use the right type of socker. I.e. tls or not.
|
||||
* See above, we retain credentials object so we here can know if we
|
||||
* are tls or not.
|
||||
*/
|
||||
template<typename Serializer, typename... Out>
|
||||
future<rpc::sink<Out...>> make_stream_sink() {
|
||||
if (_credentials) {
|
||||
return _p->make_stream_sink<Serializer, Out...>(seastar::tls::socket(_credentials));
|
||||
}
|
||||
return _p->make_stream_sink<Serializer, Out...>();
|
||||
}
|
||||
};
|
||||
|
||||
struct messaging_service::rpc_protocol_server_wrapper : public rpc_protocol::server { using rpc_protocol::server::server; };
|
||||
@@ -638,17 +653,18 @@ rpc::sink<int32_t> messaging_service::make_sink_for_stream_mutation_fragments(rp
|
||||
}
|
||||
|
||||
future<rpc::sink<frozen_mutation_fragment>, rpc::source<int32_t>>
|
||||
messaging_service::make_sink_and_source_for_stream_mutation_fragments(utils::UUID schema_id, utils::UUID plan_id, utils::UUID cf_id, uint64_t estimated_partitions, msg_addr id) {
|
||||
rpc_protocol::client& rpc_client = *get_rpc_client(messaging_verb::STREAM_MUTATION_FRAGMENTS, id);
|
||||
return rpc_client.make_stream_sink<netw::serializer, frozen_mutation_fragment>().then([this, plan_id, schema_id, cf_id, estimated_partitions, &rpc_client] (rpc::sink<frozen_mutation_fragment> sink) mutable {
|
||||
auto rpc_handler = rpc()->make_client<rpc::source<int32_t> (utils::UUID, utils::UUID, utils::UUID, uint64_t, rpc::sink<frozen_mutation_fragment>)>(messaging_verb::STREAM_MUTATION_FRAGMENTS);
|
||||
return rpc_handler(rpc_client , plan_id, schema_id, cf_id, estimated_partitions, sink).then([sink] (rpc::source<int32_t> source) mutable {
|
||||
messaging_service::make_sink_and_source_for_stream_mutation_fragments(utils::UUID schema_id, utils::UUID plan_id, utils::UUID cf_id, uint64_t estimated_partitions, streaming::stream_reason reason, msg_addr id) {
|
||||
auto wrapper = get_rpc_client(messaging_verb::STREAM_MUTATION_FRAGMENTS, id);
|
||||
rpc_protocol::client& rpc_client = *wrapper;
|
||||
return wrapper->make_stream_sink<netw::serializer, frozen_mutation_fragment>().then([this, plan_id, schema_id, cf_id, estimated_partitions, reason, &rpc_client] (rpc::sink<frozen_mutation_fragment> sink) mutable {
|
||||
auto rpc_handler = rpc()->make_client<rpc::source<int32_t> (utils::UUID, utils::UUID, utils::UUID, uint64_t, streaming::stream_reason, rpc::sink<frozen_mutation_fragment>)>(messaging_verb::STREAM_MUTATION_FRAGMENTS);
|
||||
return rpc_handler(rpc_client , plan_id, schema_id, cf_id, estimated_partitions, reason, sink).then([sink] (rpc::source<int32_t> source) mutable {
|
||||
return make_ready_future<rpc::sink<frozen_mutation_fragment>, rpc::source<int32_t>>(std::move(sink), std::move(source));
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
void messaging_service::register_stream_mutation_fragments(std::function<future<rpc::sink<int32_t>> (const rpc::client_info& cinfo, UUID plan_id, UUID schema_id, UUID cf_id, uint64_t estimated_partitions, rpc::source<frozen_mutation_fragment> source)>&& func) {
|
||||
void messaging_service::register_stream_mutation_fragments(std::function<future<rpc::sink<int32_t>> (const rpc::client_info& cinfo, UUID plan_id, UUID schema_id, UUID cf_id, uint64_t estimated_partitions, rpc::optional<streaming::stream_reason>, rpc::source<frozen_mutation_fragment> source)>&& func) {
|
||||
register_handler(this, messaging_verb::STREAM_MUTATION_FRAGMENTS, std::move(func));
|
||||
}
|
||||
|
||||
@@ -726,13 +742,13 @@ auto send_message_oneway_timeout(messaging_service* ms, Timeout timeout, messagi
|
||||
|
||||
// PREPARE_MESSAGE
|
||||
void messaging_service::register_prepare_message(std::function<future<streaming::prepare_message> (const rpc::client_info& cinfo,
|
||||
streaming::prepare_message msg, UUID plan_id, sstring description)>&& func) {
|
||||
streaming::prepare_message msg, UUID plan_id, sstring description, rpc::optional<streaming::stream_reason> reason)>&& func) {
|
||||
register_handler(this, messaging_verb::PREPARE_MESSAGE, std::move(func));
|
||||
}
|
||||
future<streaming::prepare_message> messaging_service::send_prepare_message(msg_addr id, streaming::prepare_message msg, UUID plan_id,
|
||||
sstring description) {
|
||||
sstring description, streaming::stream_reason reason) {
|
||||
return send_message<streaming::prepare_message>(this, messaging_verb::PREPARE_MESSAGE, id,
|
||||
std::move(msg), plan_id, std::move(description));
|
||||
std::move(msg), plan_id, std::move(description), reason);
|
||||
}
|
||||
|
||||
// PREPARE_DONE_MESSAGE
|
||||
@@ -745,12 +761,12 @@ future<> messaging_service::send_prepare_done_message(msg_addr id, UUID plan_id,
|
||||
}
|
||||
|
||||
// STREAM_MUTATION
|
||||
void messaging_service::register_stream_mutation(std::function<future<> (const rpc::client_info& cinfo, UUID plan_id, frozen_mutation fm, unsigned dst_cpu_id, rpc::optional<bool> fragmented)>&& func) {
|
||||
void messaging_service::register_stream_mutation(std::function<future<> (const rpc::client_info& cinfo, UUID plan_id, frozen_mutation fm, unsigned dst_cpu_id, rpc::optional<bool> fragmented, rpc::optional<streaming::stream_reason> reason)>&& func) {
|
||||
register_handler(this, messaging_verb::STREAM_MUTATION, std::move(func));
|
||||
}
|
||||
future<> messaging_service::send_stream_mutation(msg_addr id, UUID plan_id, frozen_mutation fm, unsigned dst_cpu_id, bool fragmented) {
|
||||
future<> messaging_service::send_stream_mutation(msg_addr id, UUID plan_id, frozen_mutation fm, unsigned dst_cpu_id, bool fragmented, streaming::stream_reason reason) {
|
||||
return send_message<void>(this, messaging_verb::STREAM_MUTATION, id,
|
||||
plan_id, std::move(fm), dst_cpu_id, fragmented);
|
||||
plan_id, std::move(fm), dst_cpu_id, fragmented, reason);
|
||||
}
|
||||
|
||||
// STREAM_MUTATION_DONE
|
||||
|
||||
@@ -35,6 +35,7 @@
|
||||
#include "repair/repair.hh"
|
||||
#include "tracing/tracing.hh"
|
||||
#include "digest_algorithm.hh"
|
||||
#include "streaming/stream_reason.hh"
|
||||
|
||||
#include <seastar/net/tls.hh>
|
||||
|
||||
@@ -237,23 +238,23 @@ public:
|
||||
|
||||
// Wrapper for PREPARE_MESSAGE verb
|
||||
void register_prepare_message(std::function<future<streaming::prepare_message> (const rpc::client_info& cinfo,
|
||||
streaming::prepare_message msg, UUID plan_id, sstring description)>&& func);
|
||||
streaming::prepare_message msg, UUID plan_id, sstring description, rpc::optional<streaming::stream_reason> reason)>&& func);
|
||||
future<streaming::prepare_message> send_prepare_message(msg_addr id, streaming::prepare_message msg, UUID plan_id,
|
||||
sstring description);
|
||||
sstring description, streaming::stream_reason);
|
||||
|
||||
// Wrapper for PREPARE_DONE_MESSAGE verb
|
||||
void register_prepare_done_message(std::function<future<> (const rpc::client_info& cinfo, UUID plan_id, unsigned dst_cpu_id)>&& func);
|
||||
future<> send_prepare_done_message(msg_addr id, UUID plan_id, unsigned dst_cpu_id);
|
||||
|
||||
// Wrapper for STREAM_MUTATION verb
|
||||
void register_stream_mutation(std::function<future<> (const rpc::client_info& cinfo, UUID plan_id, frozen_mutation fm, unsigned dst_cpu_id, rpc::optional<bool>)>&& func);
|
||||
future<> send_stream_mutation(msg_addr id, UUID plan_id, frozen_mutation fm, unsigned dst_cpu_id, bool fragmented);
|
||||
void register_stream_mutation(std::function<future<> (const rpc::client_info& cinfo, UUID plan_id, frozen_mutation fm, unsigned dst_cpu_id, rpc::optional<bool>, rpc::optional<streaming::stream_reason>)>&& func);
|
||||
future<> send_stream_mutation(msg_addr id, UUID plan_id, frozen_mutation fm, unsigned dst_cpu_id, bool fragmented, streaming::stream_reason reason);
|
||||
|
||||
// Wrapper for STREAM_MUTATION_FRAGMENTS
|
||||
// The receiver of STREAM_MUTATION_FRAGMENTS sends a status code to the sender to notify it of any error on the receiver side. The status code is of type int32_t. 0 means successful, -1 means error, other status code values are reserved for future use.
|
||||
void register_stream_mutation_fragments(std::function<future<rpc::sink<int32_t>> (const rpc::client_info& cinfo, UUID plan_id, UUID schema_id, UUID cf_id, uint64_t estimated_partitions, rpc::source<frozen_mutation_fragment> source)>&& func);
|
||||
void register_stream_mutation_fragments(std::function<future<rpc::sink<int32_t>> (const rpc::client_info& cinfo, UUID plan_id, UUID schema_id, UUID cf_id, uint64_t estimated_partitions, rpc::optional<streaming::stream_reason> reason_opt, rpc::source<frozen_mutation_fragment> source)>&& func);
|
||||
rpc::sink<int32_t> make_sink_for_stream_mutation_fragments(rpc::source<frozen_mutation_fragment>& source);
|
||||
future<rpc::sink<frozen_mutation_fragment>, rpc::source<int32_t>> make_sink_and_source_for_stream_mutation_fragments(utils::UUID schema_id, utils::UUID plan_id, utils::UUID cf_id, uint64_t estimated_partitions, msg_addr id);
|
||||
future<rpc::sink<frozen_mutation_fragment>, rpc::source<int32_t>> make_sink_and_source_for_stream_mutation_fragments(utils::UUID schema_id, utils::UUID plan_id, utils::UUID cf_id, uint64_t estimated_partitions, streaming::stream_reason reason, msg_addr id);
|
||||
|
||||
void register_stream_mutation_done(std::function<future<> (const rpc::client_info& cinfo, UUID plan_id, dht::token_range_vector ranges, UUID cf_id, unsigned dst_cpu_id)>&& func);
|
||||
future<> send_stream_mutation_done(msg_addr id, UUID plan_id, dht::token_range_vector ranges, UUID cf_id, unsigned dst_cpu_id);
|
||||
|
||||
@@ -60,7 +60,7 @@ using foreign_unique_ptr = foreign_ptr<std::unique_ptr<T>>;
|
||||
/// 3) Both, `read_context::lookup_readers()` and `read_context::save_readers()`
|
||||
/// knows to do nothing when the query is not stateful and just short
|
||||
/// circuit.
|
||||
class read_context {
|
||||
class read_context : public reader_lifecycle_policy {
|
||||
struct reader_params {
|
||||
std::unique_ptr<const dht::partition_range> range;
|
||||
std::unique_ptr<const query::partition_slice> slice;
|
||||
@@ -80,6 +80,20 @@ class read_context {
|
||||
foreign_unique_ptr<utils::phased_barrier::operation> read_operation;
|
||||
foreign_unique_ptr<flat_mutation_reader> reader;
|
||||
};
|
||||
struct paused_reader {
|
||||
shard_id shard;
|
||||
reader_concurrency_semaphore::inactive_read_handle handle;
|
||||
bool has_pending_next_partition;
|
||||
};
|
||||
struct inactive_read : public reader_concurrency_semaphore::inactive_read {
|
||||
foreign_unique_ptr<flat_mutation_reader> reader;
|
||||
explicit inactive_read(foreign_unique_ptr<flat_mutation_reader> reader)
|
||||
: reader(std::move(reader)) {
|
||||
}
|
||||
virtual void evict() override {
|
||||
reader.reset();
|
||||
}
|
||||
};
|
||||
|
||||
using inexistent_state = std::monostate;
|
||||
struct successful_lookup_state {
|
||||
@@ -94,61 +108,64 @@ class read_context {
|
||||
struct dismantling_state {
|
||||
foreign_unique_ptr<reader_params> params;
|
||||
foreign_unique_ptr<utils::phased_barrier::operation> read_operation;
|
||||
future<stopped_foreign_reader> reader_fut;
|
||||
std::variant<foreign_unique_ptr<flat_mutation_reader>, paused_reader> reader;
|
||||
circular_buffer<mutation_fragment> buffer;
|
||||
};
|
||||
struct ready_to_save_state {
|
||||
foreign_unique_ptr<reader_params> params;
|
||||
foreign_unique_ptr<utils::phased_barrier::operation> read_operation;
|
||||
foreign_unique_ptr<flat_mutation_reader> reader;
|
||||
std::variant<foreign_unique_ptr<flat_mutation_reader>, paused_reader> reader;
|
||||
circular_buffer<mutation_fragment> buffer;
|
||||
};
|
||||
struct future_used_state {
|
||||
future<used_state> fut;
|
||||
struct paused_state {
|
||||
foreign_unique_ptr<reader_params> params;
|
||||
foreign_unique_ptr<utils::phased_barrier::operation> read_operation;
|
||||
reader_concurrency_semaphore::inactive_read_handle handle;
|
||||
};
|
||||
struct future_dismantling_state {
|
||||
future<dismantling_state> fut;
|
||||
struct evicted_state {
|
||||
};
|
||||
|
||||
// ( )
|
||||
// ( ) (O)
|
||||
// | ^
|
||||
// | |
|
||||
// +--- inexistent ---+
|
||||
// | |
|
||||
// (1) | (3) | (3)
|
||||
// | | +------ evicted -> (O)
|
||||
// successful_lookup | | ^
|
||||
// | | | | (7) |
|
||||
// | | | +-------+ | (8)
|
||||
// | | (4) | | | |
|
||||
// | +----------> used paused
|
||||
// | | | (6) ^ |
|
||||
// (2) | | +-------+ |
|
||||
// | (5) | | (5)
|
||||
// | | |
|
||||
// | | |
|
||||
// | dismantling <------+
|
||||
// | |
|
||||
// | (2) |
|
||||
// | |
|
||||
// +---------------> ready_to_save
|
||||
// |
|
||||
// +------ inexistent_state -----+
|
||||
// | |
|
||||
// (1) | (6) |
|
||||
// | |
|
||||
// successful_lookup_state future_used_state
|
||||
// | | | |
|
||||
// (2) | (3) | (7) | (8) |
|
||||
// | | | |
|
||||
// | used_state <---------+ future_dismantling_state
|
||||
// | | |
|
||||
// | (4) | (9) |
|
||||
// | | |
|
||||
// | dismantling_state <-----------------+
|
||||
// | |
|
||||
// | (5) |
|
||||
// | |
|
||||
// +----> ready_to_save_state
|
||||
// |
|
||||
// (O)
|
||||
// (O)
|
||||
//
|
||||
// 1) lookup_readers()
|
||||
// 2) save_readers()
|
||||
// 3) make_remote_reader()
|
||||
// 4) dismantle_reader()
|
||||
// 5) prepare_reader_for_saving()
|
||||
// 6) do_make_remote_reader()
|
||||
// 7) reader is created
|
||||
// 8) dismantle_reader()
|
||||
// 9) reader is created
|
||||
// 3) do_make_remote_reader()
|
||||
// 4) make_remote_reader()
|
||||
// 5) dismantle_reader()
|
||||
// 6) pause_reader()
|
||||
// 7) try_resume() - success
|
||||
// 8) try_resume() - failure
|
||||
using reader_state = std::variant<
|
||||
inexistent_state,
|
||||
successful_lookup_state,
|
||||
used_state,
|
||||
paused_state,
|
||||
evicted_state,
|
||||
dismantling_state,
|
||||
ready_to_save_state,
|
||||
future_used_state,
|
||||
future_dismantling_state>;
|
||||
ready_to_save_state>;
|
||||
|
||||
struct dismantle_buffer_stats {
|
||||
size_t partitions = 0;
|
||||
@@ -184,6 +201,8 @@ class read_context {
|
||||
// One for each shard. Index is shard id.
|
||||
std::vector<reader_state> _readers;
|
||||
|
||||
gate _dismantling_gate;
|
||||
|
||||
static future<bundled_remote_reader> do_make_remote_reader(
|
||||
distributed<database>& db,
|
||||
shard_id shard,
|
||||
@@ -200,13 +219,10 @@ class read_context {
|
||||
const query::partition_slice& ps,
|
||||
const io_priority_class& pc,
|
||||
tracing::trace_state_ptr trace_state,
|
||||
streamed_mutation::forwarding fwd_sm,
|
||||
mutation_reader::forwarding fwd_mr);
|
||||
|
||||
void dismantle_reader(shard_id shard, future<stopped_foreign_reader>&& stopped_reader_fut);
|
||||
void dismantle_reader(shard_id shard, future<paused_or_stopped_reader>&& reader_fut);
|
||||
|
||||
ready_to_save_state* prepare_reader_for_saving(dismantling_state& current_state, future<stopped_foreign_reader>&& stopped_reader_fut,
|
||||
const dht::decorated_key& last_pkey, const std::optional<clustering_key_prefix>& last_ckey);
|
||||
dismantle_buffer_stats dismantle_combined_buffer(circular_buffer<mutation_fragment> combined_buffer, const dht::decorated_key& pkey);
|
||||
dismantle_buffer_stats dismantle_compaction_state(detached_compaction_state compaction_state);
|
||||
future<> save_reader(ready_to_save_state& current_state, const dht::decorated_key& last_pkey,
|
||||
@@ -229,26 +245,24 @@ public:
|
||||
read_context& operator=(read_context&&) = delete;
|
||||
read_context& operator=(const read_context&) = delete;
|
||||
|
||||
remote_reader_factory factory() {
|
||||
return [this] (
|
||||
shard_id shard,
|
||||
schema_ptr schema,
|
||||
const dht::partition_range& pr,
|
||||
const query::partition_slice& ps,
|
||||
const io_priority_class& pc,
|
||||
tracing::trace_state_ptr trace_state,
|
||||
streamed_mutation::forwarding fwd_sm,
|
||||
mutation_reader::forwarding fwd_mr) {
|
||||
return make_remote_reader(shard, std::move(schema), pr, ps, pc, std::move(trace_state), fwd_sm, fwd_mr);
|
||||
};
|
||||
virtual future<foreign_unique_ptr<flat_mutation_reader>> create_reader(
|
||||
shard_id shard,
|
||||
schema_ptr schema,
|
||||
const dht::partition_range& pr,
|
||||
const query::partition_slice& ps,
|
||||
const io_priority_class& pc,
|
||||
tracing::trace_state_ptr trace_state,
|
||||
mutation_reader::forwarding fwd_mr) override {
|
||||
return make_remote_reader(shard, std::move(schema), pr, ps, pc, std::move(trace_state), fwd_mr);
|
||||
}
|
||||
|
||||
foreign_reader_dismantler dismantler() {
|
||||
return [this] (shard_id shard, future<stopped_foreign_reader>&& stopped_reader_fut) {
|
||||
dismantle_reader(shard, std::move(stopped_reader_fut));
|
||||
};
|
||||
virtual void destroy_reader(shard_id shard, future<paused_or_stopped_reader> reader_fut) noexcept override {
|
||||
dismantle_reader(shard, std::move(reader_fut));
|
||||
}
|
||||
|
||||
virtual future<> pause(foreign_unique_ptr<flat_mutation_reader> reader) override;
|
||||
virtual future<foreign_unique_ptr<flat_mutation_reader>> try_resume(shard_id shard) override;
|
||||
|
||||
future<> lookup_readers();
|
||||
|
||||
future<> save_readers(circular_buffer<mutation_fragment> unconsumed_buffer, detached_compaction_state compaction_state,
|
||||
@@ -289,7 +303,6 @@ future<foreign_unique_ptr<flat_mutation_reader>> read_context::make_remote_reade
|
||||
const query::partition_slice& ps,
|
||||
const io_priority_class& pc,
|
||||
tracing::trace_state_ptr trace_state,
|
||||
streamed_mutation::forwarding,
|
||||
mutation_reader::forwarding) {
|
||||
auto& rs = _readers[shard];
|
||||
|
||||
@@ -306,100 +319,71 @@ future<foreign_unique_ptr<flat_mutation_reader>> read_context::make_remote_reade
|
||||
return make_ready_future<foreign_unique_ptr<flat_mutation_reader>>(std::move(reader));
|
||||
}
|
||||
|
||||
auto created = promise<used_state>();
|
||||
rs = future_used_state{created.get_future()};
|
||||
return do_make_remote_reader(_db, shard, std::move(schema), pr, ps, pc, std::move(trace_state)).then_wrapped([this, &rs,
|
||||
created = std::move(created)] (future<bundled_remote_reader>&& bundled_reader_fut) mutable {
|
||||
if (bundled_reader_fut.failed()) {
|
||||
auto ex = bundled_reader_fut.get_exception();
|
||||
if (!std::holds_alternative<future_used_state>(rs)) {
|
||||
created.set_exception(ex);
|
||||
}
|
||||
return make_exception_future<foreign_unique_ptr<flat_mutation_reader>>(std::move(ex));
|
||||
}
|
||||
|
||||
auto bundled_reader = bundled_reader_fut.get0();
|
||||
auto new_state = used_state{std::move(bundled_reader.params), std::move(bundled_reader.read_operation)};
|
||||
if (std::holds_alternative<future_used_state>(rs)) {
|
||||
rs = std::move(new_state);
|
||||
} else {
|
||||
created.set_value(std::move(new_state));
|
||||
}
|
||||
return do_make_remote_reader(_db, shard, std::move(schema), pr, ps, pc, std::move(trace_state)).then(
|
||||
[this, &rs] (bundled_remote_reader&& bundled_reader) mutable {
|
||||
rs = used_state{std::move(bundled_reader.params), std::move(bundled_reader.read_operation)};
|
||||
return make_ready_future<foreign_unique_ptr<flat_mutation_reader>>(std::move(bundled_reader.reader));
|
||||
});
|
||||
}
|
||||
|
||||
void read_context::dismantle_reader(shard_id shard, future<stopped_foreign_reader>&& stopped_reader_fut) {
|
||||
auto& rs = _readers[shard];
|
||||
void read_context::dismantle_reader(shard_id shard, future<paused_or_stopped_reader>&& reader_fut) {
|
||||
with_gate(_dismantling_gate, [this, shard, reader_fut = std::move(reader_fut)] () mutable {
|
||||
return reader_fut.then_wrapped([this, shard] (future<paused_or_stopped_reader>&& reader_fut) {
|
||||
if (reader_fut.failed()) {
|
||||
mmq_log.debug("Failed to stop reader on shard {}: {}", shard, reader_fut.get_exception());
|
||||
++_db.local().get_stats().multishard_query_failed_reader_stops;
|
||||
return;
|
||||
}
|
||||
|
||||
if (auto* maybe_used_state = std::get_if<used_state>(&rs)) {
|
||||
auto read_operation = std::move(maybe_used_state->read_operation);
|
||||
auto params = std::move(maybe_used_state->params);
|
||||
rs = dismantling_state{std::move(params), std::move(read_operation), std::move(stopped_reader_fut), circular_buffer<mutation_fragment>{}};
|
||||
} else if (auto* maybe_future_used_state = std::get_if<future_used_state>(&rs)) {
|
||||
auto f = maybe_future_used_state->fut.then([stopped_reader_fut = std::move(stopped_reader_fut)] (used_state&& current_state) mutable {
|
||||
auto read_operation = std::move(current_state.read_operation);
|
||||
auto params = std::move(current_state.params);
|
||||
return dismantling_state{std::move(params), std::move(read_operation), std::move(stopped_reader_fut),
|
||||
circular_buffer<mutation_fragment>{}};
|
||||
auto reader = reader_fut.get0();
|
||||
auto& rs = _readers[shard];
|
||||
if (auto* maybe_used_state = std::get_if<used_state>(&rs)) {
|
||||
auto read_operation = std::move(maybe_used_state->read_operation);
|
||||
auto params = std::move(maybe_used_state->params);
|
||||
rs = dismantling_state{std::move(params), std::move(read_operation), std::move(reader.remote_reader),
|
||||
std::move(reader.unconsumed_fragments)};
|
||||
} else if (auto* maybe_paused_state = std::get_if<paused_state>(&rs)) {
|
||||
auto read_operation = std::move(maybe_paused_state->read_operation);
|
||||
auto params = std::move(maybe_paused_state->params);
|
||||
auto handle = maybe_paused_state->handle;
|
||||
rs = dismantling_state{std::move(params), std::move(read_operation), paused_reader{shard, handle, reader.has_pending_next_partition},
|
||||
std::move(reader.unconsumed_fragments)};
|
||||
// Do nothing for evicted readers.
|
||||
} else if (!std::holds_alternative<evicted_state>(rs)) {
|
||||
mmq_log.warn(
|
||||
"Unexpected request to dismantle reader in state {} for shard {}."
|
||||
" Reader was not created nor is in the process of being created.",
|
||||
rs.index(),
|
||||
shard);
|
||||
}
|
||||
});
|
||||
rs = future_dismantling_state{std::move(f)};
|
||||
} else {
|
||||
mmq_log.warn("Unexpected request to dismantle reader for shard {}. Reader was not created nor is in the process of being created.", shard);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
future<> read_context::stop() {
|
||||
auto cleanup = [db = &_db.local()] (shard_id shard, dismantling_state state) {
|
||||
return state.reader_fut.then_wrapped([db, shard, params = std::move(state.params),
|
||||
read_operation = std::move(state.read_operation)] (future<stopped_foreign_reader>&& fut) mutable {
|
||||
if (fut.failed()) {
|
||||
mmq_log.debug("Failed to stop reader on shard {}: {}", shard, fut.get_exception());
|
||||
++db->get_stats().multishard_query_failed_reader_stops;
|
||||
} else {
|
||||
smp::submit_to(shard, [reader = fut.get0().remote_reader, params = std::move(params),
|
||||
read_operation = std::move(read_operation)] () mutable {
|
||||
reader.release();
|
||||
auto pr = promise<>();
|
||||
auto fut = pr.get_future();
|
||||
auto gate_fut = _dismantling_gate.is_closed() ? make_ready_future<>() : _dismantling_gate.close();
|
||||
gate_fut.then([this] {
|
||||
for (shard_id shard = 0; shard != smp::count; ++shard) {
|
||||
if (auto* maybe_dismantling_state = std::get_if<dismantling_state>(&_readers[shard])) {
|
||||
_db.invoke_on(shard, [reader = std::move(maybe_dismantling_state->reader),
|
||||
params = std::move(maybe_dismantling_state->params),
|
||||
read_operation = std::move(maybe_dismantling_state->read_operation)] (database& db) mutable {
|
||||
if (auto* maybe_stopped_reader = std::get_if<foreign_unique_ptr<flat_mutation_reader>>(&reader)) {
|
||||
maybe_stopped_reader->release();
|
||||
} else {
|
||||
db.user_read_concurrency_sem().unregister_inactive_read(std::get<paused_reader>(reader).handle);
|
||||
}
|
||||
params.release();
|
||||
read_operation.release();
|
||||
});
|
||||
}
|
||||
});
|
||||
};
|
||||
|
||||
std::vector<future<>> futures;
|
||||
auto immediate_cleanup = size_t(0);
|
||||
auto future_cleanup = size_t(0);
|
||||
|
||||
// Wait for pending read-aheads in the background.
|
||||
for (shard_id shard = 0; shard != smp::count; ++shard) {
|
||||
auto& rs = _readers[shard];
|
||||
|
||||
if (auto maybe_dismantling_state = std::get_if<dismantling_state>(&rs)) {
|
||||
++immediate_cleanup;
|
||||
cleanup(shard, std::move(*maybe_dismantling_state));
|
||||
} else if (auto maybe_future_dismantling_state = std::get_if<future_dismantling_state>(&rs)) {
|
||||
++future_cleanup;
|
||||
futures.emplace_back(maybe_future_dismantling_state->fut.then_wrapped([=] (future<dismantling_state>&& current_state_fut) {
|
||||
if (current_state_fut.failed()) {
|
||||
mmq_log.debug("Failed to stop reader on shard {}: {}", shard, current_state_fut.get_exception());
|
||||
++_db.local().get_stats().multishard_query_failed_reader_stops;
|
||||
} else {
|
||||
cleanup(shard, current_state_fut.get0());
|
||||
}
|
||||
}));
|
||||
}
|
||||
}
|
||||
|
||||
if (const auto total = immediate_cleanup + future_cleanup) {
|
||||
tracing::trace(_trace_state,
|
||||
"Stopping {} shard readers, {} ready for immediate cleanup, {} will be cleaned up after finishes read-ahead",
|
||||
total,
|
||||
immediate_cleanup,
|
||||
future_cleanup);
|
||||
}
|
||||
|
||||
return when_all(futures.begin(), futures.end()).discard_result();
|
||||
}).finally([pr = std::move(pr)] () mutable {
|
||||
pr.set_value();
|
||||
});
|
||||
return fut;
|
||||
}
|
||||
|
||||
read_context::dismantle_buffer_stats read_context::dismantle_combined_buffer(circular_buffer<mutation_fragment> combined_buffer,
|
||||
@@ -459,46 +443,35 @@ read_context::dismantle_buffer_stats read_context::dismantle_compaction_state(de
|
||||
return stats;
|
||||
}
|
||||
|
||||
read_context::ready_to_save_state* read_context::prepare_reader_for_saving(
|
||||
dismantling_state& current_state,
|
||||
future<stopped_foreign_reader>&& stopped_reader_fut,
|
||||
const dht::decorated_key& last_pkey,
|
||||
const std::optional<clustering_key_prefix>& last_ckey) {
|
||||
const auto shard = current_state.params.get_owner_shard();
|
||||
auto& rs = _readers[shard];
|
||||
|
||||
if (stopped_reader_fut.failed()) {
|
||||
mmq_log.debug("Failed to stop reader on shard {}: {}", shard, stopped_reader_fut.get_exception());
|
||||
++_db.local().get_stats().multishard_query_failed_reader_stops;
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
auto stopped_reader = stopped_reader_fut.get0();
|
||||
|
||||
// If the buffer is empty just overwrite it.
|
||||
// If it has some data in it append the fragments to the back.
|
||||
// The unconsumed fragments appended here come from the
|
||||
// foreign_reader which is at the lowest layer, hence its
|
||||
// fragments need to be at the back of the buffer.
|
||||
if (current_state.buffer.empty()) {
|
||||
current_state.buffer = std::move(stopped_reader.unconsumed_fragments);
|
||||
} else {
|
||||
std::move(stopped_reader.unconsumed_fragments.begin(), stopped_reader.unconsumed_fragments.end(), std::back_inserter(current_state.buffer));
|
||||
}
|
||||
rs = ready_to_save_state{std::move(current_state.params), std::move(current_state.read_operation), std::move(stopped_reader.remote_reader),
|
||||
std::move(current_state.buffer)};
|
||||
return &std::get<ready_to_save_state>(rs);
|
||||
}
|
||||
|
||||
future<> read_context::save_reader(ready_to_save_state& current_state, const dht::decorated_key& last_pkey,
|
||||
const std::optional<clustering_key_prefix>& last_ckey) {
|
||||
const auto shard = current_state.reader.get_owner_shard();
|
||||
auto* maybe_stopped_reader = std::get_if<foreign_unique_ptr<flat_mutation_reader>>(¤t_state.reader);
|
||||
const auto shard = maybe_stopped_reader
|
||||
? maybe_stopped_reader->get_owner_shard()
|
||||
: std::get<paused_reader>(current_state.reader).shard;
|
||||
|
||||
return _db.invoke_on(shard, [shard, query_uuid = _cmd.query_uuid, query_ranges = _ranges, ¤t_state, &last_pkey, &last_ckey,
|
||||
gts = tracing::global_trace_state_ptr(_trace_state)] (database& db) mutable {
|
||||
try {
|
||||
auto params = current_state.params.release();
|
||||
auto read_operation = current_state.read_operation.release();
|
||||
auto reader = current_state.reader.release();
|
||||
|
||||
flat_mutation_reader_opt reader;
|
||||
if (auto* maybe_paused_reader = std::get_if<paused_reader>(¤t_state.reader)) {
|
||||
if (auto inactive_read_ptr = db.user_read_concurrency_sem().unregister_inactive_read(maybe_paused_reader->handle)) {
|
||||
reader = std::move(*static_cast<inactive_read&>(*inactive_read_ptr).reader);
|
||||
if (maybe_paused_reader->has_pending_next_partition) {
|
||||
reader->next_partition();
|
||||
}
|
||||
}
|
||||
} else {
|
||||
reader = std::move(*std::get<foreign_unique_ptr<flat_mutation_reader>>(current_state.reader));
|
||||
}
|
||||
|
||||
if (!reader) {
|
||||
return;
|
||||
}
|
||||
|
||||
auto& buffer = current_state.buffer;
|
||||
const auto fragments = buffer.size();
|
||||
const auto size_before = reader->buffer_size();
|
||||
@@ -541,6 +514,33 @@ future<> read_context::save_reader(ready_to_save_state& current_state, const dht
|
||||
});
|
||||
}
|
||||
|
||||
future<> read_context::pause(foreign_unique_ptr<flat_mutation_reader> reader) {
|
||||
const auto shard = reader.get_owner_shard();
|
||||
return _db.invoke_on(shard, [reader = std::move(reader)] (database& db) mutable {
|
||||
return db.user_read_concurrency_sem().register_inactive_read(std::make_unique<inactive_read>(std::move(reader)));
|
||||
}).then([this, shard] (reader_concurrency_semaphore::inactive_read_handle handle) {
|
||||
auto& current_state = std::get<used_state>(_readers[shard]);
|
||||
_readers[shard] = paused_state{std::move(current_state.params), std::move(current_state.read_operation), handle};
|
||||
});
|
||||
}
|
||||
|
||||
future<foreign_unique_ptr<flat_mutation_reader>> read_context::try_resume(shard_id shard) {
|
||||
return _db.invoke_on(shard, [handle = std::get<paused_state>(_readers[shard]).handle] (database& db) mutable {
|
||||
if (auto inactive_read_ptr = db.user_read_concurrency_sem().unregister_inactive_read(handle)) {
|
||||
return std::move(static_cast<inactive_read&>(*inactive_read_ptr).reader);
|
||||
}
|
||||
return foreign_unique_ptr<flat_mutation_reader>();
|
||||
}).then([this, shard] (foreign_unique_ptr<flat_mutation_reader> reader) {
|
||||
if (reader) {
|
||||
auto& current_state = std::get<paused_state>(_readers[shard]);
|
||||
_readers[shard] = used_state{std::move(current_state.params), std::move(current_state.read_operation)};
|
||||
} else {
|
||||
_readers[shard] = evicted_state{};
|
||||
}
|
||||
return std::move(reader);
|
||||
});
|
||||
}
|
||||
|
||||
future<> read_context::lookup_readers() {
|
||||
if (_cmd.query_uuid == utils::UUID{} || _cmd.is_first_page) {
|
||||
return make_ready_future<>();
|
||||
@@ -574,49 +574,37 @@ future<> read_context::save_readers(circular_buffer<mutation_fragment> unconsume
|
||||
return make_ready_future<>();
|
||||
}
|
||||
|
||||
auto last_pkey = compaction_state.partition_start.key();
|
||||
return _dismantling_gate.close().then([this, unconsumed_buffer = std::move(unconsumed_buffer), compaction_state = std::move(compaction_state),
|
||||
last_ckey = std::move(last_ckey)] () mutable {
|
||||
auto last_pkey = compaction_state.partition_start.key();
|
||||
|
||||
const auto cb_stats = dismantle_combined_buffer(std::move(unconsumed_buffer), last_pkey);
|
||||
tracing::trace(_trace_state, "Dismantled combined buffer: {} partitions/{} fragments/{} bytes", cb_stats.partitions, cb_stats.fragments,
|
||||
cb_stats.bytes);
|
||||
const auto cb_stats = dismantle_combined_buffer(std::move(unconsumed_buffer), last_pkey);
|
||||
tracing::trace(_trace_state, "Dismantled combined buffer: {} partitions/{} fragments/{} bytes", cb_stats.partitions, cb_stats.fragments,
|
||||
cb_stats.bytes);
|
||||
|
||||
const auto cs_stats = dismantle_compaction_state(std::move(compaction_state));
|
||||
tracing::trace(_trace_state, "Dismantled compaction state: {} partitions/{} fragments/{} bytes", cs_stats.partitions, cs_stats.fragments,
|
||||
cs_stats.bytes);
|
||||
const auto cs_stats = dismantle_compaction_state(std::move(compaction_state));
|
||||
tracing::trace(_trace_state, "Dismantled compaction state: {} partitions/{} fragments/{} bytes", cs_stats.partitions, cs_stats.fragments,
|
||||
cs_stats.bytes);
|
||||
|
||||
return do_with(std::move(last_pkey), std::move(last_ckey), [this] (const dht::decorated_key& last_pkey,
|
||||
return do_with(std::move(last_pkey), std::move(last_ckey), [this] (const dht::decorated_key& last_pkey,
|
||||
const std::optional<clustering_key_prefix>& last_ckey) {
|
||||
return parallel_for_each(_readers, [this, &last_pkey, &last_ckey] (reader_state& rs) {
|
||||
if (auto* maybe_successful_lookup_state = std::get_if<successful_lookup_state>(&rs)) {
|
||||
auto& current_state = *maybe_successful_lookup_state;
|
||||
rs = ready_to_save_state{std::move(current_state.params), std::move(current_state.read_operation),
|
||||
std::move(current_state.reader), circular_buffer<mutation_fragment>{}};
|
||||
return save_reader(std::get<ready_to_save_state>(rs), last_pkey, last_ckey);
|
||||
}
|
||||
return parallel_for_each(_readers, [this, &last_pkey, &last_ckey] (reader_state& rs) {
|
||||
if (auto* maybe_successful_lookup_state = std::get_if<successful_lookup_state>(&rs)) {
|
||||
auto& current_state = *maybe_successful_lookup_state;
|
||||
rs = ready_to_save_state{std::move(current_state.params), std::move(current_state.read_operation),
|
||||
std::move(current_state.reader), circular_buffer<mutation_fragment>{}};
|
||||
return save_reader(std::get<ready_to_save_state>(rs), last_pkey, last_ckey);
|
||||
}
|
||||
|
||||
auto finish_saving = [this, &last_pkey, &last_ckey] (dismantling_state& current_state) {
|
||||
return current_state.reader_fut.then_wrapped([this, ¤t_state, &last_pkey, &last_ckey] (
|
||||
future<stopped_foreign_reader>&& stopped_reader_fut) mutable {
|
||||
if (auto* ready_state = prepare_reader_for_saving(current_state, std::move(stopped_reader_fut), last_pkey, last_ckey)) {
|
||||
return save_reader(*ready_state, last_pkey, last_ckey);
|
||||
}
|
||||
return make_ready_future<>();
|
||||
});
|
||||
};
|
||||
if (auto* maybe_dismantling_state = std::get_if<dismantling_state>(&rs)) {
|
||||
auto& current_state = *maybe_dismantling_state;
|
||||
rs = ready_to_save_state{std::move(current_state.params), std::move(current_state.read_operation),
|
||||
std::move(current_state.reader), std::move(current_state.buffer)};
|
||||
return save_reader(std::get<ready_to_save_state>(rs), last_pkey, last_ckey);
|
||||
}
|
||||
|
||||
if (auto* maybe_dismantling_state = std::get_if<dismantling_state>(&rs)) {
|
||||
return finish_saving(*maybe_dismantling_state);
|
||||
}
|
||||
|
||||
if (auto* maybe_future_dismantling_state = std::get_if<future_dismantling_state>(&rs)) {
|
||||
return maybe_future_dismantling_state->fut.then([this, &rs,
|
||||
finish_saving = std::move(finish_saving)] (dismantling_state&& next_state) mutable {
|
||||
rs = std::move(next_state);
|
||||
return finish_saving(std::get<dismantling_state>(rs));
|
||||
});
|
||||
}
|
||||
|
||||
return make_ready_future<>();
|
||||
return make_ready_future<>();
|
||||
});
|
||||
});
|
||||
});
|
||||
}
|
||||
@@ -629,8 +617,8 @@ static future<reconcilable_result> do_query_mutations(
|
||||
tracing::trace_state_ptr trace_state,
|
||||
db::timeout_clock::time_point timeout,
|
||||
query::result_memory_accounter&& accounter) {
|
||||
return do_with(std::make_unique<read_context>(db, s, cmd, ranges, trace_state), [s, &cmd, &ranges, trace_state, timeout,
|
||||
accounter = std::move(accounter)] (std::unique_ptr<read_context>& ctx) mutable {
|
||||
return do_with(seastar::make_shared<read_context>(db, s, cmd, ranges, trace_state), [s, &cmd, &ranges, trace_state, timeout,
|
||||
accounter = std::move(accounter)] (shared_ptr<read_context>& ctx) mutable {
|
||||
return ctx->lookup_readers().then([&ctx, s = std::move(s), &cmd, &ranges, trace_state, timeout,
|
||||
accounter = std::move(accounter)] () mutable {
|
||||
auto ms = mutation_source([&] (schema_ptr s,
|
||||
@@ -638,10 +626,9 @@ static future<reconcilable_result> do_query_mutations(
|
||||
const query::partition_slice& ps,
|
||||
const io_priority_class& pc,
|
||||
tracing::trace_state_ptr trace_state,
|
||||
streamed_mutation::forwarding fwd_sm,
|
||||
streamed_mutation::forwarding,
|
||||
mutation_reader::forwarding fwd_mr) {
|
||||
return make_multishard_combining_reader(std::move(s), pr, ps, pc, dht::global_partitioner(), ctx->factory(), std::move(trace_state),
|
||||
fwd_sm, fwd_mr, ctx->dismantler());
|
||||
return make_multishard_combining_reader(ctx, dht::global_partitioner(), std::move(s), pr, ps, pc, std::move(trace_state), fwd_mr);
|
||||
});
|
||||
auto reader = make_flat_multi_range_reader(s, std::move(ms), ranges, cmd.slice, service::get_local_sstable_query_read_priority(),
|
||||
trace_state, mutation_reader::forwarding::no);
|
||||
|
||||
@@ -556,128 +556,6 @@ flat_mutation_reader make_combined_reader(schema_ptr schema,
|
||||
return make_combined_reader(std::move(schema), std::move(v), fwd_sm, fwd_mr);
|
||||
}
|
||||
|
||||
void reader_concurrency_semaphore::signal(const resources& r) {
|
||||
_resources += r;
|
||||
while (!_wait_list.empty() && has_available_units(_wait_list.front().res)) {
|
||||
auto& x = _wait_list.front();
|
||||
_resources -= x.res;
|
||||
x.pr.set_value(make_lw_shared<reader_permit>(*this, x.res));
|
||||
_wait_list.pop_front();
|
||||
}
|
||||
}
|
||||
|
||||
future<lw_shared_ptr<reader_concurrency_semaphore::reader_permit>> reader_concurrency_semaphore::wait_admission(size_t memory,
|
||||
db::timeout_clock::time_point timeout) {
|
||||
if (_wait_list.size() >= _max_queue_length) {
|
||||
return make_exception_future<lw_shared_ptr<reader_permit>>(_make_queue_overloaded_exception());
|
||||
}
|
||||
auto r = resources(1, static_cast<ssize_t>(memory));
|
||||
if (!may_proceed(r) && _evict_an_inactive_reader) {
|
||||
while (_evict_an_inactive_reader() && !may_proceed(r));
|
||||
}
|
||||
if (may_proceed(r)) {
|
||||
_resources -= r;
|
||||
return make_ready_future<lw_shared_ptr<reader_permit>>(make_lw_shared<reader_permit>(*this, r));
|
||||
}
|
||||
promise<lw_shared_ptr<reader_permit>> pr;
|
||||
auto fut = pr.get_future();
|
||||
_wait_list.push_back(entry(std::move(pr), r), timeout);
|
||||
return fut;
|
||||
}
|
||||
|
||||
// A file that tracks the memory usage of buffers resulting from read
|
||||
// operations.
|
||||
class tracking_file_impl : public file_impl {
|
||||
file _tracked_file;
|
||||
lw_shared_ptr<reader_concurrency_semaphore::reader_permit> _permit;
|
||||
|
||||
// Shouldn't be called if semaphore is NULL.
|
||||
temporary_buffer<uint8_t> make_tracked_buf(temporary_buffer<uint8_t> buf) {
|
||||
return seastar::temporary_buffer<uint8_t>(buf.get_write(),
|
||||
buf.size(),
|
||||
make_deleter(buf.release(), std::bind(&reader_concurrency_semaphore::reader_permit::signal_memory, _permit, buf.size())));
|
||||
}
|
||||
|
||||
public:
|
||||
tracking_file_impl(file file, reader_resource_tracker resource_tracker)
|
||||
: _tracked_file(std::move(file))
|
||||
, _permit(resource_tracker.get_permit()) {
|
||||
}
|
||||
|
||||
tracking_file_impl(const tracking_file_impl&) = delete;
|
||||
tracking_file_impl& operator=(const tracking_file_impl&) = delete;
|
||||
tracking_file_impl(tracking_file_impl&&) = default;
|
||||
tracking_file_impl& operator=(tracking_file_impl&&) = default;
|
||||
|
||||
virtual future<size_t> write_dma(uint64_t pos, const void* buffer, size_t len, const io_priority_class& pc) override {
|
||||
return get_file_impl(_tracked_file)->write_dma(pos, buffer, len, pc);
|
||||
}
|
||||
|
||||
virtual future<size_t> write_dma(uint64_t pos, std::vector<iovec> iov, const io_priority_class& pc) override {
|
||||
return get_file_impl(_tracked_file)->write_dma(pos, std::move(iov), pc);
|
||||
}
|
||||
|
||||
virtual future<size_t> read_dma(uint64_t pos, void* buffer, size_t len, const io_priority_class& pc) override {
|
||||
return get_file_impl(_tracked_file)->read_dma(pos, buffer, len, pc);
|
||||
}
|
||||
|
||||
virtual future<size_t> read_dma(uint64_t pos, std::vector<iovec> iov, const io_priority_class& pc) override {
|
||||
return get_file_impl(_tracked_file)->read_dma(pos, iov, pc);
|
||||
}
|
||||
|
||||
virtual future<> flush(void) override {
|
||||
return get_file_impl(_tracked_file)->flush();
|
||||
}
|
||||
|
||||
virtual future<struct stat> stat(void) override {
|
||||
return get_file_impl(_tracked_file)->stat();
|
||||
}
|
||||
|
||||
virtual future<> truncate(uint64_t length) override {
|
||||
return get_file_impl(_tracked_file)->truncate(length);
|
||||
}
|
||||
|
||||
virtual future<> discard(uint64_t offset, uint64_t length) override {
|
||||
return get_file_impl(_tracked_file)->discard(offset, length);
|
||||
}
|
||||
|
||||
virtual future<> allocate(uint64_t position, uint64_t length) override {
|
||||
return get_file_impl(_tracked_file)->allocate(position, length);
|
||||
}
|
||||
|
||||
virtual future<uint64_t> size(void) override {
|
||||
return get_file_impl(_tracked_file)->size();
|
||||
}
|
||||
|
||||
virtual future<> close() override {
|
||||
return get_file_impl(_tracked_file)->close();
|
||||
}
|
||||
|
||||
virtual std::unique_ptr<file_handle_impl> dup() override {
|
||||
return get_file_impl(_tracked_file)->dup();
|
||||
}
|
||||
|
||||
virtual subscription<directory_entry> list_directory(std::function<future<> (directory_entry de)> next) override {
|
||||
return get_file_impl(_tracked_file)->list_directory(std::move(next));
|
||||
}
|
||||
|
||||
virtual future<temporary_buffer<uint8_t>> dma_read_bulk(uint64_t offset, size_t range_size, const io_priority_class& pc) override {
|
||||
return get_file_impl(_tracked_file)->dma_read_bulk(offset, range_size, pc).then([this] (temporary_buffer<uint8_t> buf) {
|
||||
if (_permit) {
|
||||
buf = make_tracked_buf(std::move(buf));
|
||||
_permit->consume_memory(buf.size());
|
||||
}
|
||||
return make_ready_future<temporary_buffer<uint8_t>>(std::move(buf));
|
||||
});
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
// Wraps `f` into a tracking_file_impl bound to this tracker and returns the
// resulting file.
file reader_resource_tracker::track(file f) const {
    // `f` is already our own copy; move it into the wrapper rather than
    // copying the handle a second time.
    return file(make_shared<tracking_file_impl>(std::move(f), *this));
}
|
||||
|
||||
|
||||
class restricting_mutation_reader : public flat_mutation_reader::impl {
|
||||
struct mutation_source_and_params {
|
||||
mutation_source _ms;
|
||||
@@ -840,12 +718,14 @@ class foreign_reader : public flat_mutation_reader::impl {
|
||||
template <typename T>
|
||||
using foreign_unique_ptr = foreign_ptr<std::unique_ptr<T>>;
|
||||
|
||||
using fragment_buffer = circular_buffer<mutation_fragment>;
|
||||
|
||||
foreign_unique_ptr<flat_mutation_reader> _reader;
|
||||
foreign_unique_ptr<future<>> _read_ahead_future;
|
||||
// Increase this counter every time next_partition() is called.
|
||||
// These pending calls will be executed the next time we go to the remote
|
||||
// Set this flag when next_partition() is called.
|
||||
// This pending call will be executed the next time we go to the remote
|
||||
// reader (a fill_buffer() or a fast_forward_to() call).
|
||||
unsigned _pending_next_partition = 0;
|
||||
bool _pending_next_partition = false;
|
||||
streamed_mutation::forwarding _fwd_sm;
|
||||
|
||||
// Forward an operation to the reader on the remote shard.
|
||||
@@ -859,12 +739,11 @@ class foreign_reader : public flat_mutation_reader::impl {
|
||||
Result forward_operation(db::timeout_clock::time_point timeout, Operation op) {
|
||||
return smp::submit_to(_reader.get_owner_shard(), [reader = _reader.get(),
|
||||
read_ahead_future = std::exchange(_read_ahead_future, nullptr),
|
||||
pending_next_partition = std::exchange(_pending_next_partition, 0),
|
||||
pending_next_partition = std::exchange(_pending_next_partition, false),
|
||||
timeout,
|
||||
op = std::move(op)] () mutable {
|
||||
auto exec_op_and_read_ahead = [=] () mutable {
|
||||
while (pending_next_partition) {
|
||||
--pending_next_partition;
|
||||
if (pending_next_partition) {
|
||||
reader->next_partition();
|
||||
}
|
||||
return op().then([=] (auto... results) {
|
||||
@@ -883,6 +762,8 @@ class foreign_reader : public flat_mutation_reader::impl {
|
||||
return make_ready_future<decltype(results)...>(std::move(results)...);
|
||||
});
|
||||
}
|
||||
|
||||
void update_buffer_with(foreign_unique_ptr<fragment_buffer> buffer, bool end_of_steam);
|
||||
public:
|
||||
foreign_reader(schema_ptr schema,
|
||||
foreign_unique_ptr<flat_mutation_reader> reader,
|
||||
@@ -902,10 +783,22 @@ public:
|
||||
virtual future<> fast_forward_to(position_range pr, db::timeout_clock::time_point timeout) override;
|
||||
|
||||
const mutation_fragment& peek_buffer() const { return buffer().front(); }
|
||||
const circular_buffer<mutation_fragment>& get_buffer() const { return buffer(); }
|
||||
|
||||
future<stopped_foreign_reader> stop();
|
||||
future<foreign_unique_ptr<flat_mutation_reader>> pause();
|
||||
void resume(foreign_unique_ptr<flat_mutation_reader> reader);
|
||||
|
||||
future<reader_lifecycle_policy::paused_or_stopped_reader> stop();
|
||||
};
|
||||
|
||||
// Records the end-of-stream flag and appends a copy of every fragment in
// `buffer` to this reader's own buffer.
void foreign_reader::update_buffer_with(foreign_unique_ptr<fragment_buffer> buffer, bool end_of_steam) {
    _end_of_stream = end_of_steam;

    const fragment_buffer& remote_fragments = *buffer;
    for (auto it = remote_fragments.begin(); it != remote_fragments.end(); ++it) {
        // Need a copy since the fragment is on the remote shard.
        push_mutation_fragment(mutation_fragment(*_schema, *it));
    }
}
|
||||
|
||||
foreign_reader::foreign_reader(schema_ptr schema,
|
||||
foreign_unique_ptr<flat_mutation_reader> reader,
|
||||
streamed_mutation::forwarding fwd_sm)
|
||||
@@ -931,8 +824,6 @@ future<> foreign_reader::fill_buffer(db::timeout_clock::time_point timeout) {
|
||||
return make_ready_future();
|
||||
}
|
||||
|
||||
using fragment_buffer = circular_buffer<mutation_fragment>;
|
||||
|
||||
return forward_operation(timeout, [reader = _reader.get(), timeout] () {
|
||||
auto f = reader->is_buffer_empty() ? reader->fill_buffer(timeout) : make_ready_future<>();
|
||||
return f.then([=] {
|
||||
@@ -940,12 +831,8 @@ future<> foreign_reader::fill_buffer(db::timeout_clock::time_point timeout) {
|
||||
std::make_unique<fragment_buffer>(reader->detach_buffer()),
|
||||
reader->is_end_of_stream());
|
||||
});
|
||||
}).then([this] (foreign_unique_ptr<fragment_buffer> buffer, bool end_of_steam) mutable {
|
||||
_end_of_stream = end_of_steam;
|
||||
for (const auto& mf : *buffer) {
|
||||
// Need a copy since the mf is on the remote shard.
|
||||
push_mutation_fragment(mutation_fragment(*_schema, mf));
|
||||
}
|
||||
}).then([this] (foreign_unique_ptr<fragment_buffer> buffer, bool end_of_stream) mutable {
|
||||
update_buffer_with(std::move(buffer), end_of_stream);
|
||||
});
|
||||
}
|
||||
|
||||
@@ -953,12 +840,12 @@ void foreign_reader::next_partition() {
|
||||
if (_fwd_sm == streamed_mutation::forwarding::yes) {
|
||||
clear_buffer();
|
||||
_end_of_stream = false;
|
||||
++_pending_next_partition;
|
||||
_pending_next_partition = true;
|
||||
} else {
|
||||
clear_buffer_to_next_partition();
|
||||
if (is_buffer_empty()) {
|
||||
_end_of_stream = false;
|
||||
++_pending_next_partition;
|
||||
_pending_next_partition = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -979,26 +866,61 @@ future<> foreign_reader::fast_forward_to(position_range pr, db::timeout_clock::t
|
||||
});
|
||||
}
|
||||
|
||||
future<stopped_foreign_reader> foreign_reader::stop() {
|
||||
if (_read_ahead_future || _pending_next_partition) {
|
||||
future<reader_lifecycle_policy::paused_or_stopped_reader> foreign_reader::stop() {
|
||||
if (_reader && (_read_ahead_future || _pending_next_partition)) {
|
||||
const auto owner_shard = _reader.get_owner_shard();
|
||||
return smp::submit_to(owner_shard, [reader = _reader.get(),
|
||||
read_ahead_future = std::exchange(_read_ahead_future, nullptr),
|
||||
pending_next_partition = std::exchange(_pending_next_partition, 0)] () mutable {
|
||||
pending_next_partition = std::exchange(_pending_next_partition, false)] () mutable {
|
||||
auto fut = read_ahead_future ? std::move(*read_ahead_future) : make_ready_future<>();
|
||||
return fut.then([=] () mutable {
|
||||
for (;pending_next_partition > 0; --pending_next_partition) {
|
||||
if (pending_next_partition) {
|
||||
reader->next_partition();
|
||||
}
|
||||
});
|
||||
}).then([this] {
|
||||
return stopped_foreign_reader{std::move(_reader), detach_buffer()};
|
||||
return reader_lifecycle_policy::paused_or_stopped_reader{std::move(_reader), detach_buffer(), false};
|
||||
});
|
||||
} else {
|
||||
return make_ready_future<stopped_foreign_reader>(stopped_foreign_reader{std::move(_reader), detach_buffer()});
|
||||
return make_ready_future<reader_lifecycle_policy::paused_or_stopped_reader>(
|
||||
reader_lifecycle_policy::paused_or_stopped_reader{std::move(_reader), detach_buffer(), _pending_next_partition});
|
||||
}
|
||||
}
|
||||
|
||||
// Detaches the remote reader from this foreign_reader.
//
// On the reader's owner shard: waits for any outstanding read-ahead, executes
// a pending next_partition() if there was one, then detaches the remote
// reader's buffer together with its end-of-stream flag. Back on this shard the
// detached fragments are copied into the local buffer and ownership of the
// remote reader is returned to the caller.
future<foreign_ptr<std::unique_ptr<flat_mutation_reader>>> foreign_reader::pause() {
    return smp::submit_to(_reader.get_owner_shard(), [remote = _reader.get(),
            outstanding_read_ahead = std::exchange(_read_ahead_future, nullptr),
            skip_partition = std::exchange(_pending_next_partition, false)] () mutable {
        auto ready = outstanding_read_ahead ? std::move(*outstanding_read_ahead) : make_ready_future<>();
        return ready.then([remote, skip_partition] () mutable {
            if (skip_partition) {
                remote->next_partition();
            }
            auto detached = std::make_unique<fragment_buffer>(remote->detach_buffer());
            const bool remote_eos = remote->is_end_of_stream();
            return make_ready_future<foreign_unique_ptr<fragment_buffer>, bool>(std::move(detached), remote_eos);
        });
    }).then([this] (foreign_unique_ptr<fragment_buffer>&& buffer, bool end_of_stream) mutable {
        update_buffer_with(std::move(buffer), end_of_stream);

        // An ongoing pause() might overlap with a next_partition() call.
        // So if there is a pending next partition, try to execute it again
        // after the remote buffer was transferred. This is required for
        // correctness, otherwise some fragments belonging to the to-be-skipped
        // partition can escape the next_partition() call, both on the local and
        // the remote shard.
        if (std::exchange(_pending_next_partition, false)) {
            next_partition();
        }
        return std::move(_reader);
    });
}
|
||||
|
||||
// Takes ownership of `reader` and stores it as this foreign_reader's remote
// reader (_reader), replacing whatever was stored there before.
void foreign_reader::resume(foreign_ptr<std::unique_ptr<flat_mutation_reader>> reader) {
|
||||
_reader = std::move(reader);
|
||||
}
|
||||
|
||||
flat_mutation_reader make_foreign_reader(schema_ptr schema,
|
||||
foreign_ptr<std::unique_ptr<flat_mutation_reader>> reader,
|
||||
streamed_mutation::forwarding fwd_sm) {
|
||||
@@ -1010,14 +932,12 @@ flat_mutation_reader make_foreign_reader(schema_ptr schema,
|
||||
|
||||
// See make_multishard_combining_reader() for description.
|
||||
class multishard_combining_reader : public flat_mutation_reader::impl {
|
||||
shared_ptr<reader_lifecycle_policy> _lifecycle_policy;
|
||||
const dht::i_partitioner& _partitioner;
|
||||
const dht::partition_range* _pr;
|
||||
const query::partition_slice& _ps;
|
||||
const io_priority_class& _pc;
|
||||
remote_reader_factory _reader_factory;
|
||||
foreign_reader_dismantler _reader_dismantler;
|
||||
tracing::trace_state_ptr _trace_state;
|
||||
const streamed_mutation::forwarding _fwd_sm;
|
||||
const mutation_reader::forwarding _fwd_mr;
|
||||
|
||||
// Thin wrapper around a flat_mutation_reader (foreign_reader) that
|
||||
@@ -1035,14 +955,30 @@ class multishard_combining_reader : public flat_mutation_reader::impl {
|
||||
class shard_reader {
|
||||
struct state {
|
||||
std::unique_ptr<foreign_reader> reader;
|
||||
unsigned pending_next_partition = 0;
|
||||
bool stopped = false;
|
||||
promise<> reader_promise;
|
||||
bool drop_partition_start = false;
|
||||
bool drop_static_row = false;
|
||||
};
|
||||
const multishard_combining_reader& _parent;
|
||||
const unsigned _shard;
|
||||
lw_shared_ptr<state> _state;
|
||||
std::optional<future<>> _read_ahead;
|
||||
std::optional<future<>> _pause;
|
||||
|
||||
std::optional<dht::decorated_key> _last_pkey;
|
||||
std::optional<position_in_partition> _last_position_in_partition;
|
||||
// These are used when the reader has to be recreated (after having been
|
||||
// evicted while paused) and the range and/or slice it is recreated with
|
||||
// differs from the original ones.
|
||||
std::optional<dht::partition_range> _range_override;
|
||||
std::optional<query::partition_slice> _slice_override;
|
||||
|
||||
private:
|
||||
void update_last_position();
|
||||
void adjust_partition_slice();
|
||||
future<foreign_ptr<std::unique_ptr<flat_mutation_reader>>> recreate_reader();
|
||||
future<> resume();
|
||||
future<> do_fill_buffer(db::timeout_clock::time_point timeout);
|
||||
|
||||
public:
|
||||
shard_reader(multishard_combining_reader& parent, unsigned shard)
|
||||
@@ -1057,11 +993,7 @@ class multishard_combining_reader : public flat_mutation_reader::impl {
|
||||
shard_reader(const shard_reader&) = delete;
|
||||
shard_reader& operator=(const shard_reader&) = delete;
|
||||
|
||||
~shard_reader() {
|
||||
if (!_state->stopped) {
|
||||
stop();
|
||||
}
|
||||
}
|
||||
~shard_reader();
|
||||
|
||||
// These methods assume the reader is already created.
|
||||
bool is_end_of_stream() const {
|
||||
@@ -1081,7 +1013,6 @@ class multishard_combining_reader : public flat_mutation_reader::impl {
|
||||
// These methods don't assume the reader is already created.
|
||||
void next_partition();
|
||||
future<> fast_forward_to(const dht::partition_range& pr, db::timeout_clock::time_point timeout);
|
||||
future<> fast_forward_to(position_range pr, db::timeout_clock::time_point timeout);
|
||||
future<> create_reader();
|
||||
explicit operator bool() const {
|
||||
return bool(_state->reader);
|
||||
@@ -1093,7 +1024,7 @@ class multishard_combining_reader : public flat_mutation_reader::impl {
|
||||
bool is_read_ahead_in_progress() const {
|
||||
return _read_ahead.has_value();
|
||||
}
|
||||
future<stopped_foreign_reader> stop();
|
||||
void pause();
|
||||
};
|
||||
|
||||
std::vector<shard_reader> _shard_readers;
|
||||
@@ -1106,18 +1037,15 @@ class multishard_combining_reader : public flat_mutation_reader::impl {
|
||||
future<> handle_empty_reader_buffer(db::timeout_clock::time_point timeout);
|
||||
|
||||
public:
|
||||
multishard_combining_reader(schema_ptr s,
|
||||
const dht::partition_range& pr,
|
||||
const query::partition_slice& ps,
|
||||
const io_priority_class& pc,
|
||||
const dht::i_partitioner& partitioner,
|
||||
remote_reader_factory reader_factory,
|
||||
tracing::trace_state_ptr trace_state,
|
||||
streamed_mutation::forwarding fwd_sm,
|
||||
mutation_reader::forwarding fwd_mr,
|
||||
foreign_reader_dismantler reader_dismantler);
|
||||
|
||||
~multishard_combining_reader();
|
||||
multishard_combining_reader(
|
||||
shared_ptr<reader_lifecycle_policy> lifecycle_policy,
|
||||
const dht::i_partitioner& partitioner,
|
||||
schema_ptr s,
|
||||
const dht::partition_range& pr,
|
||||
const query::partition_slice& ps,
|
||||
const io_priority_class& pc,
|
||||
tracing::trace_state_ptr trace_state,
|
||||
mutation_reader::forwarding fwd_mr);
|
||||
|
||||
// this is captured.
|
||||
multishard_combining_reader(const multishard_combining_reader&) = delete;
|
||||
@@ -1131,94 +1059,289 @@ public:
|
||||
virtual future<> fast_forward_to(position_range pr, db::timeout_clock::time_point timeout) override;
|
||||
};
|
||||
|
||||
// Marks the shared state as stopped and hands the reader over to the lifecycle
// policy for destruction. The hand-over waits for an in-flight read-ahead or
// pause (if any) before stopping the foreign reader, and keeps the shared
// state alive until that has completed.
multishard_combining_reader::shard_reader::~shard_reader() {
    // Nothing to do if there was no reader created, nor is there a background
    // read ahead in progress which will create one.
    if (!(_state->reader || _read_ahead)) {
        return;
    }

    _state->stopped = true;

    // At most one background operation has to be waited for: a read-ahead
    // takes precedence over a pause.
    auto background_op = _read_ahead ? std::move(*_read_ahead)
            : _pause ? std::move(*_pause)
            : make_ready_future<>();

    auto stopped_reader = background_op.then([state = _state.get()] {
        return state->reader->stop();
    }).finally([state = _state] {
        // Intentionally empty: the capture only extends the state's lifetime.
    });

    _parent._lifecycle_policy->destroy_reader(_shard, std::move(stopped_reader));
}
|
||||
|
||||
void multishard_combining_reader::shard_reader::update_last_position() {
|
||||
auto& reader = *_state->reader;
|
||||
if (reader.is_buffer_empty()) {
|
||||
return;
|
||||
}
|
||||
|
||||
auto rbegin = std::reverse_iterator(reader.get_buffer().end());
|
||||
auto rend = std::reverse_iterator(reader.get_buffer().begin());
|
||||
if (auto pk_it = std::find_if(rbegin, rend, std::mem_fn(&mutation_fragment::is_partition_start)); pk_it != rend) {
|
||||
_last_pkey = pk_it->as_partition_start().key();
|
||||
}
|
||||
|
||||
_last_position_in_partition.emplace(reader.get_buffer().back().position());
|
||||
}
|
||||
|
||||
void multishard_combining_reader::shard_reader::adjust_partition_slice() {
|
||||
if (!_slice_override) {
|
||||
_slice_override = _parent._ps;
|
||||
}
|
||||
|
||||
const auto& schema = *_parent._schema;
|
||||
_slice_override->clear_range(schema, _last_pkey->key());
|
||||
auto& last_ckey = _last_position_in_partition->key();
|
||||
|
||||
auto cmp = bound_view::compare(schema);
|
||||
auto eq = clustering_key_prefix::equality(schema);
|
||||
|
||||
auto ranges = _slice_override->default_row_ranges();
|
||||
auto it = ranges.begin();
|
||||
while (it != ranges.end()) {
|
||||
auto range = bound_view::from_range(*it);
|
||||
if (cmp(range.second, last_ckey) || eq(range.second.prefix(), last_ckey)) {
|
||||
it = ranges.erase(it);
|
||||
} else {
|
||||
if (cmp(range.first, last_ckey)) {
|
||||
assert(cmp(last_ckey, range.second));
|
||||
*it = query::clustering_range(query::clustering_range::bound{last_ckey, false}, it->end());
|
||||
}
|
||||
++it;
|
||||
}
|
||||
}
|
||||
|
||||
_slice_override->clear_ranges();
|
||||
_slice_override->set_range(schema, _last_pkey->key(), std::move(ranges));
|
||||
}
|
||||
|
||||
// Asks the lifecycle policy for a new remote reader that continues from the
// last recorded position.
//
// Without a recorded partition key the parent's original range and slice are
// used. Otherwise the range is overridden to start at _last_pkey, and
// depending on the region of the last position:
//  * partition_start: the partition start has to be dropped from the output;
//  * static_row:      the partition start and the static row are dropped;
//  * clustered:       as above, and the slice is narrowed via
//                     adjust_partition_slice();
//  * partition_end:   the range starts after _last_pkey (exclusive bound).
// Returns a default-constructed (empty) foreign_ptr when there is nothing left
// to read from a singular range.
future<foreign_ptr<std::unique_ptr<flat_mutation_reader>>> multishard_combining_reader::shard_reader::recreate_reader() {
    const dht::partition_range* range = _parent._pr;
    const query::partition_slice* slice = &_parent._ps;

    if (_last_pkey) {
        bool include_last_partition = true;

        if (_last_position_in_partition) {
            const auto region = _last_position_in_partition->region();
            if (region == partition_region::partition_end) {
                include_last_partition = false;
            } else if (region == partition_region::partition_start
                    || region == partition_region::static_row
                    || region == partition_region::clustered) {
                _state->drop_partition_start = true;
                if (region != partition_region::partition_start) {
                    _state->drop_static_row = true;
                }
                if (region == partition_region::clustered) {
                    adjust_partition_slice();
                    slice = &*_slice_override;
                }
            }
        }

        // The original range contained a single partition and we've read it
        // all. We'd have to create a reader with an empty range that would
        // immediately be at EOS. This is not possible so just don't recreate
        // the reader.
        // This should be extremely rare (who'd create a multishard reader to
        // read a single partition) but still, let's make sure we handle it
        // correctly.
        if (_parent._pr->is_singular() && !include_last_partition) {
            return make_ready_future<foreign_ptr<std::unique_ptr<flat_mutation_reader>>>();
        }

        _range_override = dht::partition_range({dht::partition_range::bound(*_last_pkey, include_last_partition)}, _parent._pr->end());
        range = &*_range_override;
    }

    return _parent._lifecycle_policy->create_reader(
            _shard,
            _parent._schema,
            *range,
            *slice,
            _parent._pc,
            _parent._trace_state,
            _parent._fwd_mr);
}
|
||||
|
||||
// Waits for the in-flight pause to finish, then re-attaches a remote reader to
// the foreign reader: the one handed back by the lifecycle policy's
// try_resume() if there is one, otherwise a newly created one from
// recreate_reader(). Does nothing once the shared state is marked as stopped.
// Expects _pause to be engaged; it is reset by this call.
future<> multishard_combining_reader::shard_reader::resume() {
    auto pause_done = std::exchange(_pause, std::nullopt);
    return pause_done->then([this, state = _state] {
        // `this` must not be touched after the reader was stopped, only the
        // shared state is guaranteed to be alive.
        if (state->stopped) {
            return make_ready_future<>();
        }
        return _parent._lifecycle_policy->try_resume(_shard).then(
                [this, state] (foreign_ptr<std::unique_ptr<flat_mutation_reader>> reader) mutable {
            if (reader) {
                state->reader->resume(std::move(reader));
                return make_ready_future<>();
            }
            if (state->stopped) {
                return make_ready_future<>();
            }
            // try_resume() returned no reader: build a replacement.
            return recreate_reader().then([this, state = std::move(state)] (foreign_ptr<std::unique_ptr<flat_mutation_reader>> reader) {
                state->reader->resume(std::move(reader));
            });
        });
    });
}
|
||||
|
||||
// Fills the foreign reader's buffer and post-processes the result:
//  * if drop_partition_start is set, the flag is cleared and a leading
//    partition-start fragment is removed;
//  * if drop_static_row is set, the flag is cleared and a leading static row
//    is removed;
//  * unless the reader was stopped in the meantime, the last position is
//    updated from the buffer.
// Each step is skipped when the buffer is empty at that point, in which case
// the corresponding flag is left untouched.
future<> multishard_combining_reader::shard_reader::do_fill_buffer(db::timeout_clock::time_point timeout) {
    return _state->reader->fill_buffer(timeout).then([this, state = _state] {
        auto& rd = *state->reader;

        // Clears `pending`; if it was set and the front fragment satisfies
        // `is_unwanted`, removes that fragment from the buffer.
        const auto drop_front_once = [&rd] (bool& pending, auto is_unwanted) {
            if (std::exchange(pending, false) && is_unwanted(rd.peek_buffer())) {
                rd.pop_mutation_fragment();
            }
        };

        if (!rd.is_buffer_empty()) {
            drop_front_once(state->drop_partition_start, [] (const mutation_fragment& mf) {
                return mf.is_partition_start();
            });

            if (!rd.is_buffer_empty()) {
                drop_front_once(state->drop_static_row, [] (const mutation_fragment& mf) {
                    return mf.is_static_row();
                });

                if (!state->stopped) {
                    update_last_position();
                }
            }
        }
    });
}
|
||||
|
||||
future<> multishard_combining_reader::shard_reader::fill_buffer(db::timeout_clock::time_point timeout) {
|
||||
if (_read_ahead) {
|
||||
return *std::exchange(_read_ahead, std::nullopt);
|
||||
}
|
||||
return _state->reader->fill_buffer(timeout);
|
||||
if (!_state->reader->is_buffer_empty()) {
|
||||
return make_ready_future<>();
|
||||
}
|
||||
if (_pause) {
|
||||
return resume().then([this, timeout] {
|
||||
return fill_buffer(timeout);
|
||||
});
|
||||
}
|
||||
return do_fill_buffer(timeout);
|
||||
}
|
||||
|
||||
void multishard_combining_reader::shard_reader::next_partition() {
|
||||
_last_position_in_partition = position_in_partition(position_in_partition::end_of_partition_tag_t{});
|
||||
|
||||
// The only case this can be called with an uncreated reader is when
|
||||
// `next_partition()` is called on the multishard reader before the
|
||||
// first `fill_buffer()` call. In this case we are right before the first
|
||||
// partition so this call has no effect, hence we can ignore it.
|
||||
if (_state->reader) {
|
||||
_state->reader->next_partition();
|
||||
} else {
|
||||
++_state->pending_next_partition;
|
||||
}
|
||||
}
|
||||
|
||||
future<> multishard_combining_reader::shard_reader::fast_forward_to(const dht::partition_range& pr, db::timeout_clock::time_point timeout) {
|
||||
if (_state->reader) {
|
||||
return _state->reader->fast_forward_to(pr, timeout);
|
||||
_last_pkey.reset();
|
||||
_last_position_in_partition.reset();
|
||||
|
||||
auto do_fast_forward = [this, &pr, timeout] {
|
||||
return _state->reader->fast_forward_to(pr, timeout);
|
||||
};
|
||||
|
||||
if (_pause) {
|
||||
return resume().then(std::move(do_fast_forward));
|
||||
}
|
||||
|
||||
if (_read_ahead) {
|
||||
return std::exchange(_read_ahead, std::nullopt)->then(std::move(do_fast_forward));
|
||||
}
|
||||
|
||||
return do_fast_forward();
|
||||
}
|
||||
// No need to fast-forward uncreated readers, they will be passed the new
|
||||
// range when created.
|
||||
return make_ready_future<>();
|
||||
}
|
||||
|
||||
// Fast-forwards the shard's reader to the position range `pr`. If the reader
// does not exist yet it is created first and fast-forwarded afterwards.
future<> multishard_combining_reader::shard_reader::fast_forward_to(position_range pr, db::timeout_clock::time_point timeout) {
    if (!_state->reader) {
        return create_reader().then([this, pr = std::move(pr), timeout] {
            return _state->reader->fast_forward_to(pr, timeout);
        });
    }
    return _state->reader->fast_forward_to(pr, timeout);
}
|
||||
|
||||
future<> multishard_combining_reader::shard_reader::create_reader() {
|
||||
if (_state->reader) {
|
||||
return make_ready_future<>();
|
||||
}
|
||||
if (_read_ahead) {
|
||||
return _state->reader_promise.get_future();
|
||||
return *std::exchange(_read_ahead, std::nullopt);
|
||||
}
|
||||
return _parent._reader_factory(_shard, _parent._schema, *_parent._pr, _parent._ps, _parent._pc, _parent._trace_state,
|
||||
_parent._fwd_sm, _parent._fwd_mr).then(
|
||||
[schema = _parent._schema, state = _state, fwd_sm = _parent._fwd_sm] (foreign_ptr<std::unique_ptr<flat_mutation_reader>>&& r) mutable {
|
||||
state->reader = std::make_unique<foreign_reader>(std::move(schema), std::move(r), fwd_sm);
|
||||
for (;state->pending_next_partition; --state->pending_next_partition) {
|
||||
state->reader->next_partition();
|
||||
}
|
||||
|
||||
if (!state->stopped) {
|
||||
state->reader_promise.set_value();
|
||||
}
|
||||
return _parent._lifecycle_policy->create_reader(_shard, _parent._schema, *_parent._pr, _parent._ps, _parent._pc, _parent._trace_state,
|
||||
_parent._fwd_mr).then(
|
||||
[schema = _parent._schema, state = _state] (foreign_ptr<std::unique_ptr<flat_mutation_reader>>&& r) mutable {
|
||||
state->reader = std::make_unique<foreign_reader>(std::move(schema), std::move(r));
|
||||
});
|
||||
}
|
||||
|
||||
void multishard_combining_reader::shard_reader::read_ahead(db::timeout_clock::time_point timeout) {
|
||||
if (_read_ahead) {
|
||||
if (_read_ahead || (_state->reader && (_state->reader->is_end_of_stream() || !_state->reader->is_buffer_empty()))) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (_state->reader) {
|
||||
_read_ahead.emplace(_state->reader->fill_buffer(timeout));
|
||||
} else {
|
||||
_read_ahead.emplace(create_reader().then([state = _state, timeout] () mutable {
|
||||
if (state->stopped) {
|
||||
return make_ready_future<>();
|
||||
auto f = _state->reader
|
||||
? (_pause ? resume() : make_ready_future<>())
|
||||
: create_reader();
|
||||
|
||||
_read_ahead.emplace(f.then([this, state = _state, timeout] () mutable {
|
||||
if (state->stopped) {
|
||||
return make_ready_future<>();
|
||||
}
|
||||
return do_fill_buffer(timeout).then([this, state = std::move(state)] {
|
||||
// Read ahead is still in the background, so pause the reader.
|
||||
if (!state->stopped && _read_ahead) {
|
||||
pause();
|
||||
}
|
||||
return state->reader->fill_buffer(timeout);
|
||||
}));
|
||||
}
|
||||
});
|
||||
}));
|
||||
}
|
||||
|
||||
future<stopped_foreign_reader> multishard_combining_reader::shard_reader::stop() {
|
||||
_state->stopped = true;
|
||||
|
||||
if (!_state->reader && !_read_ahead) {
|
||||
return make_ready_future<stopped_foreign_reader>(stopped_foreign_reader{nullptr, circular_buffer<mutation_fragment>{}});
|
||||
void multishard_combining_reader::shard_reader::pause() {
|
||||
if (_pause) {
|
||||
return;
|
||||
}
|
||||
|
||||
auto f = [this] {
|
||||
if (_read_ahead) {
|
||||
return _read_ahead->then([state = _state.get()] () mutable {
|
||||
return state->reader->stop();
|
||||
});
|
||||
} else {
|
||||
return _state->reader->stop();
|
||||
auto f = _read_ahead ? *std::exchange(_read_ahead, std::nullopt) : make_ready_future<>();
|
||||
_pause = f.then([this, state = _state] () mutable {
|
||||
if (state->stopped) {
|
||||
return make_ready_future<>();
|
||||
}
|
||||
}();
|
||||
return f.finally([state = _state] {});
|
||||
return state->reader->pause().then([this, state = std::move(state)] (foreign_ptr<std::unique_ptr<flat_mutation_reader>> reader) {
|
||||
if (state->stopped) {
|
||||
state->reader->resume(std::move(reader));
|
||||
return make_ready_future<>();
|
||||
}
|
||||
|
||||
// When pausing, the content of the remote reader's buffer is transferred to
|
||||
// the foreign reader, so we might need to update the last position.
|
||||
update_last_position();
|
||||
|
||||
return _parent._lifecycle_policy->pause(std::move(reader));
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
void multishard_combining_reader::move_to_next_shard() {
|
||||
@@ -1231,11 +1354,12 @@ future<> multishard_combining_reader::handle_empty_reader_buffer(db::timeout_clo
|
||||
auto& reader = _shard_readers[_current_shard];
|
||||
|
||||
if (reader.is_end_of_stream()) {
|
||||
if (_fwd_sm || std::all_of(_shard_readers.begin(), _shard_readers.end(), std::mem_fn(&shard_reader::done))) {
|
||||
if (std::all_of(_shard_readers.begin(), _shard_readers.end(), std::mem_fn(&shard_reader::done))) {
|
||||
_end_of_stream = true;
|
||||
} else {
|
||||
move_to_next_shard();
|
||||
}
|
||||
reader.pause();
|
||||
return make_ready_future<>();
|
||||
} else if (reader.is_read_ahead_in_progress()) {
|
||||
return reader.fill_buffer(timeout);
|
||||
@@ -1257,25 +1381,22 @@ future<> multishard_combining_reader::handle_empty_reader_buffer(db::timeout_clo
|
||||
}
|
||||
}
|
||||
|
||||
multishard_combining_reader::multishard_combining_reader(schema_ptr s,
|
||||
multishard_combining_reader::multishard_combining_reader(
|
||||
shared_ptr<reader_lifecycle_policy> lifecycle_policy,
|
||||
const dht::i_partitioner& partitioner,
|
||||
schema_ptr s,
|
||||
const dht::partition_range& pr,
|
||||
const query::partition_slice& ps,
|
||||
const io_priority_class& pc,
|
||||
const dht::i_partitioner& partitioner,
|
||||
remote_reader_factory reader_factory,
|
||||
tracing::trace_state_ptr trace_state,
|
||||
streamed_mutation::forwarding fwd_sm,
|
||||
mutation_reader::forwarding fwd_mr,
|
||||
foreign_reader_dismantler reader_dismantler)
|
||||
mutation_reader::forwarding fwd_mr)
|
||||
: impl(s)
|
||||
, _lifecycle_policy(std::move(lifecycle_policy))
|
||||
, _partitioner(partitioner)
|
||||
, _pr(&pr)
|
||||
, _ps(ps)
|
||||
, _pc(pc)
|
||||
, _reader_factory(std::move(reader_factory))
|
||||
, _reader_dismantler(std::move(reader_dismantler))
|
||||
, _trace_state(std::move(trace_state))
|
||||
, _fwd_sm(fwd_sm)
|
||||
, _fwd_mr(fwd_mr)
|
||||
, _current_shard(pr.start() ? _partitioner.shard_of(pr.start()->value().token()) : _partitioner.shard_of_minimum_token())
|
||||
, _next_token(_partitioner.token_for_next_shard(pr.start() ? pr.start()->value().token() : dht::minimum_token(),
|
||||
@@ -1286,25 +1407,6 @@ multishard_combining_reader::multishard_combining_reader(schema_ptr s,
|
||||
}
|
||||
}
|
||||
|
||||
// Stops every shard reader that exists or is about to exist, and passes the
// resulting future to the dismantler callback when one was provided.
multishard_combining_reader::~multishard_combining_reader() {
    for (shard_id shard = 0; shard < smp::count; ++shard) {
        auto& shard_rd = _shard_readers[shard];

        // Readers might also be created by background read-aheads, so it's not
        // enough to check whether the reader is created at the moment, we also
        // need to check whether there is a read-ahead in progress. If there is,
        // it will surely create a reader which also needs to be dismantled.
        const bool needs_stopping = static_cast<bool>(shard_rd) || shard_rd.is_read_ahead_in_progress();
        if (needs_stopping) {
            auto stopped = shard_rd.stop();
            if (_reader_dismantler) {
                _reader_dismantler(shard, std::move(stopped));
            }
        }
    }
}
|
||||
|
||||
future<> multishard_combining_reader::fill_buffer(db::timeout_clock::time_point timeout) {
|
||||
_crossed_shards = false;
|
||||
return do_until([this] { return is_buffer_full() || is_end_of_stream(); }, [this, timeout] {
|
||||
@@ -1320,6 +1422,7 @@ future<> multishard_combining_reader::fill_buffer(db::timeout_clock::time_point
|
||||
while (!reader.is_buffer_empty() && !is_buffer_full()) {
|
||||
if (const auto& mf = reader.peek_buffer(); mf.is_partition_start() && mf.as_partition_start().key().token() >= _next_token) {
|
||||
move_to_next_shard();
|
||||
reader.pause();
|
||||
return make_ready_future<>();
|
||||
}
|
||||
push_mutation_fragment(reader.pop_mutation_fragment());
|
||||
@@ -1329,15 +1432,9 @@ future<> multishard_combining_reader::fill_buffer(db::timeout_clock::time_point
|
||||
}
|
||||
|
||||
void multishard_combining_reader::next_partition() {
|
||||
if (_fwd_sm == streamed_mutation::forwarding::yes) {
|
||||
clear_buffer();
|
||||
_end_of_stream = false;
|
||||
clear_buffer_to_next_partition();
|
||||
if (is_buffer_empty()) {
|
||||
_shard_readers[_current_shard].next_partition();
|
||||
} else {
|
||||
clear_buffer_to_next_partition();
|
||||
if (is_buffer_empty()) {
|
||||
_shard_readers[_current_shard].next_partition();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1359,24 +1456,18 @@ future<> multishard_combining_reader::fast_forward_to(const dht::partition_range
|
||||
}
|
||||
|
||||
future<> multishard_combining_reader::fast_forward_to(position_range pr, db::timeout_clock::time_point timeout) {
|
||||
forward_buffer_to(pr.start());
|
||||
_end_of_stream = false;
|
||||
if (is_buffer_empty()) {
|
||||
return _shard_readers[_current_shard].fast_forward_to(std::move(pr), timeout);
|
||||
}
|
||||
return make_ready_future<>();
|
||||
return make_exception_future<>(std::bad_function_call());
|
||||
}
|
||||
|
||||
flat_mutation_reader make_multishard_combining_reader(schema_ptr schema,
|
||||
flat_mutation_reader make_multishard_combining_reader(
|
||||
shared_ptr<reader_lifecycle_policy> lifecycle_policy,
|
||||
const dht::i_partitioner& partitioner,
|
||||
schema_ptr schema,
|
||||
const dht::partition_range& pr,
|
||||
const query::partition_slice& ps,
|
||||
const io_priority_class& pc,
|
||||
const dht::i_partitioner& partitioner,
|
||||
remote_reader_factory reader_factory,
|
||||
tracing::trace_state_ptr trace_state,
|
||||
streamed_mutation::forwarding fwd_sm,
|
||||
mutation_reader::forwarding fwd_mr,
|
||||
foreign_reader_dismantler reader_dismantler) {
|
||||
return make_flat_mutation_reader<multishard_combining_reader>(std::move(schema), pr, ps, pc, partitioner, std::move(reader_factory),
|
||||
std::move(trace_state), fwd_sm, fwd_mr, std::move(reader_dismantler));
|
||||
mutation_reader::forwarding fwd_mr) {
|
||||
return make_flat_mutation_reader<multishard_combining_reader>(std::move(lifecycle_policy), partitioner, std::move(schema), pr, ps, pc,
|
||||
std::move(trace_state), fwd_mr);
|
||||
}
|
||||
|
||||
@@ -388,27 +388,81 @@ flat_mutation_reader make_foreign_reader(schema_ptr schema,
|
||||
foreign_ptr<std::unique_ptr<flat_mutation_reader>> reader,
|
||||
streamed_mutation::forwarding fwd_sm = streamed_mutation::forwarding::no);
|
||||
|
||||
using remote_reader_factory = noncopyable_function<future<foreign_ptr<std::unique_ptr<flat_mutation_reader>>>(unsigned,
|
||||
schema_ptr,
|
||||
const dht::partition_range&,
|
||||
const query::partition_slice&,
|
||||
const io_priority_class&,
|
||||
tracing::trace_state_ptr,
|
||||
streamed_mutation::forwarding,
|
||||
mutation_reader::forwarding)>;
|
||||
/// Reader lifecycle policy for the multishard combining reader.
|
||||
///
|
||||
/// This policy is expected to make sure any additional resource the readers
|
||||
/// might need is kept alive for the lifetime of the readers, not that
|
||||
/// of the multishard reader. This is a very important distinction. As
|
||||
/// destructors cannot return futures, the multishard reader will be
|
||||
/// destroyed before all its shard readers could stop properly. Hence it
|
||||
/// is the duty of this policy to make sure all objects the shard readers
|
||||
/// depend on stay alive until they are properly destroyed on their home
|
||||
/// shards. Note that this also includes the passed in `range` and `slice`
|
||||
/// parameters because although client code is required to keep them alive as
|
||||
/// long as the top level reader lives, the shard readers might outlive the
|
||||
/// multishard reader itself.
|
||||
class reader_lifecycle_policy {
|
||||
public:
|
||||
struct paused_or_stopped_reader {
|
||||
// Null when the reader is paused.
|
||||
foreign_ptr<std::unique_ptr<flat_mutation_reader>> remote_reader;
|
||||
circular_buffer<mutation_fragment> unconsumed_fragments;
|
||||
// Only set for paused readers.
|
||||
bool has_pending_next_partition;
|
||||
};
|
||||
|
||||
struct stopped_foreign_reader {
|
||||
foreign_ptr<std::unique_ptr<flat_mutation_reader>> remote_reader;
|
||||
circular_buffer<mutation_fragment> unconsumed_fragments;
|
||||
public:
|
||||
/// Create an appropriate reader on the specified shard.
|
||||
///
|
||||
/// Will be called when the multishard reader visits a shard for the
|
||||
/// first time. This method should also enter gates, take locks or
|
||||
/// whatever is appropriate to make sure resources it is using on the
|
||||
/// remote shard stay alive, during the lifetime of the created reader.
|
||||
virtual future<foreign_ptr<std::unique_ptr<flat_mutation_reader>>> create_reader(
|
||||
shard_id shard,
|
||||
schema_ptr schema,
|
||||
const dht::partition_range& range,
|
||||
const query::partition_slice& slice,
|
||||
const io_priority_class& pc,
|
||||
tracing::trace_state_ptr trace_state,
|
||||
mutation_reader::forwarding fwd_mr) = 0;
|
||||
|
||||
/// Wait on the shard reader to stop then destroy it.
|
||||
///
|
||||
/// Will be called when the multishard reader is being destroyed. It will be
|
||||
/// called for each of the shard readers. The future resolves when the
|
||||
/// reader is stopped, that is, it finishes all background and/or pending
|
||||
/// work.
|
||||
/// This method is expected to do a proper cleanup, that is, leave any gates,
|
||||
/// release any locks or whatever is appropriate for the shard reader.
|
||||
///
|
||||
/// The multishard reader couldn't wait on any future returned from this
|
||||
/// method (as it will be called from the destructor) so waiting on
|
||||
/// all the readers being cleaned up is up to the implementation.
|
||||
///
|
||||
/// This method will be called from a destructor so it cannot throw.
|
||||
virtual void destroy_reader(shard_id shard, future<paused_or_stopped_reader> reader) noexcept = 0;
|
||||
|
||||
/// Pause the reader.
|
||||
///
|
||||
/// The purpose of pausing a reader is making it evictable while it is
|
||||
/// otherwise inactive. This allows freeing up resources that are in-demand
|
||||
/// by evicting these paused readers. Most notably, this allows freeing up
|
||||
/// reader permits when the node is overloaded with reads.
|
||||
virtual future<> pause(foreign_ptr<std::unique_ptr<flat_mutation_reader>> reader) = 0;
|
||||
|
||||
/// Try to resume the reader.
|
||||
///
|
||||
/// The pointer returned will be null when resuming fails. This can happen
|
||||
/// if the reader was evicted while paused.
|
||||
virtual future<foreign_ptr<std::unique_ptr<flat_mutation_reader>>> try_resume(shard_id shard) = 0;
|
||||
};
|
||||
using foreign_reader_dismantler = noncopyable_function<void(shard_id, future<stopped_foreign_reader>)>;
|
||||
|
||||
/// Make a multishard_combining_reader.
|
||||
///
|
||||
/// multishard_combining_reader takes care of reading a range from all shards
|
||||
/// that own a subrange in the range. Readers are created on-demand with the
|
||||
/// supplied reader_factory. This factory function is expected to create an
|
||||
/// appropriate reader on the specified shard and return a foreign_ptr to it.
|
||||
/// that own a subrange in the range. Shard readers are created on-demand, when
|
||||
/// the shard is visited for the first time.
|
||||
///
|
||||
/// The read starts with a concurrency of one, that is the reader reads from a
|
||||
/// single shard at a time. The concurrency is exponentially increased (to a
|
||||
@@ -421,19 +475,13 @@ using foreign_reader_dismantler = noncopyable_function<void(shard_id, future<sto
|
||||
/// For dense tables (where we rarely cross shards) we rely on the
|
||||
/// foreign_reader to issue sufficient read-aheads on its own to avoid blocking.
|
||||
///
|
||||
/// Optionally a dismantler function can be passed to the multishard
|
||||
/// reader. When the multishard reader is destroyed it will invoke the
|
||||
/// dismantler functor for each of its foreign (shard) readers, passing a future
|
||||
/// to a `stopped_foreign_reader`. The future becomes available when the foreign
|
||||
/// reader has stopped, that is, it finished all of its in-progress read aheads
|
||||
/// and/or any pending `next_partition()` calls.
|
||||
flat_mutation_reader make_multishard_combining_reader(schema_ptr schema,
|
||||
/// The readers' life-cycles are managed through the supplied lifecycle policy.
|
||||
flat_mutation_reader make_multishard_combining_reader(
|
||||
shared_ptr<reader_lifecycle_policy> lifecycle_policy,
|
||||
const dht::i_partitioner& partitioner,
|
||||
schema_ptr schema,
|
||||
const dht::partition_range& pr,
|
||||
const query::partition_slice& ps,
|
||||
const io_priority_class& pc,
|
||||
const dht::i_partitioner& partitioner,
|
||||
remote_reader_factory reader_factory,
|
||||
tracing::trace_state_ptr trace_state = nullptr,
|
||||
streamed_mutation::forwarding fwd_sm = streamed_mutation::forwarding::no,
|
||||
mutation_reader::forwarding fwd_mr = mutation_reader::forwarding::no,
|
||||
foreign_reader_dismantler reader_dismantler = {});
|
||||
mutation_reader::forwarding fwd_mr = mutation_reader::forwarding::no);
|
||||
|
||||
@@ -270,6 +270,7 @@ public:
|
||||
static position_in_partition for_range_start(const query::clustering_range&);
|
||||
static position_in_partition for_range_end(const query::clustering_range&);
|
||||
|
||||
partition_region region() const { return _type; }
|
||||
bool is_partition_start() const { return _type == partition_region::partition_start; }
|
||||
bool is_partition_end() const { return _type == partition_region::partition_end; }
|
||||
bool is_static_row() const { return _type == partition_region::static_row; }
|
||||
|
||||
56
querier.cc
56
querier.cc
@@ -216,16 +216,43 @@ static querier_cache::entries::iterator find_querier(querier_cache::entries& ent
|
||||
return it->pos();
|
||||
}
|
||||
|
||||
querier_cache::querier_cache(size_t max_cache_size, std::chrono::seconds entry_ttl)
|
||||
: _expiry_timer([this] { scan_cache_entries(); })
|
||||
querier_cache::querier_cache(reader_concurrency_semaphore& sem, size_t max_cache_size, std::chrono::seconds entry_ttl)
|
||||
: _sem(sem)
|
||||
, _expiry_timer([this] { scan_cache_entries(); })
|
||||
, _entry_ttl(entry_ttl)
|
||||
, _max_queriers_memory_usage(max_cache_size) {
|
||||
_expiry_timer.arm_periodic(entry_ttl / 2);
|
||||
}
|
||||
|
||||
class querier_inactive_read : public reader_concurrency_semaphore::inactive_read {
|
||||
querier_cache::entries& _entries;
|
||||
querier_cache::entries::iterator _pos;
|
||||
querier_cache::stats& _stats;
|
||||
|
||||
public:
|
||||
querier_inactive_read(querier_cache::entries& entries, querier_cache::entries::iterator pos, querier_cache::stats& stats)
|
||||
: _entries(entries)
|
||||
, _pos(pos)
|
||||
, _stats(stats) {
|
||||
}
|
||||
virtual void evict() override {
|
||||
_entries.erase(_pos);
|
||||
++_stats.resource_based_evictions;
|
||||
--_stats.population;
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Querier>
|
||||
static void insert_querier(querier_cache::entries& entries, querier_cache::index& index, querier_cache::stats& stats,
|
||||
size_t max_queriers_memory_usage, utils::UUID key, Querier&& q, lowres_clock::time_point expires, tracing::trace_state_ptr trace_state) {
|
||||
static void insert_querier(
|
||||
reader_concurrency_semaphore& sem,
|
||||
querier_cache::entries& entries,
|
||||
querier_cache::index& index,
|
||||
querier_cache::stats& stats,
|
||||
size_t max_queriers_memory_usage,
|
||||
utils::UUID key,
|
||||
Querier&& q,
|
||||
lowres_clock::time_point expires,
|
||||
tracing::trace_state_ptr trace_state) {
|
||||
// FIXME: see #3159
|
||||
// In reverse mode flat_mutation_reader drops any remaining rows of the
|
||||
// current partition when the page ends so it cannot be reused across
|
||||
@@ -258,27 +285,30 @@ static void insert_querier(querier_cache::entries& entries, querier_cache::index
|
||||
|
||||
auto& e = entries.emplace_back(key, std::move(q), expires);
|
||||
e.set_pos(--entries.end());
|
||||
e.set_inactive_handle(sem.register_inactive_read(std::make_unique<querier_inactive_read>(entries, e.pos(), stats)));
|
||||
index.insert(e);
|
||||
++stats.population;
|
||||
}
|
||||
|
||||
void querier_cache::insert(utils::UUID key, data_querier&& q, tracing::trace_state_ptr trace_state) {
|
||||
insert_querier(_entries, _data_querier_index, _stats, _max_queriers_memory_usage, key, std::move(q), lowres_clock::now() + _entry_ttl,
|
||||
insert_querier(_sem, _entries, _data_querier_index, _stats, _max_queriers_memory_usage, key, std::move(q), lowres_clock::now() + _entry_ttl,
|
||||
std::move(trace_state));
|
||||
}
|
||||
|
||||
void querier_cache::insert(utils::UUID key, mutation_querier&& q, tracing::trace_state_ptr trace_state) {
|
||||
insert_querier(_entries, _mutation_querier_index, _stats, _max_queriers_memory_usage, key, std::move(q), lowres_clock::now() + _entry_ttl,
|
||||
insert_querier(_sem, _entries, _mutation_querier_index, _stats, _max_queriers_memory_usage, key, std::move(q), lowres_clock::now() + _entry_ttl,
|
||||
std::move(trace_state));
|
||||
}
|
||||
|
||||
void querier_cache::insert(utils::UUID key, shard_mutation_querier&& q, tracing::trace_state_ptr trace_state) {
|
||||
insert_querier(_entries, _shard_mutation_querier_index, _stats, _max_queriers_memory_usage, key, std::move(q), lowres_clock::now() + _entry_ttl,
|
||||
insert_querier(_sem, _entries, _shard_mutation_querier_index, _stats, _max_queriers_memory_usage, key, std::move(q), lowres_clock::now() + _entry_ttl,
|
||||
std::move(trace_state));
|
||||
}
|
||||
|
||||
template <typename Querier>
|
||||
static std::optional<Querier> lookup_querier(querier_cache::entries& entries,
|
||||
static std::optional<Querier> lookup_querier(
|
||||
reader_concurrency_semaphore& sem,
|
||||
querier_cache::entries& entries,
|
||||
querier_cache::index& index,
|
||||
querier_cache::stats& stats,
|
||||
utils::UUID key,
|
||||
@@ -294,6 +324,7 @@ static std::optional<Querier> lookup_querier(querier_cache::entries& entries,
|
||||
}
|
||||
|
||||
auto q = std::move(*it).template value<Querier>();
|
||||
sem.unregister_inactive_read(it->get_inactive_handle());
|
||||
entries.erase(it);
|
||||
--stats.population;
|
||||
|
||||
@@ -313,7 +344,7 @@ std::optional<data_querier> querier_cache::lookup_data_querier(utils::UUID key,
|
||||
const dht::partition_range& range,
|
||||
const query::partition_slice& slice,
|
||||
tracing::trace_state_ptr trace_state) {
|
||||
return lookup_querier<data_querier>(_entries, _data_querier_index, _stats, key, s, range, slice, std::move(trace_state));
|
||||
return lookup_querier<data_querier>(_sem, _entries, _data_querier_index, _stats, key, s, range, slice, std::move(trace_state));
|
||||
}
|
||||
|
||||
std::optional<mutation_querier> querier_cache::lookup_mutation_querier(utils::UUID key,
|
||||
@@ -321,7 +352,7 @@ std::optional<mutation_querier> querier_cache::lookup_mutation_querier(utils::UU
|
||||
const dht::partition_range& range,
|
||||
const query::partition_slice& slice,
|
||||
tracing::trace_state_ptr trace_state) {
|
||||
return lookup_querier<mutation_querier>(_entries, _mutation_querier_index, _stats, key, s, range, slice, std::move(trace_state));
|
||||
return lookup_querier<mutation_querier>(_sem, _entries, _mutation_querier_index, _stats, key, s, range, slice, std::move(trace_state));
|
||||
}
|
||||
|
||||
std::optional<shard_mutation_querier> querier_cache::lookup_shard_mutation_querier(utils::UUID key,
|
||||
@@ -329,7 +360,8 @@ std::optional<shard_mutation_querier> querier_cache::lookup_shard_mutation_queri
|
||||
const dht::partition_range_vector& ranges,
|
||||
const query::partition_slice& slice,
|
||||
tracing::trace_state_ptr trace_state) {
|
||||
return lookup_querier<shard_mutation_querier>(_entries, _shard_mutation_querier_index, _stats, key, s, ranges, slice, std::move(trace_state));
|
||||
return lookup_querier<shard_mutation_querier>(_sem, _entries, _shard_mutation_querier_index, _stats, key, s, ranges, slice,
|
||||
std::move(trace_state));
|
||||
}
|
||||
|
||||
void querier_cache::set_entry_ttl(std::chrono::seconds entry_ttl) {
|
||||
@@ -344,6 +376,7 @@ bool querier_cache::evict_one() {
|
||||
|
||||
++_stats.resource_based_evictions;
|
||||
--_stats.population;
|
||||
_sem.unregister_inactive_read(_entries.front().get_inactive_handle());
|
||||
_entries.pop_front();
|
||||
|
||||
return true;
|
||||
@@ -355,6 +388,7 @@ void querier_cache::evict_all_for_table(const utils::UUID& schema_id) {
|
||||
while (it != end) {
|
||||
if (it->schema().id() == schema_id) {
|
||||
--_stats.population;
|
||||
_sem.unregister_inactive_read(it->get_inactive_handle());
|
||||
it = _entries.erase(it);
|
||||
} else {
|
||||
++it;
|
||||
|
||||
17
querier.hh
17
querier.hh
@@ -291,9 +291,8 @@ public:
|
||||
/// Inserted queriers will have a TTL. When this expires the querier is
|
||||
/// evicted. This is to avoid excess and unnecessary resource usage due to
|
||||
/// abandoned queriers.
|
||||
/// Provides a way to evict readers one-by-one via `evict_one()`. This can be
|
||||
/// used by the concurrency-limiting code to evict cached readers to free up
|
||||
/// resources for admitting new ones.
|
||||
/// Registers cached readers with the reader concurrency semaphore, as inactive
|
||||
/// readers, so the latter can evict them if needed.
|
||||
/// Keeps the total memory consumption of cached queriers
|
||||
/// below max_queriers_memory_usage by evicting older entries upon inserting
|
||||
/// new ones if the memory consumption would go above the limit.
|
||||
@@ -327,6 +326,7 @@ public:
|
||||
const utils::UUID _key;
|
||||
const lowres_clock::time_point _expires;
|
||||
std::variant<data_querier, mutation_querier, shard_mutation_querier> _value;
|
||||
std::optional<reader_concurrency_semaphore::inactive_read_handle> _handle;
|
||||
|
||||
public:
|
||||
template <typename Querier>
|
||||
@@ -344,6 +344,14 @@ public:
|
||||
_pos = pos;
|
||||
}
|
||||
|
||||
/// Remember the handle under which this entry was registered as an
/// inactive read with the reader concurrency semaphore.
void set_inactive_handle(reader_concurrency_semaphore::inactive_read_handle handle) { _handle = std::move(handle); }
|
||||
|
||||
/// Return a copy of the stored inactive-read handle.
///
/// The optional is dereferenced without a check, so a handle must have been
/// stored with set_inactive_handle() before this is called.
reader_concurrency_semaphore::inactive_read_handle get_inactive_handle() const { return *_handle; }
|
||||
|
||||
const utils::UUID& key() const {
|
||||
return _key;
|
||||
}
|
||||
@@ -391,6 +399,7 @@ public:
|
||||
boost::intrusive::constant_time_size<false>>;
|
||||
|
||||
private:
|
||||
reader_concurrency_semaphore& _sem;
|
||||
entries _entries;
|
||||
index _data_querier_index;
|
||||
index _mutation_querier_index;
|
||||
@@ -403,7 +412,7 @@ private:
|
||||
void scan_cache_entries();
|
||||
|
||||
public:
|
||||
explicit querier_cache(size_t max_cache_size = 1'000'000, std::chrono::seconds entry_ttl = default_entry_ttl);
|
||||
explicit querier_cache(reader_concurrency_semaphore& sem, size_t max_cache_size = 1'000'000, std::chrono::seconds entry_ttl = default_entry_ttl);
|
||||
|
||||
querier_cache(const querier_cache&) = delete;
|
||||
querier_cache& operator=(const querier_cache&) = delete;
|
||||
|
||||
@@ -136,6 +136,9 @@ public:
|
||||
const clustering_row_ranges& row_ranges(const schema&, const partition_key&) const;
|
||||
void set_range(const schema&, const partition_key&, clustering_row_ranges);
|
||||
void clear_range(const schema&, const partition_key&);
|
||||
/// Drop all specific ranges by resetting _specific_ranges to null.
void clear_ranges() { _specific_ranges = nullptr; }
|
||||
// FIXME: possibly make this function return a const ref instead.
|
||||
clustering_row_ranges get_all_ranges() const;
|
||||
|
||||
|
||||
181
reader_concurrency_semaphore.cc
Normal file
181
reader_concurrency_semaphore.cc
Normal file
@@ -0,0 +1,181 @@
|
||||
/*
|
||||
* Copyright (C) 2018 ScyllaDB
|
||||
*/
|
||||
|
||||
/*
|
||||
* This file is part of Scylla.
|
||||
*
|
||||
* Scylla is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Affero General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* Scylla is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#include <seastar/core/reactor.hh>
|
||||
|
||||
#include "reader_concurrency_semaphore.hh"
|
||||
|
||||
void reader_concurrency_semaphore::signal(const resources& r) {
|
||||
_resources += r;
|
||||
while (!_wait_list.empty() && has_available_units(_wait_list.front().res)) {
|
||||
auto& x = _wait_list.front();
|
||||
_resources -= x.res;
|
||||
x.pr.set_value(make_lw_shared<reader_permit>(*this, x.res));
|
||||
_wait_list.pop_front();
|
||||
}
|
||||
}
|
||||
|
||||
/// Register `ir` as an evictable inactive read.
///
/// If readers are already queued for admission, the read is evicted right
/// away and a default-constructed handle is returned. Otherwise the read is
/// stored under a freshly allocated id and a handle carrying that id is
/// returned.
reader_concurrency_semaphore::inactive_read_handle reader_concurrency_semaphore::register_inactive_read(std::unique_ptr<inactive_read> ir) {
    if (!_wait_list.empty()) {
        // New readers are not queued before all inactive reads have been
        // evicted, so a non-empty wait list implies there is nothing else to
        // evict. The evicted reader will release its permit, hopefully
        // allowing us to admit some readers from the _wait_list.
        ir->evict();
        return inactive_read_handle();
    }

    const auto id = _next_id++;
    _inactive_reads.emplace(id, std::move(ir));
    return inactive_read_handle(id);
}
|
||||
|
||||
/// Remove the inactive read identified by `irh` from the registry.
///
/// Returns the registered object when the id is still present, or a null
/// pointer when no entry with that id exists (e.g. it was already evicted).
std::unique_ptr<reader_concurrency_semaphore::inactive_read> reader_concurrency_semaphore::unregister_inactive_read(inactive_read_handle irh) {
    const auto pos = _inactive_reads.find(irh._id);
    if (pos == _inactive_reads.end()) {
        return {};
    }

    auto read = std::move(pos->second);
    _inactive_reads.erase(pos);
    return read;
}
|
||||
|
||||
/// Evict the registered inactive read with the lowest id, if there is one.
///
/// Returns true when a read was evicted and false when the registry was
/// empty.
bool reader_concurrency_semaphore::try_evict_one_inactive_read() {
    if (_inactive_reads.empty()) {
        return false;
    }
    auto it = _inactive_reads.begin();
    // Detach the entry before calling evict(), the same way wait_admission()
    // does: evict() runs arbitrary client code, and if that code touches
    // _inactive_reads the saved iterator must not be used afterwards.
    auto ir = std::move(it->second);
    _inactive_reads.erase(it);
    ir->evict();
    return true;
}
|
||||
|
||||
/// Request admission for a read that needs one count unit and `memory` bytes.
///
/// Fails immediately with the queue-overloaded exception when the wait list
/// has already reached _max_queue_length. Otherwise inactive reads are
/// evicted, lowest id first, until the request may proceed or none are left.
/// If the request may proceed, the resources are deducted and a ready future
/// holding the permit is returned; if not, the request is appended to the
/// wait list with the given timeout and the pending future is returned.
future<lw_shared_ptr<reader_concurrency_semaphore::reader_permit>> reader_concurrency_semaphore::wait_admission(size_t memory,
        db::timeout_clock::time_point timeout) {
    using permit_ptr = lw_shared_ptr<reader_permit>;

    if (_wait_list.size() >= _max_queue_length) {
        return make_exception_future<permit_ptr>(_make_queue_overloaded_exception());
    }

    auto needed = resources(1, static_cast<ssize_t>(memory));

    for (auto it = _inactive_reads.begin(); !may_proceed(needed) && it != _inactive_reads.end(); ) {
        // Take the read out of the registry first, then evict it.
        auto victim = std::move(it->second);
        it = _inactive_reads.erase(it);
        victim->evict();
    }

    if (!may_proceed(needed)) {
        promise<permit_ptr> pr;
        auto fut = pr.get_future();
        _wait_list.push_back(entry(std::move(pr), needed), timeout);
        return fut;
    }

    _resources -= needed;
    return make_ready_future<permit_ptr>(make_lw_shared<reader_permit>(*this, needed));
}
|
||||
|
||||
// A file that tracks the memory usage of buffers resulting from read
|
||||
// operations.
|
||||
class tracking_file_impl : public file_impl {
|
||||
file _tracked_file;
|
||||
lw_shared_ptr<reader_concurrency_semaphore::reader_permit> _permit;
|
||||
|
||||
// Shouldn't be called if semaphore is NULL.
|
||||
temporary_buffer<uint8_t> make_tracked_buf(temporary_buffer<uint8_t> buf) {
|
||||
return seastar::temporary_buffer<uint8_t>(buf.get_write(),
|
||||
buf.size(),
|
||||
make_deleter(buf.release(), std::bind(&reader_concurrency_semaphore::reader_permit::signal_memory, _permit, buf.size())));
|
||||
}
|
||||
|
||||
public:
|
||||
tracking_file_impl(file file, reader_resource_tracker resource_tracker)
|
||||
: _tracked_file(std::move(file))
|
||||
, _permit(resource_tracker.get_permit()) {
|
||||
}
|
||||
|
||||
tracking_file_impl(const tracking_file_impl&) = delete;
|
||||
tracking_file_impl& operator=(const tracking_file_impl&) = delete;
|
||||
tracking_file_impl(tracking_file_impl&&) = default;
|
||||
tracking_file_impl& operator=(tracking_file_impl&&) = default;
|
||||
|
||||
virtual future<size_t> write_dma(uint64_t pos, const void* buffer, size_t len, const io_priority_class& pc) override {
|
||||
return get_file_impl(_tracked_file)->write_dma(pos, buffer, len, pc);
|
||||
}
|
||||
|
||||
virtual future<size_t> write_dma(uint64_t pos, std::vector<iovec> iov, const io_priority_class& pc) override {
|
||||
return get_file_impl(_tracked_file)->write_dma(pos, std::move(iov), pc);
|
||||
}
|
||||
|
||||
virtual future<size_t> read_dma(uint64_t pos, void* buffer, size_t len, const io_priority_class& pc) override {
|
||||
return get_file_impl(_tracked_file)->read_dma(pos, buffer, len, pc);
|
||||
}
|
||||
|
||||
virtual future<size_t> read_dma(uint64_t pos, std::vector<iovec> iov, const io_priority_class& pc) override {
|
||||
return get_file_impl(_tracked_file)->read_dma(pos, iov, pc);
|
||||
}
|
||||
|
||||
virtual future<> flush(void) override {
|
||||
return get_file_impl(_tracked_file)->flush();
|
||||
}
|
||||
|
||||
virtual future<struct stat> stat(void) override {
|
||||
return get_file_impl(_tracked_file)->stat();
|
||||
}
|
||||
|
||||
virtual future<> truncate(uint64_t length) override {
|
||||
return get_file_impl(_tracked_file)->truncate(length);
|
||||
}
|
||||
|
||||
virtual future<> discard(uint64_t offset, uint64_t length) override {
|
||||
return get_file_impl(_tracked_file)->discard(offset, length);
|
||||
}
|
||||
|
||||
virtual future<> allocate(uint64_t position, uint64_t length) override {
|
||||
return get_file_impl(_tracked_file)->allocate(position, length);
|
||||
}
|
||||
|
||||
virtual future<uint64_t> size(void) override {
|
||||
return get_file_impl(_tracked_file)->size();
|
||||
}
|
||||
|
||||
virtual future<> close() override {
|
||||
return get_file_impl(_tracked_file)->close();
|
||||
}
|
||||
|
||||
virtual std::unique_ptr<file_handle_impl> dup() override {
|
||||
return get_file_impl(_tracked_file)->dup();
|
||||
}
|
||||
|
||||
virtual subscription<directory_entry> list_directory(std::function<future<> (directory_entry de)> next) override {
|
||||
return get_file_impl(_tracked_file)->list_directory(std::move(next));
|
||||
}
|
||||
|
||||
virtual future<temporary_buffer<uint8_t>> dma_read_bulk(uint64_t offset, size_t range_size, const io_priority_class& pc) override {
|
||||
return get_file_impl(_tracked_file)->dma_read_bulk(offset, range_size, pc).then([this] (temporary_buffer<uint8_t> buf) {
|
||||
if (_permit) {
|
||||
buf = make_tracked_buf(std::move(buf));
|
||||
_permit->consume_memory(buf.size());
|
||||
}
|
||||
return make_ready_future<temporary_buffer<uint8_t>>(std::move(buf));
|
||||
});
|
||||
}
|
||||
};
|
||||
|
||||
/// Wrap `f` in a tracking_file_impl bound to this resource tracker and
/// return the wrapper as a `file`.
file reader_resource_tracker::track(file f) const {
    auto tracked = make_shared<tracking_file_impl>(f, *this);
    return file(std::move(tracked));
}
|
||||
@@ -21,10 +21,14 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <core/file.hh>
|
||||
#include <core/semaphore.hh>
|
||||
#include <map>
|
||||
#include <seastar/core/file.hh>
|
||||
#include <seastar/core/future.hh>
|
||||
#include <seastar/core/semaphore.hh>
|
||||
#include "db/timeout_clock.hh"
|
||||
|
||||
using namespace seastar;
|
||||
|
||||
/// Specific semaphore for controlling reader concurrency
|
||||
///
|
||||
/// Before creating a reader one should obtain a permit by calling
|
||||
@@ -109,13 +113,24 @@ public:
|
||||
}
|
||||
};
|
||||
|
||||
/// Interface implemented by objects that can be registered with the
/// semaphore as inactive reads; evict() is the hook the semaphore calls to
/// get rid of the read.
class inactive_read {
public:
    virtual ~inactive_read() = default;

    virtual void evict() = 0;
};
|
||||
|
||||
/// Token identifying a registered inactive read by its numeric id.
///
/// Both constructors are private; only reader_concurrency_semaphore (a
/// friend) creates handles. A default-constructed handle carries id 0.
class inactive_read_handle {
    friend class reader_concurrency_semaphore;

    uint64_t _id = 0;

    inactive_read_handle() = default;

    explicit inactive_read_handle(uint64_t id) : _id(id) { }
};
|
||||
|
||||
private:
|
||||
static std::exception_ptr default_make_queue_overloaded_exception() {
|
||||
return std::make_exception_ptr(std::runtime_error("restricted mutation reader queue overload"));
|
||||
}
|
||||
|
||||
resources _resources;
|
||||
|
||||
struct entry {
|
||||
promise<lw_shared_ptr<reader_permit>> pr;
|
||||
resources res;
|
||||
@@ -126,11 +141,21 @@ private:
|
||||
e.pr.set_exception(semaphore_timed_out());
|
||||
}
|
||||
};
|
||||
|
||||
private:
|
||||
resources _resources;
|
||||
|
||||
expiring_fifo<entry, expiry_handler, db::timeout_clock> _wait_list;
|
||||
|
||||
size_t _max_queue_length = std::numeric_limits<size_t>::max();
|
||||
std::function<std::exception_ptr()> _make_queue_overloaded_exception = default_make_queue_overloaded_exception;
|
||||
std::function<bool()> _evict_an_inactive_reader;
|
||||
std::function<std::exception_ptr()> _make_queue_overloaded_exception;
|
||||
uint64_t _next_id = 1;
|
||||
std::map<uint64_t, std::unique_ptr<inactive_read>> _inactive_reads;
|
||||
|
||||
private:
|
||||
static std::exception_ptr default_make_queue_overloaded_exception() {
|
||||
return std::make_exception_ptr(std::runtime_error("restricted mutation reader queue overload"));
|
||||
}
|
||||
|
||||
bool has_available_units(const resources& r) const {
|
||||
return bool(_resources) && _resources >= r;
|
||||
@@ -153,12 +178,10 @@ public:
|
||||
reader_concurrency_semaphore(unsigned count,
|
||||
size_t memory,
|
||||
size_t max_queue_length = std::numeric_limits<size_t>::max(),
|
||||
std::function<std::exception_ptr()> raise_queue_overloaded_exception = default_make_queue_overloaded_exception,
|
||||
std::function<bool()> evict_an_inactive_reader = {})
|
||||
std::function<std::exception_ptr()> raise_queue_overloaded_exception = default_make_queue_overloaded_exception)
|
||||
: _resources(count, memory)
|
||||
, _max_queue_length(max_queue_length)
|
||||
, _make_queue_overloaded_exception(raise_queue_overloaded_exception)
|
||||
, _evict_an_inactive_reader(std::move(evict_an_inactive_reader)) {
|
||||
, _make_queue_overloaded_exception(raise_queue_overloaded_exception) {
|
||||
}
|
||||
|
||||
reader_concurrency_semaphore(const reader_concurrency_semaphore&) = delete;
|
||||
@@ -167,6 +190,35 @@ public:
|
||||
reader_concurrency_semaphore(reader_concurrency_semaphore&&) = delete;
|
||||
reader_concurrency_semaphore& operator=(reader_concurrency_semaphore&&) = delete;
|
||||
|
||||
/// Register an inactive read.
|
||||
///
|
||||
/// The semaphore will evict this read when there is a shortage of
|
||||
/// permits. This might be immediate, during this register call.
|
||||
/// Clients can use the returned handle to unregister the read, when it
|
||||
/// stops being inactive and hence evictable.
|
||||
///
|
||||
/// An inactive read is an object implementing the `inactive_read`
|
||||
/// interface.
|
||||
/// The semaphore takes ownership of the created object and destroys it if
|
||||
/// it is evicted.
|
||||
inactive_read_handle register_inactive_read(std::unique_ptr<inactive_read> ir);
|
||||
|
||||
/// Unregister the previously registered inactive read.
|
||||
///
|
||||
/// If the read was not evicted, the inactive read object, passed in to the
|
||||
/// register call, will be returned. Otherwise a nullptr is returned.
|
||||
std::unique_ptr<inactive_read> unregister_inactive_read(inactive_read_handle irh);
|
||||
|
||||
/// Try to evict an inactive read.
|
||||
///
|
||||
/// Return true if an inactive read was evicted and false otherwise
|
||||
/// (if there was no reader to evict).
|
||||
bool try_evict_one_inactive_read();
|
||||
|
||||
void clear_inactive_reads() {
|
||||
_inactive_reads.clear();
|
||||
}
|
||||
|
||||
future<lw_shared_ptr<reader_permit>> wait_admission(size_t memory, db::timeout_clock::time_point timeout = db::no_timeout);
|
||||
|
||||
const resources available_resources() const {
|
||||
|
||||
@@ -25,6 +25,7 @@
|
||||
#include "atomic_cell_hash.hh"
|
||||
#include "streaming/stream_plan.hh"
|
||||
#include "streaming/stream_state.hh"
|
||||
#include "streaming/stream_reason.hh"
|
||||
#include "gms/inet_address.hh"
|
||||
#include "db/config.hh"
|
||||
#include "service/storage_service.hh"
|
||||
@@ -95,8 +96,8 @@ public:
|
||||
future<> do_streaming() {
|
||||
size_t ranges_in = 0;
|
||||
size_t ranges_out = 0;
|
||||
_sp_in = make_lw_shared<streaming::stream_plan>(sprint("repair-in-id-%d-shard-%d-index-%d", id, shard, sp_index));
|
||||
_sp_out = make_lw_shared<streaming::stream_plan>(sprint("repair-out-id-%d-shard-%d-index-%d", id, shard, sp_index));
|
||||
_sp_in = make_lw_shared<streaming::stream_plan>(sprint("repair-in-id-%d-shard-%d-index-%d", id, shard, sp_index), streaming::stream_reason::repair);
|
||||
_sp_out = make_lw_shared<streaming::stream_plan>(sprint("repair-out-id-%d-shard-%d-index-%d", id, shard, sp_index), streaming::stream_reason::repair);
|
||||
|
||||
for (auto& x : ranges_need_repair_in) {
|
||||
auto& peer = x.first;
|
||||
|
||||
34
schema.cc
34
schema.cc
@@ -193,7 +193,9 @@ const std::vector<column_definition>& v3_columns::all_columns() const {
|
||||
void schema::rebuild() {
|
||||
_partition_key_type = make_lw_shared<compound_type<>>(get_column_types(partition_key_columns()));
|
||||
_clustering_key_type = make_lw_shared<compound_prefix>(get_column_types(clustering_key_columns()));
|
||||
|
||||
_clustering_key_size = column_offset(column_kind::static_column) - column_offset(column_kind::clustering_key);
|
||||
_regular_column_count = _raw._columns.size() - column_offset(column_kind::regular_column);
|
||||
_static_column_count = column_offset(column_kind::regular_column) - column_offset(column_kind::static_column);
|
||||
_columns_by_name.clear();
|
||||
|
||||
for (const column_definition& def : all_columns()) {
|
||||
@@ -1121,26 +1123,26 @@ schema::has_static_columns() const {
|
||||
return !static_columns().empty();
|
||||
}
|
||||
|
||||
column_count_type
|
||||
schema::columns_count(column_kind kind) const {
|
||||
switch (kind) {
|
||||
case column_kind::partition_key:
|
||||
return partition_key_size();
|
||||
case column_kind::clustering_key:
|
||||
return clustering_key_size();
|
||||
case column_kind::static_column:
|
||||
return static_columns_count();
|
||||
case column_kind::regular_column:
|
||||
return regular_columns_count();
|
||||
default:
|
||||
std::abort();
|
||||
}
|
||||
}
|
||||
column_count_type
|
||||
schema::partition_key_size() const {
|
||||
return column_offset(column_kind::clustering_key);
|
||||
}
|
||||
|
||||
column_count_type
|
||||
schema::clustering_key_size() const {
|
||||
return column_offset(column_kind::static_column) - column_offset(column_kind::clustering_key);
|
||||
}
|
||||
|
||||
column_count_type
|
||||
schema::static_columns_count() const {
|
||||
return column_offset(column_kind::regular_column) - column_offset(column_kind::static_column);
|
||||
}
|
||||
|
||||
column_count_type
|
||||
schema::regular_columns_count() const {
|
||||
return _raw._columns.size() - column_offset(column_kind::regular_column);
|
||||
}
|
||||
|
||||
schema::const_iterator_range_type
|
||||
schema::partition_key_columns() const {
|
||||
return boost::make_iterator_range(_raw._columns.begin() + column_offset(column_kind::partition_key)
|
||||
|
||||
10
schema.hh
10
schema.hh
@@ -529,6 +529,9 @@ private:
|
||||
lw_shared_ptr<compound_type<allow_prefixes::yes>> _clustering_key_type;
|
||||
column_mapping _column_mapping;
|
||||
shared_ptr<query::partition_slice> _full_slice;
|
||||
column_count_type _clustering_key_size;
|
||||
column_count_type _regular_column_count;
|
||||
column_count_type _static_column_count;
|
||||
|
||||
extensions_map& extensions() {
|
||||
return _raw._extensions;
|
||||
@@ -701,10 +704,11 @@ public:
|
||||
bool is_last_partition_key(const column_definition& def) const;
|
||||
bool has_multi_cell_collections() const;
|
||||
bool has_static_columns() const;
|
||||
column_count_type columns_count(column_kind kind) const;
|
||||
column_count_type partition_key_size() const;
|
||||
column_count_type clustering_key_size() const;
|
||||
column_count_type static_columns_count() const;
|
||||
column_count_type regular_columns_count() const;
|
||||
column_count_type clustering_key_size() const { return _clustering_key_size; }
|
||||
column_count_type static_columns_count() const { return _static_column_count; }
|
||||
column_count_type regular_columns_count() const { return _regular_column_count; }
|
||||
// Returns a range of column definitions
|
||||
const_iterator_range_type partition_key_columns() const;
|
||||
// Returns a range of column definitions
|
||||
|
||||
2
seastar
2
seastar
Submodule seastar updated: 57128167aa...1651a2ac89
@@ -514,7 +514,12 @@ future<> migration_manager::announce_new_keyspace(lw_shared_ptr<keyspace_metadat
|
||||
return announce(std::move(mutations), announce_locally);
|
||||
}
|
||||
|
||||
future<> migration_manager::announce_new_column_family(schema_ptr cfm, bool announce_locally) {
|
||||
future<> migration_manager::announce_new_column_family(schema_ptr cfm, bool announce_locally)
|
||||
{
|
||||
return announce_new_column_family(std::move(cfm), api::new_timestamp(), announce_locally);
|
||||
}
|
||||
|
||||
future<> migration_manager::announce_new_column_family(schema_ptr cfm, api::timestamp_type timestamp, bool announce_locally) {
|
||||
#if 0
|
||||
cfm.validate();
|
||||
#endif
|
||||
@@ -525,7 +530,7 @@ future<> migration_manager::announce_new_column_family(schema_ptr cfm, bool anno
|
||||
throw exceptions::already_exists_exception(cfm->ks_name(), cfm->cf_name());
|
||||
}
|
||||
mlogger.info("Create new ColumnFamily: {}", cfm);
|
||||
return db::schema_tables::make_create_table_mutations(keyspace.metadata(), cfm, api::new_timestamp())
|
||||
return db::schema_tables::make_create_table_mutations(keyspace.metadata(), cfm, timestamp)
|
||||
.then([announce_locally, this] (auto&& mutations) {
|
||||
return announce(std::move(mutations), announce_locally);
|
||||
});
|
||||
|
||||
@@ -111,6 +111,8 @@ public:
|
||||
|
||||
future<> announce_new_column_family(schema_ptr cfm, bool announce_locally = false);
|
||||
|
||||
future<> announce_new_column_family(schema_ptr cfm, api::timestamp_type timestamp, bool announce_locally = false);
|
||||
|
||||
future<> announce_new_type(user_type new_type, bool announce_locally = false);
|
||||
|
||||
future<> announce_type_update(user_type updated_type, bool announce_locally = false);
|
||||
|
||||
@@ -151,6 +151,10 @@ protected:
|
||||
void handle_result(Visitor&& visitor,
|
||||
const foreign_ptr<lw_shared_ptr<query::result>>& results,
|
||||
uint32_t page_size, gc_clock::time_point now);
|
||||
|
||||
virtual uint32_t max_rows_to_fetch(uint32_t page_size) {
|
||||
return std::min(_max, page_size);
|
||||
}
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
@@ -55,7 +55,7 @@ struct noop_visitor {
|
||||
void accept_new_partition(const partition_key& key, uint32_t row_count) { }
|
||||
void accept_new_row(const clustering_key& key, const query::result_row_view& static_row, const query::result_row_view& row) { }
|
||||
void accept_new_row(const query::result_row_view& static_row, const query::result_row_view& row) { }
|
||||
void accept_partition_end(const query::result_row_view& static_row) { }
|
||||
uint32_t accept_partition_end(const query::result_row_view& static_row) { return 0; }
|
||||
};
|
||||
|
||||
static bool has_clustering_keys(const schema& s, const query::read_command& cmd) {
|
||||
@@ -202,7 +202,7 @@ static bool has_clustering_keys(const schema& s, const query::read_command& cmd)
|
||||
}
|
||||
}
|
||||
|
||||
auto max_rows = std::min(_max, page_size);
|
||||
auto max_rows = max_rows_to_fetch(page_size);
|
||||
|
||||
// We always need PK so we can determine where to start next.
|
||||
_cmd->slice.options.set<query::partition_slice::option::send_partition_key>();
|
||||
@@ -284,6 +284,10 @@ public:
|
||||
std::move(qr.query_result), page_size, now);
|
||||
});
|
||||
}
|
||||
protected:
|
||||
virtual uint32_t max_rows_to_fetch(uint32_t page_size) override {
|
||||
return page_size;
|
||||
}
|
||||
};
|
||||
|
||||
template<typename Base>
|
||||
@@ -291,6 +295,7 @@ class query_pager::query_result_visitor : public Base {
|
||||
using visitor = Base;
|
||||
public:
|
||||
uint32_t total_rows = 0;
|
||||
uint32_t dropped_rows = 0;
|
||||
std::experimental::optional<partition_key> last_pkey;
|
||||
std::experimental::optional<clustering_key> last_ckey;
|
||||
|
||||
@@ -317,7 +322,7 @@ public:
|
||||
visitor::accept_new_row(static_row, row);
|
||||
}
|
||||
void accept_partition_end(const query::result_row_view& static_row) {
|
||||
visitor::accept_partition_end(static_row);
|
||||
dropped_rows += visitor::accept_partition_end(static_row);
|
||||
}
|
||||
};
|
||||
|
||||
@@ -348,9 +353,9 @@ public:
|
||||
update_slice(*_last_pkey);
|
||||
}
|
||||
|
||||
row_count = v.total_rows;
|
||||
row_count = v.total_rows - v.dropped_rows;
|
||||
_max = _max - row_count;
|
||||
_exhausted = (v.total_rows < page_size && !results->is_short_read()) || _max == 0;
|
||||
_exhausted = (v.total_rows < page_size && !results->is_short_read() && v.dropped_rows == 0) || _max == 0;
|
||||
_last_pkey = v.last_pkey;
|
||||
_last_ckey = v.last_ckey;
|
||||
} else {
|
||||
@@ -379,7 +384,7 @@ public:
|
||||
}
|
||||
|
||||
::shared_ptr<const paging_state> query_pager::state() const {
|
||||
return ::make_shared<paging_state>(*_last_pkey, _last_ckey, _exhausted ? 0 : _max, _cmd->query_uuid, _last_replicas, _query_read_repair_decision);
|
||||
return ::make_shared<paging_state>(_last_pkey.value_or(partition_key::make_empty()), _last_ckey, _exhausted ? 0 : _max, _cmd->query_uuid, _last_replicas, _query_read_repair_decision);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -64,6 +64,7 @@
|
||||
#include <boost/range/adaptors.hpp>
|
||||
#include <boost/algorithm/cxx11/any_of.hpp>
|
||||
#include <boost/algorithm/cxx11/none_of.hpp>
|
||||
#include <boost/algorithm/cxx11/partition_copy.hpp>
|
||||
#include <boost/range/algorithm/count_if.hpp>
|
||||
#include <boost/range/algorithm/find.hpp>
|
||||
#include <boost/range/algorithm/find_if.hpp>
|
||||
@@ -170,10 +171,13 @@ public:
|
||||
class shared_mutation : public mutation_holder {
|
||||
lw_shared_ptr<const frozen_mutation> _mutation;
|
||||
public:
|
||||
shared_mutation(const mutation& m) : _mutation(make_lw_shared<const frozen_mutation>(freeze(m))) {
|
||||
explicit shared_mutation(frozen_mutation_and_schema&& fm_a_s)
|
||||
: _mutation(make_lw_shared<const frozen_mutation>(std::move(fm_a_s.fm))) {
|
||||
_size = _mutation->representation().size();
|
||||
_schema = m.schema();
|
||||
};
|
||||
_schema = std::move(fm_a_s.s);
|
||||
}
|
||||
explicit shared_mutation(const mutation& m) : shared_mutation(frozen_mutation_and_schema{freeze(m), m.schema()}) {
|
||||
}
|
||||
lw_shared_ptr<const frozen_mutation> get_mutation_for(gms::inet_address ep) override {
|
||||
return _mutation;
|
||||
}
|
||||
@@ -206,7 +210,8 @@ protected:
|
||||
FAILURE,
|
||||
};
|
||||
error _error = error::NONE;
|
||||
size_t _failed = 0;
|
||||
size_t _failed = 0; // only failures that may impact consistency
|
||||
size_t _all_failures = 0; // total amount of failures
|
||||
size_t _total_endpoints = 0;
|
||||
storage_proxy::write_stats& _stats;
|
||||
|
||||
@@ -295,6 +300,50 @@ public:
|
||||
}
|
||||
return _targets.size() == 0;
|
||||
}
|
||||
// return true if handler is no longer needed because
|
||||
// CL cannot be reached
|
||||
bool failure_response(gms::inet_address from, size_t count) {
|
||||
auto it = _targets.find(from);
|
||||
if (it == _targets.end()) {
|
||||
// There is a small chance we can get an outdated reply
|
||||
// if the coordinator was restarted after sending a request and
|
||||
// getting reply back. The chance is low though since initial
|
||||
// request id is initialized to server starting time
|
||||
slogger.warn("Receive outdated write failure from {}", from);
|
||||
return false;
|
||||
}
|
||||
_all_failures += count;
|
||||
// we should not fail CL=ANY requests since they may succeed after
|
||||
// writing hints
|
||||
return _cl != db::consistency_level::ANY && failure(from, count);
|
||||
}
|
||||
void check_for_early_completion() {
|
||||
if (_all_failures == _targets.size()) {
|
||||
// all leftover targets have reported an error, so there is nothing to wait for any longer
|
||||
timeout_cb();
|
||||
}
|
||||
}
|
||||
void timeout_cb() {
|
||||
if (_cl_achieved || _cl == db::consistency_level::ANY) {
|
||||
// we are here because either cl was achieved, but targets left in the handler are not
|
||||
// responding, so a hint should be written for them, or cl == any in which case
|
||||
// hints are counted towards consistency, so we need to write hints and count how many were written
|
||||
auto hints = _proxy->hint_to_dead_endpoints(_mutation_holder, get_targets(), _type, get_trace_state());
|
||||
signal(hints);
|
||||
if (_cl == db::consistency_level::ANY && hints) {
|
||||
slogger.trace("Wrote hint to satisfy CL.ANY after no replicas acknowledged the write");
|
||||
}
|
||||
if (_cl_achieved) { // For CL=ANY this can still be false
|
||||
for (auto&& ep : get_targets()) {
|
||||
++stats().background_replica_writes_failed.get_ep_stat(ep);
|
||||
}
|
||||
stats().background_writes_failed += int(!_targets.empty());
|
||||
}
|
||||
}
|
||||
|
||||
on_timeout();
|
||||
_proxy->remove_response_handler(_id);
|
||||
}
|
||||
future<> wait() {
|
||||
return _ready.get_future();
|
||||
}
|
||||
@@ -319,6 +368,9 @@ public:
|
||||
const tracing::trace_state_ptr& get_trace_state() const {
|
||||
return _trace_state;
|
||||
}
|
||||
storage_proxy::write_stats& stats() {
|
||||
return _stats;
|
||||
}
|
||||
friend storage_proxy;
|
||||
};
|
||||
|
||||
@@ -458,22 +510,7 @@ void storage_proxy::unthrottle() {
|
||||
|
||||
storage_proxy::response_id_type storage_proxy::register_response_handler(shared_ptr<abstract_write_response_handler>&& h) {
|
||||
auto id = h->id();
|
||||
auto e = _response_handlers.emplace(id, rh_entry(std::move(h), [this, id] {
|
||||
auto& e = _response_handlers.find(id)->second;
|
||||
if (e.handler->_cl_achieved || e.handler->_cl == db::consistency_level::ANY) {
|
||||
// we are here because either cl was achieved, but targets left in the handler are not
|
||||
// responding, so a hint should be written for them, or cl == any in which case
|
||||
// hints are counted towards consistency, so we need to write hints and count how many were written
|
||||
auto hints = hint_to_dead_endpoints(e.handler->_mutation_holder, e.handler->get_targets(), e.handler->_type, e.handler->get_trace_state());
|
||||
e.handler->signal(hints);
|
||||
if (e.handler->_cl == db::consistency_level::ANY && hints) {
|
||||
slogger.trace("Wrote hint to satisfy CL.ANY after no replicas acknowledged the write");
|
||||
}
|
||||
}
|
||||
|
||||
e.handler->on_timeout();
|
||||
remove_response_handler(id);
|
||||
}));
|
||||
auto e = _response_handlers.emplace(id, std::move(h));
|
||||
assert(e.second);
|
||||
return id;
|
||||
}
|
||||
@@ -488,6 +525,8 @@ void storage_proxy::got_response(storage_proxy::response_id_type id, gms::inet_a
|
||||
tracing::trace(it->second.handler->get_trace_state(), "Got a response from /{}", from);
|
||||
if (it->second.handler->response(from)) {
|
||||
remove_response_handler(id); // last one, remove entry. Will cancel expiration timer too.
|
||||
} else {
|
||||
it->second.handler->check_for_early_completion();
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -496,8 +535,10 @@ void storage_proxy::got_failure_response(storage_proxy::response_id_type id, gms
|
||||
auto it = _response_handlers.find(id);
|
||||
if (it != _response_handlers.end()) {
|
||||
tracing::trace(it->second.handler->get_trace_state(), "Got {} failures from /{}", count, from);
|
||||
if (it->second.handler->failure(from, count)) {
|
||||
remove_response_handler(id); // last one, remove entry. Will cancel expiration timer too.
|
||||
if (it->second.handler->failure_response(from, count)) {
|
||||
remove_response_handler(id);
|
||||
} else {
|
||||
it->second.handler->check_for_early_completion();
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -544,22 +585,26 @@ storage_proxy_stats::split_stats::split_stats(const sstring& category, const sst
|
||||
storage_proxy_stats::write_stats::write_stats()
|
||||
: writes_attempts(storage_proxy::COORDINATOR_STATS_CATEGORY, "total_write_attempts", "total number of write requests", "mutation_data")
|
||||
, writes_errors(storage_proxy::COORDINATOR_STATS_CATEGORY, "write_errors", "number of write requests that failed", "mutation_data")
|
||||
, background_replica_writes_failed(storage_proxy::COORDINATOR_STATS_CATEGORY, "background_replica_writes_failed", "number of replica writes that timed out or failed after CL was reached", "mutation_data")
|
||||
, read_repair_write_attempts(storage_proxy::COORDINATOR_STATS_CATEGORY, "read_repair_write_attempts", "number of write operations in a read repair context", "mutation_data") { }
|
||||
|
||||
storage_proxy_stats::write_stats::write_stats(const sstring& category, bool auto_register_stats)
|
||||
: writes_attempts(category, "total_write_attempts", "total number of write requests", "mutation_data", auto_register_stats)
|
||||
, writes_errors(category, "write_errors", "number of write requests that failed", "mutation_data", auto_register_stats)
|
||||
, background_replica_writes_failed(category, "background_replica_writes_failed", "number of replica writes that timed out or failed after CL was reached", "mutation_data", auto_register_stats)
|
||||
, read_repair_write_attempts(category, "read_repair_write_attempts", "number of write operations in a read repair context", "mutation_data", auto_register_stats) { }
|
||||
|
||||
void storage_proxy_stats::write_stats::register_metrics_local() {
|
||||
writes_attempts.register_metrics_local();
|
||||
writes_errors.register_metrics_local();
|
||||
background_replica_writes_failed.register_metrics_local();
|
||||
read_repair_write_attempts.register_metrics_local();
|
||||
}
|
||||
|
||||
void storage_proxy_stats::write_stats::register_metrics_for(gms::inet_address ep) {
|
||||
writes_attempts.register_metrics_for(ep);
|
||||
writes_errors.register_metrics_for(ep);
|
||||
background_replica_writes_failed.register_metrics_for(ep);
|
||||
read_repair_write_attempts.register_metrics_for(ep);
|
||||
}
|
||||
|
||||
@@ -709,6 +754,9 @@ storage_proxy::storage_proxy(distributed<database>& db, storage_proxy::config cf
|
||||
|
||||
sm::make_total_operations("speculative_data_reads", [this] { return _stats.speculative_data_reads; },
|
||||
sm::description("number of speculative data read requests that were sent")),
|
||||
|
||||
sm::make_total_operations("background_writes_failed", [this] { return _stats.background_writes_failed; },
|
||||
sm::description("number of write requests that failed after CL was reached")),
|
||||
});
|
||||
|
||||
_metrics.add_group(REPLICA_STATS_CATEGORY, {
|
||||
@@ -753,7 +801,7 @@ storage_proxy::storage_proxy(distributed<database>& db, storage_proxy::config cf
|
||||
_hints_resource_manager.register_manager(_hints_for_views_manager);
|
||||
}
|
||||
|
||||
storage_proxy::rh_entry::rh_entry(shared_ptr<abstract_write_response_handler>&& h, std::function<void()>&& cb) : handler(std::move(h)), expire_timer(std::move(cb)) {}
|
||||
storage_proxy::rh_entry::rh_entry(shared_ptr<abstract_write_response_handler>&& h) : handler(std::move(h)), expire_timer([this] { handler->timeout_cb(); }) {}
|
||||
|
||||
storage_proxy::unique_response_handler::unique_response_handler(storage_proxy& p_, response_id_type id_) : id(id_), p(p_) {}
|
||||
storage_proxy::unique_response_handler::unique_response_handler(unique_response_handler&& x) : id(x.id), p(x.p) { x.id = 0; };
|
||||
@@ -1292,28 +1340,28 @@ storage_proxy::hint_to_dead_endpoints(response_id_type id, db::consistency_level
|
||||
}
|
||||
|
||||
template<typename Range, typename CreateWriteHandler>
|
||||
future<std::vector<storage_proxy::unique_response_handler>> storage_proxy::mutate_prepare(const Range& mutations, db::consistency_level cl, db::write_type type, CreateWriteHandler create_handler) {
|
||||
future<std::vector<storage_proxy::unique_response_handler>> storage_proxy::mutate_prepare(Range&& mutations, db::consistency_level cl, db::write_type type, CreateWriteHandler create_handler) {
|
||||
// apply is used to convert exceptions to an exceptional future
|
||||
return futurize<std::vector<storage_proxy::unique_response_handler>>::apply([this] (const Range& mutations, db::consistency_level cl, db::write_type type, CreateWriteHandler create_handler) {
|
||||
return futurize<std::vector<storage_proxy::unique_response_handler>>::apply([this] (Range&& mutations, db::consistency_level cl, db::write_type type, CreateWriteHandler create_handler) {
|
||||
std::vector<unique_response_handler> ids;
|
||||
ids.reserve(std::distance(std::begin(mutations), std::end(mutations)));
|
||||
for (auto& m : mutations) {
|
||||
ids.emplace_back(*this, create_handler(m, cl, type));
|
||||
}
|
||||
return make_ready_future<std::vector<unique_response_handler>>(std::move(ids));
|
||||
}, mutations, cl, type, std::move(create_handler));
|
||||
}, std::forward<Range>(mutations), cl, type, std::move(create_handler));
|
||||
}
|
||||
|
||||
template<typename Range>
|
||||
future<std::vector<storage_proxy::unique_response_handler>> storage_proxy::mutate_prepare(const Range& mutations, db::consistency_level cl, db::write_type type, tracing::trace_state_ptr tr_state) {
|
||||
return mutate_prepare<>(mutations, cl, type, [this, tr_state = std::move(tr_state)] (const typename Range::value_type& m, db::consistency_level cl, db::write_type type) mutable {
|
||||
future<std::vector<storage_proxy::unique_response_handler>> storage_proxy::mutate_prepare(Range&& mutations, db::consistency_level cl, db::write_type type, tracing::trace_state_ptr tr_state) {
|
||||
return mutate_prepare<>(std::forward<Range>(mutations), cl, type, [this, tr_state = std::move(tr_state)] (const typename std::decay_t<Range>::value_type& m, db::consistency_level cl, db::write_type type) mutable {
|
||||
return create_write_response_handler(m, cl, type, tr_state);
|
||||
});
|
||||
}
|
||||
|
||||
future<> storage_proxy::mutate_begin(std::vector<unique_response_handler> ids, db::consistency_level cl,
|
||||
write_stats& stats, stdx::optional<clock_type::time_point> timeout_opt) {
|
||||
return parallel_for_each(ids, [this, cl, timeout_opt, &stats] (unique_response_handler& protected_response) {
|
||||
stdx::optional<clock_type::time_point> timeout_opt) {
|
||||
return parallel_for_each(ids, [this, cl, timeout_opt] (unique_response_handler& protected_response) {
|
||||
auto response_id = protected_response.id;
|
||||
// it is better to send first and hint afterwards to reduce latency
|
||||
// but request may complete before hint_to_dead_endpoints() is called and
|
||||
@@ -1324,7 +1372,7 @@ future<> storage_proxy::mutate_begin(std::vector<unique_response_handler> ids, d
|
||||
auto timeout = timeout_opt.value_or(clock_type::now() + std::chrono::milliseconds(_db.local().get_config().write_request_timeout_in_ms()));
|
||||
// call before send_to_live_endpoints() for the same reason as above
|
||||
auto f = response_wait(response_id, timeout);
|
||||
send_to_live_endpoints(protected_response.release(), timeout, stats); // response is now running and it will either complete or timeout
|
||||
send_to_live_endpoints(protected_response.release(), timeout); // response is now running and it will either complete or timeout
|
||||
return std::move(f);
|
||||
});
|
||||
}
|
||||
@@ -1503,7 +1551,7 @@ storage_proxy::mutate_internal(Range mutations, db::consistency_level cl, bool c
|
||||
lc.start();
|
||||
|
||||
return mutate_prepare(mutations, cl, type, tr_state).then([this, cl, timeout_opt] (std::vector<storage_proxy::unique_response_handler> ids) {
|
||||
return mutate_begin(std::move(ids), cl, _stats, timeout_opt);
|
||||
return mutate_begin(std::move(ids), cl, timeout_opt);
|
||||
}).then_wrapped([this, p = shared_from_this(), lc, tr_state] (future<> f) mutable {
|
||||
return p->mutate_end(std::move(f), lc, _stats, std::move(tr_state));
|
||||
});
|
||||
@@ -1590,7 +1638,7 @@ storage_proxy::mutate_atomically(std::vector<mutation> mutations, db::consistenc
|
||||
auto& ks = _p._db.local().find_keyspace(m.schema()->ks_name());
|
||||
return _p.create_write_response_handler(ks, cl, type, std::make_unique<shared_mutation>(m), _batchlog_endpoints, {}, {}, _trace_state, _stats);
|
||||
}).then([this, cl] (std::vector<unique_response_handler> ids) {
|
||||
return _p.mutate_begin(std::move(ids), cl, _stats, _timeout);
|
||||
return _p.mutate_begin(std::move(ids), cl, _timeout);
|
||||
});
|
||||
}
|
||||
future<> sync_write_to_batchlog() {
|
||||
@@ -1616,7 +1664,7 @@ storage_proxy::mutate_atomically(std::vector<mutation> mutations, db::consistenc
|
||||
return _p.mutate_prepare(_mutations, _cl, db::write_type::BATCH, _trace_state).then([this] (std::vector<unique_response_handler> ids) {
|
||||
return sync_write_to_batchlog().then([this, ids = std::move(ids)] () mutable {
|
||||
tracing::trace(_trace_state, "Sending batch mutations");
|
||||
return _p.mutate_begin(std::move(ids), _cl, _stats, _timeout);
|
||||
return _p.mutate_begin(std::move(ids), _cl, _timeout);
|
||||
}).then(std::bind(&context::async_remove_from_batchlog, this));
|
||||
});
|
||||
}
|
||||
@@ -1644,7 +1692,7 @@ bool storage_proxy::cannot_hint(const Range& targets, db::write_type type) {
|
||||
}
|
||||
|
||||
future<> storage_proxy::send_to_endpoint(
|
||||
mutation m,
|
||||
std::unique_ptr<mutation_holder> m,
|
||||
gms::inet_address target,
|
||||
std::vector<gms::inet_address> pending_endpoints,
|
||||
db::write_type type,
|
||||
@@ -1654,29 +1702,78 @@ future<> storage_proxy::send_to_endpoint(
|
||||
|
||||
// View updates use consistency level ANY in order to fall back to hinted handoff in case of a failed update
|
||||
db::consistency_level cl = (type == db::write_type::VIEW) ? db::consistency_level::ANY : db::consistency_level::ONE;
|
||||
std::unordered_set<gms::inet_address> targets(pending_endpoints.begin(), pending_endpoints.end());
|
||||
targets.insert(std::move(target));
|
||||
return mutate_prepare(std::array<mutation, 1>{std::move(m)}, cl, type,
|
||||
[this, targets = std::move(targets), pending_endpoints = std::move(pending_endpoints), &stats] (
|
||||
const mutation& m,
|
||||
return mutate_prepare(std::array{std::move(m)}, cl, type,
|
||||
[this, target = std::array{target}, pending_endpoints = std::move(pending_endpoints), &stats] (
|
||||
std::unique_ptr<mutation_holder>& m,
|
||||
db::consistency_level cl,
|
||||
db::write_type type) mutable {
|
||||
auto& ks = _db.local().find_keyspace(m.schema()->ks_name());
|
||||
return create_write_response_handler(
|
||||
ks,
|
||||
cl,
|
||||
type,
|
||||
std::make_unique<shared_mutation>(m),
|
||||
std::move(targets),
|
||||
pending_endpoints,
|
||||
{ },
|
||||
nullptr,
|
||||
stats);
|
||||
}).then([this, &stats, cl] (std::vector<unique_response_handler> ids) {
|
||||
return mutate_begin(std::move(ids), cl, stats);
|
||||
std::unordered_set<gms::inet_address> targets;
|
||||
targets.reserve(pending_endpoints.size() + 1);
|
||||
std::vector<gms::inet_address> dead_endpoints;
|
||||
boost::algorithm::partition_copy(
|
||||
boost::range::join(pending_endpoints, target),
|
||||
std::inserter(targets, targets.begin()),
|
||||
std::back_inserter(dead_endpoints),
|
||||
[] (gms::inet_address ep) { return gms::get_local_failure_detector().is_alive(ep); });
|
||||
auto& ks = _db.local().find_keyspace(m->schema()->ks_name());
|
||||
slogger.trace("Creating write handler with live: {}; dead: {}", targets, dead_endpoints);
|
||||
db::assure_sufficient_live_nodes(cl, ks, targets, pending_endpoints);
|
||||
return create_write_response_handler(
|
||||
ks,
|
||||
cl,
|
||||
type,
|
||||
std::move(m),
|
||||
std::move(targets),
|
||||
pending_endpoints,
|
||||
std::move(dead_endpoints),
|
||||
nullptr,
|
||||
stats);
|
||||
}).then([this, cl] (std::vector<unique_response_handler> ids) {
|
||||
return mutate_begin(std::move(ids), cl);
|
||||
}).then_wrapped([p = shared_from_this(), lc, &stats] (future<>&& f) {
|
||||
return p->mutate_end(std::move(f), lc, stats, nullptr);
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
future<> storage_proxy::send_to_endpoint(
|
||||
frozen_mutation_and_schema fm_a_s,
|
||||
gms::inet_address target,
|
||||
std::vector<gms::inet_address> pending_endpoints,
|
||||
db::write_type type) {
|
||||
return send_to_endpoint(
|
||||
std::make_unique<shared_mutation>(std::move(fm_a_s)),
|
||||
std::move(target),
|
||||
std::move(pending_endpoints),
|
||||
type,
|
||||
_stats);
|
||||
}
|
||||
|
||||
future<> storage_proxy::send_to_endpoint(
|
||||
frozen_mutation_and_schema fm_a_s,
|
||||
gms::inet_address target,
|
||||
std::vector<gms::inet_address> pending_endpoints,
|
||||
db::write_type type,
|
||||
write_stats& stats) {
|
||||
return send_to_endpoint(
|
||||
std::make_unique<shared_mutation>(std::move(fm_a_s)),
|
||||
std::move(target),
|
||||
std::move(pending_endpoints),
|
||||
type,
|
||||
stats);
|
||||
}
|
||||
|
||||
future<> storage_proxy::send_to_endpoint(
|
||||
mutation m,
|
||||
gms::inet_address target,
|
||||
std::vector<gms::inet_address> pending_endpoints,
|
||||
db::write_type type,
|
||||
write_stats& stats) {
|
||||
return send_to_endpoint(
|
||||
std::make_unique<shared_mutation>(m),
|
||||
std::move(target),
|
||||
std::move(pending_endpoints),
|
||||
type,
|
||||
stats);
|
||||
}
|
||||
|
||||
future<> storage_proxy::send_to_endpoint(
|
||||
@@ -1684,7 +1781,12 @@ future<> storage_proxy::send_to_endpoint(
|
||||
gms::inet_address target,
|
||||
std::vector<gms::inet_address> pending_endpoints,
|
||||
db::write_type type) {
|
||||
return send_to_endpoint(std::move(m), std::move(target), std::move(pending_endpoints), type, _stats);
|
||||
return send_to_endpoint(
|
||||
std::make_unique<shared_mutation>(m),
|
||||
std::move(target),
|
||||
std::move(pending_endpoints),
|
||||
type,
|
||||
_stats);
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -1702,7 +1804,7 @@ future<> storage_proxy::send_to_endpoint(
|
||||
* @throws OverloadedException if the hints cannot be written/enqueued
|
||||
*/
|
||||
// returned future is ready when sent is complete, not when mutation is executed on all (or any) targets!
|
||||
void storage_proxy::send_to_live_endpoints(storage_proxy::response_id_type response_id, clock_type::time_point timeout, write_stats& stats)
|
||||
void storage_proxy::send_to_live_endpoints(storage_proxy::response_id_type response_id, clock_type::time_point timeout)
|
||||
{
|
||||
// extra-datacenter replicas, grouped by dc
|
||||
std::unordered_map<sstring, std::vector<gms::inet_address>> dc_groups;
|
||||
@@ -1710,6 +1812,7 @@ void storage_proxy::send_to_live_endpoints(storage_proxy::response_id_type respo
|
||||
local.reserve(3);
|
||||
|
||||
auto handler_ptr = get_write_response_handler(response_id);
|
||||
auto& stats = handler_ptr->stats();
|
||||
auto& handler = *handler_ptr;
|
||||
|
||||
for(auto dest: handler.get_targets()) {
|
||||
@@ -3074,8 +3177,9 @@ storage_proxy::query_result_local(schema_ptr s, lw_shared_ptr<query::read_comman
|
||||
unsigned shard = _db.local().shard_of(pr.start()->value().token());
|
||||
_stats.replica_cross_shard_ops += shard != engine().cpu_id();
|
||||
return _db.invoke_on(shard, [max_size, gs = global_schema_ptr(s), prv = dht::partition_range_vector({pr}) /* FIXME: pr is copied */, cmd, opts, timeout, gt = tracing::global_trace_state_ptr(std::move(trace_state))] (database& db) mutable {
|
||||
tracing::trace(gt, "Start querying the token range that starts with {}", seastar::value_of([&prv] { return prv.begin()->start()->value().token(); }));
|
||||
return db.query(gs, *cmd, opts, prv, gt, max_size, timeout).then([trace_state = gt.get()](auto&& f, cache_temperature ht) {
|
||||
auto trace_state = gt.get();
|
||||
tracing::trace(trace_state, "Start querying the token range that starts with {}", seastar::value_of([&prv] { return prv.begin()->start()->value().token(); }));
|
||||
return db.query(gs, *cmd, opts, prv, trace_state, max_size, timeout).then([trace_state](auto&& f, cache_temperature ht) {
|
||||
tracing::trace(trace_state, "Querying is done");
|
||||
return make_ready_future<foreign_ptr<lw_shared_ptr<query::result>>, cache_temperature>(make_foreign(std::move(f)), ht);
|
||||
});
|
||||
@@ -4218,6 +4322,10 @@ future<> storage_proxy::start_hints_manager(shared_ptr<gms::gossiper> gossiper_p
|
||||
return _hints_resource_manager.start(shared_from_this(), gossiper_ptr, ss_ptr);
|
||||
}
|
||||
|
||||
// Forwards to _hints_resource_manager.allow_replaying().
// The function is declared noexcept, so an exception escaping the callee would
// terminate the process. NOTE(review): confirm allow_replaying() cannot throw.
void storage_proxy::allow_replaying_hints() noexcept {
|
||||
return _hints_resource_manager.allow_replaying();
|
||||
}
|
||||
|
||||
// Stops the hints machinery by delegating to _hints_resource_manager.stop().
// Returns the future produced by that call.
future<> storage_proxy::stop_hints_manager() {
|
||||
return _hints_resource_manager.stop();
|
||||
}
|
||||
|
||||
@@ -83,7 +83,8 @@ private:
|
||||
struct rh_entry {
|
||||
::shared_ptr<abstract_write_response_handler> handler;
|
||||
timer<clock_type> expire_timer;
|
||||
rh_entry(::shared_ptr<abstract_write_response_handler>&& h, std::function<void()>&& cb);
|
||||
rh_entry(::shared_ptr<abstract_write_response_handler>&& h);
|
||||
rh_entry(rh_entry&&) = delete;
|
||||
};
|
||||
|
||||
using response_id_type = uint64_t;
|
||||
@@ -187,7 +188,7 @@ private:
|
||||
const std::vector<gms::inet_address>& pending_endpoints, std::vector<gms::inet_address>, tracing::trace_state_ptr tr_state, storage_proxy::write_stats& stats);
|
||||
response_id_type create_write_response_handler(const mutation&, db::consistency_level cl, db::write_type type, tracing::trace_state_ptr tr_state);
|
||||
response_id_type create_write_response_handler(const std::unordered_map<gms::inet_address, std::experimental::optional<mutation>>&, db::consistency_level cl, db::write_type type, tracing::trace_state_ptr tr_state);
|
||||
void send_to_live_endpoints(response_id_type response_id, clock_type::time_point timeout, write_stats& stats);
|
||||
void send_to_live_endpoints(response_id_type response_id, clock_type::time_point timeout);
|
||||
template<typename Range>
|
||||
size_t hint_to_dead_endpoints(std::unique_ptr<mutation_holder>& mh, const Range& targets, db::write_type type, tracing::trace_state_ptr tr_state) noexcept;
|
||||
void hint_to_dead_endpoints(response_id_type, db::consistency_level);
|
||||
@@ -239,10 +240,10 @@ private:
|
||||
db::consistency_level cl,
|
||||
coordinator_query_options optional_params);
|
||||
template<typename Range, typename CreateWriteHandler>
|
||||
future<std::vector<unique_response_handler>> mutate_prepare(const Range& mutations, db::consistency_level cl, db::write_type type, CreateWriteHandler handler);
|
||||
future<std::vector<unique_response_handler>> mutate_prepare(Range&& mutations, db::consistency_level cl, db::write_type type, CreateWriteHandler handler);
|
||||
template<typename Range>
|
||||
future<std::vector<unique_response_handler>> mutate_prepare(const Range& mutations, db::consistency_level cl, db::write_type type, tracing::trace_state_ptr tr_state);
|
||||
future<> mutate_begin(std::vector<unique_response_handler> ids, db::consistency_level cl, write_stats& stats, stdx::optional<clock_type::time_point> timeout_opt = { });
|
||||
future<std::vector<unique_response_handler>> mutate_prepare(Range&& mutations, db::consistency_level cl, db::write_type type, tracing::trace_state_ptr tr_state);
|
||||
future<> mutate_begin(std::vector<unique_response_handler> ids, db::consistency_level cl, stdx::optional<clock_type::time_point> timeout_opt = { });
|
||||
future<> mutate_end(future<> mutate_result, utils::latency_counter, write_stats& stats, tracing::trace_state_ptr trace_state);
|
||||
future<> schedule_repair(std::unordered_map<dht::token, std::unordered_map<gms::inet_address, std::experimental::optional<mutation>>> diffs, db::consistency_level cl, tracing::trace_state_ptr trace_state);
|
||||
bool need_throttle_writes() const;
|
||||
@@ -254,10 +255,6 @@ private:
|
||||
schema_ptr s, lw_shared_ptr<query::read_command> cmd, const dht::partition_range_vector&& pr, tracing::trace_state_ptr trace_state,
|
||||
uint64_t max_size, clock_type::time_point timeout);
|
||||
|
||||
struct frozen_mutation_and_schema {
|
||||
frozen_mutation fm;
|
||||
schema_ptr s;
|
||||
};
|
||||
future<> mutate_counters_on_leader(std::vector<frozen_mutation_and_schema> mutations, db::consistency_level cl, clock_type::time_point timeout,
|
||||
tracing::trace_state_ptr trace_state);
|
||||
future<> mutate_counter_on_leader_and_replicate(const schema_ptr& s, frozen_mutation m, db::consistency_level cl, clock_type::time_point timeout,
|
||||
@@ -266,6 +263,13 @@ private:
|
||||
gms::inet_address find_leader_for_counter_update(const mutation& m, db::consistency_level cl);
|
||||
|
||||
future<> do_mutate(std::vector<mutation> mutations, db::consistency_level cl, clock_type::time_point timeout, tracing::trace_state_ptr tr_state, bool);
|
||||
|
||||
future<> send_to_endpoint(
|
||||
std::unique_ptr<mutation_holder> m,
|
||||
gms::inet_address target,
|
||||
std::vector<gms::inet_address> pending_endpoints,
|
||||
db::write_type type,
|
||||
write_stats& stats);
|
||||
public:
|
||||
storage_proxy(distributed<database>& db, config cfg);
|
||||
~storage_proxy();
|
||||
@@ -338,6 +342,8 @@ public:
|
||||
// send_to_live_endpoints() - another take on the same original function.
|
||||
future<> send_to_endpoint(mutation m, gms::inet_address target, std::vector<gms::inet_address> pending_endpoints, db::write_type type, write_stats& stats);
|
||||
future<> send_to_endpoint(mutation m, gms::inet_address target, std::vector<gms::inet_address> pending_endpoints, db::write_type type);
|
||||
future<> send_to_endpoint(frozen_mutation_and_schema fm_a_s, gms::inet_address target, std::vector<gms::inet_address> pending_endpoints, db::write_type type, write_stats& stats);
|
||||
future<> send_to_endpoint(frozen_mutation_and_schema fm_a_s, gms::inet_address target, std::vector<gms::inet_address> pending_endpoints, db::write_type type);
|
||||
|
||||
/**
|
||||
* Performs the truncate operation, which effectively deletes all data from
|
||||
@@ -390,6 +396,7 @@ public:
|
||||
future<> stop();
|
||||
future<> stop_hints_manager();
|
||||
future<> start_hints_manager(shared_ptr<gms::gossiper> gossiper_ptr, shared_ptr<service::storage_service> ss_ptr);
|
||||
void allow_replaying_hints() noexcept;
|
||||
|
||||
const stats& get_stats() const {
|
||||
return _stats;
|
||||
|
||||
@@ -81,6 +81,7 @@ struct write_stats {
|
||||
// total write attempts
|
||||
split_stats writes_attempts;
|
||||
split_stats writes_errors;
|
||||
split_stats background_replica_writes_failed;
|
||||
|
||||
// write attempts due to Read Repair logic
|
||||
split_stats read_repair_write_attempts;
|
||||
@@ -96,6 +97,7 @@ struct write_stats {
|
||||
uint64_t background_write_bytes = 0;
|
||||
uint64_t queued_write_bytes = 0;
|
||||
uint64_t throttled_writes = 0; // total number of writes ever delayed due to throttling
|
||||
uint64_t background_writes_failed = 0;
|
||||
public:
|
||||
write_stats();
|
||||
write_stats(const sstring& category, bool auto_register_stats);
|
||||
|
||||
@@ -209,14 +209,15 @@ sstring storage_service::get_config_supported_features() {
|
||||
ROLES_FEATURE,
|
||||
LA_SSTABLE_FEATURE,
|
||||
STREAM_WITH_RPC_STREAM,
|
||||
MATERIALIZED_VIEWS_FEATURE,
|
||||
INDEXES_FEATURE
|
||||
};
|
||||
auto& config = service::get_local_storage_service()._db.local().get_config();
|
||||
if (config.enable_sstables_mc_format()) {
|
||||
features.push_back(MC_SSTABLE_FEATURE);
|
||||
}
|
||||
if (config.experimental()) {
|
||||
features.push_back(MATERIALIZED_VIEWS_FEATURE);
|
||||
features.push_back(INDEXES_FEATURE);
|
||||
// push additional experimental features
|
||||
}
|
||||
return join(",", features);
|
||||
}
|
||||
@@ -353,7 +354,7 @@ void storage_service::prepare_to_join(std::vector<inet_address> loaded_endpoints
|
||||
gossiper.check_knows_remote_features(local_features, peer_features);
|
||||
}
|
||||
|
||||
gossiper.reset_endpoint_state_map();
|
||||
gossiper.reset_endpoint_state_map().get();
|
||||
for (auto ep : loaded_endpoints) {
|
||||
gossiper.add_saved_endpoint(ep);
|
||||
}
|
||||
@@ -367,7 +368,7 @@ void storage_service::prepare_to_join(std::vector<inet_address> loaded_endpoints
|
||||
slogger.info("Checking remote features with gossip");
|
||||
gossiper.do_shadow_round().get();
|
||||
gossiper.check_knows_remote_features(local_features);
|
||||
gossiper.reset_endpoint_state_map();
|
||||
gossiper.reset_endpoint_state_map().get();
|
||||
for (auto ep : loaded_endpoints) {
|
||||
gossiper.add_saved_endpoint(ep);
|
||||
}
|
||||
@@ -432,11 +433,8 @@ void storage_service::register_features() {
|
||||
_la_sstable_feature = gms::feature(LA_SSTABLE_FEATURE);
|
||||
_stream_with_rpc_stream_feature = gms::feature(STREAM_WITH_RPC_STREAM);
|
||||
_mc_sstable_feature = gms::feature(MC_SSTABLE_FEATURE);
|
||||
|
||||
if (_db.local().get_config().experimental()) {
|
||||
_materialized_views_feature = gms::feature(MATERIALIZED_VIEWS_FEATURE);
|
||||
_indexes_feature = gms::feature(INDEXES_FEATURE);
|
||||
}
|
||||
_materialized_views_feature = gms::feature(MATERIALIZED_VIEWS_FEATURE);
|
||||
_indexes_feature = gms::feature(INDEXES_FEATURE);
|
||||
}
|
||||
|
||||
// Runs inside seastar::async context
|
||||
@@ -446,6 +444,13 @@ void storage_service::join_token_ring(int delay) {
|
||||
get_storage_service().invoke_on_all([] (auto&& ss) {
|
||||
ss._joined = true;
|
||||
}).get();
|
||||
if (!_is_survey_mode) {
|
||||
supervisor::notify("starting system distributed keyspace");
|
||||
_sys_dist_ks.start(
|
||||
std::ref(cql3::get_query_processor()),
|
||||
std::ref(service::get_migration_manager())).get();
|
||||
_sys_dist_ks.invoke_on_all(&db::system_distributed_keyspace::start).get();
|
||||
}
|
||||
// We bootstrap if we haven't successfully bootstrapped before, as long as we are not a seed.
|
||||
// If we are a seed, or if the user manually sets auto_bootstrap to false,
|
||||
// we'll skip streaming data from other nodes and jump directly into the ring.
|
||||
@@ -618,12 +623,6 @@ void storage_service::join_token_ring(int delay) {
|
||||
|
||||
supervisor::notify("starting tracing");
|
||||
tracing::tracing::start_tracing().get();
|
||||
|
||||
supervisor::notify("starting system distributed keyspace");
|
||||
_sys_dist_ks.start(
|
||||
std::ref(cql3::get_query_processor()),
|
||||
std::ref(service::get_migration_manager())).get();
|
||||
_sys_dist_ks.invoke_on_all(&db::system_distributed_keyspace::start).get();
|
||||
} else {
|
||||
slogger.info("Startup complete, but write survey mode is active, not becoming an active ring member. Use JMX (StorageService->joinRing()) to finalize ring joining.");
|
||||
}
|
||||
@@ -1570,7 +1569,7 @@ future<> storage_service::check_for_endpoint_collision() {
|
||||
throw std::runtime_error("Other bootstrapping/leaving/moving nodes detected, cannot bootstrap while consistent_rangemovement is true (check_for_endpoint_collision)");
|
||||
} else {
|
||||
gossiper.goto_shadow_round();
|
||||
gossiper.reset_endpoint_state_map();
|
||||
gossiper.reset_endpoint_state_map().get();
|
||||
found_bootstrapping_node = true;
|
||||
auto elapsed = std::chrono::duration_cast<std::chrono::seconds>(gms::gossiper::clk::now() - t).count();
|
||||
slogger.info("Checking bootstrapping/leaving/moving nodes: node={}, status={}, sleep 1 second and check again ({} seconds elapsed) (check_for_endpoint_collision)", addr, state, elapsed);
|
||||
@@ -1582,7 +1581,7 @@ future<> storage_service::check_for_endpoint_collision() {
|
||||
}
|
||||
} while (found_bootstrapping_node);
|
||||
slogger.info("Checking bootstrapping/leaving/moving nodes: ok (check_for_endpoint_collision)");
|
||||
gossiper.reset_endpoint_state_map();
|
||||
gossiper.reset_endpoint_state_map().get();
|
||||
});
|
||||
}
|
||||
|
||||
@@ -1632,8 +1631,9 @@ future<std::unordered_set<token>> storage_service::prepare_replacement_info() {
|
||||
auto tokens = get_tokens_for(replace_address);
|
||||
// use the replacee's host Id as our own so we receive hints, etc
|
||||
return db::system_keyspace::set_local_host_id(host_id).discard_result().then([replace_address, tokens = std::move(tokens)] {
|
||||
gms::get_local_gossiper().reset_endpoint_state_map(); // clean up since we have what we need
|
||||
return make_ready_future<std::unordered_set<token>>(std::move(tokens));
|
||||
return gms::get_local_gossiper().reset_endpoint_state_map().then([tokens = std::move(tokens)] { // clean up since we have what we need
|
||||
return make_ready_future<std::unordered_set<token>>(std::move(tokens));
|
||||
});
|
||||
});
|
||||
});
|
||||
}
|
||||
@@ -2046,6 +2046,7 @@ future<> storage_service::start_rpc_server() {
|
||||
auto keepalive = cfg.rpc_keepalive();
|
||||
thrift_server_config tsc;
|
||||
tsc.timeout_config = make_timeout_config(cfg);
|
||||
tsc.max_request_size = cfg.thrift_max_message_length_in_mb() * (uint64_t(1) << 20);
|
||||
return seastar::net::dns::resolve_name(addr).then([&ss, tserver, addr, port, keepalive, tsc] (seastar::net::inet_address ip) {
|
||||
return tserver->start(std::ref(ss._db), std::ref(cql3::get_query_processor()), std::ref(ss._auth_service), tsc).then([tserver, port, addr, ip, keepalive] {
|
||||
// #293 - do not stop anything
|
||||
@@ -2473,7 +2474,7 @@ future<std::map<sstring, double>> storage_service::get_load_map() {
|
||||
future<> storage_service::rebuild(sstring source_dc) {
|
||||
return run_with_api_lock(sstring("rebuild"), [source_dc] (storage_service& ss) {
|
||||
slogger.info("rebuild from dc: {}", source_dc == "" ? "(any dc)" : source_dc);
|
||||
auto streamer = make_lw_shared<dht::range_streamer>(ss._db, ss._token_metadata, ss.get_broadcast_address(), "Rebuild");
|
||||
auto streamer = make_lw_shared<dht::range_streamer>(ss._db, ss._token_metadata, ss.get_broadcast_address(), "Rebuild", streaming::stream_reason::rebuild);
|
||||
streamer->add_source_filter(std::make_unique<dht::range_streamer::failure_detector_source_filter>(gms::get_local_failure_detector()));
|
||||
if (source_dc != "") {
|
||||
streamer->add_source_filter(std::make_unique<dht::range_streamer::single_datacenter_filter>(source_dc));
|
||||
@@ -2610,7 +2611,7 @@ void storage_service::unbootstrap() {
|
||||
}
|
||||
|
||||
future<> storage_service::restore_replica_count(inet_address endpoint, inet_address notify_endpoint) {
|
||||
auto streamer = make_lw_shared<dht::range_streamer>(_db, get_token_metadata(), get_broadcast_address(), "Restore_replica_count");
|
||||
auto streamer = make_lw_shared<dht::range_streamer>(_db, get_token_metadata(), get_broadcast_address(), "Restore_replica_count", streaming::stream_reason::removenode);
|
||||
auto my_address = get_broadcast_address();
|
||||
auto non_system_keyspaces = _db.local().get_non_system_keyspaces();
|
||||
for (const auto& keyspace_name : non_system_keyspaces) {
|
||||
@@ -2729,7 +2730,7 @@ void storage_service::leave_ring() {
|
||||
|
||||
future<>
|
||||
storage_service::stream_ranges(std::unordered_map<sstring, std::unordered_multimap<dht::token_range, inet_address>> ranges_to_stream_by_keyspace) {
|
||||
auto streamer = make_lw_shared<dht::range_streamer>(_db, get_token_metadata(), get_broadcast_address(), "Unbootstrap");
|
||||
auto streamer = make_lw_shared<dht::range_streamer>(_db, get_token_metadata(), get_broadcast_address(), "Unbootstrap", streaming::stream_reason::decommission);
|
||||
for (auto& entry : ranges_to_stream_by_keyspace) {
|
||||
const auto& keyspace = entry.first;
|
||||
auto& ranges_with_endpoints = entry.second;
|
||||
|
||||
@@ -22,6 +22,9 @@
|
||||
#pragma once
|
||||
|
||||
#include <zlib.h>
|
||||
#include <seastar/util/gcc6-concepts.hh>
|
||||
#include "libdeflate/libdeflate.h"
|
||||
#include "utils/gz/crc_combine.hh"
|
||||
|
||||
GCC6_CONCEPT(
|
||||
template<typename Checksum>
|
||||
@@ -30,6 +33,11 @@ concept bool ChecksumUtils = requires(const char* input, size_t size, uint32_t c
|
||||
{ Checksum::checksum(input, size) } -> uint32_t;
|
||||
{ Checksum::checksum(checksum, input, size) } -> uint32_t;
|
||||
{ Checksum::checksum_combine(checksum, checksum, size) } -> uint32_t;
|
||||
|
||||
// Tells whether checksum_combine() should be preferred over checksum().
|
||||
// For some checksummers it's faster to re-feed the buffer to checksum() than to
|
||||
// combine the checksum of the buffer.
|
||||
{ Checksum::prefer_combine() } -> bool;
|
||||
};
|
||||
)
|
||||
|
||||
@@ -52,9 +60,11 @@ struct adler32_utils {
|
||||
inline static uint32_t checksum_combine(uint32_t first, uint32_t second, size_t input_len2) {
|
||||
return adler32_combine(first, second, input_len2);
|
||||
}
|
||||
|
||||
static constexpr bool prefer_combine() { return true; }
|
||||
};
|
||||
|
||||
struct crc32_utils {
|
||||
struct zlib_crc32_checksummer {
|
||||
inline static uint32_t init_checksum() {
|
||||
return crc32(0, Z_NULL, 0);
|
||||
}
|
||||
@@ -73,5 +83,55 @@ struct crc32_utils {
|
||||
inline static uint32_t checksum_combine(uint32_t first, uint32_t second, size_t input_len2) {
|
||||
return crc32_combine(first, second, input_len2);
|
||||
}
|
||||
|
||||
static constexpr bool prefer_combine() { return false; } // crc32_combine() is very slow
|
||||
};
|
||||
|
||||
// Checksummer whose incremental checksum is computed by libdeflate_crc32().
// It exposes the same static interface as the other checksummers in this file:
// init_checksum, two checksum overloads, checksum_combine and prefer_combine.
struct libdeflate_crc32_checksummer {
|
||||
// Starting value for an incremental checksum; this implementation uses 0.
static uint32_t init_checksum() {
|
||||
return 0;
|
||||
}
|
||||
|
||||
// One-shot checksum of input[0..input_len), seeded with init_checksum().
static uint32_t checksum(const char* input, size_t input_len) {
|
||||
return checksum(init_checksum(), input, input_len);
|
||||
}
|
||||
|
||||
// Continues a running checksum `prev` over input[0..input_len).
static uint32_t checksum(uint32_t prev, const char* input, size_t input_len) {
|
||||
return libdeflate_crc32(prev, input, input_len);
|
||||
}
|
||||
|
||||
// Combining is not implemented here; it is delegated to zlib_crc32_checksummer.
static uint32_t checksum_combine(uint32_t first, uint32_t second, size_t input_len2) {
|
||||
return zlib_crc32_checksummer::checksum_combine(first, second, input_len2);
|
||||
}
|
||||
|
||||
// false: checksum_combine_or_feed() will re-feed the buffer instead of combining.
static constexpr bool prefer_combine() { return false; }
|
||||
};
|
||||
|
||||
// Extends the running checksum `first` by one buffer, choosing the strategy at
// compile time from Checksum::prefer_combine():
//  - true:  returns Checksum::checksum_combine(first, second, input_len) and does
//           not read `input`. `second` is presumably the already-computed checksum
//           of input[0..input_len) - TODO confirm against callers.
//  - false: ignores `second` and re-feeds the buffer through
//           Checksum::checksum(first, input, input_len).
template<typename Checksum>
|
||||
inline uint32_t checksum_combine_or_feed(uint32_t first, uint32_t second, const char* input, size_t input_len) {
|
||||
if constexpr (Checksum::prefer_combine()) {
|
||||
return Checksum::checksum_combine(first, second, input_len);
|
||||
} else {
|
||||
return Checksum::checksum(first, input, input_len);
|
||||
}
|
||||
}
|
||||
|
||||
// CRC32 checksummer facade. Checksum computation is forwarded to
// libdeflate_crc32_checksummer, while combining uses fast_crc32_combine().
struct crc32_utils {
|
||||
// Starting value for an incremental checksum, taken from libdeflate_crc32_checksummer.
static uint32_t init_checksum() { return libdeflate_crc32_checksummer::init_checksum(); }
|
||||
|
||||
// One-shot checksum of input[0..input_len).
static uint32_t checksum(const char* input, size_t input_len) {
|
||||
return libdeflate_crc32_checksummer::checksum(input, input_len);
|
||||
}
|
||||
|
||||
// Continues a running checksum `prev` over input[0..input_len).
static uint32_t checksum(uint32_t prev, const char* input, size_t input_len) {
|
||||
return libdeflate_crc32_checksummer::checksum(prev, input, input_len);
|
||||
}
|
||||
|
||||
// Combines two checksums via fast_crc32_combine(); input_len2 is passed through
// as the length argument (presumably the length of the second buffer - TODO confirm).
static uint32_t checksum_combine(uint32_t first, uint32_t second, size_t input_len2) {
|
||||
return fast_crc32_combine(first, second, input_len2);
|
||||
}
|
||||
|
||||
// Combining is preferred only when fast_crc32_combine_optimized() reports true.
static constexpr bool prefer_combine() {
|
||||
return fast_crc32_combine_optimized();
|
||||
}
|
||||
};
|
||||
|
||||
@@ -53,63 +53,76 @@ inline column_values_fixed_lengths get_clustering_values_fixed_lengths(const ser
|
||||
* This way we don't need to look them up by column name every time.
|
||||
*/
|
||||
class column_translation {
|
||||
public:
|
||||
struct column_info {
|
||||
// Disengaged 'id' means the column is missing from the current schema
|
||||
std::optional<column_id> id;
|
||||
std::optional<uint32_t> value_length;
|
||||
bool is_collection;
|
||||
bool is_counter;
|
||||
};
|
||||
|
||||
private:
|
||||
|
||||
struct state {
|
||||
|
||||
static std::tuple<std::vector<std::optional<column_id>>,
|
||||
std::vector<std::optional<uint32_t>>,
|
||||
std::vector<bool>,
|
||||
std::vector<bool>> build(
|
||||
static std::vector<column_info> build(
|
||||
const schema& s,
|
||||
const utils::chunked_vector<serialization_header::column_desc>& src,
|
||||
bool is_static) {
|
||||
std::vector<std::optional<column_id>> ids;
|
||||
std::vector<std::optional<column_id>> lens;
|
||||
std::vector<bool> is_collection;
|
||||
std::vector<bool> is_counter;
|
||||
std::vector<column_info> cols;
|
||||
if (s.is_dense()) {
|
||||
if (is_static) {
|
||||
ids.push_back(s.static_begin()->id);
|
||||
lens.push_back(s.static_begin()->type->value_length_if_fixed());
|
||||
is_collection.push_back(s.static_begin()->is_multi_cell());
|
||||
is_counter.push_back(s.static_begin()->is_counter());
|
||||
cols.push_back(column_info{
|
||||
s.static_begin()->id,
|
||||
s.static_begin()->type->value_length_if_fixed(),
|
||||
s.static_begin()->is_multi_cell(),
|
||||
s.static_begin()->is_counter()
|
||||
});
|
||||
} else {
|
||||
ids.push_back(s.regular_begin()->id);
|
||||
lens.push_back(s.regular_begin()->type->value_length_if_fixed());
|
||||
is_collection.push_back(s.regular_begin()->is_multi_cell());
|
||||
is_counter.push_back(s.regular_begin()->is_counter());
|
||||
cols.push_back(column_info{
|
||||
s.regular_begin()->id,
|
||||
s.regular_begin()->type->value_length_if_fixed(),
|
||||
s.regular_begin()->is_multi_cell(),
|
||||
s.regular_begin()->is_counter()
|
||||
});
|
||||
}
|
||||
} else {
|
||||
ids.reserve(src.size());
|
||||
lens.reserve(src.size());
|
||||
cols.reserve(src.size());
|
||||
for (auto&& desc : src) {
|
||||
const bytes& type_name = desc.type_name.value;
|
||||
data_type type = db::marshal::type_parser::parse(to_sstring_view(type_name));
|
||||
const column_definition* def = s.get_column_definition(desc.name.value);
|
||||
std::optional<column_id> id;
|
||||
if (def) {
|
||||
ids.push_back(def->id);
|
||||
lens.push_back(def->type->value_length_if_fixed());
|
||||
is_collection.push_back(def->is_multi_cell());
|
||||
is_counter.push_back(def->is_counter());
|
||||
} else {
|
||||
ids.push_back(std::nullopt);
|
||||
lens.push_back(std::nullopt);
|
||||
is_collection.push_back(false);
|
||||
is_counter.push_back(false);
|
||||
if (def->is_multi_cell() != type->is_multi_cell() || def->is_counter() != type->is_counter()) {
|
||||
throw malformed_sstable_exception(sprint(
|
||||
"{} definition in serialization header does not match schema. "
|
||||
"Schema collection = {}, counter = {}. Header collection = {}, counter = {}",
|
||||
def->name(),
|
||||
def->is_multi_cell(),
|
||||
def->is_counter(),
|
||||
type->is_multi_cell(),
|
||||
type->is_counter()));
|
||||
}
|
||||
id = def->id;
|
||||
}
|
||||
cols.push_back(column_info{
|
||||
id,
|
||||
type->value_length_if_fixed(),
|
||||
type->is_multi_cell(),
|
||||
type->is_counter()
|
||||
});
|
||||
}
|
||||
boost::range::stable_partition(cols, [](const column_info& column) { return !column.is_collection; });
|
||||
}
|
||||
return std::make_tuple(std::move(ids), std::move(lens), std::move(is_collection), std::move(is_counter));
|
||||
return cols;
|
||||
}
|
||||
|
||||
utils::UUID schema_uuid;
|
||||
std::vector<std::optional<column_id>> regular_schema_column_id_from_sstable;
|
||||
std::vector<std::optional<column_id>> static_schema_column_id_from_sstable;
|
||||
column_values_fixed_lengths regular_column_value_fix_lengths;
|
||||
column_values_fixed_lengths static_column_value_fix_lengths;
|
||||
std::vector<column_info> regular_schema_columns_from_sstable;
|
||||
std::vector<column_info> static_schema_columns_from_sstable;
|
||||
column_values_fixed_lengths clustering_column_value_fix_lengths;
|
||||
std::vector<bool> static_column_is_collection;
|
||||
std::vector<bool> regular_column_is_collection;
|
||||
std::vector<bool> static_column_is_counter;
|
||||
std::vector<bool> regular_column_is_counter;
|
||||
|
||||
state() = default;
|
||||
state(const state&) = delete;
|
||||
@@ -118,19 +131,11 @@ class column_translation {
|
||||
state& operator=(state&&) = default;
|
||||
|
||||
state(const schema& s, const serialization_header& header)
|
||||
: schema_uuid(s.version()) {
|
||||
std::tie(regular_schema_column_id_from_sstable,
|
||||
regular_column_value_fix_lengths,
|
||||
regular_column_is_collection,
|
||||
regular_column_is_counter) =
|
||||
build(s, header.regular_columns.elements, false);
|
||||
std::tie(static_schema_column_id_from_sstable,
|
||||
static_column_value_fix_lengths,
|
||||
static_column_is_collection,
|
||||
static_column_is_counter) =
|
||||
build(s, header.static_columns.elements, true);
|
||||
clustering_column_value_fix_lengths = get_clustering_values_fixed_lengths(header);
|
||||
}
|
||||
: schema_uuid(s.version())
|
||||
, regular_schema_columns_from_sstable(build(s, header.regular_columns.elements, false))
|
||||
, static_schema_columns_from_sstable(build(s, header.static_columns.elements, true))
|
||||
, clustering_column_value_fix_lengths (get_clustering_values_fixed_lengths(header))
|
||||
{}
|
||||
};
|
||||
|
||||
lw_shared_ptr<const state> _state = make_lw_shared<const state>();
|
||||
@@ -143,33 +148,15 @@ public:
|
||||
return *this;
|
||||
}
|
||||
|
||||
const std::vector<std::optional<column_id>>& regular_columns() const {
|
||||
return _state->regular_schema_column_id_from_sstable;
|
||||
const std::vector<column_info>& regular_columns() const {
|
||||
return _state->regular_schema_columns_from_sstable;
|
||||
}
|
||||
const std::vector<std::optional<column_id>>& static_columns() const {
|
||||
return _state->static_schema_column_id_from_sstable;
|
||||
}
|
||||
const std::vector<std::optional<uint32_t>>& regular_column_value_fix_legths() const {
|
||||
return _state->regular_column_value_fix_lengths;
|
||||
}
|
||||
const std::vector<std::optional<uint32_t>>& static_column_value_fix_legths() const {
|
||||
return _state->static_column_value_fix_lengths;
|
||||
const std::vector<column_info>& static_columns() const {
|
||||
return _state->static_schema_columns_from_sstable;
|
||||
}
|
||||
const std::vector<std::optional<uint32_t>>& clustering_column_value_fix_legths() const {
|
||||
return _state->clustering_column_value_fix_lengths;
|
||||
}
|
||||
const std::vector<bool>& static_column_is_collection() const {
|
||||
return _state->static_column_is_collection;
|
||||
}
|
||||
const std::vector<bool>& regular_column_is_collection() const {
|
||||
return _state->regular_column_is_collection;
|
||||
}
|
||||
const std::vector<bool>& static_column_is_counter() const {
|
||||
return _state->static_column_is_counter;
|
||||
}
|
||||
const std::vector<bool>& regular_column_is_counter() const {
|
||||
return _state->regular_column_is_counter;
|
||||
}
|
||||
};
|
||||
|
||||
}; // namespace sstables
|
||||
|
||||
@@ -531,11 +531,11 @@ public:
|
||||
}
|
||||
|
||||
void report_start(const sstring& formatted_msg) const override {
|
||||
clogger.debug("Compacting {}", formatted_msg);
|
||||
clogger.info("Compacting {}", formatted_msg);
|
||||
}
|
||||
|
||||
void report_finish(const sstring& formatted_msg, std::chrono::time_point<db_clock> ended_at) const override {
|
||||
clogger.debug("Compacted {}", formatted_msg);
|
||||
clogger.info("Compacted {}", formatted_msg);
|
||||
}
|
||||
|
||||
void backlog_tracker_adjust_charges() override {
|
||||
@@ -818,7 +818,10 @@ get_fully_expired_sstables(column_family& cf, const std::vector<sstables::shared
|
||||
auto compacted_undeleted_gens = boost::copy_range<std::unordered_set<int64_t>>(cf.compacted_undeleted_sstables()
|
||||
| boost::adaptors::transformed(std::mem_fn(&sstables::sstable::generation)));
|
||||
auto has_undeleted_ancestor = [&compacted_undeleted_gens] (auto& candidate) {
|
||||
return boost::algorithm::any_of(candidate->ancestors(), [&compacted_undeleted_gens] (auto gen) {
|
||||
// Get ancestors from metadata collector which is empty after restart. It works for this purpose because
|
||||
// we only need to check that a sstable compacted *in this instance* hasn't an ancestor undeleted.
|
||||
// Not getting it from sstable metadata because mc format hasn't it available.
|
||||
return boost::algorithm::any_of(candidate->get_metadata_collector().ancestors(), [&compacted_undeleted_gens] (auto gen) {
|
||||
return compacted_undeleted_gens.count(gen);
|
||||
});
|
||||
};
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user