Compare commits
189 commits: next...scylla-3.0

Commit SHA1s:

5174b1cd13
9ba608cae4
f7c5cbc645
cf4b4d4878
45bb1ba1b7
28294ed42e
3c4f8cf6ed
7b94264ae5
22a085fbd3
2d181da656
d427a23d42
37ab553f02
6a3f4fb3f9
8168d13887
13bdec6eb4
57e7081d86
2fcae36d96
ba62dcd5c7
515399ce17
772c4b5fdc
874d88c98d
5a178ff635
d67439b910
21a5a4c76a
f818d6ee3f
20c2745592
cf5c72561c
53b85e5d32
2456cf63f2
c1f6ce4251
fc82eb5586
f58e592345
6375b1e5b7
7ca24efb39
32ebaaa585
a88c722a4c
07582d6c10
18c89edbf7
5558fa8c44
f678eb52cd
dfb23f4b38
502ddf158a
0ccb0a127a
b94997be0d
d3a5b10cb8
48f3f899ac
c4f745276c
392c7dee3c
04e982f909
97a8cc149e
dbe347811c
8f2d24bb8f
689e11c892
1766c793a8
0b09008cde
713e60f690
7b6841f947
f124b7026f
28cca751d1
21d08aa41e
f0b5170fa6
3b617e873c
4eb9836e64
46af353209
76f70c676e
afc9f0e177
c899191ad5
a3563e5f7d
78c5b09694
a51878205a
46efc08882
c95433c967
df3b6fb4a8
44ee43bb17
aac363ca86
9e6cc5b024
13b72c7b92
6b011fbe0a
9dd4e1b01f
e91c741ef5
b18e9e115d
0b86ab0d2a
97cd9108d6
f81fe96b0b
91ce3a7957
af7e58f4c5
bd3373b511
4820130abe
9b299241e5
745a98e151
b9c99af18b
cded9c7ac7
4acfc5ed8f
cb9199bc7f
695ff5383f
730e48bf60
af6d4f40e1
9d8507de09
07c980845d
c52b8239d0
5a07a4fac8
b9c046b17b
979cb636b8
59cf9d9070
c9ec9d4087
2e8fefbc5a
6be0635029
04a544c0a2
028f9b95d1
54258ca8eb
c9a030f1f0
1c7daef554
f8195a77b0
5b724c80ab
4a7ae81b3f
3cf26a60a2
2103d0d52b
16ee3b3ebe
b0a9c40ab1
53924e5c7f
befe0012f5
1953c5fa61
b72a94b53e
3f82b697f2
ee1ef853e5
6e7e7f3822
82a36edc9d
d4efa3c9b2
324dae3e12
c0ffc9a2b7
f81fa5f75c
6fd1cfcfce
9d458ffea9
9776a048e7
10cf97375e
e6355a9a01
e57907a1d5
f94b46e7e0
6847c12668
80b86def1f
c6de9ea39b
94bed81c1d
0f3a21f0bb
976db7e9e0
996b86b804
b7b217cc43
c274430933
893a18a7c4
39b39058fc
6bf4a73d88
ca4846dd63
2663ff7bc1
043a575fcd
00dc400993
522a48a244
5faa28ce45
52be02558e
a7cbfbe63f
28fd2044d2
76ff2e5c3d
7b34d54a96
26c31f6798
28fa66591a
0fee1d9e43
76e72e28f4
f969e80965
2029134063
f30fe7bd17
aeb418af9e
714e6d741f
95c5872450
87f8968553
2895428d44
e18f182cfc
cf8cdbf87d
eb2814067d
0c722d4547
54cf463430
d2a0622edd
60edaec757
5802532cb3
83ea91055e
e7863d3d54
57f124b905
40d8de5784
1468ec62de
c6ef56ae1e
ad62313b86
de87f798e1
.gitmodules (vendored): 5 changes
@@ -1,6 +1,6 @@
 [submodule "seastar"]
 	path = seastar
-	url = ../seastar
+	url = ../scylla-seastar
 	ignore = dirty
 [submodule "swagger-ui"]
 	path = swagger-ui
@@ -9,3 +9,6 @@
 [submodule "xxHash"]
 	path = xxHash
 	url = ../xxHash
+[submodule "libdeflate"]
+	path = libdeflate
+	url = ../libdeflate
@@ -138,4 +138,5 @@ target_include_directories(scylla PUBLIC
     ${SEASTAR_INCLUDE_DIRS}
     ${Boost_INCLUDE_DIRS}
     xxhash
+    libdeflate
     build/release/gen)
@@ -50,12 +50,12 @@ Then, to build an RPM, run:
 ./dist/redhat/build_rpm.sh
 ```
 
-The built RPM is stored in ``/var/lib/mock/<configuration>/result`` directory.
+The built RPM is stored in the ``build/mock/<configuration>/result`` directory.
 For example, on Fedora 21 mock reports the following:
 
 ```
 INFO: Done(scylla-server-0.00-1.fc21.src.rpm) Config(default) 20 minutes 7 seconds
-INFO: Results and/or logs in: /var/lib/mock/fedora-21-x86_64/result
+INFO: Results and/or logs in: build/mock/fedora-21-x86_64/result
 ```
 
 ## Building Fedora-based Docker image
@@ -1,6 +1,6 @@
 #!/bin/sh
 
-VERSION=666.development
+VERSION=3.0.2
 
 if test -f version
 then
@@ -2228,11 +2228,11 @@
                 "description":"The column family"
             },
             "total":{
-                "type":"int",
+                "type":"long",
                 "description":"The total snapshot size"
             },
             "live":{
-                "type":"int",
+                "type":"long",
                 "description":"The live snapshot size"
             }
         }
@@ -87,11 +87,17 @@ future<> create_metadata_table_if_missing(
     return mm.announce_new_column_family(b.build(), false);
 }
 
-future<> wait_for_schema_agreement(::service::migration_manager& mm, const database& db) {
+future<> wait_for_schema_agreement(::service::migration_manager& mm, const database& db, seastar::abort_source& as) {
     static const auto pause = [] { return sleep(std::chrono::milliseconds(500)); };
 
-    return do_until([&db] { return db.get_version() != database::empty_version; }, pause).then([&mm] {
-        return do_until([&mm] { return mm.have_schema_agreement(); }, pause);
+    return do_until([&db, &as] {
+        as.check();
+        return db.get_version() != database::empty_version;
+    }, pause).then([&mm, &as] {
+        return do_until([&mm, &as] {
+            as.check();
+            return mm.have_schema_agreement();
+        }, pause);
     });
 }
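The hunk above threads a seastar::abort_source through the schema-agreement wait so that shutdown can interrupt the 500 ms polling loop instead of waiting on agreement that may never arrive. A minimal sketch of the pattern, assuming a recent Seastar header layout; poll_until and its names are illustrative, not Scylla code:

```cpp
#include <seastar/core/abort_source.hh>
#include <seastar/core/future.hh>
#include <seastar/core/loop.hh>
#include <seastar/core/sleep.hh>
#include <chrono>
#include <functional>

// Polls until done() returns true, pausing 500 ms between checks.
// as.check() throws seastar::abort_requested_exception after
// request_abort() is called, failing the returned future instead of
// letting the loop run forever.
seastar::future<> poll_until(std::function<bool()> done, seastar::abort_source& as) {
    return seastar::do_until(
        [done = std::move(done), &as] {
            as.check();
            return done();
        },
        [] { return seastar::sleep(std::chrono::milliseconds(500)); });
}
```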
@@ -81,7 +81,7 @@ future<> create_metadata_table_if_missing(
         stdx::string_view cql,
         ::service::migration_manager&);
 
-future<> wait_for_schema_agreement(::service::migration_manager&, const database&);
+future<> wait_for_schema_agreement(::service::migration_manager&, const database&, seastar::abort_source&);
 
 ///
 /// Time-outs for internal, non-local CQL queries.
@@ -160,7 +160,7 @@ future<> default_authorizer::start() {
             _migration_manager).then([this] {
         _finished = do_after_system_ready(_as, [this] {
             return async([this] {
-                wait_for_schema_agreement(_migration_manager, _qp.db().local()).get0();
+                wait_for_schema_agreement(_migration_manager, _qp.db().local(), _as).get0();
 
                 if (legacy_metadata_exists()) {
                     if (!any_granted().get0()) {
@@ -178,7 +178,7 @@ future<> default_authorizer::start() {
 
 future<> default_authorizer::stop() {
     _as.request_abort();
-    return _finished.handle_exception_type([](const sleep_aborted&) {});
+    return _finished.handle_exception_type([](const sleep_aborted&) {}).handle_exception_type([](const abort_requested_exception&) {});
 }
 
 future<permission_set>
@@ -157,7 +157,7 @@ future<> password_authenticator::start() {
 
     _stopped = do_after_system_ready(_as, [this] {
         return async([this] {
-            wait_for_schema_agreement(_migration_manager, _qp.db().local()).get0();
+            wait_for_schema_agreement(_migration_manager, _qp.db().local(), _as).get0();
 
             if (any_nondefault_role_row_satisfies(_qp, &has_salted_hash).get0()) {
                 if (legacy_metadata_exists()) {
@@ -182,7 +182,7 @@ future<> password_authenticator::start() {
 
 future<> password_authenticator::stop() {
     _as.request_abort();
-    return _stopped.handle_exception_type([] (const sleep_aborted&) { });
+    return _stopped.handle_exception_type([] (const sleep_aborted&) { }).handle_exception_type([](const abort_requested_exception&) {});
 }
 
 db::consistency_level password_authenticator::consistency_for_user(stdx::string_view role_name) {
@@ -196,6 +196,10 @@ future<> service::start() {
 }
 
 future<> service::stop() {
+    // Only one of the shards has the listener registered, but let's try to
+    // unregister on each one just to make sure.
+    _migration_manager.unregister_listener(_migration_listener.get());
+
     return _permissions_cache->stop().then([this] {
         return when_all_succeed(_role_manager->stop(), _authorizer->stop(), _authenticator->stop());
     });
@@ -227,7 +227,7 @@ future<> standard_role_manager::start() {
     return this->create_metadata_tables_if_missing().then([this] {
         _stopped = auth::do_after_system_ready(_as, [this] {
             return seastar::async([this] {
-                wait_for_schema_agreement(_migration_manager, _qp.db().local()).get0();
+                wait_for_schema_agreement(_migration_manager, _qp.db().local(), _as).get0();
 
                 if (any_nondefault_role_row_satisfies(_qp, &has_can_login).get0()) {
                     if (this->legacy_metadata_exists()) {
@@ -251,7 +251,7 @@ future<> standard_role_manager::start() {
 
 future<> standard_role_manager::stop() {
     _as.request_abort();
-    return _stopped.handle_exception_type([] (const sleep_aborted&) { });
+    return _stopped.handle_exception_type([] (const sleep_aborted&) { }).handle_exception_type([](const abort_requested_exception&) {});
 }
 
 future<> standard_role_manager::create_or_replace(stdx::string_view role_name, const role_config& c) const {
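The stop() methods changed above all follow the same shutdown idiom: request the abort, then swallow both exception types the background fiber can now exit with. A condensed sketch with illustrative names, not the Scylla classes:

```cpp
#include <seastar/core/abort_source.hh>
#include <seastar/core/future.hh>
#include <seastar/core/sleep.hh>

struct background_service {
    seastar::abort_source _as;
    seastar::future<> _finished = seastar::make_ready_future<>(); // set by start()

    // sleep_aborted comes out of an aborted seastar::sleep();
    // abort_requested_exception comes out of abort_source::check().
    // Both mean "we asked the fiber to stop", so stop() swallows them
    // and reports a clean shutdown.
    seastar::future<> stop() {
        _as.request_abort();
        return std::move(_finished)
            .handle_exception_type([] (const seastar::sleep_aborted&) {})
            .handle_exception_type([] (const seastar::abort_requested_exception&) {});
    }
};
```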
@@ -77,7 +77,7 @@ protected:
     , _io_priority(iop)
     , _interval(interval)
     , _update_timer([this] { adjust(); })
-    , _control_points({{0,0}})
+    , _control_points()
     , _current_backlog(std::move(backlog))
     , _inflight_update(make_ready_future<>())
 {
@@ -125,7 +125,7 @@ public:
     flush_controller(seastar::scheduling_group sg, const ::io_priority_class& iop, float static_shares) : backlog_controller(sg, iop, static_shares) {}
     flush_controller(seastar::scheduling_group sg, const ::io_priority_class& iop, std::chrono::milliseconds interval, float soft_limit, std::function<float()> current_dirty)
         : backlog_controller(sg, iop, std::move(interval),
-            std::vector<backlog_controller::control_point>({{soft_limit, 10}, {soft_limit + (hard_dirty_limit - soft_limit) / 2, 200} , {hard_dirty_limit, 1000}}),
+            std::vector<backlog_controller::control_point>({{0.0, 0.0}, {soft_limit, 10}, {soft_limit + (hard_dirty_limit - soft_limit) / 2, 200} , {hard_dirty_limit, 1000}}),
             std::move(current_dirty)
         )
     {}
@@ -139,7 +139,7 @@ public:
     compaction_controller(seastar::scheduling_group sg, const ::io_priority_class& iop, float static_shares) : backlog_controller(sg, iop, static_shares) {}
     compaction_controller(seastar::scheduling_group sg, const ::io_priority_class& iop, std::chrono::milliseconds interval, std::function<float()> current_backlog)
         : backlog_controller(sg, iop, std::move(interval),
-            std::vector<backlog_controller::control_point>({{0.5, 10}, {1.5, 100} , {normalization_factor, 1000}}),
+            std::vector<backlog_controller::control_point>({{0.0, 50}, {1.5, 100} , {normalization_factor, 1000}}),
             std::move(current_backlog)
         )
     {}
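backlog_controller translates a measured backlog into scheduler shares by interpolating between (input, output) control points; the two hunks above anchor each curve at input 0, presumably so a small backlog maps to a proportionally small output instead of inheriting the first point's value. A standalone sketch of that kind of interpolation; assumed behavior, simplified types:

```cpp
#include <vector>
#include <cstddef>

struct control_point { float input; float output; };

// Piecewise-linear lookup: clamp outside the first/last control point,
// interpolate linearly in between. cps is assumed non-empty and sorted
// by input.
float interpolate(const std::vector<control_point>& cps, float input) {
    if (input <= cps.front().input) return cps.front().output;
    if (input >= cps.back().input)  return cps.back().output;
    for (size_t i = 1; i < cps.size(); ++i) {
        if (input < cps[i].input) {
            const auto& a = cps[i - 1];
            const auto& b = cps[i];
            float t = (input - a.input) / (b.input - a.input);
            return a.output + t * (b.output - a.output);
        }
    }
    return cps.back().output;
}

// With the new flush curve starting at {0.0, 0.0}, a tiny dirty ratio
// now yields proportionally tiny shares rather than jumping to 10.
```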
@@ -57,12 +57,12 @@ private:
         value_type data[0];
         void operator delete(void* ptr) { free(ptr); }
     };
-    // FIXME: consider increasing chunk size as the buffer grows
-    static constexpr size_type chunk_size{512};
+    static constexpr size_type default_chunk_size{512};
 private:
     std::unique_ptr<chunk> _begin;
     chunk* _current;
     size_type _size;
+    size_type _initial_chunk_size = default_chunk_size;
 public:
     class fragment_iterator : public std::iterator<std::input_iterator_tag, bytes_view> {
         chunk* _current = nullptr;
@@ -102,13 +102,13 @@ private:
     }
     // Figure out next chunk size.
     //   - must be enough for data_size
-    //   - must be at least chunk_size
+    //   - must be at least _initial_chunk_size
     //   - try to double each time to prevent too many allocations
     //   - do not exceed max_chunk_size
     size_type next_alloc_size(size_t data_size) const {
         auto next_size = _current
                 ? _current->size * 2
-                : chunk_size;
+                : _initial_chunk_size;
         next_size = std::min(next_size, max_chunk_size());
         // FIXME: check for overflow?
         return std::max<size_type>(next_size, data_size + sizeof(chunk));
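The sizing rules in that comment amount to geometric growth, now seeded by the per-instance _initial_chunk_size and capped at the maximum chunk size. A toy model of the policy as a standalone function; names and the idea of a fixed header overhead mirror the code above, everything else is illustrative:

```cpp
#include <algorithm>
#include <cstddef>

// Growth policy: start at initial_chunk_size, double per chunk,
// never exceed max_chunk, and always leave room for the requested
// payload plus the per-chunk header.
size_t next_alloc_size(size_t current_chunk, size_t initial_chunk_size,
                       size_t max_chunk, size_t data_size, size_t header) {
    size_t next = current_chunk ? current_chunk * 2 : initial_chunk_size;
    next = std::min(next, max_chunk);
    return std::max(next, data_size + header);
}
```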
@@ -116,13 +116,19 @@ private:
     // Makes room for a contiguous region of given size.
     // The region is accounted for as already written.
     // size must not be zero.
+    [[gnu::always_inline]]
     value_type* alloc(size_type size) {
-        if (size <= current_space_left()) {
+        if (__builtin_expect(size <= current_space_left(), true)) {
             auto ret = _current->data + _current->offset;
             _current->offset += size;
             _size += size;
             return ret;
-        }
+        } else {
+            return alloc_new(size);
+        }
+    }
+    [[gnu::noinline]]
+    value_type* alloc_new(size_type size) {
         auto alloc_size = next_alloc_size(size);
         auto space = malloc(alloc_size);
         if (!space) {
@@ -140,19 +146,22 @@ private:
         }
         _size += size;
         return _current->data;
-    };
+    }
 public:
-    bytes_ostream() noexcept
+    explicit bytes_ostream(size_t initial_chunk_size) noexcept
         : _begin()
         , _current(nullptr)
         , _size(0)
+        , _initial_chunk_size(initial_chunk_size)
     { }
 
+    bytes_ostream() noexcept : bytes_ostream(default_chunk_size) {}
+
     bytes_ostream(bytes_ostream&& o) noexcept
         : _begin(std::move(o._begin))
         , _current(o._current)
         , _size(o._size)
+        , _initial_chunk_size(o._initial_chunk_size)
     {
         o._current = nullptr;
         o._size = 0;
@@ -162,6 +171,7 @@ public:
         : _begin()
         , _current(nullptr)
         , _size(0)
+        , _initial_chunk_size(o._initial_chunk_size)
     {
         append(o);
     }
@@ -199,18 +209,20 @@ public:
         return place_holder<T>{alloc(sizeof(T))};
     }
 
+    [[gnu::always_inline]]
     value_type* write_place_holder(size_type size) {
         return alloc(size);
     }
 
     // Writes given sequence of bytes
+    [[gnu::always_inline]]
     inline void write(bytes_view v) {
         if (v.empty()) {
             return;
         }
 
         auto this_size = std::min(v.size(), size_t(current_space_left()));
-        if (this_size) {
+        if (__builtin_expect(this_size, true)) {
             memcpy(_current->data + _current->offset, v.begin(), this_size);
             _current->offset += this_size;
             _size += this_size;
@@ -219,11 +231,12 @@ public:
 
         while (!v.empty()) {
             auto this_size = std::min(v.size(), size_t(max_chunk_size()));
-            std::copy_n(v.begin(), this_size, alloc(this_size));
+            std::copy_n(v.begin(), this_size, alloc_new(this_size));
             v.remove_prefix(this_size);
         }
     }
 
+    [[gnu::always_inline]]
     void write(const char* ptr, size_t size) {
         write(bytes_view(reinterpret_cast<const signed char*>(ptr), size));
     }
@@ -393,6 +406,21 @@ public:
     bool operator!=(const bytes_ostream& other) const {
         return !(*this == other);
     }
 
+    // Makes this instance empty.
+    //
+    // The first buffer is not deallocated, so callers may rely on the
+    // fact that if they write less than the initial chunk size between
+    // the clear() calls then writes will not involve any memory allocations,
+    // except for the first write made on this instance.
+    void clear() {
+        if (_begin) {
+            _begin->offset = 0;
+            _size = 0;
+            _current = _begin.get();
+            _begin->next.reset();
+        }
+    }
 };
 
 template<>
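The new clear() keeps the first chunk allocated, so a bytes_ostream constructed with the explicit initial-chunk-size constructor can be reused across rounds without touching the allocator. A usage sketch; consume() and the 4096 figure are illustrative assumptions, not part of the API:

```cpp
// Reuse one buffer across many writes. Provided each round writes
// less than the 4096-byte initial chunk, only the very first write
// allocates; clear() just rewinds the first chunk.
void write_all(const std::vector<bytes>& items) {   // illustrative signature
    bytes_ostream out(4096);                        // explicit initial chunk size
    for (const auto& item : items) {
        out.write(item);                            // bytes_ostream::write(bytes_view)
        consume(out.linearize());                   // 'consume' is a hypothetical sink
        out.clear();                                // keeps the first chunk allocated
    }
}
```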
@@ -200,8 +200,9 @@ public:
         return _current_start;
     }
 
-    position_in_partition_view upper_bound() const {
-        return _current_end;
+    // Returns the upper bound of the last range in provided ranges set
+    position_in_partition_view uppermost_bound() const {
+        return position_in_partition_view::for_range_end(_ranges.back());
     }
 
     // When lower_bound() changes, this also does
@@ -112,7 +112,7 @@ const sstring compression_parameters::CHUNK_LENGTH_KB = "chunk_length_kb";
 const sstring compression_parameters::CRC_CHECK_CHANCE = "crc_check_chance";
 
 compression_parameters::compression_parameters()
-    : compression_parameters(nullptr)
+    : compression_parameters(compressor::lz4)
 {}
 
 compression_parameters::~compression_parameters()
@@ -118,6 +118,10 @@ public:
     std::map<sstring, sstring> get_options() const;
     bool operator==(const compression_parameters& other) const;
     bool operator!=(const compression_parameters& other) const;
+
+    static compression_parameters no_compression() {
+        return compression_parameters(nullptr);
+    }
 private:
     void validate_options(const std::map<sstring, sstring>&);
 };
@@ -242,6 +242,9 @@ batch_size_fail_threshold_in_kb: 50
 
 # The directory where hints files are stored if hinted handoff is enabled.
 # hints_directory: /var/lib/scylla/hints
 
+# The directory where hints files are stored for materialized-view updates
+# view_hints_directory: /var/lib/scylla/view_hints
+
 # See http://wiki.apache.org/cassandra/HintedHandoff
 # May either be "true" or "false" to enable globally, or contain a list
configure.py: 34 changes
@@ -197,7 +197,9 @@ class Thrift(object):
 
 def default_target_arch():
     if platform.machine() in ['i386', 'i686', 'x86_64']:
-        return 'nehalem'
+        return 'westmere'   # support PCLMUL
+    elif platform.machine() == 'aarch64':
+        return 'armv8-a+crc+crypto'
     else:
         return ''
@@ -271,6 +273,7 @@ scylla_tests = [
     'tests/perf/perf_sstable',
     'tests/cql_query_test',
     'tests/secondary_index_test',
+    'tests/filtering_test',
     'tests/storage_proxy_test',
     'tests/schema_change_test',
     'tests/mutation_reader_test',
@@ -306,6 +309,7 @@ scylla_tests = [
     'tests/log_heap_test',
     'tests/managed_vector_test',
     'tests/crc_test',
+    'tests/checksum_utils_test',
     'tests/flush_queue_test',
     'tests/dynamic_bitset_test',
     'tests/auth_test',
@@ -356,6 +360,7 @@ scylla_tests = [
 
 perf_tests = [
     'tests/perf/perf_mutation_readers',
+    'tests/perf/perf_checksum',
     'tests/perf/perf_mutation_fragment',
     'tests/perf/perf_idl',
     ]
@@ -431,6 +436,7 @@ extra_cxxflags = {}
 cassandra_interface = Thrift(source='interface/cassandra.thrift', service='Cassandra')
 
 scylla_core = (['database.cc',
+                'table.cc',
                 'atomic_cell.cc',
                 'schema.cc',
                 'frozen_schema.cc',
@@ -461,6 +467,7 @@ scylla_core = (['database.cc',
                 'compress.cc',
                 'sstables/mp_row_consumer.cc',
                 'sstables/sstables.cc',
+                'sstables/mc/writer.cc',
                 'sstables/sstable_version.cc',
                 'sstables/compress.cc',
                 'sstables/row.cc',
@@ -470,7 +477,6 @@ scylla_core = (['database.cc',
                 'sstables/compaction_manager.cc',
                 'sstables/integrity_checked_file_impl.cc',
-                'sstables/prepended_input_stream.cc',
                 'sstables/m_format_write_helpers.cc',
                 'sstables/m_format_read_helpers.cc',
                 'transport/event.cc',
                 'transport/event_notifier.cc',
@@ -579,6 +585,7 @@ scylla_core = (['database.cc',
                 'db/marshal/type_parser.cc',
                 'db/batchlog_manager.cc',
                 'db/view/view.cc',
+                'db/view/view_update_from_staging_generator.cc',
                 'db/view/row_locking.cc',
                 'index/secondary_index_manager.cc',
                 'index/secondary_index.cc',
@@ -592,6 +599,7 @@ scylla_core = (['database.cc',
                 'utils/managed_bytes.cc',
                 'utils/exceptions.cc',
                 'utils/config_file.cc',
+                'utils/gz/crc_combine.cc',
                 'gms/version_generator.cc',
                 'gms/versioned_value.cc',
                 'gms/gossiper.cc',
@@ -682,6 +690,7 @@ scylla_core = (['database.cc',
                 'data/cell.cc',
                 'multishard_writer.cc',
+                'multishard_mutation_query.cc',
                 'reader_concurrency_semaphore.cc',
                 ] + [Antlr3Grammar('cql3/Cql.g')] + [Thrift('interface/cassandra.thrift', 'Cassandra')]
                )
@@ -744,6 +753,7 @@ idls = ['idl/gossip_digest.idl.hh',
         'idl/tracing.idl.hh',
         'idl/consistency_level.idl.hh',
         'idl/cache_temperature.idl.hh',
+        'idl/view.idl.hh',
         ]
 
 scylla_tests_dependencies = scylla_core + idls + [
@@ -773,6 +783,7 @@ pure_boost_tests = set([
     'tests/test-serialization',
     'tests/range_test',
     'tests/crc_test',
+    'tests/checksum_utils_test',
     'tests/managed_vector_test',
     'tests/dynamic_bitset_test',
     'tests/idl_test',
@@ -1001,6 +1012,8 @@ seastar_ldflags = args.user_ldflags
 seastar_flags += ['--compiler', args.cxx, '--c-compiler', args.cc, '--cflags=%s' % (seastar_cflags), '--ldflags=%s' % (seastar_ldflags),
                   '--c++-dialect=gnu++1z', '--optflags=%s' % (modes['release']['opt']), ]
 
+libdeflate_cflags = seastar_cflags
+
 status = subprocess.call([args.python, './configure.py'] + seastar_flags, cwd='seastar')
 
 if status != 0:
@@ -1100,6 +1113,9 @@ with open(buildfile, 'w') as f:
         command = {ninja} -C $subdir $target
         restat = 1
         description = NINJA $out
+    rule run
+        command = $in > $out
+        description = GEN $out
     rule copy
         command = cp $in $out
         description = COPY $out
@@ -1172,6 +1188,10 @@ with open(buildfile, 'w') as f:
         if binary.endswith('.a'):
             f.write('build $builddir/{}/{}: ar.{} {}\n'.format(mode, binary, mode, str.join(' ', objs)))
         else:
+            objs.extend(['$builddir/' + mode + '/' + artifact for artifact in [
+                'libdeflate/libdeflate.a'
+            ]])
+            objs.append('$builddir/' + mode + '/gen/utils/gz/crc_combine_table.o')
             if binary.startswith('tests/'):
                 local_libs = '$libs'
                 if binary not in tests_not_using_seastar_test_framework or binary in pure_boost_tests:
@@ -1213,6 +1233,12 @@ with open(buildfile, 'w') as f:
                 antlr3_grammars.add(src)
             else:
                 raise Exception('No rule for ' + src)
+        compiles['$builddir/' + mode + '/gen/utils/gz/crc_combine_table.o'] = '$builddir/' + mode + '/gen/utils/gz/crc_combine_table.cc'
+        compiles['$builddir/' + mode + '/utils/gz/gen_crc_combine_table.o'] = 'utils/gz/gen_crc_combine_table.cc'
+        f.write('build {}: run {}\n'.format('$builddir/' + mode + '/gen/utils/gz/crc_combine_table.cc',
+                                            '$builddir/' + mode + '/utils/gz/gen_crc_combine_table'))
+        f.write('build {}: link.{} {}\n'.format('$builddir/' + mode + '/utils/gz/gen_crc_combine_table', mode,
+                                                '$builddir/' + mode + '/utils/gz/gen_crc_combine_table.o'))
         for obj in compiles:
             src = compiles[obj]
             gen_headers = list(ragels.keys())
@@ -1262,6 +1288,10 @@ with open(buildfile, 'w') as f:
             ''').format(**locals()))
         f.write('build build/$mode/scylla-package.tar: package build/{mode}/scylla build/{mode}/iotune\n'.format(**locals()))
         f.write('  mode = {mode}\n'.format(**locals()))
+        f.write('rule libdeflate.{mode}\n'.format(**locals()))
+        f.write('  command = make -C libdeflate BUILD_DIR=../build/{mode}/libdeflate/ CFLAGS="{libdeflate_cflags}" CC={args.cc} ../build/{mode}/libdeflate//libdeflate.a\n'.format(**locals()))
+        f.write('build build/{mode}/libdeflate/libdeflate.a: libdeflate.{mode}\n'.format(**locals()))
+
     f.write('build {}: phony\n'.format(seastar_deps))
     f.write(textwrap.dedent('''\
         rule configure
@@ -38,44 +38,44 @@ private:
     static bool is_compatible(const column_definition& new_def, const data_type& old_type, column_kind kind) {
         return ::is_compatible(new_def.kind, kind) && new_def.type->is_value_compatible_with(*old_type);
     }
+    static atomic_cell upgrade_cell(const abstract_type& new_type, const abstract_type& old_type, atomic_cell_view cell,
+                                    atomic_cell::collection_member cm = atomic_cell::collection_member::no) {
+        if (cell.is_live() && !old_type.is_counter()) {
+            if (cell.is_live_and_has_ttl()) {
+                return atomic_cell::make_live(new_type, cell.timestamp(), cell.value().linearize(), cell.expiry(), cell.ttl(), cm);
+            }
+            return atomic_cell::make_live(new_type, cell.timestamp(), cell.value().linearize(), cm);
+        } else {
+            return atomic_cell(new_type, cell);
+        }
+    }
     static void accept_cell(row& dst, column_kind kind, const column_definition& new_def, const data_type& old_type, atomic_cell_view cell) {
         if (!is_compatible(new_def, old_type, kind) || cell.timestamp() <= new_def.dropped_at()) {
             return;
         }
-        auto new_cell = [&] {
-            if (cell.is_live() && !old_type->is_counter()) {
-                if (cell.is_live_and_has_ttl()) {
-                    return atomic_cell_or_collection(
-                        atomic_cell::make_live(*new_def.type, cell.timestamp(), cell.value().linearize(), cell.expiry(), cell.ttl())
-                    );
-                }
-                return atomic_cell_or_collection(
-                    atomic_cell::make_live(*new_def.type, cell.timestamp(), cell.value().linearize())
-                );
-            } else {
-                return atomic_cell_or_collection(*new_def.type, cell);
-            }
-        }();
-        dst.apply(new_def, std::move(new_cell));
+        dst.apply(new_def, upgrade_cell(*new_def.type, *old_type, cell));
     }
     static void accept_cell(row& dst, column_kind kind, const column_definition& new_def, const data_type& old_type, collection_mutation_view cell) {
         if (!is_compatible(new_def, old_type, kind)) {
             return;
         }
         cell.data.with_linearized([&] (bytes_view cell_bv) {
-            auto&& ctype = static_pointer_cast<const collection_type_impl>(old_type);
-            auto old_view = ctype->deserialize_mutation_form(cell_bv);
+            auto new_ctype = static_pointer_cast<const collection_type_impl>(new_def.type);
+            auto old_ctype = static_pointer_cast<const collection_type_impl>(old_type);
+            auto old_view = old_ctype->deserialize_mutation_form(cell_bv);
 
-            collection_type_impl::mutation_view new_view;
+            collection_type_impl::mutation new_view;
             if (old_view.tomb.timestamp > new_def.dropped_at()) {
                 new_view.tomb = old_view.tomb;
             }
             for (auto& c : old_view.cells) {
                 if (c.second.timestamp() > new_def.dropped_at()) {
-                    new_view.cells.emplace_back(std::move(c));
+                    new_view.cells.emplace_back(c.first, upgrade_cell(*new_ctype->value_comparator(), *old_ctype->value_comparator(), c.second, atomic_cell::collection_member::yes));
                 }
             }
-            dst.apply(new_def, ctype->serialize_mutation_form(std::move(new_view)));
+            if (new_view.tomb || !new_view.cells.empty()) {
+                dst.apply(new_def, new_ctype->serialize_mutation_form(std::move(new_view)));
+            }
         });
     }
 public:
@@ -67,6 +67,12 @@ class error_collector : public error_listener<RecognizerType, ExceptionBaseType>
      */
     const sstring_view _query;
 
+    /**
+     * An empty bitset to be used as a workaround for AntLR null dereference
+     * bug.
+     */
+    static typename ExceptionBaseType::BitsetListType _empty_bit_list;
+
 public:
 
     /**
@@ -144,6 +150,14 @@ private:
             break;
         }
         default:
+            // AntLR Exception class has a bug of dereferencing a null
+            // pointer in the displayRecognitionError. The following
+            // if statement makes sure it will not be null before the
+            // call to that function (displayRecognitionError).
+            // bug reference: https://github.com/antlr/antlr3/issues/191
+            if (!ex->get_expectingSet()) {
+                ex->set_expectingSet(&_empty_bit_list);
+            }
             ex->displayRecognitionError(token_names, msg);
         }
         return msg.str();
@@ -345,4 +359,8 @@ private:
 #endif
 };
 
+template<typename RecognizerType, typename TokenType, typename ExceptionBaseType>
+typename ExceptionBaseType::BitsetListType
+error_collector<RecognizerType,TokenType,ExceptionBaseType>::_empty_bit_list = typename ExceptionBaseType::BitsetListType();
+
 }
@@ -100,12 +100,28 @@ public:
     bool has_unrestricted_components(const schema& schema) const;
 
     virtual bool needs_filtering(const schema& schema) const;
 
+    // How long a prefix of the restrictions could have resulted in
+    // need_filtering() == false. These restrictions do not need to be
+    // applied during filtering.
+    // For example, if we have the filter "c1 < 3 and c2 > 3", c1 does
+    // not need filtering (just a read stopping at c1=3) but c2 does,
+    // so num_prefix_columns_that_need_not_be_filtered() will be 1.
+    virtual unsigned int num_prefix_columns_that_need_not_be_filtered() const {
+        return 0;
+    }
+
     virtual bool is_all_eq() const {
         return false;
     }
     virtual size_t prefix_size() const {
         return 0;
     }
 
+    size_t prefix_size(const schema_ptr schema) const {
+        return 0;
+    }
+
 };
 
 template<>
@@ -129,5 +145,23 @@ inline bool primary_key_restrictions<clustering_key>::needs_filtering(const sche
     return false;
 }
 
+template<>
+inline size_t primary_key_restrictions<clustering_key>::prefix_size(const schema_ptr schema) const {
+    size_t count = 0;
+    if (schema->clustering_key_columns().empty()) {
+        return count;
+    }
+    auto column_defs = get_column_defs();
+    column_id expected_column_id = schema->clustering_key_columns().begin()->id;
+    for (auto&& cdef : column_defs) {
+        if (schema->position(*cdef) != expected_column_id) {
+            return count;
+        }
+        expected_column_id++;
+        count++;
+    }
+    return count;
+}
+
 }
 }
@@ -166,19 +166,7 @@ public:
     }
 
     virtual size_t prefix_size() const override {
-        size_t count = 0;
-        if (_schema->clustering_key_columns().empty()) {
-            return count;
-        }
-        column_id expected_column_id = _schema->clustering_key_columns().begin()->id;
-        for (const auto& restriction_entry : _restrictions->restrictions()) {
-            if (_schema->position(*restriction_entry.first) != expected_column_id) {
-                return count;
-            }
-            expected_column_id++;
-            count++;
-        }
-        return count;
+        return primary_key_restrictions<ValueType>::prefix_size(_schema);
     }
 
     ::shared_ptr<single_column_primary_key_restrictions<clustering_key>> get_longest_prefix_restrictions() {
@@ -419,6 +407,7 @@ public:
     }
 
     virtual bool needs_filtering(const schema& schema) const override;
+    virtual unsigned int num_prefix_columns_that_need_not_be_filtered() const override;
 };
 
 template<>
@@ -499,6 +488,39 @@ inline bool single_column_primary_key_restrictions<clustering_key>::needs_filter
     return false;
 }
 
+// How many of the restrictions (in column order) do not need filtering
+// because they are implemented as a slice (potentially, a contiguous disk
+// read). For example, if we have the filter "c1 < 3 and c2 > 3", c1 does not
+// need filtering but c2 does so num_prefix_columns_that_need_not_be_filtered
+// will be 1.
+// The implementation of num_prefix_columns_that_need_not_be_filtered() is
+// closely tied to that of needs_filtering() above - basically, if only the
+// first num_prefix_columns_that_need_not_be_filtered() restrictions existed,
+// then needs_filtering() would have returned false.
+template<>
+inline unsigned single_column_primary_key_restrictions<clustering_key>::num_prefix_columns_that_need_not_be_filtered() const {
+    column_id position = 0;
+    unsigned int count = 0;
+    for (const auto& restriction : _restrictions->restrictions() | boost::adaptors::map_values) {
+        if (restriction->is_contains() || position != restriction->get_column_def().id) {
+            return count;
+        }
+        if (!restriction->is_slice()) {
+            position = restriction->get_column_def().id + 1;
+        }
+        count++;
+    }
+    return count;
+}
+
+template<>
+inline unsigned single_column_primary_key_restrictions<partition_key>::num_prefix_columns_that_need_not_be_filtered() const {
+    // skip_filtering() is currently called only for clustering key
+    // restrictions, so it doesn't matter what we return here.
+    return 0;
+}
+
 }
 }
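To make the counting rule concrete: with clustering key (c1, c2, c3), the filter c1 = 1 AND c2 < 3 AND c3 > 5 yields a count of 2. The equality on c1 advances the expected position, the slice on c2 counts but stops the position from advancing, and c3 then fails the position check. A self-contained rendering of the same loop, with simplified stand-in types rather than the Scylla classes:

```cpp
#include <vector>
#include <cstdint>
#include <cassert>

struct restriction_info {
    uint32_t column_id;  // position of the column in the clustering key
    bool is_slice;       // range restriction (<, >, ...) rather than equality
    bool is_contains;    // CONTAINS-style restriction
};

// Mirrors num_prefix_columns_that_need_not_be_filtered(): equalities
// extend the prefix, one trailing slice is still allowed, and anything
// after a slice (or any gap/CONTAINS) must be filtered.
unsigned prefix_not_needing_filtering(const std::vector<restriction_info>& rs) {
    uint32_t position = 0;
    unsigned count = 0;
    for (const auto& r : rs) {
        if (r.is_contains || position != r.column_id) {
            return count;
        }
        if (!r.is_slice) {
            position = r.column_id + 1;
        }
        ++count;
    }
    return count;
}

int main() {
    // c1 = 1 AND c2 < 3 AND c3 > 5  ->  c1 and c2 need no filtering.
    std::vector<restriction_info> rs = {{0, false, false}, {1, true, false}, {2, true, false}};
    assert(prefix_not_needing_filtering(rs) == 2);
}
```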
@@ -337,6 +337,52 @@ const std::vector<::shared_ptr<restrictions>>& statement_restrictions::index_res
     return _index_restrictions;
 }
 
+std::optional<secondary_index::index> statement_restrictions::find_idx(secondary_index::secondary_index_manager& sim) const {
+    for (::shared_ptr<cql3::restrictions::restrictions> restriction : index_restrictions()) {
+        for (const auto& cdef : restriction->get_column_defs()) {
+            for (auto index : sim.list_indexes()) {
+                if (index.depends_on(*cdef)) {
+                    return std::make_optional<secondary_index::index>(std::move(index));
+                }
+            }
+        }
+    }
+    return std::nullopt;
+}
+
+std::vector<const column_definition*> statement_restrictions::get_column_defs_for_filtering(database& db) const {
+    std::vector<const column_definition*> column_defs_for_filtering;
+    if (need_filtering()) {
+        auto& sim = db.find_column_family(_schema).get_index_manager();
+        std::optional<secondary_index::index> opt_idx = find_idx(sim);
+        auto column_uses_indexing = [&opt_idx] (const column_definition* cdef) {
+            return opt_idx && opt_idx->depends_on(*cdef);
+        };
+        if (_partition_key_restrictions->needs_filtering(*_schema)) {
+            for (auto&& cdef : _partition_key_restrictions->get_column_defs()) {
+                if (!column_uses_indexing(cdef)) {
+                    column_defs_for_filtering.emplace_back(cdef);
+                }
+            }
+        }
+        if (_clustering_columns_restrictions->needs_filtering(*_schema)) {
+            column_id first_filtering_id = _schema->clustering_key_columns().begin()->id +
+                    _clustering_columns_restrictions->num_prefix_columns_that_need_not_be_filtered();
+            for (auto&& cdef : _clustering_columns_restrictions->get_column_defs()) {
+                if (cdef->id >= first_filtering_id && !column_uses_indexing(cdef)) {
+                    column_defs_for_filtering.emplace_back(cdef);
+                }
+            }
+        }
+        for (auto&& cdef : _nonprimary_key_restrictions->get_column_defs()) {
+            if (!column_uses_indexing(cdef)) {
+                column_defs_for_filtering.emplace_back(cdef);
+            }
+        }
+    }
+    return column_defs_for_filtering;
+}
+
 void statement_restrictions::process_partition_key_restrictions(bool has_queriable_index, bool for_view, bool allow_filtering) {
     // If there is a queriable index, no special condition are required on the other restrictions.
     // But we still need to know 2 things:
@@ -163,6 +163,20 @@ public:
         return _clustering_columns_restrictions;
     }
 
+    /**
+     * Builds a possibly empty collection of column definitions that will be used for filtering
+     * @param db - the database context
+     * @return A list with the column definitions needed for filtering.
+     */
+    std::vector<const column_definition*> get_column_defs_for_filtering(database& db) const;
+
+    /**
+     * Determines the index to be used with the restriction.
+     * @param db - the database context (for extracting index manager)
+     * @return If an index can be used, an optional containing this index, otherwise an empty optional.
+     */
+    std::optional<secondary_index::index> find_idx(secondary_index::secondary_index_manager& sim) const;
+
     /**
      * Checks if the partition key has some unrestricted components.
      * @return <code>true</code> if the partition key has some unrestricted components, <code>false</code> otherwise.
@@ -156,9 +156,9 @@ public:
         return _factories->uses_function(ks_name, function_name);
     }
 
-    virtual uint32_t add_column_for_ordering(const column_definition& c) override {
-        uint32_t index = selection::add_column_for_ordering(c);
-        _factories->add_selector_for_ordering(c, index);
+    virtual uint32_t add_column_for_post_processing(const column_definition& c) override {
+        uint32_t index = selection::add_column_for_post_processing(c);
+        _factories->add_selector_for_post_processing(c, index);
         return index;
     }
@@ -227,7 +227,7 @@ protected:
     return simple_selection::make(schema, std::move(columns), false);
 }
 
-uint32_t selection::add_column_for_ordering(const column_definition& c) {
+uint32_t selection::add_column_for_post_processing(const column_definition& c) {
     _columns.push_back(&c);
     _metadata->add_non_serialized_column(c.column_specification);
     return _columns.size() - 1;
@@ -339,14 +339,14 @@ std::unique_ptr<result_set> result_set_builder::build() {
     return std::move(_result_set);
 }
 
-bool result_set_builder::restrictions_filter::operator()(const selection& selection,
+bool result_set_builder::restrictions_filter::do_filter(const selection& selection,
                                                          const std::vector<bytes>& partition_key,
                                                          const std::vector<bytes>& clustering_key,
                                                          const query::result_row_view& static_row,
                                                          const query::result_row_view& row) const {
     static logging::logger rlogger("restrictions_filter");
 
-    if (_current_partition_key_does_not_match || _current_static_row_does_not_match) {
+    if (_current_partition_key_does_not_match || _current_static_row_does_not_match || _remaining == 0) {
         return false;
     }
@@ -427,6 +427,20 @@ bool result_set_builder::restrictions_filter::operator()(const selection& select
     return true;
 }
 
+bool result_set_builder::restrictions_filter::operator()(const selection& selection,
+                                                         const std::vector<bytes>& partition_key,
+                                                         const std::vector<bytes>& clustering_key,
+                                                         const query::result_row_view& static_row,
+                                                         const query::result_row_view& row) const {
+    const bool accepted = do_filter(selection, partition_key, clustering_key, static_row, row);
+    if (!accepted) {
+        ++_rows_dropped;
+    } else if (_remaining > 0) {
+        --_remaining;
+    }
+    return accepted;
+}
+
 api::timestamp_type result_set_builder::timestamp_of(size_t idx) {
     return _timestamps[idx];
 }
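The operator()/do_filter split above is a small decorator: the accept/reject decision stays pure in do_filter(), while operator() maintains the _rows_dropped and _remaining counters around every call. A reduced sketch of the same shape, with a stand-in predicate and simplified types:

```cpp
#include <cstdint>

// A filter functor that separates the accept/reject decision from the
// bookkeeping done around it, in the style of restrictions_filter.
class counting_filter {
    mutable uint32_t _rows_dropped = 0;
    mutable uint32_t _remaining;
public:
    explicit counting_filter(uint32_t limit) : _remaining(limit) {}

    bool operator()(int row) const {
        const bool accepted = do_filter(row);
        if (!accepted) {
            ++_rows_dropped;          // lets callers see how short the page was
        } else if (_remaining > 0) {
            --_remaining;             // stop accepting once the limit is hit
        }
        return accepted;
    }
    uint32_t rows_dropped() const { return _rows_dropped; }
private:
    bool do_filter(int row) const {
        // Stand-in predicate; rejects everything once the limit is exhausted.
        return _remaining != 0 && row % 2 == 0;
    }
};
```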
@@ -176,7 +176,7 @@ public:
     static ::shared_ptr<selection> wildcard(schema_ptr schema);
     static ::shared_ptr<selection> for_columns(schema_ptr schema, std::vector<const column_definition*> columns);
 
-    virtual uint32_t add_column_for_ordering(const column_definition& c);
+    virtual uint32_t add_column_for_post_processing(const column_definition& c);
 
     virtual bool uses_function(const sstring &ks_name, const sstring& function_name) const {
         return false;
@@ -259,20 +259,31 @@ public:
         }
         void reset() {
         }
+        uint32_t get_rows_dropped() const {
+            return 0;
+        }
     };
     class restrictions_filter {
         ::shared_ptr<restrictions::statement_restrictions> _restrictions;
         const query_options& _options;
         mutable bool _current_partition_key_does_not_match = false;
         mutable bool _current_static_row_does_not_match = false;
+        mutable uint32_t _rows_dropped = 0;
+        mutable uint32_t _remaining = 0;
     public:
         restrictions_filter() = default;
-        explicit restrictions_filter(::shared_ptr<restrictions::statement_restrictions> restrictions, const query_options& options) : _restrictions(restrictions), _options(options) {}
+        explicit restrictions_filter(::shared_ptr<restrictions::statement_restrictions> restrictions, const query_options& options, uint32_t remaining) : _restrictions(restrictions), _options(options), _remaining(remaining) {}
         bool operator()(const selection& selection, const std::vector<bytes>& pk, const std::vector<bytes>& ck, const query::result_row_view& static_row, const query::result_row_view& row) const;
         void reset() {
             _current_partition_key_does_not_match = false;
             _current_static_row_does_not_match = false;
+            _rows_dropped = 0;
         }
+        uint32_t get_rows_dropped() const {
+            return _rows_dropped;
+        }
+    private:
+        bool do_filter(const selection& selection, const std::vector<bytes>& pk, const std::vector<bytes>& ck, const query::result_row_view& static_row, const query::result_row_view& row) const;
     };
 
     result_set_builder(const selection& s, gc_clock::time_point now, cql_serialization_format sf);
@@ -372,7 +383,7 @@ public:
         }
     }
 
-    void accept_partition_end(const query::result_row_view& static_row) {
+    uint32_t accept_partition_end(const query::result_row_view& static_row) {
         if (_row_count == 0) {
             _builder.new_row();
             auto static_row_iterator = static_row.iterator();
@@ -386,6 +397,7 @@ public:
                 }
             }
         }
+        return _filter.get_rows_dropped();
     }
 };
@@ -53,6 +53,7 @@ selector_factories::selector_factories(std::vector<::shared_ptr<selectable>> sel
     : _contains_write_time_factory(false)
     , _contains_ttl_factory(false)
     , _number_of_aggregate_factories(0)
+    , _number_of_factories_for_post_processing(0)
 {
     _factories.reserve(selectables.size());
@@ -76,8 +77,9 @@ bool selector_factories::uses_function(const sstring& ks_name, const sstring& fu
     return false;
 }
 
-void selector_factories::add_selector_for_ordering(const column_definition& def, uint32_t index) {
+void selector_factories::add_selector_for_post_processing(const column_definition& def, uint32_t index) {
     _factories.emplace_back(simple_selector::new_factory(def.name_as_text(), index, def.type));
+    ++_number_of_factories_for_post_processing;
 }
 
 std::vector<::shared_ptr<selector>> selector_factories::new_instances() const {
@@ -74,6 +74,11 @@ private:
      */
     uint32_t _number_of_aggregate_factories;
 
+    /**
+     * The number of factories that are only for post processing.
+     */
+    uint32_t _number_of_factories_for_post_processing;
+
 public:
     /**
     * Creates a new <code>SelectorFactories</code> instance and collect the column definitions.
@@ -97,11 +102,12 @@ public:
     bool uses_function(const sstring& ks_name, const sstring& function_name) const;
 
     /**
-     * Adds a new <code>Selector.Factory</code> for a column that is needed only for ORDER BY purposes.
+     * Adds a new <code>Selector.Factory</code> for a column that is needed only for ORDER BY or post
+     * processing purposes.
      * @param def the column that is needed for ordering
     * @param index the index of the column definition in the Selection's list of columns
     */
-    void add_selector_for_ordering(const column_definition& def, uint32_t index);
+    void add_selector_for_post_processing(const column_definition& def, uint32_t index);
 
     /**
      * Checks if this <code>SelectorFactories</code> contains only factories for aggregates.
@@ -111,7 +117,7 @@ public:
     */
    bool contains_only_aggregate_functions() const {
        auto size = _factories.size();
-        return size != 0 && _number_of_aggregate_factories == size;
+        return size != 0 && _number_of_aggregate_factories == (size - _number_of_factories_for_post_processing);
    }

    /**
@@ -276,7 +276,7 @@ future<shared_ptr<cql_transport::event::schema_change>> alter_table_statement::a
 
         auto type = validate_alter(schema, *def, *validator);
         // In any case, we update the column definition
-        cfm.with_altered_column_type(column_name->name(), type);
+        cfm.alter_column_type(column_name->name(), type);
 
         // We also have to validate the view types here. If we have a view which includes a column as part of
         // the clustering key, we need to make sure that it is indeed compatible.
@@ -285,7 +285,7 @@ future<shared_ptr<cql_transport::event::schema_change>> alter_table_statement::a
             if (view_def) {
                 schema_builder builder(view);
                 auto view_type = validate_alter(view, *view_def, *validator);
-                builder.with_altered_column_type(column_name->name(), std::move(view_type));
+                builder.alter_column_type(column_name->name(), std::move(view_type));
                 view_updates.push_back(view_ptr(builder.build()));
             }
         }
@@ -306,7 +306,7 @@ future<shared_ptr<cql_transport::event::schema_change>> alter_table_statement::a
         } else {
             for (auto&& column_def : boost::range::join(schema->static_columns(), schema->regular_columns())) { // find
                 if (column_def.name() == column_name->name()) {
-                    cfm.without_column(column_name->name());
+                    cfm.remove_column(column_name->name());
                     break;
                 }
             }
@@ -349,7 +349,7 @@ future<shared_ptr<cql_transport::event::schema_change>> alter_table_statement::a
         auto to = entry.second->prepare_column_identifier(schema);
 
         validate_column_rename(db, *schema, *from, *to);
-        cfm.with_column_rename(from->name(), to->name());
+        cfm.rename_column(from->name(), to->name());
 
         // If the view includes a renamed column, it must be renamed in
         // the view table and the definition.
@@ -360,7 +360,7 @@ future<shared_ptr<cql_transport::event::schema_change>> alter_table_statement::a
             auto view_from = entry.first->prepare_column_identifier(view);
             auto view_to = entry.second->prepare_column_identifier(view);
             validate_column_rename(db, *view, *view_from, *view_to);
-            builder.with_column_rename(view_from->name(), view_to->name());
+            builder.rename_column(view_from->name(), view_to->name());
 
             auto new_where = util::rename_column_in_where_clause(
                 view->view_info()->where_clause(),
@@ -110,7 +110,7 @@ void alter_type_statement::do_announce_migration(database& db, ::keyspace& ks, b
         if (t_opt) {
             modified = true;
             // We need to update this column
-            cfm.with_altered_column_type(column.name(), *t_opt);
+            cfm.alter_column_type(column.name(), *t_opt);
         }
     }
     if (modified) {
@@ -88,6 +88,11 @@ create_index_statement::validate(service::storage_proxy& proxy, const service::c
         throw exceptions::invalid_request_exception("Secondary indexes are not supported on materialized views");
     }
 
+    if (schema->is_dense()) {
+        throw exceptions::invalid_request_exception(
+                "Secondary indexes are not supported on COMPACT STORAGE tables that have clustering columns");
+    }
+
     std::vector<::shared_ptr<index_target>> targets;
     for (auto& raw_target : _raw_targets) {
         targets.emplace_back(raw_target->prepare(schema));
@@ -109,6 +114,11 @@ create_index_statement::validate(service::storage_proxy& proxy, const service::c
                 sprint("No column definition found for column %s", *target->column));
     }
 
+    // NOTICE(sarna): Should be lifted after resolving issue #2963
+    if (cd->is_static()) {
+        throw exceptions::invalid_request_exception("Indexing static columns is not implemented yet.");
+    }
+
     if (cd->type->references_duration()) {
         using request_validations::check_false;
         const auto& ty = *cd->type;
@@ -122,8 +132,7 @@ create_index_statement::validate(service::storage_proxy& proxy, const service::c
     }
 
     // Origin TODO: we could lift that limitation
-    if ((schema->is_dense() || !schema->thrift().has_compound_comparator()) &&
-        cd->kind != column_kind::regular_column) {
+    if ((schema->is_dense() || !schema->thrift().has_compound_comparator()) && cd->is_primary_key()) {
         throw exceptions::invalid_request_exception(
                 "Secondary indexes are not supported on PRIMARY KEY columns in COMPACT STORAGE tables");
     }
@@ -137,10 +146,15 @@ create_index_statement::validate(service::storage_proxy& proxy, const service::c
 
     bool is_map = dynamic_cast<const collection_type_impl *>(cd->type.get()) != nullptr
                && dynamic_cast<const collection_type_impl *>(cd->type.get())->is_map();
-    bool is_frozen_collection = cd->type->is_collection() && !cd->type->is_multi_cell();
+    bool is_collection = cd->type->is_collection();
+    bool is_frozen_collection = is_collection && !cd->type->is_multi_cell();
+
     if (is_frozen_collection) {
         validate_for_frozen_collection(target);
+    } else if (is_collection) {
+        // NOTICE(sarna): should be lifted after #2962 (indexes on non-frozen collections) is implemented
+        throw exceptions::invalid_request_exception(
+                sprint("Cannot create secondary index on non-frozen collection column %s", cd->name_as_text()));
     } else {
         validate_not_full_index(target);
         validate_is_values_index_if_target_column_not_collection(cd, target);
@@ -84,7 +84,6 @@ create_view_statement::create_view_statement(
     , _clustering_keys{clustering_keys}
     , _if_not_exists{if_not_exists}
 {
-    service::get_local_storage_proxy().get_db().local().get_config().check_experimental("Creating materialized views");
     if (!service::get_local_storage_service().cluster_supports_materialized_views()) {
         throw exceptions::invalid_request_exception("Can't create materialized views until the whole cluster has been upgraded");
     }
@@ -315,6 +314,27 @@ future<shared_ptr<cql_transport::event::schema_change>> create_view_statement::a
         throw exceptions::invalid_request_exception(sprint("No columns are defined for Materialized View other than primary key"));
     }
 
+    // The unique feature of a filter by a non-key column is that the
+    // value of such column can be updated - and also be expired with TTL
+    // and cause the view row to appear and disappear. We don't currently
+    // support this case - see issue #3430, and neither does
+    // Cassandra - see CASSANDRA-13798 and CASSANDRA-13832.
+    // Actually, as CASSANDRA-13798 explains, the problem is "the liveness of
+    // view row is now depending on multiple base columns (multiple filtered
+    // non-pk base column + base column used in view pk)". When the filtered
+    // column *is* the base column added to the view pk, we don't have this
+    // problem. And this case actually works correctly.
+    auto non_pk_restrictions = restrictions->get_non_pk_restriction();
+    if (non_pk_restrictions.size() == 1 && has_non_pk_column &&
+            std::find(target_primary_keys.begin(), target_primary_keys.end(), non_pk_restrictions.cbegin()->first) != target_primary_keys.end()) {
+        // This case (filter by new PK column of the view) works, as explained above
+    } else if (!non_pk_restrictions.empty()) {
+        auto column_names = ::join(", ", non_pk_restrictions | boost::adaptors::map_keys | boost::adaptors::transformed(std::mem_fn(&column_definition::name_as_text)));
+        throw exceptions::invalid_request_exception(sprint(
+                "Non-primary key columns cannot be restricted in the SELECT statement used for materialized view %s creation (got restrictions on: %s)",
+                column_family(), column_names));
+    }
+
     schema_builder builder{keyspace(), column_family()};
     auto add_columns = [this, &builder] (std::vector<const column_definition*>& defs, column_kind kind) mutable {
         for (auto* def : defs) {
@@ -49,7 +49,7 @@ void cql3::statements::index_prop_defs::validate() {
     property_definitions::validate(keywords);
 
     if (is_custom && !custom_class) {
-        throw exceptions::invalid_request_exception("CUSTOM index requires specifiying the index class");
+        throw exceptions::invalid_request_exception("CUSTOM index requires specifying the index class");
     }
 
     if (!is_custom && custom_class) {
@@ -64,6 +64,16 @@ void cql3::statements::index_prop_defs::validate() {
                 sprint("Cannot specify %s as a CUSTOM option",
                        db::index::secondary_index::custom_index_option_name));
     }
+
+    // Currently, Scylla does not support *any* class of custom index
+    // implementation. If in the future we do (e.g., SASI, or something
+    // new), we'll need to check for valid values here.
+    if (is_custom && custom_class) {
+        throw exceptions::invalid_request_exception(
+                format("Unsupported CUSTOM INDEX class {}. Note that currently, Scylla does not support SASI or any other CUSTOM INDEX class.",
+                       *custom_class));
+    }
 }
 
 index_options_map
@@ -141,6 +141,10 @@ private:
     /** If ALLOW FILTERING was not specified, this verifies that it is not needed */
     void check_needs_filtering(::shared_ptr<restrictions::statement_restrictions> restrictions);
 
+    void ensure_filtering_columns_retrieval(database& db,
+                                            ::shared_ptr<selection::selection> selection,
+                                            ::shared_ptr<restrictions::statement_restrictions> restrictions);
+
     bool contains_alias(::shared_ptr<column_identifier> name);
 
     ::shared_ptr<column_specification> limit_receiver();
@@ -383,8 +383,9 @@ select_statement::do_execute(service::storage_proxy& proxy,
     int32_t limit = get_limit(options);
     auto now = gc_clock::now();
 
+    const bool restrictions_need_filtering = _restrictions->need_filtering();
     ++_stats.reads;
-    _stats.filtered_reads += _restrictions->need_filtering();
+    _stats.filtered_reads += restrictions_need_filtering;
 
     auto command = ::make_lw_shared<query::read_command>(_schema->id(), _schema->version(),
         make_partition_slice(options), limit, now, tracing::make_trace_info(state.get_trace_state()), query::max_partitions, utils::UUID(), options.get_timestamp(state));
@@ -396,37 +397,41 @@ select_statement::do_execute(service::storage_proxy& proxy,
|
||||
// An aggregation query will never be paged for the user, but we always page it internally to avoid OOM.
|
||||
// If we user provided a page_size we'll use that to page internally (because why not), otherwise we use our default
|
||||
// Note that if there are some nodes in the cluster with a version less than 2.0, we can't use paging (CASSANDRA-6707).
|
||||
auto aggregate = _selection->is_aggregate();
|
||||
if (aggregate && page_size <= 0) {
|
||||
const bool aggregate = _selection->is_aggregate();
|
||||
const bool nonpaged_filtering = restrictions_need_filtering && page_size <= 0;
|
||||
if (aggregate || nonpaged_filtering) {
|
||||
page_size = DEFAULT_COUNT_PAGE_SIZE;
|
||||
}
|
||||
|
||||
auto key_ranges = _restrictions->get_partition_key_ranges(options);
|
||||
|
||||
if (!aggregate && (page_size <= 0
|
||||
if (!aggregate && !restrictions_need_filtering && (page_size <= 0
|
||||
|| !service::pager::query_pagers::may_need_paging(*_schema, page_size,
|
||||
*command, key_ranges))) {
|
||||
return execute(proxy, command, std::move(key_ranges), state, options, now);
|
||||
}
|
||||
|
||||
command->slice.options.set<query::partition_slice::option::allow_short_read>();
|
||||
auto timeout = db::timeout_clock::now() + options.get_timeout_config().*get_timeout_config_selector();
|
||||
auto timeout_duration = options.get_timeout_config().*get_timeout_config_selector();
|
||||
auto p = service::pager::query_pagers::pager(_schema, _selection,
|
||||
state, options, command, std::move(key_ranges), _stats, _restrictions->need_filtering() ? _restrictions : nullptr);
|
||||
state, options, command, std::move(key_ranges), _stats, restrictions_need_filtering ? _restrictions : nullptr);
|
||||
|
||||
if (aggregate) {
|
||||
if (aggregate || nonpaged_filtering) {
|
||||
return do_with(
|
||||
cql3::selection::result_set_builder(*_selection, now,
|
||||
options.get_cql_serialization_format()),
|
||||
[this, p, page_size, now, timeout](auto& builder) {
|
||||
[this, p, page_size, now, timeout_duration, restrictions_need_filtering](auto& builder) {
|
||||
return do_until([p] {return p->is_exhausted();},
|
||||
[p, &builder, page_size, now, timeout] {
|
||||
[p, &builder, page_size, now, timeout_duration] {
|
||||
auto timeout = db::timeout_clock::now() + timeout_duration;
|
||||
return p->fetch_page(builder, page_size, now, timeout);
|
||||
}
|
||||
).then([this, &builder] {
|
||||
).then([this, &builder, restrictions_need_filtering] {
|
||||
auto rs = builder.build();
|
||||
if (restrictions_need_filtering) {
|
||||
_stats.filtered_rows_matched_total += rs->size();
|
||||
}
|
||||
update_stats_rows_read(rs->size());
|
||||
_stats.filtered_rows_matched_total += _restrictions->need_filtering() ? rs->size() : 0;
|
||||
auto msg = ::make_shared<cql_transport::messages::result_message::rows>(result(std::move(rs)));
|
||||
return make_ready_future<shared_ptr<cql_transport::messages::result_message>>(std::move(msg));
|
||||
});
|
||||
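The page-size logic above is the heart of this hunk: aggregates were already paged internally, and the change extends the same treatment to filtering queries that arrive without a page size. A minimal sketch of that decision, with DEFAULT_COUNT_PAGE_SIZE as a stand-in constant (not the real value):

constexpr int DEFAULT_COUNT_PAGE_SIZE = 1000; // stand-in, not the real default

int effective_page_size(bool aggregate, bool needs_filtering, int page_size) {
    const bool nonpaged_filtering = needs_filtering && page_size <= 0;
    if (aggregate || nonpaged_filtering) {
        return DEFAULT_COUNT_PAGE_SIZE; // page internally to bound memory use
    }
    return page_size;
}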
@@ -439,7 +444,8 @@ select_statement::do_execute(service::storage_proxy& proxy,
" you must either remove the ORDER BY or the IN and sort client side, or disable paging for this query");
}

if (_selection->is_trivial() && !_restrictions->need_filtering()) {
auto timeout = db::timeout_clock::now() + timeout_duration;
if (_selection->is_trivial() && !restrictions_need_filtering) {
return p->fetch_page_generator(page_size, now, timeout, _stats).then([this, p, limit] (result_generator generator) {
auto meta = [&] () -> shared_ptr<const cql3::metadata> {
if (!p->is_exhausted()) {
@@ -458,14 +464,16 @@ select_statement::do_execute(service::storage_proxy& proxy,
}

return p->fetch_page(page_size, now, timeout).then(
[this, p, &options, limit, now](std::unique_ptr<cql3::result_set> rs) {
[this, p, &options, now, restrictions_need_filtering](std::unique_ptr<cql3::result_set> rs) {

if (!p->is_exhausted()) {
rs->get_metadata().set_paging_state(p->state());
}

if (restrictions_need_filtering) {
_stats.filtered_rows_matched_total += rs->size();
}
update_stats_rows_read(rs->size());
_stats.filtered_rows_matched_total += _restrictions->need_filtering() ? rs->size() : 0;
auto msg = ::make_shared<cql_transport::messages::result_message::rows>(result(std::move(rs)));
return make_ready_future<shared_ptr<cql_transport::messages::result_message>>(std::move(msg));
});
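Storing timeout_duration instead of one precomputed deadline matters for the multi-page loops above: each fetch_page() call now gets a fresh deadline. A self-contained sketch of the pattern (the pager type is hypothetical):

#include <chrono>

// Hypothetical pager, only to show the control flow.
struct pager {
    int pages_left = 3;
    bool is_exhausted() const { return pages_left == 0; }
    void fetch_page(std::chrono::steady_clock::time_point /*deadline*/) { --pages_left; }
};

void drain(pager& p, std::chrono::milliseconds timeout_duration) {
    while (!p.is_exhausted()) {
        // Fresh deadline per page: a slow multi-page scan is not cut off by
        // one deadline computed before the first page was fetched.
        auto deadline = std::chrono::steady_clock::now() + timeout_duration;
        p.fetch_page(deadline);
    }
}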
@@ -492,15 +500,9 @@ generate_base_key_from_index_pk(const partition_key& index_pk, const clustering_
return KeyType::from_range(exploded_base_key);
}

future<shared_ptr<cql_transport::messages::result_message>>
indexed_table_select_statement::execute_base_query(
service::storage_proxy& proxy,
dht::partition_range_vector&& partition_ranges,
service::query_state& state,
const query_options& options,
gc_clock::time_point now,
::shared_ptr<const service::pager::paging_state> paging_state) {
auto cmd = ::make_lw_shared<query::read_command>(
lw_shared_ptr<query::read_command>
indexed_table_select_statement::prepare_command_for_base_query(const query_options& options, service::query_state& state, gc_clock::time_point now, bool use_paging) {
lw_shared_ptr<query::read_command> cmd = ::make_lw_shared<query::read_command>(
_schema->id(),
_schema->version(),
make_partition_slice(options),
@@ -510,9 +512,25 @@ indexed_table_select_statement::execute_base_query(
query::max_partitions,
utils::UUID(),
options.get_timestamp(state));
if (options.get_page_size() > 0) {
if (use_paging) {
cmd->slice.options.set<query::partition_slice::option::allow_short_read>();
cmd->slice.options.set<query::partition_slice::option::send_partition_key>();
if (_schema->clustering_key_size() > 0) {
cmd->slice.options.set<query::partition_slice::option::send_clustering_key>();
}
}
return cmd;
}
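Both execute_base_query() overloads now funnel through this helper; the use_paging flag (derived from bool(paging_state)) decides whether rows carry their keys back so the pager can resume. A schematic of that shape, with stand-in types and field names:

struct read_command_opts {
    bool allow_short_read = false;
    bool send_partition_key = false;
    bool send_clustering_key = false;
};

read_command_opts prepare_command_opts(bool use_paging, bool has_clustering_key) {
    read_command_opts opts;
    if (use_paging) {
        opts.allow_short_read = true;
        opts.send_partition_key = true;            // needed to rebuild paging state
        opts.send_clustering_key = has_clustering_key;
    }
    return opts;
}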

future<shared_ptr<cql_transport::messages::result_message>>
indexed_table_select_statement::execute_base_query(
service::storage_proxy& proxy,
dht::partition_range_vector&& partition_ranges,
service::query_state& state,
const query_options& options,
gc_clock::time_point now,
::shared_ptr<const service::pager::paging_state> paging_state) {
auto cmd = prepare_command_for_base_query(options, state, now, bool(paging_state));
auto timeout = db::timeout_clock::now() + options.get_timeout_config().*get_timeout_config_selector();
dht::partition_range_vector per_vnode_ranges;
per_vnode_ranges.reserve(partition_ranges.size());
@@ -586,19 +604,7 @@ indexed_table_select_statement::execute_base_query(
const query_options& options,
gc_clock::time_point now,
::shared_ptr<const service::pager::paging_state> paging_state) {
auto cmd = make_lw_shared<query::read_command>(
_schema->id(),
_schema->version(),
make_partition_slice(options),
get_limit(options),
now,
tracing::make_trace_info(state.get_trace_state()),
query::max_partitions,
utils::UUID(),
options.get_timestamp(state));
if (options.get_page_size() > 0) {
cmd->slice.options.set<query::partition_slice::option::allow_short_read>();
}
auto cmd = prepare_command_for_base_query(options, state, now, bool(paging_state));
auto timeout = db::timeout_clock::now() + options.get_timeout_config().*get_timeout_config_selector();

struct base_query_state {
@@ -714,7 +720,8 @@ select_statement::process_results(foreign_ptr<lw_shared_ptr<query::result>> resu
const query_options& options,
gc_clock::time_point now)
{
bool fast_path = !needs_post_query_ordering() && _selection->is_trivial() && !_restrictions->need_filtering();
const bool restrictions_need_filtering = _restrictions->need_filtering();
const bool fast_path = !needs_post_query_ordering() && _selection->is_trivial() && !restrictions_need_filtering;
if (fast_path) {
return make_shared<cql_transport::messages::result_message::rows>(result(
result_generator(_schema, std::move(results), std::move(cmd), _selection, _stats),
@@ -724,12 +731,12 @@ select_statement::process_results(foreign_ptr<lw_shared_ptr<query::result>> resu

cql3::selection::result_set_builder builder(*_selection, now,
options.get_cql_serialization_format());
if (_restrictions->need_filtering()) {
if (restrictions_need_filtering) {
results->ensure_counts();
_stats.filtered_rows_read_total += *results->row_count();
query::result_view::consume(*results, cmd->slice,
cql3::selection::result_set_builder::visitor(builder, *_schema,
*_selection, cql3::selection::result_set_builder::restrictions_filter(_restrictions, options)));
*_selection, cql3::selection::result_set_builder::restrictions_filter(_restrictions, options, cmd->row_limit)));
} else {
query::result_view::consume(*results, cmd->slice,
cql3::selection::result_set_builder::visitor(builder, *_schema,
@@ -745,7 +752,7 @@ select_statement::process_results(foreign_ptr<lw_shared_ptr<query::result>> resu
rs->trim(cmd->row_limit);
}
update_stats_rows_read(rs->size());
_stats.filtered_rows_matched_total += _restrictions->need_filtering() ? rs->size() : 0;
_stats.filtered_rows_matched_total += restrictions_need_filtering ? rs->size() : 0;
return ::make_shared<cql_transport::messages::result_message::rows>(result(std::move(rs)));
}
@@ -774,7 +781,8 @@ indexed_table_select_statement::prepare(database& db,
ordering_comparator_type ordering_comparator,
::shared_ptr<term> limit, cql_stats &stats)
{
auto index_opt = find_idx(db, schema, restrictions);
auto& sim = db.find_column_family(schema).get_index_manager();
auto index_opt = restrictions->find_idx(sim);
if (!index_opt) {
throw std::runtime_error("No index found.");
}
@@ -798,24 +806,6 @@ indexed_table_select_statement::prepare(database& db,

}


stdx::optional<secondary_index::index> indexed_table_select_statement::find_idx(database& db,
schema_ptr schema,
::shared_ptr<restrictions::statement_restrictions> restrictions)
{
auto& sim = db.find_column_family(schema).get_index_manager();
for (::shared_ptr<cql3::restrictions::restrictions> restriction : restrictions->index_restrictions()) {
for (const auto& cdef : restriction->get_column_defs()) {
for (auto index : sim.list_indexes()) {
if (index.depends_on(*cdef)) {
return stdx::make_optional<secondary_index::index>(std::move(index));
}
}
}
}
return stdx::nullopt;
}
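The deleted helper above simply scanned for the first index that depends on any restricted column; that lookup now lives in statement_restrictions::find_idx(). A generic sketch of the same scan (types and the depends_on callback are stand-ins):

#include <optional>
#include <vector>

// First index that depends on any restricted column wins; linear scan.
template <typename Index, typename Column>
std::optional<Index> find_first_matching_index(
        const std::vector<std::vector<Column>>& restricted_column_groups,
        const std::vector<Index>& indexes,
        bool (*depends_on)(const Index&, const Column&)) {
    for (const auto& group : restricted_column_groups) {
        for (const auto& col : group) {
            for (const auto& idx : indexes) {
                if (depends_on(idx, col)) {
                    return idx;
                }
            }
        }
    }
    return std::nullopt;
}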

indexed_table_select_statement::indexed_table_select_statement(schema_ptr schema, uint32_t bound_terms,
::shared_ptr<parameters> parameters,
::shared_ptr<selection::selection> selection,
@@ -882,7 +872,6 @@ static void append_base_key_to_index_ck(std::vector<bytes_view>& exploded_index_
auto paging_state_copy = ::make_shared<service::pager::paging_state>(service::pager::paging_state(*paging_state));
paging_state_copy->set_partition_key(std::move(index_pk));
paging_state_copy->set_clustering_key(std::move(index_ck));
paging_state_copy->set_remaining(query::max_rows);
return std::move(paging_state_copy);
}
@@ -1219,6 +1208,7 @@ std::unique_ptr<prepared_statement> select_statement::prepare(database& db, cql_
}

check_needs_filtering(restrictions);
ensure_filtering_columns_retrieval(db, selection, restrictions);

::shared_ptr<cql3::statements::select_statement> stmt;
if (restrictions->uses_secondary_indexing()) {
@@ -1357,7 +1347,7 @@ select_statement::get_ordering_comparator(schema_ptr schema,
}
auto index = selection->index_of(*def);
if (index < 0) {
index = selection->add_column_for_ordering(*def);
index = selection->add_column_for_post_processing(*def);
}

sorters.emplace_back(index, def->type);
@@ -1444,6 +1434,23 @@ void select_statement::check_needs_filtering(::shared_ptr<restrictions::statemen
}
}

/**
 * Adds columns that are needed for the purpose of filtering to the selection.
 * The columns that are added to the selection are columns that
 * are needed for filtering on the coordinator but are not part of the selection.
 * The columns are added with metadata indicating they are not to be returned
 * to the user.
 */
void select_statement::ensure_filtering_columns_retrieval(database& db,
::shared_ptr<selection::selection> selection,
::shared_ptr<restrictions::statement_restrictions> restrictions) {
for (auto&& cdef : restrictions->get_column_defs_for_filtering(db)) {
if (!selection->has_column(*cdef)) {
selection->add_column_for_post_processing(*cdef);
}
}
}
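As an illustration (hypothetical schema): for SELECT a FROM t WHERE b = 1 ALLOW FILTERING, column b is not selected, but the coordinator must read it to evaluate the predicate; the new helper fetches it as a hidden post-processing column and strips it before rows reach the client. The same loop with plain containers:

#include <set>
#include <string>
#include <vector>

// Any column the filter needs but the selection lacks is fetched as a
// hidden column and dropped before results are returned.
void ensure_filtering_columns(const std::set<std::string>& selected,
                              const std::vector<std::string>& filter_columns,
                              std::vector<std::string>& hidden) {
    for (const auto& col : filter_columns) {
        if (selected.count(col) == 0) {
            hidden.push_back(col);
        }
    }
}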
bool select_statement::contains_alias(::shared_ptr<column_identifier> name) {
return std::any_of(_select_clause.begin(), _select_clause.end(), [name] (auto raw) {
return raw->alias && *name == *raw->alias;
@@ -186,10 +186,6 @@ public:
schema_ptr view_schema);

private:
static stdx::optional<secondary_index::index> find_idx(database& db,
schema_ptr schema,
::shared_ptr<restrictions::statement_restrictions> restrictions);

virtual future<::shared_ptr<cql_transport::messages::result_message>> do_execute(service::storage_proxy& proxy,
service::query_state& state, const query_options& options) override;

@@ -214,6 +210,9 @@ private:
gc_clock::time_point now,
::shared_ptr<const service::pager::paging_state> paging_state);

lw_shared_ptr<query::read_command>
prepare_command_for_base_query(const query_options& options, service::query_state& state, gc_clock::time_point now, bool use_paging);

future<shared_ptr<cql_transport::messages::result_message>>
execute_base_query(
service::storage_proxy& proxy,
178
database.cc
@@ -76,6 +76,8 @@
#include "sstables/compaction_manager.hh"
#include "sstables/compaction_backlog_manager.hh"
#include "sstables/progress_monitor.hh"
#include "auth/common.hh"
#include "tracing/trace_keyspace_helper.hh"

#include "checked-file-impl.hh"
#include "disk-error-handler.hh"
@@ -178,6 +180,18 @@ bool is_system_keyspace(const sstring& name) {
return system_keyspaces.find(name) != system_keyspaces.end();
}

static const std::unordered_set<sstring> internal_keyspaces = {
db::system_distributed_keyspace::NAME,
db::system_keyspace::NAME,
db::schema_tables::NAME,
auth::meta::AUTH_KS,
tracing::trace_keyspace_helper::KEYSPACE_NAME
};

bool is_internal_keyspace(const sstring& name) {
return internal_keyspaces.find(name) != internal_keyspaces.end();
}
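A hedged usage sketch of the new predicate, over plain std::string and with illustrative keyspace names (the real set is built from the NAME constants above):

#include <iostream>
#include <string>
#include <unordered_set>
#include <vector>

static const std::unordered_set<std::string> internal_keyspaces_demo = {
    "system", "system_schema", "system_auth", "system_distributed", "system_traces"};

bool is_internal_keyspace_demo(const std::string& name) {
    return internal_keyspaces_demo.count(name) != 0;
}

int main() {
    // Hypothetical listing; only user keyspaces get processed.
    for (const std::string& ks : std::vector<std::string>{"system", "myapp"}) {
        if (!is_internal_keyspace_demo(ks)) {
            std::cout << "user keyspace: " << ks << "\n"; // prints "user keyspace: myapp"
        }
    }
}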
// Used for tests where the CF exists without a database object. We need to pass a valid
// dirty_memory manager in that case.
thread_local dirty_memory_manager default_dirty_memory_manager;
@@ -684,9 +698,11 @@ table::make_reader(schema_ptr s,
return make_combined_reader(s, std::move(readers), fwd, fwd_mr);
}

sstables::shared_sstable
table::make_streaming_sstable_for_write() {
sstables::shared_sstable table::make_streaming_sstable_for_write(std::optional<sstring> subdir) {
sstring dir = _config.datadir;
if (subdir) {
dir += "/" + *subdir;
}
auto newtab = sstables::make_sstable(_schema,
dir, calculate_generation_for_new_table(),
get_highest_supported_format(),
@@ -826,7 +842,11 @@ void table::add_sstable(sstables::shared_sstable sstable, const std::vector<unsi
new_sstables->insert(sstable);
_sstables = std::move(new_sstables);
update_stats_for_new_sstable(sstable->bytes_on_disk(), shards_for_the_sstable);
_compaction_strategy.get_backlog_tracker().add_sstable(sstable);
if (sstable->is_staging()) {
_sstables_staging.emplace(sstable->generation(), sstable);
} else {
_compaction_strategy.get_backlog_tracker().add_sstable(sstable);
}
}

future<>
@@ -1082,12 +1102,14 @@ table::start() {
future<>
table::stop() {
return _async_gate.close().then([this] {
return when_all(_memtables->request_flush(), _streaming_memtables->request_flush()).discard_result().finally([this] {
return _compaction_manager.remove(this).then([this] {
// Nest, instead of using when_all, so we don't lose any exceptions.
return _streaming_flush_gate.close();
}).then([this] {
return _sstable_deletion_gate.close();
return when_all(await_pending_writes(), await_pending_reads()).discard_result().finally([this] {
return when_all(_memtables->request_flush(), _streaming_memtables->request_flush()).discard_result().finally([this] {
return _compaction_manager.remove(this).then([this] {
// Nest, instead of using when_all, so we don't lose any exceptions.
return _streaming_flush_gate.close();
}).then([this] {
return _sstable_deletion_gate.close();
});
});
});
});
@@ -1346,6 +1368,7 @@ table::on_compaction_completion(const std::vector<sstables::shared_sstable>& new

// This is done in the background, so we can consider this compaction completed.
seastar::with_gate(_sstable_deletion_gate, [this, sstables_to_remove] {
return with_semaphore(_sstable_deletion_sem, 1, [this, sstables_to_remove = std::move(sstables_to_remove)] {
return sstables::delete_atomically(sstables_to_remove, *get_large_partition_handler()).then_wrapped([this, sstables_to_remove] (future<> f) {
std::exception_ptr eptr;
try {
@@ -1369,6 +1392,7 @@ table::on_compaction_completion(const std::vector<sstables::shared_sstable>& new
return make_exception_future<>(eptr);
}
return make_ready_future<>();
});
}).then([this] {
// refresh underlying data source in row cache to prevent it from holding reference
// to sstables files which were previously deleted.
@@ -1613,7 +1637,9 @@ std::vector<sstables::shared_sstable> table::select_sstables(const dht::partitio

std::vector<sstables::shared_sstable> table::candidates_for_compaction() const {
return boost::copy_range<std::vector<sstables::shared_sstable>>(*get_sstables()
| boost::adaptors::filtered([this] (auto& sst) { return !_sstables_need_rewrite.count(sst->generation()); }));
| boost::adaptors::filtered([this] (auto& sst) {
return !_sstables_need_rewrite.count(sst->generation()) && !_sstables_staging.count(sst->generation());
}));
}
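The extra staging check above keeps streamed sstables out of compaction until their view updates have been generated. The same filter with plain containers (generation numbers stand in for sstables):

#include <cstdint>
#include <unordered_map>
#include <vector>

// A generation is compactable only if it is neither awaiting rewrite nor
// parked in the staging/ directory.
std::vector<uint64_t> candidates(const std::vector<uint64_t>& all,
                                 const std::unordered_map<uint64_t, int>& need_rewrite,
                                 const std::unordered_map<uint64_t, int>& staging) {
    std::vector<uint64_t> out;
    for (uint64_t gen : all) {
        if (!need_rewrite.count(gen) && !staging.count(gen)) {
            out.push_back(gen);
        }
    }
    return out;
}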
std::vector<sstables::shared_sstable> table::sstables_need_rewrite() const {
@@ -1671,9 +1697,9 @@ future<> distributed_loader::open_sstable(distributed<database>& db, sstables::e
// to distribute evenly the resource usage among all shards.

return db.invoke_on(column_family::calculate_shard_from_sstable_generation(comps.generation),
[&db, comps = std::move(comps), func = std::move(func), pc] (database& local) {
[&db, comps = std::move(comps), func = std::move(func), &pc] (database& local) {

return with_semaphore(local.sstable_load_concurrency_sem(), 1, [&db, &local, comps = std::move(comps), func = std::move(func), pc] {
return with_semaphore(local.sstable_load_concurrency_sem(), 1, [&db, &local, comps = std::move(comps), func = std::move(func), &pc] {
auto& cf = local.find_column_family(comps.ks, comps.cf);

auto f = sstables::sstable::load_shared_components(cf.schema(), comps.sstdir, comps.generation, comps.version, comps.format, pc);
@@ -1969,6 +1995,12 @@ future<sstables::entry_descriptor> distributed_loader::probe_file(distributed<da
}
auto cf_sstable_open = [sstdir, comps, fname] (column_family& cf, sstables::foreign_sstable_open_info info) {
cf.update_sstables_known_generation(comps.generation);
if (shared_sstable sst = cf.get_staging_sstable(comps.generation)) {
dblog.warn("SSTable {} is already present in staging/ directory. Moving from staging will be retried.", sst->get_filename());
return seastar::async([sst = std::move(sst), comps = std::move(comps)] () {
sst->move_to_new_dir_in_thread(comps.sstdir, comps.generation);
});
}
{
auto i = boost::range::find_if(*cf._sstables->all(), [gen = comps.generation] (sstables::shared_sstable sst) { return sst->generation() == gen; });
if (i != cf._sstables->all()->end()) {
@@ -2154,9 +2186,6 @@ database::database(const db::config& cfg, database_config dbcfg)
[this] {
++_stats->sstable_read_queue_overloaded;
return std::make_exception_ptr(std::runtime_error("sstable inactive read queue overloaded"));
},
[this] {
return _querier_cache.evict_one();
})
// No timeouts or queue length limits - a failure here can kill an entire repair.
// Trust the caller to limit concurrency.
@@ -2168,12 +2197,11 @@ database::database(const db::config& cfg, database_config dbcfg)
, _version(empty_version)
, _compaction_manager(make_compaction_manager(*_cfg, dbcfg))
, _enable_incremental_backups(cfg.incremental_backups())
, _querier_cache(dbcfg.available_memory * 0.04)
, _querier_cache(_read_concurrency_sem, dbcfg.available_memory * 0.04)
, _large_partition_handler(std::make_unique<db::cql_table_large_partition_handler>(_cfg->compaction_large_partition_warning_threshold_mb()*1024*1024))
, _result_memory_limiter(dbcfg.available_memory / 10)
{
local_schema_registry().init(*this); // TODO: we're never unbound.
_compaction_manager->start();
setup_metrics();

_row_cache_tracker.set_compaction_scheduling_group(dbcfg.memory_compaction_scheduling_group);
@@ -2299,6 +2327,9 @@ database::setup_metrics() {
sm::description("Counts sstables that survived the clustering key filtering. "
"High value indicates that the bloom filter is not very efficient and we still have to access a lot of sstables to get data.")),

sm::make_derive("dropped_view_updates", _cf_stats.dropped_view_updates,
sm::description("Counts the number of view updates that have been dropped due to cluster overload.")),

sm::make_derive("total_writes", _stats->total_writes,
sm::description("Counts the total number of successful write operations performed by this shard.")),

@@ -2316,6 +2347,9 @@ database::setup_metrics() {
sm::description("Counts the total number of failed read operations. "
"Add the total_reads to this value to get the total amount of reads issued on this shard.")),

sm::make_current_bytes("view_update_backlog", [this] { return get_view_update_backlog().current; },
sm::description("Holds the current size in bytes of the pending view updates for all tables")),

sm::make_derive("querier_cache_lookups", _querier_cache.get_stats().lookups,
sm::description("Counts querier cache lookups (paging queries)")),

@@ -2420,6 +2454,9 @@ database::setup_metrics() {
}

database::~database() {
_read_concurrency_sem.clear_inactive_reads();
_streaming_concurrency_sem.clear_inactive_reads();
_system_read_concurrency_sem.clear_inactive_reads();
}
void database::update_version(const utils::UUID& version) {

@@ -2450,6 +2487,8 @@ future<> distributed_loader::populate_keyspace(distributed<database>& db, sstrin
auto sstdir = ks.column_family_directory(ksdir, cfname, uuid);
dblog.info("Keyspace {}: Reading CF {} id={} version={}", ks_name, cfname, uuid, s->version());
return ks.make_directory_for_column_family(cfname, uuid).then([&db, sstdir, uuid, ks_name, cfname] {
return distributed_loader::populate_column_family(db, sstdir + "/staging", ks_name, cfname);
}).then([&db, sstdir, uuid, ks_name, cfname] {
return distributed_loader::populate_column_family(db, sstdir, ks_name, cfname);
}).handle_exception([ks_name, cfname, sstdir](std::exception_ptr eptr) {
std::string msg =
@@ -2903,6 +2942,7 @@ keyspace::make_column_family_config(const schema& s, const db::config& db_config
cfg.enable_metrics_reporting = db_config.enable_keyspace_column_family_metrics();
cfg.large_partition_handler = lp_handler;
cfg.view_update_concurrency_semaphore = _config.view_update_concurrency_semaphore;
cfg.view_update_concurrency_semaphore_limit = _config.view_update_concurrency_semaphore_limit;

return cfg;
}
@@ -2930,6 +2970,7 @@ keyspace::make_directory_for_column_family(const sstring& name, utils::UUID uuid
io_check(recursive_touch_directory, cfdir).get();
}
io_check(touch_directory, cfdirs[0] + "/upload").get();
io_check(touch_directory, cfdirs[0] + "/staging").get();
});
}
@@ -3699,6 +3740,7 @@ database::make_keyspace_config(const keyspace_metadata& ksm) {
cfg.enable_metrics_reporting = _cfg->enable_keyspace_column_family_metrics();

cfg.view_update_concurrency_semaphore = &_view_update_concurrency_sem;
cfg.view_update_concurrency_semaphore_limit = max_memory_pending_view_updates();
return cfg;
}

@@ -3796,6 +3838,8 @@ database::stop() {
return parallel_for_each(_column_families, [this] (auto& val_pair) {
return val_pair.second->stop();
});
}).then([this] {
return _view_update_concurrency_sem.wait(max_memory_pending_view_updates());
}).then([this] {
if (_commitlog != nullptr) {
return _commitlog->release();
@@ -4051,6 +4095,7 @@ seal_snapshot(sstring jsondir) {

future<> table::snapshot(sstring name) {
return flush().then([this, name = std::move(name)]() {
return with_semaphore(_sstable_deletion_sem, 1, [this, name = std::move(name)]() {
auto tables = boost::copy_range<std::vector<sstables::shared_sstable>>(*_sstables->all());
return do_with(std::move(tables), [this, name](std::vector<sstables::shared_sstable> & tables) {
auto jsondir = _config.datadir + "/snapshots/" + name;
@@ -4110,6 +4155,7 @@ future<> table::snapshot(sstring name) {
});
});
});
});
});
}
@@ -4239,6 +4285,7 @@ future<> table::fail_streaming_mutations(utils::UUID plan_id) {
_streaming_memtables_big.erase(it);
return entry->flush_in_progress.close().then([this, entry] {
for (auto&& sst : entry->sstables) {
sst.monitor->write_failed();
sst.sstable->mark_for_deletion();
}
});
@@ -4417,6 +4464,14 @@ std::vector<view_ptr> table::affected_views(const schema_ptr& base, const mutati
}));
}

static size_t memory_usage_of(const std::vector<frozen_mutation_and_schema>& ms) {
// Overhead of sending a view mutation, in terms of data structures used by the storage_proxy.
constexpr size_t base_overhead_bytes = 256;
return boost::accumulate(ms | boost::adaptors::transformed([] (const frozen_mutation_and_schema& m) {
return m.fm.representation().size();
}), size_t{base_overhead_bytes * ms.size()});
}
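A worked example of the accounting above, with assumed payload sizes: two view mutations of 100 and 200 bytes are charged 2 * 256 + 100 + 200 = 812 bytes against the view-update semaphore.

#include <cassert>
#include <cstddef>
#include <vector>

// Same accounting over plain sizes: fixed per-mutation overhead plus payload.
size_t memory_usage_of_sizes(const std::vector<size_t>& payload_sizes) {
    constexpr size_t base_overhead_bytes = 256;
    size_t total = base_overhead_bytes * payload_sizes.size();
    for (size_t s : payload_sizes) {
        total += s;
    }
    return total;
}

int main() {
    assert(memory_usage_of_sizes({100, 200}) == 812); // 2*256 + 300
}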
/**
 * Given some updates on the base table and the existing values for the rows affected by that update, generates the
 * mutations to be applied to the base table's views, and sends them to the paired view replicas.
@@ -4433,75 +4488,15 @@ std::vector<view_ptr> table::affected_views(const schema_ptr& base, const mutati
future<> table::generate_and_propagate_view_updates(const schema_ptr& base,
std::vector<view_ptr>&& views,
mutation&& m,
flat_mutation_reader_opt existings,
db::timeout_clock::time_point timeout) const {
flat_mutation_reader_opt existings) const {
auto base_token = m.token();
return db::view::generate_view_updates(base,
std::move(views),
flat_mutation_reader_from_mutations({std::move(m)}),
std::move(existings)).then([this, timeout, base_token = std::move(base_token)] (auto&& updates) mutable {
return seastar::get_units(*_config.view_update_concurrency_semaphore, 1, timeout).then(
[this, base_token = std::move(base_token), updates = std::move(updates)] (auto units) mutable {
db::view::mutate_MV(std::move(base_token), std::move(updates), _view_stats).handle_exception([units = std::move(units)] (auto ignored) { });
});
});
}

/**
 * Given an update for the base table, calculates the set of potentially affected views,
 * generates the relevant updates, and sends them to the paired view replicas.
 */
future<row_locker::lock_holder> table::push_view_replica_updates(const schema_ptr& s, const frozen_mutation& fm, db::timeout_clock::time_point timeout) const {
//FIXME: Avoid unfreezing here.
auto m = fm.unfreeze(s);
auto& base = schema();
m.upgrade(base);
auto views = affected_views(base, m);
if (views.empty()) {
return make_ready_future<row_locker::lock_holder>();
}
auto cr_ranges = db::view::calculate_affected_clustering_ranges(*base, m.decorated_key(), m.partition(), views);
if (cr_ranges.empty()) {
return generate_and_propagate_view_updates(base, std::move(views), std::move(m), { }, timeout).then([] {
// In this case we are not doing a read-before-write, just a
// write, so no lock is needed.
return make_ready_future<row_locker::lock_holder>();
});
}
// We read the whole set of regular columns in case the update now causes a base row to pass
// a view's filters, and a view happens to include columns that have no value in this update.
// Also, one of those columns can determine the lifetime of the base row, if it has a TTL.
auto columns = boost::copy_range<std::vector<column_id>>(
base->regular_columns() | boost::adaptors::transformed(std::mem_fn(&column_definition::id)));
query::partition_slice::option_set opts;
opts.set(query::partition_slice::option::send_partition_key);
opts.set(query::partition_slice::option::send_clustering_key);
opts.set(query::partition_slice::option::send_timestamp);
opts.set(query::partition_slice::option::send_ttl);
auto slice = query::partition_slice(
std::move(cr_ranges), { }, std::move(columns), std::move(opts), { }, cql_serialization_format::internal(), query::max_rows);
// Take the shard-local lock on the base-table row or partition as needed.
// We'll return this lock to the caller, which will release it after
// writing the base-table update.
future<row_locker::lock_holder> lockf = local_base_lock(base, m.decorated_key(), slice.default_row_ranges(), timeout);
return lockf.then([m = std::move(m), slice = std::move(slice), views = std::move(views), base, this, timeout] (row_locker::lock_holder lock) {
return do_with(
dht::partition_range::make_singular(m.decorated_key()),
std::move(slice),
std::move(m),
[base, views = std::move(views), lock = std::move(lock), this, timeout] (auto& pk, auto& slice, auto& m) mutable {
auto reader = this->make_reader(
base,
pk,
slice,
service::get_local_sstable_query_read_priority());
return this->generate_and_propagate_view_updates(base, std::move(views), std::move(m), std::move(reader), timeout).then([lock = std::move(lock)] () mutable {
// return the local partition/row lock we have taken so it
// remains locked until the caller is done modifying this
// partition/row and destroys the lock object.
return std::move(lock);
});
});
return db::view::generate_view_updates(
base,
std::move(views),
flat_mutation_reader_from_mutations({std::move(m)}),
std::move(existings)).then([this, base_token = std::move(base_token)] (std::vector<frozen_mutation_and_schema>&& updates) mutable {
auto units = seastar::consume_units(*_config.view_update_concurrency_semaphore, memory_usage_of(updates));
db::view::mutate_MV(std::move(base_token), std::move(updates), _view_stats, std::move(units)).handle_exception([] (auto ignored) { });
});
}
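The switch from get_units() to consume_units() is the core of this hunk: consume_units() never waits, so a replica-side write is never blocked behind view-update backpressure; pressure instead surfaces as a growing backlog that coordinators can throttle on. A conceptual model of the two flavors (not seastar's actual implementation):

#include <cstdint>

// Conceptual model only. consume() always succeeds and may drive the counter
// negative; acquire() is the waiting flavor, modeled here as failing on overload.
struct units_counter {
    int64_t available;
    void consume(int64_t n) { available -= n; }   // replica path: never blocks
    bool acquire(int64_t n) {                     // coordinator path: may have to wait
        if (available < n) {
            return false;                         // would block / throttle here
        }
        available -= n;
        return true;
    }
    void release(int64_t n) { available += n; }
};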
@@ -4606,8 +4601,17 @@ future<> table::populate_views(
schema,
std::move(views),
std::move(reader),
{ }).then([base_token = std::move(base_token), this] (auto&& updates) {
return db::view::mutate_MV(std::move(base_token), std::move(updates), _view_stats);
{ }).then([base_token = std::move(base_token), this] (std::vector<frozen_mutation_and_schema>&& updates) mutable {
size_t update_size = memory_usage_of(updates);
size_t units_to_wait_for = std::min(_config.view_update_concurrency_semaphore_limit, update_size);
return seastar::get_units(*_config.view_update_concurrency_semaphore, units_to_wait_for).then(
[base_token = std::move(base_token),
updates = std::move(updates),
units_to_consume = update_size - units_to_wait_for,
this] (db::timeout_semaphore_units&& units) mutable {
units.adopt(seastar::consume_units(*_config.view_update_concurrency_semaphore, units_to_consume));
return db::view::mutate_MV(std::move(base_token), std::move(updates), _view_stats, std::move(units));
});
});
}
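The min() split above avoids a deadlock: with a 10 MiB semaphore limit, waiting for a 15 MiB batch in one get_units() call could never succeed, since the semaphore never holds more than its limit. So the code waits for min(limit, size) and force-consumes the remainder. Illustrative numbers:

#include <algorithm>
#include <cstddef>

// With limit = 10 MiB and update_size = 15 MiB: wait for 10 MiB (the most
// the semaphore can ever grant), then force-consume the remaining 5 MiB.
void split_units(size_t limit, size_t update_size,
                 size_t& units_to_wait_for, size_t& units_to_consume) {
    units_to_wait_for = std::min(limit, update_size);
    units_to_consume = update_size - units_to_wait_for;
}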
61
database.hh
@@ -77,6 +77,7 @@
#include <seastar/core/metrics_registration.hh>
#include "tracing/trace_state.hh"
#include "db/view/view.hh"
#include "db/view/view_update_backlog.hh"
#include "db/view/row_locking.hh"
#include "lister.hh"
#include "utils/phased_barrier.hh"
@@ -279,6 +280,9 @@ struct cf_stats {
int64_t clustering_filter_fast_path_count = 0;
// how many sstables survived the clustering key checks
int64_t surviving_sstables_after_clustering_filter = 0;

// How many view updates were dropped due to overload.
int64_t dropped_view_updates = 0;
};

class cache_temperature {
@@ -298,6 +302,8 @@ public:
class table;
using column_family = table;

class database_sstable_write_monitor;

class table : public enable_lw_shared_from_this<table> {
public:
struct config {
@@ -323,6 +329,7 @@ public:
bool enable_metrics_reporting = false;
db::large_partition_handler* large_partition_handler;
db::timeout_semaphore* view_update_concurrency_semaphore;
size_t view_update_concurrency_semaphore_limit;
};
struct no_commitlog {};
struct stats {
@@ -395,7 +402,7 @@ private:
// plan memtables and the resulting sstables are not made visible until
// the streaming is complete.
struct monitored_sstable {
std::unique_ptr<sstables::write_monitor> monitor;
std::unique_ptr<database_sstable_write_monitor> monitor;
sstables::shared_sstable sstable;
};
@@ -432,8 +439,15 @@ private:
// but for correct compaction we need to start the compaction only after
// reading all sstables.
std::unordered_map<uint64_t, sstables::shared_sstable> _sstables_need_rewrite;
// sstables that should not be compacted (e.g. because they need to be used
// to generate view updates later)
std::unordered_map<uint64_t, sstables::shared_sstable> _sstables_staging;
// Control background fibers waiting for sstables to be deleted
seastar::gate _sstable_deletion_gate;
// This semaphore ensures that an operation like snapshot won't have its selected
// sstables deleted by compaction in parallel, a race condition which could
// easily result in failure.
seastar::semaphore _sstable_deletion_sem = {1};
// There are situations in which we need to stop writing sstables. Flushers will take
// the read lock, and the ones that wish to stop that process will take the write lock.
rwlock _sstables_lock;
@@ -485,6 +499,11 @@ private:
utils::phased_barrier _pending_reads_phaser;
public:
future<> add_sstable_and_update_cache(sstables::shared_sstable sst);
void move_sstable_from_staging_in_thread(sstables::shared_sstable sst);
sstables::shared_sstable get_staging_sstable(uint64_t generation) {
auto it = _sstables_staging.find(generation);
return it != _sstables_staging.end() ? it->second : nullptr;
}
private:
void update_stats_for_new_sstable(uint64_t disk_space_used_by_sstable, const std::vector<unsigned>& shards_for_the_sstable) noexcept;
// Adds new sstable to the set of sstables
@@ -618,6 +637,14 @@ public:
tracing::trace_state_ptr trace_state = nullptr,
streamed_mutation::forwarding fwd = streamed_mutation::forwarding::no,
mutation_reader::forwarding fwd_mr = mutation_reader::forwarding::yes) const;
flat_mutation_reader make_reader_excluding_sstable(schema_ptr schema,
sstables::shared_sstable sst,
const dht::partition_range& range,
const query::partition_slice& slice,
const io_priority_class& pc = default_priority_class(),
tracing::trace_state_ptr trace_state = nullptr,
streamed_mutation::forwarding fwd = streamed_mutation::forwarding::no,
mutation_reader::forwarding fwd_mr = mutation_reader::forwarding::yes) const;

flat_mutation_reader make_reader(schema_ptr schema, const dht::partition_range& range = query::full_partition_range) const {
auto& full_slice = schema->full_slice();
@@ -632,9 +659,13 @@ public:
flat_mutation_reader make_streaming_reader(schema_ptr schema,
const dht::partition_range_vector& ranges) const;

sstables::shared_sstable make_streaming_sstable_for_write();
sstables::shared_sstable make_streaming_sstable_for_write(std::optional<sstring> subdir = {});
sstables::shared_sstable make_streaming_staging_sstable() {
return make_streaming_sstable_for_write("staging");
}

mutation_source as_mutation_source() const;
mutation_source as_mutation_source_excluding(sstables::shared_sstable sst) const;

void set_virtual_reader(mutation_source virtual_reader) {
_virtual_reader = std::move(virtual_reader);
@@ -842,6 +873,8 @@ public:
void clear_views();
const std::vector<view_ptr>& views() const;
future<row_locker::lock_holder> push_view_replica_updates(const schema_ptr& s, const frozen_mutation& fm, db::timeout_clock::time_point timeout) const;
future<row_locker::lock_holder> push_view_replica_updates(const schema_ptr& s, mutation&& m, db::timeout_clock::time_point timeout) const;
future<row_locker::lock_holder> stream_view_replica_updates(const schema_ptr& s, mutation&& m, db::timeout_clock::time_point timeout, sstables::shared_sstable excluded_sstable) const;
void add_coordinator_read_latency(utils::estimated_histogram::duration latency);
std::chrono::milliseconds get_coordinator_read_latency_percentile(double percentile);

@@ -859,13 +892,17 @@ public:
dht::token base_token,
flat_mutation_reader&&);

reader_concurrency_semaphore& read_concurrency_semaphore() {
return *_config.read_concurrency_semaphore;
}

private:
future<row_locker::lock_holder> do_push_view_replica_updates(const schema_ptr& s, mutation&& m, db::timeout_clock::time_point timeout, mutation_source&& source) const;
std::vector<view_ptr> affected_views(const schema_ptr& base, const mutation& update) const;
future<> generate_and_propagate_view_updates(const schema_ptr& base,
std::vector<view_ptr>&& views,
mutation&& m,
flat_mutation_reader_opt existings,
db::timeout_clock::time_point timeout) const;
flat_mutation_reader_opt existings) const;

mutable row_locker _row_locker;
future<row_locker::lock_holder> local_base_lock(
@@ -1055,6 +1092,7 @@ public:
seastar::scheduling_group streaming_scheduling_group;
bool enable_metrics_reporting = false;
db::timeout_semaphore* view_update_concurrency_semaphore = nullptr;
size_t view_update_concurrency_semaphore_limit;
};
private:
std::unique_ptr<locator::abstract_replication_strategy> _replication_strategy;
@@ -1156,6 +1194,7 @@ private:
static const size_t max_count_system_concurrent_reads{10};
size_t max_memory_system_concurrent_reads() { return _dbcfg.available_memory * 0.02; };
static constexpr size_t max_concurrent_sstable_loads() { return 3; }
size_t max_memory_pending_view_updates() const { return _dbcfg.available_memory * 0.1; }

struct db_stats {
uint64_t total_writes = 0;
@@ -1192,7 +1231,7 @@ private:

semaphore _sstable_load_concurrency_sem{max_concurrent_sstable_loads()};

db::timeout_semaphore _view_update_concurrency_sem{100}; // Stand-in hack for #2538
db::timeout_semaphore _view_update_concurrency_sem{max_memory_pending_view_updates()};

cache_tracker _row_cache_tracker;
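A worked example of the sizing knobs above, assuming an 8 GiB shard (the fractions come from the code; the shard size is illustrative): pending view updates are capped at 10% and system-keyspace reads at 2% of available memory.

#include <cstddef>
#include <cstdio>

int main() {
    const size_t available_memory = 8ull << 30;                     // 8 GiB (assumed)
    const size_t pending_view_updates = available_memory / 10;      // ~819 MiB
    const size_t system_concurrent_reads = available_memory * 0.02; // ~164 MiB
    std::printf("%zu %zu\n", pending_view_updates, system_concurrent_reads);
}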
@@ -1399,6 +1438,12 @@ public:
std::unordered_set<sstring> get_initial_tokens();
std::experimental::optional<gms::inet_address> get_replace_address();
bool is_replacing();
reader_concurrency_semaphore& user_read_concurrency_sem() {
return _read_concurrency_sem;
}
reader_concurrency_semaphore& streaming_read_concurrency_sem() {
return _streaming_concurrency_sem;
}
reader_concurrency_semaphore& system_keyspace_read_concurrency_sem() {
return _system_read_concurrency_sem;
}
@@ -1423,11 +1468,17 @@ public:
return _querier_cache;
}

db::view::update_backlog get_view_update_backlog() const {
return {max_memory_pending_view_updates() - _view_update_concurrency_sem.current(), max_memory_pending_view_updates()};
}
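The backlog above is derived, not tracked separately: used units are the cap minus whatever the semaphore still has free. The same computation with plain numbers:

#include <cstddef>

struct update_backlog {
    size_t current;
    size_t max;
};

// backlog = cap minus free semaphore units; e.g. a 1 GiB cap with 700 MiB
// still free reports {324 MiB used, 1024 MiB max} (illustrative numbers).
update_backlog backlog_of(size_t cap, size_t free_units) {
    return {cap - free_units, cap};
}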
friend class distributed_loader;
};

future<> update_schema_version_and_announce(distributed<service::storage_proxy>& proxy);

bool is_internal_keyspace(const sstring& name);

class distributed_loader {
public:
static void reshard(distributed<database>& db, sstring ks_name, sstring cf_name);
@@ -1673,14 +1673,14 @@ const db::commitlog::config& db::commitlog::active_config() const {
// No commit_io_check needed in the log reader since the database will fail
// on error at startup if required
future<std::unique_ptr<subscription<temporary_buffer<char>, db::replay_position>>>
db::commitlog::read_log_file(const sstring& filename, commit_load_reader_func next, position_type off, const db::extensions* exts) {
db::commitlog::read_log_file(const sstring& filename, seastar::io_priority_class read_io_prio_class, commit_load_reader_func next, position_type off, const db::extensions* exts) {
struct work {
private:
file_input_stream_options make_file_input_stream_options() {
file_input_stream_options make_file_input_stream_options(seastar::io_priority_class read_io_prio_class) {
file_input_stream_options fo;
fo.buffer_size = db::commitlog::segment::default_size;
fo.read_ahead = 10;
fo.io_priority_class = service::get_local_commitlog_priority();
fo.io_priority_class = read_io_prio_class;
return fo;
}
public:
@@ -1699,8 +1699,8 @@ db::commitlog::read_log_file(const sstring& filename, commit_load_reader_func ne
bool header = true;
bool failed = false;

work(file f, position_type o = 0)
: f(f), fin(make_file_input_stream(f, 0, make_file_input_stream_options())), start_off(o) {
work(file f, seastar::io_priority_class read_io_prio_class, position_type o = 0)
: f(f), fin(make_file_input_stream(f, 0, make_file_input_stream_options(read_io_prio_class))), start_off(o) {
}
work(work&&) = default;

@@ -1918,9 +1918,9 @@ db::commitlog::read_log_file(const sstring& filename, commit_load_reader_func ne
return fut;
});

return fut.then([off, next](file f) {
return fut.then([off, next, read_io_prio_class] (file f) {
f = make_checked_file(commit_error_handler, std::move(f));
auto w = make_lw_shared<work>(std::move(f), off);
auto w = make_lw_shared<work>(std::move(f), read_io_prio_class, off);
auto ret = w->s.listen(next);

w->s.started().then(std::bind(&work::read_file, w.get())).then([w] {
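The pattern in this hunk is dependency injection of IO priority: the reader used to hard-code the commitlog priority class and now takes it from the caller, so replay and other readers can run at different priorities. Generic shape of the change (types are stand-ins):

// Before: the options were built around a hard-coded global priority.
// After: the caller supplies it.
struct io_priority { int shares = 0; };

struct stream_options {
    int buffer_size = 0;
    io_priority prio;
};

stream_options make_options(io_priority read_io_prio_class) {
    stream_options fo;
    fo.buffer_size = 128 * 1024;   // analogous to segment::default_size
    fo.prio = read_io_prio_class;  // no longer a global lookup
    return fo;
}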
@@ -355,7 +355,7 @@ public:
};

static future<std::unique_ptr<subscription<temporary_buffer<char>, replay_position>>> read_log_file(
const sstring&, commit_load_reader_func, position_type = 0, const db::extensions* = nullptr);
const sstring&, seastar::io_priority_class read_io_prio_class, commit_load_reader_func, position_type = 0, const db::extensions* = nullptr);
private:
commitlog(config);
@@ -34,7 +34,8 @@ public:
commitlog_entry(stdx::optional<column_mapping> mapping, frozen_mutation&& mutation)
: _mapping(std::move(mapping)), _mutation(std::move(mutation)) { }
const stdx::optional<column_mapping>& mapping() const { return _mapping; }
const frozen_mutation& mutation() const { return _mutation; }
const frozen_mutation& mutation() const & { return _mutation; }
frozen_mutation&& mutation() && { return std::move(_mutation); }
};

class commitlog_entry_writer {
@@ -80,5 +81,6 @@ public:
commitlog_entry_reader(const temporary_buffer<char>& buffer);

const stdx::optional<column_mapping>& get_column_mapping() const { return _ce.mapping(); }
const frozen_mutation& mutation() const { return _ce.mutation(); }
const frozen_mutation& mutation() const & { return _ce.mutation(); }
frozen_mutation&& mutation() && { return std::move(_ce).mutation(); }
};
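The added mutation() && overloads are ref-qualified member functions: callers holding an rvalue entry can move the frozen mutation out instead of copying it. A minimal self-contained illustration of the idiom:

#include <string>
#include <utility>

struct holder {
    std::string payload;
    const std::string& value() const & { return payload; }   // lvalues: borrow
    std::string&& value() && { return std::move(payload); }  // rvalues: move out
};

int main() {
    holder h{"abc"};
    const std::string& borrowed = h.value();    // no copy, h keeps its data
    std::string owned = holder{"xyz"}.value();  // moved out of the temporary
    (void)borrowed;
    (void)owned;
}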
@@ -58,6 +58,7 @@
#include "converting_mutation_partition_applier.hh"
#include "schema_registry.hh"
#include "commitlog_entry.hh"
#include "service/priority_manager.hh"

static logging::logger rlogger("commitlog_replayer");

@@ -223,7 +224,7 @@ db::commitlog_replayer::impl::recover(sstring file, const sstring& fname_prefix)
auto s = make_lw_shared<stats>();
auto& exts = _qp.local().db().local().get_config().extensions();

return db::commitlog::read_log_file(file,
return db::commitlog::read_log_file(file, service::get_local_commitlog_priority(),
std::bind(&impl::process, this, s.get(), std::placeholders::_1,
std::placeholders::_2), p, &exts).then([](auto s) {
auto f = s->done();
13
db/config.hh
@@ -155,6 +155,9 @@ public:
val(hints_directory, sstring, "/var/lib/scylla/hints", Used, \
"The directory where hints files are stored if hinted handoff is enabled." \
) \
val(view_hints_directory, sstring, "/var/lib/scylla/view_hints", Used, \
"The directory where materialized-view updates are stored while a view replica is unreachable." \
) \
val(saved_caches_directory, sstring, "/var/lib/scylla/saved_caches", Unused, \
"The directory location where table key and row caches are stored." \
) \
@@ -453,7 +456,7 @@ public:
"The maximum number of tombstones a query can scan before aborting." \
) \
/* Network timeout settings */ \
val(range_request_timeout_in_ms, uint32_t, 10000, Unused, \
val(range_request_timeout_in_ms, uint32_t, 10000, Used, \
"The time in milliseconds that the coordinator waits for sequential or index scans to complete." \
) \
val(read_request_timeout_in_ms, uint32_t, 5000, Used, \
@@ -472,7 +475,7 @@ public:
"The time in milliseconds that the coordinator waits for write operations to complete.\n" \
"Related information: About hinted handoff writes" \
) \
val(request_timeout_in_ms, uint32_t, 10000, Unused, \
val(request_timeout_in_ms, uint32_t, 10000, Used, \
"The default timeout for other, miscellaneous operations.\n" \
"Related information: About hinted handoff writes" \
) \
@@ -578,7 +581,7 @@ public:
val(dynamic_snitch_update_interval_in_ms, uint32_t, 100, Unused, \
"The time interval for how often the snitch calculates node scores. Because score calculation is CPU intensive, be careful when reducing this interval." \
) \
val(hinted_handoff_enabled, sstring, "false", Used, \
val(hinted_handoff_enabled, sstring, "true", Used, \
"Enable or disable hinted handoff. To enable per data center, add a data center list. For example: hinted_handoff_enabled: DC1,DC2. A hint indicates that the write needs to be replayed to an unavailable node. " \
"Related information: About hinted handoff writes" \
) \
@@ -621,7 +624,7 @@ public:
val(thrift_framed_transport_size_in_mb, uint32_t, 15, Unused, \
"Frame size (maximum field length) for Thrift. The frame is the row or part of the row the application is inserting." \
) \
val(thrift_max_message_length_in_mb, uint32_t, 16, Unused, \
val(thrift_max_message_length_in_mb, uint32_t, 16, Used, \
"The maximum length of a Thrift message in megabytes, including all fields and internal Thrift overhead (1 byte of overhead for each frame). Message length is usually used in conjunction with batches. A frame length greater than or equal to 24 accommodates a batch with four inserts, each of which is 24 bytes. The required message length is greater than or equal to 24+24+24+24+4 (number of frames)." \
) \
/* Security properties */ \
@@ -739,7 +742,7 @@ public:
" Performance is affected to some extent as a result. Useful to help debug problems that may arise at other layers.") \
val(cpu_scheduler, bool, true, Used, "Enable cpu scheduling") \
val(view_building, bool, true, Used, "Enable view building; should only be set to false when the node is experiencing issues due to view building") \
val(enable_sstables_mc_format, bool, false, Used, "Enable SSTables 'mc' format to be used as the default file format; FOR TESTING PURPOSES ONLY - TO BE REMOVED BEFORE RELEASE") \
val(enable_sstables_mc_format, bool, false, Used, "Enable SSTables 'mc' format to be used as the default file format") \
/* done! */
#define _make_value_member(name, type, deflt, status, desc, ...) \

@@ -35,6 +35,7 @@
#include "disk-error-handler.hh"
#include "lister.hh"
#include "db/timeout_clock.hh"
#include "service/priority_manager.hh"

using namespace std::literals::chrono_literals;

@@ -78,6 +79,9 @@ void manager::register_metrics(const sstring& group_name) {

sm::make_derive("sent", _stats.sent,
sm::description("Number of sent hints.")),

sm::make_derive("discarded", _stats.discarded,
sm::description("Number of hints that were discarded during sending (too old, schema changed, etc.).")),
});
}

@@ -95,6 +99,7 @@ future<> manager::start(shared_ptr<service::storage_proxy> proxy_ptr, shared_ptr
return compute_hints_dir_device_id();
}).then([this] {
_strorage_service_anchor->register_subscriber(this);
set_started();
});
}

@@ -105,7 +110,7 @@ future<> manager::stop() {
_strorage_service_anchor->unregister_subscriber(this);
}

_stopping = true;
set_stopping();

return _draining_eps_gate.close().finally([this] {
return parallel_for_each(_ep_managers, [] (auto& pair) {
@@ -277,7 +282,7 @@ inline bool manager::have_ep_manager(ep_key_type ep) const noexcept {
}

bool manager::store_hint(ep_key_type ep, schema_ptr s, lw_shared_ptr<const frozen_mutation> fm, tracing::trace_state_ptr tr_state) noexcept {
if (_stopping || !can_hint_for(ep)) {
if (stopping() || !started() || !can_hint_for(ep)) {
manager_logger.trace("Can't store a hint to {}", ep);
++_stats.dropped;
return false;
@@ -380,7 +385,7 @@ future<timespec> manager::end_point_hints_manager::sender::get_last_file_modific
});
}

future<> manager::end_point_hints_manager::sender::do_send_one_mutation(mutation m, const std::vector<gms::inet_address>& natural_endpoints) noexcept {
future<> manager::end_point_hints_manager::sender::do_send_one_mutation(frozen_mutation_and_schema m, const std::vector<gms::inet_address>& natural_endpoints) noexcept {
return futurize_apply([this, m = std::move(m), &natural_endpoints] () mutable -> future<> {
// The fact that we send with CL::ALL in both cases below ensures that new hints are not going
// to be generated as a result of hint sending.
@@ -392,7 +397,8 @@ future<> manager::end_point_hints_manager::sender::do_send_one_mutation(mutation
// FIXME: using 1h as infinite timeout. If a node is down, we should get an
// unavailable exception.
auto timeout = db::timeout_clock::now() + 1h;
return _proxy.mutate({std::move(m)}, consistency_level::ALL, timeout, nullptr);
//FIXME: Add required frozen_mutation overloads
return _proxy.mutate({m.fm.unfreeze(m.s)}, consistency_level::ALL, timeout, nullptr);
}
});
}
@@ -418,21 +424,19 @@ bool manager::end_point_hints_manager::sender::can_send() noexcept {
}
}

mutation manager::end_point_hints_manager::sender::get_mutation(lw_shared_ptr<send_one_file_ctx> ctx_ptr, temporary_buffer<char>& buf) {
frozen_mutation_and_schema manager::end_point_hints_manager::sender::get_mutation(lw_shared_ptr<send_one_file_ctx> ctx_ptr, temporary_buffer<char>& buf) {
hint_entry_reader hr(buf);
auto& fm = hr.mutation();
auto& cm = get_column_mapping(std::move(ctx_ptr), fm, hr);
auto& cf = _db.find_column_family(fm.column_family_id());
auto schema = _db.find_schema(fm.column_family_id());

if (cf.schema()->version() != fm.schema_version()) {
mutation m(cf.schema(), fm.decorated_key(*cf.schema()));
converting_mutation_partition_applier v(cm, *cf.schema(), m.partition());
if (schema->version() != fm.schema_version()) {
mutation m(schema, fm.decorated_key(*schema));
converting_mutation_partition_applier v(cm, *schema, m.partition());
fm.partition().accept(cm, v);

return std::move(m);
} else {
return fm.unfreeze(cf.schema());
return {freeze(m), std::move(schema)};
}
return {std::move(hr).mutation(), std::move(schema)};
}

const column_mapping& manager::end_point_hints_manager::sender::get_column_mapping(lw_shared_ptr<send_one_file_ctx> ctx_ptr, const frozen_mutation& fm, const hint_entry_reader& hr) {
@@ -502,7 +506,7 @@ bool manager::check_dc_for(ep_key_type ep) const noexcept {
}

void manager::drain_for(gms::inet_address endpoint) {
if (_stopping) {
if (stopping()) {
return;
}

@@ -543,6 +547,7 @@ manager::end_point_hints_manager::sender::sender(end_point_hints_manager& parent
, _resource_manager(_shard_manager._resource_manager)
, _proxy(local_storage_proxy)
, _db(local_db)
, _hints_cpu_sched_group(_db.get_streaming_scheduling_group())
, _gossiper(local_gossiper)
, _file_update_mutex(_ep_manager.file_update_mutex())
{}
@@ -555,6 +560,7 @@ manager::end_point_hints_manager::sender::sender(const sender& other, end_point_
, _resource_manager(_shard_manager._resource_manager)
, _proxy(other._proxy)
, _db(other._db)
, _hints_cpu_sched_group(other._hints_cpu_sched_group)
, _gossiper(other._gossiper)
, _file_update_mutex(_ep_manager.file_update_mutex())
{}
@@ -610,7 +616,10 @@ manager::end_point_hints_manager::sender::clock::duration manager::end_point_hin
}

void manager::end_point_hints_manager::sender::start() {
_stopped = seastar::async([this] {
seastar::thread_attributes attr;

attr.sched_group = _hints_cpu_sched_group;
_stopped = seastar::async(std::move(attr), [this] {
manager_logger.trace("ep_manager({})::sender: started", end_point_key());
while (!stopping()) {
try {
@@ -630,10 +639,11 @@ void manager::end_point_hints_manager::sender::start() {
});
}
future<> manager::end_point_hints_manager::sender::send_one_mutation(mutation m) {
|
||||
keyspace& ks = _db.find_keyspace(m.schema()->ks_name());
|
||||
future<> manager::end_point_hints_manager::sender::send_one_mutation(frozen_mutation_and_schema m) {
|
||||
keyspace& ks = _db.find_keyspace(m.s->ks_name());
|
||||
auto& rs = ks.get_replication_strategy();
|
||||
std::vector<gms::inet_address> natural_endpoints = rs.get_natural_endpoints(m.token());
|
||||
auto token = dht::global_partitioner().get_token(*m.s, m.fm.key(*m.s));
|
||||
std::vector<gms::inet_address> natural_endpoints = rs.get_natural_endpoints(std::move(token));
|
||||
|
||||
return do_send_one_mutation(std::move(m), natural_endpoints);
|
||||
}
|
||||
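A frozen mutation carries only the serialized partition key, so the new send_one_mutation() has to recompute the token before it can ask the replication strategy for replicas. The lookup in isolation (replicas_for() is an illustrative wrapper around the calls in the hunk above):

// Sketch: replica resolution for a frozen mutation.
static std::vector<gms::inet_address> replicas_for(database& db, const frozen_mutation_and_schema& m) {
    keyspace& ks = db.find_keyspace(m.s->ks_name());
    auto& rs = ks.get_replication_strategy();
    // No cached token here, unlike a live mutation: derive it from the key.
    auto token = dht::global_partitioner().get_token(*m.s, m.fm.key(*m.s));
    return rs.get_natural_endpoints(std::move(token));
}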
@@ -651,8 +661,8 @@ future<> manager::end_point_hints_manager::sender::send_one_hint(lw_shared_ptr<s
        return make_ready_future<>();
    }

-    mutation m = this->get_mutation(ctx_ptr, buf);
-    gc_clock::duration gc_grace_sec = m.schema()->gc_grace_seconds();
+    auto m = this->get_mutation(ctx_ptr, buf);
+    gc_clock::duration gc_grace_sec = m.s->gc_grace_seconds();

    // The hint is too old - drop it.
    //
@@ -673,10 +683,13 @@ future<> manager::end_point_hints_manager::sender::send_one_hint(lw_shared_ptr<s
    // ignore these errors and move on - probably this hint is too old and the KS/CF has been deleted...
    } catch (no_such_column_family& e) {
        manager_logger.debug("send_hints(): no_such_column_family: {}", e.what());
        ++this->shard_stats().discarded;
+    } catch (no_such_keyspace& e) {
+        manager_logger.debug("send_hints(): no_such_keyspace: {}", e.what());
+        ++this->shard_stats().discarded;
    } catch (no_column_mapping& e) {
-        manager_logger.debug("send_hints(): {}: {}", fname, e.what());
+        manager_logger.debug("send_hints(): {} at {}: {}", fname, rp, e.what());
        ++this->shard_stats().discarded;
    }
    return make_ready_future<>();
}).finally([units = std::move(units), ctx_ptr] {});
@@ -690,10 +703,10 @@ future<> manager::end_point_hints_manager::sender::send_one_hint(lw_shared_ptr<s
bool manager::end_point_hints_manager::sender::send_one_file(const sstring& fname) {
    timespec last_mod = get_last_file_modification(fname).get0();
    gc_clock::duration secs_since_file_mod = std::chrono::seconds(last_mod.tv_sec);
-    lw_shared_ptr<send_one_file_ctx> ctx_ptr = make_lw_shared<send_one_file_ctx>();
+    lw_shared_ptr<send_one_file_ctx> ctx_ptr = make_lw_shared<send_one_file_ctx>(_last_schema_ver_to_column_mapping);

    try {
-        auto s = commitlog::read_log_file(fname, [this, secs_since_file_mod, &fname, ctx_ptr] (temporary_buffer<char> buf, db::replay_position rp) mutable {
+        auto s = commitlog::read_log_file(fname, service::get_local_streaming_read_priority(), [this, secs_since_file_mod, &fname, ctx_ptr] (temporary_buffer<char> buf, db::replay_position rp) mutable {
            // Check that we can still send the next hint. Don't try to send it if the destination host
            // is DOWN or if we have already failed to send some of the previous hints.
            if (!draining() && ctx_ptr->state.contains(send_state::segment_replay_failed)) {
@@ -747,6 +760,7 @@ bool manager::end_point_hints_manager::sender::send_one_file(const sstring& fnam

    // clear the replay position - we are going to send the next segment...
    _last_not_complete_rp = replay_position();
+    _last_schema_ver_to_column_mapping.clear();
    manager_logger.trace("send_one_file(): segment {} was sent in full and deleted", fname);
    return true;
}
@@ -759,7 +773,7 @@ void manager::end_point_hints_manager::sender::send_hints_maybe() noexcept {
    int replayed_segments_count = 0;

    try {
-        while (have_segments()) {
+        while (replay_allowed() && have_segments()) {
            if (!send_one_file(*_segments_to_replay.begin())) {
                break;
            }
@@ -784,14 +798,24 @@ void manager::end_point_hints_manager::sender::send_hints_maybe() noexcept {
    manager_logger.trace("send_hints(): we handled {} segments", replayed_segments_count);
}

+template<typename Func>
+static future<> scan_for_hints_dirs(const sstring& hints_directory, Func&& f) {
+    return lister::scan_dir(hints_directory, { directory_entry_type::directory }, [f = std::forward<Func>(f)] (lister::path dir, directory_entry de) {
+        try {
+            return f(std::move(dir), std::move(de), std::stoi(de.name.c_str()));
+        } catch (std::invalid_argument& ex) {
+            manager_logger.debug("Ignore invalid directory {}", de.name);
+            return make_ready_future<>();
+        }
+    });
+}
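The new scan_for_hints_dirs() helper factors the shard-level directory walk out of the two callers below and skips directory names that std::stoi cannot parse; note it catches std::invalid_argument only, so an out-of-range numeric name would still throw. A usage sketch (count_shard_dirs() is illustrative):

// Sketch: the callback receives the already-parsed shard id.
static future<> count_shard_dirs(const sstring& hints_directory) {
    auto count = make_lw_shared<unsigned>(0);
    return scan_for_hints_dirs(hints_directory, [count] (lister::path dir, directory_entry de, unsigned shard_id) {
        manager_logger.trace("found hints directory for shard {}", shard_id);
        ++*count;
        return make_ready_future<>();
    }).then([count] {
        manager_logger.trace("{} shard directories in total", *count);
    });
}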
// runs in seastar::async context
manager::hints_segments_map manager::get_current_hints_segments(const sstring& hints_directory) {
    hints_segments_map current_hints_segments;

-    // shards level
-    lister::scan_dir(hints_directory, { directory_entry_type::directory }, [&current_hints_segments] (lister::path dir, directory_entry de) {
-        unsigned shard_id = std::stoi(de.name.c_str());
-
+    scan_for_hints_dirs(hints_directory, [&current_hints_segments] (lister::path dir, directory_entry de, unsigned shard_id) {
        manager_logger.trace("shard_id = {}", shard_id);
        // IPs level
        return lister::scan_dir(dir / de.name.c_str(), { directory_entry_type::directory }, [&current_hints_segments, shard_id] (lister::path dir, directory_entry de) {
@@ -908,9 +932,7 @@ void manager::rebalance_segments_for(
// runs in seastar::async context
void manager::remove_irrelevant_shards_directories(const sstring& hints_directory) {
-    // shards level
-    lister::scan_dir(hints_directory, { directory_entry_type::directory }, [] (lister::path dir, directory_entry de) {
-        unsigned shard_id = std::stoi(de.name.c_str());
-
+    scan_for_hints_dirs(hints_directory, [] (lister::path dir, directory_entry de, unsigned shard_id) {
        if (shard_id >= smp::count) {
            // IPs level
            return lister::scan_dir(dir / de.name.c_str(), { directory_entry_type::directory, directory_entry_type::regular }, lister::show_hidden::yes, [] (lister::path dir, directory_entry de) {
@@ -936,5 +958,15 @@ future<> manager::rebalance(sstring hints_directory) {
    });
}

+void manager::update_backlog(size_t backlog, size_t max_backlog) {
+    _backlog_size = backlog;
+    _max_backlog_size = max_backlog;
+    if (backlog < max_backlog) {
+        allow_hints();
+    } else {
+        forbid_hints_for_eps_with_pending_hints();
+    }
+}
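update_backlog() is the single knob the space watchdog now turns: below the quota hinting is allowed, at or above it hinting is forbidden for endpoints that already have hints queued. Note there is no hysteresis, so a backlog hovering around the limit flips the state on every watchdog pass. The rule reduced to a sketch:

// Sketch of the gate applied by manager::update_backlog() above.
static void apply_backlog_gate(manager& m, size_t backlog, size_t max_backlog) {
    if (backlog < max_backlog) {
        m.allow_hints();                              // under quota: keep hinting
    } else {
        m.forbid_hints_for_eps_with_pending_hints();  // over quota: stop growing existing queues
    }
}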
}
}
@@ -59,6 +59,7 @@ private:
        uint64_t errors = 0;
        uint64_t dropped = 0;
        uint64_t sent = 0;
+        uint64_t discarded = 0;
    };

    // map: shard -> segments
@@ -69,6 +70,8 @@ private:
    class drain_tag {};
    using drain = seastar::bool_class<drain_tag>;

+    friend class space_watchdog;
+
public:
    class end_point_hints_manager {
    public:
@@ -100,7 +103,10 @@ public:
            send_state::restart_segment>>;

        struct send_one_file_ctx {
-            std::unordered_map<table_schema_version, column_mapping> schema_ver_to_column_mapping;
+            send_one_file_ctx(std::unordered_map<table_schema_version, column_mapping>& last_schema_ver_to_column_mapping)
+                : schema_ver_to_column_mapping(last_schema_ver_to_column_mapping)
+            {}
+            std::unordered_map<table_schema_version, column_mapping>& schema_ver_to_column_mapping;
            seastar::gate file_send_gate;
            std::unordered_set<db::replay_position> rps_set; // number of elements in this set is never going to be greater than the maximum send queue length
            send_state_set state;
@@ -109,6 +115,7 @@ public:
    private:
        std::list<sstring> _segments_to_replay;
        replay_position _last_not_complete_rp;
+        std::unordered_map<table_schema_version, column_mapping> _last_schema_ver_to_column_mapping;
        state_set _state;
        future<> _stopped;
        clock::time_point _next_flush_tp;
@@ -119,6 +126,7 @@ public:
        resource_manager& _resource_manager;
        service::storage_proxy& _proxy;
        database& _db;
+        seastar::scheduling_group _hints_cpu_sched_group;
        gms::gossiper& _gossiper;
        seastar::shared_mutex& _file_update_mutex;

@@ -179,6 +187,10 @@ public:
            return _state.contains(state::stopping);
        }

+        bool replay_allowed() const noexcept {
+            return _ep_manager.replay_allowed();
+        }
+
        /// \brief Try to send one hint read from the file.
        /// - Limit the maximum memory size of hints "in the air" and the maximum total number of hints "in the air".
        /// - Discard the hints that are older than the grace seconds value of the corresponding table.
@@ -210,7 +222,7 @@ public:
        /// \param ctx_ptr pointer to the send context
        /// \param buf hints file entry
        /// \return The mutation object representing the original mutation stored in the hints file.
-        mutation get_mutation(lw_shared_ptr<send_one_file_ctx> ctx_ptr, temporary_buffer<char>& buf);
+        frozen_mutation_and_schema get_mutation(lw_shared_ptr<send_one_file_ctx> ctx_ptr, temporary_buffer<char>& buf);

        /// \brief Get a reference to the column_mapping object for a given frozen mutation.
        /// \param ctx_ptr pointer to the send context
@@ -227,13 +239,13 @@ public:
        /// \param m mutation to send
        /// \param natural_endpoints current replicas for the given mutation
        /// \return future that resolves when the operation is complete
-        future<> do_send_one_mutation(mutation m, const std::vector<gms::inet_address>& natural_endpoints) noexcept;
+        future<> do_send_one_mutation(frozen_mutation_and_schema m, const std::vector<gms::inet_address>& natural_endpoints) noexcept;

        /// \brief Send one mutation out.
        ///
        /// \param m mutation to send
        /// \return future that resolves when the mutation sending processing is complete.
-        future<> send_one_mutation(mutation m);
+        future<> send_one_mutation(frozen_mutation_and_schema m);

        /// \brief Get the last modification time stamp for a given file.
        /// \param fname File name
@@ -328,6 +340,10 @@ public:
            return _hints_in_progress;
        }

+        bool replay_allowed() const noexcept {
+            return _shard_manager.replay_allowed();
+        }
+
        bool can_hint() const noexcept {
            return _state.contains(state::can_hint);
        }
@@ -393,6 +409,17 @@ public:
        }
    };

+    enum class state {
+        started,        // hinting is currently allowed (start() call is complete)
+        replay_allowed, // replaying (hints sending) is allowed
+        stopping        // hinting is not allowed - stopping is in progress (stop() method has been called)
+    };
+
+    using state_set = enum_set<super_enum<state,
+        state::started,
+        state::replay_allowed,
+        state::stopping>>;
+
private:
    using ep_key_type = typename end_point_hints_manager::key_type;
    using ep_managers_map_type = std::unordered_map<ep_key_type, end_point_hints_manager>;
@@ -403,6 +430,7 @@ public:
    static const std::chrono::seconds hint_file_write_timeout;

private:
+    state_set _state;
    const boost::filesystem::path _hints_dir;
    dev_t _hints_dir_device_id = 0;

@@ -414,7 +442,7 @@ private:
    locator::snitch_ptr& _local_snitch_ptr;
    int64_t _max_hint_window_us = 0;
    database& _local_db;
-    bool _stopping = false;

    seastar::gate _draining_eps_gate; // gate used to control the progress of ep_managers stopping not in the context of manager::stop() call

    resource_manager& _resource_manager;
@@ -424,9 +452,14 @@ private:
    seastar::metrics::metric_groups _metrics;
    std::unordered_set<ep_key_type> _eps_with_pending_hints;

+    size_t _max_backlog_size = 1;
+    size_t _backlog_size = 0;
+
public:
    manager(sstring hints_directory, std::vector<sstring> hinted_dcs, int64_t max_hint_window_ms, resource_manager& res_manager, distributed<database>& db);
    virtual ~manager();
+    manager(manager&&) = delete;
+    manager& operator=(manager&&) = delete;
    void register_metrics(const sstring& group_name);
    future<> start(shared_ptr<service::storage_proxy> proxy_ptr, shared_ptr<gms::gossiper> gossiper_ptr, shared_ptr<service::storage_service> ss_ptr);
    future<> stop();
@@ -503,6 +536,18 @@ public:
    void forbid_hints();
    void forbid_hints_for_eps_with_pending_hints();

+    size_t max_backlog_size() const {
+        return _max_backlog_size;
+    }
+
+    size_t backlog_size() const {
+        return _backlog_size;
+    }
+
+    void allow_replaying() noexcept {
+        _state.set(state::replay_allowed);
+    }
+
    /// \brief Rebalance hints segments among all present shards.
    ///
    /// The difference between the number of segments on any two shards will be not greater than 1 after the
@@ -616,6 +661,28 @@ private:
    /// \param endpoint node that left the cluster
    void drain_for(gms::inet_address endpoint);

+    void update_backlog(size_t backlog, size_t max_backlog);
+
+    bool stopping() const noexcept {
+        return _state.contains(state::stopping);
+    }
+
+    void set_stopping() noexcept {
+        _state.set(state::stopping);
+    }
+
+    bool started() const noexcept {
+        return _state.contains(state::started);
+    }
+
+    void set_started() noexcept {
+        _state.set(state::started);
+    }
+
+    bool replay_allowed() const noexcept {
+        return _state.contains(state::replay_allowed);
+    }
+
public:
    ep_managers_map_type::iterator find_ep_manager(ep_key_type ep_key) noexcept {
        return _ep_managers.find(ep_key);
@@ -27,6 +27,7 @@
#include "lister.hh"
#include "disk-error-handler.hh"
#include "seastarx.hh"
+#include <seastar/core/sleep.hh>

namespace db {
namespace hints {
@@ -65,19 +66,28 @@ const std::chrono::seconds space_watchdog::_watchdog_period = std::chrono::secon
space_watchdog::space_watchdog(shard_managers_set& managers, per_device_limits_map& per_device_limits_map)
    : _shard_managers(managers)
    , _per_device_limits_map(per_device_limits_map)
-    , _timer([this] { on_timer(); })
{}

void space_watchdog::start() {
-    _timer.arm(timer_clock_type::now());
+    _started = seastar::async([this] {
+        while (!_as.abort_requested()) {
+            try {
+                on_timer();
+            } catch (...) {
+                resource_manager_logger.trace("space_watchdog: unexpected exception - stop all hints generators");
+                // Stop all hint generators if space_watchdog callback failed
+                for (manager& shard_manager : _shard_managers) {
+                    shard_manager.forbid_hints();
+                }
+            }
+            seastar::sleep_abortable(_watchdog_period, _as).get();
+        }
+    }).handle_exception_type([] (const seastar::sleep_aborted& ignored) { });
}

future<> space_watchdog::stop() noexcept {
-    try {
-        return _gate.close().finally([this] { _timer.cancel(); });
-    } catch (...) {
-        return make_exception_future<>(std::current_exception());
-    }
+    _as.request_abort();
+    return std::move(_started);
}
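The watchdog moves from a re-armed timer to a seastar::async fiber that sleeps abortably between passes, which is what lets stop() interrupt a sleeping watchdog immediately instead of waiting for a gate to drain. The pattern in its general form (periodic_service and do_pass() are illustrative; the seastar calls are the ones used above):

// Sketch: abortable periodic fiber, mirroring space_watchdog::start()/stop().
void periodic_service::start() {
    _started = seastar::async([this] {
        while (!_as.abort_requested()) {
            try {
                do_pass();                  // one unit of periodic work
            } catch (...) {
                // a failing pass must not terminate the loop
            }
            seastar::sleep_abortable(_period, _as).get();
        }
    }).handle_exception_type([] (const seastar::sleep_aborted&) { });
}

future<> periodic_service::stop() noexcept {
    _as.request_abort();                    // wakes the fiber mid-sleep
    return std::move(_started);             // resolves once the loop exits
}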
future<> space_watchdog::scan_one_ep_dir(boost::filesystem::path path, manager& shard_manager, ep_key_type ep_key) {
@@ -94,83 +104,62 @@ future<> space_watchdog::scan_one_ep_dir(boost::filesystem::path path, manager&
    });
}

+// Called from the context of a seastar::thread.
void space_watchdog::on_timer() {
-    with_gate(_gate, [this] {
-        return futurize_apply([this] {
-            _total_size = 0;
-            // The hints directories are organized as follows:
-            // <hints root>
-            // |- <shard1 ID>
-            // |  |- <EP1 address>
-            // |     |- <hints file1>
-            // |     |- <hints file2>
-            // |     |- ...
-            // |  |- <EP2 address>
-            // |     |- ...
-            // |  |-...
-            // |- <shard2 ID>
-            // |  |- ...
-            // ...
-            // |- <shardN ID>
-            // |  |- ...
-            //
-            return do_for_each(_shard_managers, [this] (manager& shard_manager) {
-                shard_manager.clear_eps_with_pending_hints();
-                return lister::scan_dir(shard_manager.hints_dir(), {directory_entry_type::directory}, [this, &shard_manager] (lister::path dir, directory_entry de) {
-                    _files_count = 0;
-                    // Let's scan per-end-point directories and enumerate hints files...
-                    //
-                    // Let's check if there is a corresponding end point manager (may not exist if the corresponding DC is
-                    // not hintable).
-                    // If it exists - let's take a file update lock so that files are not changed under our feet. Otherwise, simply
-                    // continue to enumeration - there is no one to change them.
-                    auto it = shard_manager.find_ep_manager(de.name);
-                    if (it != shard_manager.ep_managers_end()) {
-                        return with_lock(it->second.file_update_mutex(), [this, &shard_manager, dir = std::move(dir), ep_name = std::move(de.name)]() mutable {
-                            return scan_one_ep_dir(dir / ep_name.c_str(), shard_manager, ep_key_type(ep_name));
-                        });
-                    } else {
-                        return scan_one_ep_dir(dir / de.name.c_str(), shard_manager, ep_key_type(de.name));
-                    }
-                });
-            }).then([this] {
-                return do_for_each(_per_device_limits_map, [this](per_device_limits_map::value_type& per_device_limits_entry) {
-                    space_watchdog::per_device_limits& per_device_limits = per_device_limits_entry.second;
-
-                    size_t adjusted_quota = 0;
-                    size_t delta = boost::accumulate(per_device_limits.managers, 0, [] (size_t sum, manager& shard_manager) {
-                        return sum + shard_manager.ep_managers_size() * resource_manager::hint_segment_size_in_mb * 1024 * 1024;
-                    });
-                    if (per_device_limits.max_shard_disk_space_size > delta) {
-                        adjusted_quota = per_device_limits.max_shard_disk_space_size - delta;
-                    }
-
-                    bool can_hint = _total_size < adjusted_quota;
-                    resource_manager_logger.trace("space_watchdog: total_size ({}) {} max_shard_disk_space_size ({})", _total_size, can_hint ? "<" : ">=", adjusted_quota);
-
-                    if (!can_hint) {
-                        for (manager& shard_manager : per_device_limits.managers) {
-                            shard_manager.forbid_hints_for_eps_with_pending_hints();
-                        }
-                    } else {
-                        for (manager& shard_manager : per_device_limits.managers) {
-                            shard_manager.allow_hints();
-                        }
-                    }
-                });
-            });
-        }).handle_exception([this] (auto eptr) {
-            resource_manager_logger.trace("space_watchdog: unexpected exception - stop all hints generators");
-            // Stop all hint generators if space_watchdog callback failed
-            for (manager& shard_manager : _shard_managers) {
-                shard_manager.forbid_hints();
-            }
-        }).finally([this] {
-            _timer.arm(_watchdog_period);
-        });
-    });
-}
+    // The hints directories are organized as follows:
+    // <hints root>
+    // |- <shard1 ID>
+    // |  |- <EP1 address>
+    // |     |- <hints file1>
+    // |     |- <hints file2>
+    // |     |- ...
+    // |  |- <EP2 address>
+    // |     |- ...
+    // |  |-...
+    // |- <shard2 ID>
+    // |  |- ...
+    // ...
+    // |- <shardN ID>
+    // |  |- ...
+    for (auto& per_device_limits : _per_device_limits_map | boost::adaptors::map_values) {
+        _total_size = 0;
+        for (manager& shard_manager : per_device_limits.managers) {
+            shard_manager.clear_eps_with_pending_hints();
+            lister::scan_dir(shard_manager.hints_dir(), {directory_entry_type::directory}, [this, &shard_manager] (lister::path dir, directory_entry de) {
+                _files_count = 0;
+                // Let's scan per-end-point directories and enumerate hints files...
+                //
+                // Let's check if there is a corresponding end point manager (may not exist if the corresponding DC is
+                // not hintable).
+                // If it exists - let's take a file update lock so that files are not changed under our feet. Otherwise, simply
+                // continue to enumeration - there is no one to change them.
+                auto it = shard_manager.find_ep_manager(de.name);
+                if (it != shard_manager.ep_managers_end()) {
+                    return with_lock(it->second.file_update_mutex(), [this, &shard_manager, dir = std::move(dir), ep_name = std::move(de.name)]() mutable {
+                        return scan_one_ep_dir(dir / ep_name.c_str(), shard_manager, ep_key_type(ep_name));
+                    });
+                } else {
+                    return scan_one_ep_dir(dir / de.name.c_str(), shard_manager, ep_key_type(de.name));
+                }
+            }).get();
+        }
+
+        // Adjust the quota to take into account the space we guarantee to every end point manager
+        size_t adjusted_quota = 0;
+        size_t delta = boost::accumulate(per_device_limits.managers, 0, [] (size_t sum, manager& shard_manager) {
+            return sum + shard_manager.ep_managers_size() * resource_manager::hint_segment_size_in_mb * 1024 * 1024;
+        });
+        if (per_device_limits.max_shard_disk_space_size > delta) {
+            adjusted_quota = per_device_limits.max_shard_disk_space_size - delta;
+        }
+
+        resource_manager_logger.trace("space_watchdog: consuming {}/{} bytes", _total_size, adjusted_quota);
+        for (manager& shard_manager : per_device_limits.managers) {
+            shard_manager.update_backlog(_total_size, adjusted_quota);
+        }
+    }
+}
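The quota each device group is measured against is first reduced by the per-end-point reservation: adjusted_quota = max_shard_disk_space_size - ep_managers * hint_segment_size. With illustrative numbers (not values this diff fixes): a 10 GiB per-shard limit, 32 MiB segments and 64 end point managers give a usable quota of 10 GiB - 64 * 32 MiB = 8 GiB, and update_backlog() is then called with the measured total against that 8 GiB figure.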
future<> resource_manager::start(shared_ptr<service::storage_proxy> proxy_ptr, shared_ptr<gms::gossiper> gossiper_ptr, shared_ptr<service::storage_service> ss_ptr) {
@@ -183,6 +172,10 @@ future<> resource_manager::start(shared_ptr<service::storage_proxy> proxy_ptr, s
    });
}

+void resource_manager::allow_replaying() noexcept {
+    boost::for_each(_shard_managers, [] (manager& m) { m.allow_replaying(); });
+}
+
future<> resource_manager::stop() noexcept {
    return parallel_for_each(_shard_managers, [](manager& m) {
        return m.stop();
@@ -201,14 +194,18 @@ future<> resource_manager::prepare_per_device_limits() {
    auto it = _per_device_limits_map.find(device_id);
    if (it == _per_device_limits_map.end()) {
        return is_mountpoint(shard_manager.hints_dir().parent_path()).then([this, device_id, &shard_manager](bool is_mountpoint) {
-            // By default, give each group of managers 10% of the available disk space. Give each shard an equal share of the available space.
-            size_t max_size = boost::filesystem::space(shard_manager.hints_dir().c_str()).capacity / (10 * smp::count);
-            // If hints directory is a mountpoint, we assume it's on dedicated (i.e. not shared with data/commitlog/etc) storage.
-            // Then, reserve 90% of all space instead of 10% above.
-            if (is_mountpoint) {
-                max_size *= 9;
-            }
-            _per_device_limits_map.emplace(device_id, space_watchdog::per_device_limits{{std::ref(shard_manager)}, max_size});
+            auto [it, inserted] = _per_device_limits_map.emplace(device_id, space_watchdog::per_device_limits{});
+            // Since we possibly deferred, we need to recheck the _per_device_limits_map.
+            if (inserted) {
+                // By default, give each group of managers 10% of the available disk space. Give each shard an equal share of the available space.
+                it->second.max_shard_disk_space_size = boost::filesystem::space(shard_manager.hints_dir().c_str()).capacity / (10 * smp::count);
+                // If hints directory is a mountpoint, we assume it's on dedicated (i.e. not shared with data/commitlog/etc) storage.
+                // Then, reserve 90% of all space instead of 10% above.
+                if (is_mountpoint) {
+                    it->second.max_shard_disk_space_size *= 9;
+                }
+            }
+            it->second.managers.emplace_back(std::ref(shard_manager));
        });
    } else {
        it->second.managers.emplace_back(std::ref(shard_manager));
@@ -22,6 +22,7 @@
#pragma once

#include <cstdint>
+#include <seastar/core/abort_source.hh>
#include <seastar/core/semaphore.hh>
#include <seastar/core/gate.hh>
#include <seastar/core/memory.hh>
@@ -78,8 +79,8 @@ private:
    shard_managers_set& _shard_managers;
    per_device_limits_map& _per_device_limits_map;

-    seastar::gate _gate;
-    seastar::timer<timer_clock_type> _timer;
+    future<> _started = make_ready_future<>();
+    seastar::abort_source _as;
    int _files_count = 0;

public:
@@ -137,6 +138,9 @@ public:
    , _space_watchdog(_shard_managers, _per_device_limits_map)
    {}

+    resource_manager(resource_manager&&) = delete;
+    resource_manager& operator=(resource_manager&&) = delete;
+
    future<semaphore_units<semaphore_default_exception_factory>> get_send_units_for(size_t buf_size);

    bool too_many_hints_in_progress() const {
@@ -156,6 +160,7 @@ public:
    }

    future<> start(shared_ptr<service::storage_proxy> proxy_ptr, shared_ptr<gms::gossiper> gossiper_ptr, shared_ptr<service::storage_service> ss_ptr);
+    void allow_replaying() noexcept;
    future<> stop() noexcept;
    void register_manager(manager& m);
    future<> prepare_per_device_limits();
@@ -87,7 +87,7 @@ future<> system_distributed_keyspace::start() {
    return do_with(all_tables(), [this] (std::vector<schema_ptr>& tables) {
        return do_for_each(tables, [this] (schema_ptr table) {
            return ignore_existing([this, table = std::move(table)] {
-                return _mm.announce_new_column_family(std::move(table), false);
+                return _mm.announce_new_column_family(std::move(table), api::min_timestamp, false);
            });
        });
    });
@@ -28,5 +28,6 @@
namespace db {
using timeout_clock = seastar::lowres_clock;
using timeout_semaphore = seastar::basic_semaphore<seastar::default_timeout_exception_factory, timeout_clock>;
using timeout_semaphore_units = seastar::semaphore_units<seastar::default_timeout_exception_factory, timeout_clock>;
+static constexpr timeout_clock::time_point no_timeout = timeout_clock::time_point::max();
}
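db::no_timeout gives callers of the timeout-aware semaphore a single way to say "wait indefinitely" without a second code path. A small usage sketch (the semaphore instance and the get_units overload taking a deadline are assumptions based on Seastar's basic_semaphore API, not something this hunk shows):

// Sketch: acquiring one unit with either a real deadline or db::no_timeout.
future<db::timeout_semaphore_units> acquire_unit(db::timeout_semaphore& sem, bool bounded) {
    auto deadline = bounded ? db::timeout_clock::now() + std::chrono::seconds(10)
                            : db::no_timeout;  // time_point::max() never expires
    return seastar::get_units(sem, 1, deadline);
}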
70	db/view/node_view_update_backlog.hh	Normal file
@@ -0,0 +1,70 @@
/*
 * Copyright (C) 2018 ScyllaDB
 */

/*
 * This file is part of Scylla.
 *
 * Scylla is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Scylla is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Scylla. If not, see <http://www.gnu.org/licenses/>.
 */

#pragma once

#include "db/view/view_update_backlog.hh"

#include <seastar/core/cacheline.hh>
#include <seastar/core/lowres_clock.hh>

#include <atomic>
#include <chrono>
#include <new>

namespace db::view {

/**
 * An atomic view update backlog representation, safe to update from multiple shards.
 * It is legal for a stale current max value to be returned.
 */
class node_update_backlog {
    using clock = seastar::lowres_clock;
    struct per_shard_backlog {
        // Multiply by 2 to defeat the prefetcher
        alignas(seastar::cache_line_size * 2) std::atomic<update_backlog> backlog = update_backlog::no_backlog();

        update_backlog load() const {
            return backlog.load(std::memory_order_relaxed);
        }
    };
    std::vector<per_shard_backlog> _backlogs;
    std::chrono::milliseconds _interval;
    std::atomic<clock::time_point> _last_update;
    std::atomic<update_backlog> _max;

public:
    explicit node_update_backlog(size_t shards, std::chrono::milliseconds interval)
            : _backlogs(shards)
            , _interval(interval)
            , _last_update(clock::now() - _interval)
            , _max(update_backlog::no_backlog()) {
    }

    update_backlog add_fetch(unsigned shard, update_backlog backlog);

    // Exposed for testing only.
    update_backlog load() const {
        return _max.load(std::memory_order_relaxed);
    }
};

}
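node_update_backlog keeps one atomic slot per shard, spread across cache lines, and only recomputes the node-wide maximum once per interval; in between, add_fetch() (declared above, defined in db/view/view.cc below) returns max(the caller's fresh value, the cached maximum), so results can be stale but never under-report the calling shard. A usage sketch with illustrative names and sizes:

// Sketch: one node_update_backlog instance is shared by all shards.
static db::view::update_backlog publish_backlog(db::view::node_update_backlog& nub,
                                                size_t pending_bytes, size_t quota_bytes) {
    db::view::update_backlog local{pending_bytes, quota_bytes};
    // Stores this shard's backlog and fetches the (possibly stale) node maximum.
    return nub.add_fetch(engine().cpu_id(), local);
}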
124	db/view/view.cc
@@ -58,6 +58,7 @@
#include "cql3/util.hh"
#include "db/view/view.hh"
+#include "db/view/view_builder.hh"
#include "frozen_mutation.hh"
#include "gms/inet_address.hh"
#include "keys.hh"
#include "locator/network_topology_strategy.hh"
@@ -226,10 +227,11 @@ public:
    , _updates(8, partition_key::hashing(*_view), partition_key::equality(*_view)) {
    }

-    void move_to(std::vector<mutation>& mutations) && {
+    void move_to(std::vector<frozen_mutation_and_schema>& mutations) && {
        auto& partitioner = dht::global_partitioner();
        std::transform(_updates.begin(), _updates.end(), std::back_inserter(mutations), [&, this] (auto&& m) {
-            return mutation(_view, partitioner.decorate_key(*_view, std::move(m.first)), std::move(m.second));
+            auto mut = mutation(_view, partitioner.decorate_key(*_view, std::move(m.first)), std::move(m.second));
+            return frozen_mutation_and_schema{freeze(mut), std::move(_view)};
        });
    }

@@ -627,7 +629,7 @@ public:
    , _now(gc_clock::now()) {
    }

-    future<std::vector<mutation>> build();
+    future<std::vector<frozen_mutation_and_schema>> build();

private:
    void generate_update(clustering_row&& update, stdx::optional<clustering_row>&& existing);
@@ -664,7 +666,7 @@ private:
    }
};

-future<std::vector<mutation>> view_update_builder::build() {
+future<std::vector<frozen_mutation_and_schema>> view_update_builder::build() {
    return advance_all().then([this] (auto&& ignored) {
        assert(_update && _update->is_partition_start());
        _key = std::move(std::move(_update)->as_partition_start().key().key());
@@ -679,7 +681,7 @@ future<std::vector<mutation>> view_update_builder::build() {
            });
        });
    }).then([this] {
-        std::vector<mutation> mutations;
+        std::vector<frozen_mutation_and_schema> mutations;
        for (auto&& update : _view_updates) {
            std::move(update).move_to(mutations);
        }
@@ -787,7 +789,7 @@ future<stop_iteration> view_update_builder::on_results() {
    return stop();
}

-future<std::vector<mutation>> generate_view_updates(
+future<std::vector<frozen_mutation_and_schema>> generate_view_updates(
        const schema_ptr& base,
        std::vector<view_ptr>&& views_to_update,
        flat_mutation_reader&& updates,
@@ -924,16 +926,35 @@ get_view_natural_endpoint(const sstring& keyspace_name,
// to a modification of a single base partition, and apply them to the
// appropriate paired replicas. This is done asynchronously - we do not wait
// for the writes to complete.
// FIXME: I dropped a lot of parameters the Cassandra version had,
// we may need them back: writeCommitLog, baseComplete, queryStartNanoTime.
-future<> mutate_MV(const dht::token& base_token, std::vector<mutation> mutations, db::view::stats& stats)
+future<> mutate_MV(
+        const dht::token& base_token,
+        std::vector<frozen_mutation_and_schema> view_updates,
+        db::view::stats& stats,
+        db::timeout_semaphore_units pending_view_updates)
{
    auto fs = std::make_unique<std::vector<future<>>>();
-    for (auto& mut : mutations) {
-        auto view_token = mut.token();
-        auto keyspace_name = mut.schema()->ks_name();
+    fs->reserve(view_updates.size());
+    auto& partitioner = dht::global_partitioner();
+    for (frozen_mutation_and_schema& mut : view_updates) {
+        auto view_token = partitioner.get_token(*mut.s, mut.fm.key(*mut.s));
+        auto& keyspace_name = mut.s->ks_name();
        auto paired_endpoint = get_view_natural_endpoint(keyspace_name, base_token, view_token);
        auto pending_endpoints = service::get_local_storage_service().get_token_metadata().pending_endpoints_for(view_token, keyspace_name);
+        auto maybe_account_failure = [&stats, units = pending_view_updates.split(mut.fm.representation().size())] (
+                future<>&& f,
+                gms::inet_address target,
+                bool is_local,
+                size_t remotes) {
+            if (f.failed()) {
+                stats.view_updates_failed_local += is_local;
+                stats.view_updates_failed_remote += remotes;
+                auto ep = f.get_exception();
+                vlogger.error("Error applying view update to {}: {}", target, ep);
+                return make_exception_future<>(std::move(ep));
+            } else {
+                return make_ready_future<>();
+            }
+        };
        if (paired_endpoint) {
            // When paired endpoint is the local node, we can just apply
            // the mutation locally, unless there are pending endpoints, in
@@ -951,10 +972,16 @@ future<> mutate_MV(const dht::token& base_token, std::vector<mutation> mutations
            // do not wait for it to complete.
            // Note also that mutate_locally(mut) copies mut (in
            // frozen form) so don't need to increase its lifetime.
-            fs->push_back(service::get_local_storage_proxy().mutate_locally(mut).handle_exception([&stats] (auto ep) {
-                vlogger.error("Error applying local view update: {}", ep);
-                stats.view_updates_failed_local++;
-                return make_exception_future<>(std::move(ep));
+            // send_to_endpoint() below updates statistics on pending
+            // writes but mutate_locally() doesn't, so we need to do that here.
+            ++stats.writes;
+            auto mut_ptr = std::make_unique<frozen_mutation>(std::move(mut.fm));
+            fs->push_back(service::get_local_storage_proxy().mutate_locally(mut.s, *mut_ptr).then_wrapped(
+                    [&stats,
+                     maybe_account_failure = std::move(maybe_account_failure),
+                     mut_ptr = std::move(mut_ptr)] (future<>&& f) {
+                --stats.writes;
+                return maybe_account_failure(std::move(f), utils::fb_utilities::get_broadcast_address(), true, 0);
            }));
        } else {
            vlogger.debug("Sending view update to endpoint {}, with pending endpoints = {}", *paired_endpoint, pending_endpoints);
@@ -965,14 +992,17 @@ future<> mutate_MV(const dht::token& base_token, std::vector<mutation> mutations
            // to send the update there. Currently, we do this from *each* of
            // the base replicas, but this is probably excessive - see
            // See https://issues.apache.org/jira/browse/CASSANDRA-14262/
-            fs->push_back(service::get_local_storage_proxy().send_to_endpoint(std::move(mut), *paired_endpoint, std::move(pending_endpoints), db::write_type::VIEW, stats)
-                    .handle_exception([paired_endpoint, is_endpoint_local, updates_pushed_remote, &stats] (auto ep) {
-                stats.view_updates_failed_local += is_endpoint_local;
-                stats.view_updates_failed_remote += updates_pushed_remote;
-                vlogger.error("Error applying view update to {}: {}", *paired_endpoint, ep);
-                return make_exception_future<>(std::move(ep));
-            })
-            );
+            fs->push_back(service::get_local_storage_proxy().send_to_endpoint(
+                    std::move(mut),
+                    *paired_endpoint,
+                    std::move(pending_endpoints),
+                    db::write_type::VIEW, stats).then_wrapped(
+                            [paired_endpoint,
+                             is_endpoint_local,
+                             updates_pushed_remote,
+                             maybe_account_failure = std::move(maybe_account_failure)] (future<>&& f) mutable {
+                return maybe_account_failure(std::move(f), std::move(*paired_endpoint), is_endpoint_local, updates_pushed_remote);
+            }));
        }
    } else if (!pending_endpoints.empty()) {
        // If there is no paired endpoint, it means there's a range movement going on (decommission or move),
@@ -992,10 +1022,11 @@ future<> mutate_MV(const dht::token& base_token, std::vector<mutation> mutations
            std::move(mut),
            target,
            std::move(pending_endpoints),
-            db::write_type::VIEW).handle_exception([target, updates_pushed_remote, &stats] (auto ep) {
-                stats.view_updates_failed_remote += updates_pushed_remote;
-                vlogger.error("Error applying view update to {}: {}", target, ep);
-                return make_exception_future<>(std::move(ep));
+            db::write_type::VIEW).then_wrapped(
+                    [target,
+                     updates_pushed_remote,
+                     maybe_account_failure = std::move(maybe_account_failure)] (future<>&& f) {
+                return maybe_account_failure(std::move(f), std::move(target), false, updates_pushed_remote);
            }));
        }
    }
@@ -1226,6 +1257,20 @@ future<> view_builder::calculate_shard_build_step(
        }
    }

+    // All shards need to arrive at the same decisions on whether or not to
+    // restart a view build at some common token (reshard), and which token
+    // to restart at. So we need to wait until all shards have read the view
+    // build statuses before they can all proceed to make the (same) decision.
+    // If we don't synchronize here, a fast shard may make a decision, start
+    // building and finish a build step - before the slowest shard has even read
+    // the view build information.
+    container().invoke_on(0, [] (view_builder& builder) {
+        if (++builder._shards_finished_read == smp::count) {
+            builder._shards_finished_read_promise.set_value();
+        }
+        return builder._shards_finished_read_promise.get_shared_future();
+    }).get();
+
    std::unordered_set<utils::UUID> loaded_views;
    if (view_build_status_per_shard.size() != smp::count) {
        reshard(std::move(view_build_status_per_shard), loaded_views);
@@ -1591,12 +1636,29 @@ future<> view_builder::maybe_mark_view_as_built(view_ptr view, dht::token next_t
    });
}

-future<> view_builder::wait_until_built(const sstring& ks_name, const sstring& view_name, lowres_clock::time_point timeout) {
-    return container().invoke_on(0, [ks_name, view_name, timeout] (view_builder& builder) {
+future<> view_builder::wait_until_built(const sstring& ks_name, const sstring& view_name) {
+    return container().invoke_on(0, [ks_name, view_name] (view_builder& builder) {
        auto v = std::pair(std::move(ks_name), std::move(view_name));
-        return builder._build_notifiers[std::move(v)].get_shared_future(timeout);
+        return builder._build_notifiers[std::move(v)].get_shared_future();
    });
}

+update_backlog node_update_backlog::add_fetch(unsigned shard, update_backlog backlog) {
+    _backlogs[shard].backlog.store(backlog, std::memory_order_relaxed);
+    auto now = clock::now();
+    if (now >= _last_update.load(std::memory_order_relaxed) + _interval) {
+        _last_update.store(now, std::memory_order_relaxed);
+        auto new_max = boost::accumulate(
+                _backlogs,
+                update_backlog::no_backlog(),
+                [] (const update_backlog& lhs, const per_shard_backlog& rhs) {
+            return std::max(lhs, rhs.load());
+        });
+        _max.store(new_max, std::memory_order_relaxed);
+        return new_max;
+    }
+    return std::max(backlog, _max.load(std::memory_order_relaxed));
+}
+
} // namespace view
} // namespace db
@@ -30,6 +30,10 @@
#include "flat_mutation_reader.hh"
#include "stdx.hh"

+#include <seastar/core/semaphore.hh>
+
+class frozen_mutation_and_schema;
+
namespace db {

namespace view {
@@ -90,7 +94,7 @@ bool matches_view_filter(const schema& base, const view_info& view, const partit

bool clustering_prefix_matches(const schema& base, const partition_key& key, const clustering_key_prefix& ck);

-future<std::vector<mutation>> generate_view_updates(
+future<std::vector<frozen_mutation_and_schema>> generate_view_updates(
        const schema_ptr& base,
        std::vector<view_ptr>&& views_to_update,
        flat_mutation_reader&& updates,
@@ -102,7 +106,11 @@ query::clustering_row_ranges calculate_affected_clustering_ranges(
        const mutation_partition& mp,
        const std::vector<view_ptr>& views);

-future<> mutate_MV(const dht::token& base_token, std::vector<mutation> mutations, db::view::stats& stats);
+future<> mutate_MV(
+        const dht::token& base_token,
+        std::vector<frozen_mutation_and_schema> view_updates,
+        db::view::stats& stats,
+        db::timeout_semaphore_units pending_view_updates);

/**
 * create_virtual_column() adds a "virtual column" to a schema builder.
@@ -151,6 +151,10 @@ class view_builder final : public service::migration_listener::only_view_notific
    future<> _started = make_ready_future<>();
    // Used to coordinate between shards the conclusion of the build process for a particular view.
    std::unordered_set<utils::UUID> _built_views;
+    // Counter and promise (both on shard 0 only!) allowing to wait for all
+    // shards to have read the view build statuses
+    unsigned _shards_finished_read = 0;
+    seastar::shared_promise<> _shards_finished_read_promise;
    // Used for testing.
    std::unordered_map<std::pair<sstring, sstring>, seastar::shared_promise<>, utils::tuple_hash> _build_notifiers;

@@ -178,7 +182,7 @@ public:
    virtual void on_drop_view(const sstring& ks_name, const sstring& view_name) override;

    // For tests
-    future<> wait_until_built(const sstring& ks_name, const sstring& view_name, lowres_clock::time_point timeout);
+    future<> wait_until_built(const sstring& ks_name, const sstring& view_name);

private:
    build_step& get_or_create_build_step(utils::UUID);
73	db/view/view_update_backlog.hh	Normal file
@@ -0,0 +1,73 @@
/*
 * Copyright (C) 2018 ScyllaDB
 */

/*
 * This file is part of Scylla.
 *
 * Scylla is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Scylla is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Scylla. If not, see <http://www.gnu.org/licenses/>.
 */

#pragma once

#include <cstddef>
#include <limits>

namespace db::view {

/**
 * The view update backlog represents the pending view data that a base replica
 * maintains. It is the maximum of the memory backlog - how much memory pending
 * view updates are consuming out of their allocated quota - and the disk
 * backlog - how much space view hints are consuming. The size of a backlog is
 * relative to its maximum size.
 */
struct update_backlog {
    size_t current;
    size_t max;

    float relative_size() const {
        return float(current) / float(max);
    }

    friend bool operator==(const update_backlog& lhs, const update_backlog& rhs) {
        return lhs.relative_size() == rhs.relative_size();
    }

    friend bool operator<(const update_backlog& lhs, const update_backlog& rhs) {
        return lhs.relative_size() < rhs.relative_size();
    }

    friend bool operator!=(const update_backlog& lhs, const update_backlog& rhs) {
        return !(lhs == rhs);
    }

    friend bool operator<=(const update_backlog& lhs, const update_backlog& rhs) {
        return !(rhs < lhs);
    }

    friend bool operator>(const update_backlog& lhs, const update_backlog& rhs) {
        return rhs < lhs;
    }

    friend bool operator>=(const update_backlog& lhs, const update_backlog& rhs) {
        return !(lhs < rhs);
    }

    static update_backlog no_backlog() {
        return update_backlog{0, std::numeric_limits<size_t>::max()};
    }
};

}
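All of the comparison operators above order backlogs by fill fraction, not by absolute size: {current=10, max=100} is considered smaller than {current=5, max=20} because 0.10 < 0.25, and no_backlog() (0 over the maximum size_t) sorts below everything. In code:

#include <cassert>
#include "db/view/view_update_backlog.hh"

// Worked example of the relative_size() ordering defined above.
void backlog_ordering_example() {
    db::view::update_backlog a{10, 100};                 // 10% full
    db::view::update_backlog b{5, 20};                   // 25% full
    assert(a < b);                                       // 0.10f < 0.25f
    assert(db::view::update_backlog::no_backlog() <= a); // ~0% sorts below everything
}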
66	db/view/view_update_from_staging_generator.cc	Normal file
@@ -0,0 +1,66 @@
/*
 * Copyright (C) 2018 ScyllaDB
 */

/*
 * This file is part of Scylla.
 *
 * Scylla is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Scylla is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Scylla. If not, see <http://www.gnu.org/licenses/>.
 */

#include "view_update_from_staging_generator.hh"

namespace db::view {

future<> view_update_from_staging_generator::start() {
    _started = seastar::async([this]() mutable {
        while (!_as.abort_requested()) {
            if (_sstables_with_tables.empty()) {
                _pending_sstables.wait().get();
            }
            while (!_sstables_with_tables.empty()) {
                auto& entry = _sstables_with_tables.front();
                schema_ptr s = entry.t->schema();
                flat_mutation_reader staging_sstable_reader = entry.sst->read_rows_flat(s);
                auto result = staging_sstable_reader.consume_in_thread(view_updating_consumer(s, _proxy, entry.sst, _as), db::no_timeout);
                if (result == stop_iteration::yes) {
                    break;
                }
                entry.t->move_sstable_from_staging_in_thread(entry.sst);
                _registration_sem.signal();
                _sstables_with_tables.pop_front();
            }
        }
    });
    return make_ready_future<>();
}

future<> view_update_from_staging_generator::stop() {
    _as.request_abort();
    _pending_sstables.signal();
    return std::move(_started).then([this] {
        _registration_sem.broken();
    });
}

future<> view_update_from_staging_generator::register_staging_sstable(sstables::shared_sstable sst, lw_shared_ptr<table> table) {
    if (_as.abort_requested()) {
        return make_ready_future<>();
    }
    _sstables_with_tables.emplace_back(std::move(sst), std::move(table));
    _pending_sstables.signal();
    return _registration_sem.wait(1);
}

}
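register_staging_sstable() pairs a condition variable (to wake the generator fiber) with a counting semaphore sized registration_queue_size (so at most that many registrations are in flight before producers block; the fiber signals one unit back per finished sstable). The handshake in miniature (bounded_submit() and the int queue are illustrative stand-ins):

// Sketch of the producer side of the backpressure used above.
future<> bounded_submit(seastar::condition_variable& work_ready, seastar::semaphore& slots,
                        std::deque<int>& queue, int item) {
    queue.push_back(item);   // enqueue work for the consumer fiber
    work_ready.signal();     // wake the fiber if it is blocked in wait()
    return slots.wait(1);    // suspend the producer once the queue is full;
                             // the fiber calls slots.signal() per finished item
}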
56	db/view/view_update_from_staging_generator.hh	Normal file
@@ -0,0 +1,56 @@
/*
 * Copyright (C) 2018 ScyllaDB
 */

/*
 * This file is part of Scylla.
 *
 * Scylla is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Scylla is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Scylla. If not, see <http://www.gnu.org/licenses/>.
 */

#pragma once

#include "database.hh"
#include "sstables/sstables.hh"
#include "db/view/view_updating_consumer.hh"

#include <seastar/core/abort_source.hh>
#include <seastar/core/condition-variable.hh>
#include <seastar/core/semaphore.hh>

namespace db::view {

class view_update_from_staging_generator {
    static constexpr size_t registration_queue_size = 5;
    database& _db;
    service::storage_proxy& _proxy;
    seastar::abort_source _as;
    future<> _started = make_ready_future<>();
    seastar::condition_variable _pending_sstables;
    semaphore _registration_sem{registration_queue_size};
    struct sstable_with_table {
        sstables::shared_sstable sst;
        lw_shared_ptr<table> t;
        sstable_with_table(sstables::shared_sstable sst, lw_shared_ptr<table> t) : sst(sst), t(t) { }
    };
    std::deque<sstable_with_table> _sstables_with_tables;
public:
    view_update_from_staging_generator(database& db, service::storage_proxy& proxy) : _db(db), _proxy(proxy) { }

    future<> start();
    future<> stop();
    future<> register_staging_sstable(sstables::shared_sstable sst, lw_shared_ptr<table> table);
};

}
92	db/view/view_updating_consumer.hh	Normal file
@@ -0,0 +1,92 @@
/*
 * Copyright (C) 2018 ScyllaDB
 */

/*
 * This file is part of Scylla.
 *
 * Scylla is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Scylla is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Scylla. If not, see <http://www.gnu.org/licenses/>.
 */

#pragma once

#include "service/storage_proxy.hh"
#include "dht/i_partitioner.hh"
#include "schema.hh"
#include "mutation_fragment.hh"
#include "sstables/shared_sstable.hh"

namespace db::view {

/*
 * A consumer that pushes materialized view updates for each consumed mutation.
 * It is expected to be run in seastar::async threaded context through consume_in_thread()
 */
class view_updating_consumer {
    schema_ptr _schema;
    lw_shared_ptr<table> _table;
    sstables::shared_sstable _excluded_sstable;
    const seastar::abort_source& _as;
    std::optional<mutation> _m;
public:
    view_updating_consumer(schema_ptr schema, service::storage_proxy& proxy, sstables::shared_sstable excluded_sstable, const seastar::abort_source& as)
            : _schema(std::move(schema))
            , _table(proxy.get_db().local().find_column_family(_schema->id()).shared_from_this())
            , _excluded_sstable(excluded_sstable)
            , _as(as)
            , _m()
    { }

    void consume_new_partition(const dht::decorated_key& dk) {
        _m = mutation(_schema, dk, mutation_partition(_schema));
    }

    void consume(tombstone t) {
        _m->partition().apply(std::move(t));
    }

    stop_iteration consume(static_row&& sr) {
        if (_as.abort_requested()) {
            return stop_iteration::yes;
        }
        _m->partition().apply(*_schema, std::move(sr));
        return stop_iteration::no;
    }

    stop_iteration consume(clustering_row&& cr) {
        if (_as.abort_requested()) {
            return stop_iteration::yes;
        }
        _m->partition().apply(*_schema, std::move(cr));
        return stop_iteration::no;
    }

    stop_iteration consume(range_tombstone&& rt) {
        if (_as.abort_requested()) {
            return stop_iteration::yes;
        }
        _m->partition().apply(*_schema, std::move(rt));
        return stop_iteration::no;
    }

    // Expected to be run in seastar::async threaded context (consume_in_thread())
    stop_iteration consume_end_of_partition();

    stop_iteration consume_end_of_stream() {
        return stop_iteration(_as.abort_requested());
    }
};

}
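The consumer above is designed to be driven by flat_mutation_reader::consume_in_thread(), which is exactly how view_update_from_staging_generator::start() uses it from a seastar::async fiber: one partition is accumulated at a time, every step checks the abort source, and consume_end_of_partition() (defined elsewhere) pushes the resulting view updates. A call-site sketch assembled from those pieces (replay_staging_sstable() is illustrative):

// Sketch: replaying one staging sstable through the consumer (runs inside
// a seastar::thread; `as` is the generator's abort_source).
static stop_iteration replay_staging_sstable(schema_ptr s, service::storage_proxy& proxy,
                                             sstables::shared_sstable sst, seastar::abort_source& as) {
    flat_mutation_reader reader = sst->read_rows_flat(s);
    // Returns stop_iteration::yes if the abort source fired mid-stream.
    return reader.consume_in_thread(db::view::view_updating_consumer(s, proxy, sst, as), db::no_timeout);
}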
@@ -49,7 +49,7 @@ namespace dht {
future<> boot_strapper::bootstrap() {
    blogger.debug("Beginning bootstrap process: sorted_tokens={}", _token_metadata.sorted_tokens());

-    auto streamer = make_lw_shared<range_streamer>(_db, _token_metadata, _tokens, _address, "Bootstrap");
+    auto streamer = make_lw_shared<range_streamer>(_db, _token_metadata, _tokens, _address, "Bootstrap", streaming::stream_reason::bootstrap);
    streamer->add_source_filter(std::make_unique<range_streamer::failure_detector_source_filter>(gms::get_local_failure_detector()));
    for (const auto& keyspace_name : _db.local().get_non_system_keyspaces()) {
        auto& ks = _db.local().find_keyspace(keyspace_name);

@@ -294,7 +294,7 @@ future<> range_streamer::do_stream_async() {
    size_t nr_ranges_per_stream_plan = nr_ranges_total / 10;
    dht::token_range_vector ranges_to_stream;
    auto do_streaming = [&] {
-        auto sp = stream_plan(sprint("%s-%s-index-%d", description, keyspace, sp_index++));
+        auto sp = stream_plan(sprint("%s-%s-index-%d", description, keyspace, sp_index++), _reason);
        logger.info("{} with {} for keyspace={}, {} out of {} ranges: ranges = {}",
            description, source, keyspace, nr_ranges_streamed, nr_ranges_total, ranges_to_stream.size());
        if (_nr_rx_added) {

@@ -42,6 +42,7 @@
#include "locator/snitch_base.hh"
#include "streaming/stream_plan.hh"
#include "streaming/stream_state.hh"
+#include "streaming/stream_reason.hh"
#include "gms/inet_address.hh"
#include "gms/i_failure_detector.hh"
#include "range.hh"
@@ -101,17 +102,18 @@ public:
    }
};

-range_streamer(distributed<database>& db, token_metadata& tm, std::unordered_set<token> tokens, inet_address address, sstring description)
+range_streamer(distributed<database>& db, token_metadata& tm, std::unordered_set<token> tokens, inet_address address, sstring description, streaming::stream_reason reason)
    : _db(db)
    , _metadata(tm)
    , _tokens(std::move(tokens))
    , _address(address)
    , _description(std::move(description))
+    , _reason(reason)
    , _stream_plan(_description) {
}

-range_streamer(distributed<database>& db, token_metadata& tm, inet_address address, sstring description)
-    : range_streamer(db, tm, std::unordered_set<token>(), address, description) {
+range_streamer(distributed<database>& db, token_metadata& tm, inet_address address, sstring description, streaming::stream_reason reason)
+    : range_streamer(db, tm, std::unordered_set<token>(), address, description, reason) {
}

void add_source_filter(std::unique_ptr<i_source_filter> filter) {
@@ -166,6 +168,7 @@ private:
    std::unordered_set<token> _tokens;
    inet_address _address;
    sstring _description;
+    streaming::stream_reason _reason;
    std::unordered_multimap<sstring, std::unordered_map<inet_address, dht::token_range_vector>> _to_stream;
    std::unordered_set<std::unique_ptr<i_source_filter>> _source_filters;
    stream_plan _stream_plan;
dist/ami/build_ami.sh
@@ -78,7 +78,7 @@ if [ $LOCALRPM -eq 1 ]; then
 fi
 if [ ! -f dist/ami/files/scylla-jmx.noarch.rpm ]; then
     cd build
-    git clone --depth 1 https://github.com/scylladb/scylla-jmx.git
+    git clone -b branch-3.0 --depth 1 https://github.com/scylladb/scylla-jmx.git
     cd scylla-jmx
     dist/redhat/build_rpm.sh --target epel-7-x86_64
     cd ../..
dist/common/scripts/node_exporter_install
@@ -25,7 +25,7 @@ import tempfile
 import tarfile
 from scylla_util import *

-VERSION='0.14.0'
+VERSION='0.17.0'
 INSTALL_DIR='/usr/lib/scylla/Prometheus/node_exporter'

 if __name__ == '__main__':
dist/common/scripts/scylla_prepare
@@ -62,10 +62,9 @@ if __name__ == '__main__':
         run('hugeadm --create-mounts')
     fi
 else:
-    set_nic = cfg.get('SET_NIC')
+    set_nic_and_disks = get_set_nic_and_disks_config_value(cfg)
     ifname = cfg.get('IFNAME')
-    if set_nic == 'yes':
+    if set_nic_and_disks == 'yes':
         create_perftune_conf(ifname)
-        run('/usr/lib/scylla/posix_net_conf.sh {IFNAME} --options-file /etc/scylla.d/perftune.yaml'.format(IFNAME=ifname))
+        run("{} --options-file /etc/scylla.d/perftune.yaml".format(perftune_base_command()))

 run('/usr/lib/scylla/scylla-blocktune')
dist/common/scripts/scylla_setup
@@ -122,8 +122,8 @@ if __name__ == '__main__':
                         help='specify NTP domain')
     parser.add_argument('--ami', action='store_true', default=False,
                         help='setup AMI instance')
-    parser.add_argument('--setup-nic', action='store_true', default=False,
-                        help='optimize NIC queue')
+    parser.add_argument('--setup-nic-and-disks', action='store_true', default=False,
+                        help='optimize NIC and disks')
     parser.add_argument('--developer-mode', action='store_true', default=False,
                         help='enable developer mode')
     parser.add_argument('--no-ec2-check', action='store_true', default=False,

@@ -173,7 +173,7 @@ if __name__ == '__main__':

     disks = args.disks
     nic = args.nic
-    set_nic = args.setup_nic
+    set_nic_and_disks = args.setup_nic_and_disks
     ec2_check = not args.no_ec2_check
     kernel_check = not args.no_kernel_check
     verify_package = not args.no_verify_package

@@ -336,11 +336,11 @@ if __name__ == '__main__':
     if interactive:
         sysconfig_setup = interactive_ask_service('Do you want to setup a system-wide customized configuration for Scylla?', 'Yes - setup the sysconfig file. No - skips this step.', 'yes')
         if sysconfig_setup:
-            nic = interactive_choose_nic()
-    if interactive:
-        set_nic = interactive_ask_service('Do you want to enable Network Interface Card (NIC) optimization?', 'Yes - optimize the NIC queue settings. Selecting Yes greatly improves performance. No - skip this step.', 'yes')
+            nic = interactive_choose_nic()
+            set_nic_and_disks = interactive_ask_service('Do you want to enable Network Interface Card (NIC) and disk(s) optimization?', 'Yes - optimize the NIC queue and disks settings. Selecting Yes greatly improves performance. No - skip this step.', 'yes')
     if sysconfig_setup:
-        setup_args = '--setup-nic' if set_nic else ''
+        setup_args = '--setup-nic-and-disks' if set_nic_and_disks else ''
         run_setup_script('NIC queue', '/usr/lib/scylla/scylla_sysconfig_setup --nic {nic} {setup_args}'.format(nic=nic, setup_args=setup_args))

     if interactive:
dist/common/scripts/scylla_sysconfig_setup
@@ -40,7 +40,7 @@ if __name__ == '__main__':
     cfg = sysconfig_parser('/etc/sysconfig/scylla-server')
 else:
     cfg = sysconfig_parser('/etc/default/scylla-server')
-set_nic = str2bool(cfg.get('SET_NIC'))
+set_nic_and_disks = str2bool(get_set_nic_and_disks_config_value(cfg))
 ami = str2bool(cfg.get('AMI'))

 parser = argparse.ArgumentParser(description='Setting parameters on Scylla sysconfig file.')

@@ -58,8 +58,8 @@
                     help='scylla home directory')
 parser.add_argument('--confdir',
                     help='scylla config directory')
-parser.add_argument('--setup-nic', action='store_true', default=set_nic,
-                    help='setup NIC\'s interrupts, RPS, XPS')
+parser.add_argument('--setup-nic-and-disks', action='store_true', default=set_nic_and_disks,
+                    help='setup NIC\'s and disks\' interrupts, RPS, XPS, nomerges and I/O scheduler')
 parser.add_argument('--ami', action='store_true', default=ami,
                     help='AMI instance mode')
 args = parser.parse_args()

@@ -71,8 +71,8 @@
 ifname = args.nic if args.nic else cfg.get('IFNAME')
 network_mode = args.mode if args.mode else cfg.get('NETWORK_MODE')

-if args.setup_nic:
-    rps_cpus = out('/usr/lib/scylla/posix_net_conf.sh --cpu-mask {}'.format(ifname))
+if args.setup_nic_and_disks:
+    rps_cpus = out('{} --tune net --nic {} --get-cpu-mask'.format(perftune_base_command(), ifname))
 if len(rps_cpus) > 0:
     cpuset = hex2list(rps_cpus)
     run('/usr/lib/scylla/scylla_cpuset_setup --cpuset {}'.format(cpuset))

@@ -104,8 +104,13 @@
     cfg.set('SCYLLA_HOME', args.homedir)
 if args.confdir:
     cfg.set('SCYLLA_CONF', args.confdir)
-if str2bool(cfg.get('SET_NIC')) != args.setup_nic:
-    cfg.set('SET_NIC', bool2str(args.setup_nic))
+if str2bool(get_set_nic_and_disks_config_value(cfg)) != args.setup_nic_and_disks:
+    if cfg.has_option('SET_NIC'):
+        cfg.set('SET_NIC', bool2str(args.setup_nic_and_disks))
+    else:
+        cfg.set('SET_NIC_AND_DISKS', bool2str(args.setup_nic_and_disks))
+
 if str2bool(cfg.get('AMI')) != args.ami:
     cfg.set('AMI', bool2str(args.ami))
 cfg.commit()
dist/common/scripts/scylla_util.py
@@ -28,6 +28,7 @@ import time
 import urllib.error
 import urllib.parse
 import urllib.request
+import yaml


 def curl(url, byte=False):

@@ -384,6 +385,37 @@ def get_mode_cpuset(nic, mode):
     except subprocess.CalledProcessError:
         return '-1'

+def get_scylla_dirs():
+    """
+    Returns a list of scylla directories configured in /etc/scylla/scylla.yaml.
+    Verifies that mandatory parameters are set.
+    """
+    scylla_yaml_name = '/etc/scylla/scylla.yaml'
+    y = yaml.load(open(scylla_yaml_name))
+
+    # Check that mandatory fields are set
+    if 'data_file_directories' not in y or \
+            not y['data_file_directories'] or \
+            not len(y['data_file_directories']) or \
+            not " ".join(y['data_file_directories']).strip():
+        raise Exception("{}: at least one directory has to be set in 'data_file_directory'".format(scylla_yaml_name))
+    if 'commitlog_directory' not in y or not y['commitlog_directory']:
+        raise Exception("{}: 'commitlog_directory' has to be set".format(scylla_yaml_name))
+
+    dirs = []
+    dirs.extend(y['data_file_directories'])
+    dirs.append(y['commitlog_directory'])
+
+    if 'hints_directory' in y and y['hints_directory']:
+        dirs.append(y['hints_directory'])
+    if 'view_hints_directory' in y and y['view_hints_directory']:
+        dirs.append(y['view_hints_directory'])
+
+    return [d for d in dirs if d is not None]
+
+def perftune_base_command():
+    disk_tune_param = "--tune disks " + " ".join("--dir {}".format(d) for d in get_scylla_dirs())
+    return '/usr/lib/scylla/perftune.py {}'.format(disk_tune_param)
+
 def get_cur_cpuset():
     cfg = sysconfig_parser('/etc/scylla.d/cpuset.conf')

@@ -419,6 +451,25 @@ def create_perftune_conf(nic='eth0'):
 def is_valid_nic(nic):
     return os.path.exists('/sys/class/net/{}'.format(nic))

+# Remove this when we do not support SET_NIC configuration value anymore
+def get_set_nic_and_disks_config_value(cfg):
+    """
+    Get the SET_NIC_AND_DISKS configuration value.
+    Return the SET_NIC configuration value if SET_NIC_AND_DISKS is not found (old releases case).
+    :param cfg: sysconfig_parser object
+    :return configuration value
+    :except If the configuration value is not found
+    """
+    # Sanity check
+    if cfg.has_option('SET_NIC_AND_DISKS') and cfg.has_option('SET_NIC'):
+        raise Exception("Only one of 'SET_NIC_AND_DISKS' and 'SET_NIC' is allowed to be present")
+
+    try:
+        return cfg.get('SET_NIC_AND_DISKS')
+    except:
+        # For backwards compatibility
+        return cfg.get('SET_NIC')
+
 class SystemdException(Exception):
     pass

@@ -483,8 +534,11 @@ class sysconfig_parser:
     def get(self, key):
         return self._cfg.get('global', key).strip('"')

+    def has_option(self, key):
+        return self._cfg.has_option('global', key)
+
     def set(self, key, val):
-        if not self._cfg.has_option('global', key):
+        if not self.has_option(key):
             return self.__add(key, val)
         self._data = re.sub('^{}=[^\n]*$'.format(key), '{}="{}"'.format(key, self.__escape(val)), self._data, flags=re.MULTILINE)
         self.__load()
dist/common/sysconfig/scylla-server
@@ -10,8 +10,8 @@ BRIDGE=virbr0
 # ethernet device name
 IFNAME=eth0

-# setup NIC's interrupts, RPS, XPS (posix)
-SET_NIC=no
+# setup NIC's and disks' interrupts, RPS, XPS, nomerges and I/O scheduler (posix)
+SET_NIC_AND_DISKS=no

 # ethernet device driver (dpdk)
 ETHDRV=
dist/common/sysctl.d/99-scylla-aio.conf (new file)
@@ -0,0 +1,2 @@
+# Raise max AIO events
+fs.aio-max-nr = 1048576
dist/common/systemd/node-exporter.service
@@ -5,7 +5,7 @@ Description=Node Exporter
 Type=simple
 User=scylla
 Group=scylla
-ExecStart=/usr/bin/node_exporter -collectors.enabled interrupts,conntrack,diskstats,entropy,filefd,filesystem,loadavg,mdadm,meminfo,netdev,netstat,sockstat,stat,textfile,time,uname,vmstat
+ExecStart=/usr/bin/node_exporter --collector.interrupts

 [Install]
 WantedBy=multi-user.target
@@ -6,7 +6,12 @@ After=network.target
 Type=simple
 User=scylla
 Group=scylla
+{{#debian}}
+ExecStart=/usr/lib/scylla/scylla-housekeeping --uuid-file /var/lib/scylla-housekeeping/housekeeping.uuid -q -c /etc/scylla.d/housekeeping.cfg --repo-files '/etc/apt/sources.list.d/scylla*.list' version --mode r
+{{/debian}}
+{{#redhat}}
 ExecStart=/usr/lib/scylla/scylla-housekeeping --uuid-file /var/lib/scylla-housekeeping/housekeeping.uuid -q -c /etc/scylla.d/housekeeping.cfg --repo-files '/etc/yum.repos.d/scylla*.repo' version --mode r
+{{/redhat}}

 [Install]
 WantedBy=multi-user.target
@@ -1 +1,2 @@
 dist/common/sysctl.d/99-scylla-sched.conf /etc/sysctl.d
+dist/common/sysctl.d/99-scylla-aio.conf /etc/sysctl.d
@@ -9,6 +9,7 @@ if [[ $KVER =~ 3\.13\.0\-([0-9]+)-generic ]]; then
 else
     # expect failures in virtualized environments
     sysctl -p/etc/sysctl.d/99-scylla-sched.conf || :
+    sysctl -p/etc/sysctl.d/99-scylla-aio.conf || :
 fi

 #DEBHELPER#
dist/debian/debian/scylla-server.dirs
@@ -4,5 +4,6 @@ var/lib/scylla
 var/lib/scylla/data
 var/lib/scylla/commitlog
 var/lib/scylla/hints
+var/lib/scylla/view_hints
 var/lib/scylla/coredump
 var/lib/scylla-housekeeping
dist/debian/rules.mustache
@@ -4,7 +4,7 @@ export PYBUILD_DISABLE=1
 jobs := $(shell echo $$DEB_BUILD_OPTIONS | sed -r "s/.*parallel=([0-9]+).*/-j\1/")

 override_dh_auto_configure:
-	./configure.py --with=scylla --with=iotune --enable-dpdk --mode=release --static-thrift --static-boost --static-yaml-cpp --compiler=/opt/scylladb/bin/g++-7 --cflags="-I/opt/scylladb/include -L/opt/scylladb/lib/x86-linux-gnu/" --ldflags="-Wl,-rpath=/opt/scylladb/lib"
+	./configure.py --with=scylla --with=iotune --enable-dpdk --mode=release --static-thrift --static-boost --static-yaml-cpp --compiler=/opt/scylladb/bin/g++-7 --c-compiler=/opt/scylladb/bin/gcc-7 --cflags="-I/opt/scylladb/include -L/opt/scylladb/lib/x86-linux-gnu/" --ldflags="-Wl,-rpath=/opt/scylladb/lib"

 override_dh_auto_build:
 	PATH="/opt/scylladb/bin:$$PATH" ninja $(jobs)
dist/debian/scylla-server.install.mustache
@@ -1,7 +1,6 @@
 dist/common/limits.d/scylla.conf etc/security/limits.d
 dist/common/scylla.d/*.conf etc/scylla.d
 seastar/dpdk/usertools/dpdk-devbind.py usr/lib/scylla
-seastar/scripts/posix_net_conf.sh usr/lib/scylla
 seastar/scripts/perftune.py usr/lib/scylla
 dist/common/scripts/* usr/lib/scylla
 scylla-housekeeping usr/lib/scylla
dist/docker/redhat/Dockerfile
@@ -26,7 +26,7 @@ ADD commandlineparser.py /commandlineparser.py
 ADD docker-entrypoint.py /docker-entrypoint.py

 # Install Scylla:
-RUN curl http://downloads.scylladb.com/rpm/unstable/centos/master/latest/scylla.repo -o /etc/yum.repos.d/scylla.repo && \
+RUN curl http://downloads.scylladb.com/rpm/centos/scylla-3.0.repo -o /etc/yum.repos.d/scylla.repo && \
     yum -y install epel-release && \
     yum -y clean expire-cache && \
     yum -y update && \
@@ -10,8 +10,8 @@ BRIDGE=virbr0
 # ethernet device name
 IFNAME=eth0

-# setup NIC's interrupts, RPS, XPS (posix)
-SET_NIC=no
+# setup NIC's and disks' interrupts, RPS, XPS, nomerges and I/O scheduler (posix)
+SET_NIC_AND_DISKS=no

 # ethernet device driver (dpdk)
 ETHDRV=
@@ -91,7 +91,27 @@ mkdir -p build/offline_installer
 cp dist/offline_installer/redhat/header build/offline_installer
 sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve scylla
 # XXX: resolve option doesn't fetch some dependencies, need to manually fetch them
 sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve sudo.x86_64
 sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve ntp.x86_64
 sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve libedit.x86_64
 sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve ntpdate.x86_64
+sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve net-tools.x86_64
+sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve kernel
+sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve grubby.x86_64
+sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve linux-firmware
+sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve initscripts.x86_64
+sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve iproute.x86_64
+sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve iptables.x86_64
+sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve libnfnetlink.x86_64
+sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve libnetfilter_conntrack.x86_64
+sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve libmnl.x86_64
+sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve sysvinit-tools.x86_64
+sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve yajl.x86_64
+sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve mdadm.x86_64
+sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve libreport-filesystem.x86_64
+sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve xfsprogs.x86_64
+sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve PyYAML.x86_64
+sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve libyaml.x86_64
+sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve libjpeg-turbo.x86_64
+sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve libaio.x86_64
+sudo yumdownloader --installroot=`pwd`/build/installroot --archlist=x86_64 --destdir=build/offline_installer --resolve snappy.x86_64
dist/redhat/build_rpm.sh
@@ -108,11 +108,11 @@ fix_ownership() {
 if [ $JOBS -gt 0 ]; then
     RPM_JOBS_OPTS=(--define="_smp_mflags -j$JOBS")
 fi
-sudo mock --buildsrpm --root=$TARGET --resultdir=`pwd`/build/srpms --spec=build/scylla.spec --sources=build/$PRODUCT-$VERSION.tar $SRPM_OPTS "${RPM_JOBS_OPTS[@]}"
+sudo mock --rootdir=`pwd`/build/mock --buildsrpm --root=$TARGET --resultdir=`pwd`/build/srpms --spec=build/scylla.spec --sources=build/$PRODUCT-$VERSION.tar $SRPM_OPTS "${RPM_JOBS_OPTS[@]}"
 fix_ownership build/srpms
 if [[ "$TARGET" =~ ^epel-7- ]]; then
     TARGET=scylla-$TARGET
     RPM_OPTS="$RPM_OPTS --configdir=dist/redhat/mock"
 fi
-sudo mock --rebuild --root=$TARGET --resultdir=`pwd`/build/rpms $RPM_OPTS "${RPM_JOBS_OPTS[@]}" build/srpms/$PRODUCT-$VERSION*.src.rpm
+sudo mock --rootdir=`pwd`/build/mock --rebuild --root=$TARGET --resultdir=`pwd`/build/rpms $RPM_OPTS "${RPM_JOBS_OPTS[@]}" build/srpms/$PRODUCT-$VERSION*.src.rpm
 fix_ownership build/rpms
dist/redhat/scylla.spec.mustache
@@ -56,7 +56,7 @@ License: AGPLv3
 URL: http://www.scylladb.com/
 BuildRequires: libaio-devel libstdc++-devel cryptopp-devel hwloc-devel numactl-devel libpciaccess-devel libxml2-devel zlib-devel thrift-devel yaml-cpp-devel lz4-devel snappy-devel jsoncpp-devel systemd-devel xz-devel pcre-devel elfutils-libelf-devel bzip2-devel keyutils-libs-devel xfsprogs-devel make gnutls-devel systemd-devel lksctp-tools-devel protobuf-devel protobuf-compiler systemtap-sdt-devel ninja-build cmake python ragel grep kernel-headers
 %{?fedora:BuildRequires: boost-devel antlr3-tool antlr3-C++-devel python3 gcc-c++ libasan libubsan python3-pyparsing dnf-yum python2-pystache}
-%{?rhel:BuildRequires: scylla-libstdc++73-static scylla-boost163-devel scylla-boost163-static scylla-antlr35-tool scylla-antlr35-C++-devel python34 scylla-gcc73-c++, scylla-python34-pyparsing20 yaml-cpp-static pystache python-setuptools}
+%{?rhel:BuildRequires: scylla-libstdc++73-static scylla-libatomic73-static scylla-boost163-devel scylla-boost163-static scylla-antlr35-tool scylla-antlr35-C++-devel python34 scylla-gcc73-c++, scylla-python34-pyparsing20 yaml-cpp-static pystache python-setuptools}
 Requires: {{product}}-conf systemd-libs hwloc PyYAML python-urwid pciutils pyparsing python-requests curl util-linux python-setuptools pciutils python3-pyudev mdadm xfsprogs
 %{?rhel:Requires: python34 python34-PyYAML kernel >= 3.10.0-514}
 %{?fedora:Requires: python3 python3-PyYAML}

@@ -97,7 +97,7 @@ cflags="--cflags=${defines[*]}"
 %endif
 %if 0%{?rhel}
 . /etc/profile.d/scylla.sh
-python3.4 ./configure.py %{?configure_opt} --with=scylla --with=iotune --mode=release "$cflags" --static-boost --static-yaml-cpp --compiler=/opt/scylladb/bin/g++-7.3 --python python3.4 --ldflag=-Wl,-rpath=/opt/scylladb/lib64
+python3.4 ./configure.py %{?configure_opt} --with=scylla --with=iotune --mode=release "$cflags" --static-boost --static-yaml-cpp --compiler=/opt/scylladb/bin/g++-7.3 --c-compiler=/opt/scylladb/bin/gcc-7.3 --python python3.4 --ldflag=-Wl,-rpath=/opt/scylladb/lib64
 %endif
 ninja-build %{?_smp_mflags} build/release/scylla build/release/iotune

@@ -193,7 +193,6 @@ rm -rf $RPM_BUILD_ROOT
 %{_prefix}/lib/scylla/scylla_cpuscaling_setup
 %{_prefix}/lib/scylla/scylla_fstrim
 %{_prefix}/lib/scylla/scylla_fstrim_setup
-%{_prefix}/lib/scylla/posix_net_conf.sh
 %{_prefix}/lib/scylla/perftune.py
 %{_prefix}/lib/scylla/dpdk-devbind.py
 %{_prefix}/lib/scylla/hex2list.py

@@ -209,6 +208,7 @@ rm -rf $RPM_BUILD_ROOT
 %attr(0755,scylla,scylla) %dir %{_sharedstatedir}/scylla/data
 %attr(0755,scylla,scylla) %dir %{_sharedstatedir}/scylla/commitlog
 %attr(0755,scylla,scylla) %dir %{_sharedstatedir}/scylla/hints
+%attr(0755,scylla,scylla) %dir %{_sharedstatedir}/scylla/view_hints
 %attr(0755,scylla,scylla) %dir %{_sharedstatedir}/scylla/coredump
 %attr(0755,scylla,scylla) %dir %{_sharedstatedir}/scylla-housekeeping
 %ghost /etc/systemd/system/scylla-server.service.d/

@@ -283,6 +283,7 @@ if Scylla is the main application on your server and you wish to optimize its la
 # We cannot use the sysctl_apply rpm macro because it is not present in 7.0
 # following is a "manual" expansion
 /usr/lib/systemd/systemd-sysctl 99-scylla-sched.conf >/dev/null 2>&1 || :
+/usr/lib/systemd/systemd-sysctl 99-scylla-aio.conf >/dev/null 2>&1 || :

 %files kernel-conf
 %defattr(-,root,root)
@@ -66,7 +66,7 @@ You can use Docker volumes to improve performance of Scylla.
 Create a Scylla data directory ``/var/lib/scylla`` on the host, which is used by Scylla container to store all data:

 ```console
-$ sudo mkdir -p /var/lib/scylla/data /var/lib/scylla/commitlog /var/lib/scylla/hints
+$ sudo mkdir -p /var/lib/scylla/data /var/lib/scylla/commitlog /var/lib/scylla/hints /var/lib/scylla/view_hints
 ```

 Launch Scylla using Docker's ``--volume`` command line option to mount the created host directory as a data volume in the container and disable Scylla's developer mode to run I/O tuning before starting up the Scylla node.
@@ -41,12 +41,11 @@ struct encoding_stats {
     // int DELETION_TIME_EPOCH = (int)(c.getTimeInMillis() / 1000); // local deletion times are in seconds
     // Encoding stats are used for delta-encoding, so we want some default values
     // that are just good enough so we take some recent date in the past
-    static constexpr uint32_t deletion_time_epoch = 1442880000;
+    static constexpr int32_t deletion_time_epoch = 1442880000;
     static constexpr api::timestamp_type timestamp_epoch = api::timestamp_type(deletion_time_epoch) * 1000 * 1000;
-    static constexpr uint32_t ttl_epoch = 0;
+    static constexpr int32_t ttl_epoch = 0;

     api::timestamp_type min_timestamp = timestamp_epoch;
-    uint32_t min_local_deletion_time = deletion_time_epoch;
-    uint32_t min_ttl = ttl_epoch;
+    int32_t min_local_deletion_time = deletion_time_epoch;
+    int32_t min_ttl = ttl_epoch;
 };
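The switch from uint32_t to int32_t matters because, as the comment notes, these fields feed delta-encoding against the epoch, and deltas can be negative for data older than the epoch; with unsigned types the subtraction wraps around instead. A small standalone demonstration of that failure class (the actual sstable encoder is not shown in this diff):

```cpp
#include <cstdint>
#include <iostream>

int main() {
    // deletion_time_epoch from the hunk above (2015-09-22 00:00 UTC).
    constexpr int64_t epoch = 1442880000;
    // A local deletion time that predates the epoch.
    int64_t ldt = 1000000000;

    // Delta-encoding stores (value - epoch). As int32_t the negative
    // delta survives; forced through uint32_t it wraps around.
    int32_t as_signed = static_cast<int32_t>(ldt - epoch);
    uint32_t as_unsigned = static_cast<uint32_t>(ldt - epoch);
    std::cout << as_signed << "\n";   // -442880000
    std::cout << as_unsigned << "\n"; // 3852087296 (wrapped)
}
```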
@@ -78,6 +78,11 @@ public:

 frozen_mutation freeze(const mutation& m);

+struct frozen_mutation_and_schema {
+    frozen_mutation fm;
+    schema_ptr s;
+};
+
 // Can receive streamed_mutation in reversed order.
 class streamed_mutation_freezer {
     const schema& _schema;
@@ -63,6 +63,8 @@ static const std::map<application_state, sstring> application_state_names = {
     {application_state::SUPPORTED_FEATURES, "SUPPORTED_FEATURES"},
     {application_state::CACHE_HITRATES, "CACHE_HITRATES"},
     {application_state::SCHEMA_TABLES_VERSION, "SCHEMA_TABLES_VERSION"},
+    {application_state::RPC_READY, "RPC_READY"},
+    {application_state::VIEW_BACKLOG, "VIEW_BACKLOG"},
 };

 std::ostream& operator<<(std::ostream& os, const application_state& m) {
@@ -60,9 +60,9 @@ enum class application_state {
     SUPPORTED_FEATURES,
     CACHE_HITRATES,
     SCHEMA_TABLES_VERSION,
+    RPC_READY,
+    VIEW_BACKLOG,
     // pad to allow adding new states to existing cluster
     X4,
     X5,
     X6,
     X7,
     X8,
@@ -61,4 +61,16 @@ std::ostream& operator<<(std::ostream& os, const endpoint_state& x) {
     return os;
 }

+bool endpoint_state::is_cql_ready() const {
+    auto* app_state = get_application_state_ptr(application_state::RPC_READY);
+    if (!app_state) {
+        return false;
+    }
+    try {
+        return boost::lexical_cast<int>(app_state->value);
+    } catch (...) {
+        return false;
+    }
+}
+
 }
@@ -129,26 +129,8 @@ public:
         update_is_normal();
     }

-    void apply_application_state(application_state key, versioned_value&& value) {
-        auto&& e = _application_state[key];
-        if (e.version < value.version) {
-            e = std::move(value);
-        }
-        update_is_normal();
-    }
-
-    void apply_application_state(application_state key, const versioned_value& value) {
-        auto&& e = _application_state[key];
-        if (e.version < value.version) {
-            e = value;
-        }
-        update_is_normal();
-    }
-
-    void apply_application_state(const endpoint_state& es) {
-        for (auto&& e : es._application_state) {
-            apply_application_state(e.first, e.second);
-        }
+    void add_application_state(const endpoint_state& es) {
+        _application_state = es._application_state;
         update_is_normal();
     }
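For context on the rename: the removed apply_application_state overloads merged values key by key and only when the incoming version was strictly newer, while the surviving add_application_state overwrites the whole map. A standalone sketch of the two semantics (toy types, not the gossip code itself):

```cpp
#include <iostream>
#include <map>

struct toy_versioned_value { int version = 0; char payload = '?'; };
std::map<int, toy_versioned_value> state;

// Removed semantics: keep only strictly newer values, key by key.
void apply_one(int key, const toy_versioned_value& v) {
    auto& e = state[key];          // default-constructs version 0 if absent
    if (e.version < v.version) {
        e = v;
    }
}

// Surviving semantics: wholesale copy, no version comparison.
void add_all(const std::map<int, toy_versioned_value>& src) {
    state = src;
}

int main() {
    apply_one(1, {5, 'a'});
    apply_one(1, {3, 'b'});                 // ignored: version 3 < 5
    std::cout << state[1].payload << "\n";  // a
    add_all({{1, {3, 'b'}}});               // overwrites regardless of version
    std::cout << state[1].payload << "\n";  // b
}
```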
@@ -208,6 +190,8 @@ public:
         _is_normal = get_status() == sstring(versioned_value::STATUS_NORMAL);
     }

+    bool is_cql_ready() const;
+
     friend std::ostream& operator<<(std::ostream& os, const endpoint_state& x);
 };
@@ -930,7 +930,7 @@ void gossiper::make_random_gossip_digest(utils::chunked_vector<gossip_digest>& g
 future<> gossiper::replicate(inet_address ep, const endpoint_state& es) {
     return container().invoke_on_all([ep, es, orig = engine().cpu_id(), self = shared_from_this()] (gossiper& g) {
         if (engine().cpu_id() != orig) {
-            g.endpoint_state_map[ep].apply_application_state(es);
+            g.endpoint_state_map[ep].add_application_state(es);
         }
     });
 }

@@ -939,7 +939,7 @@ future<> gossiper::replicate(inet_address ep, const std::map<application_state,
     return container().invoke_on_all([ep, &src, &changed, orig = engine().cpu_id(), self = shared_from_this()] (gossiper& g) {
         if (engine().cpu_id() != orig) {
             for (auto&& key : changed) {
-                g.endpoint_state_map[ep].apply_application_state(key, src.at(key));
+                g.endpoint_state_map[ep].add_application_state(key, src.at(key));
             }
         }
     });

@@ -948,7 +948,7 @@ future<> gossiper::replicate(inet_address ep, const std::map<application_state,
 future<> gossiper::replicate(inet_address ep, application_state key, const versioned_value& value) {
     return container().invoke_on_all([ep, key, &value, orig = engine().cpu_id(), self = shared_from_this()] (gossiper& g) {
         if (engine().cpu_id() != orig) {
-            g.endpoint_state_map[ep].apply_application_state(key, value);
+            g.endpoint_state_map[ep].add_application_state(key, value);
         }
     });
 }

@@ -1175,11 +1175,13 @@ stdx::optional<endpoint_state> gossiper::get_endpoint_state_for_endpoint(inet_ad
     }
 }

-void gossiper::reset_endpoint_state_map() {
-    endpoint_state_map.clear();
+future<> gossiper::reset_endpoint_state_map() {
     _unreachable_endpoints.clear();
     _live_endpoints.clear();
     _live_endpoints_just_added.clear();
+    return container().invoke_on_all([] (gossiper& g) {
+        g.endpoint_state_map.clear();
+    });
 }

 std::unordered_map<inet_address, endpoint_state>& gms::gossiper::get_endpoint_states() {

@@ -1191,6 +1193,25 @@ bool gossiper::uses_host_id(inet_address endpoint) {
     get_application_state_ptr(endpoint, application_state::NET_VERSION);
 }

+bool gossiper::is_cql_ready(const inet_address& endpoint) const {
+    // Note:
+    // - New scylla node always send application_state::RPC_READY = false when
+    //   the node boots and send application_state::RPC_READY = true when cql
+    //   server is up
+    // - Old scylla node that does not support the application_state::RPC_READY
+    //   never has application_state::RPC_READY in the endpoint_state, we can
+    //   only think their cql server is up, so we return true here if
+    //   application_state::RPC_READY is not present
+    auto* eps = get_endpoint_state_for_endpoint_ptr(endpoint);
+    if (!eps) {
+        logger.debug("Node {} does not have RPC_READY application_state, return is_cql_ready=true", endpoint);
+        return true;
+    }
+    auto ready = eps->is_cql_ready();
+    logger.debug("Node {}: is_cql_ready={}", endpoint, ready);
+    return ready;
+}
+
 utils::UUID gossiper::get_host_id(inet_address endpoint) {
     if (!uses_host_id(endpoint)) {
         throw std::runtime_error(sprint("Host %s does not use new-style tokens!", endpoint));

@@ -1298,6 +1319,14 @@ void gossiper::mark_alive(inet_address addr, endpoint_state& local_state) {
 // Runs inside seastar::async context
 void gossiper::real_mark_alive(inet_address addr, endpoint_state& local_state) {
     logger.trace("marking as alive {}", addr);

+    // Do not mark a node with status shutdown as UP.
+    auto status = get_gossip_status(local_state);
+    if (status == sstring(versioned_value::SHUTDOWN)) {
+        logger.warn("Skip marking node {} with status = {} as UP", addr, status);
+        return;
+    }
+
     local_state.mark_alive();
     local_state.update_timestamp(); // prevents do_status_check from racing us and evicting if it was down > A_VERY_LONG_TIME

@@ -1319,7 +1348,7 @@ void gossiper::real_mark_alive(inet_address addr, endpoint_state& local_state) {
     }

     if (!_in_shadow_round) {
-        logger.info("InetAddress {} is now UP, status = {}", addr, get_gossip_status(local_state));
+        logger.info("InetAddress {} is now UP, status = {}", addr, status);
     }

     _subscribers.for_each([addr, local_state] (auto& subscriber) {

@@ -1662,6 +1691,7 @@ void gossiper::maybe_initialize_local_state(int generation_nbr) {
     }
 }

+// Runs inside seastar::async context
 void gossiper::add_saved_endpoint(inet_address ep) {
     if (ep == get_broadcast_address()) {
         logger.debug("Attempt to add self as saved endpoint");

@@ -1687,6 +1717,7 @@ void gossiper::add_saved_endpoint(inet_address ep) {
     }
     ep_state.mark_dead();
     endpoint_state_map[ep] = ep_state;
+    replicate(ep, ep_state).get();
     _unreachable_endpoints[ep] = now();
     logger.trace("Adding saved endpoint {} {}", ep, ep_state.get_heart_beat_state().get_generation());
 }

@@ -1924,6 +1955,7 @@ void gossiper::mark_as_shutdown(const inet_address& endpoint) {
     auto& ep_state = *es;
     ep_state.add_application_state(application_state::STATUS, storage_service_value_factory().shutdown(true));
     ep_state.get_heart_beat_state().force_highest_possible_version_unsafe();
+    replicate(endpoint, ep_state).get();
     mark_dead(endpoint, ep_state);
     get_local_failure_detector().force_conviction(endpoint);
 }
@@ -417,7 +417,7 @@ public:
     stdx::optional<endpoint_state> get_endpoint_state_for_endpoint(inet_address ep) const;

     // removes ALL endpoint states; should only be called after shadow gossip
-    void reset_endpoint_state_map();
+    future<> reset_endpoint_state_map();

     std::unordered_map<inet_address, endpoint_state>& get_endpoint_states();
@@ -548,6 +548,7 @@ public:
     bool is_seed(const inet_address& endpoint) const;
     bool is_shutdown(const inet_address& endpoint) const;
     bool is_normal(const inet_address& endpoint) const;
+    bool is_cql_ready(const inet_address& endpoint) const;
     bool is_silent_shutdown_state(const endpoint_state& ep_state) const;
     void mark_as_shutdown(const inet_address& endpoint);
     void force_newer_generation();
@@ -246,6 +246,9 @@ public:
         return versioned_value(hitrates);
     }

+    versioned_value cql_ready(bool value) {
+        return versioned_value(to_sstring(int(value)));
+    }
 };
 }; // class versioned_value
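cql_ready() encodes the flag as the string "0" or "1", and endpoint_state::is_cql_ready() (earlier in this diff) parses it back with boost::lexical_cast, treating a parse failure as "not ready". A standalone round-trip sketch of that encoding, with std::stoi standing in for lexical_cast:

```cpp
#include <iostream>
#include <string>

// Mirrors cql_ready(): the flag travels over gossip as "0" or "1".
std::string encode_cql_ready(bool value) {
    return std::to_string(int(value));
}

// Mirrors endpoint_state::is_cql_ready(): anything unparsable is treated
// as not ready (a missing state entirely is handled one level up, in
// gossiper::is_cql_ready(), where it means an old node and defaults to true).
bool decode_cql_ready(const std::string& value) {
    try {
        return std::stoi(value) != 0;
    } catch (...) {
        return false;
    }
}

int main() {
    std::cout << decode_cql_ready(encode_cql_ready(true)) << "\n"; // 1
    std::cout << decode_cql_ready("garbage") << "\n";              // 0
}
```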
@@ -42,4 +42,13 @@ class prepare_message {
     uint32_t dst_cpu_id;
 };

+enum class stream_reason : uint8_t {
+    unspecified,
+    bootstrap,
+    decommission,
+    removenode,
+    rebuild,
+    repair,
+};
+
 }
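stream_reason tags every stream plan with why data is moving, so the receiving side can treat, say, repair traffic differently from bootstrap traffic (for instance, parking repaired sstables in the staging area that feeds view updates, per the main.cc changes below). A hypothetical receiver-side dispatch over a mirror of the enum:

```cpp
#include <cstdint>
#include <iostream>

// Mirror of the enum added above.
enum class stream_reason : uint8_t {
    unspecified, bootstrap, decommission, removenode, rebuild, repair,
};

// Hypothetical dispatch: repair streams may need extra handling on the
// receiver (e.g. staging sstables for view update generation), while the
// other reasons apply incoming data directly.
const char* describe(stream_reason reason) {
    switch (reason) {
    case stream_reason::bootstrap:    return "bootstrap";
    case stream_reason::decommission: return "decommission";
    case stream_reason::removenode:   return "removenode";
    case stream_reason::rebuild:      return "rebuild";
    case stream_reason::repair:       return "repair (stage for views)";
    default:                          return "unspecified";
    }
}

int main() {
    std::cout << describe(stream_reason::repair) << "\n";
}
```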
idl/view.idl.hh (new file)
@@ -0,0 +1,29 @@
+/*
+ * Copyright 2018 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+namespace db {
+namespace view {
+class update_backlog {
+    size_t current;
+    size_t max;
+};
+}
+}
@@ -93,7 +93,6 @@ install -m644 build/*.service -Dt "$rprefix"/lib/systemd/system
 install -m644 dist/common/systemd/*.service -Dt "$rprefix"/lib/systemd/system
 install -m644 dist/common/systemd/*.timer -Dt "$rprefix"/lib/systemd/system
 install -m755 dist/common/scripts/* -Dt "$rprefix"/lib/scylla/
-install -m755 seastar/scripts/posix_net_conf.sh "$rprefix"/lib/scylla/
 install -m755 seastar/scripts/perftune.py -Dt "$rprefix"/lib/scylla/
 install -m755 seastar/dpdk/usertools/dpdk-devbind.py -Dt "$rprefix"/lib/scylla/
 install -m755 build/release/scylla -Dt "$rprefix/bin"

@@ -116,6 +115,7 @@ install -m755 -d "$root"/var/lib/scylla/
 install -m755 -d "$root"/var/lib/scylla/data
 install -m755 -d "$root"/var/lib/scylla/commitlog
 install -m755 -d "$root"/var/lib/scylla/hints
+install -m755 -d "$root"/var/lib/scylla/view_hints
 install -m755 -d "$root"/var/lib/scylla/coredump
 install -m755 -d "$root"/var/lib/scylla-housekeeping
 install -m755 -d "$rprefix"/lib/scylla/swagger-ui
Submodule libdeflate added at e7e54eab42
licenses/libdeflate-license.txt (new file)
@@ -0,0 +1,21 @@
+Copyright 2016 Eric Biggers
+
+Permission is hereby granted, free of charge, to any person
+obtaining a copy of this software and associated documentation files
+(the "Software"), to deal in the Software without restriction,
+including without limitation the rights to use, copy, modify, merge,
+publish, distribute, sublicense, and/or sell copies of the Software,
+and to permit persons to whom the Software is furnished to do so,
+subject to the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
@@ -119,9 +119,17 @@ insert_token_range_to_sorted_container_while_unwrapping(
         const dht::token& tok,
         dht::token_range_vector& ret) {
     if (prev_tok < tok) {
-        ret.emplace_back(
-            dht::token_range::bound(prev_tok, false),
-            dht::token_range::bound(tok, true));
+        auto pos = ret.end();
+        if (!ret.empty() && !std::prev(pos)->end()) {
+            // We inserted a wrapped range (a, b] previously as
+            // (-inf, b], (a, +inf). So now we insert in the next-to-last
+            // position to keep the last range (a, +inf) at the end.
+            pos = std::prev(pos);
+        }
+        ret.insert(pos,
+            dht::token_range{
+                dht::token_range::bound(prev_tok, false),
+                dht::token_range::bound(tok, true)});
     } else {
         ret.emplace_back(
            dht::token_range::bound(prev_tok, false),
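A standalone walk-through of the invariant the new comment describes: once a wrapped range (a, b] has been split into (-inf, b] and (a, +inf), later non-wrapped ranges must be inserted before the trailing open-ended range to keep the vector sorted. Toy ranges below, with nullopt standing in for an infinite bound:

```cpp
#include <iostream>
#include <optional>
#include <string>
#include <vector>

// Toy token range: a missing bound means infinity, mirroring the
// open-ended pieces produced when a wrapped range is split.
struct toy_range {
    std::optional<int> start, end;
};

int main() {
    // A wrapped range (70, 20] over the token ring was unwrapped into
    // (-inf, 20] followed by (70, +inf), with the open-ended part last.
    std::vector<toy_range> ret = {{std::nullopt, 20}, {70, std::nullopt}};

    // Inserting (30, 40] must land *before* (70, +inf) to keep the vector
    // sorted -- exactly what the std::prev(pos) adjustment above does.
    auto pos = ret.end();
    if (!ret.empty() && !std::prev(pos)->end) {
        pos = std::prev(pos);
    }
    ret.insert(pos, toy_range{30, 40});

    for (auto& r : ret) {
        std::cout << "(" << (r.start ? std::to_string(*r.start) : "-inf")
                  << ", " << (r.end ? std::to_string(*r.end) : "+inf") << "]\n";
    }
    // Prints: (-inf, 20]  (30, 40]  (70, +inf] -- still sorted.
}
```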
main.cc
@@ -33,6 +33,7 @@
 #include "service/storage_service.hh"
 #include "service/migration_manager.hh"
 #include "service/load_broadcaster.hh"
+#include "service/view_update_backlog_broker.hh"
 #include "streaming/stream_session.hh"
 #include "db/system_keyspace.hh"
 #include "db/system_distributed_keyspace.hh"

@@ -62,6 +63,7 @@
 #include "service/cache_hitrate_calculator.hh"
 #include "sstables/compaction_manager.hh"
 #include "sstables/sstables.hh"
+#include <db/view/view_update_from_staging_generator.hh>

 seastar::metrics::metric_groups app_metrics;

@@ -548,7 +550,7 @@ int main(int ac, char** av) {
             directories.insert(std::move(shard_dir));
         }
     }
-    boost::filesystem::path view_pending_updates_base_dir = boost::filesystem::path(db.local().get_config().data_file_directories()[0]) / "view_pending_updates";
+    boost::filesystem::path view_pending_updates_base_dir = boost::filesystem::path(db.local().get_config().view_hints_directory());
     sstring view_pending_updates_base_dir_str = view_pending_updates_base_dir.native();
     dirs.touch_and_lock(view_pending_updates_base_dir_str).get();
     directories.insert(view_pending_updates_base_dir_str);

@@ -618,7 +620,8 @@ int main(int ac, char** av) {
     service::storage_proxy::config spcfg;
     spcfg.hinted_handoff_enabled = hinted_handoff_enabled;
     spcfg.available_memory = memory::stats().total_memory();
-    proxy.start(std::ref(db), spcfg).get();
+    static db::view::node_update_backlog node_backlog(smp::count, 10ms);
+    proxy.start(std::ref(db), spcfg, std::ref(node_backlog)).get();
     // #293 - do not stop anything
     // engine().at_exit([&proxy] { return proxy.stop(); });
     supervisor::notify("starting migration manager");

@@ -647,6 +650,21 @@ int main(int ac, char** av) {

     supervisor::notify("loading sstables");
     distributed_loader::init_non_system_keyspaces(db, proxy).get();

+    static sharded<db::view::view_update_from_staging_generator> view_update_from_staging_generator;
+    view_update_from_staging_generator.start(std::ref(db), std::ref(proxy)).get();
+    supervisor::notify("discovering staging sstables");
+    db.invoke_on_all([] (database& db) {
+        for (auto& x : db.get_column_families()) {
+            table& t = *(x.second);
+            for (sstables::shared_sstable sst : *t.get_sstables()) {
+                if (sst->is_staging()) {
+                    view_update_from_staging_generator.local().register_staging_sstable(std::move(sst), t.shared_from_this());
+                }
+            }
+        }
+    }).get();
+
     // register connection drop notification to update cf's cache hit rate data
     db.invoke_on_all([] (database& db) {
         db.register_connection_drop_notifier(netw::get_local_messaging_service());

@@ -669,6 +687,11 @@ int main(int ac, char** av) {
             cl->delete_segments(std::move(paths));
         }
     }

+    db.invoke_on_all([&proxy] (database& db) {
+        db.get_compaction_manager().start();
+    }).get();
+
     // If the same sstable is shared by several shards, it cannot be
     // deleted until all shards decide to compact it. So we want to
     // start these compactions now. Note we start compacting only after

@@ -700,9 +723,21 @@ int main(int ac, char** av) {
     proxy.invoke_on_all([] (service::storage_proxy& p) {
         p.init_messaging_service();
     }).get();

     supervisor::notify("starting streaming service");
-    streaming::stream_session::init_streaming_service(db).get();
+    streaming::stream_session::init_streaming_service(db, sys_dist_ks, view_update_from_staging_generator).get();
     api::set_server_stream_manager(ctx).get();

+    supervisor::notify("starting hinted handoff manager");
+    if (hinted_handoff_enabled) {
+        db::hints::manager::rebalance(cfg->hints_directory()).get();
+    }
+    db::hints::manager::rebalance(cfg->view_hints_directory()).get();
+
+    proxy.invoke_on_all([] (service::storage_proxy& local_proxy) {
+        local_proxy.start_hints_manager(gms::get_local_gossiper().shared_from_this(), service::get_local_storage_service().shared_from_this());
+    }).get();
+
     supervisor::notify("starting messaging service");
     // Start handling REPAIR_CHECKSUM_RANGE messages
     netw::get_messaging_service().invoke_on_all([&db] (auto& ms) {

@@ -735,20 +770,29 @@ int main(int ac, char** av) {
     cf_cache_hitrate_calculator.start(std::ref(db), std::ref(cf_cache_hitrate_calculator)).get();
     engine().at_exit([&cf_cache_hitrate_calculator] { return cf_cache_hitrate_calculator.stop(); });
     cf_cache_hitrate_calculator.local().run_on(engine().cpu_id());

+    supervisor::notify("starting view update backlog broker");
+    static sharded<service::view_update_backlog_broker> view_backlog_broker;
+    view_backlog_broker.start(std::ref(proxy), std::ref(gms::get_gossiper())).get();
+    view_backlog_broker.invoke_on_all(&service::view_update_backlog_broker::start).get();
+    engine().at_exit([] {
+        return view_backlog_broker.stop();
+    });
+
     api::set_server_cache(ctx);
     gms::get_local_gossiper().wait_for_gossip_to_settle().get();
     api::set_server_gossip_settle(ctx).get();

-    supervisor::notify("starting hinted handoff manager");
-    if (hinted_handoff_enabled) {
-        db::hints::manager::rebalance(cfg->hints_directory()).get();
-    }
-    db::hints::manager::rebalance(cfg->data_file_directories()[0] + "/view_pending_updates").get();

     supervisor::notify("allow replaying hints");
     proxy.invoke_on_all([] (service::storage_proxy& local_proxy) {
-        local_proxy.start_hints_manager(gms::get_local_gossiper().shared_from_this(), service::get_local_storage_service().shared_from_this());
         local_proxy.allow_replaying_hints();
     }).get();

+    if (cfg->view_building()) {
+        supervisor::notify("Launching generate_mv_updates for non system tables");
+        view_update_from_staging_generator.invoke_on_all(&db::view::view_update_from_staging_generator::start).get();
+    }
+
     static sharded<db::view::view_builder> view_builder;
     if (cfg->view_building()) {
         supervisor::notify("starting the view builder");

@@ -786,6 +830,11 @@ int main(int ac, char** av) {
     engine().at_exit([] {
         return repair_shutdown(service::get_local_storage_service().db());
     });

+    engine().at_exit([] {
+        return view_update_from_staging_generator.stop();
+    });
+
     engine().at_exit([] {
         return service::get_local_storage_service().drain_on_shutdown();
     });
@@ -145,8 +145,8 @@ private:
 class encoding_stats_collector {
 private:
     min_max_tracker<api::timestamp_type> timestamp;
-    min_tracker<uint32_t> min_local_deletion_time;
-    min_tracker<uint32_t> min_ttl;
+    min_tracker<int32_t> min_local_deletion_time;
+    min_tracker<int32_t> min_ttl;

     void update_timestamp(api::timestamp_type ts) {
         if (ts != api::missing_timestamp) {

@@ -214,7 +214,9 @@ private:

     void update(const schema& s, const deletable_row& dr) {
         update(dr.marker());
-        update(dr.deleted_at().tomb());
+        row_tombstone row_tomb = dr.deleted_at();
+        update(row_tomb.regular());
+        update(row_tomb.tomb());
         update(s, dr.cells(), column_kind::regular_column);
     }
@@ -34,6 +34,7 @@
 #include "rpc/rpc.hh"
 #include "db/config.hh"
 #include "db/system_keyspace.hh"
+#include "db/view/view_update_backlog.hh"
 #include "dht/i_partitioner.hh"
 #include "range.hh"
 #include "frozen_schema.hh"

@@ -56,6 +57,7 @@
 #include "idl/partition_checksum.dist.hh"
 #include "idl/query.dist.hh"
 #include "idl/cache_temperature.dist.hh"
+#include "idl/view.dist.hh"
 #include "serializer_impl.hh"
 #include "serialization_visitors.hh"
 #include "idl/consistency_level.dist.impl.hh"

@@ -77,6 +79,7 @@
 #include "idl/cache_temperature.dist.impl.hh"
 #include "rpc/lz4_compressor.hh"
 #include "rpc/multi_algo_compressor_factory.hh"
+#include "idl/view.dist.impl.hh"
 #include "partition_range_compat.hh"
 #include "stdx.hh"
 #include <boost/range/adaptor/filtered.hpp>

@@ -135,12 +138,14 @@ struct messaging_service::rpc_protocol_wrapper : public rpc_protocol { using rpc
 // This should be integrated into messaging_service proper.
 class messaging_service::rpc_protocol_client_wrapper {
     std::unique_ptr<rpc_protocol::client> _p;
+    ::shared_ptr<seastar::tls::server_credentials> _credentials;
 public:
     rpc_protocol_client_wrapper(rpc_protocol& proto, rpc::client_options opts, ipv4_addr addr, ipv4_addr local = ipv4_addr())
             : _p(std::make_unique<rpc_protocol::client>(proto, std::move(opts), addr, local)) {
     }
     rpc_protocol_client_wrapper(rpc_protocol& proto, rpc::client_options opts, ipv4_addr addr, ipv4_addr local, ::shared_ptr<seastar::tls::server_credentials> c)
             : _p(std::make_unique<rpc_protocol::client>(proto, std::move(opts), seastar::tls::socket(c), addr, local))
+            , _credentials(c)
     {}
     auto get_stats() const { return _p->get_stats(); }
     future<> stop() { return _p->stop(); }

@@ -148,6 +153,19 @@ public:
         return _p->error();
     }
     operator rpc_protocol::client&() { return *_p; }

+    /**
+     * #3787 Must ensure we use the right type of socker. I.e. tls or not.
+     * See above, we retain credentials object so we here can know if we
+     * are tls or not.
+     */
+    template<typename Serializer, typename... Out>
+    future<rpc::sink<Out...>> make_stream_sink() {
+        if (_credentials) {
+            return _p->make_stream_sink<Serializer, Out...>(seastar::tls::socket(_credentials));
+        }
+        return _p->make_stream_sink<Serializer, Out...>();
+    }
 };

 struct messaging_service::rpc_protocol_server_wrapper : public rpc_protocol::server { using rpc_protocol::server::server; };

@@ -638,17 +656,18 @@ rpc::sink<int32_t> messaging_service::make_sink_for_stream_mutation_fragments(rp
 }

 future<rpc::sink<frozen_mutation_fragment>, rpc::source<int32_t>>
-messaging_service::make_sink_and_source_for_stream_mutation_fragments(utils::UUID schema_id, utils::UUID plan_id, utils::UUID cf_id, uint64_t estimated_partitions, msg_addr id) {
-    rpc_protocol::client& rpc_client = *get_rpc_client(messaging_verb::STREAM_MUTATION_FRAGMENTS, id);
-    return rpc_client.make_stream_sink<netw::serializer, frozen_mutation_fragment>().then([this, plan_id, schema_id, cf_id, estimated_partitions, &rpc_client] (rpc::sink<frozen_mutation_fragment> sink) mutable {
-        auto rpc_handler = rpc()->make_client<rpc::source<int32_t> (utils::UUID, utils::UUID, utils::UUID, uint64_t, rpc::sink<frozen_mutation_fragment>)>(messaging_verb::STREAM_MUTATION_FRAGMENTS);
-        return rpc_handler(rpc_client , plan_id, schema_id, cf_id, estimated_partitions, sink).then([sink] (rpc::source<int32_t> source) mutable {
+messaging_service::make_sink_and_source_for_stream_mutation_fragments(utils::UUID schema_id, utils::UUID plan_id, utils::UUID cf_id, uint64_t estimated_partitions, streaming::stream_reason reason, msg_addr id) {
+    auto wrapper = get_rpc_client(messaging_verb::STREAM_MUTATION_FRAGMENTS, id);
+    rpc_protocol::client& rpc_client = *wrapper;
+    return wrapper->make_stream_sink<netw::serializer, frozen_mutation_fragment>().then([this, plan_id, schema_id, cf_id, estimated_partitions, reason, &rpc_client] (rpc::sink<frozen_mutation_fragment> sink) mutable {
+        auto rpc_handler = rpc()->make_client<rpc::source<int32_t> (utils::UUID, utils::UUID, utils::UUID, uint64_t, streaming::stream_reason, rpc::sink<frozen_mutation_fragment>)>(messaging_verb::STREAM_MUTATION_FRAGMENTS);
+        return rpc_handler(rpc_client , plan_id, schema_id, cf_id, estimated_partitions, reason, sink).then([sink] (rpc::source<int32_t> source) mutable {
             return make_ready_future<rpc::sink<frozen_mutation_fragment>, rpc::source<int32_t>>(std::move(sink), std::move(source));
         });
     });
 }

-void messaging_service::register_stream_mutation_fragments(std::function<future<rpc::sink<int32_t>> (const rpc::client_info& cinfo, UUID plan_id, UUID schema_id, UUID cf_id, uint64_t estimated_partitions, rpc::source<frozen_mutation_fragment> source)>&& func) {
+void messaging_service::register_stream_mutation_fragments(std::function<future<rpc::sink<int32_t>> (const rpc::client_info& cinfo, UUID plan_id, UUID schema_id, UUID cf_id, uint64_t estimated_partitions, rpc::optional<streaming::stream_reason>, rpc::source<frozen_mutation_fragment> source)>&& func) {
     register_handler(this, messaging_verb::STREAM_MUTATION_FRAGMENTS, std::move(func));
 }
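The new arguments arrive as rpc::optional because an older peer simply omits trailing fields from the wire; the handler must still decode such messages and fall back to a default. A standalone sketch of that fallback logic, with std::optional standing in for rpc::optional:

```cpp
#include <cstdint>
#include <iostream>
#include <optional>

enum class stream_reason : uint8_t { unspecified, bootstrap, repair };

// Sketch of the handler-side fallback: an rpc::optional argument decodes
// to "empty" when an old peer did not send the trailing field at all.
stream_reason effective_reason(std::optional<stream_reason> from_wire) {
    return from_wire.value_or(stream_reason::unspecified);
}

int main() {
    std::cout << int(effective_reason(std::nullopt)) << "\n";          // old node: 0
    std::cout << int(effective_reason(stream_reason::repair)) << "\n"; // new node: 2
}
```

This is the same pattern applied to rpc::optional<bool> fragmented and rpc::optional<db::view::update_backlog> backlog in the hunks below: wire compatibility is preserved by only ever appending optional parameters.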
|
||||
|
||||
@@ -726,13 +745,13 @@ auto send_message_oneway_timeout(messaging_service* ms, Timeout timeout, messagi
|
||||
|
||||
// PREPARE_MESSAGE
|
||||
void messaging_service::register_prepare_message(std::function<future<streaming::prepare_message> (const rpc::client_info& cinfo,
|
||||
streaming::prepare_message msg, UUID plan_id, sstring description)>&& func) {
|
||||
streaming::prepare_message msg, UUID plan_id, sstring description, rpc::optional<streaming::stream_reason> reason)>&& func) {
|
||||
register_handler(this, messaging_verb::PREPARE_MESSAGE, std::move(func));
|
||||
}
|
||||
future<streaming::prepare_message> messaging_service::send_prepare_message(msg_addr id, streaming::prepare_message msg, UUID plan_id,
|
||||
sstring description) {
|
||||
sstring description, streaming::stream_reason reason) {
|
||||
return send_message<streaming::prepare_message>(this, messaging_verb::PREPARE_MESSAGE, id,
|
||||
std::move(msg), plan_id, std::move(description));
|
||||
std::move(msg), plan_id, std::move(description), reason);
|
||||
}
|
||||
|
||||
// PREPARE_DONE_MESSAGE
|
||||
@@ -745,12 +764,12 @@ future<> messaging_service::send_prepare_done_message(msg_addr id, UUID plan_id,
|
||||
}
|
||||
|
||||
// STREAM_MUTATION
|
||||
void messaging_service::register_stream_mutation(std::function<future<> (const rpc::client_info& cinfo, UUID plan_id, frozen_mutation fm, unsigned dst_cpu_id, rpc::optional<bool> fragmented)>&& func) {
|
||||
void messaging_service::register_stream_mutation(std::function<future<> (const rpc::client_info& cinfo, UUID plan_id, frozen_mutation fm, unsigned dst_cpu_id, rpc::optional<bool> fragmented, rpc::optional<streaming::stream_reason> reason)>&& func) {
|
||||
register_handler(this, messaging_verb::STREAM_MUTATION, std::move(func));
|
||||
}
|
||||
future<> messaging_service::send_stream_mutation(msg_addr id, UUID plan_id, frozen_mutation fm, unsigned dst_cpu_id, bool fragmented) {
|
||||
future<> messaging_service::send_stream_mutation(msg_addr id, UUID plan_id, frozen_mutation fm, unsigned dst_cpu_id, bool fragmented, streaming::stream_reason reason) {
|
||||
return send_message<void>(this, messaging_verb::STREAM_MUTATION, id,
|
||||
plan_id, std::move(fm), dst_cpu_id, fragmented);
|
||||
plan_id, std::move(fm), dst_cpu_id, fragmented, reason);
|
||||
}
|
||||
|
||||
// STREAM_MUTATION_DONE
|
||||
@@ -873,24 +892,24 @@ future<> messaging_service::send_counter_mutation(msg_addr id, clock_type::time_
     return send_message_timeout<void>(this, messaging_verb::COUNTER_MUTATION, std::move(id), timeout, std::move(fms), cl, std::move(trace_info));
 }

-void messaging_service::register_mutation_done(std::function<future<rpc::no_wait_type> (const rpc::client_info& cinfo, unsigned shard, response_id_type response_id)>&& func) {
+void messaging_service::register_mutation_done(std::function<future<rpc::no_wait_type> (const rpc::client_info& cinfo, unsigned shard, response_id_type response_id, rpc::optional<db::view::update_backlog> backlog)>&& func) {
     register_handler(this, netw::messaging_verb::MUTATION_DONE, std::move(func));
 }
 void messaging_service::unregister_mutation_done() {
     _rpc->unregister_handler(netw::messaging_verb::MUTATION_DONE);
 }
-future<> messaging_service::send_mutation_done(msg_addr id, unsigned shard, response_id_type response_id) {
-    return send_message_oneway(this, messaging_verb::MUTATION_DONE, std::move(id), std::move(shard), std::move(response_id));
+future<> messaging_service::send_mutation_done(msg_addr id, unsigned shard, response_id_type response_id, db::view::update_backlog backlog) {
+    return send_message_oneway(this, messaging_verb::MUTATION_DONE, std::move(id), std::move(shard), std::move(response_id), std::move(backlog));
 }

-void messaging_service::register_mutation_failed(std::function<future<rpc::no_wait_type> (const rpc::client_info& cinfo, unsigned shard, response_id_type response_id, size_t num_failed)>&& func) {
+void messaging_service::register_mutation_failed(std::function<future<rpc::no_wait_type> (const rpc::client_info& cinfo, unsigned shard, response_id_type response_id, size_t num_failed, rpc::optional<db::view::update_backlog> backlog)>&& func) {
     register_handler(this, netw::messaging_verb::MUTATION_FAILED, std::move(func));
 }
 void messaging_service::unregister_mutation_failed() {
     _rpc->unregister_handler(netw::messaging_verb::MUTATION_FAILED);
 }
-future<> messaging_service::send_mutation_failed(msg_addr id, unsigned shard, response_id_type response_id, size_t num_failed) {
-    return send_message_oneway(this, messaging_verb::MUTATION_FAILED, std::move(id), std::move(shard), std::move(response_id), num_failed);
+future<> messaging_service::send_mutation_failed(msg_addr id, unsigned shard, response_id_type response_id, size_t num_failed, db::view::update_backlog backlog) {
+    return send_message_oneway(this, messaging_verb::MUTATION_FAILED, std::move(id), std::move(shard), std::move(response_id), num_failed, std::move(backlog));
 }

 void messaging_service::register_read_data(std::function<future<foreign_ptr<lw_shared_ptr<query::result>>, cache_temperature> (const rpc::client_info&, rpc::opt_time_point t, query::read_command cmd, ::compat::wrapping_partition_range pr, rpc::optional<query::digest_algorithm> oda)>&& func) {
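The MUTATION_DONE/MUTATION_FAILED changes above piggyback a db::view::update_backlog on each reply, so a coordinator learns the replica's materialized-view update backlog without extra round trips and can throttle base-table writes accordingly. A toy sketch of how such a backlog value could be reduced to a delay; the struct layout and the ratio-to-delay mapping are invented for illustration, not Scylla's actual controller:

#include <algorithm>
#include <chrono>
#include <cstdint>

// Stand-in for db::view::update_backlog: current vs. maximum queued view
// updates (the real type's layout is an assumption here).
struct update_backlog {
    uint64_t current = 0;
    uint64_t max = 1;
    double relative() const { return max ? double(current) / double(max) : 0.0; }
};

// Hypothetical coordinator-side policy: keep the worst backlog reported by
// any replica and turn it into an artificial delay before admitting the
// next base-table write. The linear mapping is purely illustrative.
std::chrono::microseconds throttle_delay(update_backlog a, update_backlog b) {
    double worst = std::max(a.relative(), b.relative());
    return std::chrono::microseconds(static_cast<int64_t>(worst * 1000.0));
}

int main() {
    auto d = throttle_delay({800, 1000}, {100, 1000});
    return d.count() == 800 ? 0 : 1;  // the worst replica dominates
}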
@@ -35,6 +35,7 @@
 #include "repair/repair.hh"
 #include "tracing/tracing.hh"
 #include "digest_algorithm.hh"
+#include "streaming/stream_reason.hh"

 #include <seastar/net/tls.hh>

@@ -57,6 +58,10 @@ namespace db {
 class seed_provider_type;
 }

+namespace db::view {
+class update_backlog;
+}
+
 class frozen_mutation;
 class frozen_schema;
 class partition_checksum;
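Note that the header gains only a forward declaration of db::view::update_backlog rather than an include, which suffices because the type appears in function declarations alone; only the .cc file, which defines those functions, needs the complete type. A minimal illustration of why this compiles (names and paths are hypothetical):

// header.hh (sketch): an incomplete type is legal in a function declaration,
// even when passed by value; only the definition needs the complete type.
namespace db::view { class update_backlog; }

struct messaging_stub {
    void send_done(db::view::update_backlog backlog);  // declaration only
};

// impl.cc (sketch) would do:
//   #include "db/view/update_backlog.hh"  // full definition (path assumed)
//   void messaging_stub::send_done(db::view::update_backlog backlog) { /* ... */ }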
@@ -237,23 +242,23 @@ public:

     // Wrapper for PREPARE_MESSAGE verb
     void register_prepare_message(std::function<future<streaming::prepare_message> (const rpc::client_info& cinfo,
-            streaming::prepare_message msg, UUID plan_id, sstring description)>&& func);
+            streaming::prepare_message msg, UUID plan_id, sstring description, rpc::optional<streaming::stream_reason> reason)>&& func);
     future<streaming::prepare_message> send_prepare_message(msg_addr id, streaming::prepare_message msg, UUID plan_id,
-            sstring description);
+            sstring description, streaming::stream_reason);

     // Wrapper for PREPARE_DONE_MESSAGE verb
     void register_prepare_done_message(std::function<future<> (const rpc::client_info& cinfo, UUID plan_id, unsigned dst_cpu_id)>&& func);
     future<> send_prepare_done_message(msg_addr id, UUID plan_id, unsigned dst_cpu_id);

     // Wrapper for STREAM_MUTATION verb
-    void register_stream_mutation(std::function<future<> (const rpc::client_info& cinfo, UUID plan_id, frozen_mutation fm, unsigned dst_cpu_id, rpc::optional<bool>)>&& func);
-    future<> send_stream_mutation(msg_addr id, UUID plan_id, frozen_mutation fm, unsigned dst_cpu_id, bool fragmented);
+    void register_stream_mutation(std::function<future<> (const rpc::client_info& cinfo, UUID plan_id, frozen_mutation fm, unsigned dst_cpu_id, rpc::optional<bool>, rpc::optional<streaming::stream_reason>)>&& func);
+    future<> send_stream_mutation(msg_addr id, UUID plan_id, frozen_mutation fm, unsigned dst_cpu_id, bool fragmented, streaming::stream_reason reason);

     // Wrapper for STREAM_MUTATION_FRAGMENTS
     // The receiver of STREAM_MUTATION_FRAGMENTS sends a status code to the sender to report any error on the receiver side. The status code is of type int32_t: 0 means success, -1 means error, and other values are reserved for future use.
-    void register_stream_mutation_fragments(std::function<future<rpc::sink<int32_t>> (const rpc::client_info& cinfo, UUID plan_id, UUID schema_id, UUID cf_id, uint64_t estimated_partitions, rpc::source<frozen_mutation_fragment> source)>&& func);
+    void register_stream_mutation_fragments(std::function<future<rpc::sink<int32_t>> (const rpc::client_info& cinfo, UUID plan_id, UUID schema_id, UUID cf_id, uint64_t estimated_partitions, rpc::optional<streaming::stream_reason> reason_opt, rpc::source<frozen_mutation_fragment> source)>&& func);
     rpc::sink<int32_t> make_sink_for_stream_mutation_fragments(rpc::source<frozen_mutation_fragment>& source);
-    future<rpc::sink<frozen_mutation_fragment>, rpc::source<int32_t>> make_sink_and_source_for_stream_mutation_fragments(utils::UUID schema_id, utils::UUID plan_id, utils::UUID cf_id, uint64_t estimated_partitions, msg_addr id);
+    future<rpc::sink<frozen_mutation_fragment>, rpc::source<int32_t>> make_sink_and_source_for_stream_mutation_fragments(utils::UUID schema_id, utils::UUID plan_id, utils::UUID cf_id, uint64_t estimated_partitions, streaming::stream_reason reason, msg_addr id);

     void register_stream_mutation_done(std::function<future<> (const rpc::client_info& cinfo, UUID plan_id, dht::token_range_vector ranges, UUID cf_id, unsigned dst_cpu_id)>&& func);
     future<> send_stream_mutation_done(msg_addr id, UUID plan_id, dht::token_range_vector ranges, UUID cf_id, unsigned dst_cpu_id);
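The STREAM_MUTATION_FRAGMENTS exchange declared above is bidirectional: fragments flow from sender to receiver through an rpc::sink<frozen_mutation_fragment>, while the receiver reports back over an int32_t channel using the status-code convention documented in the comment. A schematic of the receiver's side of that convention, with plain functions and containers standing in for the rpc sink/source machinery (all names hypothetical):

#include <cstdint>
#include <vector>

// The status-code convention from the comment above.
constexpr int32_t stream_ok    = 0;   // receiver applied all fragments
constexpr int32_t stream_error = -1;  // receiver failed; sender should abort

// Hypothetical receiver loop: drain the fragment source, then send exactly
// one status code back through the int32_t sink. Fragments are modeled as
// ints; the returned value is what would be pushed to the sink.
int32_t receive_fragments(const std::vector<int>& fragments,
                          bool (*apply_fragment)(int)) {
    for (int f : fragments) {
        if (!apply_fragment(f)) {
            return stream_error;
        }
    }
    return stream_ok;
}

int main() {
    auto always_ok = [](int) { return true; };
    return receive_fragments({1, 2, 3}, always_ok) == stream_ok ? 0 : 1;
}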
@@ -316,14 +321,14 @@ public:
     future<> send_counter_mutation(msg_addr id, clock_type::time_point timeout, std::vector<frozen_mutation> fms, db::consistency_level cl, stdx::optional<tracing::trace_info> trace_info = std::experimental::nullopt);

     // Wrapper for MUTATION_DONE
-    void register_mutation_done(std::function<future<rpc::no_wait_type> (const rpc::client_info& cinfo, unsigned shard, response_id_type response_id)>&& func);
+    void register_mutation_done(std::function<future<rpc::no_wait_type> (const rpc::client_info& cinfo, unsigned shard, response_id_type response_id, rpc::optional<db::view::update_backlog> backlog)>&& func);
     void unregister_mutation_done();
-    future<> send_mutation_done(msg_addr id, unsigned shard, response_id_type response_id);
+    future<> send_mutation_done(msg_addr id, unsigned shard, response_id_type response_id, db::view::update_backlog backlog);

     // Wrapper for MUTATION_FAILED
-    void register_mutation_failed(std::function<future<rpc::no_wait_type> (const rpc::client_info& cinfo, unsigned shard, response_id_type response_id, size_t num_failed)>&& func);
+    void register_mutation_failed(std::function<future<rpc::no_wait_type> (const rpc::client_info& cinfo, unsigned shard, response_id_type response_id, size_t num_failed, rpc::optional<db::view::update_backlog> backlog)>&& func);
     void unregister_mutation_failed();
-    future<> send_mutation_failed(msg_addr id, unsigned shard, response_id_type response_id, size_t num_failed);
+    future<> send_mutation_failed(msg_addr id, unsigned shard, response_id_type response_id, size_t num_failed, db::view::update_backlog backlog);

     // Wrapper for READ_DATA
     // Note: WTH is future<foreign_ptr<lw_shared_ptr<query::result>>
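Taken together, every verb touched by this diff evolves the same way: the handler registration gains a trailing rpc::optional<T> (so messages from pre-upgrade nodes, which lack the field, still decode), while the matching send_* wrapper takes T by value (so upgraded senders always supply it). A compact restatement of that shape, with std::optional standing in for rpc::optional and all names illustrative:

#include <cstdio>
#include <optional>

// The signature-evolution shape used for every verb in this diff, reduced to
// its essentials: the handler's new trailing parameter is optional (older
// senders omit it); the sender's is not.
struct extra { int value = 0; };  // stands in for stream_reason / update_backlog

void handle_verb(int old_arg, std::optional<extra> new_arg) {
    extra e = new_arg.value_or(extra{});
    std::printf("old=%d new=%d\n", old_arg, e.value);
}

int main() {
    handle_verb(1, std::nullopt);   // framed like a pre-upgrade sender
    handle_verb(2, extra{42});      // framed like an upgraded sender
}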
Some files were not shown because too many files have changed in this diff.