Compare commits: scylla-2.3...mv

133 Commits (author, date, and message columns omitted; SHA1s listed in order):

0ec3ff0611, 4a435e6f66, 03f2f8633b, 8cb242ab0b, dfd1e1229e, 4df982fe07, 7a1bcd9ad3, f55a2fe3a7, c126b00793, a7dd02309f,
27bf20aa3f, 7b018f6fd6, a08fba19e3, 2a0b720102, 1cf5653f89, 4d3d32f465, dd083122f9, f4caa418ff, 3bcc123000, 07a429e837,
7d6af5da3a, b32f94d31e, 77ad085393, 4b57fc9aea, a9c465d7d2, c37aff419e, 81a03db955, a8e795a16e, 33d7de0805, 8084ce3a8e,
3cb7ddaf68, ed1d0b6839, f42eaff75e, a99acbc376, f7a2f15935, 6aec9e711f, bf2645c616, 48b07ba5d3, 8c03c1e2ce, 2ffb621271,
c236a96d7d, 59a30f0684, eafd16266d, a36b1f1967, d0f39ea31d, 3289642223, e0c16c4585, 24ca2d85c6, 99b5cf1f92, 084c824d12,
fafcacc31c, 677991f353, 01bd34d117, 78ecf2740a, d26b35b058, 0b148d0070, a45c3aa8c7, 19e7493d5b, 3194ce16b3, 569437aaa5,
5ee09e5f3b, b464b66e90, f3da043230, 8eba27829a, 28d064e7c0, 5fd9c3b9d4, f73340e6f8, da53ea7a13, db2c029f7a, b24eb5c11d,
30109a693b, 80d1d5017f, 0db5419ec5, c45e291084, 6c54a97320, 2bfdc2d781, 199f9196e9, 92700c6758, bf330a99f0, 569176aad1,
6bd71015e7, 2259eee97c, a497edcbda, 81fba73e9d, bb4d361cf6, e9dffc753c, 8153df7684, 2dc78a6ca2, 6adc78d690, e69f2c361c,
972ce88601, a83c66b402, 99fb754221, f2132c61bd, daccc10a06, fa6db21fea, 2401115e14, 9d537cb449, 5b4da4d4bd, 83bc72b0ab,
1650d37dae, 4b5826ff5a, f828c5c4f3, 6cffb164d6, 82f76b0947, c87a961667, fd8b7efb99, 4050a4b24e, f4b406cce1, 8eccff1723,
27cb41ddeb, fc629b9ca6, 0a1aec2bd6, 074be4d4e8, 6c6ffaee71, 450985dfee, e1efda8b0c, 054514a47a, d1e8c32b2e, b0e8547569,
da19508317, d485e1c1d8, b51c70ef69, d76cfa77b1, aa0b41f0b2, c26a304fbb, 4d3cc2867a, 4995a8c568, 03753cc431, e1a867cbe3,
b4879206fb, 826a237c2e, 19b76bf75b
@@ -47,6 +47,14 @@ atomic_cell atomic_cell::make_live(const abstract_type& type, api::timestamp_typ
);
}

atomic_cell atomic_cell::make_live(const abstract_type& type, api::timestamp_type timestamp, ser::buffer_view<bytes_ostream::fragment_iterator> value, atomic_cell::collection_member cm) {
auto& imr_data = type.imr_state();
return atomic_cell(
imr_data.type_info(),
imr_object_type::make(data::cell::make_live(imr_data.type_info(), timestamp, value, bool(cm)), &imr_data.lsa_migrator())
);
}

atomic_cell atomic_cell::make_live(const abstract_type& type, api::timestamp_type timestamp, bytes_view value,
gc_clock::time_point expiry, gc_clock::duration ttl, atomic_cell::collection_member cm) {
auto& imr_data = type.imr_state();
@@ -56,6 +64,15 @@ atomic_cell atomic_cell::make_live(const abstract_type& type, api::timestamp_typ
);
}

atomic_cell atomic_cell::make_live(const abstract_type& type, api::timestamp_type timestamp, ser::buffer_view<bytes_ostream::fragment_iterator> value,
gc_clock::time_point expiry, gc_clock::duration ttl, atomic_cell::collection_member cm) {
auto& imr_data = type.imr_state();
return atomic_cell(
imr_data.type_info(),
imr_object_type::make(data::cell::make_live(imr_data.type_info(), timestamp, value, expiry, ttl, bool(cm)), &imr_data.lsa_migrator())
);
}

atomic_cell atomic_cell::make_live_counter_update(api::timestamp_type timestamp, int64_t value) {
auto& imr_data = no_type_imr_descriptor();
return atomic_cell(
@@ -187,7 +204,24 @@ size_t atomic_cell_or_collection::external_memory_usage(const abstract_type& t)
return 0;
}
auto ctx = data::cell::context(_data.get(), t.imr_state().type_info());
return data::cell::structure::serialized_object_size(_data.get(), ctx);

auto view = data::cell::structure::make_view(_data.get(), ctx);
auto flags = view.get<data::cell::tags::flags>();

size_t external_value_size = 0;
if (flags.get<data::cell::tags::external_data>()) {
if (flags.get<data::cell::tags::collection>()) {
external_value_size = get_collection_mutation_view(_data.get()).data.size_bytes();
} else {
auto cell_view = data::cell::atomic_cell_view(t.imr_state().type_info(), view);
external_value_size = cell_view.value_size();
}
// Add overhead of chunk headers. The last one is a special case.
external_value_size += (external_value_size - 1) / data::cell::maximum_external_chunk_length * data::cell::external_chunk_overhead;
external_value_size += data::cell::external_last_chunk_overhead;
}
return data::cell::structure::serialized_object_size(_data.get(), ctx)
+ imr_object_type::size_overhead + external_value_size;
}

std::ostream& operator<<(std::ostream& os, const atomic_cell_or_collection& c) {
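The rewritten external_memory_usage() above now also accounts for externally stored cell values, including the per-chunk header overhead: every chunk before the last one pays `external_chunk_overhead`, and the final chunk pays `external_last_chunk_overhead`. As a worked example of that arithmetic, here is a minimal standalone sketch; the chunk constants are illustrative assumptions, not the real `data::cell` values:

```cpp
// Sketch of the chunk-overhead arithmetic used in external_memory_usage() above.
// The constants are assumptions for illustration only.
#include <cstddef>
#include <cstdio>

constexpr size_t max_chunk_length = 8192;  // assumed chunk payload size
constexpr size_t chunk_overhead = 16;      // assumed per-chunk header cost
constexpr size_t last_chunk_overhead = 8;  // assumed cost of the final chunk

// value_size is assumed to be at least 1, as in the original code path
// (the branch only runs when the cell actually has external data).
size_t external_footprint(size_t value_size) {
    // (value_size - 1) / max_chunk_length counts the chunks before the last one.
    size_t overhead = (value_size - 1) / max_chunk_length * chunk_overhead
                      + last_chunk_overhead;
    return value_size + overhead;
}

int main() {
    // 20000 bytes span two full chunks plus a final partial chunk:
    // overhead = 2 * 16 + 8 = 40, so the footprint is 20040 bytes.
    std::printf("%zu\n", external_footprint(20000));
}
```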
@@ -34,6 +34,8 @@
#include "data/schema_info.hh"
#include "imr/utils.hh"

#include "serializer.hh"

class abstract_type;
class collection_type_impl;

@@ -186,6 +188,8 @@ public:
static atomic_cell make_dead(api::timestamp_type timestamp, gc_clock::time_point deletion_time);
static atomic_cell make_live(const abstract_type& type, api::timestamp_type timestamp, bytes_view value,
collection_member = collection_member::no);
static atomic_cell make_live(const abstract_type& type, api::timestamp_type timestamp, ser::buffer_view<bytes_ostream::fragment_iterator> value,
collection_member = collection_member::no);
static atomic_cell make_live(const abstract_type& type, api::timestamp_type timestamp, const bytes& value,
collection_member cm = collection_member::no) {
return make_live(type, timestamp, bytes_view(value), cm);
@@ -193,6 +197,8 @@ public:
static atomic_cell make_live_counter_update(api::timestamp_type timestamp, int64_t value);
static atomic_cell make_live(const abstract_type&, api::timestamp_type timestamp, bytes_view value,
gc_clock::time_point expiry, gc_clock::duration ttl, collection_member = collection_member::no);
static atomic_cell make_live(const abstract_type&, api::timestamp_type timestamp, ser::buffer_view<bytes_ostream::fragment_iterator> value,
gc_clock::time_point expiry, gc_clock::duration ttl, collection_member = collection_member::no);
static atomic_cell make_live(const abstract_type& type, api::timestamp_type timestamp, const bytes& value,
gc_clock::time_point expiry, gc_clock::duration ttl, collection_member cm = collection_member::no)
{

@@ -64,7 +64,7 @@ class cache_flat_mutation_reader final : public flat_mutation_reader::impl {

end_of_stream
};
lw_shared_ptr<partition_snapshot> _snp;
partition_snapshot_ptr _snp;
position_in_partition::tri_compare _position_cmp;

query::clustering_key_filter_ranges _ck_ranges;
@@ -129,7 +129,7 @@ public:
dht::decorated_key dk,
query::clustering_key_filter_ranges&& crr,
lw_shared_ptr<read_context> ctx,
lw_shared_ptr<partition_snapshot> snp,
partition_snapshot_ptr snp,
row_cache& cache)
: flat_mutation_reader::impl(std::move(s))
, _snp(std::move(snp))
@@ -149,9 +149,6 @@ public:
cache_flat_mutation_reader(const cache_flat_mutation_reader&) = delete;
cache_flat_mutation_reader(cache_flat_mutation_reader&&) = delete;
virtual future<> fill_buffer(db::timeout_clock::time_point timeout) override;
virtual ~cache_flat_mutation_reader() {
maybe_merge_versions(_snp, _lsa_manager.region(), _lsa_manager.read_section());
}
virtual void next_partition() override {
clear_buffer_to_next_partition();
if (is_buffer_empty()) {
@@ -667,7 +664,7 @@ inline flat_mutation_reader make_cache_flat_mutation_reader(schema_ptr s,
query::clustering_key_filter_ranges crr,
row_cache& cache,
lw_shared_ptr<cache::read_context> ctx,
lw_shared_ptr<partition_snapshot> snp)
partition_snapshot_ptr snp)
{
return make_flat_mutation_reader<cache::cache_flat_mutation_reader>(
std::move(s), std::move(dk), std::move(crr), std::move(ctx), std::move(snp), cache);

@@ -22,6 +22,7 @@

#pragma once

#include <functional>
#include "keys.hh"
#include "schema.hh"
#include "range.hh"
@@ -43,22 +44,20 @@ bound_kind invert_kind(bound_kind k);
int32_t weight(bound_kind k);

class bound_view {
const static thread_local clustering_key _empty_prefix;
std::reference_wrapper<const clustering_key_prefix> _prefix;
bound_kind _kind;
public:
const static thread_local clustering_key empty_prefix;
const clustering_key_prefix& prefix;
bound_kind kind;
bound_view(const clustering_key_prefix& prefix, bound_kind kind)
: prefix(prefix)
, kind(kind)
: _prefix(prefix)
, _kind(kind)
{ }
bound_view(const bound_view& other) noexcept = default;
bound_view& operator=(const bound_view& other) noexcept {
if (this != &other) {
this->~bound_view();
new (this) bound_view(other);
}
return *this;
}
bound_view& operator=(const bound_view& other) noexcept = default;

bound_kind kind() const { return _kind; }
const clustering_key_prefix& prefix() const { return _prefix; }

struct tri_compare {
// To make it assignable and to avoid taking a schema_ptr, we
// wrap the schema reference.
@@ -82,13 +81,13 @@ public:
return d1 < d2 ? w1 - (w1 <= 0) : -(w2 - (w2 <= 0));
}
int operator()(const bound_view b, const clustering_key_prefix& p) const {
return operator()(b.prefix, weight(b.kind), p, 0);
return operator()(b._prefix, weight(b._kind), p, 0);
}
int operator()(const clustering_key_prefix& p, const bound_view b) const {
return operator()(p, 0, b.prefix, weight(b.kind));
return operator()(p, 0, b._prefix, weight(b._kind));
}
int operator()(const bound_view b1, const bound_view b2) const {
return operator()(b1.prefix, weight(b1.kind), b2.prefix, weight(b2.kind));
return operator()(b1._prefix, weight(b1._kind), b2._prefix, weight(b2._kind));
}
};
struct compare {
@@ -101,26 +100,26 @@ public:
return _cmp(p1, w1, p2, w2) < 0;
}
bool operator()(const bound_view b, const clustering_key_prefix& p) const {
return operator()(b.prefix, weight(b.kind), p, 0);
return operator()(b._prefix, weight(b._kind), p, 0);
}
bool operator()(const clustering_key_prefix& p, const bound_view b) const {
return operator()(p, 0, b.prefix, weight(b.kind));
return operator()(p, 0, b._prefix, weight(b._kind));
}
bool operator()(const bound_view b1, const bound_view b2) const {
return operator()(b1.prefix, weight(b1.kind), b2.prefix, weight(b2.kind));
return operator()(b1._prefix, weight(b1._kind), b2._prefix, weight(b2._kind));
}
};
bool equal(const schema& s, const bound_view other) const {
return kind == other.kind && prefix.equal(s, other.prefix);
return _kind == other._kind && _prefix.get().equal(s, other._prefix.get());
}
bool adjacent(const schema& s, const bound_view other) const {
return invert_kind(other.kind) == kind && prefix.equal(s, other.prefix);
return invert_kind(other._kind) == _kind && _prefix.get().equal(s, other._prefix.get());
}
static bound_view bottom() {
return {empty_prefix, bound_kind::incl_start};
return {_empty_prefix, bound_kind::incl_start};
}
static bound_view top() {
return {empty_prefix, bound_kind::incl_end};
return {_empty_prefix, bound_kind::incl_end};
}
template<template<typename> typename R>
GCC6_CONCEPT( requires Range<R, clustering_key_prefix_view> )
@@ -144,13 +143,13 @@ public:
template<template<typename> typename R>
GCC6_CONCEPT( requires Range<R, clustering_key_prefix_view> )
static stdx::optional<typename R<clustering_key_prefix_view>::bound> to_range_bound(const bound_view& bv) {
if (&bv.prefix == &empty_prefix) {
if (&bv._prefix.get() == &_empty_prefix) {
return {};
}
bool inclusive = bv.kind != bound_kind::excl_end && bv.kind != bound_kind::excl_start;
return {typename R<clustering_key_prefix_view>::bound(bv.prefix.view(), inclusive)};
bool inclusive = bv._kind != bound_kind::excl_end && bv._kind != bound_kind::excl_start;
return {typename R<clustering_key_prefix_view>::bound(bv._prefix.get().view(), inclusive)};
}
friend std::ostream& operator<<(std::ostream& out, const bound_view& b) {
return out << "{bound: prefix=" << b.prefix << ", kind=" << b.kind << "}";
return out << "{bound: prefix=" << b._prefix.get() << ", kind=" << b._kind << "}";
}
};
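The bound_view hunks above hide the old public `prefix`/`kind` members behind a private `std::reference_wrapper<const clustering_key_prefix>` plus accessors, which is what lets the hand-rolled destroy-and-placement-new assignment operator be replaced with `= default`. The following is a minimal standalone sketch (toy types, not Scylla code) of that distinction:

```cpp
// Sketch: a class holding a plain reference member loses copy assignment,
// while std::reference_wrapper can be rebound, so a defaulted operator= works.
#include <functional>
#include <type_traits>

struct with_ref {
    const int& value;  // plain reference member
    int kind;
};

struct with_wrapper {
    std::reference_wrapper<const int> value;  // rebindable reference
    int kind;
};

static_assert(!std::is_copy_assignable<with_ref>::value,
              "a reference member implicitly deletes operator=");
static_assert(std::is_copy_assignable<with_wrapper>::value,
              "reference_wrapper rebinds, so the defaulted operator= is usable");

int main() {
    int a = 1, b = 2;
    with_wrapper x{std::cref(a), 0};
    with_wrapper y{std::cref(b), 1};
    x = y;  // fine: x.value now refers to b
    return x.value.get() == 2 ? 0 : 1;
}
```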
@@ -1,67 +0,0 @@
/*
* Copyright (C) 2016 ScyllaDB
*/

/*
* This file is part of Scylla.
*
* Scylla is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Scylla is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
*/

#pragma once

#include "query-request.hh"
#include <experimental/optional>

// Wraps ring_position so it is compatible with old-style C++: default constructor,
// stateless comparators, yada yada
class compatible_ring_position {
const schema* _schema = nullptr;
// optional to supply a default constructor, no more
std::experimental::optional<dht::ring_position> _rp;
public:
compatible_ring_position() noexcept = default;
compatible_ring_position(const schema& s, const dht::ring_position& rp)
: _schema(&s), _rp(rp) {
}
compatible_ring_position(const schema& s, dht::ring_position&& rp)
: _schema(&s), _rp(std::move(rp)) {
}
const dht::token& token() const {
return _rp->token();
}
friend int tri_compare(const compatible_ring_position& x, const compatible_ring_position& y) {
return x._rp->tri_compare(*x._schema, *y._rp);
}
friend bool operator<(const compatible_ring_position& x, const compatible_ring_position& y) {
return tri_compare(x, y) < 0;
}
friend bool operator<=(const compatible_ring_position& x, const compatible_ring_position& y) {
return tri_compare(x, y) <= 0;
}
friend bool operator>(const compatible_ring_position& x, const compatible_ring_position& y) {
return tri_compare(x, y) > 0;
}
friend bool operator>=(const compatible_ring_position& x, const compatible_ring_position& y) {
return tri_compare(x, y) >= 0;
}
friend bool operator==(const compatible_ring_position& x, const compatible_ring_position& y) {
return tri_compare(x, y) == 0;
}
friend bool operator!=(const compatible_ring_position& x, const compatible_ring_position& y) {
return tri_compare(x, y) != 0;
}
};

compatible_ring_position_view.hh (new file, 64 lines)
@@ -0,0 +1,64 @@
/*
* Copyright (C) 2016 ScyllaDB
*/

/*
* This file is part of Scylla.
*
* Scylla is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Scylla is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
*/

#pragma once

#include "query-request.hh"
#include <optional>

// Wraps ring_position_view so it is compatible with old-style C++: default
// constructor, stateless comparators, yada yada.
class compatible_ring_position_view {
const schema* _schema = nullptr;
// Optional to supply a default constructor, no more.
std::optional<dht::ring_position_view> _rpv;
public:
constexpr compatible_ring_position_view() = default;
compatible_ring_position_view(const schema& s, dht::ring_position_view rpv)
: _schema(&s), _rpv(rpv) {
}
const dht::ring_position_view& position() const {
return *_rpv;
}
friend int tri_compare(const compatible_ring_position_view& x, const compatible_ring_position_view& y) {
return dht::ring_position_tri_compare(*x._schema, *x._rpv, *y._rpv);
}
friend bool operator<(const compatible_ring_position_view& x, const compatible_ring_position_view& y) {
return tri_compare(x, y) < 0;
}
friend bool operator<=(const compatible_ring_position_view& x, const compatible_ring_position_view& y) {
return tri_compare(x, y) <= 0;
}
friend bool operator>(const compatible_ring_position_view& x, const compatible_ring_position_view& y) {
return tri_compare(x, y) > 0;
}
friend bool operator>=(const compatible_ring_position_view& x, const compatible_ring_position_view& y) {
return tri_compare(x, y) >= 0;
}
friend bool operator==(const compatible_ring_position_view& x, const compatible_ring_position_view& y) {
return tri_compare(x, y) == 0;
}
friend bool operator!=(const compatible_ring_position_view& x, const compatible_ring_position_view& y) {
return tri_compare(x, y) != 0;
}
};

@@ -303,6 +303,7 @@ scylla_tests = [
'tests/imr_test',
'tests/partition_data_test',
'tests/reusable_buffer_test',
'tests/multishard_writer_test',
]

perf_tests = [
@@ -629,6 +630,7 @@ scylla_core = (['database.cc',
'utils/arch/powerpc/crc32-vpmsum/crc32_wrapper.cc',
'querier.cc',
'data/cell.cc',
'multishard_writer.cc',
]
+ [Antlr3Grammar('cql3/Cql.g')]
+ [Thrift('interface/cassandra.thrift', 'Cassandra')]

@@ -92,6 +92,10 @@ public:
_p.apply(t);
}

void accept_static_cell(column_id id, atomic_cell cell) {
return accept_static_cell(id, atomic_cell_view(cell));
}

virtual void accept_static_cell(column_id id, atomic_cell_view cell) override {
const column_mapping_entry& col = _visited_column_mapping.static_column_at(id);
const column_definition* def = _p_schema.get_column_definition(col.name());
@@ -119,6 +123,10 @@ public:
_current_row = &r;
}

void accept_row_cell(column_id id, atomic_cell cell) {
return accept_row_cell(id, atomic_cell_view(cell));
}

virtual void accept_row_cell(column_id id, atomic_cell_view cell) override {
const column_mapping_entry& col = _visited_column_mapping.regular_column_at(id);
const column_definition* def = _p_schema.get_column_definition(col.name());

@@ -206,6 +206,30 @@ query_processor::query_processor(service::storage_proxy& proxy, distributed<data
_cql_stats.secondary_index_rows_read,
sm::description("Counts a total number of rows read during CQL requests performed using secondary indexes.")),

// read requests that required ALLOW FILTERING
sm::make_derive(
"filtered_read_requests",
_cql_stats.filtered_reads,
sm::description("Counts a total number of CQL read requests that required ALLOW FILTERING. See filtered_rows_read_total to compare how many rows needed to be filtered.")),

// rows read with filtering enabled (because ALLOW FILTERING was required)
sm::make_derive(
"filtered_rows_read_total",
_cql_stats.filtered_rows_read_total,
sm::description("Counts a total number of rows read during CQL requests that required ALLOW FILTERING. See filtered_rows_matched_total and filtered_rows_dropped_total for information how accurate filtering queries are.")),

// rows read with filtering enabled and accepted by the filter
sm::make_derive(
"filtered_rows_matched_total",
_cql_stats.filtered_rows_matched_total,
sm::description("Counts a number of rows read during CQL requests that required ALLOW FILTERING and accepted by the filter. Number similar to filtered_rows_read_total indicates that filtering is accurate.")),

// rows read with filtering enabled and rejected by the filter
sm::make_derive(
"filtered_rows_dropped_total",
[this]() {return _cql_stats.filtered_rows_read_total - _cql_stats.filtered_rows_matched_total;},
sm::description("Counts a number of rows read during CQL requests that required ALLOW FILTERING and dropped by the filter. Number similar to filtered_rows_read_total indicates that filtering is not accurate and might cause performance degradation.")),

sm::make_derive(
"authorized_prepared_statements_cache_evictions",
[] { return authorized_prepared_statements_cache::shard_stats().authorized_prepared_statements_cache_evictions; },

@@ -95,7 +95,32 @@ public:
uint32_t size() const override {
return uint32_t(get_column_defs().size());
}

bool has_unrestricted_components(const schema& schema) const;

virtual bool needs_filtering(const schema& schema) const;
};

template<>
inline bool primary_key_restrictions<partition_key>::has_unrestricted_components(const schema& schema) const {
return size() < schema.partition_key_size();
}

template<>
inline bool primary_key_restrictions<clustering_key>::has_unrestricted_components(const schema& schema) const {
return size() < schema.clustering_key_size();
}

template<>
inline bool primary_key_restrictions<partition_key>::needs_filtering(const schema& schema) const {
return !empty() && !is_on_token() && (has_unrestricted_components(schema) || is_contains() || is_slice());
}

template<>
inline bool primary_key_restrictions<clustering_key>::needs_filtering(const schema& schema) const {
// Currently only overloaded single_column_primary_key_restrictions will require ALLOW FILTERING
return false;
}

}
}
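The new `needs_filtering` specializations above decide, per key kind, whether a restriction set can be served by key lookups alone; for the partition key the rule reduces to one boolean expression. A standalone sketch (toy flags, not the real restriction classes) spelling that expression out on two example states:

```cpp
// Sketch of the partition-key rule defined just above: a non-token,
// non-empty partition-key restriction needs filtering when it leaves key
// components unrestricted, or uses CONTAINS, or is a slice.
struct toy_pk_restrictions {
    bool empty;
    bool on_token;
    bool has_unrestricted_components;
    bool contains;
    bool slice;
};

bool pk_needs_filtering(const toy_pk_restrictions& r) {
    return !r.empty && !r.on_token
        && (r.has_unrestricted_components || r.contains || r.slice);
}

int main() {
    // Every partition-key column restricted by EQ -> no filtering needed.
    bool a = pk_needs_filtering({false, false, false, false, false});
    // Only some partition-key columns restricted -> filtering needed.
    bool b = pk_needs_filtering({false, false, true, false, false});
    return (!a && b) ? 0 : 1;
}
```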
@@ -314,6 +314,10 @@ public:
fail(unimplemented::cause::LEGACY_COMPOSITE_KEYS); // not 100% correct...
}

const single_column_restrictions::restrictions_map& restrictions() const {
return _restrictions->restrictions();
}

virtual bool has_supporting_index(const secondary_index::secondary_index_manager& index_manager) const override {
return _restrictions->has_supporting_index(index_manager);
}
@@ -349,6 +353,8 @@ public:
_restrictions->restrictions() | boost::adaptors::map_values,
[&] (auto&& r) { return r->is_satisfied_by(schema, key, ckey, cells, options, now); });
}

virtual bool needs_filtering(const schema& schema) const override;
};

template<>
@@ -406,6 +412,29 @@ single_column_primary_key_restrictions<clustering_key_prefix>::bounds_ranges(con
return bounds;
}

template<>
bool single_column_primary_key_restrictions<partition_key>::needs_filtering(const schema& schema) const {
return primary_key_restrictions<partition_key>::needs_filtering(schema);
}

template<>
bool single_column_primary_key_restrictions<clustering_key>::needs_filtering(const schema& schema) const {
// Restrictions currently need filtering in three cases:
// 1. any of them is a CONTAINS restriction
// 2. restrictions do not form a contiguous prefix (i.e. there are gaps in it)
// 3. a SLICE restriction isn't on a last place
column_id position = 0;
for (const auto& restriction : _restrictions->restrictions() | boost::adaptors::map_values) {
if (restriction->is_contains() || position != restriction->get_column_def().id) {
return true;
}
if (!restriction->is_slice()) {
position = restriction->get_column_def().id + 1;
}
}
return false;
}

}
}
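The clustering-key `needs_filtering()` above encodes three rules: a CONTAINS restriction, a gap in the restricted prefix, or a slice that is not the last restriction all force filtering. A standalone sketch (toy types, not the real restriction classes) with two worked cases:

```cpp
// Sketch of the contiguous-prefix rule from needs_filtering() above.
#include <vector>

struct toy_restriction {
    unsigned column_position; // position of the clustering column in the key
    bool is_contains;
    bool is_slice;
};

bool needs_filtering(const std::vector<toy_restriction>& restrictions) {
    unsigned expected_position = 0;
    for (const auto& r : restrictions) {
        if (r.is_contains || r.column_position != expected_position) {
            return true; // CONTAINS, a gap, or something after a slice
        }
        if (!r.is_slice) {
            expected_position = r.column_position + 1;
        }
    }
    return false;
}

int main() {
    // c0 = ? AND c1 > ? : contiguous prefix ending in a slice -> no filtering.
    bool a = needs_filtering({{0, false, false}, {1, false, true}});
    // c0 = ? AND c2 = ? : c1 is skipped, so the prefix has a gap -> filtering.
    bool b = needs_filtering({{0, false, false}, {2, false, false}});
    return (a == false && b == true) ? 0 : 1;
}
```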
@@ -93,6 +93,8 @@ public:
}

virtual bool is_supported_by(const secondary_index::index& index) const = 0;
using abstract_restriction::is_satisfied_by;
virtual bool is_satisfied_by(bytes_view data, const query_options& options) const = 0;
#if 0
/**
* Check if this type of restriction is supported by the specified index.
@@ -166,6 +168,7 @@ public:
const row& cells,
const query_options& options,
gc_clock::time_point now) const override;
virtual bool is_satisfied_by(bytes_view data, const query_options& options) const override;

#if 0
@Override
@@ -201,6 +204,7 @@ public:
const row& cells,
const query_options& options,
gc_clock::time_point now) const override;
virtual bool is_satisfied_by(bytes_view data, const query_options& options) const override;

#if 0
@Override
@@ -356,6 +360,7 @@ public:
const row& cells,
const query_options& options,
gc_clock::time_point now) const override;
virtual bool is_satisfied_by(bytes_view data, const query_options& options) const override;
};

// This holds CONTAINS, CONTAINS_KEY, and map[key] = value restrictions because we might want to have any combination of them.
@@ -477,6 +482,7 @@ public:
const row& cells,
const query_options& options,
gc_clock::time_point now) const override;
virtual bool is_satisfied_by(bytes_view data, const query_options& options) const override;

#if 0
private List<ByteBuffer> keys(const query_options& options) {
@@ -23,6 +23,7 @@
#include <boost/range/algorithm/transform.hpp>
#include <boost/range/algorithm.hpp>
#include <boost/range/adaptors.hpp>
#include <boost/algorithm/cxx11/any_of.hpp>

#include "statement_restrictions.hh"
#include "single_column_primary_key_restrictions.hh"
@@ -36,6 +37,8 @@
namespace cql3 {
namespace restrictions {

static logging::logger rlogger("restrictions");

using boost::adaptors::filtered;
using boost::adaptors::transformed;

@@ -202,7 +205,7 @@ statement_restrictions::statement_restrictions(database& db,
throw exceptions::invalid_request_exception(sprint("restriction '%s' is only supported in materialized view creation", relation->to_string()));
}
} else {
add_restriction(relation->to_restriction(db, schema, bound_names));
add_restriction(relation->to_restriction(db, schema, bound_names), for_view, allow_filtering);
}
}
}
@@ -214,11 +217,11 @@ statement_restrictions::statement_restrictions(database& db,
|| _nonprimary_key_restrictions->has_supporting_index(sim);

// At this point, the select statement if fully constructed, but we still have a few things to validate
process_partition_key_restrictions(has_queriable_index, for_view);
process_partition_key_restrictions(has_queriable_index, for_view, allow_filtering);

// Some but not all of the partition key columns have been specified;
// hence we need turn these restrictions into index expressions.
if (_uses_secondary_indexing) {
if (_uses_secondary_indexing || _partition_key_restrictions->needs_filtering(*_schema)) {
_index_restrictions.push_back(_partition_key_restrictions);
}

@@ -234,13 +237,14 @@ statement_restrictions::statement_restrictions(database& db,
}
}

process_clustering_columns_restrictions(has_queriable_index, select_a_collection, for_view);
process_clustering_columns_restrictions(has_queriable_index, select_a_collection, for_view, allow_filtering);

// Covers indexes on the first clustering column (among others).
if (_is_key_range && has_queriable_clustering_column_index)
_uses_secondary_indexing = true;
if (_is_key_range && has_queriable_clustering_column_index) {
_uses_secondary_indexing = true;
}

if (_uses_secondary_indexing) {
if (_uses_secondary_indexing || _clustering_columns_restrictions->needs_filtering(*_schema)) {
_index_restrictions.push_back(_clustering_columns_restrictions);
} else if (_clustering_columns_restrictions->is_contains()) {
fail(unimplemented::cause::INDEXES);
@@ -269,31 +273,48 @@ statement_restrictions::statement_restrictions(database& db,
uses_secondary_indexing = true;
#endif
}
// Even if uses_secondary_indexing is false at this point, we'll still have to use one if
// there is restrictions not covered by the PK.

if (!_nonprimary_key_restrictions->empty()) {
_uses_secondary_indexing = true;
if (has_queriable_index) {
_uses_secondary_indexing = true;
} else if (!allow_filtering) {
throw exceptions::invalid_request_exception("Cannot execute this query as it might involve data filtering and "
"thus may have unpredictable performance. If you want to execute "
"this query despite the performance unpredictability, use ALLOW FILTERING");
}
_index_restrictions.push_back(_nonprimary_key_restrictions);
}

if (_uses_secondary_indexing && !for_view) {
if (_uses_secondary_indexing && !(for_view || allow_filtering)) {
validate_secondary_index_selections(selects_only_static_columns);
}
}

void statement_restrictions::add_restriction(::shared_ptr<restriction> restriction) {
void statement_restrictions::add_restriction(::shared_ptr<restriction> restriction, bool for_view, bool allow_filtering) {
if (restriction->is_multi_column()) {
_clustering_columns_restrictions = _clustering_columns_restrictions->merge_to(_schema, restriction);
} else if (restriction->is_on_token()) {
_partition_key_restrictions = _partition_key_restrictions->merge_to(_schema, restriction);
} else {
add_single_column_restriction(::static_pointer_cast<single_column_restriction>(restriction));
add_single_column_restriction(::static_pointer_cast<single_column_restriction>(restriction), for_view, allow_filtering);
}
}

void statement_restrictions::add_single_column_restriction(::shared_ptr<single_column_restriction> restriction) {
void statement_restrictions::add_single_column_restriction(::shared_ptr<single_column_restriction> restriction, bool for_view, bool allow_filtering) {
auto& def = restriction->get_column_def();
if (def.is_partition_key()) {
// A SELECT query may not request a slice (range) of partition keys
// without using token(). This is because there is no way to do this
// query efficiently: mumur3 turns a contiguous range of partition
// keys into tokens all over the token space.
// However, in a SELECT statement used to define a materialized view,
// such a slice is fine - it is used to check whether individual
// partitions, match, and does not present a performance problem.
assert(!restriction->is_on_token());
if (restriction->is_slice() && !for_view && !allow_filtering) {
throw exceptions::invalid_request_exception(
"Only EQ and IN relation are supported on the partition key (unless you use the token() function or allow filtering)");
}
_partition_key_restrictions = _partition_key_restrictions->merge_to(_schema, restriction);
} else if (def.is_clustering_key()) {
_clustering_columns_restrictions = _clustering_columns_restrictions->merge_to(_schema, restriction);
@@ -312,7 +333,7 @@ const std::vector<::shared_ptr<restrictions>>& statement_restrictions::index_res
return _index_restrictions;
}

void statement_restrictions::process_partition_key_restrictions(bool has_queriable_index, bool for_view) {
void statement_restrictions::process_partition_key_restrictions(bool has_queriable_index, bool for_view, bool allow_filtering) {
// If there is a queriable index, no special condition are required on the other restrictions.
// But we still need to know 2 things:
// - If we don't have a queriable index, is the query ok
@@ -321,39 +342,32 @@ void statement_restrictions::process_partition_key_restrictions(bool has_queriab
// components must have a EQ. Only the last partition key component can be in IN relation.
if (_partition_key_restrictions->is_on_token()) {
_is_key_range = true;
} else if (has_partition_key_unrestricted_components()) {
if (!_partition_key_restrictions->empty() && !for_view) {
if (!has_queriable_index) {
throw exceptions::invalid_request_exception(sprint("Partition key parts: %s must be restricted as other parts are",
join(", ", get_partition_key_unrestricted_components())));
}
}

} else if (_partition_key_restrictions->has_unrestricted_components(*_schema)) {
_is_key_range = true;
_uses_secondary_indexing = has_queriable_index;
}
if (_partition_key_restrictions->is_slice() && !_partition_key_restrictions->is_on_token() && !for_view) {
// A SELECT query may not request a slice (range) of partition keys
// without using token(). This is because there is no way to do this
// query efficiently: mumur3 turns a contiguous range of partition
// keys into tokens all over the token space.
// However, in a SELECT statement used to define a materialized view,
// such a slice is fine - it is used to check whether individual
// partitions, match, and does not present a performance problem.
throw exceptions::invalid_request_exception(
"Only EQ and IN relation are supported on the partition key (unless you use the token() function)");

if (_partition_key_restrictions->needs_filtering(*_schema)) {
if (!allow_filtering && !for_view && !has_queriable_index) {
throw exceptions::invalid_request_exception("Cannot execute this query as it might involve data filtering and "
"thus may have unpredictable performance. If you want to execute "
"this query despite the performance unpredictability, use ALLOW FILTERING");
}
_is_key_range = true;
_uses_secondary_indexing = has_queriable_index;
}

}

bool statement_restrictions::has_partition_key_unrestricted_components() const {
return _partition_key_restrictions->size() < _schema->partition_key_size();
return _partition_key_restrictions->has_unrestricted_components(*_schema);
}

bool statement_restrictions::has_unrestricted_clustering_columns() const {
return _clustering_columns_restrictions->size() < _schema->clustering_key_size();
return _clustering_columns_restrictions->has_unrestricted_components(*_schema);
}

void statement_restrictions::process_clustering_columns_restrictions(bool has_queriable_index, bool select_a_collection, bool for_view) {
void statement_restrictions::process_clustering_columns_restrictions(bool has_queriable_index, bool select_a_collection, bool for_view, bool allow_filtering) {
if (!has_clustering_columns_restriction()) {
return;
}
@@ -362,38 +376,36 @@ void statement_restrictions::process_clustering_columns_restrictions(bool has_qu
throw exceptions::invalid_request_exception(
"Cannot restrict clustering columns by IN relations when a collection is selected by the query");
}
if (_clustering_columns_restrictions->is_contains() && !has_queriable_index) {
if (_clustering_columns_restrictions->is_contains() && !has_queriable_index && !allow_filtering) {
throw exceptions::invalid_request_exception(
"Cannot restrict clustering columns by a CONTAINS relation without a secondary index");
"Cannot restrict clustering columns by a CONTAINS relation without a secondary index or filtering");
}

auto clustering_columns_iter = _schema->clustering_key_columns().begin();

for (auto&& restricted_column : _clustering_columns_restrictions->get_column_defs()) {
const column_definition* clustering_column = &(*clustering_columns_iter);
++clustering_columns_iter;

if (clustering_column != restricted_column && !for_view) {
if (!has_queriable_index) {
throw exceptions::invalid_request_exception(sprint(
"PRIMARY KEY column \"%s\" cannot be restricted as preceding column \"%s\" is not restricted",
restricted_column->name_as_text(), clustering_column->name_as_text()));
if (has_clustering_columns_restriction() && _clustering_columns_restrictions->needs_filtering(*_schema)) {
if (has_queriable_index) {
_uses_secondary_indexing = true;
} else if (!allow_filtering && !for_view) {
auto clustering_columns_iter = _schema->clustering_key_columns().begin();
for (auto&& restricted_column : _clustering_columns_restrictions->get_column_defs()) {
const column_definition* clustering_column = &(*clustering_columns_iter);
++clustering_columns_iter;
if (clustering_column != restricted_column) {
throw exceptions::invalid_request_exception(sprint(
"PRIMARY KEY column \"%s\" cannot be restricted as preceding column \"%s\" is not restricted",
restricted_column->name_as_text(), clustering_column->name_as_text()));
}
}

_uses_secondary_indexing = true; // handle gaps and non-keyrange cases.
break;
}
}

if (_clustering_columns_restrictions->is_contains()) {
_uses_secondary_indexing = true;
}
}

dht::partition_range_vector statement_restrictions::get_partition_key_ranges(const query_options& options) const {
if (_partition_key_restrictions->empty()) {
return {dht::partition_range::make_open_ended_both_sides()};
}
if (_partition_key_restrictions->needs_filtering(*_schema)) {
return {dht::partition_range::make_open_ended_both_sides()};
}
return _partition_key_restrictions->bounds_ranges(options);
}

@@ -401,18 +413,30 @@ std::vector<query::clustering_range> statement_restrictions::get_clustering_boun
if (_clustering_columns_restrictions->empty()) {
return {query::clustering_range::make_open_ended_both_sides()};
}
// TODO(sarna): For filtering to work, clustering range is not bounded at all. For filtering to work faster,
// the biggest clustering prefix restriction should be used here.
if (_clustering_columns_restrictions->needs_filtering(*_schema)) {
return {query::clustering_range::make_open_ended_both_sides()};
}
return _clustering_columns_restrictions->bounds_ranges(options);
}

bool statement_restrictions::need_filtering() {
bool statement_restrictions::need_filtering() const {
uint32_t number_of_restricted_columns = 0;
for (auto&& restrictions : _index_restrictions) {
number_of_restricted_columns += restrictions->size();
}

if (_partition_key_restrictions->is_multi_column() || _clustering_columns_restrictions->is_multi_column()) {
// TODO(sarna): Implement ALLOW FILTERING support for multi-column restrictions - return false for now
// in order to ensure backwards compatibility
return false;
}

return number_of_restricted_columns > 1
|| (number_of_restricted_columns == 0 && has_clustering_columns_restriction())
|| (number_of_restricted_columns != 0 && _nonprimary_key_restrictions->has_multiple_contains());
|| (number_of_restricted_columns == 0 && _partition_key_restrictions->empty() && !_clustering_columns_restrictions->empty())
|| (number_of_restricted_columns != 0 && _nonprimary_key_restrictions->has_multiple_contains())
|| (number_of_restricted_columns != 0 && !_uses_secondary_indexing);
}

void statement_restrictions::validate_secondary_index_selections(bool selects_only_static_columns) {
@@ -430,6 +454,33 @@ void statement_restrictions::validate_secondary_index_selections(bool selects_on
}
}

const single_column_restrictions::restrictions_map& statement_restrictions::get_single_column_partition_key_restrictions() const {
static single_column_restrictions::restrictions_map empty;
auto single_restrictions = dynamic_pointer_cast<single_column_primary_key_restrictions<partition_key>>(_partition_key_restrictions);
if (!single_restrictions) {
if (dynamic_pointer_cast<initial_key_restrictions<partition_key>>(_partition_key_restrictions)) {
return empty;
}
throw std::runtime_error("statement restrictions for multi-column partition key restrictions are not implemented yet");
}
return single_restrictions->restrictions();
}

/**
* @return clustering key restrictions split into single column restrictions (e.g. for filtering support).
*/
const single_column_restrictions::restrictions_map& statement_restrictions::get_single_column_clustering_key_restrictions() const {
static single_column_restrictions::restrictions_map empty;
auto single_restrictions = dynamic_pointer_cast<single_column_primary_key_restrictions<clustering_key>>(_clustering_columns_restrictions);
if (!single_restrictions) {
if (dynamic_pointer_cast<initial_key_restrictions<clustering_key>>(_clustering_columns_restrictions)) {
return empty;
}
throw std::runtime_error("statement restrictions for multi-column partition key restrictions are not implemented yet");
}
return single_restrictions->restrictions();
}

static std::optional<atomic_cell_value_view> do_get_value(const schema& schema,
const column_definition& cdef,
const partition_key& key,
@@ -482,6 +533,14 @@ bool single_column_restriction::EQ::is_satisfied_by(const schema& schema,
return false;
}

bool single_column_restriction::EQ::is_satisfied_by(bytes_view data, const query_options& options) const {
if (_column_def.type->is_counter()) {
fail(unimplemented::cause::COUNTERS);
}
auto operand = value(options);
return operand && _column_def.type->compare(*operand, data) == 0;
}

bool single_column_restriction::IN::is_satisfied_by(const schema& schema,
const partition_key& key,
const clustering_key_prefix& ckey,
@@ -503,6 +562,16 @@ bool single_column_restriction::IN::is_satisfied_by(const schema& schema,
});
}

bool single_column_restriction::IN::is_satisfied_by(bytes_view data, const query_options& options) const {
if (_column_def.type->is_counter()) {
fail(unimplemented::cause::COUNTERS);
}
auto operands = values(options);
return boost::algorithm::any_of(operands, [this, &data] (const bytes_opt& operand) {
return operand && _column_def.type->compare(*operand, data) == 0;
});
}

static query::range<bytes_view> to_range(const term_slice& slice, const query_options& options) {
using range_type = query::range<bytes_view>;
auto extract_bound = [&] (statements::bound bound) -> stdx::optional<range_type::bound> {
@@ -538,6 +607,13 @@ bool single_column_restriction::slice::is_satisfied_by(const schema& schema,
});
}

bool single_column_restriction::slice::is_satisfied_by(bytes_view data, const query_options& options) const {
if (_column_def.type->is_counter()) {
fail(unimplemented::cause::COUNTERS);
}
return to_range(_slice, options).contains(data, _column_def.type->as_tri_comparator());
}

bool single_column_restriction::contains::is_satisfied_by(const schema& schema,
const partition_key& key,
const clustering_key_prefix& ckey,
@@ -680,6 +756,11 @@ bool single_column_restriction::contains::is_satisfied_by(const schema& schema,
return true;
}

bool single_column_restriction::contains::is_satisfied_by(bytes_view data, const query_options& options) const {
//TODO(sarna): Deserialize & return. It would be nice to deduplicate, is_satisfied_by above is rather long
fail(unimplemented::cause::INDEXES);
}

bool token_restriction::EQ::is_satisfied_by(const schema& schema,
const partition_key& key,
const clustering_key_prefix& ckey,
@@ -120,8 +120,8 @@ public:
bool for_view = false,
bool allow_filtering = false);
private:
void add_restriction(::shared_ptr<restriction> restriction);
void add_single_column_restriction(::shared_ptr<single_column_restriction> restriction);
void add_restriction(::shared_ptr<restriction> restriction, bool for_view, bool allow_filtering);
void add_single_column_restriction(::shared_ptr<single_column_restriction> restriction, bool for_view, bool allow_filtering);
public:
bool uses_function(const sstring& ks_name, const sstring& function_name) const;

@@ -175,7 +175,7 @@ public:
*/
bool has_unrestricted_clustering_columns() const;
private:
void process_partition_key_restrictions(bool has_queriable_index, bool for_view);
void process_partition_key_restrictions(bool has_queriable_index, bool for_view, bool allow_filtering);

/**
* Returns the partition key components that are not restricted.
@@ -190,7 +190,7 @@ private:
* @param select_a_collection <code>true</code> if the query should return a collection column
* @throws InvalidRequestException if the request is invalid
*/
void process_clustering_columns_restrictions(bool has_queriable_index, bool select_a_collection, bool for_view);
void process_clustering_columns_restrictions(bool has_queriable_index, bool select_a_collection, bool for_view, bool allow_filtering);

/**
* Returns the <code>Restrictions</code> for the specified type of columns.
@@ -358,7 +358,7 @@ public:
* Checks if the query need to use filtering.
* @return <code>true</code> if the query need to use filtering, <code>false</code> otherwise.
*/
bool need_filtering();
bool need_filtering() const;

void validate_secondary_index_selections(bool selects_only_static_columns);

@@ -399,6 +399,16 @@ public:
const single_column_restrictions::restrictions_map& get_non_pk_restriction() const {
return _nonprimary_key_restrictions->restrictions();
}

/**
* @return partition key restrictions split into single column restrictions (e.g. for filtering support).
*/
const single_column_restrictions::restrictions_map& get_single_column_partition_key_restrictions() const;

/**
* @return clustering key restrictions split into single column restrictions (e.g. for filtering support).
*/
const single_column_restrictions::restrictions_map& get_single_column_clustering_key_restrictions() const;
};

}
@@ -330,93 +330,86 @@ std::unique_ptr<result_set> result_set_builder::build() {
|
||||
return std::move(_result_set);
|
||||
}
|
||||
|
||||
result_set_builder::visitor::visitor(
|
||||
cql3::selection::result_set_builder& builder, const schema& s,
|
||||
const selection& selection)
|
||||
: _builder(builder), _schema(s), _selection(selection), _row_count(0) {
|
||||
}
|
||||
bool result_set_builder::restrictions_filter::operator()(const selection& selection,
|
||||
const std::vector<bytes>& partition_key,
|
||||
const std::vector<bytes>& clustering_key,
|
||||
const query::result_row_view& static_row,
|
||||
const query::result_row_view& row) const {
|
||||
static logging::logger rlogger("restrictions_filter");
|
||||
|
||||
void result_set_builder::visitor::add_value(const column_definition& def,
|
||||
query::result_row_view::iterator_type& i) {
|
||||
if (def.type->is_multi_cell()) {
|
||||
auto cell = i.next_collection_cell();
|
||||
if (!cell) {
|
||||
_builder.add_empty();
|
||||
return;
|
||||
}
|
||||
_builder.add_collection(def, cell->linearize());
|
||||
} else {
|
||||
auto cell = i.next_atomic_cell();
|
||||
if (!cell) {
|
||||
_builder.add_empty();
|
||||
return;
|
||||
}
|
||||
_builder.add(def, *cell);
|
||||
if (_current_pratition_key_does_not_match || _current_static_row_does_not_match) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
void result_set_builder::visitor::accept_new_partition(const partition_key& key,
|
||||
uint32_t row_count) {
|
||||
_partition_key = key.explode(_schema);
|
||||
_row_count = row_count;
|
||||
}
|
||||
|
||||
void result_set_builder::visitor::accept_new_partition(uint32_t row_count) {
|
||||
_row_count = row_count;
|
||||
}
|
||||
|
||||
void result_set_builder::visitor::accept_new_row(const clustering_key& key,
|
||||
const query::result_row_view& static_row,
|
||||
const query::result_row_view& row) {
|
||||
_clustering_key = key.explode(_schema);
|
||||
accept_new_row(static_row, row);
|
||||
}
|
||||
|
||||
void result_set_builder::visitor::accept_new_row(
|
||||
const query::result_row_view& static_row,
|
||||
const query::result_row_view& row) {
|
||||
auto static_row_iterator = static_row.iterator();
|
||||
auto row_iterator = row.iterator();
|
||||
_builder.new_row();
|
||||
for (auto&& def : _selection.get_columns()) {
|
||||
switch (def->kind) {
|
||||
case column_kind::partition_key:
|
||||
_builder.add(_partition_key[def->component_index()]);
|
||||
break;
|
||||
case column_kind::clustering_key:
|
||||
if (_clustering_key.size() > def->component_index()) {
|
||||
_builder.add(_clustering_key[def->component_index()]);
|
||||
auto non_pk_restrictions_map = _restrictions->get_non_pk_restriction();
|
||||
auto partition_key_restrictions_map = _restrictions->get_single_column_partition_key_restrictions();
|
||||
auto clustering_key_restrictions_map = _restrictions->get_single_column_clustering_key_restrictions();
|
||||
for (auto&& cdef : selection.get_columns()) {
|
||||
switch (cdef->kind) {
|
||||
case column_kind::static_column:
|
||||
// fallthrough
|
||||
case column_kind::regular_column:
|
||||
if (cdef->type->is_multi_cell()) {
|
||||
rlogger.debug("Multi-cell filtering is not implemented yet", cdef->name_as_text());
|
||||
} else {
|
||||
_builder.add({});
|
||||
auto cell_iterator = (cdef->kind == column_kind::static_column) ? static_row_iterator : row_iterator;
|
||||
auto cell = cell_iterator.next_atomic_cell();
|
||||
|
||||
auto restr_it = non_pk_restrictions_map.find(cdef);
|
||||
if (restr_it == non_pk_restrictions_map.end()) {
|
||||
continue;
|
||||
}
|
||||
restrictions::single_column_restriction& restriction = *restr_it->second;
|
||||
|
||||
bool regular_restriction_matches;
|
||||
if (cell) {
|
||||
regular_restriction_matches = cell->value().with_linearized([&restriction](bytes_view data) {
|
||||
return restriction.is_satisfied_by(data, cql3::query_options({ }));
|
||||
});
|
||||
} else {
|
||||
regular_restriction_matches = restriction.is_satisfied_by(bytes(), cql3::query_options({ }));
|
||||
}
|
||||
if (!regular_restriction_matches) {
|
||||
_current_static_row_does_not_match = (cdef->kind == column_kind::static_column);
|
||||
return false;
|
||||
}
|
||||
|
||||
}
|
||||
break;
|
||||
case column_kind::regular_column:
|
||||
add_value(*def, row_iterator);
|
||||
case column_kind::partition_key: {
|
||||
auto restr_it = partition_key_restrictions_map.find(cdef);
|
||||
if (restr_it == partition_key_restrictions_map.end()) {
|
||||
continue;
|
||||
}
|
||||
restrictions::single_column_restriction& restriction = *restr_it->second;
|
||||
const bytes& value_to_check = partition_key[cdef->id];
|
||||
bool pk_restriction_matches = restriction.is_satisfied_by(value_to_check, cql3::query_options({ }));
|
||||
if (!pk_restriction_matches) {
|
||||
_current_pratition_key_does_not_match = true;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
break;
|
||||
case column_kind::static_column:
|
||||
add_value(*def, static_row_iterator);
|
||||
case column_kind::clustering_key: {
|
||||
auto restr_it = clustering_key_restrictions_map.find(cdef);
|
||||
if (restr_it == clustering_key_restrictions_map.end()) {
|
||||
continue;
|
||||
}
|
||||
restrictions::single_column_restriction& restriction = *restr_it->second;
|
||||
const bytes& value_to_check = clustering_key[cdef->id];
|
||||
bool pk_restriction_matches = restriction.is_satisfied_by(value_to_check, cql3::query_options({ }));
|
||||
if (!pk_restriction_matches) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
break;
|
||||
default:
|
||||
assert(0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void result_set_builder::visitor::accept_partition_end(
|
||||
const query::result_row_view& static_row) {
|
||||
if (_row_count == 0) {
|
||||
_builder.new_row();
|
||||
auto static_row_iterator = static_row.iterator();
|
||||
for (auto&& def : _selection.get_columns()) {
|
||||
if (def->is_partition_key()) {
|
||||
_builder.add(_partition_key[def->component_index()]);
|
||||
} else if (def->is_static()) {
|
||||
add_value(*def, static_row_iterator);
|
||||
} else {
|
||||
_builder.add_empty();
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
api::timestamp_type result_set_builder::timestamp_of(size_t idx) {
|
||||
|
||||
@@ -48,6 +48,7 @@
#include "exceptions/exceptions.hh"
#include "cql3/selection/raw_selector.hh"
#include "cql3/selection/selector_factories.hh"
#include "cql3/restrictions/statement_restrictions.hh"
#include "unimplemented.hh"

namespace cql3 {

@@ -247,6 +248,28 @@ private:
const gc_clock::time_point _now;
cql_serialization_format _cql_serialization_format;
public:
class nop_filter {
public:
inline bool operator()(const selection&, const std::vector<bytes>&, const std::vector<bytes>&, const query::result_row_view&, const query::result_row_view&) const {
return true;
}
void reset() {
}
};
class restrictions_filter {
::shared_ptr<restrictions::statement_restrictions> _restrictions;
mutable bool _current_pratition_key_does_not_match = false;
mutable bool _current_static_row_does_not_match = false;
public:
restrictions_filter() = default;
explicit restrictions_filter(::shared_ptr<restrictions::statement_restrictions> restrictions) : _restrictions(restrictions) {}
bool operator()(const selection& selection, const std::vector<bytes>& pk, const std::vector<bytes>& ck, const query::result_row_view& static_row, const query::result_row_view& row) const;
void reset() {
_current_pratition_key_does_not_match = false;
_current_static_row_does_not_match = false;
}
};

result_set_builder(const selection& s, gc_clock::time_point now, cql_serialization_format sf);
void add_empty();
void add(bytes_opt value);
@@ -256,8 +279,9 @@ public:
std::unique_ptr<result_set> build();
api::timestamp_type timestamp_of(size_t idx);
int32_t ttl_of(size_t idx);

// Implements ResultVisitor concept from query.hh
template<typename Filter = nop_filter>
class visitor {
protected:
result_set_builder& _builder;
@@ -266,20 +290,100 @@ public:
uint32_t _row_count;
std::vector<bytes> _partition_key;
std::vector<bytes> _clustering_key;
Filter _filter;
public:
visitor(cql3::selection::result_set_builder& builder, const schema& s, const selection&);
visitor(cql3::selection::result_set_builder& builder, const schema& s,
const selection& selection, Filter filter = Filter())
: _builder(builder)
, _schema(s)
, _selection(selection)
, _row_count(0)
, _filter(filter)
{}
visitor(visitor&&) = default;

void add_value(const column_definition& def, query::result_row_view::iterator_type& i);
void accept_new_partition(const partition_key& key, uint32_t row_count);
void accept_new_partition(uint32_t row_count);
void accept_new_row(const clustering_key& key,
const query::result_row_view& static_row,
const query::result_row_view& row);
void accept_new_row(const query::result_row_view& static_row,
const query::result_row_view& row);
void accept_partition_end(const query::result_row_view& static_row);
void add_value(const column_definition& def, query::result_row_view::iterator_type& i) {
if (def.type->is_multi_cell()) {
auto cell = i.next_collection_cell();
if (!cell) {
_builder.add_empty();
return;
}
_builder.add_collection(def, cell->linearize());
} else {
auto cell = i.next_atomic_cell();
if (!cell) {
_builder.add_empty();
return;
}
_builder.add(def, *cell);
}
}

void accept_new_partition(const partition_key& key, uint32_t row_count) {
_partition_key = key.explode(_schema);
_row_count = row_count;
_filter.reset();
}

void accept_new_partition(uint32_t row_count) {
_row_count = row_count;
_filter.reset();
}

void accept_new_row(const clustering_key& key, const query::result_row_view& static_row, const query::result_row_view& row) {
_clustering_key = key.explode(_schema);
accept_new_row(static_row, row);
}

void accept_new_row(const query::result_row_view& static_row, const query::result_row_view& row) {
auto static_row_iterator = static_row.iterator();
auto row_iterator = row.iterator();
if (!_filter(_selection, _partition_key, _clustering_key, static_row, row)) {
return;
}
_builder.new_row();
for (auto&& def : _selection.get_columns()) {
switch (def->kind) {
case column_kind::partition_key:
_builder.add(_partition_key[def->component_index()]);
break;
case column_kind::clustering_key:
if (_clustering_key.size() > def->component_index()) {
_builder.add(_clustering_key[def->component_index()]);
} else {
_builder.add({});
}
break;
case column_kind::regular_column:
add_value(*def, row_iterator);
break;
case column_kind::static_column:
add_value(*def, static_row_iterator);
break;
default:
assert(0);
}
}
}

void accept_partition_end(const query::result_row_view& static_row) {
if (_row_count == 0) {
_builder.new_row();
auto static_row_iterator = static_row.iterator();
for (auto&& def : _selection.get_columns()) {
if (def->is_partition_key()) {
_builder.add(_partition_key[def->component_index()]);
} else if (def->is_static()) {
add_value(*def, static_row_iterator);
} else {
_builder.add_empty();
}
}
}
}
};

private:
bytes_opt get_value(data_type t, query::result_atomic_cell_view c);
};

@@ -118,7 +118,8 @@ private:
schema_ptr schema,
::shared_ptr<variable_specifications> bound_names,
::shared_ptr<selection::selection> selection,
bool for_view = false);
bool for_view = false,
bool allow_filtering = false);

/** Returns a ::shared_ptr<term> for the limit or null if no limit is set */
::shared_ptr<term> prepare_limit(database& db, ::shared_ptr<variable_specifications> bound_names);

@@ -384,6 +384,7 @@ select_statement::do_execute(service::storage_proxy& proxy,
auto now = gc_clock::now();

++_stats.reads;
_stats.filtered_reads += _restrictions->need_filtering();

auto command = ::make_lw_shared<query::read_command>(_schema->id(), _schema->version(),
make_partition_slice(options), limit, now, tracing::make_trace_info(state.get_trace_state()), query::max_partitions, utils::UUID(), options.get_timestamp(state));
@@ -409,7 +410,7 @@ select_statement::do_execute(service::storage_proxy& proxy,
command->slice.options.set<query::partition_slice::option::allow_short_read>();
auto timeout = options.get_timeout_config().*get_timeout_config_selector();
auto p = service::pager::query_pagers::pager(_schema, _selection,
state, options, timeout, command, std::move(key_ranges));
state, options, timeout, command, std::move(key_ranges), _stats, _restrictions->need_filtering() ? _restrictions : nullptr);

if (aggregate) {
return do_with(
@@ -423,6 +424,7 @@ select_statement::do_execute(service::storage_proxy& proxy,
).then([this, &builder] {
auto rs = builder.build();
update_stats_rows_read(rs->size());
_stats.filtered_rows_matched_total += _restrictions->need_filtering() ? rs->size() : 0;
auto msg = ::make_shared<cql_transport::messages::result_message::rows>(result(std::move(rs)));
return make_ready_future<shared_ptr<cql_transport::messages::result_message>>(std::move(msg));
});
@@ -435,7 +437,7 @@ select_statement::do_execute(service::storage_proxy& proxy,
" you must either remove the ORDER BY or the IN and sort client side, or disable paging for this query");
}

if (_selection->is_trivial()) {
if (_selection->is_trivial() && !_restrictions->need_filtering()) {
return p->fetch_page_generator(page_size, now, _stats).then([this, p, limit] (result_generator generator) {
auto meta = make_shared<metadata>(*_selection->get_result_metadata());
if (!p->is_exhausted()) {
@@ -456,6 +458,7 @@ select_statement::do_execute(service::storage_proxy& proxy,
}

update_stats_rows_read(rs->size());
_stats.filtered_rows_matched_total += _restrictions->need_filtering() ? rs->size() : 0;
auto msg = ::make_shared<cql_transport::messages::result_message::rows>(result(std::move(rs)));
return make_ready_future<shared_ptr<cql_transport::messages::result_message>>(std::move(msg));
});
@@ -554,7 +557,7 @@ select_statement::process_results(foreign_ptr<lw_shared_ptr<query::result>> resu
const query_options& options,
gc_clock::time_point now)
{
bool fast_path = !needs_post_query_ordering() && _selection->is_trivial();
bool fast_path = !needs_post_query_ordering() && _selection->is_trivial() && !_restrictions->need_filtering();
if (fast_path) {
return make_shared<cql_transport::messages::result_message::rows>(result(
result_generator(_schema, std::move(results), std::move(cmd), _selection, _stats),
@@ -564,9 +567,17 @@ select_statement::process_results(foreign_ptr<lw_shared_ptr<query::result>> resu

cql3::selection::result_set_builder builder(*_selection, now,
options.get_cql_serialization_format());
query::result_view::consume(*results, cmd->slice,
cql3::selection::result_set_builder::visitor(builder, *_schema,
*_selection));
if (_restrictions->need_filtering()) {
results->ensure_counts();
_stats.filtered_rows_read_total += *results->row_count();
query::result_view::consume(*results, cmd->slice,
cql3::selection::result_set_builder::visitor(builder, *_schema,
*_selection, cql3::selection::result_set_builder::restrictions_filter(_restrictions)));
} else {
query::result_view::consume(*results, cmd->slice,
cql3::selection::result_set_builder::visitor(builder, *_schema,
*_selection));
}
auto rs = builder.build();

if (needs_post_query_ordering()) {
@@ -577,6 +588,7 @@ select_statement::process_results(foreign_ptr<lw_shared_ptr<query::result>> resu
rs->trim(cmd->row_limit);
}
update_stats_rows_read(rs->size());
_stats.filtered_rows_matched_total += _restrictions->need_filtering() ? rs->size() : 0;
return ::make_shared<cql_transport::messages::result_message::rows>(result(std::move(rs)));
}

@@ -957,7 +969,7 @@ std::unique_ptr<prepared_statement> select_statement::prepare(database& db, cql_
? selection::selection::wildcard(schema)
: selection::selection::from_selectors(db, schema, _select_clause);

auto restrictions = prepare_restrictions(db, schema, bound_names, selection, for_view);
auto restrictions = prepare_restrictions(db, schema, bound_names, selection, for_view, _parameters->allow_filtering());

if (_parameters->is_distinct()) {
validate_distinct_selection(schema, selection, restrictions);
@@ -1011,13 +1023,14 @@ select_statement::prepare_restrictions(database& db,
schema_ptr schema,
::shared_ptr<variable_specifications> bound_names,
::shared_ptr<selection::selection> selection,
bool for_view)
bool for_view,
bool allow_filtering)
{
try {
// FIXME: this method should take a separate allow_filtering parameter
// and pass it on. Currently we pass "for_view" as allow_filtering.
return ::make_shared<restrictions::statement_restrictions>(db, schema, statement_type::SELECT, std::move(_where_clause), bound_names,
selection->contains_only_static_columns(), selection->contains_a_collection(), for_view, for_view);
selection->contains_only_static_columns(), selection->contains_a_collection(), for_view, allow_filtering);
} catch (const exceptions::unrecognized_entity_exception& e) {
if (contains_alias(e.entity)) {
throw exceptions::invalid_request_exception(sprint("Aliases aren't allowed in the where clause ('%s')", e.relation->to_string()));

@@ -41,6 +41,10 @@ struct cql_stats {
int64_t secondary_index_drops = 0;
int64_t secondary_index_reads = 0;
int64_t secondary_index_rows_read = 0;

int64_t filtered_reads = 0;
int64_t filtered_rows_matched_total = 0;
int64_t filtered_rows_read_total = 0;
};

}

@@ -211,6 +211,7 @@ struct cell {
imr::member<tags::chunk_next, imr::pod<uint8_t*>>,
imr::member<tags::chunk_data, imr::buffer<tags::chunk_data>>
>;
static constexpr size_t external_chunk_overhead = sizeof(uint8_t*) * 2;

using external_last_chunk_size = imr::pod<uint16_t>;
/// The last fragment of an externally stored value
@@ -224,6 +225,7 @@ struct cell {
imr::member<tags::last_chunk_size, external_last_chunk_size>,
imr::member<tags::chunk_data, imr::buffer<tags::chunk_data>>
>;
static constexpr size_t external_last_chunk_overhead = sizeof(uint8_t*) + sizeof(uint16_t);

class context;
class minimal_context;

105 database.cc
@@ -182,7 +182,7 @@ thread_local dirty_memory_manager default_dirty_memory_manager;
lw_shared_ptr<memtable_list>
table::make_memory_only_memtable_list() {
auto get_schema = [this] { return schema(); };
return make_lw_shared<memtable_list>(std::move(get_schema), _config.dirty_memory_manager);
return make_lw_shared<memtable_list>(std::move(get_schema), _config.dirty_memory_manager, _config.memory_compaction_scheduling_group);
}

lw_shared_ptr<memtable_list>
@@ -191,7 +191,7 @@ table::make_memtable_list() {
return seal_active_memtable(std::move(permit));
};
auto get_schema = [this] { return schema(); };
return make_lw_shared<memtable_list>(std::move(seal), std::move(get_schema), _config.dirty_memory_manager);
return make_lw_shared<memtable_list>(std::move(seal), std::move(get_schema), _config.dirty_memory_manager, _config.memory_compaction_scheduling_group);
}

lw_shared_ptr<memtable_list>
@@ -200,7 +200,7 @@ table::make_streaming_memtable_list() {
return seal_active_streaming_memtable_immediate(std::move(permit));
};
auto get_schema = [this] { return schema(); };
return make_lw_shared<memtable_list>(std::move(seal), std::move(get_schema), _config.streaming_dirty_memory_manager);
return make_lw_shared<memtable_list>(std::move(seal), std::move(get_schema), _config.streaming_dirty_memory_manager, _config.streaming_scheduling_group);
}

lw_shared_ptr<memtable_list>
@@ -209,7 +209,7 @@ table::make_streaming_memtable_big_list(streaming_memtable_big& smb) {
return seal_active_streaming_memtable_big(smb, std::move(permit));
};
auto get_schema = [this] { return schema(); };
return make_lw_shared<memtable_list>(std::move(seal), std::move(get_schema), _config.streaming_dirty_memory_manager);
return make_lw_shared<memtable_list>(std::move(seal), std::move(get_schema), _config.streaming_dirty_memory_manager, _config.streaming_scheduling_group);
}

table::table(schema_ptr schema, config config, db::commitlog* cl, compaction_manager& compaction_manager, cell_locker_stats& cl_stats, cache_tracker& row_cache_tracker)
@@ -237,7 +237,7 @@ partition_presence_checker
table::make_partition_presence_checker(lw_shared_ptr<sstables::sstable_set> sstables) {
auto sel = make_lw_shared(sstables->make_incremental_selector());
return [this, sstables = std::move(sstables), sel = std::move(sel)] (const dht::decorated_key& key) {
auto& sst = sel->select(key.token()).sstables;
auto& sst = sel->select(key).sstables;
if (sst.empty()) {
return partition_presence_checker_result::definitely_doesnt_exist;
}
@@ -383,9 +383,13 @@ filter_sstable_for_reader(std::vector<sstables::shared_sstable>&& sstables, colu
};
sstables.erase(boost::remove_if(sstables, sstable_has_not_key), sstables.end());

// FIXME: Workaround for https://github.com/scylladb/scylla/issues/3552
// and https://github.com/scylladb/scylla/issues/3553
const bool filtering_broken = true;

// no clustering filtering is applied if schema defines no clustering key or
// compaction strategy thinks it will not benefit from such an optimization.
if (!schema->clustering_key_size() || !cf.get_compaction_strategy().use_clustering_key_filter()) {
if (filtering_broken || !schema->clustering_key_size() || !cf.get_compaction_strategy().use_clustering_key_filter()) {
return sstables;
}
::cf_stats* stats = cf.cf_stats();
@@ -449,7 +453,7 @@ public:
const dht::partition_range& pr,
tracing::trace_state_ptr trace_state,
sstable_reader_factory_type fn)
: reader_selector(s, pr.start() ? pr.start()->value() : dht::ring_position::min())
: reader_selector(s, pr.start() ? pr.start()->value() : dht::ring_position_view::min())
, _pr(&pr)
, _sstables(std::move(sstables))
, _trace_state(std::move(trace_state))
@@ -468,47 +472,34 @@ public:
incremental_reader_selector(incremental_reader_selector&&) = delete;
incremental_reader_selector& operator=(incremental_reader_selector&&) = delete;

virtual std::vector<flat_mutation_reader> create_new_readers(const dht::token* const t) override {
dblog.trace("incremental_reader_selector {}: {}({})", this, __FUNCTION__, seastar::lazy_deref(t));
virtual std::vector<flat_mutation_reader> create_new_readers(const std::optional<dht::ring_position_view>& pos) override {
dblog.trace("incremental_reader_selector {}: {}({})", this, __FUNCTION__, seastar::lazy_deref(pos));

const auto& position = (t ? *t : _selector_position.token());
// we only pass _selector_position's token to _selector::select() when T is nullptr
// because it means gap between sstables, and the lower bound of the first interval
// after the gap is guaranteed to be inclusive.
auto selection = _selector.select(position);
auto readers = std::vector<flat_mutation_reader>();

if (selection.sstables.empty()) {
// For the lower bound of the token range the _selector
// might not return any sstables, in this case try again
// with next_token unless it's maximum token.
if (!selection.next_position.is_max()
&& position == (_pr->start() ? _pr->start()->value().token() : dht::minimum_token())) {
dblog.trace("incremental_reader_selector {}: no sstables intersect with the lower bound, retrying", this);
_selector_position = std::move(selection.next_position);
return create_new_readers(nullptr);
}
do {
auto selection = _selector.select(_selector_position);
_selector_position = selection.next_position;

_selector_position = dht::ring_position::max();
return {};
}
dblog.trace("incremental_reader_selector {}: {} sstables to consider, advancing selector to {}", this, selection.sstables.size(),
_selector_position);

_selector_position = std::move(selection.next_position);
readers = boost::copy_range<std::vector<flat_mutation_reader>>(selection.sstables
| boost::adaptors::filtered([this] (auto& sst) { return _read_sstables.emplace(sst).second; })
| boost::adaptors::transformed([this] (auto& sst) { return this->create_reader(sst); }));
} while (!_selector_position.is_max() && readers.empty() && (!pos || dht::ring_position_tri_compare(*_s, *pos, _selector_position) >= 0));

dblog.trace("incremental_reader_selector {}: {} new sstables to consider, advancing selector to {}", this, selection.sstables.size(), _selector_position);
dblog.trace("incremental_reader_selector {}: created {} new readers", this, readers.size());

return boost::copy_range<std::vector<flat_mutation_reader>>(selection.sstables
| boost::adaptors::filtered([this] (auto& sst) { return _read_sstables.emplace(sst).second; })
| boost::adaptors::transformed([this] (auto& sst) {
return this->create_reader(sst);
}));
return readers;
}

virtual std::vector<flat_mutation_reader> fast_forward_to(const dht::partition_range& pr, db::timeout_clock::time_point timeout) override {
_pr = &pr;

dht::ring_position_comparator cmp(*_s);
if (cmp(dht::ring_position_view::for_range_start(*_pr), _selector_position) >= 0) {
return create_new_readers(&_pr->start()->value().token());
auto pos = dht::ring_position_view::for_range_start(*_pr);
if (dht::ring_position_tri_compare(*_s, pos, _selector_position) >= 0) {
return create_new_readers(pos);
}

return {};
@@ -1562,7 +1553,7 @@ future<std::unordered_set<sstring>> table::get_sstables_by_partition_key(const s
[this] (std::unordered_set<sstring>& filenames, lw_shared_ptr<sstables::sstable_set::incremental_selector>& sel, partition_key& pk) {
return do_with(dht::decorated_key(dht::global_partitioner().decorate_key(*_schema, pk)),
[this, &filenames, &sel, &pk](dht::decorated_key& dk) mutable {
auto sst = sel->select(dk.token()).sstables;
auto sst = sel->select(dk).sstables;
auto hk = sstables::sstable::make_hashed_key(*_schema, dk.key());

return do_for_each(sst, [this, &filenames, &dk, hk = std::move(hk)] (std::vector<sstables::shared_sstable>::const_iterator::reference s) mutable {
@@ -2154,6 +2145,8 @@ database::database(const db::config& cfg, database_config dbcfg)
_compaction_manager->start();
setup_metrics();

_row_cache_tracker.set_compaction_scheduling_group(dbcfg.memory_compaction_scheduling_group);

dblog.info("Row: max_vector_size: {}, internal_count: {}", size_t(row::max_vector_size), size_t(row::internal_count));
}

@@ -2699,7 +2692,7 @@ future<> database::drop_column_family(const sstring& ks_name, const sstring& cf_
remove(*cf);
cf->clear_views();
auto& ks = find_keyspace(ks_name);
return cf->await_pending_writes().then([this, &ks, cf, tsf = std::move(tsf), snapshot] {
return when_all_succeed(cf->await_pending_writes(), cf->await_pending_reads()).then([this, &ks, cf, tsf = std::move(tsf), snapshot] {
return truncate(ks, *cf, std::move(tsf), snapshot).finally([this, cf] {
return cf->stop();
});
@@ -2841,6 +2834,7 @@ keyspace::make_column_family_config(const schema& s, const db::config& db_config
cfg.cf_stats = _config.cf_stats;
cfg.enable_incremental_backups = _config.enable_incremental_backups;
cfg.compaction_scheduling_group = _config.compaction_scheduling_group;
cfg.memory_compaction_scheduling_group = _config.memory_compaction_scheduling_group;
cfg.memtable_scheduling_group = _config.memtable_scheduling_group;
cfg.memtable_to_cache_scheduling_group = _config.memtable_to_cache_scheduling_group;
cfg.streaming_scheduling_group = _config.streaming_scheduling_group;
@@ -3139,7 +3133,7 @@ database::query(schema_ptr s, const query::read_command& cmd, query::result_opti
seastar::ref(get_result_memory_limiter()),
max_result_size,
timeout,
std::move(cache_ctx)).then_wrapped([this, s = _stats, hit_rate = cf.get_global_cache_hit_rate()] (auto f) {
std::move(cache_ctx)).then_wrapped([this, s = _stats, hit_rate = cf.get_global_cache_hit_rate(), op = cf.read_in_progress()] (auto f) {
if (f.failed()) {
++s->total_reads_failed;
return make_exception_future<lw_shared_ptr<query::result>, cache_temperature>(f.get_exception());
@@ -3167,7 +3161,7 @@ database::query_mutations(schema_ptr s, const query::read_command& cmd, const dh
std::move(accounter),
std::move(trace_state),
timeout,
std::move(cache_ctx)).then_wrapped([this, s = _stats, hit_rate = cf.get_global_cache_hit_rate()] (auto f) {
std::move(cache_ctx)).then_wrapped([this, s = _stats, hit_rate = cf.get_global_cache_hit_rate(), op = cf.read_in_progress()] (auto f) {
if (f.failed()) {
++s->total_reads_failed;
return make_exception_future<reconcilable_result, cache_temperature>(f.get_exception());
@@ -3392,7 +3386,7 @@ future<> memtable_list::request_flush() {
}

lw_shared_ptr<memtable> memtable_list::new_memtable() {
return make_lw_shared<memtable>(_current_schema(), *_dirty_memory_manager, this);
return make_lw_shared<memtable>(_current_schema(), *_dirty_memory_manager, this, _compaction_scheduling_group);
}

future<flush_permit> flush_permit::reacquire_sstable_write_permit() && {
@@ -3621,6 +3615,7 @@ database::make_keyspace_config(const keyspace_metadata& ksm) {
cfg.enable_incremental_backups = _enable_incremental_backups;

cfg.compaction_scheduling_group = _dbcfg.compaction_scheduling_group;
cfg.memory_compaction_scheduling_group = _dbcfg.memory_compaction_scheduling_group;
cfg.memtable_scheduling_group = _dbcfg.memtable_scheduling_group;
cfg.memtable_to_cache_scheduling_group = _dbcfg.memtable_to_cache_scheduling_group;
cfg.streaming_scheduling_group = _dbcfg.streaming_scheduling_group;
@@ -4606,14 +4601,11 @@ flat_mutation_reader make_local_shard_sstable_reader(schema_ptr s,
}
return reader;
};
auto all_readers = boost::copy_range<std::vector<flat_mutation_reader>>(
*sstables->all()
| boost::adaptors::transformed([&] (sstables::shared_sstable sst) -> flat_mutation_reader {
return reader_factory_fn(sst, pr);
})
);
return make_combined_reader(s,
std::move(all_readers),
return make_combined_reader(s, std::make_unique<incremental_reader_selector>(s,
std::move(sstables),
pr,
std::move(trace_state),
std::move(reader_factory_fn)),
fwd,
fwd_mr);
}
@@ -4632,14 +4624,11 @@ flat_mutation_reader make_range_sstable_reader(schema_ptr s,
auto reader_factory_fn = [s, &slice, &pc, resource_tracker, fwd, fwd_mr, &monitor_generator] (sstables::shared_sstable& sst, const dht::partition_range& pr) {
return sst->read_range_rows_flat(s, pr, slice, pc, resource_tracker, fwd, fwd_mr, monitor_generator(sst));
};
auto sstable_readers = boost::copy_range<std::vector<flat_mutation_reader>>(
*sstables->all()
| boost::adaptors::transformed([&] (sstables::shared_sstable sst) {
return reader_factory_fn(sst, pr);
})
);
return make_combined_reader(s,
std::move(sstable_readers),
return make_combined_reader(s, std::make_unique<incremental_reader_selector>(s,
std::move(sstables),
pr,
std::move(trace_state),
std::move(reader_factory_fn)),
fwd,
fwd_mr);
}

29 database.hh
@@ -164,29 +164,33 @@ private:
|
||||
std::function<schema_ptr()> _current_schema;
|
||||
dirty_memory_manager* _dirty_memory_manager;
|
||||
std::experimental::optional<shared_promise<>> _flush_coalescing;
|
||||
seastar::scheduling_group _compaction_scheduling_group;
|
||||
public:
|
||||
memtable_list(
|
||||
seal_immediate_fn_type seal_immediate_fn,
|
||||
seal_delayed_fn_type seal_delayed_fn,
|
||||
std::function<schema_ptr()> cs,
|
||||
dirty_memory_manager* dirty_memory_manager)
|
||||
dirty_memory_manager* dirty_memory_manager,
|
||||
seastar::scheduling_group compaction_scheduling_group = seastar::current_scheduling_group())
|
||||
: _memtables({})
|
||||
, _seal_immediate_fn(seal_immediate_fn)
|
||||
, _seal_delayed_fn(seal_delayed_fn)
|
||||
, _current_schema(cs)
|
||||
, _dirty_memory_manager(dirty_memory_manager) {
|
||||
, _dirty_memory_manager(dirty_memory_manager)
|
||||
, _compaction_scheduling_group(compaction_scheduling_group) {
|
||||
add_memtable();
|
||||
}
|
||||
|
||||
memtable_list(
|
||||
seal_immediate_fn_type seal_immediate_fn,
|
||||
std::function<schema_ptr()> cs,
|
||||
dirty_memory_manager* dirty_memory_manager)
|
||||
: memtable_list(std::move(seal_immediate_fn), {}, std::move(cs), dirty_memory_manager) {
|
||||
dirty_memory_manager* dirty_memory_manager,
|
||||
seastar::scheduling_group compaction_scheduling_group = seastar::current_scheduling_group())
|
||||
: memtable_list(std::move(seal_immediate_fn), {}, std::move(cs), dirty_memory_manager, compaction_scheduling_group) {
|
||||
}
|
||||
|
||||
memtable_list(std::function<schema_ptr()> cs, dirty_memory_manager* dirty_memory_manager)
|
||||
: memtable_list({}, {}, std::move(cs), dirty_memory_manager) {
|
||||
memtable_list(std::function<schema_ptr()> cs, dirty_memory_manager* dirty_memory_manager, seastar::scheduling_group compaction_scheduling_group = seastar::current_scheduling_group())
|
||||
: memtable_list({}, {}, std::move(cs), dirty_memory_manager, compaction_scheduling_group) {
|
||||
}
|
||||
|
||||
bool may_flush() const {
|
||||
@@ -312,6 +316,7 @@ public:
|
||||
seastar::scheduling_group memtable_scheduling_group;
|
||||
seastar::scheduling_group memtable_to_cache_scheduling_group;
|
||||
seastar::scheduling_group compaction_scheduling_group;
|
||||
seastar::scheduling_group memory_compaction_scheduling_group;
|
||||
seastar::scheduling_group statement_scheduling_group;
|
||||
seastar::scheduling_group streaming_scheduling_group;
|
||||
bool enable_metrics_reporting = false;
|
||||
@@ -475,6 +480,8 @@ private:
|
||||
// after some modification, needs to ensure that news writes will see it before
|
||||
// it can proceed, such as the view building code.
|
||||
utils::phased_barrier _pending_writes_phaser;
|
||||
// Corresponding phaser for in-progress reads.
|
||||
utils::phased_barrier _pending_reads_phaser;
|
||||
private:
|
||||
void update_stats_for_new_sstable(uint64_t disk_space_used_by_sstable, const std::vector<unsigned>& shards_for_the_sstable) noexcept;
|
||||
// Adds new sstable to the set of sstables
|
||||
@@ -817,6 +824,14 @@ public:
|
||||
return _pending_writes_phaser.advance_and_await();
|
||||
}
|
||||
|
||||
utils::phased_barrier::operation read_in_progress() {
|
||||
return _pending_reads_phaser.start();
|
||||
}
|
||||
|
||||
future<> await_pending_reads() {
|
||||
return _pending_reads_phaser.advance_and_await();
|
||||
}
|
||||
|
||||
void add_or_update_view(view_ptr v);
|
||||
void remove_view(view_ptr v);
|
||||
void clear_views();
|
||||
@@ -1029,6 +1044,7 @@ public:
|
||||
seastar::scheduling_group memtable_scheduling_group;
|
||||
seastar::scheduling_group memtable_to_cache_scheduling_group;
|
||||
seastar::scheduling_group compaction_scheduling_group;
|
||||
seastar::scheduling_group memory_compaction_scheduling_group;
|
||||
seastar::scheduling_group statement_scheduling_group;
|
||||
seastar::scheduling_group streaming_scheduling_group;
|
||||
bool enable_metrics_reporting = false;
|
||||
@@ -1109,6 +1125,7 @@ struct database_config {
|
||||
seastar::scheduling_group memtable_scheduling_group;
|
||||
seastar::scheduling_group memtable_to_cache_scheduling_group; // FIXME: merge with memtable_scheduling_group
|
||||
seastar::scheduling_group compaction_scheduling_group;
|
||||
seastar::scheduling_group memory_compaction_scheduling_group;
|
||||
seastar::scheduling_group statement_scheduling_group;
|
||||
seastar::scheduling_group streaming_scheduling_group;
|
||||
size_t available_memory;
|
||||
|
||||
@@ -461,7 +461,7 @@ bool ring_position::less_compare(const schema& s, const ring_position& other) co
|
||||
return tri_compare(s, other) < 0;
|
||||
}
|
||||
|
||||
int ring_position_comparator::operator()(ring_position_view lh, ring_position_view rh) const {
|
||||
int ring_position_tri_compare(const schema& s, ring_position_view lh, ring_position_view rh) {
|
||||
auto token_cmp = tri_compare(*lh._token, *rh._token);
|
||||
if (token_cmp) {
|
||||
return token_cmp;
|
||||
@@ -482,6 +482,10 @@ int ring_position_comparator::operator()(ring_position_view lh, ring_position_vi
|
||||
}
|
||||
}
|
||||
|
||||
int ring_position_comparator::operator()(ring_position_view lh, ring_position_view rh) const {
|
||||
return ring_position_tri_compare(s, lh, rh);
|
||||
}
|
||||
|
||||
int ring_position_comparator::operator()(ring_position_view lh, sstables::decorated_key_view rh) const {
|
||||
auto token_cmp = tri_compare(*lh._token, rh.token());
|
||||
if (token_cmp) {
|
||||
|
||||
@@ -384,6 +384,10 @@ public:
|
||||
return "biased-token-round-robin";
|
||||
}
|
||||
|
||||
virtual unsigned sharding_ignore_msb() const {
|
||||
return 0;
|
||||
}
|
||||
|
||||
friend bool operator==(token_view t1, token_view t2);
|
||||
friend bool operator<(token_view t1, token_view t2);
|
||||
friend int tri_compare(token_view t1, token_view t2);
|
||||
@@ -525,6 +529,7 @@ public:
|
||||
// Such range includes all keys k such that v1 <= k < v2, with order defined by ring_position_comparator.
|
||||
//
|
||||
class ring_position_view {
|
||||
friend int ring_position_tri_compare(const schema& s, ring_position_view lh, ring_position_view rh);
|
||||
friend class ring_position_comparator;
|
||||
|
||||
// Order is lexicographical on (_token, _key) tuples, where _key part may be missing, and
|
||||
@@ -539,6 +544,7 @@ class ring_position_view {
|
||||
const partition_key* _key; // Can be nullptr
|
||||
int8_t _weight;
|
||||
public:
|
||||
using token_bound = ring_position::token_bound;
|
||||
struct after_key_tag {};
|
||||
using after_key = bool_class<after_key_tag>;
|
||||
|
||||
@@ -574,6 +580,14 @@ public:
|
||||
return ring_position_view(after_key_tag(), view);
|
||||
}
|
||||
|
||||
static ring_position_view starting_at(const dht::token& t) {
|
||||
return ring_position_view(t, token_bound::start);
|
||||
}
|
||||
|
||||
static ring_position_view ending_at(const dht::token& t) {
|
||||
return ring_position_view(t, token_bound::end);
|
||||
}
|
||||
|
||||
ring_position_view(const dht::ring_position& pos, after_key after = after_key::no)
|
||||
: _token(&pos.token())
|
||||
, _key(pos.has_key() ? &*pos.key() : nullptr)
|
||||
@@ -601,17 +615,25 @@ public:
|
||||
, _weight(weight)
|
||||
{ }
|
||||
|
||||
explicit ring_position_view(const dht::token& token, int8_t weight = -1)
|
||||
explicit ring_position_view(const dht::token& token, token_bound bound = token_bound::start)
|
||||
: _token(&token)
|
||||
, _key(nullptr)
|
||||
, _weight(weight)
|
||||
, _weight(static_cast<std::underlying_type_t<token_bound>>(bound))
|
||||
{ }
|
||||
|
||||
const dht::token& token() const { return *_token; }
|
||||
const partition_key* key() const { return _key; }
|
||||
|
||||
// Only when key() == nullptr
|
||||
token_bound get_token_bound() const { return token_bound(_weight); }
|
||||
// Only when key() != nullptr
|
||||
after_key is_after_key() const { return after_key(_weight == 1); }
|
||||
|
||||
friend std::ostream& operator<<(std::ostream&, ring_position_view);
|
||||
};
|
||||
|
||||
int ring_position_tri_compare(const schema& s, ring_position_view lh, ring_position_view rh);
|
||||
|
||||
// Trichotomic comparator for ring order
|
||||
struct ring_position_comparator {
|
||||
const schema& s;
|
||||
|
||||
@@ -290,6 +290,11 @@ murmur3_partitioner::token_for_next_shard(const token& t, shard_id shard, unsign
|
||||
return bias(n);
|
||||
}
|
||||
|
||||
unsigned
|
||||
murmur3_partitioner::sharding_ignore_msb() const {
|
||||
return _sharding_ignore_msb_bits;
|
||||
}
|
||||
|
||||
|
||||
using registry = class_registrator<i_partitioner, murmur3_partitioner, const unsigned&, const unsigned&>;
|
||||
static registry registrator("org.apache.cassandra.dht.Murmur3Partitioner");
|
||||
|
||||
@@ -52,6 +52,7 @@ public:
|
||||
|
||||
virtual unsigned shard_of(const token& t) const override;
|
||||
virtual token token_for_next_shard(const token& t, shard_id shard, unsigned spans) const override;
|
||||
virtual unsigned sharding_ignore_msb() const override;
|
||||
private:
|
||||
using uint128_t = unsigned __int128;
|
||||
static int64_t normalize(int64_t in);
|
||||
|
||||
@@ -324,11 +324,11 @@ future<> range_streamer::do_stream_async() {
|
||||
for (auto& range : ranges_to_stream) {
|
||||
range_vec.push_back(range);
|
||||
}
|
||||
auto t = std::chrono::duration_cast<std::chrono::seconds>(lowres_clock::now() - start_time).count();
|
||||
auto t = std::chrono::duration_cast<std::chrono::duration<float>>(lowres_clock::now() - start_time).count();
|
||||
logger.warn("{} with {} for keyspace={} failed, took {} seconds: {}", description, source, keyspace, t, std::current_exception());
|
||||
throw;
|
||||
}
|
||||
auto t = std::chrono::duration_cast<std::chrono::seconds>(lowres_clock::now() - start_time).count();
|
||||
auto t = std::chrono::duration_cast<std::chrono::duration<float>>(lowres_clock::now() - start_time).count();
|
||||
logger.info("{} with {} for keyspace={} succeeded, took {} seconds", description, source, keyspace, t);
|
||||
});
|
||||
|
||||
|
||||
111 dist/ami/build_ami.sh (vendored)
@@ -11,11 +11,9 @@ print_usage() {
|
||||
echo " --repo repository for both install and update, specify .repo/.list file URL"
|
||||
echo " --repo-for-install repository for install, specify .repo/.list file URL"
|
||||
echo " --repo-for-update repository for update, specify .repo/.list file URL"
|
||||
echo " --target specify target distribution"
|
||||
exit 1
|
||||
}
|
||||
LOCALRPM=0
|
||||
TARGET=centos
|
||||
while [ $# -gt 0 ]; do
|
||||
case "$1" in
|
||||
"--localrpm")
|
||||
@@ -34,10 +32,6 @@ while [ $# -gt 0 ]; do
|
||||
INSTALL_ARGS="$INSTALL_ARGS --repo-for-update $2"
|
||||
shift 2
|
||||
;;
|
||||
"--target")
|
||||
TARGET="$2"
|
||||
shift 2
|
||||
;;
|
||||
*)
|
||||
print_usage
|
||||
;;
|
||||
@@ -62,91 +56,42 @@ pkg_install() {
|
||||
fi
|
||||
}
|
||||
|
||||
case "$TARGET" in
|
||||
"centos")
|
||||
AMI=ami-ae7bfdb8
|
||||
REGION=us-east-1
|
||||
SSH_USERNAME=centos
|
||||
;;
|
||||
"trusty")
|
||||
AMI=ami-ff427095
|
||||
REGION=us-east-1
|
||||
SSH_USERNAME=ubuntu
|
||||
;;
|
||||
"xenial")
|
||||
AMI=ami-da05a4a0
|
||||
REGION=us-east-1
|
||||
SSH_USERNAME=ubuntu
|
||||
;;
|
||||
*)
|
||||
echo "build_ami.sh does not supported this distribution."
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
AMI=ami-ae7bfdb8
|
||||
REGION=us-east-1
|
||||
SSH_USERNAME=centos
|
||||
|
||||
if [ $LOCALRPM -eq 1 ]; then
|
||||
sudo rm -rf build/*
|
||||
REPO=`./scripts/scylla_current_repo --target $TARGET`
|
||||
REPO=`./scripts/scylla_current_repo --target centos`
|
||||
INSTALL_ARGS="$INSTALL_ARGS --localrpm --repo $REPO"
|
||||
if [ ! -f /usr/bin/git ]; then
|
||||
pkg_install git
|
||||
fi
|
||||
|
||||
if [ "$TARGET" = "centos" ]; then
|
||||
if [ ! -f dist/ami/files/scylla.x86_64.rpm ] || [ ! -f dist/ami/files/scylla-kernel-conf.x86_64.rpm ] || [ ! -f dist/ami/files/scylla-conf.x86_64.rpm ] || [ ! -f dist/ami/files/scylla-server.x86_64.rpm ] || [ ! -f dist/ami/files/scylla-debuginfo.x86_64.rpm ]; then
|
||||
dist/redhat/build_rpm.sh --dist --target epel-7-x86_64
|
||||
cp build/rpms/scylla-`cat build/SCYLLA-VERSION-FILE`-`cat build/SCYLLA-RELEASE-FILE`.*.x86_64.rpm dist/ami/files/scylla.x86_64.rpm
|
||||
cp build/rpms/scylla-kernel-conf-`cat build/SCYLLA-VERSION-FILE`-`cat build/SCYLLA-RELEASE-FILE`.*.x86_64.rpm dist/ami/files/scylla-kernel-conf.x86_64.rpm
|
||||
cp build/rpms/scylla-conf-`cat build/SCYLLA-VERSION-FILE`-`cat build/SCYLLA-RELEASE-FILE`.*.x86_64.rpm dist/ami/files/scylla-conf.x86_64.rpm
|
||||
cp build/rpms/scylla-server-`cat build/SCYLLA-VERSION-FILE`-`cat build/SCYLLA-RELEASE-FILE`.*.x86_64.rpm dist/ami/files/scylla-server.x86_64.rpm
|
||||
cp build/rpms/scylla-debuginfo-`cat build/SCYLLA-VERSION-FILE`-`cat build/SCYLLA-RELEASE-FILE`.*.x86_64.rpm dist/ami/files/scylla-debuginfo.x86_64.rpm
|
||||
fi
|
||||
if [ ! -f dist/ami/files/scylla-jmx.noarch.rpm ]; then
|
||||
cd build
|
||||
git clone --depth 1 https://github.com/scylladb/scylla-jmx.git
|
||||
cd scylla-jmx
|
||||
dist/redhat/build_rpm.sh --target epel-7-x86_64
|
||||
cd ../..
|
||||
cp build/scylla-jmx/build/rpms/scylla-jmx-`cat build/scylla-jmx/build/SCYLLA-VERSION-FILE`-`cat build/scylla-jmx/build/SCYLLA-RELEASE-FILE`.*.noarch.rpm dist/ami/files/scylla-jmx.noarch.rpm
|
||||
fi
|
||||
if [ ! -f dist/ami/files/scylla-tools.noarch.rpm ] || [ ! -f dist/ami/files/scylla-tools-core.noarch.rpm ]; then
|
||||
cd build
|
||||
git clone --depth 1 https://github.com/scylladb/scylla-tools-java.git
|
||||
cd scylla-tools-java
|
||||
dist/redhat/build_rpm.sh --target epel-7-x86_64
|
||||
cd ../..
|
||||
cp build/scylla-tools-java/build/rpms/scylla-tools-`cat build/scylla-tools-java/build/SCYLLA-VERSION-FILE`-`cat build/scylla-tools-java/build/SCYLLA-RELEASE-FILE`.*.noarch.rpm dist/ami/files/scylla-tools.noarch.rpm
|
||||
cp build/scylla-tools-java/build/rpms/scylla-tools-core-`cat build/scylla-tools-java/build/SCYLLA-VERSION-FILE`-`cat build/scylla-tools-java/build/SCYLLA-RELEASE-FILE`.*.noarch.rpm dist/ami/files/scylla-tools-core.noarch.rpm
|
||||
fi
|
||||
else
|
||||
if [ ! -f dist/ami/files/scylla-server_amd64.deb ]; then
|
||||
./scripts/git-archive-all --force-submodules --prefix scylla build/scylla.tar
|
||||
tar -C build/ -xvpf build/scylla.tar
|
||||
cd build/scylla
|
||||
dist/debian/build_deb.sh --dist --target $TARGET
|
||||
cd ../..
|
||||
cp build/scylla/build/debs/scylla_`cat build/SCYLLA-VERSION-FILE | sed 's/\.rc/~rc/'`-`cat build/SCYLLA-RELEASE-FILE`-0ubuntu1~${TARGET}_amd64.deb dist/ami/files/scylla_amd64.deb
|
||||
cp build/scylla/build/debs/scylla-kernel-conf_`cat build/SCYLLA-VERSION-FILE | sed 's/\.rc/~rc/'`-`cat build/SCYLLA-RELEASE-FILE`-0ubuntu1~${TARGET}_amd64.deb dist/ami/files/scylla-kernel-conf_amd64.deb
|
||||
cp build/scylla/build/debs/scylla-conf_`cat build/SCYLLA-VERSION-FILE | sed 's/\.rc/~rc/'`-`cat build/SCYLLA-RELEASE-FILE`-0ubuntu1~${TARGET}_amd64.deb dist/ami/files/scylla-conf_amd64.deb
|
||||
cp build/scylla/build/debs/scylla-server_`cat build/SCYLLA-VERSION-FILE | sed 's/\.rc/~rc/'`-`cat build/SCYLLA-RELEASE-FILE`-0ubuntu1~${TARGET}_amd64.deb dist/ami/files/scylla-server_amd64.deb
|
||||
cp build/scylla/build/debs/scylla-server-dbg_`cat build/SCYLLA-VERSION-FILE | sed 's/\.rc/~rc/'`-`cat build/SCYLLA-RELEASE-FILE`-0ubuntu1~${TARGET}_amd64.deb dist/ami/files/scylla-server-dbg_amd64.deb
|
||||
fi
|
||||
if [ ! -f dist/ami/files/scylla-jmx_all.deb ]; then
|
||||
cd build
|
||||
git clone --depth 1 https://github.com/scylladb/scylla-jmx.git
|
||||
cd scylla-jmx
|
||||
dist/debian/build_deb.sh --target $TARGET
|
||||
cd ../..
|
||||
cp build/scylla-jmx/build/debs/scylla-jmx_`cat build/scylla-jmx/build/SCYLLA-VERSION-FILE | sed 's/\.rc/~rc/'`-`cat build/scylla-jmx/build/SCYLLA-RELEASE-FILE`-0ubuntu1~${TARGET}_all.deb dist/ami/files/scylla-jmx_all.deb
|
||||
fi
|
||||
if [ ! -f dist/ami/files/scylla-tools_all.deb ]; then
|
||||
cd build
|
||||
git clone --depth 1 https://github.com/scylladb/scylla-tools-java.git
|
||||
cd scylla-tools-java
|
||||
dist/debian/build_deb.sh --target $TARGET
|
||||
cd ../..
|
||||
cp build/scylla-tools-java/build/debs/scylla-tools_`cat build/scylla-tools-java/build/SCYLLA-VERSION-FILE | sed 's/\.rc/~rc/'`-`cat build/scylla-tools-java/build/SCYLLA-RELEASE-FILE`-0ubuntu1~${TARGET}_all.deb dist/ami/files/scylla-tools_all.deb
|
||||
fi
|
||||
if [ ! -f dist/ami/files/scylla.x86_64.rpm ] || [ ! -f dist/ami/files/scylla-kernel-conf.x86_64.rpm ] || [ ! -f dist/ami/files/scylla-conf.x86_64.rpm ] || [ ! -f dist/ami/files/scylla-server.x86_64.rpm ] || [ ! -f dist/ami/files/scylla-debuginfo.x86_64.rpm ]; then
|
||||
dist/redhat/build_rpm.sh --dist --target epel-7-x86_64
|
||||
cp build/rpms/scylla-`cat build/SCYLLA-VERSION-FILE`-`cat build/SCYLLA-RELEASE-FILE`.*.x86_64.rpm dist/ami/files/scylla.x86_64.rpm
|
||||
cp build/rpms/scylla-kernel-conf-`cat build/SCYLLA-VERSION-FILE`-`cat build/SCYLLA-RELEASE-FILE`.*.x86_64.rpm dist/ami/files/scylla-kernel-conf.x86_64.rpm
|
||||
cp build/rpms/scylla-conf-`cat build/SCYLLA-VERSION-FILE`-`cat build/SCYLLA-RELEASE-FILE`.*.x86_64.rpm dist/ami/files/scylla-conf.x86_64.rpm
|
||||
cp build/rpms/scylla-server-`cat build/SCYLLA-VERSION-FILE`-`cat build/SCYLLA-RELEASE-FILE`.*.x86_64.rpm dist/ami/files/scylla-server.x86_64.rpm
|
||||
cp build/rpms/scylla-debuginfo-`cat build/SCYLLA-VERSION-FILE`-`cat build/SCYLLA-RELEASE-FILE`.*.x86_64.rpm dist/ami/files/scylla-debuginfo.x86_64.rpm
|
||||
fi
|
||||
if [ ! -f dist/ami/files/scylla-jmx.noarch.rpm ]; then
|
||||
cd build
|
||||
git clone --depth 1 https://github.com/scylladb/scylla-jmx.git
|
||||
cd scylla-jmx
|
||||
dist/redhat/build_rpm.sh --target epel-7-x86_64
|
||||
cd ../..
|
||||
cp build/scylla-jmx/build/rpms/scylla-jmx-`cat build/scylla-jmx/build/SCYLLA-VERSION-FILE`-`cat build/scylla-jmx/build/SCYLLA-RELEASE-FILE`.*.noarch.rpm dist/ami/files/scylla-jmx.noarch.rpm
|
||||
fi
|
||||
if [ ! -f dist/ami/files/scylla-tools.noarch.rpm ] || [ ! -f dist/ami/files/scylla-tools-core.noarch.rpm ]; then
|
||||
cd build
|
||||
git clone --depth 1 https://github.com/scylladb/scylla-tools-java.git
|
||||
cd scylla-tools-java
|
||||
dist/redhat/build_rpm.sh --target epel-7-x86_64
|
||||
cd ../..
|
||||
cp build/scylla-tools-java/build/rpms/scylla-tools-`cat build/scylla-tools-java/build/SCYLLA-VERSION-FILE`-`cat build/scylla-tools-java/build/SCYLLA-RELEASE-FILE`.*.noarch.rpm dist/ami/files/scylla-tools.noarch.rpm
|
||||
cp build/scylla-tools-java/build/rpms/scylla-tools-core-`cat build/scylla-tools-java/build/SCYLLA-VERSION-FILE`-`cat build/scylla-tools-java/build/SCYLLA-RELEASE-FILE`.*.noarch.rpm dist/ami/files/scylla-tools-core.noarch.rpm
|
||||
fi
|
||||
fi
|
||||
|
||||
|
||||
32 dist/ami/files/.bash_profile (vendored)
@@ -40,18 +40,11 @@ if [ `ec2_is_supported_instance_type` -eq 0 ]; then
|
||||
echo " $TYPE is not supported instance type!"
|
||||
tput sgr0
|
||||
echo -n "To continue startup ScyllaDB on this instance, run 'sudo scylla_io_setup' "
|
||||
if ! is_systemd; then
|
||||
echo "then 'initctl start scylla-server'."
|
||||
else
|
||||
echo "then 'systemctl start scylla-server'."
|
||||
fi
|
||||
echo "then 'systemctl start scylla-server'."
|
||||
echo "For a list of optimized instance types and more EC2 instructions see http://www.scylladb.com/doc/getting-started-amazon/"
|
||||
echo
|
||||
else
|
||||
SETUP=
|
||||
if is_systemd; then
|
||||
SETUP=`systemctl is-active scylla-ami-setup`
|
||||
fi
|
||||
SETUP=`systemctl is-active scylla-ami-setup`
|
||||
if [ "$SETUP" == "activating" ]; then
|
||||
tput setaf 4
|
||||
tput bold
|
||||
@@ -75,15 +68,7 @@ else
|
||||
echo " 'systemctl status scylla-ami-setup'"
|
||||
echo
|
||||
else
|
||||
if is_systemd; then
|
||||
SCYLLA=`systemctl is-active scylla-server`
|
||||
else
|
||||
if [ "`initctl status scylla-server|grep "running, process"`" != "" ]; then
|
||||
SCYLLA="active"
|
||||
else
|
||||
SCYLLA="failed"
|
||||
fi
|
||||
fi
|
||||
SCYLLA=`systemctl is-active scylla-server`
|
||||
if [ "$SCYLLA" == "activating" ]; then
|
||||
tput setaf 4
|
||||
tput bold
|
||||
@@ -108,15 +93,8 @@ else
|
||||
echo " ScyllaDB is not started!"
|
||||
tput sgr0
|
||||
echo "Please wait for startup. To see status of ScyllaDB, run "
|
||||
if ! is_systemd; then
|
||||
echo " 'initctl status scylla-server'"
|
||||
echo "and"
|
||||
echo " 'sudo cat /var/log/upstart/scylla-server.log'"
|
||||
echo
|
||||
else
|
||||
echo " 'systemctl status scylla-server'"
|
||||
echo
|
||||
fi
|
||||
echo " 'systemctl status scylla-server'"
|
||||
echo
|
||||
fi
|
||||
fi
|
||||
echo -n " "
|
||||
|
||||
2 dist/ami/files/scylla-ami (vendored)
Submodule dist/ami/files/scylla-ami updated: 36e85110ec...67293baf37
5 dist/ami/scylla.json (vendored)
@@ -64,11 +64,6 @@
|
||||
"source": "files/",
|
||||
"destination": "/home/{{user `ssh_username`}}/"
|
||||
},
|
||||
{
|
||||
"type": "file",
|
||||
"source": "../../scripts/scylla_install_pkg",
|
||||
"destination": "/home/{{user `ssh_username`}}/scylla_install_pkg"
|
||||
},
|
||||
{
|
||||
"type": "shell",
|
||||
"inline": [
|
||||
|
||||
32 dist/common/scripts/node_health_check (vendored)
@@ -28,6 +28,7 @@ OUTPUT_PATH4="$OUTPUT_PATH/data_model"
|
||||
OUTPUT_PATH5="$OUTPUT_PATH/network_checks"
|
||||
IS_FEDORA="0"
|
||||
IS_DEBIAN="0"
|
||||
IS_GENTOO="0"
|
||||
JMX_PORT="7199"
|
||||
CQL_PORT="9042"
|
||||
PRINT_DM=NO
|
||||
@@ -75,7 +76,7 @@ while getopts ":hdncap:q:" opt; do
|
||||
done
|
||||
|
||||
|
||||
##Check server release (Fedora/Oracle/Debian)##
|
||||
##Check server release (Fedora/Oracle/Debian/Gentoo)##
|
||||
cat /etc/os-release | grep -i fedora &> /dev/null
|
||||
if [ $? -ne 0 ]; then
|
||||
cat /etc/os-release | grep -i oracle &> /dev/null
|
||||
@@ -89,7 +90,12 @@ if [ $? -ne 0 ]; then
|
||||
IS_DEBIAN="1"
|
||||
fi
|
||||
|
||||
if [ "$IS_FEDORA" == "1" ] && [ "$IS_DEBIAN" == "1" ]; then
|
||||
cat /etc/os-release | grep -i gentoo &> /dev/null
|
||||
if [ $? -ne 0 ]; then
|
||||
IS_GENTOO="1"
|
||||
fi
|
||||
|
||||
if [ "$IS_FEDORA" == "1" ] && [ "$IS_DEBIAN" == "1" ] && [ "$IS_GENTOO" == "1" ]; then
|
||||
echo "This s a Non-Supported OS, Please Review the Support Matrix"
|
||||
exit 222
|
||||
fi
|
||||
@@ -108,7 +114,7 @@ if [ $? -ne 0 ]; then
|
||||
else
|
||||
echo "Scylla-server Service: OK"
|
||||
echo "--------------------------------------------------"
|
||||
fi
|
||||
fi
|
||||
|
||||
|
||||
##Scylla-JMX service status##
|
||||
@@ -125,7 +131,7 @@ if [ $? -ne 0 ]; then
|
||||
else
|
||||
echo "Scylla-JMX Service (nodetool): OK"
|
||||
echo "--------------------------------------------------"
|
||||
fi
|
||||
fi
|
||||
|
||||
|
||||
#Install 'net-tools' pkg, to be used for netstat command#
|
||||
@@ -141,6 +147,9 @@ if [ "$IS_DEBIAN" == "0" ]; then
|
||||
sudo apt-get install net-tools -y | grep already
|
||||
fi
|
||||
|
||||
if [ "$IS_GENTOO" == "0" ]; then
|
||||
sudo emerge -1uq sys-apps/ethtool sys-apps/net-tools
|
||||
fi
|
||||
|
||||
#Create dir structure to save output_files#
|
||||
echo "--------------------------------------------------"
|
||||
@@ -182,6 +191,12 @@ if [ "$IS_DEBIAN" == "0" ]; then
|
||||
cp -p /etc/default/scylla-server $OUTPUT_PATH2
|
||||
fi
|
||||
|
||||
if [ "$IS_GENTOO" == "0" ]; then
|
||||
sudo emerge -1uq app-portage/portage-utils
|
||||
sudo qlist -ICv scylla > $OUTPUT_PATH2/scylla-pkgs.txt
|
||||
cp -p /etc/default/scylla-server $OUTPUT_PATH2
|
||||
fi
|
||||
|
||||
|
||||
#Scylla Logs#
|
||||
echo "--------------------------------------------------"
|
||||
@@ -192,7 +207,11 @@ journalctl --help &> /dev/null
|
||||
if [ $? -eq 0 ]; then
|
||||
journalctl -t scylla > $OUTPUT_PATH/scylla-logs.txt
|
||||
else
|
||||
cat /var/log/syslog | grep -i scylla > $OUTPUT_PATH/scylla-logs.txt
|
||||
if [ "$IS_GENTOO" == "0" ]; then
|
||||
cat /var/log/scylla/scylla.log > $OUTPUT_PATH/scylla-logs.txt
|
||||
else
|
||||
cat /var/log/syslog | grep -i scylla > $OUTPUT_PATH/scylla-logs.txt
|
||||
fi
|
||||
fi
|
||||
|
||||
gzip -f $OUTPUT_PATH/scylla-logs.txt
|
||||
@@ -224,6 +243,7 @@ if [ "$SCYLLA_SERVICE" == "1" ]; then
|
||||
echo "Skipping Data Model Info Collection"
|
||||
echo "--------------------------------------------------"
|
||||
else
|
||||
# TODO: handle connecting with authentication
|
||||
cqlsh `hostname -i` $CQL_PORT -e "HELP" &> /dev/null
|
||||
if [ $? -eq 0 ]; then
|
||||
echo "Collecting Data Model Info (using port $CQL_PORT)"
|
||||
@@ -357,7 +377,7 @@ if [ "$IS_FEDORA" == "0" ]; then
|
||||
echo "## /etc/sysconfig/scylla-server ##" >> $REPORT
|
||||
fi
|
||||
|
||||
if [ "$IS_DEBIAN" == "0" ]; then
|
||||
if [ "$IS_DEBIAN" == "0" ] || [ "$IS_GENTOO" == "0" ]; then
|
||||
echo "## /etc/default/scylla-server ##" >> $REPORT
|
||||
fi
|
||||
|
||||
|
||||
3 dist/common/scripts/scylla_coredump_setup (vendored)
@@ -23,7 +23,6 @@ import os
|
||||
import sys
|
||||
import argparse
|
||||
import subprocess
|
||||
import shutil
|
||||
from scylla_util import *
|
||||
|
||||
if __name__ == '__main__':
|
||||
@@ -62,7 +61,7 @@ ExternalSizeMax=1024G
|
||||
with open('/etc/systemd/coredump.conf', 'w') as f:
|
||||
conf = f.write(conf_data)
|
||||
if args.dump_to_raiddir:
|
||||
shutil.rmtree('/var/lib/systemd/coredump')
|
||||
rmtree('/var/lib/systemd/coredump')
|
||||
makedirs('/var/lib/scylla/coredump')
|
||||
os.symlink('/var/lib/scylla/coredump', '/var/lib/systemd/coredump')
|
||||
run('systemctl daemon-reload')
|
||||
|
||||
4 dist/common/scripts/scylla_ec2_check (vendored)
@@ -26,9 +26,9 @@ from scylla_util import *
|
||||
|
||||
def get_en_interface_type():
|
||||
type, subtype = curl('http://169.254.169.254/latest/meta-data/instance-type').split('.')
|
||||
if type in ['c3', 'c4', 'd4', 'd2', 'i2', 'r3']:
|
||||
if type in ['c3', 'c4', 'd2', 'i2', 'r3']:
|
||||
return 'ixgbevf'
|
||||
if type in ['i3', 'p2', 'r4', 'x1']:
|
||||
if type in ['c5', 'c5d', 'f1', 'g3', 'h1', 'i3', 'm5', 'm5d', 'p2', 'p3', 'r4', 'x1']:
|
||||
return 'ena'
|
||||
if type == 'm4':
|
||||
if subtype == '16xlarge':
|
||||
|
||||
2 dist/common/scripts/scylla_fstrim_setup (vendored)
@@ -28,6 +28,8 @@ if __name__ == '__main__':
|
||||
if os.getuid() > 0:
|
||||
print('Requires root permission.')
|
||||
sys.exit(1)
|
||||
if is_systemd():
|
||||
systemd_unit('scylla-fstrim.timer').unmask()
|
||||
if is_redhat_variant():
|
||||
systemd_unit('fstrim.timer').disable()
|
||||
if dist_name() == 'Ubuntu' and os.path.exists('/etc/cron.weekly/fstrim'):
|
||||
|
||||
48 dist/common/scripts/scylla_raid_setup (vendored)
@@ -23,6 +23,8 @@ import os
|
||||
import argparse
|
||||
import pwd
|
||||
import grp
|
||||
import sys
|
||||
import stat
|
||||
from scylla_util import *
|
||||
|
||||
if __name__ == '__main__':
|
||||
@@ -40,6 +42,8 @@ if __name__ == '__main__':
|
||||
help='specify the root of the tree')
|
||||
parser.add_argument('--volume-role', default='all',
|
||||
help='specify how will this device be used (data, commitlog, or all)')
|
||||
parser.add_argument('--force-raid', action='store_true', default=False,
|
||||
help='force constructing RAID when only one disk is specified')
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
@@ -60,6 +64,12 @@ if __name__ == '__main__':
|
||||
if not os.path.exists(disk):
|
||||
print('{} is not found'.format(disk))
|
||||
sys.exit(1)
|
||||
if not stat.S_ISBLK(os.stat(disk).st_mode):
|
||||
print('{} is not block device'.format(disk))
|
||||
sys.exit(1)
|
||||
if not is_unused_disk(disk):
|
||||
print('{} is busy'.format(disk))
|
||||
sys.exit(1)
|
||||
|
||||
if os.path.exists(args.raiddev):
|
||||
print('{} is already using'.format(args.raiddev))
|
||||
@@ -74,12 +84,20 @@ if __name__ == '__main__':
|
||||
elif is_gentoo_variant():
|
||||
run('emerge -uq sys-fs/mdadm sys-fs/xfsprogs')
|
||||
|
||||
print('Creating RAID0 for scylla using {nr_disk} disk(s): {disks}'.format(nr_disk=len(disks), disks=args.disks))
|
||||
if len(disks) == 1 and not args.force_raid:
|
||||
raid = False
|
||||
fsdev = disks[0]
|
||||
else:
|
||||
raid = True
|
||||
fsdev = args.raiddev
|
||||
|
||||
print('Creating {type} for scylla using {nr_disk} disk(s): {disks}'.format(type='RAID0' if raid else 'XFS volume', nr_disk=len(disks), disks=args.disks))
|
||||
if dist_name() == 'Ubuntu' and dist_ver() == '14.04':
|
||||
run('udevadm settle')
|
||||
run('mdadm --create --verbose --force --run {raid} --level=0 -c1024 --raid-devices={nr_disk} {disks}'.format(raid=args.raiddev, nr_disk=len(disks), disks=args.disks.replace(',', ' ')))
|
||||
run('udevadm settle')
|
||||
run('mkfs.xfs {} -f'.format(args.raiddev))
|
||||
if raid:
|
||||
run('udevadm settle')
|
||||
run('mdadm --create --verbose --force --run {raid} --level=0 -c1024 --raid-devices={nr_disk} {disks}'.format(raid=fsdev, nr_disk=len(disks), disks=args.disks.replace(',', ' ')))
|
||||
run('udevadm settle')
|
||||
run('mkfs.xfs {} -f'.format(fsdev))
|
||||
else:
|
||||
procs=[]
|
||||
for disk in disks:
|
||||
@@ -93,22 +111,24 @@ if __name__ == '__main__':
|
||||
procs.append(proc)
|
||||
for proc in procs:
|
||||
proc.wait()
|
||||
run('udevadm settle')
|
||||
run('mdadm --create --verbose --force --run {raid} --level=0 -c1024 --raid-devices={nr_disk} {disks}'.format(raid=args.raiddev, nr_disk=len(disks), disks=args.disks.replace(',', ' ')))
|
||||
run('udevadm settle')
|
||||
run('mkfs.xfs {} -f -K'.format(args.raiddev))
|
||||
if raid:
|
||||
run('udevadm settle')
|
||||
run('mdadm --create --verbose --force --run {raid} --level=0 -c1024 --raid-devices={nr_disk} {disks}'.format(raid=fsdev, nr_disk=len(disks), disks=args.disks.replace(',', ' ')))
|
||||
run('udevadm settle')
|
||||
run('mkfs.xfs {} -f -K'.format(fsdev))
|
||||
|
||||
if is_debian_variant():
|
||||
confpath = '/etc/mdadm/mdadm.conf'
|
||||
else:
|
||||
confpath = '/etc/mdadm.conf'
|
||||
|
||||
res = out('mdadm --detail --scan')
|
||||
with open(confpath, 'w') as f:
|
||||
f.write(res)
|
||||
if raid:
|
||||
res = out('mdadm --detail --scan')
|
||||
with open(confpath, 'w') as f:
|
||||
f.write(res)
|
||||
|
||||
makedirs(mount_at)
|
||||
run('mount -t xfs -o noatime {raid} "{mount_at}"'.format(raid=args.raiddev, mount_at=mount_at))
|
||||
run('mount -t xfs -o noatime {raid} "{mount_at}"'.format(raid=fsdev, mount_at=mount_at))
|
||||
|
||||
makedirs('{}/data'.format(root))
|
||||
makedirs('{}/commitlog'.format(root))
|
||||
@@ -122,7 +142,7 @@ if __name__ == '__main__':
|
||||
os.chown('{}/coredump'.format(root), uid, gid)
|
||||
|
||||
if args.update_fstab:
|
||||
res = out('blkid {}'.format(args.raiddev))
|
||||
res = out('blkid {}'.format(fsdev))
|
||||
match = re.search(r'^/dev/\S+: (UUID="\S+")', res.strip())
|
||||
uuid = match.group(1)
|
||||
with open('/etc/fstab', 'a') as f:
|
||||
|
||||
38
dist/common/scripts/scylla_setup
vendored
@@ -22,7 +22,6 @@
|
||||
import os
|
||||
import sys
|
||||
import argparse
|
||||
import logging
|
||||
import glob
|
||||
import shutil
|
||||
import io
|
||||
@@ -54,6 +53,8 @@ def do_verify_package(pkg):
|
||||
res = run('dpkg -s {}'.format(pkg), silent=True, exception=False)
|
||||
elif is_redhat_variant():
|
||||
res = run('rpm -q {}'.format(pkg), silent=True, exception=False)
|
||||
elif is_gentoo_variant():
|
||||
res = 1 if len(glob.glob('/var/db/pkg/*/{}-*'.format(pkg))) else 0
|
||||
if res != 0:
|
||||
print('{} package is not installed.'.format(pkg))
|
||||
sys.exit(1)
|
||||
@@ -67,22 +68,18 @@ def list_block_devices():
|
||||
devices = []
|
||||
for p in ['/dev/sd*', '/dev/hd*', '/dev/xvd*', '/dev/nvme*', '/dev/mapper/*']:
|
||||
devices.extend([d for d in glob.glob(p) if d != '/dev/mapper/control'])
|
||||
return devices
|
||||
return devices
|
||||
|
||||
def get_unused_disks():
|
||||
unused = []
|
||||
for dev in list_block_devices():
|
||||
with open('/proc/mounts') as f:
|
||||
s = f.read().strip()
|
||||
count_raw = len(re.findall('^{} '.format(dev), s, flags=re.MULTILINE))
|
||||
count_pvs = 0
|
||||
if shutil.which('pvs'):
|
||||
s = out('pvs -o pv_name --nohead')
|
||||
count_pvs = len(re.findall(dev, s, flags=re.MULTILINE))
|
||||
s = out('swapon --show=NAME --noheadings')
|
||||
count_swap = len(re.findall(dev, s, flags=re.MULTILINE))
|
||||
if count_raw + count_pvs + count_swap == 0:
|
||||
unused.append(dev)
|
||||
# dev contains partitions
|
||||
if len(glob.glob('/sys/class/block/{dev}/{dev}*'.format(dev=dev.replace('/dev/','')))) > 0:
|
||||
continue
|
||||
# dev is used
|
||||
if not is_unused_disk(dev):
|
||||
continue
|
||||
unused.append(dev)
|
||||
return unused
|
||||
|
||||
def run_setup_script(name, script):
|
||||
@@ -99,7 +96,7 @@ def run_setup_script(name, script):
|
||||
|
||||
if __name__ == '__main__':
|
||||
if os.getuid() > 0:
|
||||
logging.error('Requires root permission.')
|
||||
print('Requires root permission.')
|
||||
sys.exit(1)
|
||||
parser = argparse.ArgumentParser(description='Configure environment for Scylla.')
|
||||
parser.add_argument('--disks',
|
||||
@@ -202,7 +199,6 @@ if __name__ == '__main__':
|
||||
if enable_service:
|
||||
if is_systemd():
|
||||
systemd_unit('scylla-server.service').enable()
|
||||
systemd_unit('scylla-fstrim.timer').unmask()
|
||||
elif is_gentoo_variant():
|
||||
run('rc-update add scylla-server default')
|
||||
|
||||
@@ -277,10 +273,14 @@ if __name__ == '__main__':
|
||||
else:
|
||||
print('Please select unmounted disks from the following list: {}'.format(devices))
|
||||
selected = []
|
||||
dsklist = []
|
||||
while len(devices):
|
||||
print('type \'cancel\' to cancel RAID/XFS setup.')
|
||||
print('type \'done\' to finish selection. Selected: {}'.format(selected))
|
||||
dsk = input('> ')
|
||||
if len(dsklist) > 0:
|
||||
dsk = dsklist.pop(0)
|
||||
else:
|
||||
dsk = input('> ')
|
||||
if dsk == 'cancel':
|
||||
raid_setup = 0
|
||||
break
|
||||
@@ -290,12 +290,16 @@ if __name__ == '__main__':
|
||||
break
|
||||
if dsk == '':
|
||||
continue
|
||||
if dsk.find(',') > 0:
|
||||
dsklist = dsk.split(',')
|
||||
continue
|
||||
if not os.path.exists(dsk):
|
||||
print('{} not found'.format(dsk))
|
||||
continue
|
||||
if not stat.S_ISBLK(os.stat(dsk).st_mode):
|
||||
print('{} is not block device'.format(dsk))
|
||||
selected += dsk
|
||||
continue
|
||||
selected.append(dsk)
|
||||
devices.remove(dsk)
|
||||
disks = ','.join(selected)
|
||||
if raid_setup:
|
||||
|
||||
4
dist/common/scripts/scylla_sysconfig_setup
vendored
@@ -64,6 +64,10 @@ if __name__ == '__main__':
|
||||
help='AMI instance mode')
|
||||
args = parser.parse_args()
|
||||
|
||||
if args.nic and not os.path.exists('/sys/class/net/{}'.format(args.nic)):
|
||||
print('NIC {} not found.'.format(args.nic))
|
||||
sys.exit(1)
|
||||
|
||||
ifname = args.nic if args.nic else cfg.get('IFNAME')
|
||||
network_mode = args.mode if args.mode else cfg.get('NETWORK_MODE')
|
||||
|
||||
|
||||
37
dist/common/scripts/scylla_util.py
vendored
@@ -27,6 +27,7 @@ import platform
|
||||
import configparser
|
||||
import io
|
||||
import shlex
|
||||
import shutil
|
||||
|
||||
def curl(url):
|
||||
max_retries = 5
|
||||
@@ -306,17 +307,53 @@ def makedirs(name):
|
||||
if not os.path.isdir(name):
|
||||
os.makedirs(name)
|
||||
|
||||
def rmtree(path):
|
||||
if not os.path.islink(path):
|
||||
shutil.rmtree(path)
|
||||
else:
|
||||
os.remove(path)
|
||||
|
||||
def dist_name():
|
||||
return platform.dist()[0]
|
||||
|
||||
def dist_ver():
|
||||
return platform.dist()[1]
|
||||
|
||||
def is_unused_disk(dev):
|
||||
# dev is not in /sys/class/block/
|
||||
if not os.path.isdir('/sys/class/block/{dev}'.format(dev=dev.replace('/dev/',''))):
|
||||
return False
|
||||
# dev is mounted
|
||||
with open('/proc/mounts') as f:
|
||||
s = f.read().strip()
|
||||
if len(re.findall('^{} '.format(dev), s, flags=re.MULTILINE)) > 0:
|
||||
return False
|
||||
# dev is used in LVM
|
||||
if shutil.which('pvs'):
|
||||
s = out('pvs -o pv_name --nohead')
|
||||
if len(re.findall(dev, s, flags=re.MULTILINE)) > 0:
|
||||
return False
|
||||
# dev is used for swap
|
||||
s = out('swapon --show=NAME --noheadings')
|
||||
if len(re.findall(dev, s, flags=re.MULTILINE)) > 0:
|
||||
return False
|
||||
# dev is used in MDRAID
|
||||
if os.path.exists('/proc/mdstat'):
|
||||
with open('/proc/mdstat') as f:
|
||||
s = f.read().strip()
|
||||
if len(re.findall(dev, s, flags=re.MULTILINE)) > 0:
|
||||
return False
|
||||
return True
|
||||
|
||||
class SystemdException(Exception):
|
||||
pass
|
||||
|
||||
class systemd_unit:
|
||||
def __init__(self, unit):
|
||||
try:
|
||||
run('systemctl cat {}'.format(unit), silent=True)
|
||||
except subprocess.CalledProcessError:
|
||||
raise SystemdException('unit {} not found'.format(unit))
|
||||
self._unit = unit
|
||||
|
||||
def start(self):
|
||||
|
||||
24
dist/debian/build_deb.sh
vendored
@@ -51,6 +51,18 @@ is_redhat_variant() {
|
||||
is_debian_variant() {
|
||||
[ -f /etc/debian_version ]
|
||||
}
|
||||
is_debian() {
|
||||
case "$1" in
|
||||
jessie|stretch) return 0;;
|
||||
*) return 1;;
|
||||
esac
|
||||
}
|
||||
is_ubuntu() {
|
||||
case "$1" in
|
||||
trusty|xenial|bionic) return 0;;
|
||||
*) return 1;;
|
||||
esac
|
||||
}
|
||||
|
||||
|
||||
pkg_install() {
|
||||
@@ -99,7 +111,7 @@ if [ ! -f /usr/bin/dh_testdir ]; then
|
||||
fi
|
||||
if [ ! -f /usr/bin/pystache ]; then
|
||||
if is_redhat_variant; then
|
||||
sudo yum install -y python2-pystache || sudo yum install -y pystache
|
||||
sudo yum install -y /usr/bin/pystache
|
||||
elif is_debian_variant; then
|
||||
sudo apt-get install -y python-pystache
|
||||
fi
|
||||
@@ -125,12 +137,12 @@ echo $VERSION > version
|
||||
|
||||
cp -a dist/debian/debian debian
|
||||
cp dist/common/sysconfig/scylla-server debian/scylla-server.default
|
||||
if [ "$TARGET" = "jessie" ] || [ "$TARGET" = "stretch" ]; then
|
||||
REVISION="1~$TARGET"
|
||||
elif [ "$TARGET" = "trusty" ]; then
|
||||
if [ "$TARGET" = "trusty" ]; then
|
||||
cp dist/debian/scylla-server.cron.d debian/
|
||||
REVISION="0ubuntu1~$TARGET"
|
||||
elif [ "$TARGET" = "xenial" ] || [ "$TARGET" = "bionic" ]; then
|
||||
fi
|
||||
if is_debian $TARGET; then
|
||||
REVISION="1~$TARGET"
|
||||
elif is_ubuntu $TARGET; then
|
||||
REVISION="0ubuntu1~$TARGET"
|
||||
else
|
||||
echo "Unknown distribution: $TARGET"
|
||||
|
||||
82
docs/protocol-extensions.md
Normal file
@@ -0,0 +1,82 @@
|
||||
Protocol extensions to the Cassandra Native Protocol
====================================================

This document specifies extensions to the protocol defined
by Cassandra's native_protocol_v4.spec and native_protocol_v5.spec.
The extensions are designed so that a driver supporting them can
continue to interoperate with Cassandra and other compatible servers
with no configuration needed; the driver can discover the extensions
and enable them conditionally.

An extension can be discovered by using the OPTIONS request; the
returned SUPPORTED response will have zero or more options beginning
with SCYLLA, indicating extensions defined in this document, in
addition to options documented by Cassandra. How to use each extension
is further explained in this document.

# Intranode sharding

This extension allows the driver to discover how Scylla internally
partitions data among logical cores. It can then create at least
one connection per logical core, and send queries directly to the
logical core that will serve them, greatly improving load balancing
and efficiency.

To use the extension, send the OPTIONS message. The data is returned
in the SUPPORTED message, as a set of key/value options. Numeric values
are returned as their base-10 ASCII representation.

The keys and values are:
- `SCYLLA_SHARD` is an integer, the zero-based shard number this connection
  is connected to (for example, `3`).
- `SCYLLA_NR_SHARDS` is an integer containing the number of shards on this
  node (for example, `12`). All shard numbers are smaller than this number.
- `SCYLLA_PARTITIONER` is the fully-qualified name of the partitioner in use (e.g.
  `org.apache.cassandra.partitioners.Murmur3Partitioner`).
- `SCYLLA_SHARDING_ALGORITHM` is the name of an algorithm used to select how
  partitions are mapped into shards (described below).
- `SCYLLA_SHARDING_IGNORE_MSB` is an integer parameter to the algorithm (also
  described below).
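Putting discovery together: once a driver has parsed the SUPPORTED response into some
key/value structure, checking for the extension amounts to looking up the `SCYLLA_*`
keys. The sketch below is illustrative only; the `options_map` type is a made-up
stand-in for whatever structure a driver actually uses, not an API defined by this
document.

```c++
#include <map>
#include <optional>
#include <string>

// Hypothetical representation of a parsed SUPPORTED response: option name -> value.
using options_map = std::multimap<std::string, std::string>;

// Returns the advertised shard count, or nullopt when the server (for example,
// Cassandra) does not send the SCYLLA_* options, in which case the driver simply
// keeps its usual connection policy.
std::optional<unsigned> advertised_shard_count(const options_map& supported) {
    auto it = supported.find("SCYLLA_NR_SHARDS");
    if (it == supported.end()) {
        return std::nullopt;
    }
    return unsigned(std::stoul(it->second)); // numeric values are base-10 ASCII
}
```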
Currently, one `SCYLLA_SHARDING_ALGORITHM` is defined,
`biased-token-round-robin`. To apply the algorithm,
perform the following steps (assuming infinite-precision arithmetic):

- subtract the minimum token value from the partition's token
  in order to bias it: `biased_token = token - (-2**63)`
- shift `biased_token` left by `ignore_msb` bits, discarding any
  bits beyond the 63rd:
  `biased_token = (biased_token << SCYLLA_SHARDING_IGNORE_MSB) % (2**64)`
- multiply by `SCYLLA_NR_SHARDS` and perform a truncating division by 2**64:
  `shard = (biased_token * SCYLLA_NR_SHARDS) / 2**64`

(This apparently convoluted algorithm replaces a slow division instruction with
a fast multiply instruction.)

In C, with 128-bit arithmetic support, these operations can be performed
efficiently in three steps:

```c++
uint64_t biased_token = token + ((uint64_t)1 << 63);
biased_token <<= ignore_msb;
int shard = ((unsigned __int128)biased_token * nr_shards) >> 64;
```

In languages without 128-bit arithmetic support, use the following (this example
is for Java):

```Java
private int scyllaShardOf(long token) {
    token += Long.MIN_VALUE;
    token <<= ignoreMsb;
    long tokLo = token & 0xffffffffL;
    long tokHi = (token >>> 32) & 0xffffffffL;
    long mul1 = tokLo * nrShards;
    long mul2 = tokHi * nrShards;
    long sum = (mul1 >>> 32) + mul2;
    return (int)(sum >>> 32);
}
```
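For a concrete end-to-end check of the arithmetic, the C snippet above can be wrapped
into a small self-contained program. The `nr_shards` and `ignore_msb` values below are
made-up sample parameters, not values any particular cluster returns, and the program
assumes a compiler with `__int128` support (GCC or Clang), like the snippet above.

```c++
#include <cstdint>
#include <cstdio>

// Sample parameters for illustration only; real values come from the
// SCYLLA_NR_SHARDS and SCYLLA_SHARDING_IGNORE_MSB options.
static const unsigned nr_shards = 12;
static const unsigned ignore_msb = 12;

static unsigned shard_of(int64_t token) {
    // Bias the token so the minimum token maps to 0.
    uint64_t biased_token = (uint64_t)token + ((uint64_t)1 << 63);
    // Drop the ignored most-significant bits (the shift wraps modulo 2**64).
    biased_token <<= ignore_msb;
    // Scale into [0, nr_shards) with a multiplication instead of a division.
    return (unsigned)(((unsigned __int128)biased_token * nr_shards) >> 64);
}

int main() {
    std::printf("token %lld -> shard %u\n", (long long)INT64_MIN, shard_of(INT64_MIN));
    std::printf("token 0 -> shard %u\n", shard_of(0));
    return 0;
}
```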

It is recommended that drivers open connections until they have at
least one connection per shard, then close excess connections.
||||
@@ -92,7 +92,7 @@ public:
|
||||
imr::member<tags::back_pointer, imr::tagged_type<tags::back_pointer, imr::pod<basic_object*>>>,
|
||||
imr::member<tags::object, Structure>
|
||||
>;
|
||||
|
||||
static constexpr size_t size_overhead = sizeof(basic_object*);
|
||||
private:
|
||||
explicit object(uint8_t* ptr) noexcept
|
||||
: basic_object(ptr)
|
||||
|
||||
2
keys.cc
@@ -113,4 +113,4 @@ int32_t weight(bound_kind k) {
|
||||
abort();
|
||||
}
|
||||
|
||||
const thread_local clustering_key_prefix bound_view::empty_prefix = clustering_key::make_empty();
|
||||
const thread_local clustering_key_prefix bound_view::_empty_prefix = clustering_key::make_empty();
|
||||
|
||||
@@ -164,6 +164,30 @@ abstract_replication_strategy::get_primary_ranges(inet_address ep) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
dht::token_range_vector
|
||||
abstract_replication_strategy::get_primary_ranges_within_dc(inet_address ep) {
|
||||
dht::token_range_vector ret;
|
||||
sstring local_dc = _snitch->get_datacenter(ep);
|
||||
std::unordered_set<inet_address> local_dc_nodes = _token_metadata.get_topology().get_datacenter_endpoints().at(local_dc);
|
||||
auto prev_tok = _token_metadata.sorted_tokens().back();
|
||||
for (auto tok : _token_metadata.sorted_tokens()) {
|
||||
auto&& eps = calculate_natural_endpoints(tok, _token_metadata);
|
||||
// Unlike get_primary_ranges() which checks if ep is the first
|
||||
// owner of this range, here we check if ep is the first just
|
||||
// among nodes which belong to the local dc of ep.
|
||||
for (auto& e : eps) {
|
||||
if (local_dc_nodes.count(e)) {
|
||||
if (e == ep) {
|
||||
insert_token_range_to_sorted_container_while_unwrapping(prev_tok, tok, ret);
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
prev_tok = tok;
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
std::unordered_multimap<inet_address, dht::token_range>
|
||||
abstract_replication_strategy::get_address_ranges(token_metadata& tm) const {
|
||||
std::unordered_multimap<inet_address, dht::token_range> ret;
|
||||
|
||||
@@ -113,6 +113,10 @@ public:
|
||||
// This function is the analogue of Origin's
|
||||
// StorageService.getPrimaryRangesForEndpoint().
|
||||
dht::token_range_vector get_primary_ranges(inet_address ep);
|
||||
// get_primary_ranges_within_dc() is similar to get_primary_ranges()
|
||||
// except it assigns a primary node for each range within each dc,
|
||||
// instead of one node globally.
|
||||
dht::token_range_vector get_primary_ranges_within_dc(inet_address ep);
|
||||
|
||||
std::unordered_multimap<inet_address, dht::token_range> get_address_ranges(token_metadata& tm) const;
|
||||
|
||||
|
||||
17
main.cc
@@ -389,7 +389,13 @@ int main(int ac, char** av) {
|
||||
sstring broadcast_address = cfg->broadcast_address();
|
||||
sstring broadcast_rpc_address = cfg->broadcast_rpc_address();
|
||||
stdx::optional<std::vector<sstring>> hinted_handoff_enabled = cfg->experimental() ? parse_hinted_handoff_enabled(cfg->hinted_handoff_enabled()) : stdx::nullopt;
|
||||
auto prom_addr = seastar::net::dns::get_host_by_name(cfg->prometheus_address()).get0();
|
||||
auto prom_addr = [&] {
|
||||
try {
|
||||
return seastar::net::dns::get_host_by_name(cfg->prometheus_address()).get0();
|
||||
} catch (...) {
|
||||
std::throw_with_nested(std::runtime_error(fmt::format("Unable to resolve prometheus_address {}", cfg->prometheus_address())));
|
||||
}
|
||||
}();
|
||||
supervisor::notify("starting prometheus API server");
|
||||
uint16_t pport = cfg->prometheus_port();
|
||||
if (pport) {
|
||||
@@ -467,7 +473,13 @@ int main(int ac, char** av) {
|
||||
// #293 - do not stop anything
|
||||
// engine().at_exit([] { return i_endpoint_snitch::stop_snitch(); });
|
||||
supervisor::notify("determining DNS name");
|
||||
auto e = seastar::net::dns::get_host_by_name(api_address).get0();
|
||||
auto e = [&] {
|
||||
try {
|
||||
return seastar::net::dns::get_host_by_name(api_address).get0();
|
||||
} catch (...) {
|
||||
std::throw_with_nested(std::runtime_error(fmt::format("Unable to resolve api_address {}", api_address)));
|
||||
}
|
||||
}();
|
||||
supervisor::notify("starting API server");
|
||||
auto ip = e.addr_list.front();
|
||||
ctx.http_server.start("API").get();
|
||||
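The two hunks above apply the same resolve-or-explain pattern to prometheus_address and api_address. As a rough sketch (the helper below is hypothetical, not a function added by this series), the repeated lambda could be factored out like this:

```c++
// Hypothetical helper (not part of this series): resolve a configured host
// name, wrapping any DNS failure with a message naming the config option that
// supplied it. Assumes it runs in a seastar thread, like the surrounding
// main() code that calls get0().
static auto resolve_or_explain(const sstring& name, const char* option) {
    try {
        return seastar::net::dns::get_host_by_name(name).get0();
    } catch (...) {
        std::throw_with_nested(std::runtime_error(
            fmt::format("Unable to resolve {} {}", option, name)));
    }
}

// Usage, mirroring the hunks above:
//   auto prom_addr = resolve_or_explain(cfg->prometheus_address(), "prometheus_address");
//   auto e = resolve_or_explain(api_address, "api_address");
```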
@@ -490,6 +502,7 @@ int main(int ac, char** av) {
|
||||
}
|
||||
};
|
||||
dbcfg.compaction_scheduling_group = make_sched_group("compaction", 1000);
|
||||
dbcfg.memory_compaction_scheduling_group = make_sched_group("mem_compaction", 1000);
|
||||
dbcfg.streaming_scheduling_group = make_sched_group("streaming", 200);
|
||||
dbcfg.statement_scheduling_group = make_sched_group("statement", 1000);
|
||||
dbcfg.memtable_scheduling_group = make_sched_group("memtable", 1000);
|
||||
|
||||
17
memtable.cc
@@ -27,11 +27,11 @@
|
||||
#include "schema_upgrader.hh"
|
||||
#include "partition_builder.hh"
|
||||
|
||||
memtable::memtable(schema_ptr schema, dirty_memory_manager& dmm, memtable_list* memtable_list)
|
||||
memtable::memtable(schema_ptr schema, dirty_memory_manager& dmm, memtable_list* memtable_list,
|
||||
seastar::scheduling_group compaction_scheduling_group)
|
||||
: logalloc::region(dmm.region_group())
|
||||
, _dirty_mgr(dmm)
|
||||
, _memtable_cleaner(*this, no_cache_tracker)
|
||||
, _cleaner(&_memtable_cleaner)
|
||||
, _cleaner(*this, no_cache_tracker, compaction_scheduling_group)
|
||||
, _memtable_list(memtable_list)
|
||||
, _schema(std::move(schema))
|
||||
, partitions(memtable_entry::compare(_schema)) {
|
||||
@@ -56,10 +56,9 @@ void memtable::clear() noexcept {
|
||||
auto dirty_before = dirty_size();
|
||||
with_allocator(allocator(), [this] {
|
||||
partitions.clear_and_dispose([this] (memtable_entry* e) {
|
||||
e->partition().evict(_memtable_cleaner);
|
||||
e->partition().evict(_cleaner);
|
||||
current_deleter<memtable_entry>()(e);
|
||||
});
|
||||
_memtable_cleaner.clear();
|
||||
});
|
||||
remove_flushed_memory(dirty_before - dirty_size());
|
||||
}
|
||||
@@ -322,7 +321,7 @@ public:
|
||||
_delegate = delegate_reader(*_delegate_range, _slice, _pc, streamed_mutation::forwarding::no, _fwd_mr);
|
||||
} else {
|
||||
auto key_and_snp = read_section()(region(), [&] {
|
||||
return with_linearized_managed_bytes([&] () -> std::optional<std::pair<dht::decorated_key, lw_shared_ptr<partition_snapshot>>> {
|
||||
return with_linearized_managed_bytes([&] () -> std::optional<std::pair<dht::decorated_key, partition_snapshot_ptr>> {
|
||||
memtable_entry *e = fetch_entry();
|
||||
if (!e) {
|
||||
return { };
|
||||
@@ -484,7 +483,7 @@ private:
|
||||
void get_next_partition() {
|
||||
uint64_t component_size = 0;
|
||||
auto key_and_snp = read_section()(region(), [&] {
|
||||
return with_linearized_managed_bytes([&] () -> std::optional<std::pair<dht::decorated_key, lw_shared_ptr<partition_snapshot>>> {
|
||||
return with_linearized_managed_bytes([&] () -> std::optional<std::pair<dht::decorated_key, partition_snapshot_ptr>> {
|
||||
memtable_entry* e = fetch_entry();
|
||||
if (e) {
|
||||
auto dk = e->key();
|
||||
@@ -550,7 +549,7 @@ public:
|
||||
}
|
||||
};
|
||||
|
||||
lw_shared_ptr<partition_snapshot> memtable_entry::snapshot(memtable& mtbl) {
|
||||
partition_snapshot_ptr memtable_entry::snapshot(memtable& mtbl) {
|
||||
return _pe.read(mtbl.region(), mtbl.cleaner(), _schema, no_cache_tracker);
|
||||
}
|
||||
|
||||
@@ -564,7 +563,7 @@ memtable::make_flat_reader(schema_ptr s,
|
||||
mutation_reader::forwarding fwd_mr) {
|
||||
if (query::is_single_partition(range)) {
|
||||
const query::ring_position& pos = range.start()->value();
|
||||
auto snp = _read_section(*this, [&] () -> lw_shared_ptr<partition_snapshot> {
|
||||
auto snp = _read_section(*this, [&] () -> partition_snapshot_ptr {
|
||||
managed_bytes::linearization_context_guard lcg;
|
||||
auto i = partitions.find(pos, memtable_entry::compare(_schema));
|
||||
if (i != partitions.end()) {
|
||||
|
||||
10
memtable.hh
@@ -66,7 +66,7 @@ public:
|
||||
partition_entry& partition() { return _pe; }
|
||||
const schema_ptr& schema() const { return _schema; }
|
||||
schema_ptr& schema() { return _schema; }
|
||||
lw_shared_ptr<partition_snapshot> snapshot(memtable& mtbl);
|
||||
partition_snapshot_ptr snapshot(memtable& mtbl);
|
||||
|
||||
size_t external_memory_usage_without_rows() const {
|
||||
return _key.key().external_memory_usage();
|
||||
@@ -125,8 +125,7 @@ public:
|
||||
bi::compare<memtable_entry::compare>>;
|
||||
private:
|
||||
dirty_memory_manager& _dirty_mgr;
|
||||
mutation_cleaner _memtable_cleaner;
|
||||
mutation_cleaner* _cleaner; // will switch to cache's cleaner after memtable is moved to cache.
|
||||
mutation_cleaner _cleaner;
|
||||
memtable_list *_memtable_list;
|
||||
schema_ptr _schema;
|
||||
logalloc::allocating_section _read_section;
|
||||
@@ -254,7 +253,8 @@ private:
|
||||
void clear() noexcept;
|
||||
uint64_t dirty_size() const;
|
||||
public:
|
||||
explicit memtable(schema_ptr schema, dirty_memory_manager&, memtable_list *memtable_list = nullptr);
|
||||
explicit memtable(schema_ptr schema, dirty_memory_manager&, memtable_list *memtable_list = nullptr,
|
||||
seastar::scheduling_group compaction_scheduling_group = seastar::current_scheduling_group());
|
||||
// Used for testing that want to control the flush process.
|
||||
explicit memtable(schema_ptr schema);
|
||||
~memtable();
|
||||
@@ -294,7 +294,7 @@ public:
|
||||
}
|
||||
|
||||
mutation_cleaner& cleaner() {
|
||||
return *_cleaner;
|
||||
return _cleaner;
|
||||
}
|
||||
public:
|
||||
memtable_list* get_memtable_list() {
|
||||
|
||||
226
multishard_writer.cc
Normal file
@@ -0,0 +1,226 @@
|
||||
/*
|
||||
* Copyright (C) 2018 ScyllaDB
|
||||
*/
|
||||
|
||||
/*
|
||||
* This file is part of Scylla.
|
||||
*
|
||||
* Scylla is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Affero General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* Scylla is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#include "multishard_writer.hh"
|
||||
#include "mutation_reader.hh"
|
||||
#include "mutation_fragment.hh"
|
||||
#include "schema_registry.hh"
|
||||
#include <vector>
|
||||
#include <seastar/core/future-util.hh>
|
||||
#include <seastar/core/queue.hh>
|
||||
|
||||
class queue_reader final : public flat_mutation_reader::impl {
|
||||
seastar::queue<mutation_fragment_opt>& _mq;
|
||||
public:
|
||||
queue_reader(schema_ptr s, seastar::queue<mutation_fragment_opt>& mq)
|
||||
: impl(std::move(s))
|
||||
, _mq(mq) {
|
||||
}
|
||||
virtual future<> fill_buffer(db::timeout_clock::time_point) override {
|
||||
return do_until([this] { return is_end_of_stream() || is_buffer_full(); }, [this] {
|
||||
return _mq.pop_eventually().then([this] (mutation_fragment_opt mopt) {
|
||||
if (!mopt) {
|
||||
_end_of_stream = true;
|
||||
} else {
|
||||
push_mutation_fragment(std::move(*mopt));
|
||||
}
|
||||
});
|
||||
});
|
||||
}
|
||||
virtual void next_partition() override {
|
||||
throw std::bad_function_call();
|
||||
}
|
||||
virtual future<> fast_forward_to(const dht::partition_range&, db::timeout_clock::time_point) override {
|
||||
throw std::bad_function_call();
|
||||
}
|
||||
virtual future<> fast_forward_to(position_range, db::timeout_clock::time_point) override {
|
||||
throw std::bad_function_call();
|
||||
}
|
||||
};
|
||||
|
||||
class shard_writer {
|
||||
private:
|
||||
schema_ptr _s;
|
||||
flat_mutation_reader _reader;
|
||||
std::function<future<> (flat_mutation_reader reader)> _consumer;
|
||||
public:
|
||||
shard_writer(schema_ptr s,
|
||||
flat_mutation_reader reader,
|
||||
std::function<future<> (flat_mutation_reader reader)> consumer);
|
||||
future<> consume();
|
||||
};
|
||||
|
||||
// The multishard_writer class gets mutation_fragments generated from
|
||||
// flat_mutation_reader and consumes the mutation_fragments with
|
||||
// multishard_writer::_consumer. If the mutation_fragment does not belong to
|
||||
// the shard multishard_writer is on, it will forward the mutation_fragment to
|
||||
// the correct shard. The future returned by multishard_writer() becomes
|
||||
// ready when all the mutation_fragments are consumed.
|
||||
class multishard_writer {
|
||||
private:
|
||||
schema_ptr _s;
|
||||
dht::i_partitioner& _partitioner;
|
||||
std::vector<foreign_ptr<std::unique_ptr<shard_writer>>> _shard_writers;
|
||||
std::vector<future<>> _pending_consumers;
|
||||
std::vector<seastar::queue<mutation_fragment_opt>> _queues;
|
||||
unsigned _current_shard = -1;
|
||||
uint64_t _consumed_partitions = 0;
|
||||
flat_mutation_reader _producer;
|
||||
std::function<future<> (flat_mutation_reader)> _consumer;
|
||||
private:
|
||||
unsigned shard_for_mf(const mutation_fragment& mf) {
|
||||
return _partitioner.shard_of(mf.as_partition_start().key().token());
|
||||
}
|
||||
future<> make_shard_writer(unsigned shard);
|
||||
future<stop_iteration> handle_mutation_fragment(mutation_fragment mf);
|
||||
future<stop_iteration> handle_end_of_stream();
|
||||
future<> consume(unsigned shard);
|
||||
future<> wait_pending_consumers();
|
||||
future<> distribute_mutation_fragments();
|
||||
public:
|
||||
multishard_writer(
|
||||
schema_ptr s,
|
||||
dht::i_partitioner& partitioner,
|
||||
flat_mutation_reader producer,
|
||||
std::function<future<> (flat_mutation_reader)> consumer);
|
||||
future<uint64_t> operator()();
|
||||
};
|
||||
|
||||
shard_writer::shard_writer(schema_ptr s,
|
||||
flat_mutation_reader reader,
|
||||
std::function<future<> (flat_mutation_reader reader)> consumer)
|
||||
: _s(s)
|
||||
, _reader(std::move(reader))
|
||||
, _consumer(std::move(consumer)) {
|
||||
}
|
||||
|
||||
future<> shard_writer::consume() {
|
||||
return _reader.peek().then([this] (mutation_fragment* mf_ptr) {
|
||||
if (mf_ptr) {
|
||||
return _consumer(std::move(_reader));
|
||||
}
|
||||
return make_ready_future<>();
|
||||
});
|
||||
}
|
||||
|
||||
multishard_writer::multishard_writer(
|
||||
schema_ptr s,
|
||||
dht::i_partitioner& partitioner,
|
||||
flat_mutation_reader producer,
|
||||
std::function<future<> (flat_mutation_reader)> consumer)
|
||||
: _s(std::move(s))
|
||||
, _partitioner(partitioner)
|
||||
, _producer(std::move(producer))
|
||||
, _consumer(std::move(consumer)) {
|
||||
_shard_writers.resize(_partitioner.shard_count());
|
||||
_queues.reserve(_partitioner.shard_count());
|
||||
for (unsigned shard = 0; shard < _partitioner.shard_count(); shard++) {
|
||||
_queues.push_back(seastar::queue<mutation_fragment_opt>{2});
|
||||
}
|
||||
}
|
||||
|
||||
future<> multishard_writer::make_shard_writer(unsigned shard) {
|
||||
auto this_shard_reader = make_foreign(std::make_unique<flat_mutation_reader>(make_flat_mutation_reader<queue_reader>(_s, _queues[shard])));
|
||||
return smp::submit_to(shard, [gs = global_schema_ptr(_s),
|
||||
consumer = _consumer,
|
||||
reader = std::move(this_shard_reader)] () mutable {
|
||||
auto this_shard_reader = make_foreign_reader(gs.get(), std::move(reader));
|
||||
return make_foreign(std::make_unique<shard_writer>(gs.get(), std::move(this_shard_reader), consumer));
|
||||
}).then([this, shard] (foreign_ptr<std::unique_ptr<shard_writer>> writer) {
|
||||
_shard_writers[shard] = std::move(writer);
|
||||
_pending_consumers.push_back(consume(shard));
|
||||
});
|
||||
}
|
||||
|
||||
future<stop_iteration> multishard_writer::handle_mutation_fragment(mutation_fragment mf) {
|
||||
auto f = make_ready_future<>();
|
||||
if (mf.is_partition_start()) {
|
||||
_consumed_partitions++;
|
||||
if (unsigned shard = shard_for_mf(mf); shard != _current_shard) {
|
||||
_current_shard = shard;
|
||||
if (!bool(_shard_writers[shard])) {
|
||||
f = make_shard_writer(shard);
|
||||
}
|
||||
}
|
||||
}
|
||||
return f.then([this, mf = std::move(mf)] () mutable {
|
||||
assert(_current_shard != -1u);
|
||||
return _queues[_current_shard].push_eventually(mutation_fragment_opt(std::move(mf)));
|
||||
}).then([] {
|
||||
return stop_iteration::no;
|
||||
});
|
||||
}
|
||||
|
||||
future<stop_iteration> multishard_writer::handle_end_of_stream() {
|
||||
return parallel_for_each(boost::irange(0u, _partitioner.shard_count()), [this] (unsigned shard) {
|
||||
if (bool(_shard_writers[shard])) {
|
||||
return _queues[shard].push_eventually(mutation_fragment_opt());
|
||||
} else {
|
||||
return make_ready_future<>();
|
||||
}
|
||||
}).then([] {
|
||||
return stop_iteration::yes;
|
||||
});
|
||||
}
|
||||
|
||||
future<> multishard_writer::consume(unsigned shard) {
|
||||
return smp::submit_to(shard, [writer = _shard_writers[shard].get()] () mutable {
|
||||
return writer->consume();
|
||||
}).handle_exception([this] (std::exception_ptr ep) {
|
||||
for (auto& q : _queues) {
|
||||
q.abort(ep);
|
||||
}
|
||||
return make_exception_future<>(std::move(ep));
|
||||
});
|
||||
}
|
||||
|
||||
future<> multishard_writer::wait_pending_consumers() {
|
||||
return seastar::when_all_succeed(_pending_consumers.begin(), _pending_consumers.end());
|
||||
}
|
||||
|
||||
future<> multishard_writer::distribute_mutation_fragments() {
|
||||
return repeat([this] () mutable {
|
||||
return _producer().then([this] (mutation_fragment_opt mf_opt) mutable {
|
||||
if (mf_opt) {
|
||||
return handle_mutation_fragment(std::move(*mf_opt));
|
||||
} else {
|
||||
return handle_end_of_stream();
|
||||
}
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
future<uint64_t> multishard_writer::operator()() {
|
||||
return distribute_mutation_fragments().finally([this] {
|
||||
return wait_pending_consumers();
|
||||
}).then([this] {
|
||||
return _consumed_partitions;
|
||||
});
|
||||
}
|
||||
|
||||
future<uint64_t> distribute_reader_and_consume_on_shards(schema_ptr s,
|
||||
dht::i_partitioner& partitioner,
|
||||
flat_mutation_reader producer,
|
||||
std::function<future<> (flat_mutation_reader)> consumer) {
|
||||
return do_with(multishard_writer(std::move(s), partitioner, std::move(producer), std::move(consumer)), [] (multishard_writer& writer) {
|
||||
return writer();
|
||||
});
|
||||
}
|
||||
35
multishard_writer.hh
Normal file
@@ -0,0 +1,35 @@
|
||||
/*
|
||||
* Copyright (C) 2018 ScyllaDB
|
||||
*/
|
||||
|
||||
/*
|
||||
* This file is part of Scylla.
|
||||
*
|
||||
* Scylla is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Affero General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* Scylla is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "schema.hh"
|
||||
#include "flat_mutation_reader.hh"
|
||||
#include "dht/i_partitioner.hh"
|
||||
|
||||
// Helper to use multishard_writer to distribute mutation_fragments from the
|
||||
// producer to the correct shard and consume with the consumer.
|
||||
// It returns the number of partitions consumed.
|
||||
future<uint64_t> distribute_reader_and_consume_on_shards(schema_ptr s,
|
||||
dht::i_partitioner& partitioner,
|
||||
flat_mutation_reader producer,
|
||||
std::function<future<> (flat_mutation_reader)> consumer);
|
||||
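A minimal sketch of how a caller might use this entry point; the consumer shown here just drains the per-shard reader and drops the fragments, standing in for whatever per-shard sink (for example an sstable writer) a real caller would plug in. It is an illustration under those assumptions, not code from this series.

```c++
// Sketch only. The consumer runs on the destination shard and must fully
// drain the flat_mutation_reader it is given; here it simply discards the
// fragments. It mirrors how multishard_writer calls its producer: reader()
// yields a mutation_fragment_opt which is disengaged at end of stream.
future<uint64_t> distribute_example(schema_ptr s, dht::i_partitioner& partitioner,
                                    flat_mutation_reader producer) {
    return distribute_reader_and_consume_on_shards(std::move(s), partitioner, std::move(producer),
        [] (flat_mutation_reader reader) {
            return do_with(std::move(reader), [] (flat_mutation_reader& reader) {
                return repeat([&reader] {
                    return reader().then([] (mutation_fragment_opt mf_opt) {
                        // A real consumer would hand *mf_opt to its writer here.
                        return mf_opt ? stop_iteration::no : stop_iteration::yes;
                    });
                });
            });
        });
}
```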
|
||||
@@ -26,6 +26,76 @@
|
||||
|
||||
#include "utils/logalloc.hh"
|
||||
|
||||
class mutation_cleaner_impl final {
|
||||
using snapshot_list = boost::intrusive::slist<partition_snapshot,
|
||||
boost::intrusive::member_hook<partition_snapshot, boost::intrusive::slist_member_hook<>, &partition_snapshot::_cleaner_hook>>;
|
||||
struct worker {
|
||||
condition_variable cv;
|
||||
snapshot_list snapshots;
|
||||
logalloc::allocating_section alloc_section;
|
||||
bool done = false; // true means the worker was abandoned and cannot access the mutation_cleaner_impl instance.
|
||||
};
|
||||
private:
|
||||
logalloc::region& _region;
|
||||
cache_tracker* _tracker;
|
||||
partition_version_list _versions;
|
||||
lw_shared_ptr<worker> _worker_state;
|
||||
seastar::scheduling_group _scheduling_group;
|
||||
private:
|
||||
stop_iteration merge_some(partition_snapshot& snp) noexcept;
|
||||
stop_iteration merge_some() noexcept;
|
||||
void start_worker();
|
||||
public:
|
||||
mutation_cleaner_impl(logalloc::region& r, cache_tracker* t, seastar::scheduling_group sg = seastar::current_scheduling_group())
|
||||
: _region(r)
|
||||
, _tracker(t)
|
||||
, _worker_state(make_lw_shared<worker>())
|
||||
, _scheduling_group(sg)
|
||||
{
|
||||
start_worker();
|
||||
}
|
||||
~mutation_cleaner_impl();
|
||||
stop_iteration clear_gently() noexcept;
|
||||
memory::reclaiming_result clear_some() noexcept;
|
||||
void clear() noexcept;
|
||||
void destroy_later(partition_version& v) noexcept;
|
||||
void destroy_gently(partition_version& v) noexcept;
|
||||
void merge(mutation_cleaner_impl& other) noexcept;
|
||||
bool empty() const noexcept { return _versions.empty(); }
|
||||
future<> drain();
|
||||
void merge_and_destroy(partition_snapshot&) noexcept;
|
||||
void set_scheduling_group(seastar::scheduling_group sg) {
|
||||
_scheduling_group = sg;
|
||||
_worker_state->cv.broadcast();
|
||||
}
|
||||
};
|
||||
|
||||
inline
|
||||
void mutation_cleaner_impl::destroy_later(partition_version& v) noexcept {
|
||||
_versions.push_back(v);
|
||||
}
|
||||
|
||||
inline
|
||||
void mutation_cleaner_impl::destroy_gently(partition_version& v) noexcept {
|
||||
if (v.clear_gently(_tracker) == stop_iteration::no) {
|
||||
destroy_later(v);
|
||||
} else {
|
||||
current_allocator().destroy(&v);
|
||||
}
|
||||
}
|
||||
|
||||
inline
|
||||
void mutation_cleaner_impl::merge_and_destroy(partition_snapshot& ps) noexcept {
|
||||
if (ps.slide_to_oldest() == stop_iteration::yes || merge_some(ps) == stop_iteration::yes) {
|
||||
lw_shared_ptr<partition_snapshot>::dispose(&ps);
|
||||
} else {
|
||||
// The snapshot must not be reachable by partition_entry::read() after this,
|
||||
// which is ensured by slide_to_oldest() == stop_iteration::no.
|
||||
_worker_state->snapshots.push_front(ps);
|
||||
_worker_state->cv.signal();
|
||||
}
|
||||
}
|
||||
|
||||
// Container for garbage partition_version objects, used for freeing them incrementally.
|
||||
//
|
||||
// Mutation cleaner extends the lifetime of mutation_partition without doing
|
||||
@@ -36,57 +106,71 @@
|
||||
// mutation_cleaner should not be thread local objects (or members of thread
|
||||
// local objects).
|
||||
class mutation_cleaner final {
|
||||
logalloc::region& _region;
|
||||
cache_tracker* _tracker;
|
||||
partition_version_list _versions;
|
||||
lw_shared_ptr<mutation_cleaner_impl> _impl;
|
||||
public:
|
||||
mutation_cleaner(logalloc::region& r, cache_tracker* t) : _region(r), _tracker(t) {}
|
||||
~mutation_cleaner();
|
||||
mutation_cleaner(logalloc::region& r, cache_tracker* t, seastar::scheduling_group sg = seastar::current_scheduling_group())
|
||||
: _impl(make_lw_shared<mutation_cleaner_impl>(r, t, sg)) {
|
||||
}
|
||||
|
||||
void set_scheduling_group(seastar::scheduling_group sg) {
|
||||
_impl->set_scheduling_group(sg);
|
||||
}
|
||||
|
||||
// Frees some of the data. Returns stop_iteration::yes iff all was freed.
|
||||
// Must be invoked under owning allocator.
|
||||
stop_iteration clear_gently() noexcept;
|
||||
stop_iteration clear_gently() noexcept {
|
||||
return _impl->clear_gently();
|
||||
}
|
||||
|
||||
// Must be invoked under owning allocator.
|
||||
memory::reclaiming_result clear_some() noexcept;
|
||||
memory::reclaiming_result clear_some() noexcept {
|
||||
return _impl->clear_some();
|
||||
}
|
||||
|
||||
// Must be invoked under owning allocator.
|
||||
void clear() noexcept;
|
||||
void clear() noexcept {
|
||||
_impl->clear();
|
||||
}
|
||||
|
||||
// Enqueues v for destruction.
|
||||
// The object must not be part of any list, and must not be accessed externally any more.
|
||||
// In particular, it must not be attached, even indirectly, to any snapshot or partition_entry,
|
||||
// and must not be evicted from.
|
||||
// Must be invoked under owning allocator.
|
||||
void destroy_later(partition_version& v) noexcept;
|
||||
void destroy_later(partition_version& v) noexcept {
|
||||
return _impl->destroy_later(v);
|
||||
}
|
||||
|
||||
// Destroys v now or later.
|
||||
// Same requirements as destroy_later().
|
||||
// Must be invoked under owning allocator.
|
||||
void destroy_gently(partition_version& v) noexcept;
|
||||
void destroy_gently(partition_version& v) noexcept {
|
||||
return _impl->destroy_gently(v);
|
||||
}
|
||||
|
||||
// Transfers objects from other to this.
|
||||
// This and other must belong to the same logalloc::region, and the same cache_tracker.
|
||||
// After the call bool(other) is false.
|
||||
void merge(mutation_cleaner& other) noexcept;
|
||||
// After the call other will refer to this cleaner.
|
||||
void merge(mutation_cleaner& other) noexcept {
|
||||
_impl->merge(*other._impl);
|
||||
other._impl = _impl;
|
||||
}
|
||||
|
||||
// Returns true iff contains no unfreed objects
|
||||
bool empty() const noexcept { return _versions.empty(); }
|
||||
bool empty() const noexcept {
|
||||
return _impl->empty();
|
||||
}
|
||||
|
||||
// Forces cleaning and returns a future which resolves when there is nothing to clean.
|
||||
future<> drain();
|
||||
};
|
||||
|
||||
inline
|
||||
void mutation_cleaner::destroy_later(partition_version& v) noexcept {
|
||||
_versions.push_back(v);
|
||||
}
|
||||
|
||||
inline
|
||||
void mutation_cleaner::destroy_gently(partition_version& v) noexcept {
|
||||
if (v.clear_gently(_tracker) == stop_iteration::no) {
|
||||
destroy_later(v);
|
||||
} else {
|
||||
current_allocator().destroy(&v);
|
||||
future<> drain() {
|
||||
return _impl->drain();
|
||||
}
|
||||
}
|
||||
|
||||
// Will merge given snapshot using partition_snapshot::merge_partition_versions() and then destroys it
|
||||
// using destroy_from_this(), possibly deferring in between.
|
||||
// This instance becomes the sole owner of the partition_snapshot object, the caller should not destroy it
|
||||
// nor access it after calling this.
|
||||
void merge_and_destroy(partition_snapshot& ps) {
|
||||
return _impl->merge_and_destroy(ps);
|
||||
}
|
||||
};
|
||||
|
||||
@@ -280,12 +280,15 @@ mutation_partition::apply(const schema& s, const mutation_fragment& mf) {
|
||||
mf.visit(applier);
|
||||
}
|
||||
|
||||
void mutation_partition::apply_monotonically(const schema& s, mutation_partition&& p, cache_tracker* tracker) {
|
||||
stop_iteration mutation_partition::apply_monotonically(const schema& s, mutation_partition&& p, cache_tracker* tracker, is_preemptible preemptible) {
|
||||
_tombstone.apply(p._tombstone);
|
||||
_row_tombstones.apply_monotonically(s, std::move(p._row_tombstones));
|
||||
_static_row.apply_monotonically(s, column_kind::static_column, std::move(p._static_row));
|
||||
_static_row_continuous |= p._static_row_continuous;
|
||||
|
||||
if (_row_tombstones.apply_monotonically(s, std::move(p._row_tombstones), preemptible) == stop_iteration::no) {
|
||||
return stop_iteration::no;
|
||||
}
|
||||
|
||||
rows_entry::compare less(s);
|
||||
auto del = current_deleter<rows_entry>();
|
||||
auto p_i = p._rows.begin();
|
||||
@@ -317,22 +320,34 @@ void mutation_partition::apply_monotonically(const schema& s, mutation_partition
|
||||
// Newer evictable versions store complete rows
|
||||
i->_row = std::move(src_e._row);
|
||||
} else {
|
||||
memory::on_alloc_point();
|
||||
i->_row.apply_monotonically(s, std::move(src_e._row));
|
||||
}
|
||||
i->set_continuous(continuous);
|
||||
i->set_dummy(dummy);
|
||||
p_i = p._rows.erase_and_dispose(p_i, del);
|
||||
}
|
||||
if (preemptible && need_preempt() && p_i != p._rows.end()) {
|
||||
// We cannot leave p with the clustering range up to p_i->position()
|
||||
// marked as continuous because some of its sub-ranges may have originally been discontinuous.
|
||||
// This would result in the sum of this and p having broader continuity after preemption,
|
||||
// also possibly violating the invariant of non-overlapping continuity between MVCC versions,
|
||||
// if that's what we're merging here.
|
||||
// It's always safe to mark the range as discontinuous.
|
||||
p_i->set_continuous(false);
|
||||
return stop_iteration::no;
|
||||
}
|
||||
}
|
||||
return stop_iteration::yes;
|
||||
}
|
||||
|
||||
void mutation_partition::apply_monotonically(const schema& s, mutation_partition&& p, const schema& p_schema) {
|
||||
stop_iteration mutation_partition::apply_monotonically(const schema& s, mutation_partition&& p, const schema& p_schema, is_preemptible preemptible) {
|
||||
if (s.version() == p_schema.version()) {
|
||||
apply_monotonically(s, std::move(p), no_cache_tracker);
|
||||
return apply_monotonically(s, std::move(p), no_cache_tracker, preemptible);
|
||||
} else {
|
||||
mutation_partition p2(s, p);
|
||||
p2.upgrade(p_schema, s);
|
||||
apply_monotonically(s, std::move(p2), no_cache_tracker);
|
||||
return apply_monotonically(s, std::move(p2), no_cache_tracker, is_preemptible::no); // FIXME: make preemptible
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2305,17 +2320,20 @@ future<mutation_opt> counter_write_query(schema_ptr s, const mutation_source& so
|
||||
return f.finally([r_a_r = std::move(r_a_r)] { });
|
||||
}
|
||||
|
||||
mutation_cleaner::~mutation_cleaner() {
|
||||
mutation_cleaner_impl::~mutation_cleaner_impl() {
|
||||
_worker_state->done = true;
|
||||
_worker_state->cv.signal();
|
||||
_worker_state->snapshots.clear_and_dispose(typename lw_shared_ptr<partition_snapshot>::disposer());
|
||||
with_allocator(_region.allocator(), [this] {
|
||||
clear();
|
||||
});
|
||||
}
|
||||
|
||||
void mutation_cleaner::clear() noexcept {
|
||||
void mutation_cleaner_impl::clear() noexcept {
|
||||
while (clear_gently() == stop_iteration::no) ;
|
||||
}
|
||||
|
||||
stop_iteration mutation_cleaner::clear_gently() noexcept {
|
||||
stop_iteration mutation_cleaner_impl::clear_gently() noexcept {
|
||||
while (clear_some() == memory::reclaiming_result::reclaimed_something) {
|
||||
if (need_preempt()) {
|
||||
return stop_iteration::no;
|
||||
@@ -2324,7 +2342,7 @@ stop_iteration mutation_cleaner::clear_gently() noexcept {
|
||||
return stop_iteration::yes;
|
||||
}
|
||||
|
||||
memory::reclaiming_result mutation_cleaner::clear_some() noexcept {
|
||||
memory::reclaiming_result mutation_cleaner_impl::clear_some() noexcept {
|
||||
if (_versions.empty()) {
|
||||
return memory::reclaiming_result::reclaimed_nothing;
|
||||
}
|
||||
@@ -2337,14 +2355,81 @@ memory::reclaiming_result mutation_cleaner::clear_some() noexcept {
|
||||
return memory::reclaiming_result::reclaimed_something;
|
||||
}
|
||||
|
||||
void mutation_cleaner::merge(mutation_cleaner& r) noexcept {
|
||||
void mutation_cleaner_impl::merge(mutation_cleaner_impl& r) noexcept {
|
||||
_versions.splice(r._versions);
|
||||
_worker_state->snapshots.splice(_worker_state->snapshots.end(), r._worker_state->snapshots);
|
||||
if (!_worker_state->snapshots.empty()) {
|
||||
_worker_state->cv.signal();
|
||||
}
|
||||
}
|
||||
|
||||
future<> mutation_cleaner::drain() {
|
||||
return repeat([this] {
|
||||
return with_allocator(_region.allocator(), [this] {
|
||||
return clear_gently();
|
||||
void mutation_cleaner_impl::start_worker() {
|
||||
auto f = repeat([w = _worker_state, this] () mutable noexcept {
|
||||
if (w->done) {
|
||||
return make_ready_future<stop_iteration>(stop_iteration::yes);
|
||||
}
|
||||
return with_scheduling_group(_scheduling_group, [w, this] {
|
||||
return w->cv.wait([w] {
|
||||
return w->done || !w->snapshots.empty();
|
||||
}).then([this, w] () noexcept {
|
||||
if (w->done) {
|
||||
return stop_iteration::yes;
|
||||
}
|
||||
merge_some();
|
||||
return stop_iteration::no;
|
||||
});
|
||||
});
|
||||
});
|
||||
if (f.failed()) {
|
||||
f.get();
|
||||
}
|
||||
}
|
||||
|
||||
stop_iteration mutation_cleaner_impl::merge_some(partition_snapshot& snp) noexcept {
|
||||
auto&& region = snp.region();
|
||||
return with_allocator(region.allocator(), [&] {
|
||||
return with_linearized_managed_bytes([&] {
|
||||
// Allocating sections require the region to be reclaimable
|
||||
// which means that they cannot be nested.
|
||||
// It is, however, possible that if the snapshot is taken
|
||||
// inside an allocating section and then an exception is thrown
|
||||
// this function will be called to clean up even though we
|
||||
// still will be in the context of the allocating section.
|
||||
if (!region.reclaiming_enabled()) {
|
||||
return stop_iteration::no;
|
||||
}
|
||||
try {
|
||||
return _worker_state->alloc_section(region, [&] {
|
||||
return snp.merge_partition_versions();
|
||||
});
|
||||
} catch (...) {
|
||||
// Merging failed, give up as there is no guarantee of forward progress.
|
||||
return stop_iteration::yes;
|
||||
}
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
stop_iteration mutation_cleaner_impl::merge_some() noexcept {
|
||||
if (_worker_state->snapshots.empty()) {
|
||||
return stop_iteration::yes;
|
||||
}
|
||||
partition_snapshot& snp = _worker_state->snapshots.front();
|
||||
if (merge_some(snp) == stop_iteration::yes) {
|
||||
_worker_state->snapshots.pop_front();
|
||||
lw_shared_ptr<partition_snapshot>::dispose(&snp);
|
||||
}
|
||||
return stop_iteration::no;
|
||||
}
|
||||
|
||||
future<> mutation_cleaner_impl::drain() {
|
||||
return repeat([this] {
|
||||
return merge_some();
|
||||
}).then([this] {
|
||||
return repeat([this] {
|
||||
return with_allocator(_region.allocator(), [this] {
|
||||
return clear_gently();
|
||||
});
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
@@ -46,6 +46,7 @@
|
||||
#include "clustering_key_filter.hh"
|
||||
#include "intrusive_set_external_comparator.hh"
|
||||
#include "utils/with_relational_operators.hh"
|
||||
#include "utils/preempt.hh"
|
||||
|
||||
class mutation_fragment;
|
||||
class clustering_row;
|
||||
@@ -987,8 +988,19 @@ public:
|
||||
// This instance and p are governed by the same schema.
|
||||
//
|
||||
// Must be provided with a pointer to the cache_tracker, which owns both this and p.
|
||||
void apply_monotonically(const schema& s, mutation_partition&& p, cache_tracker*);
|
||||
void apply_monotonically(const schema& s, mutation_partition&& p, const schema& p_schema);
|
||||
//
|
||||
// Returns stop_iteration::no if the operation was preempted before it finished, and stop_iteration::yes otherwise.
|
||||
// On preemption the sum of this and p stays the same (represents the same set of writes), and the state of this
|
||||
// object contains at least all the writes it contained before the call (monotonicity). It may contain partial writes.
|
||||
// Also, some progress is always guaranteed (liveness).
|
||||
//
|
||||
// The operation can be driven to completion like this:
|
||||
//
|
||||
// while (apply_monotonically(..., is_preemptible::yes) == stop_iteration::no) { }
|
||||
//
|
||||
// If is_preemptible::no is passed as argument then stop_iteration::no is never returned.
|
||||
stop_iteration apply_monotonically(const schema& s, mutation_partition&& p, cache_tracker*, is_preemptible = is_preemptible::no);
|
||||
stop_iteration apply_monotonically(const schema& s, mutation_partition&& p, const schema& p_schema, is_preemptible = is_preemptible::no);
|
||||
|
||||
// Weak exception guarantees.
|
||||
// Assumes this and p are not owned by a cache_tracker.
|
||||
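As a rough illustration of the preemptible contract described above (a sketch, not code from this series; the allocator and linearization-context handling that real callers need is omitted), driving a merge to completion while yielding to the reactor between attempts might look like this:

```c++
// Sketch: repeatedly apply `src` into `dst` in preemptible steps. On each
// preempted attempt the remaining writes stay in `src` (monotonicity), so the
// loop can simply retry after letting other tasks run.
future<> apply_all(const schema& s, mutation_partition& dst, mutation_partition src) {
    return do_with(std::move(src), [&s, &dst] (mutation_partition& src) {
        return repeat([&s, &dst, &src] {
            if (dst.apply_monotonically(s, std::move(src), no_cache_tracker,
                                        is_preemptible::yes) == stop_iteration::yes) {
                return make_ready_future<stop_iteration>(stop_iteration::yes);
            }
            // Not done yet: yield so other tasks can run, then retry.
            return later().then([] { return stop_iteration::no; });
        });
    });
}
```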
|
||||
@@ -29,6 +29,8 @@
|
||||
#include "mutation_partition.hh"
|
||||
#include "counters.hh"
|
||||
#include "frozen_mutation.hh"
|
||||
#include "partition_builder.hh"
|
||||
#include "converting_mutation_partition_applier.hh"
|
||||
|
||||
#include "utils/UUID.hh"
|
||||
#include "serializer.hh"
|
||||
@@ -60,10 +62,10 @@ atomic_cell read_atomic_cell(const abstract_type& type, atomic_cell_variant cv,
|
||||
explicit atomic_cell_visitor(const abstract_type& t, atomic_cell::collection_member cm)
|
||||
: _type(t), _collection_member(cm) { }
|
||||
atomic_cell operator()(ser::live_cell_view& lcv) const {
|
||||
return atomic_cell::make_live(_type, lcv.created_at(), lcv.value(), _collection_member);
|
||||
return atomic_cell::make_live(_type, lcv.created_at(), lcv.value().view(), _collection_member);
|
||||
}
|
||||
atomic_cell operator()(ser::expiring_cell_view& ecv) const {
|
||||
return atomic_cell::make_live(_type, ecv.c().created_at(), ecv.c().value(), ecv.expiry(), ecv.ttl(), _collection_member);
|
||||
return atomic_cell::make_live(_type, ecv.c().created_at(), ecv.c().value().view(), ecv.expiry(), ecv.ttl(), _collection_member);
|
||||
}
|
||||
atomic_cell operator()(ser::dead_cell_view& dcv) const {
|
||||
return atomic_cell::make_dead(dcv.tomb().timestamp(), dcv.tomb().deletion_time());
|
||||
@@ -129,20 +131,13 @@ void read_and_visit_row(ser::row_view rv, const column_mapping& cm, column_kind
|
||||
: _visitor(v), _id(id), _col(col) { }
|
||||
|
||||
void operator()(atomic_cell_variant& acv) const {
|
||||
if (!_col.type()->is_atomic()) {
|
||||
if (!_col.is_atomic()) {
|
||||
throw std::runtime_error("A collection expected, got an atomic cell");
|
||||
}
|
||||
// FIXME: Pass view to cell to avoid copy
|
||||
auto&& outer = current_allocator();
|
||||
with_allocator(standard_allocator(), [&] {
|
||||
auto cell = read_atomic_cell(*_col.type(), acv);
|
||||
with_allocator(outer, [&] {
|
||||
_visitor.accept_atomic_cell(_id, cell);
|
||||
});
|
||||
});
|
||||
_visitor.accept_atomic_cell(_id, read_atomic_cell(*_col.type(), acv));
|
||||
}
|
||||
void operator()(ser::collection_cell_view& ccv) const {
|
||||
if (_col.type()->is_atomic()) {
|
||||
if (_col.is_atomic()) {
|
||||
throw std::runtime_error("An atomic cell expected, got a collection");
|
||||
}
|
||||
// FIXME: Pass view to cell to avoid copy
|
||||
@@ -187,23 +182,19 @@ row_marker read_row_marker(boost::variant<ser::live_marker_view, ser::expiring_m
|
||||
|
||||
}
|
||||
|
||||
void
|
||||
mutation_partition_view::accept(const schema& s, mutation_partition_visitor& visitor) const {
|
||||
accept(s.get_column_mapping(), visitor);
|
||||
}
|
||||
|
||||
void
|
||||
mutation_partition_view::accept(const column_mapping& cm, mutation_partition_visitor& visitor) const {
|
||||
template<typename Visitor>
|
||||
GCC6_CONCEPT(requires MutationViewVisitor<Visitor>)
|
||||
void mutation_partition_view::do_accept(const column_mapping& cm, Visitor& visitor) const {
|
||||
auto in = _in;
|
||||
auto mpv = ser::deserialize(in, boost::type<ser::mutation_partition_view>());
|
||||
|
||||
visitor.accept_partition_tombstone(mpv.tomb());
|
||||
|
||||
struct static_row_cell_visitor {
|
||||
mutation_partition_visitor& _visitor;
|
||||
Visitor& _visitor;
|
||||
|
||||
void accept_atomic_cell(column_id id, const atomic_cell& ac) const {
|
||||
_visitor.accept_static_cell(id, ac);
|
||||
void accept_atomic_cell(column_id id, atomic_cell ac) const {
|
||||
_visitor.accept_static_cell(id, std::move(ac));
|
||||
}
|
||||
void accept_collection(column_id id, const collection_mutation& cm) const {
|
||||
_visitor.accept_static_cell(id, cm);
|
||||
@@ -217,13 +208,13 @@ mutation_partition_view::accept(const column_mapping& cm, mutation_partition_vis
|
||||
|
||||
for (auto&& cr : mpv.rows()) {
|
||||
auto t = row_tombstone(cr.deleted_at(), shadowable_tombstone(cr.shadowable_deleted_at()));
|
||||
visitor.accept_row(position_in_partition_view::for_key(cr.key()), t, read_row_marker(cr.marker()));
|
||||
visitor.accept_row(position_in_partition_view::for_key(cr.key()), t, read_row_marker(cr.marker()), is_dummy::no, is_continuous::yes);
|
||||
|
||||
struct cell_visitor {
|
||||
mutation_partition_visitor& _visitor;
|
||||
Visitor& _visitor;
|
||||
|
||||
void accept_atomic_cell(column_id id, const atomic_cell& ac) const {
|
||||
_visitor.accept_row_cell(id, ac);
|
||||
void accept_atomic_cell(column_id id, atomic_cell ac) const {
|
||||
_visitor.accept_row_cell(id, std::move(ac));
|
||||
}
|
||||
void accept_collection(column_id id, const collection_mutation& cm) const {
|
||||
_visitor.accept_row_cell(id, cm);
|
||||
@@ -233,6 +224,38 @@ mutation_partition_view::accept(const column_mapping& cm, mutation_partition_vis
|
||||
}
|
||||
}
|
||||
|
||||
void mutation_partition_view::accept(const schema& s, partition_builder& visitor) const
|
||||
{
|
||||
do_accept(s.get_column_mapping(), visitor);
|
||||
}
|
||||
|
||||
void mutation_partition_view::accept(const column_mapping& cm, converting_mutation_partition_applier& visitor) const
|
||||
{
|
||||
do_accept(cm, visitor);
|
||||
}
|
||||
|
||||
std::optional<clustering_key> mutation_partition_view::first_row_key() const
|
||||
{
|
||||
auto in = _in;
|
||||
auto mpv = ser::deserialize(in, boost::type<ser::mutation_partition_view>());
|
||||
auto rows = mpv.rows();
|
||||
if (rows.empty()) {
|
||||
return { };
|
||||
}
|
||||
return rows.front().key();
|
||||
}
|
||||
|
||||
std::optional<clustering_key> mutation_partition_view::last_row_key() const
|
||||
{
|
||||
auto in = _in;
|
||||
auto mpv = ser::deserialize(in, boost::type<ser::mutation_partition_view>());
|
||||
auto rows = mpv.rows();
|
||||
if (rows.empty()) {
|
||||
return { };
|
||||
}
|
||||
return rows.back().key();
|
||||
}
|
||||
|
||||
mutation_partition_view mutation_partition_view::from_view(ser::mutation_partition_view v)
|
||||
{
|
||||
return { v.v };
|
||||
@@ -250,9 +273,8 @@ mutation_fragment frozen_mutation_fragment::unfreeze(const schema& s)
|
||||
public:
|
||||
clustering_row_builder(const schema& s, clustering_key key, row_tombstone t, row_marker m)
|
||||
: _s(s), _mf(mutation_fragment::clustering_row_tag_t(), std::move(key), std::move(t), std::move(m), row()) { }
|
||||
void accept_atomic_cell(column_id id, const atomic_cell& ac) {
|
||||
auto& type = *_s.regular_column_at(id).type;
|
||||
_mf.as_mutable_clustering_row().cells().append_cell(id, atomic_cell_or_collection(atomic_cell(type, ac)));
|
||||
void accept_atomic_cell(column_id id, atomic_cell ac) {
|
||||
_mf.as_mutable_clustering_row().cells().append_cell(id, atomic_cell_or_collection(std::move(ac)));
|
||||
}
|
||||
void accept_collection(column_id id, const collection_mutation& cm) {
|
||||
auto& ctype = *static_pointer_cast<const collection_type_impl>(_s.regular_column_at(id).type);
|
||||
@@ -273,9 +295,8 @@ mutation_fragment frozen_mutation_fragment::unfreeze(const schema& s)
|
||||
mutation_fragment _mf;
|
||||
public:
|
||||
explicit static_row_builder(const schema& s) : _s(s), _mf(static_row()) { }
|
||||
void accept_atomic_cell(column_id id, const atomic_cell& ac) {
|
||||
auto& type = *_s.static_column_at(id).type;
|
||||
_mf.as_mutable_static_row().cells().append_cell(id, atomic_cell_or_collection(atomic_cell(type, ac)));
|
||||
void accept_atomic_cell(column_id id, atomic_cell ac) {
|
||||
_mf.as_mutable_static_row().cells().append_cell(id, atomic_cell_or_collection(std::move(ac)));
|
||||
}
|
||||
void accept_collection(column_id id, const collection_mutation& cm) {
|
||||
auto& ctype = *static_pointer_cast<const collection_type_impl>(_s.static_column_at(id).type);
|
||||
|
||||
@@ -29,6 +29,26 @@ namespace ser {
class mutation_partition_view;
}

class partition_builder;
class converting_mutation_partition_applier;

GCC6_CONCEPT(
template<typename T>
concept bool MutationViewVisitor = requires (T visitor, tombstone t, atomic_cell ac,
collection_mutation_view cmv, range_tombstone rt,
position_in_partition_view pipv, row_tombstone row_tomb,
row_marker rm) {
visitor.accept_partition_tombstone(t);
visitor.accept_static_cell(column_id(), std::move(ac));
visitor.accept_static_cell(column_id(), cmv);
visitor.accept_row_tombstone(rt);
visitor.accept_row(pipv, row_tomb, rm,
is_dummy::no, is_continuous::yes);
visitor.accept_row_cell(column_id(), std::move(ac));
visitor.accept_row_cell(column_id(), cmv);
};
)

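As a quick illustration (not part of the patch, and assuming the surrounding Scylla headers and types are in scope): any type that provides exactly the member functions listed in the requires-expression above models MutationViewVisitor. A minimal counting visitor could be sketched like this:

// Illustrative sketch only; counting_visitor does not exist in the tree.
struct counting_visitor {
    size_t rows = 0;
    size_t cells = 0;
    void accept_partition_tombstone(tombstone) { }
    void accept_static_cell(column_id, atomic_cell) { ++cells; }
    void accept_static_cell(column_id, collection_mutation_view) { ++cells; }
    void accept_row_tombstone(const range_tombstone&) { }
    void accept_row(position_in_partition_view, const row_tombstone&, const row_marker&,
                    is_dummy, is_continuous) { ++rows; }
    void accept_row_cell(column_id, atomic_cell) { ++cells; }
    void accept_row_cell(column_id, collection_mutation_view) { ++cells; }
};
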
// View on serialized mutation partition. See mutation_partition_serializer.
|
||||
class mutation_partition_view {
|
||||
utils::input_stream _in;
|
||||
@@ -36,11 +56,18 @@ private:
|
||||
mutation_partition_view(utils::input_stream v)
|
||||
: _in(v)
|
||||
{ }
|
||||
|
||||
template<typename Visitor>
|
||||
GCC6_CONCEPT(requires MutationViewVisitor<Visitor>)
|
||||
void do_accept(const column_mapping&, Visitor& visitor) const;
|
||||
public:
|
||||
static mutation_partition_view from_stream(utils::input_stream v) {
|
||||
return { v };
|
||||
}
|
||||
static mutation_partition_view from_view(ser::mutation_partition_view v);
|
||||
void accept(const schema& schema, mutation_partition_visitor& visitor) const;
|
||||
void accept(const column_mapping&, mutation_partition_visitor& visitor) const;
|
||||
void accept(const schema& schema, partition_builder& visitor) const;
|
||||
void accept(const column_mapping&, converting_mutation_partition_applier& visitor) const;
|
||||
|
||||
std::optional<clustering_key> first_row_key() const;
|
||||
std::optional<clustering_key> last_row_key() const;
|
||||
};
|
||||
|
||||
@@ -184,13 +184,11 @@ private:
|
||||
// end, a call to next_partition() or a call to
|
||||
// fast_forward_to(dht::partition_range).
|
||||
reader_and_last_fragment_kind _single_reader;
|
||||
dht::decorated_key_opt _key;
|
||||
const schema_ptr _schema;
|
||||
streamed_mutation::forwarding _fwd_sm;
|
||||
mutation_reader::forwarding _fwd_mr;
|
||||
private:
|
||||
const dht::token* current_position() const;
|
||||
void maybe_add_readers(const dht::token* const t);
|
||||
void maybe_add_readers(const std::optional<dht::ring_position_view>& pos);
|
||||
void add_readers(std::vector<flat_mutation_reader> new_readers);
|
||||
future<> prepare_next();
|
||||
// Collect all forwardable readers into _next, and remove them from
|
||||
@@ -236,7 +234,7 @@ class list_reader_selector : public reader_selector {
|
||||
|
||||
public:
|
||||
explicit list_reader_selector(schema_ptr s, std::vector<flat_mutation_reader> readers)
|
||||
: reader_selector(s, dht::ring_position::min())
|
||||
: reader_selector(s, dht::ring_position_view::min())
|
||||
, _readers(std::move(readers)) {
|
||||
}
|
||||
|
||||
@@ -246,8 +244,8 @@ public:
|
||||
list_reader_selector(list_reader_selector&&) = default;
|
||||
list_reader_selector& operator=(list_reader_selector&&) = default;
|
||||
|
||||
virtual std::vector<flat_mutation_reader> create_new_readers(const dht::token* const) override {
|
||||
_selector_position = dht::ring_position::max();
|
||||
virtual std::vector<flat_mutation_reader> create_new_readers(const std::optional<dht::ring_position_view>&) override {
|
||||
_selector_position = dht::ring_position_view::max();
|
||||
return std::exchange(_readers, {});
|
||||
}
|
||||
|
||||
@@ -256,12 +254,10 @@ public:
|
||||
}
|
||||
};
|
||||
|
||||
void mutation_reader_merger::maybe_add_readers(const dht::token* const t) {
|
||||
if (!_selector->has_new_readers(t)) {
|
||||
return;
|
||||
void mutation_reader_merger::maybe_add_readers(const std::optional<dht::ring_position_view>& pos) {
|
||||
if (_selector->has_new_readers(pos)) {
|
||||
add_readers(_selector->create_new_readers(pos));
|
||||
}
|
||||
|
||||
add_readers(_selector->create_new_readers(t));
|
||||
}
|
||||
|
||||
void mutation_reader_merger::add_readers(std::vector<flat_mutation_reader> new_readers) {
|
||||
@@ -272,14 +268,6 @@ void mutation_reader_merger::add_readers(std::vector<flat_mutation_reader> new_r
|
||||
}
|
||||
}
|
||||
|
||||
const dht::token* mutation_reader_merger::current_position() const {
|
||||
if (!_key) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
return &_key->token();
|
||||
}
|
||||
|
||||
struct mutation_reader_merger::reader_heap_compare {
|
||||
const schema& s;
|
||||
|
||||
@@ -338,12 +326,10 @@ future<> mutation_reader_merger::prepare_next() {
|
||||
// waiting for a fast-forward so there is nothing to do.
|
||||
if (_fragment_heap.empty() && _halted_readers.empty()) {
|
||||
if (_reader_heap.empty()) {
|
||||
_key = {};
|
||||
maybe_add_readers(std::nullopt);
|
||||
} else {
|
||||
_key = _reader_heap.front().fragment.as_partition_start().key();
|
||||
maybe_add_readers(_reader_heap.front().fragment.as_partition_start().key());
|
||||
}
|
||||
|
||||
maybe_add_readers(current_position());
|
||||
}
|
||||
});
|
||||
}
|
||||
@@ -371,7 +357,7 @@ mutation_reader_merger::mutation_reader_merger(schema_ptr schema,
|
||||
, _schema(std::move(schema))
|
||||
, _fwd_sm(fwd_sm)
|
||||
, _fwd_mr(fwd_mr) {
|
||||
maybe_add_readers(nullptr);
|
||||
maybe_add_readers(std::nullopt);
|
||||
}
|
||||
|
||||
future<mutation_reader_merger::mutation_fragment_batch> mutation_reader_merger::operator()() {
|
||||
|
||||
@@ -50,19 +50,19 @@ namespace mutation_reader {
class reader_selector {
protected:
schema_ptr _s;
dht::ring_position _selector_position;
dht::ring_position_view _selector_position;
public:
reader_selector(schema_ptr s, dht::ring_position rp) noexcept : _s(std::move(s)), _selector_position(std::move(rp)) {}
reader_selector(schema_ptr s, dht::ring_position_view rpv) noexcept : _s(std::move(s)), _selector_position(std::move(rpv)) {}

virtual ~reader_selector() = default;
// Call only if has_new_readers() returned true.
virtual std::vector<flat_mutation_reader> create_new_readers(const dht::token* const t) = 0;
virtual std::vector<flat_mutation_reader> create_new_readers(const std::optional<dht::ring_position_view>& pos) = 0;
virtual std::vector<flat_mutation_reader> fast_forward_to(const dht::partition_range& pr, db::timeout_clock::time_point timeout) = 0;

// Can be false-positive but never false-negative!
bool has_new_readers(const dht::token* const t) const noexcept {
bool has_new_readers(const std::optional<dht::ring_position_view>& pos) const noexcept {
dht::ring_position_comparator cmp(*_s);
return !_selector_position.is_max() && (!t || cmp(dht::ring_position_view(*t), _selector_position) >= 0);
return !_selector_position.is_max() && (!pos || cmp(*pos, _selector_position) >= 0);
}
};
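
To show the intent of swapping the raw const dht::token* for std::optional<dht::ring_position_view> (this example is not part of the patch): a disengaged optional now plays the role the null pointer used to, i.e. "no position known, so the check always passes". The same logic in miniature, with plain integers standing in for ring positions:

#include <optional>

// pos == std::nullopt means "position unknown"; like the old null token pointer,
// it always passes the check as long as the selector is not exhausted.
bool has_new_readers(std::optional<int> pos, int selector_position, int max_position) {
    return selector_position != max_position && (!pos || *pos >= selector_position);
}

// has_new_readers(std::nullopt, 5, 100) -> true
// has_new_readers(3, 5, 100)            -> false
// has_new_readers(7, 5, 100)            -> true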
|
||||
|
||||
@@ -25,7 +25,7 @@
|
||||
#include "mutation_partition_view.hh"
|
||||
|
||||
// Partition visitor which builds mutation_partition corresponding to the data its fed with.
|
||||
class partition_builder : public mutation_partition_visitor {
|
||||
class partition_builder final : public mutation_partition_visitor {
|
||||
private:
|
||||
const schema& _schema;
|
||||
mutation_partition& _partition;
|
||||
@@ -43,9 +43,13 @@ public:
|
||||
}
|
||||
|
||||
virtual void accept_static_cell(column_id id, atomic_cell_view cell) override {
|
||||
row& r = _partition.static_row();
|
||||
auto& cdef = _schema.static_column_at(id);
|
||||
r.append_cell(id, atomic_cell_or_collection(*cdef.type, cell));
|
||||
accept_static_cell(id, atomic_cell(*cdef.type, cell));
|
||||
}
|
||||
|
||||
void accept_static_cell(column_id id, atomic_cell&& cell) {
|
||||
row& r = _partition.static_row();
|
||||
r.append_cell(id, atomic_cell_or_collection(std::move(cell)));
|
||||
}
|
||||
|
||||
virtual void accept_static_cell(column_id id, collection_mutation_view collection) override {
|
||||
@@ -66,9 +70,13 @@ public:
|
||||
}
|
||||
|
||||
virtual void accept_row_cell(column_id id, atomic_cell_view cell) override {
|
||||
row& r = _current_row->cells();
|
||||
auto& cdef = _schema.regular_column_at(id);
|
||||
r.append_cell(id, atomic_cell_or_collection(*cdef.type, cell));
|
||||
accept_row_cell(id, atomic_cell(*cdef.type, cell));
|
||||
}
|
||||
|
||||
void accept_row_cell(column_id id, atomic_cell&& cell) {
|
||||
row& r = _current_row->cells();
|
||||
r.append_cell(id, atomic_cell_or_collection(std::move(cell)));
|
||||
}
|
||||
|
||||
virtual void accept_row_cell(column_id id, collection_mutation_view collection) override {
|
||||
|
||||
@@ -33,34 +33,6 @@ struct partition_snapshot_reader_dummy_accounter {
|
||||
};
|
||||
extern partition_snapshot_reader_dummy_accounter no_accounter;
|
||||
|
||||
inline void maybe_merge_versions(lw_shared_ptr<partition_snapshot>& snp,
|
||||
logalloc::region& lsa_region,
|
||||
logalloc::allocating_section& read_section) {
|
||||
if (!snp.owned()) {
|
||||
return;
|
||||
}
|
||||
// If no one else is using this particular snapshot try to merge partition
|
||||
// versions.
|
||||
with_allocator(lsa_region.allocator(), [&snp, &lsa_region, &read_section] {
|
||||
return with_linearized_managed_bytes([&snp, &lsa_region, &read_section] {
|
||||
try {
|
||||
// Allocating sections require the region to be reclaimable
|
||||
// which means that they cannot be nested.
|
||||
// It is, however, possible, that if the snapshot is taken
|
||||
// inside an allocating section and then an exception is thrown
|
||||
// this function will be called to clean up even though we
|
||||
// still will be in the context of the allocating section.
|
||||
if (lsa_region.reclaiming_enabled()) {
|
||||
read_section(lsa_region, [&snp] {
|
||||
snp->merge_partition_versions();
|
||||
});
|
||||
}
|
||||
} catch (...) { }
|
||||
snp = {};
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
template <typename MemoryAccounter = partition_snapshot_reader_dummy_accounter>
|
||||
class partition_snapshot_flat_reader : public flat_mutation_reader::impl, public MemoryAccounter {
|
||||
struct rows_position {
|
||||
@@ -87,7 +59,7 @@ class partition_snapshot_flat_reader : public flat_mutation_reader::impl, public
|
||||
position_in_partition::equal_compare _eq;
|
||||
heap_compare _heap_cmp;
|
||||
|
||||
lw_shared_ptr<partition_snapshot> _snapshot;
|
||||
partition_snapshot_ptr _snapshot;
|
||||
|
||||
logalloc::region& _region;
|
||||
logalloc::allocating_section& _read_section;
|
||||
@@ -99,7 +71,7 @@ class partition_snapshot_flat_reader : public flat_mutation_reader::impl, public
|
||||
private:
|
||||
template<typename Function>
|
||||
decltype(auto) in_alloc_section(Function&& fn) {
|
||||
return _read_section.with_reclaiming_disabled(_region, [&] {
|
||||
return _read_section.with_reclaiming_disabled(_region, [&] {
|
||||
return with_linearized_managed_bytes([&] {
|
||||
return fn();
|
||||
});
|
||||
@@ -155,7 +127,7 @@ class partition_snapshot_flat_reader : public flat_mutation_reader::impl, public
|
||||
return !_clustering_rows.empty();
|
||||
}
|
||||
public:
|
||||
explicit lsa_partition_reader(const schema& s, lw_shared_ptr<partition_snapshot> snp,
|
||||
explicit lsa_partition_reader(const schema& s, partition_snapshot_ptr snp,
|
||||
logalloc::region& region, logalloc::allocating_section& read_section,
|
||||
bool digest_requested)
|
||||
: _schema(s)
|
||||
@@ -168,10 +140,6 @@ class partition_snapshot_flat_reader : public flat_mutation_reader::impl, public
|
||||
, _digest_requested(digest_requested)
|
||||
{ }
|
||||
|
||||
~lsa_partition_reader() {
|
||||
maybe_merge_versions(_snapshot, _region, _read_section);
|
||||
}
|
||||
|
||||
template<typename Function>
|
||||
decltype(auto) with_reserve(Function&& fn) {
|
||||
return _read_section.with_reserve(std::forward<Function>(fn));
|
||||
@@ -187,7 +155,7 @@ class partition_snapshot_flat_reader : public flat_mutation_reader::impl, public
|
||||
return _snapshot->static_row(_digest_requested);
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
// Returns next clustered row in the range.
|
||||
// If the ck_range is the same as the one used previously last_row needs
|
||||
// to be engaged and equal the position of the row returned last time.
|
||||
@@ -298,7 +266,7 @@ private:
|
||||
}
|
||||
public:
|
||||
template <typename... Args>
|
||||
partition_snapshot_flat_reader(schema_ptr s, dht::decorated_key dk, lw_shared_ptr<partition_snapshot> snp,
|
||||
partition_snapshot_flat_reader(schema_ptr s, dht::decorated_key dk, partition_snapshot_ptr snp,
|
||||
query::clustering_key_filter_ranges crr, bool digest_requested,
|
||||
logalloc::region& region, logalloc::allocating_section& read_section,
|
||||
boost::any pointer_to_container, Args&&... args)
|
||||
@@ -344,7 +312,7 @@ inline flat_mutation_reader
|
||||
make_partition_snapshot_flat_reader(schema_ptr s,
|
||||
dht::decorated_key dk,
|
||||
query::clustering_key_filter_ranges crr,
|
||||
lw_shared_ptr<partition_snapshot> snp,
|
||||
partition_snapshot_ptr snp,
|
||||
bool digest_requested,
|
||||
logalloc::region& region,
|
||||
logalloc::allocating_section& read_section,
|
||||
@@ -365,7 +333,7 @@ inline flat_mutation_reader
|
||||
make_partition_snapshot_flat_reader(schema_ptr s,
|
||||
dht::decorated_key dk,
|
||||
query::clustering_key_filter_ranges crr,
|
||||
lw_shared_ptr<partition_snapshot> snp,
|
||||
partition_snapshot_ptr snp,
|
||||
bool digest_requested,
|
||||
logalloc::region& region,
|
||||
logalloc::allocating_section& read_section,
|
||||
|
||||
@@ -187,23 +187,49 @@ void merge_versions(const schema& s, mutation_partition& newer, mutation_partiti
|
||||
newer = std::move(older);
|
||||
}
|
||||
|
||||
void partition_snapshot::merge_partition_versions() {
|
||||
stop_iteration partition_snapshot::merge_partition_versions() {
|
||||
partition_version_ref& v = version();
|
||||
if (!v.is_unique_owner()) {
|
||||
auto first_used = &*v;
|
||||
_version = { };
|
||||
while (first_used->prev() && !first_used->is_referenced()) {
|
||||
first_used = first_used->prev();
|
||||
// Shift _version to the oldest unreferenced version and then keep merging left hand side into it.
|
||||
// This is good for performance because in case we were at the latest version
|
||||
// we leave it for incoming writes and they don't have to create a new one.
|
||||
partition_version* current = &*v;
|
||||
while (current->next() && !current->next()->is_referenced()) {
|
||||
current = current->next();
|
||||
_version = partition_version_ref(*current);
|
||||
}
|
||||
|
||||
auto current = first_used->next();
|
||||
while (current && !current->is_referenced()) {
|
||||
auto next = current->next();
|
||||
merge_versions(*_schema, first_used->partition(), std::move(current->partition()), _tracker);
|
||||
current_allocator().destroy(current);
|
||||
current = next;
|
||||
while (auto prev = current->prev()) {
|
||||
_region.allocator().invalidate_references();
|
||||
if (current->partition().apply_monotonically(*schema(), std::move(prev->partition()), _tracker, is_preemptible::yes) == stop_iteration::no) {
|
||||
return stop_iteration::no;
|
||||
}
|
||||
if (prev->is_referenced()) {
|
||||
_version.release();
|
||||
prev->back_reference() = partition_version_ref(*current, prev->back_reference().is_unique_owner());
|
||||
current_allocator().destroy(prev);
|
||||
return stop_iteration::yes;
|
||||
}
|
||||
current_allocator().destroy(prev);
|
||||
}
|
||||
}
|
||||
return stop_iteration::yes;
|
||||
}
|
||||
|
||||
stop_iteration partition_snapshot::slide_to_oldest() noexcept {
|
||||
partition_version_ref& v = version();
|
||||
if (v.is_unique_owner()) {
|
||||
return stop_iteration::yes;
|
||||
}
|
||||
if (_entry) {
|
||||
_entry->_snapshot = nullptr;
|
||||
_entry = nullptr;
|
||||
}
|
||||
partition_version* current = &*v;
|
||||
while (current->next() && !current->next()->is_referenced()) {
|
||||
current = current->next();
|
||||
_version = partition_version_ref(*current);
|
||||
}
|
||||
return current->prev() ? stop_iteration::no : stop_iteration::yes;
|
||||
}
|
||||
|
||||
unsigned partition_snapshot::version_count()
|
||||
@@ -457,19 +483,19 @@ coroutine partition_entry::apply_to_incomplete(const schema& s,
|
||||
pe.upgrade(pe_schema.shared_from_this(), s.shared_from_this(), pe_cleaner, no_cache_tracker);
|
||||
}
|
||||
|
||||
bool can_move = !pe._snapshot;
|
||||
// When preemptible, later memtable reads could start using the snapshot before
|
||||
// snapshot's writes are made visible in cache, which would cause them to miss those writes.
|
||||
// So we cannot allow erasing when preemptible.
|
||||
bool can_move = !preemptible && !pe._snapshot;
|
||||
|
||||
auto src_snp = pe.read(reg, pe_cleaner, s.shared_from_this(), no_cache_tracker);
|
||||
lw_shared_ptr<partition_snapshot> prev_snp;
|
||||
partition_snapshot_ptr prev_snp;
|
||||
if (preemptible) {
|
||||
// Reads must see prev_snp until whole update completes so that writes
|
||||
// are not partially visible.
|
||||
prev_snp = read(reg, tracker.cleaner(), s.shared_from_this(), &tracker, phase - 1);
|
||||
}
|
||||
auto dst_snp = read(reg, tracker.cleaner(), s.shared_from_this(), &tracker, phase);
|
||||
auto merge_dst_snp = defer([preemptible, dst_snp, ®, &alloc] () mutable {
|
||||
maybe_merge_versions(dst_snp, reg, alloc);
|
||||
});
|
||||
|
||||
// Once we start updating the partition, we must keep all snapshots until the update completes,
|
||||
// otherwise partial writes would be published. So the scope of snapshots must enclose the scope
|
||||
@@ -477,7 +503,6 @@ coroutine partition_entry::apply_to_incomplete(const schema& s,
|
||||
// give the caller a chance to store the coroutine object. The code inside coroutine below
|
||||
// runs outside allocating section.
|
||||
return coroutine([&tracker, &s, &alloc, ®, &acc, can_move, preemptible,
|
||||
merge_dst_snp = std::move(merge_dst_snp), // needs to go away last so that dst_snp is not owned by anyone else
|
||||
cur = partition_snapshot_row_cursor(s, *dst_snp),
|
||||
src_cur = partition_snapshot_row_cursor(s, *src_snp, can_move),
|
||||
dst_snp = std::move(dst_snp),
|
||||
@@ -584,7 +609,7 @@ void partition_entry::upgrade(schema_ptr from, schema_ptr to, mutation_cleaner&
|
||||
remove_or_mark_as_unique_owner(old_version, &cleaner);
|
||||
}
|
||||
|
||||
lw_shared_ptr<partition_snapshot> partition_entry::read(logalloc::region& r,
|
||||
partition_snapshot_ptr partition_entry::read(logalloc::region& r,
|
||||
mutation_cleaner& cleaner, schema_ptr entry_schema, cache_tracker* tracker, partition_snapshot::phase_type phase)
|
||||
{
|
||||
if (_snapshot) {
|
||||
@@ -607,7 +632,7 @@ lw_shared_ptr<partition_snapshot> partition_entry::read(logalloc::region& r,
|
||||
|
||||
auto snp = make_lw_shared<partition_snapshot>(entry_schema, r, cleaner, this, tracker, phase);
|
||||
_snapshot = snp.get();
|
||||
return snp;
|
||||
return partition_snapshot_ptr(std::move(snp));
|
||||
}
|
||||
|
||||
std::vector<range_tombstone>
|
||||
@@ -671,3 +696,13 @@ void partition_entry::evict(mutation_cleaner& cleaner) noexcept {
|
||||
remove_or_mark_as_unique_owner(v, &cleaner);
|
||||
}
|
||||
}
|
||||
|
||||
partition_snapshot_ptr::~partition_snapshot_ptr() {
|
||||
if (_snp) {
|
||||
auto&& cleaner = _snp->cleaner();
|
||||
auto snp = _snp.release();
|
||||
if (snp) {
|
||||
cleaner.merge_and_destroy(*snp.release());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -28,6 +28,7 @@
|
||||
#include "utils/coroutine.hh"
|
||||
|
||||
#include <boost/intrusive/parent_from_member.hpp>
|
||||
#include <boost/intrusive/slist.hpp>
|
||||
|
||||
// This is MVCC implementation for mutation_partitions.
|
||||
//
|
||||
@@ -188,8 +189,9 @@ class partition_version_ref {
|
||||
friend class partition_version;
|
||||
public:
|
||||
partition_version_ref() = default;
|
||||
explicit partition_version_ref(partition_version& pv) noexcept
|
||||
explicit partition_version_ref(partition_version& pv, bool unique_owner = false) noexcept
|
||||
: _version(&pv)
|
||||
, _unique_owner(unique_owner)
|
||||
{
|
||||
assert(!_version->_backref);
|
||||
_version->_backref = this;
|
||||
@@ -300,8 +302,9 @@ private:
|
||||
logalloc::region& _region;
|
||||
mutation_cleaner& _cleaner;
|
||||
cache_tracker* _tracker;
|
||||
|
||||
boost::intrusive::slist_member_hook<> _cleaner_hook;
|
||||
friend class partition_entry;
|
||||
friend class mutation_cleaner_impl;
|
||||
public:
|
||||
explicit partition_snapshot(schema_ptr s,
|
||||
logalloc::region& region,
|
||||
@@ -329,10 +332,17 @@ public:
|
||||
return container_of(v._backref);
|
||||
}
|
||||
|
||||
// If possible merges the version pointed to by this snapshot with
|
||||
// If possible, merges the version pointed to by this snapshot with
|
||||
// adjacent partition versions. Leaves the snapshot in an unspecified state.
|
||||
// Can be retried if previous merge attempt has failed.
|
||||
void merge_partition_versions();
|
||||
stop_iteration merge_partition_versions();
|
||||
|
||||
// Prepares the snapshot for cleaning by moving to the right-most unreferenced version.
|
||||
// Returns stop_iteration::yes if there is nothing to merge with and the snapshot
|
||||
// should be collected right away, and stop_iteration::no otherwise.
|
||||
// When returns stop_iteration::no, the snapshots is guaranteed to not be attached
|
||||
// to the latest version.
|
||||
stop_iteration slide_to_oldest() noexcept;
|
||||
|
||||
~partition_snapshot();
|
||||
|
||||
@@ -357,6 +367,7 @@ public:
|
||||
const schema_ptr& schema() const { return _schema; }
|
||||
logalloc::region& region() const { return _region; }
|
||||
cache_tracker* tracker() const { return _tracker; }
|
||||
mutation_cleaner& cleaner() { return _cleaner; }
|
||||
|
||||
tombstone partition_tombstone() const;
|
||||
::static_row static_row(bool digest_requested) const;
|
||||
@@ -368,6 +379,36 @@ public:
|
||||
std::vector<range_tombstone> range_tombstones();
|
||||
};
|
||||
|
||||
class partition_snapshot_ptr {
|
||||
lw_shared_ptr<partition_snapshot> _snp;
|
||||
public:
|
||||
using value_type = partition_snapshot;
|
||||
partition_snapshot_ptr() = default;
|
||||
partition_snapshot_ptr(partition_snapshot_ptr&&) = default;
|
||||
partition_snapshot_ptr(const partition_snapshot_ptr&) = default;
|
||||
partition_snapshot_ptr(lw_shared_ptr<partition_snapshot> snp) : _snp(std::move(snp)) {}
|
||||
~partition_snapshot_ptr();
|
||||
partition_snapshot_ptr& operator=(partition_snapshot_ptr&& other) noexcept {
|
||||
if (this != &other) {
|
||||
this->~partition_snapshot_ptr();
|
||||
new (this) partition_snapshot_ptr(std::move(other));
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
partition_snapshot_ptr& operator=(const partition_snapshot_ptr& other) noexcept {
|
||||
if (this != &other) {
|
||||
this->~partition_snapshot_ptr();
|
||||
new (this) partition_snapshot_ptr(other);
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
partition_snapshot& operator*() { return *_snp; }
|
||||
const partition_snapshot& operator*() const { return *_snp; }
|
||||
partition_snapshot* operator->() { return &*_snp; }
|
||||
const partition_snapshot* operator->() const { return &*_snp; }
|
||||
explicit operator bool() const { return bool(_snp); }
|
||||
};
|
||||
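
A side note on the assignment operators above (illustration only, with a made-up widget type): they rely on the destroy-and-reconstruct idiom, ending the current object's lifetime and constructing the source into the same storage, so the operators can never get out of sync with the constructors:

#include <new>
#include <string>
#include <utility>

struct widget {
    std::string payload;                 // stand-in for non-trivial members
    widget() = default;
    widget(widget&&) noexcept = default;
    widget& operator=(widget&& other) noexcept {
        if (this != &other) {                        // self-move would otherwise destroy the source
            this->~widget();                         // end the lifetime of the current object
            new (this) widget(std::move(other));     // move-construct into the same storage
        }
        return *this;
    }
};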
|
||||
class real_dirty_memory_accounter;
|
||||
|
||||
// Represents mutation_partition with snapshotting support a la MVCC.
|
||||
@@ -523,7 +564,7 @@ public:
|
||||
void upgrade(schema_ptr from, schema_ptr to, mutation_cleaner&, cache_tracker*);
|
||||
|
||||
// Snapshots with different values of phase will point to different partition_version objects.
|
||||
lw_shared_ptr<partition_snapshot> read(logalloc::region& region,
|
||||
partition_snapshot_ptr read(logalloc::region& region,
|
||||
mutation_cleaner&,
|
||||
schema_ptr entry_schema,
|
||||
cache_tracker*,
|
||||
|
||||
@@ -121,7 +121,7 @@ public:
|
||||
position_in_partition_view(const clustering_key_prefix& ck)
|
||||
: _type(partition_region::clustered), _ck(&ck) { }
|
||||
position_in_partition_view(range_tag_t, bound_view bv)
|
||||
: _type(partition_region::clustered), _bound_weight(position_weight(bv.kind)), _ck(&bv.prefix) { }
|
||||
: _type(partition_region::clustered), _bound_weight(position_weight(bv.kind())), _ck(&bv.prefix()) { }
|
||||
|
||||
static position_in_partition_view for_range_start(const query::clustering_range& r) {
|
||||
return {position_in_partition_view::range_tag_t(), bound_view::from_range_start(r)};
|
||||
@@ -214,7 +214,7 @@ public:
|
||||
position_in_partition(before_clustering_row_tag_t, clustering_key_prefix ck)
|
||||
: _type(partition_region::clustered), _bound_weight(-1), _ck(std::move(ck)) { }
|
||||
position_in_partition(range_tag_t, bound_view bv)
|
||||
: _type(partition_region::clustered), _bound_weight(position_weight(bv.kind)), _ck(bv.prefix) { }
|
||||
: _type(partition_region::clustered), _bound_weight(position_weight(bv.kind())), _ck(bv.prefix()) { }
|
||||
position_in_partition(after_static_row_tag_t) :
|
||||
position_in_partition(range_tag_t(), bound_view::bottom()) { }
|
||||
explicit position_in_partition(position_in_partition_view view)
|
||||
|
||||
@@ -68,10 +68,10 @@ void range_tombstone_accumulator::drop_unneeded_tombstones(const clustering_key_
|
||||
auto cmp = [&] (const range_tombstone& rt, const clustering_key_prefix& ck, int w) {
|
||||
if (_reversed) {
|
||||
auto bv = rt.start_bound();
|
||||
return _cmp(ck, w, bv.prefix, weight(bv.kind));
|
||||
return _cmp(ck, w, bv.prefix(), weight(bv.kind()));
|
||||
}
|
||||
auto bv = rt.end_bound();
|
||||
return _cmp(bv.prefix, weight(bv.kind), ck, w);
|
||||
return _cmp(bv.prefix(), weight(bv.kind()), ck, w);
|
||||
};
|
||||
while (!_range_tombstones.empty() && cmp(*_range_tombstones.begin(), ck, w)) {
|
||||
_range_tombstones.pop_front();
|
||||
|
||||
@@ -52,7 +52,7 @@ public:
|
||||
, tomb(std::move(tomb))
|
||||
{ }
|
||||
range_tombstone(bound_view start, bound_view end, tombstone tomb)
|
||||
: range_tombstone(start.prefix, start.kind, end.prefix, end.kind, std::move(tomb))
|
||||
: range_tombstone(start.prefix(), start.kind(), end.prefix(), end.kind(), std::move(tomb))
|
||||
{ }
|
||||
range_tombstone(clustering_key_prefix&& start, clustering_key_prefix&& end, tombstone tomb)
|
||||
: range_tombstone(std::move(start), bound_kind::incl_start, std::move(end), bound_kind::incl_end, std::move(tomb))
|
||||
@@ -151,6 +151,9 @@ public:
|
||||
}
|
||||
if (less(position(), pos)) {
|
||||
set_start(s, pos);
|
||||
bound_view new_start = pos.as_start_bound_view();
|
||||
start = new_start.prefix();
|
||||
start_kind = new_start.kind();
|
||||
}
|
||||
return true;
|
||||
}
|
||||
@@ -158,8 +161,8 @@ public:
|
||||
// Assumes !pos.is_clustering_row(), because range_tombstone bounds can't represent such positions
|
||||
void set_start(const schema& s, position_in_partition_view pos) {
|
||||
bound_view new_start = pos.as_start_bound_view();
|
||||
start = new_start.prefix;
|
||||
start_kind = new_start.kind;
|
||||
start = new_start.prefix();
|
||||
start_kind = new_start.kind();
|
||||
}
|
||||
|
||||
size_t external_memory_usage(const schema&) const {
|
||||
|
||||
@@ -252,7 +252,7 @@ range_tombstone_list range_tombstone_list::difference(const schema& s, const ran
|
||||
++other_rt;
|
||||
continue;
|
||||
}
|
||||
auto new_end = bound_view(other_rt->start_bound().prefix, invert_kind(other_rt->start_bound().kind));
|
||||
auto new_end = bound_view(other_rt->start_bound().prefix(), invert_kind(other_rt->start_bound().kind()));
|
||||
if (cmp_rt(cur_start, new_end)) {
|
||||
diff.apply(s, cur_start, new_end, this_rt->tomb);
|
||||
cur_start = other_rt->start_bound();
|
||||
@@ -267,7 +267,7 @@ range_tombstone_list range_tombstone_list::difference(const schema& s, const ran
|
||||
if (this_rt->tomb > other_rt->tomb) {
|
||||
diff.apply(s, cur_start, end, this_rt->tomb);
|
||||
}
|
||||
cur_start = bound_view(end.prefix, invert_kind(end.kind));
|
||||
cur_start = bound_view(end.prefix(), invert_kind(end.kind()));
|
||||
++other_rt;
|
||||
if (cmp_rt(cur_end, cur_start)) {
|
||||
advance_this_rt();
|
||||
@@ -437,14 +437,18 @@ bool range_tombstone_list::equal(const schema& s, const range_tombstone_list& ot
|
||||
});
}

void range_tombstone_list::apply_monotonically(const schema& s, range_tombstone_list&& list) {
stop_iteration range_tombstone_list::apply_monotonically(const schema& s, range_tombstone_list&& list, is_preemptible preemptible) {
auto del = current_deleter<range_tombstone>();
auto it = list.begin();
while (it != list.end()) {
// FIXME: Optimize by stealing the entry
apply_monotonically(s, *it);
it = list._tombstones.erase_and_dispose(it, del);
if (preemptible && need_preempt()) {
return stop_iteration::no;
}
}
return stop_iteration::yes;
}
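
The return value follows the usual Seastar preemption contract: do one unit of work, and if the scheduler asks for the CPU back, return stop_iteration::no so the caller can reschedule and call again; whatever was already merged stays merged. A stripped-down sketch of the same pattern (drain() is hypothetical; stop_iteration, is_preemptible and need_preempt() are assumed to be in scope as in the file above):

#include <deque>

// Illustrative only: drain a queue preemptibly, mirroring apply_monotonically()'s contract.
stop_iteration drain(std::deque<int>& queue, long& total, is_preemptible preemptible) {
    while (!queue.empty()) {
        total += queue.front();          // one unit of work
        queue.pop_front();
        if (preemptible && need_preempt()) {
            return stop_iteration::no;   // yield; progress so far is kept, call drain() again later
        }
    }
    return stop_iteration::yes;
}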
|
||||
void range_tombstone_list::apply_monotonically(const schema& s, const range_tombstone_list& list) {
|
||||
|
||||
@@ -24,6 +24,7 @@
|
||||
#include "range_tombstone.hh"
|
||||
#include "query-request.hh"
|
||||
#include "position_in_partition.hh"
|
||||
#include "utils/preempt.hh"
|
||||
#include <iosfwd>
|
||||
|
||||
class range_tombstone_list final {
|
||||
@@ -125,7 +126,7 @@ public:
|
||||
return _tombstones.end();
|
||||
}
|
||||
void apply(const schema& s, const bound_view& start_bound, const bound_view& end_bound, tombstone tomb) {
|
||||
apply(s, start_bound.prefix, start_bound.kind, end_bound.prefix, end_bound.kind, std::move(tomb));
|
||||
apply(s, start_bound.prefix(), start_bound.kind(), end_bound.prefix(), end_bound.kind(), std::move(tomb));
|
||||
}
|
||||
void apply(const schema& s, const range_tombstone& rt) {
|
||||
apply(s, rt.start, rt.start_kind, rt.end, rt.end_kind, rt.tomb);
|
||||
@@ -149,7 +150,7 @@ public:
|
||||
/// Monotonic exception guarantees. In case of failure the object will contain at least as much information as before the call.
|
||||
/// The other list will be left in a state such that it would still commute with this object to the same state as it
|
||||
/// would if the call didn't fail.
|
||||
void apply_monotonically(const schema& s, range_tombstone_list&& list);
|
||||
stop_iteration apply_monotonically(const schema& s, range_tombstone_list&& list, is_preemptible = is_preemptible::no);
|
||||
public:
|
||||
tombstone search_tombstone_covering(const schema& s, const clustering_key_prefix& key) const;
|
||||
// Returns range of tombstones which overlap with given range
|
||||
|
||||
@@ -1004,6 +1004,22 @@ static dht::token_range_vector get_primary_ranges(
|
||||
utils::fb_utilities::get_broadcast_address());
|
||||
}
|
||||
|
||||
// get_primary_ranges_within_dc() is similar to get_primary_ranges(),
// but instead of each range being assigned just one primary owner
// across the entire cluster, here each range is assigned a primary
// owner in each of the datacenters.
static dht::token_range_vector get_primary_ranges_within_dc(
database& db, sstring keyspace) {
auto& rs = db.find_keyspace(keyspace).get_replication_strategy();
return rs.get_primary_ranges_within_dc(
utils::fb_utilities::get_broadcast_address());
}
|
||||
static sstring get_local_dc() {
|
||||
return locator::i_endpoint_snitch::get_local_snitch_ptr()->get_datacenter(
|
||||
utils::fb_utilities::get_broadcast_address());
|
||||
}
|
||||
|
||||
|
||||
struct repair_options {
|
||||
// If primary_range is true, we should perform repair only on this node's
|
||||
@@ -1256,21 +1272,14 @@ static int do_repair_start(seastar::sharded<database>& db, sstring keyspace,
|
||||
rlogger.info("primary-range repair");
|
||||
// when "primary_range" option is on, neither data_centers nor hosts
|
||||
// may be set, except data_centers may contain only local DC (-local)
|
||||
#if 0
|
||||
if (options.data_centers.size() == 1 &&
|
||||
options.data_centers[0] == DatabaseDescriptor.getLocalDataCenter()) {
|
||||
options.data_centers[0] == get_local_dc()) {
|
||||
ranges = get_primary_ranges_within_dc(db.local(), keyspace);
|
||||
} else
|
||||
#endif
|
||||
#if 0
|
||||
if (options.data_centers.size() > 0 || options.hosts.size() > 0) {
|
||||
} else if (options.data_centers.size() > 0 || options.hosts.size() > 0) {
|
||||
throw std::runtime_error("You need to run primary range repair on all nodes in the cluster.");
|
||||
} else {
|
||||
#endif
|
||||
ranges = get_primary_ranges(db.local(), keyspace);
|
||||
#if 0
|
||||
}
|
||||
#endif
|
||||
} else {
|
||||
ranges = get_local_ranges(db.local(), keyspace);
|
||||
}
|
||||
|
||||
@@ -92,6 +92,11 @@ cache_tracker::~cache_tracker() {
|
||||
clear();
|
||||
}
|
||||
|
||||
void cache_tracker::set_compaction_scheduling_group(seastar::scheduling_group sg) {
|
||||
_memtable_cleaner.set_scheduling_group(sg);
|
||||
_garbage.set_scheduling_group(sg);
|
||||
}
|
||||
|
||||
void
|
||||
cache_tracker::setup_metrics() {
|
||||
namespace sm = seastar::metrics;
|
||||
@@ -933,8 +938,7 @@ future<> row_cache::do_update(external_updater eu, memtable& m, Updater updater)
|
||||
real_dirty_memory_accounter real_dirty_acc(m, _tracker);
|
||||
m.on_detach_from_region_group();
|
||||
_tracker.region().merge(m); // Now all data in memtable belongs to cache
|
||||
_tracker.memtable_cleaner().merge(m._memtable_cleaner);
|
||||
m._cleaner = &_tracker.memtable_cleaner();
|
||||
_tracker.memtable_cleaner().merge(m._cleaner);
|
||||
STAP_PROBE(scylla, row_cache_update_start);
|
||||
auto cleanup = defer([&m, this] {
|
||||
invalidate_sync(m);
|
||||
|
||||
@@ -269,6 +269,7 @@ public:
|
||||
mutation_cleaner& memtable_cleaner() { return _memtable_cleaner; }
|
||||
uint64_t partitions() const { return _stats.partitions; }
|
||||
const stats& get_stats() const { return _stats; }
|
||||
void set_compaction_scheduling_group(seastar::scheduling_group);
|
||||
};
|
||||
|
||||
inline
|
||||
|
||||
14
schema.cc
@@ -52,24 +52,18 @@ bool is_compatible(column_kind k1, column_kind k2) {
}

column_mapping_entry::column_mapping_entry(bytes name, sstring type_name)
: _name(std::move(name))
, _type(db::marshal::type_parser::parse(type_name))
: column_mapping_entry(std::move(name), db::marshal::type_parser::parse(type_name))
{
}

column_mapping_entry::column_mapping_entry(const column_mapping_entry& o)
: _name(o._name)
, _type(db::marshal::type_parser::parse(o._type->name()))
: column_mapping_entry(o._name, o._type->name())
{
}

column_mapping_entry& column_mapping_entry::operator=(const column_mapping_entry& o) {
if (this != &o) {
auto tmp = o;
this->~column_mapping_entry();
new (this) column_mapping_entry(std::move(tmp));
}
return *this;
auto copy = o;
return operator=(std::move(copy));
}
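
The rewritten copy assignment above no longer destroys the object and placement-new-constructs the copy in place; it builds a copy and hands it to the move assignment. The same copy-then-move-assign pattern in isolation (a sketch with a made-up entry type, not code from the tree):

#include <string>
#include <utility>

struct entry {
    std::string name;
    explicit entry(std::string n) : name(std::move(n)) {}
    entry(const entry&) = default;
    entry(entry&&) noexcept = default;
    entry& operator=(entry&&) noexcept = default;
    entry& operator=(const entry& o) {
        auto copy = o;                      // may throw, but *this is still untouched
        return operator=(std::move(copy));  // commit via the (noexcept) move assignment
    }
};

If making the copy throws, the destination is unchanged, so the operator keeps the strong exception guarantee without any manual cleanup.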
|
||||
template<typename Sequence>
|
||||
|
||||
@@ -295,9 +295,10 @@ static constexpr int DEFAULT_GC_GRACE_SECONDS = 864000;
|
||||
class column_mapping_entry {
|
||||
bytes _name;
|
||||
data_type _type;
|
||||
bool _is_atomic;
|
||||
public:
|
||||
column_mapping_entry(bytes name, data_type type)
|
||||
: _name(std::move(name)), _type(std::move(type)) { }
|
||||
: _name(std::move(name)), _type(std::move(type)), _is_atomic(_type->is_atomic()) { }
|
||||
column_mapping_entry(bytes name, sstring type_name);
|
||||
column_mapping_entry(const column_mapping_entry&);
|
||||
column_mapping_entry& operator=(const column_mapping_entry&);
|
||||
@@ -306,6 +307,7 @@ public:
|
||||
const bytes& name() const { return _name; }
|
||||
const data_type& type() const { return _type; }
|
||||
const sstring& type_name() const { return _type->name(); }
|
||||
bool is_atomic() const { return _is_atomic; }
|
||||
};
|
||||
|
||||
// Encapsulates information needed for converting mutations between different schema versions.
|
||||
|
||||
@@ -1,109 +0,0 @@
|
||||
#!/bin/bash -e
|
||||
#
|
||||
# Copyright (C) 2015 ScyllaDB
|
||||
|
||||
if [ "`id -u`" -ne 0 ]; then
|
||||
echo "Requires root permission."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
print_usage() {
|
||||
echo "scylla_install_pkg --local-pkg /home/scylla/rpms --repo [URL]"
|
||||
echo " --local-pkg install locally built .rpm/.deb on specified directory"
|
||||
echo " --repo repository for both install and update, specify .repo/.list file URL"
|
||||
echo " --repo-for-install repository for install, specify .repo/.list file URL"
|
||||
echo " --repo-for-update repository for update, specify .repo/.list file URL"
|
||||
exit 1
|
||||
}
|
||||
|
||||
LOCAL_PKG=
|
||||
UNSTABLE=0
|
||||
REPO_FOR_INSTALL=
|
||||
REPO_FOR_UPDATE=
|
||||
while [ $# -gt 0 ]; do
|
||||
case "$1" in
|
||||
"--local-pkg")
|
||||
LOCAL_PKG=$2
|
||||
shift 2
|
||||
;;
|
||||
"--repo")
|
||||
REPO_FOR_INSTALL=$2
|
||||
REPO_FOR_UPDATE=$2
|
||||
shift 2
|
||||
;;
|
||||
"--repo-for-install")
|
||||
REPO_FOR_INSTALL=$2
|
||||
shift 2
|
||||
;;
|
||||
"--repo-for-update")
|
||||
REPO_FOR_UPDATE=$2
|
||||
shift 2
|
||||
;;
|
||||
*)
|
||||
print_usage
|
||||
shift 1
|
||||
;;
|
||||
esac
|
||||
done
|
||||
|
||||
. /etc/os-release
|
||||
|
||||
if [ -f /etc/debian_version ]; then
|
||||
echo "#!/bin/sh" >> /usr/sbin/policy-rc.d
|
||||
echo "exit 101" >> /usr/sbin/policy-rc.d
|
||||
chmod +x /usr/sbin/policy-rc.d
|
||||
cp /etc/hosts /etc/hosts.orig
|
||||
echo 127.0.0.1 `hostname` >> /etc/hosts
|
||||
if [ "$REPO_FOR_INSTALL" != "" ]; then
|
||||
curl -L -o /etc/apt/sources.list.d/scylla_install.list $REPO_FOR_INSTALL
|
||||
fi
|
||||
apt-get -o Acquire::AllowInsecureRepositories=true \
|
||||
-o Acquire::AllowDowngradeToInsecureRepositories=true update
|
||||
if [ "$LOCAL_PKG" = "" ]; then
|
||||
apt-get install -o APT::Get::AllowUnauthenticated=true \
|
||||
-y --force-yes scylla
|
||||
else
|
||||
if [ ! -f /usr/bin/gdebi ]; then
|
||||
apt-get install -y --force-yes gdebi-core
|
||||
fi
|
||||
echo Y | gdebi $LOCAL_PKG/scylla-kernel-conf*.deb
|
||||
echo Y | gdebi $LOCAL_PKG/scylla-conf*.deb
|
||||
echo Y | gdebi $LOCAL_PKG/scylla-server_*.deb
|
||||
echo Y | gdebi $LOCAL_PKG/scylla-server-dbg*.deb
|
||||
echo Y | gdebi $LOCAL_PKG/scylla-jmx*.deb
|
||||
echo Y | gdebi $LOCAL_PKG/scylla-tools*.deb
|
||||
echo Y | gdebi $LOCAL_PKG/scylla_*.deb
|
||||
fi
|
||||
mv /etc/hosts.orig /etc/hosts
|
||||
rm /usr/sbin/policy-rc.d
|
||||
rm /etc/apt/sources.list.d/scylla_install.list
|
||||
if [ "$REPO_FOR_UPDATE" != "" ]; then
|
||||
curl -L -o /etc/apt/sources.list.d/scylla.list $REPO_FOR_UPDATE
|
||||
fi
|
||||
apt-get -o Acquire::AllowInsecureRepositories=true \
|
||||
-o Acquire::AllowDowngradeToInsecureRepositories=true update
|
||||
else
|
||||
if [ "$REPO_FOR_INSTALL" != "" ]; then
|
||||
curl -L -o /etc/yum.repos.d/scylla_install.repo $REPO_FOR_INSTALL
|
||||
fi
|
||||
|
||||
if [ "$ID" = "centos" ]; then
|
||||
yum install -y epel-release
|
||||
elif [ "$ID" = "rhel" ]; then
|
||||
rpm -ivh http://download.fedoraproject.org/pub/epel/7/x86_64/e/epel-release-7-7.noarch.rpm
|
||||
else
|
||||
echo "Unsupported distribution"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if [ "$LOCAL_PKG" = "" ]; then
|
||||
yum install -y scylla
|
||||
else
|
||||
yum install -y $LOCAL_PKG/scylla*.*.rpm
|
||||
fi
|
||||
|
||||
rm /etc/yum.repos.d/scylla_install.repo
|
||||
if [ "$REPO_FOR_UPDATE" != "" ]; then
|
||||
curl -L -o /etc/yum.repos.d/scylla.repo $REPO_FOR_UPDATE
|
||||
fi
|
||||
fi
|
||||
146
scylla-gdb.py
@@ -58,6 +58,57 @@ class intrusive_list:
|
||||
def __bool__(self):
|
||||
return self.__nonzero__()
|
||||
|
||||
class intrusive_set:
|
||||
size_t = gdb.lookup_type('size_t')
|
||||
|
||||
def __init__(self, ref):
|
||||
container_type = ref.type.strip_typedefs()
|
||||
self.node_type = container_type.template_argument(0)
|
||||
member_hook = get_template_arg_with_prefix(container_type, "boost::intrusive::member_hook")
|
||||
if not member_hook:
|
||||
raise Exception('Expected member_hook<> option not found in container\'s template parameters')
|
||||
self.link_offset = member_hook.template_argument(2).cast(self.size_t)
|
||||
self.root = ref['holder']['root']['parent_']
|
||||
|
||||
def __visit(self, node):
|
||||
if node:
|
||||
for n in self.__visit(node['left_']):
|
||||
yield n
|
||||
|
||||
node_ptr = node.cast(self.size_t) - self.link_offset
|
||||
yield node_ptr.cast(self.node_type.pointer()).dereference()
|
||||
|
||||
for n in self.__visit(node['right_']):
|
||||
yield n
|
||||
|
||||
def __iter__(self):
|
||||
for n in self.__visit(self.root):
|
||||
yield n
|
||||
|
||||
class intrusive_set_external_comparator:
|
||||
size_t = gdb.lookup_type('size_t')
|
||||
|
||||
def __init__(self, ref):
|
||||
container_type = ref.type.strip_typedefs()
|
||||
self.node_type = container_type.template_argument(0)
|
||||
self.link_offset = container_type.template_argument(1).cast(self.size_t)
|
||||
self.root = ref['_header']['parent_']
|
||||
|
||||
def __visit(self, node):
|
||||
if node:
|
||||
for n in self.__visit(node['left_']):
|
||||
yield n
|
||||
|
||||
node_ptr = node.cast(self.size_t) - self.link_offset
|
||||
yield node_ptr.cast(self.node_type.pointer()).dereference()
|
||||
|
||||
for n in self.__visit(node['right_']):
|
||||
yield n
|
||||
|
||||
def __iter__(self):
|
||||
for n in self.__visit(self.root):
|
||||
yield n
|
||||
|
||||
class std_array:
|
||||
def __init__(self, ref):
|
||||
self.ref = ref
|
||||
@@ -120,6 +171,96 @@ class sstring_printer(gdb.printing.PrettyPrinter):
|
||||
def display_hint(self):
|
||||
return 'string'
|
||||
|
||||
class managed_bytes_printer(gdb.printing.PrettyPrinter):
|
||||
'print a managed_bytes'
|
||||
def __init__(self, val):
|
||||
self.val = val
|
||||
|
||||
def bytes(self):
|
||||
def signed_chr(c):
|
||||
return int(c).to_bytes(1, byteorder='little', signed=True)
|
||||
if self.val['_u']['small']['size'] >= 0:
|
||||
array = self.val['_u']['small']['data']
|
||||
len = int(self.val['_u']['small']['size'])
|
||||
return b''.join([signed_chr(array[x]) for x in range(len)])
|
||||
else:
|
||||
ref = self.val['_u']['ptr']
|
||||
chunks = list()
|
||||
while ref['ptr']:
|
||||
array = ref['ptr']['data']
|
||||
len = int(ref['ptr']['frag_size'])
|
||||
ref = ref['ptr']['next']
|
||||
chunks.append(b''.join([signed_chr(array[x]) for x in range(len)]))
|
||||
return b''.join(chunks)
|
||||
|
||||
def to_string(self):
|
||||
return str(self.bytes())
|
||||
|
||||
def display_hint(self):
|
||||
return 'managed_bytes'
|
||||
|
||||
class partition_entry_printer(gdb.printing.PrettyPrinter):
|
||||
def __init__(self, val):
|
||||
self.val = val
|
||||
|
||||
def to_string(self):
|
||||
versions = list()
|
||||
v = self.val['_version']['_version']
|
||||
while v:
|
||||
versions.append('@%s: %s' % (v, v.dereference()))
|
||||
v = v['_next']
|
||||
return '{_snapshot=%s, _version=%s, versions=[\n%s\n]}' % (self.val['_snapshot'], self.val['_version'], ',\n'.join(versions))
|
||||
|
||||
def display_hint(self):
|
||||
return 'partition_entry'
|
||||
|
||||
class mutation_partition_printer(gdb.printing.PrettyPrinter):
|
||||
def __init__(self, val):
|
||||
self.val = val
|
||||
|
||||
def to_string(self):
|
||||
rows = list(str(r) for r in intrusive_set_external_comparator(self.val['_rows']))
|
||||
range_tombstones = list(str(r) for r in intrusive_set(self.val['_row_tombstones']['_tombstones']))
|
||||
return '{_tombstone=%s, _static_row=%s (cont=%s), _row_tombstones=[%s], _rows=[%s]}' % (
|
||||
self.val['_tombstone'],
|
||||
self.val['_static_row'],
|
||||
('no', 'yes')[self.val['_static_row_continuous']],
|
||||
'\n' + ',\n'.join(range_tombstones) + '\n' if range_tombstones else '',
|
||||
'\n' + ',\n'.join(rows) + '\n' if rows else '')
|
||||
|
||||
def display_hint(self):
|
||||
return 'mutation_partition'
|
||||
|
||||
class row_printer(gdb.printing.PrettyPrinter):
|
||||
def __init__(self, val):
|
||||
self.val = val
|
||||
|
||||
def to_string(self):
|
||||
if self.val['_type'] == gdb.parse_and_eval('row::storage_type::vector'):
|
||||
cells = str(self.val['_storage']['vector'])
|
||||
elif self.val['_type'] == gdb.parse_and_eval('row::storage_type::set'):
|
||||
cells = '[%s]' % (', '.join(str(cell) for cell in intrusive_set(self.val['_storage']['set'])))
|
||||
else:
|
||||
raise Exception('Unsupported storage type: ' + self.val['_type'])
|
||||
return '{type=%s, cells=%s}' % (self.val['_type'], cells)
|
||||
|
||||
def display_hint(self):
|
||||
return 'row'
|
||||
|
||||
class managed_vector_printer(gdb.printing.PrettyPrinter):
|
||||
def __init__(self, val):
|
||||
self.val = val
|
||||
|
||||
def to_string(self):
|
||||
size = int(self.val['_size'])
|
||||
items = list()
|
||||
for i in range(size):
|
||||
items.append(str(self.val['_data'][i]))
|
||||
return '{size=%d, items=[%s]}' % (size, ', '.join(items))
|
||||
|
||||
def display_hint(self):
|
||||
return 'managed_vector'
|
||||
|
||||
class uuid_printer(gdb.printing.PrettyPrinter):
|
||||
'print a uuid'
|
||||
def __init__(self, val):
|
||||
@@ -135,6 +276,11 @@ class uuid_printer(gdb.printing.PrettyPrinter):
|
||||
def build_pretty_printer():
|
||||
pp = gdb.printing.RegexpCollectionPrettyPrinter('scylla')
|
||||
pp.add_printer('sstring', r'^seastar::basic_sstring<char,.*>$', sstring_printer)
|
||||
pp.add_printer('managed_bytes', r'^managed_bytes$', managed_bytes_printer)
|
||||
pp.add_printer('partition_entry', r'^partition_entry$', partition_entry_printer)
|
||||
pp.add_printer('mutation_partition', r'^mutation_partition$', mutation_partition_printer)
|
||||
pp.add_printer('row', r'^row$', row_printer)
|
||||
pp.add_printer('managed_vector', r'^managed_vector<.*>$', managed_vector_printer)
|
||||
pp.add_printer('uuid', r'^utils::UUID$', uuid_printer)
|
||||
return pp
|
||||
|
||||
|
||||
@@ -70,7 +70,7 @@ namespace pager {
|
||||
* is done even though it is.
|
||||
*/
|
||||
class query_pager {
|
||||
private:
|
||||
protected:
|
||||
// remember if we use clustering. if not, each partition == one row
|
||||
const bool _has_clustering_keys;
|
||||
bool _exhausted = false;
|
||||
@@ -95,6 +95,7 @@ public:
|
||||
db::timeout_clock::duration timeout,
|
||||
lw_shared_ptr<query::read_command> cmd,
|
||||
dht::partition_range_vector ranges);
|
||||
virtual ~query_pager() {}
|
||||
|
||||
/**
|
||||
* Fetches the next page.
|
||||
@@ -107,7 +108,7 @@ public:
|
||||
/**
|
||||
* For more than one page.
|
||||
*/
|
||||
future<> fetch_page(cql3::selection::result_set_builder&, uint32_t page_size, gc_clock::time_point);
|
||||
virtual future<> fetch_page(cql3::selection::result_set_builder&, uint32_t page_size, gc_clock::time_point);
|
||||
|
||||
future<cql3::result_generator> fetch_page_generator(uint32_t page_size, gc_clock::time_point now, cql3::cql_stats& stats);
|
||||
|
||||
@@ -140,7 +141,7 @@ public:
|
||||
*/
|
||||
::shared_ptr<const paging_state> state() const;
|
||||
|
||||
private:
|
||||
protected:
|
||||
template<typename Base>
|
||||
class query_result_visitor;
|
||||
|
||||
|
||||
@@ -85,7 +85,7 @@ static bool has_clustering_keys(const schema& s, const query::read_command& cmd)
|
||||
_query_read_repair_decision = state->get_query_read_repair_decision();
|
||||
} else {
|
||||
// Reusing readers is currently only supported for singular queries.
|
||||
if (_ranges.front().is_singular()) {
|
||||
if (!_ranges.empty() && query::is_single_partition(_ranges.front())) {
|
||||
_cmd->query_uuid = utils::make_random_uuid();
|
||||
}
|
||||
_cmd->is_first_page = true;
|
||||
@@ -166,7 +166,7 @@ static bool has_clustering_keys(const schema& s, const query::read_command& cmd)
|
||||
auto it = ranges.begin();
|
||||
while (it != ranges.end()) {
|
||||
auto range = bound_view::from_range(*it);
|
||||
if (cmp(end_bound(range), lo) || eq(end_bound(range).prefix, lo)) {
|
||||
if (cmp(end_bound(range), lo) || eq(end_bound(range).prefix(), lo)) {
|
||||
qlogger.trace("Remove ck range {}", *it);
|
||||
it = ranges.erase(it);
|
||||
continue;
|
||||
@@ -262,6 +262,37 @@ future<cql3::result_generator> query_pager::fetch_page_generator(uint32_t page_s
|
||||
});
|
||||
}
|
||||
|
||||
class filtering_query_pager : public query_pager {
|
||||
::shared_ptr<cql3::restrictions::statement_restrictions> _filtering_restrictions;
|
||||
cql3::cql_stats& _stats;
|
||||
public:
|
||||
filtering_query_pager(schema_ptr s, shared_ptr<const cql3::selection::selection> selection,
|
||||
service::query_state& state,
|
||||
const cql3::query_options& options,
|
||||
db::timeout_clock::duration timeout,
|
||||
lw_shared_ptr<query::read_command> cmd,
|
||||
dht::partition_range_vector ranges,
|
||||
::shared_ptr<cql3::restrictions::statement_restrictions> filtering_restrictions,
|
||||
cql3::cql_stats& stats)
|
||||
: query_pager(s, selection, state, options, timeout, std::move(cmd), std::move(ranges))
|
||||
, _filtering_restrictions(std::move(filtering_restrictions))
|
||||
, _stats(stats)
|
||||
{}
|
||||
virtual ~filtering_query_pager() {}
|
||||
|
||||
virtual future<> fetch_page(cql3::selection::result_set_builder& builder, uint32_t page_size, gc_clock::time_point now) override {
|
||||
return do_fetch_page(page_size, now).then([this, &builder, page_size, now] (service::storage_proxy::coordinator_query_result qr) {
|
||||
_last_replicas = std::move(qr.last_replicas);
|
||||
_query_read_repair_decision = qr.read_repair_decision;
|
||||
qr.query_result->ensure_counts();
|
||||
_stats.filtered_rows_read_total += *qr.query_result->row_count();
|
||||
handle_result(cql3::selection::result_set_builder::visitor(builder, *_schema, *_selection,
|
||||
cql3::selection::result_set_builder::restrictions_filter(_filtering_restrictions)),
|
||||
std::move(qr.query_result), page_size, now);
|
||||
});
|
||||
}
|
||||
};
|
||||
|
||||
template<typename Base>
|
||||
class query_pager::query_result_visitor : public Base {
|
||||
using visitor = Base;
|
||||
@@ -372,7 +403,13 @@ bool service::pager::query_pagers::may_need_paging(uint32_t page_size,
|
||||
service::query_state& state, const cql3::query_options& options,
|
||||
db::timeout_clock::duration timeout,
|
||||
lw_shared_ptr<query::read_command> cmd,
|
||||
dht::partition_range_vector ranges) {
|
||||
dht::partition_range_vector ranges,
|
||||
cql3::cql_stats& stats,
|
||||
::shared_ptr<cql3::restrictions::statement_restrictions> filtering_restrictions) {
|
||||
if (filtering_restrictions) {
|
||||
return ::make_shared<filtering_query_pager>(std::move(s), std::move(selection), state,
|
||||
options, timeout, std::move(cmd), std::move(ranges), std::move(filtering_restrictions), stats);
|
||||
}
|
||||
return ::make_shared<query_pager>(std::move(s), std::move(selection), state,
|
||||
options, timeout, std::move(cmd), std::move(ranges));
|
||||
}
|
||||
|
||||
@@ -66,7 +66,9 @@ public:
|
||||
const cql3::query_options&,
|
||||
db::timeout_clock::duration timeout,
|
||||
lw_shared_ptr<query::read_command>,
|
||||
dht::partition_range_vector);
|
||||
dht::partition_range_vector,
|
||||
cql3::cql_stats& stats,
|
||||
::shared_ptr<cql3::restrictions::statement_restrictions> filtering_restrictions = nullptr);
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
@@ -211,7 +211,7 @@ protected:
|
||||
|
||||
protected:
|
||||
virtual bool waited_for(gms::inet_address from) = 0;
|
||||
virtual void signal(gms::inet_address from) {
|
||||
void signal(gms::inet_address from) {
|
||||
if (waited_for(from)) {
|
||||
signal();
|
||||
}
|
||||
@@ -221,7 +221,7 @@ public:
|
||||
abstract_write_response_handler(shared_ptr<storage_proxy> p, keyspace& ks, db::consistency_level cl, db::write_type type,
|
||||
std::unique_ptr<mutation_holder> mh, std::unordered_set<gms::inet_address> targets, tracing::trace_state_ptr trace_state,
|
||||
storage_proxy::write_stats& stats, size_t pending_endpoints = 0, std::vector<gms::inet_address> dead_endpoints = {})
|
||||
: _id(p->_next_response_id++), _proxy(std::move(p)), _trace_state(trace_state), _cl(cl), _type(type), _mutation_holder(std::move(mh)), _targets(std::move(targets)),
|
||||
: _id(p->get_next_response_id()), _proxy(std::move(p)), _trace_state(trace_state), _cl(cl), _type(type), _mutation_holder(std::move(mh)), _targets(std::move(targets)),
|
||||
_dead_endpoints(std::move(dead_endpoints)), _stats(stats) {
|
||||
// original comment from cassandra:
|
||||
// during bootstrap, include pending endpoints in the count
|
||||
@@ -285,10 +285,13 @@ public:
|
||||
}
|
||||
// return true on last ack
|
||||
bool response(gms::inet_address from) {
|
||||
signal(from);
|
||||
auto it = _targets.find(from);
|
||||
assert(it != _targets.end());
|
||||
_targets.erase(it);
|
||||
if (it != _targets.end()) {
|
||||
signal(from);
|
||||
_targets.erase(it);
|
||||
} else {
|
||||
slogger.warn("Receive outdated write ack from {}", from);
|
||||
}
|
||||
return _targets.size() == 0;
|
||||
}
|
||||
future<> wait() {
|
||||
@@ -632,9 +635,12 @@ void storage_proxy_stats::split_stats::register_metrics_for(gms::inet_address ep
|
||||
}
|
||||
}
|
||||
|
||||
using namespace std::literals::chrono_literals;
|
||||
|
||||
storage_proxy::~storage_proxy() {}
|
||||
storage_proxy::storage_proxy(distributed<database>& db, storage_proxy::config cfg)
|
||||
: _db(db)
|
||||
, _next_response_id(std::chrono::system_clock::now().time_since_epoch()/1ms)
|
||||
, _hints_resource_manager(cfg.available_memory / 10)
|
||||
, _hints_for_views_manager(_db.local().get_config().data_file_directories()[0] + "/view_pending_updates", {}, _db.local().get_config().max_hint_window_in_ms(), _hints_resource_manager, _db)
|
||||
, _background_write_throttle_threahsold(cfg.available_memory / 10) {
|
||||
@@ -2118,7 +2124,7 @@ class data_read_resolver : public abstract_read_resolver {
|
||||
|
||||
struct primary_key {
|
||||
dht::decorated_key partition;
|
||||
stdx::optional<clustering_key> clustering;
|
||||
std::optional<clustering_key> clustering;
|
||||
|
||||
class less_compare_clustering {
|
||||
bool _is_reversed;
|
||||
@@ -2214,33 +2220,7 @@ private:
|
||||
}
|
||||
|
||||
static primary_key get_last_row(const schema& s, const partition& p, bool is_reversed) {
|
||||
class last_clustering_key final : public mutation_partition_visitor {
|
||||
stdx::optional<clustering_key> _last_ck;
|
||||
bool _is_reversed;
|
||||
public:
|
||||
explicit last_clustering_key(bool is_reversed) : _is_reversed(is_reversed) { }
|
||||
|
||||
virtual void accept_partition_tombstone(tombstone) override { }
|
||||
virtual void accept_static_cell(column_id, atomic_cell_view) override { }
|
||||
virtual void accept_static_cell(column_id, collection_mutation_view) override { }
|
||||
virtual void accept_row_tombstone(const range_tombstone&) override { }
|
||||
virtual void accept_row(position_in_partition_view pos, const row_tombstone&, const row_marker&, is_dummy dummy, is_continuous) override {
|
||||
assert(!dummy);
|
||||
if (!_is_reversed || !_last_ck) {
|
||||
_last_ck = pos.key();
|
||||
}
|
||||
}
|
||||
virtual void accept_row_cell(column_id id, atomic_cell_view) override { }
|
||||
virtual void accept_row_cell(column_id id, collection_mutation_view) override { }
|
||||
|
||||
stdx::optional<clustering_key>&& release() {
|
||||
return std::move(_last_ck);
|
||||
}
|
||||
};
|
||||
|
||||
last_clustering_key lck(is_reversed);
|
||||
p.mut().partition().accept(s, lck);
|
||||
return {p.mut().decorated_key(s), lck.release()};
|
||||
return {p.mut().decorated_key(s), is_reversed ? p.mut().partition().first_row_key() : p.mut().partition().last_row_key() };
|
||||
}
|
||||
|
||||
// Returns the highest row sent by the specified replica, according to the schema and the direction of
|
||||
@@ -2267,7 +2247,7 @@ private:
|
||||
auto&& ranges = cmd.slice.row_ranges(s, m.key());
|
||||
mp.compact_for_query(s, cmd.timestamp, ranges, is_reversed, limit);
|
||||
|
||||
stdx::optional<clustering_key> ck;
|
||||
std::optional<clustering_key> ck;
|
||||
if (!mp.clustered_rows().empty()) {
|
||||
if (is_reversed) {
|
||||
ck = mp.clustered_rows().begin()->key();
|
||||
|
||||
@@ -143,7 +143,7 @@ public:
|
||||
};
|
||||
private:
|
||||
distributed<database>& _db;
|
||||
response_id_type _next_response_id = 1; // 0 is reserved for unique_response_handler
|
||||
response_id_type _next_response_id;
|
||||
std::unordered_map<response_id_type, rh_entry> _response_handlers;
|
||||
// This buffer hold ids of throttled writes in case resource consumption goes
|
||||
// below the threshold and we want to unthrottle some of them. Without this throttled
|
||||
@@ -263,6 +263,13 @@ public:
|
||||
return _db;
|
||||
}
|
||||
|
||||
response_id_type get_next_response_id() {
|
||||
auto next = _next_response_id++;
|
||||
if (next == 0) { // 0 is reserved for unique_response_handler
|
||||
next = _next_response_id++;
|
||||
}
|
||||
return next;
|
||||
}
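get_next_response_id() keeps 0 reserved for unique_response_handler, so a wrapped counter has to step over it. A small sketch of the same skip with a plain uint64_t counter (not the actual storage_proxy member):

#include <cstdint>
#include <iostream>

struct id_source {
    uint64_t next = UINT64_MAX;  // start near the wrap point to demonstrate the skip

    uint64_t get_next_id() {
        auto id = next++;
        if (id == 0) {       // 0 stays reserved as a sentinel; hand out the next value
            id = next++;
        }
        return id;
    }
};

int main() {
    id_source src;
    std::cout << src.get_next_id() << "\n";  // 18446744073709551615
    std::cout << src.get_next_id() << "\n";  // 1 (0 was skipped after the wrap)
}
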
|
||||
void init_messaging_service();
|
||||
|
||||
// Applies mutation on this node.
|
||||
|
||||
@@ -2643,14 +2643,20 @@ future<> storage_service::send_replication_notification(inet_address remote) {
|
||||
// notify the remote token
|
||||
auto done = make_shared<bool>(false);
|
||||
auto local = get_broadcast_address();
|
||||
auto sent = make_lw_shared<int>(0);
|
||||
slogger.debug("Notifying {} of replication completion", remote);
|
||||
return do_until(
|
||||
[done, remote] {
|
||||
return *done || !gms::get_local_failure_detector().is_alive(remote);
|
||||
[done, sent, remote] {
|
||||
// The node can send REPLICATION_FINISHED to itself, in which case
|
||||
// is_alive will be true. If the messaging_service is stopped,
|
||||
// REPLICATION_FINISHED can be sent infinitely here. To fix, limit
|
||||
// the number of retries.
|
||||
return *done || !gms::get_local_failure_detector().is_alive(remote) || *sent >= 3;
|
||||
},
|
||||
[done, remote, local] {
|
||||
[done, sent, remote, local] {
|
||||
auto& ms = netw::get_local_messaging_service();
|
||||
netw::msg_addr id{remote, 0};
|
||||
(*sent)++;
|
||||
return ms.send_replication_finished(id, local).then_wrapped([id, done] (auto&& f) {
|
||||
try {
|
||||
f.get();
|
||||
|
||||
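The notification loop above now stops once the remote acks, the failure detector reports it dead, or three sends have gone out. The same bounded-retry shape in a plain, synchronous sketch (send() and peer_alive() are hypothetical stand-ins for the messaging service and failure detector):

#include <functional>
#include <iostream>

// Keep re-sending until the peer acknowledges, is seen as dead,
// or we have already tried three times.
void notify_replication_finished(const std::function<bool()>& send,
                                 const std::function<bool()>& peer_alive) {
    bool done = false;
    int sent = 0;
    while (!done && peer_alive() && sent < 3) {
        ++sent;
        done = send();  // true once the remote ack arrives
    }
    std::cout << "stopped after " << sent << " attempt(s), done=" << done << "\n";
}

int main() {
    // A peer that never acknowledges: the retry cap kicks in after 3 sends.
    notify_replication_finished([] { return false; },
                                [] { return true; });
}
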
@@ -75,7 +75,7 @@ static api::timestamp_type get_max_purgeable_timestamp(const column_family& cf,
|
||||
const std::unordered_set<shared_sstable>& compacting_set, const dht::decorated_key& dk) {
|
||||
auto timestamp = api::max_timestamp;
|
||||
stdx::optional<utils::hashed_key> hk;
|
||||
for (auto&& sst : boost::range::join(selector.select(dk.token()).sstables, cf.compacted_undeleted_sstables())) {
|
||||
for (auto&& sst : boost::range::join(selector.select(dk).sstables, cf.compacted_undeleted_sstables())) {
|
||||
if (compacting_set.count(sst)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
@@ -158,13 +158,10 @@ int compaction_manager::trim_to_compact(column_family* cf, sstables::compaction_
|
||||
}
|
||||
|
||||
bool compaction_manager::can_register_weight(column_family* cf, int weight) {
|
||||
auto has_cf_ongoing_compaction = [&] {
|
||||
auto ret = boost::range::count_if(_tasks, [&] (const lw_shared_ptr<task>& task) {
|
||||
return task->compacting_cf == cf;
|
||||
auto has_cf_ongoing_compaction = [&] () -> bool {
|
||||
return boost::range::count_if(_tasks, [&] (const lw_shared_ptr<task>& task) {
|
||||
return task->compacting_cf == cf && task->compaction_running;
|
||||
});
|
||||
// compaction task trying to proceed is already registered in task list,
|
||||
// so we must check for an additional one.
|
||||
return ret >= 2;
|
||||
};
|
||||
|
||||
// Only one weight is allowed if parallel compaction is disabled.
|
||||
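can_register_weight() now asks whether a task for the column family is actually running (task->compaction_running) rather than counting every registered task, so a merely queued task no longer blocks registration. A minimal sketch of that predicate over a task list, with plain structs instead of the compaction_manager types:

#include <algorithm>
#include <iostream>
#include <memory>
#include <vector>

struct task {
    int cf_id = 0;                   // stand-in for the column_family pointer
    bool compaction_running = false; // only set while the compaction fiber is active
};

// True if the given column family already has a compaction *running*,
// as opposed to merely registered/queued.
bool has_ongoing_compaction(const std::vector<std::shared_ptr<task>>& tasks, int cf_id) {
    return std::count_if(tasks.begin(), tasks.end(), [&] (const std::shared_ptr<task>& t) {
        return t->cf_id == cf_id && t->compaction_running;
    }) > 0;
}

int main() {
    auto queued = std::make_shared<task>();
    queued->cf_id = 1;                      // registered but not running
    auto running = std::make_shared<task>();
    running->cf_id = 2;
    running->compaction_running = true;

    std::vector<std::shared_ptr<task>> tasks = {queued, running};
    std::cout << has_ongoing_compaction(tasks, 1) << "\n";  // 0
    std::cout << has_ongoing_compaction(tasks, 2) << "\n";  // 1
}
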
@@ -188,7 +185,7 @@ void compaction_manager::register_weight(int weight) {
|
||||
|
||||
void compaction_manager::deregister_weight(int weight) {
|
||||
_weight_tracker.erase(weight);
|
||||
reevalute_postponed_compactions();
|
||||
reevaluate_postponed_compactions();
|
||||
}
|
||||
|
||||
std::vector<sstables::shared_sstable> compaction_manager::get_candidates(const column_family& cf) {
|
||||
@@ -406,7 +403,7 @@ void compaction_manager::postponed_compactions_reevaluation() {
|
||||
});
|
||||
}
|
||||
|
||||
void compaction_manager::reevalute_postponed_compactions() {
|
||||
void compaction_manager::reevaluate_postponed_compactions() {
|
||||
_postponed_reevaluation.signal();
|
||||
}
|
||||
|
||||
@@ -434,7 +431,7 @@ future<> compaction_manager::stop() {
|
||||
return this->task_stop(task);
|
||||
});
|
||||
}).then([this] () mutable {
|
||||
reevalute_postponed_compactions();
|
||||
reevaluate_postponed_compactions();
|
||||
return std::move(_waiting_reevalution);
|
||||
}).then([this] {
|
||||
_weight_tracker.clear();
|
||||
@@ -511,8 +508,10 @@ void compaction_manager::submit(column_family* cf) {
|
||||
|
||||
_stats.pending_tasks--;
|
||||
_stats.active_tasks++;
|
||||
task->compaction_running = true;
|
||||
return cf.run_compaction(std::move(descriptor)).then_wrapped([this, task, compacting = std::move(compacting)] (future<> f) mutable {
|
||||
_stats.active_tasks--;
|
||||
task->compaction_running = false;
|
||||
|
||||
if (!can_proceed(task)) {
|
||||
maybe_stop_on_error(std::move(f));
|
||||
@@ -528,6 +527,7 @@ void compaction_manager::submit(column_family* cf) {
|
||||
_stats.pending_tasks++;
|
||||
_stats.completed_tasks++;
|
||||
task->compaction_retry.reset();
|
||||
reevaluate_postponed_compactions();
|
||||
return make_ready_future<stop_iteration>(stop_iteration::no);
|
||||
});
|
||||
});
|
||||
@@ -569,12 +569,14 @@ future<> compaction_manager::perform_cleanup(column_family* cf) {
|
||||
|
||||
_stats.pending_tasks--;
|
||||
_stats.active_tasks++;
|
||||
task->compaction_running = true;
|
||||
compaction_backlog_tracker user_initiated(std::make_unique<user_initiated_backlog_tracker>(_compaction_controller.backlog_of_shares(200), _available_memory));
|
||||
return do_with(std::move(user_initiated), [this, &cf, descriptor = std::move(descriptor)] (compaction_backlog_tracker& bt) mutable {
|
||||
return with_scheduling_group(_scheduling_group, [this, &cf, descriptor = std::move(descriptor)] () mutable {
|
||||
return cf.cleanup_sstables(std::move(descriptor));
|
||||
});
|
||||
}).then_wrapped([this, task, compacting = std::move(compacting)] (future<> f) mutable {
|
||||
task->compaction_running = false;
|
||||
_stats.active_tasks--;
|
||||
if (!can_proceed(task)) {
|
||||
maybe_stop_on_error(std::move(f));
|
||||
@@ -588,6 +590,7 @@ future<> compaction_manager::perform_cleanup(column_family* cf) {
|
||||
});
|
||||
}
|
||||
_stats.completed_tasks++;
|
||||
reevaluate_postponed_compactions();
|
||||
return make_ready_future<stop_iteration>(stop_iteration::yes);
|
||||
});
|
||||
}).finally([this, task] {
|
||||
@@ -653,7 +656,7 @@ void compaction_manager::stop_compaction(sstring type) {
|
||||
|
||||
void compaction_manager::on_compaction_complete(compaction_weight_registration& weight_registration) {
|
||||
weight_registration.deregister();
|
||||
reevalute_postponed_compactions();
|
||||
reevaluate_postponed_compactions();
|
||||
}
|
||||
|
||||
double compaction_backlog_tracker::backlog() const {
|
||||
|
||||
@@ -60,6 +60,7 @@ private:
|
||||
exponential_backoff_retry compaction_retry = exponential_backoff_retry(std::chrono::seconds(5), std::chrono::seconds(300));
|
||||
bool stopping = false;
|
||||
bool cleanup = false;
|
||||
bool compaction_running = false;
|
||||
};
|
||||
|
||||
// compaction manager may have N fibers to allow parallel compaction per shard.
|
||||
@@ -134,7 +135,7 @@ private:
|
||||
inline bool maybe_stop_on_error(future<> f);
|
||||
|
||||
void postponed_compactions_reevaluation();
|
||||
void reevalute_postponed_compactions();
|
||||
void reevaluate_postponed_compactions();
|
||||
// Postpone compaction for a column family that couldn't be executed due to ongoing
|
||||
// similar-sized compaction.
|
||||
void postpone_compaction_for_column_family(column_family* cf);
|
||||
|
||||
@@ -47,7 +47,7 @@
|
||||
#include "compaction_strategy_impl.hh"
|
||||
#include "schema.hh"
|
||||
#include "sstable_set.hh"
|
||||
#include "compatible_ring_position.hh"
|
||||
#include "compatible_ring_position_view.hh"
|
||||
#include <boost/range/algorithm/find.hpp>
|
||||
#include <boost/range/adaptors.hpp>
|
||||
#include <boost/icl/interval_map.hpp>
|
||||
@@ -69,7 +69,7 @@ extern logging::logger clogger;
|
||||
class incremental_selector_impl {
|
||||
public:
|
||||
virtual ~incremental_selector_impl() {}
|
||||
virtual std::tuple<dht::token_range, std::vector<shared_sstable>, dht::ring_position> select(const dht::token& token) = 0;
|
||||
virtual std::tuple<dht::partition_range, std::vector<shared_sstable>, dht::ring_position_view> select(const dht::ring_position_view&) = 0;
|
||||
};
|
||||
|
||||
class sstable_set_impl {
|
||||
@@ -82,13 +82,15 @@ public:
|
||||
virtual std::unique_ptr<incremental_selector_impl> make_incremental_selector() const = 0;
|
||||
};
|
||||
|
||||
sstable_set::sstable_set(std::unique_ptr<sstable_set_impl> impl, lw_shared_ptr<sstable_list> all)
|
||||
sstable_set::sstable_set(std::unique_ptr<sstable_set_impl> impl, schema_ptr s, lw_shared_ptr<sstable_list> all)
|
||||
: _impl(std::move(impl))
|
||||
, _schema(std::move(s))
|
||||
, _all(std::move(all)) {
|
||||
}
|
||||
|
||||
sstable_set::sstable_set(const sstable_set& x)
|
||||
: _impl(x._impl->clone())
|
||||
, _schema(x._schema)
|
||||
, _all(make_lw_shared(sstable_list(*x._all))) {
|
||||
}
|
||||
|
||||
@@ -130,8 +132,9 @@ sstable_set::erase(shared_sstable sst) {
|
||||
|
||||
sstable_set::~sstable_set() = default;
|
||||
|
||||
sstable_set::incremental_selector::incremental_selector(std::unique_ptr<incremental_selector_impl> impl)
|
||||
: _impl(std::move(impl)) {
|
||||
sstable_set::incremental_selector::incremental_selector(std::unique_ptr<incremental_selector_impl> impl, const schema& s)
|
||||
: _impl(std::move(impl))
|
||||
, _cmp(s) {
|
||||
}
|
||||
|
||||
sstable_set::incremental_selector::~incremental_selector() = default;
|
||||
@@ -139,16 +142,17 @@ sstable_set::incremental_selector::~incremental_selector() = default;
|
||||
sstable_set::incremental_selector::incremental_selector(sstable_set::incremental_selector&&) noexcept = default;
|
||||
|
||||
sstable_set::incremental_selector::selection
|
||||
sstable_set::incremental_selector::select(const dht::token& t) const {
|
||||
if (!_current_token_range || !_current_token_range->contains(t, dht::token_comparator())) {
|
||||
std::tie(_current_token_range, _current_sstables, _current_next_position) = _impl->select(t);
|
||||
sstable_set::incremental_selector::select(const dht::ring_position_view& pos) const {
|
||||
if (!_current_range_view || !_current_range_view->contains(pos, _cmp)) {
|
||||
std::tie(_current_range, _current_sstables, _current_next_position) = _impl->select(pos);
|
||||
_current_range_view = _current_range->transform([] (const dht::ring_position& rp) { return dht::ring_position_view(rp); });
|
||||
}
|
||||
return {_current_sstables, _current_next_position};
|
||||
}
|
||||
|
||||
sstable_set::incremental_selector
|
||||
sstable_set::make_incremental_selector() const {
|
||||
return incremental_selector(_impl->make_incremental_selector());
|
||||
return incremental_selector(_impl->make_incremental_selector(), *_schema);
|
||||
}
|
||||
|
||||
// default sstable_set, not specialized for anything
|
||||
@@ -178,8 +182,8 @@ public:
|
||||
incremental_selector(const std::vector<shared_sstable>& sstables)
|
||||
: _sstables(sstables) {
|
||||
}
|
||||
virtual std::tuple<dht::token_range, std::vector<shared_sstable>, dht::ring_position> select(const dht::token& token) override {
|
||||
return std::make_tuple(dht::token_range::make_open_ended_both_sides(), _sstables, dht::ring_position::max());
|
||||
virtual std::tuple<dht::partition_range, std::vector<shared_sstable>, dht::ring_position_view> select(const dht::ring_position_view&) override {
|
||||
return std::make_tuple(dht::partition_range::make_open_ended_both_sides(), _sstables, dht::ring_position_view::max());
|
||||
}
|
||||
};
|
||||
|
||||
@@ -191,7 +195,7 @@ std::unique_ptr<incremental_selector_impl> bag_sstable_set::make_incremental_sel
|
||||
// e.g. leveled compaction strategy
|
||||
class partitioned_sstable_set : public sstable_set_impl {
|
||||
using value_set = std::unordered_set<shared_sstable>;
|
||||
using interval_map_type = boost::icl::interval_map<compatible_ring_position, value_set>;
|
||||
using interval_map_type = boost::icl::interval_map<compatible_ring_position_view, value_set>;
|
||||
using interval_type = interval_map_type::interval_type;
|
||||
using map_iterator = interval_map_type::const_iterator;
|
||||
private:
|
||||
@@ -201,14 +205,22 @@ private:
|
||||
private:
|
||||
static interval_type make_interval(const schema& s, const dht::partition_range& range) {
|
||||
return interval_type::closed(
|
||||
compatible_ring_position(s, range.start()->value()),
|
||||
compatible_ring_position(s, range.end()->value()));
|
||||
compatible_ring_position_view(s, range.start()->value()),
|
||||
compatible_ring_position_view(s, range.end()->value()));
|
||||
}
|
||||
interval_type make_interval(const dht::partition_range& range) const {
|
||||
return make_interval(*_schema, range);
|
||||
}
|
||||
static interval_type make_interval(const schema& s, const sstable& sst) {
|
||||
return interval_type::closed(
|
||||
compatible_ring_position_view(s, sst.get_first_decorated_key()),
|
||||
compatible_ring_position_view(s, sst.get_last_decorated_key()));
|
||||
}
|
||||
interval_type make_interval(const sstable& sst) const {
|
||||
return make_interval(*_schema, sst);
|
||||
}
|
||||
interval_type singular(const dht::ring_position& rp) const {
|
||||
auto crp = compatible_ring_position(*_schema, rp);
|
||||
auto crp = compatible_ring_position_view(*_schema, rp);
|
||||
return interval_type::closed(crp, crp);
|
||||
}
|
||||
std::pair<map_iterator, map_iterator> query(const dht::partition_range& range) const {
|
||||
@@ -226,6 +238,29 @@ private:
|
||||
}
|
||||
}
|
||||
public:
|
||||
static dht::ring_position to_ring_position(const compatible_ring_position_view& crpv) {
|
||||
// Ring position views, representing bounds of sstable intervals are
|
||||
// guaranteed to have key() != nullptr;
|
||||
const auto& pos = crpv.position();
|
||||
return dht::ring_position(pos.token(), *pos.key());
|
||||
}
|
||||
static dht::partition_range to_partition_range(const interval_type& i) {
|
||||
return dht::partition_range::make(
|
||||
{to_ring_position(i.lower()), boost::icl::is_left_closed(i.bounds())},
|
||||
{to_ring_position(i.upper()), boost::icl::is_right_closed(i.bounds())});
|
||||
}
|
||||
static dht::partition_range to_partition_range(const dht::ring_position_view& pos, const interval_type& i) {
|
||||
auto lower_bound = [&] {
|
||||
if (pos.key()) {
|
||||
return dht::partition_range::bound(dht::ring_position(pos.token(), *pos.key()),
|
||||
pos.is_after_key() == dht::ring_position_view::after_key::no);
|
||||
} else {
|
||||
return dht::partition_range::bound(dht::ring_position(pos.token(), pos.get_token_bound()), true);
|
||||
}
|
||||
}();
|
||||
auto upper_bound = dht::partition_range::bound(to_ring_position(i.lower()), !boost::icl::is_left_closed(i.bounds()));
|
||||
return dht::partition_range::make(std::move(lower_bound), std::move(upper_bound));
|
||||
}
|
||||
explicit partitioned_sstable_set(schema_ptr schema)
|
||||
: _schema(std::move(schema)) {
|
||||
}
|
||||
@@ -248,30 +283,14 @@ public:
|
||||
if (sst->get_sstable_level() == 0) {
|
||||
_unleveled_sstables.push_back(std::move(sst));
|
||||
} else {
|
||||
auto first = sst->get_first_decorated_key().token();
|
||||
auto last = sst->get_last_decorated_key().token();
|
||||
using bound = dht::partition_range::bound;
|
||||
_leveled_sstables.add({
|
||||
make_interval(
|
||||
dht::partition_range(
|
||||
bound(dht::ring_position::starting_at(first)),
|
||||
bound(dht::ring_position::ending_at(last)))),
|
||||
value_set({sst})});
|
||||
_leveled_sstables.add({make_interval(*sst), value_set({sst})});
|
||||
}
|
||||
}
|
||||
virtual void erase(shared_sstable sst) override {
|
||||
if (sst->get_sstable_level() == 0) {
|
||||
_unleveled_sstables.erase(std::remove(_unleveled_sstables.begin(), _unleveled_sstables.end(), sst), _unleveled_sstables.end());
|
||||
} else {
|
||||
auto first = sst->get_first_decorated_key().token();
|
||||
auto last = sst->get_last_decorated_key().token();
|
||||
using bound = dht::partition_range::bound;
|
||||
_leveled_sstables.subtract({
|
||||
make_interval(
|
||||
dht::partition_range(
|
||||
bound(dht::ring_position::starting_at(first)),
|
||||
bound(dht::ring_position::ending_at(last)))),
|
||||
value_set({sst})});
|
||||
_leveled_sstables.subtract({make_interval(*sst), value_set({sst})});
|
||||
}
|
||||
}
|
||||
virtual std::unique_ptr<incremental_selector_impl> make_incremental_selector() const override;
|
||||
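partitioned_sstable_set keeps leveled sstables in a boost::icl::interval_map keyed by each sstable's [first, last] decorated keys, so lookups become interval queries and overlapping sstables aggregate into one value set. A hedged, self-contained sketch of the same idea with integer keys standing in for ring positions (not Scylla's compatible_ring_position_view):

#include <boost/icl/interval_map.hpp>
#include <iostream>
#include <set>
#include <string>

int main() {
    using value_set = std::set<std::string>;
    using interval_map_type = boost::icl::interval_map<int, value_set>;
    using interval_type = interval_map_type::interval_type;

    interval_map_type sstables_by_range;

    // insert(): add the sstable under the closed [first_key, last_key] interval
    sstables_by_range.add({interval_type::closed(0, 50), value_set{"sst-1"}});
    sstables_by_range.add({interval_type::closed(40, 90), value_set{"sst-2"}});

    // select(): every interval containing key 45 contributes its sstables
    for (const auto& [interval, ssts] : sstables_by_range) {
        if (boost::icl::contains(interval, 45)) {
            for (const auto& s : ssts) {
                std::cout << s << " overlaps key 45\n";
            }
        }
    }

    // erase(): subtract the same interval/value pair that was added
    sstables_by_range.subtract({interval_type::closed(0, 50), value_set{"sst-1"}});
    std::cout << "intervals left: " << boost::icl::interval_count(sstables_by_range) << "\n";
}
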
@@ -283,53 +302,50 @@ class partitioned_sstable_set::incremental_selector : public incremental_selecto
|
||||
const std::vector<shared_sstable>& _unleveled_sstables;
|
||||
map_iterator _it;
|
||||
const map_iterator _end;
|
||||
// Only to back the dht::ring_position_view returned from select().
|
||||
dht::ring_position _next_position;
|
||||
private:
|
||||
static dht::token_range to_token_range(const interval_type& i) {
|
||||
return dht::token_range::make({i.lower().token(), boost::icl::is_left_closed(i.bounds())},
|
||||
{i.upper().token(), boost::icl::is_right_closed(i.bounds())});
|
||||
dht::ring_position_view next_position(map_iterator it) {
|
||||
if (it == _end) {
|
||||
_next_position = dht::ring_position::max();
|
||||
return dht::ring_position_view::max();
|
||||
} else {
|
||||
_next_position = partitioned_sstable_set::to_ring_position(it->first.lower());
|
||||
return dht::ring_position_view(_next_position, dht::ring_position_view::after_key(!boost::icl::is_left_closed(it->first.bounds())));
|
||||
}
|
||||
}
|
||||
static bool is_before_interval(const compatible_ring_position_view& crpv, const interval_type& interval) {
|
||||
if (boost::icl::is_left_closed(interval.bounds())) {
|
||||
return crpv < interval.lower();
|
||||
} else {
|
||||
return crpv <= interval.lower();
|
||||
}
|
||||
}
|
||||
public:
|
||||
incremental_selector(schema_ptr schema, const std::vector<shared_sstable>& unleveled_sstables, const interval_map_type& leveled_sstables)
|
||||
: _schema(std::move(schema))
|
||||
, _unleveled_sstables(unleveled_sstables)
|
||||
, _it(leveled_sstables.begin())
|
||||
, _end(leveled_sstables.end()) {
|
||||
, _end(leveled_sstables.end())
|
||||
, _next_position(dht::ring_position::min()) {
|
||||
}
|
||||
virtual std::tuple<dht::token_range, std::vector<shared_sstable>, dht::ring_position> select(const dht::token& token) override {
|
||||
auto pr = dht::partition_range::make(dht::ring_position::starting_at(token), dht::ring_position::ending_at(token));
|
||||
auto interval = make_interval(*_schema, std::move(pr));
|
||||
virtual std::tuple<dht::partition_range, std::vector<shared_sstable>, dht::ring_position_view> select(const dht::ring_position_view& pos) override {
|
||||
auto crpv = compatible_ring_position_view(*_schema, pos);
|
||||
auto ssts = _unleveled_sstables;
|
||||
using namespace dht;
|
||||
|
||||
auto inclusiveness = [] (auto& interval) {
|
||||
return boost::icl::is_left_closed(interval.bounds()) ? ring_position::token_bound::start : ring_position::token_bound::end;
|
||||
};
|
||||
|
||||
const auto next_pos = [&] {
|
||||
const auto next = std::next(_it);
|
||||
auto& interval = next->first;
|
||||
return next == _end ? ring_position::max() : ring_position(interval.lower().token(), inclusiveness(interval));
|
||||
};
|
||||
|
||||
const auto current_pos = [&] {
|
||||
auto& interval = _it->first;
|
||||
return _it == _end ? ring_position::max() : ring_position(interval.lower().token(), inclusiveness(interval));
|
||||
};
|
||||
|
||||
while (_it != _end) {
|
||||
if (boost::icl::contains(_it->first, interval)) {
|
||||
if (boost::icl::contains(_it->first, crpv)) {
|
||||
ssts.insert(ssts.end(), _it->second.begin(), _it->second.end());
|
||||
return std::make_tuple(to_token_range(_it->first), std::move(ssts), next_pos());
|
||||
return std::make_tuple(partitioned_sstable_set::to_partition_range(_it->first), std::move(ssts), next_position(std::next(_it)));
|
||||
}
|
||||
// we don't want to skip current interval if token lies before it.
|
||||
if (boost::icl::lower_less(interval, _it->first)) {
|
||||
return std::make_tuple(dht::token_range::make({token, true}, {_it->first.lower().token(), false}),
|
||||
std::move(ssts),
|
||||
current_pos());
|
||||
// We don't want to skip current interval if pos lies before it.
|
||||
if (is_before_interval(crpv, _it->first)) {
|
||||
return std::make_tuple(partitioned_sstable_set::to_partition_range(pos, _it->first), std::move(ssts), next_position(_it));
|
||||
}
|
||||
_it++;
|
||||
}
|
||||
return std::make_tuple(dht::token_range::make_open_ended_both_sides(), std::move(ssts), ring_position::max());
|
||||
return std::make_tuple(partition_range::make_open_ended_both_sides(), std::move(ssts), ring_position_view::max());
|
||||
}
|
||||
};
|
||||
|
||||
@@ -669,7 +685,8 @@ bool compaction_strategy::use_clustering_key_filter() const {
|
||||
sstable_set
|
||||
compaction_strategy::make_sstable_set(schema_ptr schema) const {
|
||||
return sstable_set(
|
||||
_compaction_strategy_impl->make_sstable_set(std::move(schema)),
|
||||
_compaction_strategy_impl->make_sstable_set(schema),
|
||||
schema,
|
||||
make_lw_shared<sstable_list>());
|
||||
}
|
||||
|
||||
|
||||
@@ -33,6 +33,7 @@
|
||||
#include "unimplemented.hh"
|
||||
#include "stdx.hh"
|
||||
#include "segmented_compress_params.hh"
|
||||
#include "utils/class_registrator.hh"
|
||||
|
||||
namespace sstables {
|
||||
|
||||
@@ -299,7 +300,8 @@ size_t local_compression::compress_max_size(size_t input_len) const {
|
||||
|
||||
void compression::set_compressor(compressor_ptr c) {
|
||||
if (c) {
|
||||
auto& cn = c->name();
|
||||
unqualified_name uqn(compressor::namespace_prefix, c->name());
|
||||
const sstring& cn = uqn;
|
||||
name.value = bytes(cn.begin(), cn.end());
|
||||
for (auto& p : c->options()) {
|
||||
if (p.first != compression_parameters::SSTABLE_COMPRESSION) {
|
||||
|
||||
@@ -27,6 +27,44 @@
|
||||
|
||||
namespace sstables {
|
||||
|
||||
class promoted_index_block {
|
||||
public:
|
||||
promoted_index_block(temporary_buffer<char>&& start, temporary_buffer<char>&& end,
|
||||
uint64_t offset, uint64_t width)
|
||||
: _start(std::move(start)), _end(std::move(end))
|
||||
, _offset(offset), _width(width)
|
||||
{}
|
||||
promoted_index_block(const promoted_index_block& rhs)
|
||||
: _start(rhs._start.get(), rhs._start.size()), _end(rhs._end.get(), rhs._end.size())
|
||||
, _offset(rhs._offset), _width(rhs._width)
|
||||
{}
|
||||
promoted_index_block(promoted_index_block&&) noexcept = default;
|
||||
|
||||
promoted_index_block& operator=(const promoted_index_block& rhs) {
|
||||
if (this != &rhs) {
|
||||
_start = temporary_buffer<char>(rhs._start.get(), rhs._start.size());
|
||||
_end = temporary_buffer<char>(rhs._end.get(), rhs._end.size());
|
||||
_offset = rhs._offset;
|
||||
_width = rhs._width;
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
promoted_index_block& operator=(promoted_index_block&&) noexcept = default;
|
||||
|
||||
composite_view start(const schema& s) const { return composite_view(to_bytes_view(_start), s.is_compound());}
|
||||
composite_view end(const schema& s) const { return composite_view(to_bytes_view(_end), s.is_compound());}
|
||||
uint64_t offset() const { return _offset; }
|
||||
uint64_t width() const { return _width; }
|
||||
|
||||
private:
|
||||
temporary_buffer<char> _start;
|
||||
temporary_buffer<char> _end;
|
||||
uint64_t _offset;
|
||||
uint64_t _width;
|
||||
};
|
||||
|
||||
using promoted_index_blocks = seastar::circular_buffer<promoted_index_block>;
|
||||
|
||||
inline void erase_all_but_last(promoted_index_blocks& pi_blocks) {
|
||||
while (pi_blocks.size() > 1) {
|
||||
pi_blocks.pop_front();
|
||||
@@ -55,7 +93,7 @@ private:
|
||||
const schema& _s;
|
||||
consuming_mode _mode = consuming_mode::consume_next;
|
||||
size_t _current_pi_idx = 0; // for consume_until mode
|
||||
stdx::optional<position_in_partition_view> _pos; // for consume_until mode
|
||||
std::optional<position_in_partition_view> _pos; // for consume_until mode
|
||||
|
||||
enum class state {
|
||||
START_NAME_LENGTH,
|
||||
@@ -173,15 +211,40 @@ public:
|
||||
{}
|
||||
};
|
||||
|
||||
class promoted_index {
|
||||
deletion_time _del_time;
|
||||
uint32_t _promoted_index_size;
|
||||
promoted_index_blocks_reader _reader;
|
||||
bool _reader_closed = false;
|
||||
|
||||
public:
|
||||
promoted_index(const schema& s, deletion_time del_time, input_stream<char>&& promoted_index_stream,
|
||||
uint32_t promoted_index_size, uint32_t blocks_count)
|
||||
: _del_time{del_time}
|
||||
, _promoted_index_size(promoted_index_size)
|
||||
, _reader{std::move(promoted_index_stream), blocks_count, s, 0, promoted_index_size}
|
||||
{}
|
||||
|
||||
[[nodiscard]] deletion_time get_deletion_time() const { return _del_time; }
|
||||
[[nodiscard]] uint32_t get_promoted_index_size() const { return _promoted_index_size; }
|
||||
[[nodiscard]] promoted_index_blocks_reader& get_reader() { return _reader; };
|
||||
[[nodiscard]] const promoted_index_blocks_reader& get_reader() const { return _reader; };
|
||||
future<> close_reader() {
|
||||
if (!_reader_closed) {
|
||||
_reader_closed = true;
|
||||
return _reader.close();
|
||||
}
|
||||
|
||||
return make_ready_future<>();
|
||||
}
|
||||
};
|
||||
|
||||
class index_entry {
|
||||
private:
|
||||
temporary_buffer<char> _key;
|
||||
mutable stdx::optional<dht::token> _token;
|
||||
mutable std::optional<dht::token> _token;
|
||||
uint64_t _position;
|
||||
stdx::optional<promoted_index_blocks_reader> _reader;
|
||||
bool _reader_closed = false;
|
||||
uint32_t _promoted_index_size;
|
||||
stdx::optional<deletion_time> _del_time;
|
||||
std::optional<promoted_index> _index;
|
||||
|
||||
public:
|
||||
|
||||
@@ -202,21 +265,21 @@ public:
|
||||
|
||||
uint64_t position() const { return _position; };
|
||||
|
||||
stdx::optional<deletion_time> get_deletion_time() const { return _del_time; }
|
||||
uint32_t get_promoted_index_size() const { return _promoted_index_size; }
|
||||
std::optional<deletion_time> get_deletion_time() const {
|
||||
if (_index) {
|
||||
return _index->get_deletion_time();
|
||||
}
|
||||
|
||||
index_entry(temporary_buffer<char>&& key, uint64_t position,
|
||||
stdx::optional<input_stream<char>>&& promoted_index_stream, uint32_t promoted_index_size,
|
||||
stdx::optional<deletion_time>&& del_time, uint32_t num_pi_blocks, const schema& s)
|
||||
return {};
|
||||
}
|
||||
|
||||
uint32_t get_promoted_index_size() const { return _index ? _index->get_promoted_index_size() : 0; }
|
||||
|
||||
index_entry(temporary_buffer<char>&& key, uint64_t position, std::optional<promoted_index>&& index)
|
||||
: _key(std::move(key))
|
||||
, _position(position)
|
||||
, _promoted_index_size(promoted_index_size)
|
||||
, _del_time(std::move(del_time))
|
||||
{
|
||||
if (promoted_index_stream) {
|
||||
_reader.emplace(std::move(*promoted_index_stream), num_pi_blocks, s, 0, _promoted_index_size);
|
||||
}
|
||||
}
|
||||
, _index(std::move(index))
|
||||
{}
|
||||
|
||||
index_entry(index_entry&&) = default;
|
||||
index_entry& operator=(index_entry&&) = default;
|
||||
@@ -225,39 +288,46 @@ public:
|
||||
// for a given position.
|
||||
// Returns the index of the element right before the upper bound one.
|
||||
future<size_t> get_pi_blocks_until(position_in_partition_view pos) {
|
||||
if (!_reader) {
|
||||
if (!_index) {
|
||||
return make_ready_future<size_t>(0);
|
||||
}
|
||||
|
||||
_reader->switch_to_consume_until_mode(pos);
|
||||
promoted_index_blocks& blocks = _reader->get_pi_blocks();
|
||||
auto& reader = _index->get_reader();
|
||||
reader.switch_to_consume_until_mode(pos);
|
||||
promoted_index_blocks& blocks = reader.get_pi_blocks();
|
||||
if (!blocks.empty()) {
|
||||
erase_all_but_last(blocks);
|
||||
}
|
||||
return _reader->consume_input().then([this] {
|
||||
return make_ready_future<size_t>(_reader->get_current_pi_index());
|
||||
return reader.consume_input().then([this, &reader] {
|
||||
return reader.get_current_pi_index();
|
||||
});
|
||||
}
|
||||
|
||||
// Unconditionally reads the promoted index blocks from the next data buffer
|
||||
future<> get_next_pi_blocks() {
|
||||
if (!_reader) {
|
||||
if (!_index) {
|
||||
return make_ready_future<>();
|
||||
}
|
||||
|
||||
promoted_index_blocks& blocks = _reader->get_pi_blocks();
|
||||
auto& reader = _index->get_reader();
|
||||
promoted_index_blocks& blocks = reader.get_pi_blocks();
|
||||
blocks = promoted_index_blocks{};
|
||||
_reader->switch_to_consume_next_mode();
|
||||
return _reader->consume_input();
|
||||
reader.switch_to_consume_next_mode();
|
||||
return reader.consume_input();
|
||||
}
|
||||
|
||||
uint32_t get_total_pi_blocks_count() const { return _reader ? _reader->get_total_num_blocks() : 0; }
|
||||
uint32_t get_read_pi_blocks_count() const { return _reader ? _reader->get_read_num_blocks() : 0; }
|
||||
promoted_index_blocks* get_pi_blocks() { return _reader ? &_reader->get_pi_blocks() : nullptr; }
|
||||
[[nodiscard]] uint32_t get_total_pi_blocks_count() const {
|
||||
return _index ? _index->get_reader().get_total_num_blocks() : 0;
|
||||
}
|
||||
[[nodiscard]] uint32_t get_read_pi_blocks_count() const {
|
||||
return _index ? _index->get_reader().get_read_num_blocks() : 0;
|
||||
}
|
||||
[[nodiscard]] promoted_index_blocks* get_pi_blocks() {
|
||||
return _index ? &_index->get_reader().get_pi_blocks() : nullptr;
|
||||
}
|
||||
future<> close_pi_stream() {
|
||||
if (_reader && !_reader_closed) {
|
||||
_reader_closed = true;
|
||||
return _reader->close();
|
||||
if (_index) {
|
||||
return _index->close_reader();
|
||||
}
|
||||
|
||||
return make_ready_future<>();
|
||||
|
||||
@@ -81,7 +81,7 @@ private:
|
||||
temporary_buffer<char> _key;
|
||||
uint32_t _promoted_index_size;
|
||||
uint64_t _position;
|
||||
stdx::optional<deletion_time> _deletion_time;
|
||||
std::optional<deletion_time> _deletion_time;
|
||||
uint32_t _num_pi_blocks = 0;
|
||||
|
||||
trust_promoted_index _trust_pi;
|
||||
@@ -156,7 +156,7 @@ public:
|
||||
_promoted_index_size -= 16;
|
||||
}
|
||||
auto data_size = data.size();
|
||||
stdx::optional<input_stream<char>> promoted_index_stream;
|
||||
std::optional<input_stream<char>> promoted_index_stream;
|
||||
if ((_trust_pi == trust_promoted_index::yes) && (_promoted_index_size > 0)) {
|
||||
if (_promoted_index_size <= data_size) {
|
||||
auto buf = data.share();
|
||||
@@ -171,10 +171,13 @@ public:
|
||||
} else {
|
||||
_num_pi_blocks = 0;
|
||||
}
|
||||
_consumer.consume_entry(index_entry{std::move(_key), _position, std::move(promoted_index_stream),
|
||||
_promoted_index_size, std::move(_deletion_time), _num_pi_blocks, _s}, _entry_offset);
|
||||
std::optional<promoted_index> index;
|
||||
if (promoted_index_stream) {
|
||||
index.emplace(_s, *_deletion_time, std::move(*promoted_index_stream), _promoted_index_size, _num_pi_blocks);
|
||||
}
|
||||
_consumer.consume_entry(index_entry{std::move(_key), _position, std::move(index)}, _entry_offset);
|
||||
_entry_offset += len;
|
||||
_deletion_time = stdx::nullopt;
|
||||
_deletion_time = std::nullopt;
|
||||
_num_pi_blocks = 0;
|
||||
_state = state::START;
|
||||
if (_promoted_index_size <= data_size) {
|
||||
@@ -245,7 +248,8 @@ future<> close_index_list(shared_index_lists::list_ptr& list) {
|
||||
// Maintains logical cursors to sstable elements (partitions, cells).
|
||||
// Holds two cursors pointing to the range within sstable (upper cursor may be not set).
|
||||
// Initially the lower cursor is positioned on the first partition in the sstable.
|
||||
// Cursors can be advanced forward using advance_to().
|
||||
// Lower cursor can be accessed and advanced from outside.
|
||||
// Upper cursor can only be advanced along with the lower cursor and not accessed from outside.
|
||||
//
|
||||
// If eof() then the lower bound cursor is positioned past all partitions in the sstable.
|
||||
class index_reader {
|
||||
@@ -478,6 +482,63 @@ private:
|
||||
});
|
||||
}
|
||||
|
||||
// Forwards the upper bound cursor to a position which is greater than given position in current partition.
|
||||
//
|
||||
// Note that the index within partition, unlike the partition index, doesn't cover all keys.
|
||||
// So this may not forward to the smallest position which is greater than pos.
|
||||
//
|
||||
// May advance to the next partition if it's not possible to find a suitable position inside
|
||||
// current partition.
|
||||
//
|
||||
// Must be called only when !eof().
|
||||
future<> advance_upper_past(position_in_partition_view pos) {
|
||||
sstlog.trace("index {}: advance_upper_past({})", this, pos);
|
||||
|
||||
// We advance cursor within the current lower bound partition
|
||||
// So need to make sure first that it is read
|
||||
if (!partition_data_ready(_lower_bound)) {
|
||||
return read_partition_data().then([this, pos] {
|
||||
assert(partition_data_ready());
|
||||
return advance_upper_past(pos);
|
||||
});
|
||||
}
|
||||
|
||||
if (!_upper_bound) {
|
||||
_upper_bound = _lower_bound;
|
||||
}
|
||||
|
||||
index_entry& e = current_partition_entry(*_upper_bound);
|
||||
if (e.get_total_pi_blocks_count() == 0) {
|
||||
sstlog.trace("index {}: no promoted index", this);
|
||||
return advance_to_next_partition(*_upper_bound);
|
||||
}
|
||||
|
||||
if (e.get_read_pi_blocks_count() == 0) {
|
||||
return e.get_next_pi_blocks().then([this, pos] {
|
||||
return advance_upper_past(pos);
|
||||
});
|
||||
}
|
||||
|
||||
const schema& s = *_sstable->_schema;
|
||||
auto cmp_with_start = [pos_cmp = position_in_partition::composite_less_compare(s), s]
|
||||
(position_in_partition_view pos, const promoted_index_block& info) -> bool {
|
||||
return pos_cmp(pos, info.start(s));
|
||||
};
|
||||
promoted_index_blocks* pi_blocks = e.get_pi_blocks();
|
||||
assert(pi_blocks);
|
||||
auto i = std::upper_bound(pi_blocks->begin() + _upper_bound->current_pi_idx, pi_blocks->end(), pos, cmp_with_start);
|
||||
_upper_bound->current_pi_idx = std::distance(pi_blocks->begin(), i);
|
||||
if (i == pi_blocks->end()) {
|
||||
return advance_to_next_partition(*_upper_bound);
|
||||
}
|
||||
|
||||
_upper_bound->data_file_position = e.position() + i->offset();
|
||||
_upper_bound->element = indexable_element::cell;
|
||||
sstlog.trace("index {} upper bound: skipped to cell, _current_pi_idx={}, _data_file_position={}",
|
||||
this, _upper_bound->current_pi_idx, _upper_bound->data_file_position);
|
||||
return make_ready_future<>();
|
||||
}
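advance_upper_past() binary-searches the promoted index blocks already parsed for the first one whose start lies after pos, and falls through to the next partition when there is none. A minimal sketch of that search over integer start offsets (a hypothetical block type, not the sstables one):

#include <algorithm>
#include <iostream>
#include <vector>

struct pi_block {
    int start;    // first clustering position covered by the block
    long offset;  // offset of the block inside the partition in the data file
};

int main() {
    std::vector<pi_block> blocks = {{0, 0}, {100, 4096}, {200, 8192}};
    int pos = 120;

    // First block whose start is strictly greater than pos, like cmp_with_start above.
    auto it = std::upper_bound(blocks.begin(), blocks.end(), pos,
                               [] (int p, const pi_block& b) { return p < b.start; });

    if (it == blocks.end()) {
        std::cout << "no block past pos; advance to next partition\n";
    } else {
        std::cout << "skip data file to offset " << it->offset
                  << " (block index " << (it - blocks.begin()) << ")\n";
    }
}
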
|
||||
|
||||
// Returns position right after all partitions in the sstable
|
||||
uint64_t data_file_end() const {
|
||||
return _sstable->data_size();
|
||||
@@ -491,9 +552,9 @@ public:
|
||||
sstlog.trace("index {}: index_reader for {}", this, _sstable->get_filename());
|
||||
}
|
||||
|
||||
// Ensures that lower_partition_data_ready() returns true.
|
||||
// Can be called only when !eof(_lower_bound)
|
||||
future<> read_lower_partition_data() {
|
||||
// Ensures that partition_data_ready() returns true.
|
||||
// Can be called only when !eof()
|
||||
future<> read_partition_data() {
|
||||
assert(!eof());
|
||||
if (partition_data_ready(_lower_bound)) {
|
||||
return make_ready_future<>();
|
||||
@@ -510,30 +571,31 @@ public:
|
||||
advance_upper_to_end(range));
|
||||
}
|
||||
|
||||
index_entry& current_lower_partition_entry() {
|
||||
// Get current index entry
|
||||
index_entry& current_partition_entry() {
|
||||
return current_partition_entry(_lower_bound);
|
||||
}
|
||||
|
||||
// Returns tombstone for the current lower partition if it was recorded in the sstable.
|
||||
// Returns tombstone for the current partition if it was recorded in the sstable.
|
||||
// It may be unavailable for old sstables for which this information was not generated.
|
||||
// Can be called only when lower_partition_data_ready().
|
||||
stdx::optional<sstables::deletion_time> lower_partition_tombstone() {
|
||||
// Can be called only when partition_data_ready().
|
||||
std::optional<sstables::deletion_time> partition_tombstone() {
|
||||
return current_partition_entry(_lower_bound).get_deletion_time();
|
||||
}
|
||||
|
||||
// Returns the key for current lower partition.
|
||||
// Can be called only when lower_partition_data_ready().
|
||||
// Returns the key for current partition.
|
||||
// Can be called only when partition_data_ready().
|
||||
// The result is valid as long as index_reader is valid.
|
||||
key_view lower_partition_key() {
|
||||
key_view partition_key() {
|
||||
index_entry& e = current_partition_entry(_lower_bound);
|
||||
return e.get_key();
|
||||
}
|
||||
|
||||
bool lower_partition_data_ready() const {
|
||||
bool partition_data_ready() const {
|
||||
return partition_data_ready(_lower_bound);
|
||||
}
|
||||
|
||||
// Forwards the lower bound cursor to given position in current partition.
|
||||
// Forwards the cursor to the given position in the current partition.
|
||||
//
|
||||
// Note that the index within partition, unlike the partition index, doesn't cover all keys.
|
||||
// So this may forward the cursor to some position pos' which precedes pos, even though
|
||||
@@ -541,19 +603,19 @@ public:
|
||||
//
|
||||
// Must be called for non-decreasing positions.
|
||||
// Must be called only after advanced to some partition and !eof().
|
||||
future<> advance_lower_to(position_in_partition_view pos) {
|
||||
sstlog.trace("index {}: advance_lower_to({}), current data_file_pos={}",
|
||||
future<> advance_to(position_in_partition_view pos) {
|
||||
sstlog.trace("index {}: advance_to({}), current data_file_pos={}",
|
||||
this, pos, _lower_bound.data_file_position);
|
||||
|
||||
if (!lower_partition_data_ready()) {
|
||||
return read_lower_partition_data().then([this, pos] {
|
||||
if (!partition_data_ready()) {
|
||||
return read_partition_data().then([this, pos] {
|
||||
sstlog.trace("index {}: page done", this);
|
||||
assert(partition_data_ready(_lower_bound));
|
||||
return advance_lower_to(pos);
|
||||
return advance_to(pos);
|
||||
});
|
||||
}
|
||||
|
||||
index_entry& e = current_lower_partition_entry();
|
||||
index_entry& e = current_partition_entry();
|
||||
if (e.get_total_pi_blocks_count() == 0) {
|
||||
sstlog.trace("index {}: no promoted index", this);
|
||||
return make_ready_future<>();
|
||||
@@ -608,85 +670,37 @@ public:
|
||||
});
|
||||
}
|
||||
|
||||
// Forwards the upper bound cursor to a position which is greater than given position in current partition.
|
||||
//
|
||||
// Note that the index within partition, unlike the partition index, doesn't cover all keys.
|
||||
// So this may not forward to the smallest position which is greater than pos.
|
||||
//
|
||||
// May advance to the next partition if it's not possible to find a suitable position inside
|
||||
// current partition.
|
||||
//
|
||||
// Must be called only when !eof().
|
||||
future<> advance_upper_past(position_in_partition_view pos) {
|
||||
sstlog.trace("index {}: advance_upper_past({})", this, pos);
|
||||
|
||||
// We advance cursor within the current lower bound partition
|
||||
// So need to make sure first that it is read
|
||||
if (!partition_data_ready(_lower_bound)) {
|
||||
return read_lower_partition_data().then([this, pos] {
|
||||
assert(lower_partition_data_ready());
|
||||
return advance_upper_past(pos);
|
||||
});
|
||||
}
|
||||
|
||||
if (!_upper_bound) {
|
||||
_upper_bound = _lower_bound;
|
||||
}
|
||||
|
||||
index_entry& e = current_partition_entry(*_upper_bound);
|
||||
if (e.get_total_pi_blocks_count() == 0) {
|
||||
sstlog.trace("index {}: no promoted index", this);
|
||||
return advance_to_next_partition(*_upper_bound);
|
||||
}
|
||||
|
||||
if (e.get_read_pi_blocks_count() == 0) {
|
||||
return e.get_next_pi_blocks().then([this, pos] {
|
||||
return advance_upper_past(pos);
|
||||
});
|
||||
}
|
||||
|
||||
const schema& s = *_sstable->_schema;
|
||||
auto cmp_with_start = [pos_cmp = position_in_partition::composite_less_compare(s), s]
|
||||
(position_in_partition_view pos, const promoted_index_block& info) -> bool {
|
||||
return pos_cmp(pos, info.start(s));
|
||||
};
|
||||
promoted_index_blocks* pi_blocks = e.get_pi_blocks();
|
||||
assert(pi_blocks);
|
||||
auto i = std::upper_bound(pi_blocks->begin() + _upper_bound->current_pi_idx, pi_blocks->end(), pos, cmp_with_start);
|
||||
_upper_bound->current_pi_idx = std::distance(pi_blocks->begin(), i);
|
||||
if (i == pi_blocks->end()) {
|
||||
return advance_to_next_partition(*_upper_bound);
|
||||
}
|
||||
|
||||
_upper_bound->data_file_position = e.position() + i->offset();
|
||||
_upper_bound->element = indexable_element::cell;
|
||||
sstlog.trace("index {} upper bound: skipped to cell, _current_pi_idx={}, _data_file_position={}",
|
||||
this, _upper_bound->current_pi_idx, _upper_bound->data_file_position);
|
||||
return make_ready_future<>();
|
||||
}
|
||||
|
||||
// Like advance_to(dht::ring_position_view), but returns information whether the key was found
|
||||
future<bool> advance_lower_and_check_if_present(dht::ring_position_view key) {
|
||||
return advance_to(_lower_bound, key).then([this, key] {
|
||||
// If upper_bound is provided, the upper bound within position is looked up
|
||||
future<bool> advance_lower_and_check_if_present(
|
||||
dht::ring_position_view key, std::optional<position_in_partition_view> pos = {}) {
|
||||
return advance_to(_lower_bound, key).then([this, key, pos] {
|
||||
if (eof()) {
|
||||
return make_ready_future<bool>(false);
|
||||
}
|
||||
return read_lower_partition_data().then([this, key] {
|
||||
return read_partition_data().then([this, key, pos] {
|
||||
index_comparator cmp(*_sstable->_schema);
|
||||
return cmp(key, current_partition_entry(_lower_bound)) == 0;
|
||||
bool found = cmp(key, current_partition_entry(_lower_bound)) == 0;
|
||||
if (!found || !pos) {
|
||||
return make_ready_future<bool>(found);
|
||||
}
|
||||
|
||||
return advance_upper_past(*pos).then([] {
|
||||
return make_ready_future<bool>(true);
|
||||
});
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
// Moves the lower bound cursor to the beginning of next partition.
|
||||
// Moves the cursor to the beginning of next partition.
|
||||
// Can be called only when !eof().
|
||||
future<> advance_lower_to_next_partition() {
|
||||
future<> advance_to_next_partition() {
|
||||
return advance_to_next_partition(_lower_bound);
|
||||
}
|
||||
|
||||
// Positions the lower bound cursor on the first partition which is not smaller than pos (like std::lower_bound).
|
||||
// Positions the cursor on the first partition which is not smaller than pos (like std::lower_bound).
|
||||
// Must be called for non-decreasing positions.
|
||||
future<> advance_lower_to(dht::ring_position_view pos) {
|
||||
future<> advance_to(dht::ring_position_view pos) {
|
||||
return advance_to(_lower_bound, pos);
|
||||
}
|
||||
|
||||
@@ -696,7 +710,7 @@ public:
|
||||
};
|
||||
|
||||
// Returns positions in the data file of the cursor.
|
||||
// End position may not be set
|
||||
// End position may be unset
|
||||
data_file_positions_range data_file_positions() const {
|
||||
data_file_positions_range result;
|
||||
result.start = _lower_bound.data_file_position;
|
||||
@@ -706,8 +720,8 @@ public:
|
||||
return result;
|
||||
}
|
||||
|
||||
// Returns the kind of sstable element the lower bound cursor is pointing at.
|
||||
indexable_element lower_element_kind() const {
|
||||
// Returns the kind of sstable element the cursor is pointing at.
|
||||
indexable_element element_kind() const {
|
||||
return _lower_bound.element;
|
||||
}
|
||||
|
||||
|
||||
@@ -64,15 +64,12 @@ template
|
||||
data_consume_context<data_consume_rows_context_m>
|
||||
data_consume_rows<data_consume_rows_context_m>(const schema& s, shared_sstable, data_consume_rows_context_m::consumer&);
|
||||
|
||||
|
||||
static
|
||||
future<> advance_to_upper_bound(index_reader& ix, const schema& s, const query::partition_slice& slice, dht::ring_position_view key) {
|
||||
auto& ranges = slice.row_ranges(s, *key.key());
|
||||
if (ranges.empty()) {
|
||||
return ix.advance_upper_past(position_in_partition_view::for_static_row());
|
||||
} else {
|
||||
return ix.advance_upper_past(position_in_partition_view::for_range_end(ranges[ranges.size() - 1]));
|
||||
}
|
||||
position_in_partition_view get_slice_upper_bound(const schema& s, const query::partition_slice& slice, dht::ring_position_view key) {
|
||||
const auto& ranges = slice.row_ranges(s, *key.key());
|
||||
return ranges.empty()
|
||||
? position_in_partition_view::for_static_row()
|
||||
: position_in_partition_view::for_range_end(ranges.back());
|
||||
}
|
||||
|
||||
GCC6_CONCEPT(
|
||||
@@ -170,7 +167,8 @@ public:
|
||||
, _consumer(this, _schema, slice, pc, std::move(resource_tracker), fwd, _sst)
|
||||
, _single_partition_read(true)
|
||||
, _initialize([this, key = std::move(key), &pc, &slice, fwd_mr] () mutable {
|
||||
auto f = get_index_reader().advance_lower_and_check_if_present(key);
|
||||
position_in_partition_view pos = get_slice_upper_bound(*_schema, slice, key);
|
||||
auto f = get_index_reader().advance_lower_and_check_if_present(key, pos);
|
||||
return f.then([this, &slice, &pc, key] (bool present) mutable {
|
||||
if (!present) {
|
||||
_sst->get_filter_tracker().add_false_positive();
|
||||
@@ -179,17 +177,15 @@ public:
|
||||
|
||||
_sst->get_filter_tracker().add_true_positive();
|
||||
|
||||
auto f = advance_to_upper_bound(*_index_reader, *_schema, slice, key);
|
||||
return f.then([this, &slice, &pc] () mutable {
|
||||
auto [start, end] = _index_reader->data_file_positions();
|
||||
assert(end);
|
||||
_read_enabled = (start != *end);
|
||||
_context = data_consume_single_partition<DataConsumeRowsContext>(*_schema, _sst, _consumer,
|
||||
{ start, *end });
|
||||
_monitor.on_read_started(_context->reader_position());
|
||||
_will_likely_slice = will_likely_slice(slice);
|
||||
_index_in_current_partition = true;
|
||||
});
|
||||
auto [start, end] = _index_reader->data_file_positions();
|
||||
assert(end);
|
||||
_read_enabled = (start != *end);
|
||||
_context = data_consume_single_partition<DataConsumeRowsContext>(*_schema, _sst, _consumer,
|
||||
{ start, *end });
|
||||
_monitor.on_read_started(_context->reader_position());
|
||||
_will_likely_slice = will_likely_slice(slice);
|
||||
_index_in_current_partition = true;
|
||||
return make_ready_future<>();
|
||||
});
|
||||
})
|
||||
, _fwd(fwd)
|
||||
@@ -231,25 +227,25 @@ private:
|
||||
return make_ready_future<>();
|
||||
}
|
||||
return (_index_in_current_partition
|
||||
? _index_reader->advance_lower_to_next_partition()
|
||||
: get_index_reader().advance_lower_to(dht::ring_position_view::for_after_key(*_current_partition_key))).then([this] {
|
||||
? _index_reader->advance_to_next_partition()
|
||||
: get_index_reader().advance_to(dht::ring_position_view::for_after_key(*_current_partition_key))).then([this] {
|
||||
_index_in_current_partition = true;
|
||||
auto [start, end] = _index_reader->data_file_positions();
|
||||
if (end && start > *end) {
|
||||
_read_enabled = false;
|
||||
return make_ready_future<>();
|
||||
}
|
||||
return _context->skip_to(_index_reader->lower_element_kind(), start);
|
||||
return _context->skip_to(_index_reader->element_kind(), start);
|
||||
});
|
||||
}
|
||||
future<> read_from_index() {
|
||||
sstlog.trace("reader {}: read from index", this);
|
||||
auto tomb = _index_reader->lower_partition_tombstone();
|
||||
auto tomb = _index_reader->partition_tombstone();
|
||||
if (!tomb) {
|
||||
sstlog.trace("reader {}: no tombstone", this);
|
||||
return read_from_datafile();
|
||||
}
|
||||
auto pk = _index_reader->lower_partition_key().to_partition_key(*_schema);
|
||||
auto pk = _index_reader->partition_key().to_partition_key(*_schema);
|
||||
auto key = dht::global_partitioner().decorate_key(*_schema, std::move(pk));
|
||||
_consumer.setup_for_partition(key.key());
|
||||
on_next_partition(std::move(key), tombstone(*tomb));
|
||||
@@ -303,11 +299,11 @@ private:
|
||||
sstlog.trace("reader {}: eof", this);
|
||||
return make_ready_future<>();
|
||||
}
|
||||
if (_index_reader->lower_partition_data_ready()) {
|
||||
if (_index_reader->partition_data_ready()) {
|
||||
return read_from_index();
|
||||
}
|
||||
if (_will_likely_slice) {
|
||||
return _index_reader->read_lower_partition_data().then([this] {
|
||||
return _index_reader->read_partition_data().then([this] {
|
||||
return read_from_index();
|
||||
});
|
||||
}
|
||||
@@ -341,14 +337,14 @@ private:
|
||||
return [this] {
|
||||
if (!_index_in_current_partition) {
|
||||
_index_in_current_partition = true;
|
||||
return get_index_reader().advance_lower_to(*_current_partition_key);
|
||||
return get_index_reader().advance_to(*_current_partition_key);
|
||||
}
|
||||
return make_ready_future();
|
||||
}().then([this, pos] {
|
||||
return get_index_reader().advance_lower_to(*pos).then([this] {
|
||||
return get_index_reader().advance_to(*pos).then([this] {
|
||||
index_reader& idx = *_index_reader;
|
||||
auto index_position = idx.data_file_positions();
|
||||
return _context->skip_to(idx.lower_element_kind(), index_position.start);
|
||||
return _context->skip_to(idx.element_kind(), index_position.start);
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
@@ -33,12 +33,13 @@ class incremental_selector_impl;
|
||||
|
||||
class sstable_set {
|
||||
std::unique_ptr<sstable_set_impl> _impl;
|
||||
schema_ptr _schema;
|
||||
// used to support column_family::get_sstable(), which wants to return an sstable_list
|
||||
// that has a reference somewhere
|
||||
lw_shared_ptr<sstable_list> _all;
|
||||
public:
|
||||
~sstable_set();
|
||||
sstable_set(std::unique_ptr<sstable_set_impl> impl, lw_shared_ptr<sstable_list> all);
|
||||
sstable_set(std::unique_ptr<sstable_set_impl> impl, schema_ptr s, lw_shared_ptr<sstable_list> all);
|
||||
sstable_set(const sstable_set&);
|
||||
sstable_set(sstable_set&&) noexcept;
|
||||
sstable_set& operator=(const sstable_set&);
|
||||
@@ -48,31 +49,40 @@ public:
|
||||
void insert(shared_sstable sst);
|
||||
void erase(shared_sstable sst);
|
||||
|
||||
// Used to incrementally select sstables from sstable set using tokens.
|
||||
// Used to incrementally select sstables from sstable set using ring-position.
|
||||
// sstable set must be alive and cannot be modified while incremental
|
||||
// selector is used.
|
||||
class incremental_selector {
|
||||
std::unique_ptr<incremental_selector_impl> _impl;
|
||||
mutable stdx::optional<dht::token_range> _current_token_range;
|
||||
dht::ring_position_comparator _cmp;
|
||||
mutable std::optional<dht::partition_range> _current_range;
|
||||
mutable std::optional<nonwrapping_range<dht::ring_position_view>> _current_range_view;
|
||||
mutable std::vector<shared_sstable> _current_sstables;
|
||||
mutable dht::ring_position _current_next_position = dht::ring_position::min();
|
||||
mutable dht::ring_position_view _current_next_position = dht::ring_position_view::min();
|
||||
public:
|
||||
~incremental_selector();
|
||||
incremental_selector(std::unique_ptr<incremental_selector_impl> impl);
|
||||
incremental_selector(std::unique_ptr<incremental_selector_impl> impl, const schema& s);
|
||||
incremental_selector(incremental_selector&&) noexcept;
|
||||
|
||||
struct selection {
|
||||
const std::vector<shared_sstable>& sstables;
|
||||
dht::ring_position next_position;
|
||||
dht::ring_position_view next_position;
|
||||
};
|
||||
|
||||
// Return the sstables that intersect with t and the best next
|
||||
// token (inclusive) to call select() with so that the least
|
||||
// amount of sstables will be returned (without skipping any).
|
||||
// NOTE: selection.sstables is a reference to an internal cache
|
||||
// and can be invalidated by another call to select().
|
||||
// If you need it long-term copy it!
|
||||
selection select(const dht::token& t) const;
|
||||
// Return the sstables that intersect with `pos` and the next
|
||||
// position where the intersecting sstables change.
|
||||
// To walk through the token range incrementally call `select()`
|
||||
// with `dht::ring_position_view::min()` and then pass back the
|
||||
// returned `next_position` on each next call until
|
||||
// `next_position` becomes `dht::ring_position::max()`.
|
||||
//
|
||||
// Successive calls to `select()` have to pass weakly monotonic
|
||||
// positions (incrementability).
|
||||
//
|
||||
// NOTE: both `selection.sstables` and `selection.next_position`
|
||||
// are only guaranteed to be valid until the next call to
|
||||
// `select()`.
|
||||
selection select(const dht::ring_position_view& pos) const;
|
||||
};
|
||||
incremental_selector make_incremental_selector() const;
|
||||
};
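The select() contract documented above (seed with dht::ring_position_view::min(), then feed back next_position until it reaches max()) is essentially a cursor walk. A hedged sketch of that walk with integer positions standing in for ring positions and a toy selector; none of these names are the Scylla API:

#include <iostream>
#include <limits>
#include <vector>

// Toy stand-in for sstable_set::incremental_selector: each select(pos) returns
// the "sstables" covering pos plus the next position where the answer changes.
struct selection {
    std::vector<int> sstables;
    int next_position;
};

struct toy_selector {
    // boundaries where the set of covering sstables changes
    std::vector<int> boundaries{10, 20, 30};

    selection select(int pos) const {
        for (int b : boundaries) {
            if (pos < b) {
                return {{pos / 10}, b};               // fake sstable id per decade
            }
        }
        return {{}, std::numeric_limits<int>::max()};  // analogous to ring_position::max()
    }
};

int main() {
    toy_selector sel;
    int pos = std::numeric_limits<int>::min();          // analogous to ring_position_view::min()
    while (pos != std::numeric_limits<int>::max()) {
        selection s = sel.select(pos);
        std::cout << "select(" << pos << ") -> next " << s.next_position << "\n";
        pos = s.next_position;                           // weakly monotonic, as the contract requires
    }
}
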
|
||||
|
||||
@@ -111,44 +111,6 @@ enum class indexable_element {
|
||||
cell
|
||||
};
|
||||
|
||||
class promoted_index_block {
|
||||
public:
|
||||
promoted_index_block(temporary_buffer<char>&& start, temporary_buffer<char>&& end,
|
||||
uint64_t offset, uint64_t width)
|
||||
: _start(std::move(start)), _end(std::move(end))
|
||||
, _offset(offset), _width(width)
|
||||
{}
|
||||
promoted_index_block(const promoted_index_block& rhs)
|
||||
: _start(rhs._start.get(), rhs._start.size()), _end(rhs._end.get(), rhs._end.size())
|
||||
, _offset(rhs._offset), _width(rhs._width)
|
||||
{}
|
||||
promoted_index_block(promoted_index_block&&) noexcept = default;
|
||||
|
||||
promoted_index_block& operator=(const promoted_index_block& rhs) {
|
||||
if (this != &rhs) {
|
||||
_start = temporary_buffer<char>(rhs._start.get(), rhs._start.size());
|
||||
_end = temporary_buffer<char>(rhs._end.get(), rhs._end.size());
|
||||
_offset = rhs._offset;
|
||||
_width = rhs._width;
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
promoted_index_block& operator=(promoted_index_block&&) noexcept = default;
|
||||
|
||||
composite_view start(const schema& s) const { return composite_view(to_bytes_view(_start), s.is_compound());}
|
||||
composite_view end(const schema& s) const { return composite_view(to_bytes_view(_end), s.is_compound());}
|
||||
uint64_t offset() const { return _offset; }
|
||||
uint64_t width() const { return _width; }
|
||||
|
||||
private:
|
||||
temporary_buffer<char> _start;
|
||||
temporary_buffer<char> _end;
|
||||
uint64_t _offset;
|
||||
uint64_t _width;
|
||||
};
|
||||
|
||||
using promoted_index_blocks = seastar::circular_buffer<promoted_index_block>;
|
||||
|
||||
class summary_entry {
|
||||
public:
|
||||
dht::token_view token;
|
||||
|
||||
13
test.py
@@ -112,6 +112,7 @@ boost_tests = [
'sstable_3_x_test',
'meta_test',
'reusable_buffer_test',
'multishard_writer_test',
]

other_tests = [
@@ -166,6 +167,10 @@ def alarm_handler(signum, frame):
if __name__ == "__main__":
all_modes = ['debug', 'release']

sysmem = os.sysconf('SC_PAGE_SIZE') * os.sysconf('SC_PHYS_PAGES')
testmem = 2e9
default_num_jobs = ((sysmem - 4e9) // testmem)

parser = argparse.ArgumentParser(description="Scylla test runner")
parser.add_argument('--fast', action="store_true",
help="Run only fast tests")
@@ -179,6 +184,8 @@ if __name__ == "__main__":
help="jenkins output file prefix")
parser.add_argument('--verbose', '-v', action='store_true', default=False,
help='Verbose reporting')
parser.add_argument('--jobs', '-j', action="store", default=default_num_jobs, type=int,
help="Number of jobs to use for running the tests")
args = parser.parse_args()

print_progress = print_status_verbose if args.verbose else print_progress_succint
@@ -235,7 +242,7 @@ if __name__ == "__main__":
mode = 'debug'
xmlout = (args.jenkins + "." + mode + "." +
os.path.basename(path.split()[0]) + ".boost.xml")
boost_args += ['--report_level=no', '--logger=XML,test_suite,' + xmlout]
boost_args += ['--report_level=no', '--logger=HRF,test_suite:XML,test_suite,' + xmlout]
if type == 'boost':
boost_args += ['--']
def report_error(out, report_subcause):
@@ -265,9 +272,7 @@ if __name__ == "__main__":
print(' with error {e}\n'.format(e=e), file=file)
report_error(e, report_subcause=report_subcause)
return (path, boost_args + exec_args, success, file.getvalue())
sysmem = os.sysconf('SC_PAGE_SIZE') * os.sysconf('SC_PHYS_PAGES')
testmem = 2e9
executor = concurrent.futures.ThreadPoolExecutor(max_workers=((sysmem - 4e9) // testmem))
executor = concurrent.futures.ThreadPoolExecutor(max_workers=args.jobs)
futures = []
for n, test in enumerate(test_to_run):
path = test[0]

@@ -103,7 +103,7 @@ mutation make_incomplete_mutation() {
return mutation(SCHEMA, DK, mutation_partition::make_incomplete(*SCHEMA));
}

static void assert_single_version(lw_shared_ptr<partition_snapshot> snp) {
static void assert_single_version(partition_snapshot_ptr snp) {
BOOST_REQUIRE(snp->at_latest_version());
BOOST_REQUIRE_EQUAL(snp->version_count(), 1);
}
@@ -140,7 +140,7 @@ struct expected_row {
}
};

static void assert_cached_rows(lw_shared_ptr<partition_snapshot> snp, std::deque<expected_row> expected) {
static void assert_cached_rows(partition_snapshot_ptr snp, std::deque<expected_row> expected) {
auto&& rows = snp->version()->partition().clustered_rows();
for (auto&& r : rows) {
BOOST_REQUIRE(!expected.empty());
@@ -173,7 +173,7 @@ struct expected_tombstone {
}
};

static void assert_cached_tombstones(lw_shared_ptr<partition_snapshot> snp, std::deque<range_tombstone> expected) {
static void assert_cached_tombstones(partition_snapshot_ptr snp, std::deque<range_tombstone> expected) {
const range_tombstone_list& rts = snp->version()->partition().row_tombstones();
for (auto&& rt : rts) {
BOOST_REQUIRE(!expected.empty());
@@ -187,7 +187,7 @@ static void assert_cached_tombstones(lw_shared_ptr<partition_snapshot> snp, std:

class cache_tester {
public:
static lw_shared_ptr<partition_snapshot> snapshot_for_key(row_cache& rc, const dht::decorated_key& dk) {
static partition_snapshot_ptr snapshot_for_key(row_cache& rc, const dht::decorated_key& dk) {
return rc._read_section(rc._tracker.region(), [&] {
return with_linearized_managed_bytes([&] {
cache_entry& e = rc.find_or_create(dk, {}, rc.phase_of(dk));

@@ -2993,3 +2993,274 @@ SEASTAR_TEST_CASE(test_time_conversions) {
|
||||
|
||||
});
|
||||
}
|
||||
|
||||
// Corner-case test that checks for the paging code's preparedness for an empty
|
||||
// range list.
|
||||
SEASTAR_TEST_CASE(test_empty_partition_range_scan) {
|
||||
return do_with_cql_env_thread([] (cql_test_env& e) {
|
||||
e.execute_cql("create keyspace empty_partition_range_scan with replication = {'class': 'SimpleStrategy', 'replication_factor': 1};").get();
|
||||
e.execute_cql("create table empty_partition_range_scan.tb (a int, b int, c int, val int, PRIMARY KEY ((a,b),c) );").get();
|
||||
|
||||
|
||||
auto qo = std::make_unique<cql3::query_options>(db::consistency_level::LOCAL_ONE, infinite_timeout_config, std::vector<cql3::raw_value>{},
|
||||
cql3::query_options::specific_options{1, nullptr, {}, api::new_timestamp()});
|
||||
auto res = e.execute_cql("select * from empty_partition_range_scan.tb where token (a,b) > 1 and token(a,b) <= 1;", std::move(qo)).get0();
|
||||
assert_that(res).is_rows().is_empty();
|
||||
});
|
||||
}
|
||||
|
||||
SEASTAR_TEST_CASE(test_allow_filtering_check) {
|
||||
return do_with_cql_env_thread([] (cql_test_env& e) {
|
||||
e.execute_cql("CREATE TABLE t (p int, c int, v int, PRIMARY KEY(p, c));").get();
|
||||
e.require_table_exists("ks", "t").get();
|
||||
|
||||
for (int i = 0; i < 3; ++i) {
|
||||
for (int j = 0; j <3; ++j) {
|
||||
e.execute_cql(sprint("INSERT INTO t(p, c, v) VALUES (%s, %s, %s)", i, j, j)).get();
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<sstring> queries = {
|
||||
"SELECT * FROM t WHERE p = 1",
|
||||
"SELECT * FROM t WHERE p = 1 and c > 2",
|
||||
"SELECT * FROM t WHERE p = 1 and c = 2"
|
||||
};
|
||||
|
||||
for (const sstring& q : queries) {
|
||||
e.execute_cql(q).get();
|
||||
e.execute_cql(q + " ALLOW FILTERING").get();
|
||||
}
|
||||
|
||||
queries = {
|
||||
"SELECT * FROM t WHERE c = 2",
|
||||
"SELECT * FROM t WHERE c <= 4"
|
||||
};
|
||||
|
||||
for (const sstring& q : queries) {
|
||||
BOOST_CHECK_THROW(e.execute_cql(q).get(), exceptions::invalid_request_exception);
|
||||
e.execute_cql(q + " ALLOW FILTERING").get();
|
||||
}
|
||||
|
||||
e.execute_cql("CREATE TABLE t2 (p int PRIMARY KEY, a int, b int);").get();
|
||||
e.require_table_exists("ks", "t2").get();
|
||||
e.execute_cql("CREATE INDEX ON t2(a)").get();
|
||||
for (int i = 0; i < 5; ++i) {
|
||||
e.execute_cql(sprint("INSERT INTO t2 (p, a, b) VALUES (%s, %s, %s)", i, i * 10, i * 100)).get();
|
||||
}
|
||||
|
||||
queries = {
|
||||
"SELECT * FROM t2 WHERE p = 1",
|
||||
"SELECT * FROM t2 WHERE a = 20"
|
||||
};
|
||||
|
||||
for (const sstring& q : queries) {
|
||||
e.execute_cql(q).get();
|
||||
e.execute_cql(q + " ALLOW FILTERING").get();
|
||||
}
|
||||
|
||||
queries = {
|
||||
"SELECT * FROM t2 WHERE a = 20 AND b = 200"
|
||||
};
|
||||
|
||||
for (const sstring& q : queries) {
|
||||
BOOST_CHECK_THROW(e.execute_cql(q).get(), exceptions::invalid_request_exception);
|
||||
e.execute_cql(q + " ALLOW FILTERING").get();
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
SEASTAR_TEST_CASE(test_allow_filtering_pk_ck) {
|
||||
return do_with_cql_env_thread([] (cql_test_env& e) {
|
||||
e.execute_cql("CREATE TABLE t (a int, b int, c int, d int, e int, PRIMARY KEY ((a, b), c, d));").get();
|
||||
e.require_table_exists("ks", "t").get();
|
||||
e.execute_cql("INSERT INTO t (a,b,c,d,e) VALUES (11, 12, 13, 14, 15)").get();
|
||||
e.execute_cql("INSERT INTO t (a,b,c,d,e) VALUES (11, 15, 16, 17, 18)").get();
|
||||
e.execute_cql("INSERT INTO t (a,b,c,d,e) VALUES (21, 22, 23, 24, 25)").get();
|
||||
e.execute_cql("INSERT INTO t (a,b,c,d,e) VALUES (31, 32, 33, 34, 35)").get();
|
||||
|
||||
auto msg = e.execute_cql("SELECT * FROM t WHERE a = 11 AND b = 15 AND c = 16").get0();
|
||||
assert_that(msg).is_rows().with_rows({{
|
||||
int32_type->decompose(11),
|
||||
int32_type->decompose(15),
|
||||
int32_type->decompose(16),
|
||||
int32_type->decompose(17),
|
||||
int32_type->decompose(18),
|
||||
}});
|
||||
|
||||
BOOST_CHECK_THROW(e.execute_cql("SELECT * FROM t WHERE a = 11 AND b = 12 AND c > 13 AND d = 14").get(), exceptions::invalid_request_exception);
|
||||
|
||||
msg = e.execute_cql("SELECT * FROM t WHERE a = 11 AND b = 15 AND c = 16").get0();
|
||||
assert_that(msg).is_rows().with_rows({{
|
||||
int32_type->decompose(11),
|
||||
int32_type->decompose(15),
|
||||
int32_type->decompose(16),
|
||||
int32_type->decompose(17),
|
||||
int32_type->decompose(18),
|
||||
}});
|
||||
|
||||
msg = e.execute_cql("SELECT * FROM t WHERE a = 11 AND b = 15 AND c > 13 AND d >= 17 ALLOW FILTERING").get0();
|
||||
assert_that(msg).is_rows().with_rows({{
|
||||
int32_type->decompose(11),
|
||||
int32_type->decompose(15),
|
||||
int32_type->decompose(16),
|
||||
int32_type->decompose(17),
|
||||
int32_type->decompose(18),
|
||||
}});
|
||||
|
||||
BOOST_CHECK_THROW(e.execute_cql("SELECT * FROM t WHERE a = 11 AND b = 12 AND c > 13 AND d > 17").get(), exceptions::invalid_request_exception);
|
||||
|
||||
msg = e.execute_cql("SELECT * FROM t WHERE a = 11 AND b = 15 AND c > 13 AND d >= 17 ALLOW FILTERING").get0();
|
||||
assert_that(msg).is_rows().with_rows({{
|
||||
int32_type->decompose(11),
|
||||
int32_type->decompose(15),
|
||||
int32_type->decompose(16),
|
||||
int32_type->decompose(17),
|
||||
int32_type->decompose(18),
|
||||
}});
|
||||
|
||||
msg = e.execute_cql("SELECT * FROM t WHERE a <= 11 AND c > 15 AND d >= 16 ALLOW FILTERING").get0();
|
||||
assert_that(msg).is_rows().with_rows({{
|
||||
int32_type->decompose(11),
|
||||
int32_type->decompose(15),
|
||||
int32_type->decompose(16),
|
||||
int32_type->decompose(17),
|
||||
int32_type->decompose(18),
|
||||
}});
|
||||
|
||||
msg = e.execute_cql("SELECT * FROM t WHERE a <= 11 AND b >= 15 AND c > 15 AND d >= 16 ALLOW FILTERING").get0();
|
||||
assert_that(msg).is_rows().with_rows({{
|
||||
int32_type->decompose(11),
|
||||
int32_type->decompose(15),
|
||||
int32_type->decompose(16),
|
||||
int32_type->decompose(17),
|
||||
int32_type->decompose(18),
|
||||
}});
|
||||
|
||||
msg = e.execute_cql("SELECT * FROM t WHERE a <= 100 AND b >= 15 AND c > 0 AND d <= 100 ALLOW FILTERING").get0();
|
||||
assert_that(msg).is_rows().with_rows({
|
||||
{
|
||||
int32_type->decompose(11),
|
||||
int32_type->decompose(15),
|
||||
int32_type->decompose(16),
|
||||
int32_type->decompose(17),
|
||||
int32_type->decompose(18),
|
||||
},
|
||||
{
|
||||
int32_type->decompose(31),
|
||||
int32_type->decompose(32),
|
||||
int32_type->decompose(33),
|
||||
int32_type->decompose(34),
|
||||
int32_type->decompose(35),
|
||||
},
|
||||
{
|
||||
int32_type->decompose(21),
|
||||
int32_type->decompose(22),
|
||||
int32_type->decompose(23),
|
||||
int32_type->decompose(24),
|
||||
int32_type->decompose(25),
|
||||
}
|
||||
});
|
||||
|
||||
BOOST_CHECK_THROW(e.execute_cql("SELECT * FROM t WHERE a <= 11 AND c > 15 AND d >= 16").get(), exceptions::invalid_request_exception);
|
||||
});
|
||||
}
|
||||
|
||||
SEASTAR_TEST_CASE(test_allow_filtering_clustering_column) {
|
||||
return do_with_cql_env_thread([] (cql_test_env& e) {
|
||||
e.execute_cql("CREATE TABLE t (k int, c int, v int, PRIMARY KEY (k, c));").get();
|
||||
e.require_table_exists("ks", "t").get();
|
||||
|
||||
e.execute_cql("INSERT INTO t (k, c, v) VALUES (1, 2, 1)").get();
|
||||
e.execute_cql("INSERT INTO t (k, c, v) VALUES (1, 3, 2)").get();
|
||||
e.execute_cql("INSERT INTO t (k, c, v) VALUES (2, 2, 3)").get();
|
||||
|
||||
auto msg = e.execute_cql("SELECT * FROM t WHERE k = 1").get0();
|
||||
assert_that(msg).is_rows().with_rows({
|
||||
{
|
||||
int32_type->decompose(1),
|
||||
int32_type->decompose(2),
|
||||
int32_type->decompose(1)
|
||||
},
|
||||
{
|
||||
int32_type->decompose(1),
|
||||
int32_type->decompose(3),
|
||||
int32_type->decompose(2)
|
||||
}
|
||||
});
|
||||
|
||||
msg = e.execute_cql("SELECT * FROM t WHERE k = 1 AND c > 2").get0();
|
||||
assert_that(msg).is_rows().with_rows({{
|
||||
int32_type->decompose(1),
|
||||
int32_type->decompose(3),
|
||||
int32_type->decompose(2)
|
||||
}});
|
||||
|
||||
msg = e.execute_cql("SELECT * FROM t WHERE k = 1 AND c = 2").get0();
|
||||
assert_that(msg).is_rows().with_rows({{
|
||||
int32_type->decompose(1),
|
||||
int32_type->decompose(2),
|
||||
int32_type->decompose(1)
|
||||
}});
|
||||
|
||||
BOOST_CHECK_THROW(e.execute_cql("SELECT * FROM t WHERE c = 2").get(), exceptions::invalid_request_exception);
|
||||
BOOST_CHECK_THROW(e.execute_cql("SELECT * FROM t WHERE c > 2 AND c <= 4").get(), exceptions::invalid_request_exception);
|
||||
|
||||
msg = e.execute_cql("SELECT * FROM t WHERE c = 2 ALLOW FILTERING").get0();
|
||||
assert_that(msg).is_rows().with_rows({
|
||||
{
|
||||
int32_type->decompose(1),
|
||||
int32_type->decompose(2),
|
||||
int32_type->decompose(1)
|
||||
},
|
||||
{
|
||||
int32_type->decompose(2),
|
||||
int32_type->decompose(2),
|
||||
int32_type->decompose(3)
|
||||
}
|
||||
});
|
||||
|
||||
msg = e.execute_cql("SELECT * FROM t WHERE c > 2 AND c <= 4 ALLOW FILTERING").get0();
|
||||
assert_that(msg).is_rows().with_rows({{
|
||||
int32_type->decompose(1),
|
||||
int32_type->decompose(3),
|
||||
int32_type->decompose(2)
|
||||
}});
|
||||
});
|
||||
}
|
||||
|
||||
SEASTAR_TEST_CASE(test_allow_filtering_static_column) {
|
||||
return do_with_cql_env_thread([] (cql_test_env& e) {
|
||||
e.execute_cql("CREATE TABLE t (a int, b int, c int, s int static, PRIMARY KEY(a, b));").get();
|
||||
e.require_table_exists("ks", "t").get();
|
||||
e.execute_cql("CREATE INDEX ON t(c)").get();
|
||||
|
||||
e.execute_cql("INSERT INTO t (a, b, c, s) VALUES (1, 1, 1, 1)").get();
|
||||
e.execute_cql("INSERT INTO t (a, b, c) VALUES (1, 2, 1)").get();
|
||||
e.execute_cql("INSERT INTO t (a, s) VALUES (3, 3)").get();
|
||||
e.execute_cql("INSERT INTO t (a, b, c, s) VALUES (2, 1, 1, 2)").get();
|
||||
|
||||
auto msg = e.execute_cql("SELECT * FROM t WHERE c = 1 AND s = 2 ALLOW FILTERING").get0();
|
||||
assert_that(msg).is_rows().with_rows({{
|
||||
int32_type->decompose(2),
|
||||
int32_type->decompose(1),
|
||||
int32_type->decompose(2),
|
||||
int32_type->decompose(1)
|
||||
}});
|
||||
|
||||
msg = e.execute_cql("SELECT * FROM t WHERE c = 1 AND s = 1 ALLOW FILTERING").get0();
|
||||
assert_that(msg).is_rows().with_rows({
|
||||
{
|
||||
int32_type->decompose(1),
|
||||
int32_type->decompose(1),
|
||||
int32_type->decompose(1),
|
||||
int32_type->decompose(1)
|
||||
},
|
||||
{
|
||||
int32_type->decompose(1),
|
||||
int32_type->decompose(2),
|
||||
int32_type->decompose(1),
|
||||
int32_type->decompose(1)
|
||||
}
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
@@ -29,6 +29,9 @@
|
||||
#include "database.hh"
|
||||
#include "partition_slice_builder.hh"
|
||||
#include "frozen_mutation.hh"
|
||||
#include "mutation_source_test.hh"
|
||||
#include "schema_registry.hh"
|
||||
#include "service/migration_manager.hh"
|
||||
|
||||
SEASTAR_TEST_CASE(test_querying_with_limits) {
|
||||
return do_with_cql_env([](cql_test_env& e) {
|
||||
@@ -74,3 +77,33 @@ SEASTAR_TEST_CASE(test_querying_with_limits) {
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
SEASTAR_THREAD_TEST_CASE(test_database_with_data_in_sstables_is_a_mutation_source) {
|
||||
do_with_cql_env([] (cql_test_env& e) {
|
||||
run_mutation_source_tests([&] (schema_ptr s, const std::vector<mutation>& partitions) -> mutation_source {
|
||||
try {
|
||||
e.local_db().find_column_family(s->ks_name(), s->cf_name());
|
||||
service::get_local_migration_manager().announce_column_family_drop(s->ks_name(), s->cf_name(), true).get();
|
||||
} catch (const no_such_column_family&) {
|
||||
// expected
|
||||
}
|
||||
service::get_local_migration_manager().announce_new_column_family(s, true).get();
|
||||
column_family& cf = e.local_db().find_column_family(s);
|
||||
for (auto&& m : partitions) {
|
||||
e.local_db().apply(cf.schema(), freeze(m)).get();
|
||||
}
|
||||
cf.flush().get();
|
||||
cf.get_row_cache().invalidate([] {}).get();
|
||||
return mutation_source([&] (schema_ptr s,
|
||||
const dht::partition_range& range,
|
||||
const query::partition_slice& slice,
|
||||
const io_priority_class& pc,
|
||||
tracing::trace_state_ptr trace_state,
|
||||
streamed_mutation::forwarding fwd,
|
||||
mutation_reader::forwarding fwd_mr) {
|
||||
return cf.make_reader(s, range, slice, pc, std::move(trace_state), fwd, fwd_mr);
|
||||
});
|
||||
});
|
||||
return make_ready_future<>();
|
||||
}).get();
|
||||
}
|
||||
|
||||
110
tests/multishard_writer_test.cc
Normal file
@@ -0,0 +1,110 @@
|
||||
/*
|
||||
* Copyright (C) 2018 ScyllaDB
|
||||
*/
|
||||
|
||||
/*
|
||||
* This file is part of Scylla.
|
||||
*
|
||||
* Scylla is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Affero General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* Scylla is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
|
||||
#include <seastar/core/thread.hh>
|
||||
#include <seastar/tests/test-utils.hh>
|
||||
#include <seastar/util/bool_class.hh>
|
||||
|
||||
#include "mutation_fragment.hh"
|
||||
#include "mutation_source_test.hh"
|
||||
#include "flat_mutation_reader.hh"
|
||||
#include "multishard_writer.hh"
|
||||
#include "tests/cql_test_env.hh"
|
||||
|
||||
struct generate_error_tag { };
|
||||
using generate_error = bool_class<generate_error_tag>;
|
||||
|
||||
|
||||
constexpr unsigned many_partitions() {
|
||||
return
|
||||
#ifndef SEASTAR_DEBUG
|
||||
1000
|
||||
#else
|
||||
10
|
||||
#endif
|
||||
;
|
||||
}
|
||||
|
||||
SEASTAR_TEST_CASE(test_multishard_writer) {
|
||||
return do_with_cql_env_thread([] (cql_test_env& e) {
|
||||
auto test_random_streams = [] (random_mutation_generator&& gen, size_t partition_nr, generate_error error = generate_error::no) {
|
||||
for (auto i = 0; i < 3; i++) {
|
||||
auto muts = gen(partition_nr);
|
||||
std::vector<size_t> shards_before(smp::count, 0);
|
||||
std::vector<size_t> shards_after(smp::count, 0);
|
||||
|
||||
for (auto& m : muts) {
|
||||
auto shard = dht::global_partitioner().shard_of(m.token());
|
||||
shards_before[shard]++;
|
||||
}
|
||||
schema_ptr s = gen.schema();
|
||||
auto source_reader = partition_nr > 0 ? flat_mutation_reader_from_mutations(muts) : make_empty_flat_reader(s);
|
||||
size_t partitions_received = distribute_reader_and_consume_on_shards(s,
|
||||
dht::global_partitioner(),
|
||||
std::move(source_reader),
|
||||
[&shards_after, error] (flat_mutation_reader reader) mutable {
|
||||
if (error) {
|
||||
return make_exception_future<>(std::runtime_error("Failed to write"));
|
||||
}
|
||||
return repeat([&shards_after, reader = std::move(reader), error] () mutable {
|
||||
return reader().then([&shards_after, error] (mutation_fragment_opt mf_opt) mutable {
|
||||
if (mf_opt) {
|
||||
if (mf_opt->is_partition_start()) {
|
||||
auto shard = dht::global_partitioner().shard_of(mf_opt->as_partition_start().key().token());
|
||||
BOOST_REQUIRE_EQUAL(shard, engine().cpu_id());
|
||||
shards_after[shard]++;
|
||||
}
|
||||
return make_ready_future<stop_iteration>(stop_iteration::no);
|
||||
} else {
|
||||
return make_ready_future<stop_iteration>(stop_iteration::yes);
|
||||
}
|
||||
});
|
||||
});
|
||||
}
|
||||
).get0();
|
||||
BOOST_REQUIRE_EQUAL(partitions_received, partition_nr);
|
||||
BOOST_REQUIRE_EQUAL(shards_after, shards_before);
|
||||
}
|
||||
};
|
||||
|
||||
test_random_streams(random_mutation_generator(random_mutation_generator::generate_counters::no, local_shard_only::no), 0);
|
||||
test_random_streams(random_mutation_generator(random_mutation_generator::generate_counters::yes, local_shard_only::no), 0);
|
||||
|
||||
test_random_streams(random_mutation_generator(random_mutation_generator::generate_counters::no, local_shard_only::no), 1);
|
||||
test_random_streams(random_mutation_generator(random_mutation_generator::generate_counters::yes, local_shard_only::no), 1);
|
||||
|
||||
test_random_streams(random_mutation_generator(random_mutation_generator::generate_counters::no, local_shard_only::no), many_partitions());
|
||||
test_random_streams(random_mutation_generator(random_mutation_generator::generate_counters::yes, local_shard_only::no), many_partitions());
|
||||
|
||||
try {
|
||||
test_random_streams(random_mutation_generator(random_mutation_generator::generate_counters::no, local_shard_only::no), many_partitions(), generate_error::yes);
|
||||
BOOST_ASSERT(false);
|
||||
} catch (...) {
|
||||
}
|
||||
|
||||
try {
|
||||
test_random_streams(random_mutation_generator(random_mutation_generator::generate_counters::yes, local_shard_only::no), many_partitions(), generate_error::yes);
|
||||
BOOST_ASSERT(false);
|
||||
} catch (...) {
|
||||
}
|
||||
});
|
||||
}
|
||||
@@ -384,7 +384,7 @@ struct sst_factory {
|
||||
schema_ptr s;
|
||||
sstring path;
|
||||
unsigned gen;
|
||||
int level;
|
||||
uint32_t level;
|
||||
|
||||
sst_factory(schema_ptr s, const sstring& path, unsigned gen, int level)
|
||||
: s(s)
|
||||
@@ -396,140 +396,115 @@ struct sst_factory {
|
||||
sstables::shared_sstable operator()() {
|
||||
auto sst = sstables::make_sstable(s, path, gen, sstables::sstable::version_types::la, sstables::sstable::format_types::big);
|
||||
sst->set_unshared();
|
||||
|
||||
//TODO set sstable level, to make the test more interesting
|
||||
//sst->set_sstable_level(level);
|
||||
sst->get_metadata_collector().sstable_level(level);
|
||||
|
||||
return sst;
|
||||
}
|
||||
};
|
||||
|
||||
SEASTAR_TEST_CASE(combined_mutation_reader_test) {
|
||||
return seastar::async([] {
|
||||
storage_service_for_tests ssft;
|
||||
//logging::logger_registry().set_logger_level("database", logging::log_level::trace);
|
||||
SEASTAR_THREAD_TEST_CASE(combined_mutation_reader_test) {
|
||||
storage_service_for_tests ssft;
|
||||
|
||||
simple_schema s;
|
||||
simple_schema s;
|
||||
|
||||
const auto pkeys = s.make_pkeys(4);
|
||||
const auto ckeys = s.make_ckeys(4);
|
||||
auto pkeys = s.make_pkeys(6);
|
||||
const auto ckeys = s.make_ckeys(4);
|
||||
|
||||
std::vector<mutation> base_mutations = boost::copy_range<std::vector<mutation>>(
|
||||
pkeys | boost::adaptors::transformed([&s](const auto& k) { return mutation(s.schema(), k); }));
|
||||
boost::sort(pkeys, [&s] (const dht::decorated_key& a, const dht::decorated_key& b) {
|
||||
return a.less_compare(*s.schema(), b);
|
||||
});
|
||||
|
||||
// Data layout:
|
||||
// d[xx]
|
||||
// b[xx][xx]c
|
||||
// a[x x]
|
||||
auto make_sstable_mutations = [&] (sstring value_prefix, unsigned ckey_index, bool static_row, std::vector<unsigned> pkey_indexes) {
|
||||
std::vector<mutation> muts;
|
||||
|
||||
int i{0};
|
||||
for (auto pkey_index : pkey_indexes) {
|
||||
muts.emplace_back(s.schema(), pkeys[pkey_index]);
|
||||
auto& mut = muts.back();
|
||||
s.add_row(mut, ckeys[ckey_index], sprint("%s_%i_val", value_prefix, ckey_index));
|
||||
|
||||
// sstable d
|
||||
std::vector<mutation> table_d_mutations;
|
||||
|
||||
i = 1;
|
||||
table_d_mutations.emplace_back(base_mutations[i]);
|
||||
s.add_row(table_d_mutations.back(), ckeys[i], sprint("val_d_%i", i));
|
||||
|
||||
i = 2;
|
||||
table_d_mutations.emplace_back(base_mutations[i]);
|
||||
s.add_row(table_d_mutations.back(), ckeys[i], sprint("val_d_%i", i));
|
||||
const auto t_static_row = s.add_static_row(table_d_mutations.back(), sprint("%i_static_val", i));
|
||||
|
||||
// sstable b
|
||||
std::vector<mutation> table_b_mutations;
|
||||
|
||||
i = 0;
|
||||
table_b_mutations.emplace_back(base_mutations[i]);
|
||||
s.add_row(table_b_mutations.back(), ckeys[i], sprint("val_b_%i", i));
|
||||
|
||||
i = 1;
|
||||
table_b_mutations.emplace_back(base_mutations[i]);
|
||||
s.add_row(table_b_mutations.back(), ckeys[i], sprint("val_b_%i", i));
|
||||
|
||||
// sstable c
|
||||
std::vector<mutation> table_c_mutations;
|
||||
|
||||
i = 2;
|
||||
table_c_mutations.emplace_back(base_mutations[i]);
|
||||
const auto t_row = s.add_row(table_c_mutations.back(), ckeys[i], sprint("val_c_%i", i));
|
||||
|
||||
i = 3;
|
||||
table_c_mutations.emplace_back(base_mutations[i]);
|
||||
s.add_row(table_c_mutations.back(), ckeys[i], sprint("val_c_%i", i));
|
||||
|
||||
// sstable a
|
||||
std::vector<mutation> table_a_mutations;
|
||||
|
||||
i = 0;
|
||||
table_a_mutations.emplace_back(base_mutations[i]);
|
||||
s.add_row(table_a_mutations.back(), ckeys[i], sprint("val_a_%i", i));
|
||||
|
||||
i = 3;
|
||||
table_a_mutations.emplace_back(base_mutations[i]);
|
||||
s.add_row(table_a_mutations.back(), ckeys[i], sprint("val_a_%i", i));
|
||||
|
||||
auto tmp = make_lw_shared<tmpdir>();
|
||||
|
||||
unsigned gen{0};
|
||||
|
||||
std::vector<sstables::shared_sstable> tables = {
|
||||
make_sstable_containing(sst_factory(s.schema(), tmp->path, gen++, 0), table_a_mutations),
|
||||
make_sstable_containing(sst_factory(s.schema(), tmp->path, gen++, 1), table_b_mutations),
|
||||
make_sstable_containing(sst_factory(s.schema(), tmp->path, gen++, 1), table_c_mutations),
|
||||
make_sstable_containing(sst_factory(s.schema(), tmp->path, gen++, 2), table_d_mutations)
|
||||
};
|
||||
|
||||
auto cs = sstables::make_compaction_strategy(sstables::compaction_strategy_type::leveled, {});
|
||||
auto sstables = make_lw_shared<sstables::sstable_set>(cs.make_sstable_set(s.schema()));
|
||||
|
||||
std::vector<flat_mutation_reader> sstable_mutation_readers;
|
||||
|
||||
for (auto table : tables) {
|
||||
sstables->insert(table);
|
||||
|
||||
sstable_mutation_readers.emplace_back(
|
||||
table->read_range_rows_flat(
|
||||
s.schema(),
|
||||
query::full_partition_range,
|
||||
s.schema()->full_slice(),
|
||||
seastar::default_priority_class(),
|
||||
no_resource_tracking(),
|
||||
streamed_mutation::forwarding::no,
|
||||
mutation_reader::forwarding::yes));
|
||||
if (static_row) {
|
||||
s.add_static_row(mut, sprint("%s_static_val", value_prefix));
|
||||
}
|
||||
}
|
||||
|
||||
auto list_reader = make_combined_reader(s.schema(),
|
||||
std::move(sstable_mutation_readers));
|
||||
return muts;
|
||||
};
|
||||
|
||||
auto incremental_reader = make_local_shard_sstable_reader(
|
||||
std::vector<mutation> sstable_level_0_0_mutations = make_sstable_mutations("level_0_0", 0, true, {0, 1, 4 });
|
||||
std::vector<mutation> sstable_level_1_0_mutations = make_sstable_mutations("level_1_0", 1, false, {0, 1 });
|
||||
std::vector<mutation> sstable_level_1_1_mutations = make_sstable_mutations("level_1_1", 1, false, { 2, 3 });
|
||||
std::vector<mutation> sstable_level_2_0_mutations = make_sstable_mutations("level_2_0", 2, false, { 1, 4 });
|
||||
std::vector<mutation> sstable_level_2_1_mutations = make_sstable_mutations("level_2_1", 2, false, { 5});
|
||||
|
||||
const mutation expexted_mutation_0 = sstable_level_0_0_mutations[0] + sstable_level_1_0_mutations[0];
|
||||
const mutation expexted_mutation_1 = sstable_level_0_0_mutations[1] + sstable_level_1_0_mutations[1] + sstable_level_2_0_mutations[0];
|
||||
const mutation expexted_mutation_2 = sstable_level_1_1_mutations[0];
|
||||
const mutation expexted_mutation_3 = sstable_level_1_1_mutations[1];
|
||||
const mutation expexted_mutation_4 = sstable_level_0_0_mutations[2] + sstable_level_2_0_mutations[1];
|
||||
const mutation expexted_mutation_5 = sstable_level_2_1_mutations[0];
|
||||
|
||||
auto tmp = make_lw_shared<tmpdir>();
|
||||
|
||||
unsigned gen{0};
|
||||
std::vector<sstables::shared_sstable> sstable_list = {
|
||||
make_sstable_containing(sst_factory(s.schema(), tmp->path, ++gen, 0), std::move(sstable_level_0_0_mutations)),
|
||||
make_sstable_containing(sst_factory(s.schema(), tmp->path, ++gen, 1), std::move(sstable_level_1_0_mutations)),
|
||||
make_sstable_containing(sst_factory(s.schema(), tmp->path, ++gen, 1), std::move(sstable_level_1_1_mutations)),
|
||||
make_sstable_containing(sst_factory(s.schema(), tmp->path, ++gen, 2), std::move(sstable_level_2_0_mutations)),
|
||||
make_sstable_containing(sst_factory(s.schema(), tmp->path, ++gen, 2), std::move(sstable_level_2_1_mutations)),
|
||||
};
|
||||
|
||||
auto cs = sstables::make_compaction_strategy(sstables::compaction_strategy_type::leveled, {});
|
||||
auto sstable_set = make_lw_shared<sstables::sstable_set>(cs.make_sstable_set(s.schema()));
|
||||
|
||||
std::vector<flat_mutation_reader> sstable_mutation_readers;
|
||||
|
||||
for (auto sst : sstable_list) {
|
||||
sstable_set->insert(sst);
|
||||
|
||||
sstable_mutation_readers.emplace_back(
|
||||
sst->as_mutation_source().make_reader(
|
||||
s.schema(),
|
||||
sstables,
|
||||
query::full_partition_range,
|
||||
s.schema()->full_slice(),
|
||||
seastar::default_priority_class(),
|
||||
no_resource_tracking(),
|
||||
nullptr,
|
||||
streamed_mutation::forwarding::no,
|
||||
mutation_reader::forwarding::yes);
|
||||
mutation_reader::forwarding::no));
|
||||
}
|
||||
|
||||
// merge c[0] with d[1]
|
||||
i = 2;
|
||||
auto c_d_merged = mutation(s.schema(), pkeys[i]);
|
||||
s.add_row(c_d_merged, ckeys[i], sprint("val_c_%i", i), t_row);
|
||||
s.add_static_row(c_d_merged, sprint("%i_static_val", i), t_static_row);
|
||||
auto list_reader = make_combined_reader(s.schema(),
|
||||
std::move(sstable_mutation_readers));
|
||||
|
||||
assert_that(std::move(list_reader))
|
||||
.produces(table_a_mutations.front())
|
||||
.produces(table_b_mutations[1])
|
||||
.produces(c_d_merged)
|
||||
.produces(table_a_mutations.back());
|
||||
auto incremental_reader = make_local_shard_sstable_reader(
|
||||
s.schema(),
|
||||
sstable_set,
|
||||
query::full_partition_range,
|
||||
s.schema()->full_slice(),
|
||||
seastar::default_priority_class(),
|
||||
no_resource_tracking(),
|
||||
nullptr,
|
||||
streamed_mutation::forwarding::no,
|
||||
mutation_reader::forwarding::no);
|
||||
|
||||
assert_that(std::move(incremental_reader))
|
||||
.produces(table_a_mutations.front())
|
||||
.produces(table_b_mutations[1])
|
||||
.produces(c_d_merged)
|
||||
.produces(table_a_mutations.back());
|
||||
});
|
||||
assert_that(std::move(list_reader))
|
||||
.produces(expexted_mutation_0)
|
||||
.produces(expexted_mutation_1)
|
||||
.produces(expexted_mutation_2)
|
||||
.produces(expexted_mutation_3)
|
||||
.produces(expexted_mutation_4)
|
||||
.produces(expexted_mutation_5)
|
||||
.produces_end_of_stream();
|
||||
|
||||
assert_that(std::move(incremental_reader))
|
||||
.produces(expexted_mutation_0)
|
||||
.produces(expexted_mutation_1)
|
||||
.produces(expexted_mutation_2)
|
||||
.produces(expexted_mutation_3)
|
||||
.produces(expexted_mutation_4)
|
||||
.produces(expexted_mutation_5)
|
||||
.produces_end_of_stream();
|
||||
}
|
||||
|
||||
static mutation make_mutation_with_key(simple_schema& s, dht::decorated_key dk) {
|
||||
@@ -541,17 +516,17 @@ static mutation make_mutation_with_key(simple_schema& s, dht::decorated_key dk)
|
||||
}
|
||||
|
||||
class dummy_incremental_selector : public reader_selector {
|
||||
// To back _selector_position.
|
||||
dht::ring_position _position;
|
||||
std::vector<std::vector<mutation>> _readers_mutations;
|
||||
streamed_mutation::forwarding _fwd;
|
||||
dht::partition_range _pr;
|
||||
|
||||
const dht::token& position() const {
|
||||
return _readers_mutations.back().front().token();
|
||||
}
|
||||
flat_mutation_reader pop_reader() {
|
||||
auto muts = std::move(_readers_mutations.back());
|
||||
_readers_mutations.pop_back();
|
||||
_selector_position = _readers_mutations.empty() ? dht::ring_position::max() : dht::ring_position::starting_at(position());
|
||||
_position = _readers_mutations.empty() ? dht::ring_position::max() : _readers_mutations.back().front().decorated_key();
|
||||
_selector_position = _position;
|
||||
return flat_mutation_reader_from_mutations(std::move(muts), _pr, _fwd);
|
||||
}
|
||||
public:
|
||||
@@ -563,33 +538,34 @@ public:
|
||||
std::vector<std::vector<mutation>> reader_mutations,
|
||||
dht::partition_range pr = query::full_partition_range,
|
||||
streamed_mutation::forwarding fwd = streamed_mutation::forwarding::no)
|
||||
: reader_selector(s, dht::ring_position::min())
|
||||
: reader_selector(s, dht::ring_position_view::min())
|
||||
, _position(dht::ring_position::min())
|
||||
, _readers_mutations(std::move(reader_mutations))
|
||||
, _fwd(fwd)
|
||||
, _pr(std::move(pr)) {
|
||||
// So we can pop the next reader off the back
|
||||
boost::reverse(_readers_mutations);
|
||||
_selector_position = dht::ring_position::starting_at(position());
|
||||
}
|
||||
virtual std::vector<flat_mutation_reader> create_new_readers(const dht::token* const t) override {
|
||||
virtual std::vector<flat_mutation_reader> create_new_readers(const std::optional<dht::ring_position_view>& pos) override {
|
||||
if (_readers_mutations.empty()) {
|
||||
return {};
|
||||
}
|
||||
|
||||
std::vector<flat_mutation_reader> readers;
|
||||
|
||||
if (!t) {
|
||||
if (!pos) {
|
||||
readers.emplace_back(pop_reader());
|
||||
return readers;
|
||||
}
|
||||
|
||||
while (!_readers_mutations.empty() && *t >= _selector_position.token()) {
|
||||
while (!_readers_mutations.empty() && dht::ring_position_tri_compare(*_s, _selector_position, *pos) <= 0) {
|
||||
readers.emplace_back(pop_reader());
|
||||
}
|
||||
return readers;
|
||||
}
|
||||
virtual std::vector<flat_mutation_reader> fast_forward_to(const dht::partition_range& pr, db::timeout_clock::time_point timeout) override {
|
||||
return create_new_readers(&pr.start()->value().token());
|
||||
_pr = pr;
|
||||
return create_new_readers(dht::ring_position_view::for_range_start(_pr));
|
||||
}
|
||||
};
|
||||
|
||||
@@ -600,6 +576,10 @@ SEASTAR_TEST_CASE(reader_selector_gap_between_readers_test) {
|
||||
simple_schema s;
|
||||
auto pkeys = s.make_pkeys(3);
|
||||
|
||||
boost::sort(pkeys, [&s] (const dht::decorated_key& a, const dht::decorated_key& b) {
|
||||
return a.less_compare(*s.schema(), b);
|
||||
});
|
||||
|
||||
auto mut1 = make_mutation_with_key(s, pkeys[0]);
|
||||
auto mut2a = make_mutation_with_key(s, pkeys[1]);
|
||||
auto mut2b = make_mutation_with_key(s, pkeys[1]);
|
||||
@@ -629,7 +609,11 @@ SEASTAR_TEST_CASE(reader_selector_overlapping_readers_test) {
|
||||
storage_service_for_tests ssft;
|
||||
|
||||
simple_schema s;
|
||||
auto pkeys = s.make_pkeys(3);
|
||||
auto pkeys = s.make_pkeys(4);
|
||||
|
||||
boost::sort(pkeys, [&s] (const dht::decorated_key& a, const dht::decorated_key& b) {
|
||||
return a.less_compare(*s.schema(), b);
|
||||
});
|
||||
|
||||
auto mut1 = make_mutation_with_key(s, pkeys[0]);
|
||||
auto mut2a = make_mutation_with_key(s, pkeys[1]);
|
||||
@@ -637,14 +621,27 @@ SEASTAR_TEST_CASE(reader_selector_overlapping_readers_test) {
|
||||
auto mut3a = make_mutation_with_key(s, pkeys[2]);
|
||||
auto mut3b = make_mutation_with_key(s, pkeys[2]);
|
||||
auto mut3c = make_mutation_with_key(s, pkeys[2]);
|
||||
auto mut4a = make_mutation_with_key(s, pkeys[3]);
|
||||
auto mut4b = make_mutation_with_key(s, pkeys[3]);
|
||||
|
||||
tombstone tomb(100, {});
|
||||
mut2b.partition().apply(tomb);
|
||||
|
||||
s.add_row(mut2a, s.make_ckey(1), "a");
|
||||
s.add_row(mut2b, s.make_ckey(2), "b");
|
||||
|
||||
s.add_row(mut3a, s.make_ckey(1), "a");
|
||||
s.add_row(mut3b, s.make_ckey(2), "b");
|
||||
s.add_row(mut3c, s.make_ckey(3), "c");
|
||||
|
||||
s.add_row(mut4a, s.make_ckey(1), "a");
|
||||
s.add_row(mut4b, s.make_ckey(2), "b");
|
||||
|
||||
std::vector<std::vector<mutation>> readers_mutations{
|
||||
{mut1, mut2a, mut3a},
|
||||
{mut2b, mut3b},
|
||||
{mut3c}
|
||||
{mut3c, mut4a},
|
||||
{mut4b},
|
||||
};
|
||||
|
||||
auto reader = make_combined_reader(s.schema(),
|
||||
@@ -656,6 +653,7 @@ SEASTAR_TEST_CASE(reader_selector_overlapping_readers_test) {
|
||||
.produces_partition(mut1)
|
||||
.produces_partition(mut2a + mut2b)
|
||||
.produces_partition(mut3a + mut3b + mut3c)
|
||||
.produces_partition(mut4a + mut4b)
|
||||
.produces_end_of_stream();
|
||||
});
|
||||
}
|
||||
@@ -667,6 +665,10 @@ SEASTAR_TEST_CASE(reader_selector_fast_forwarding_test) {
|
||||
simple_schema s;
|
||||
auto pkeys = s.make_pkeys(5);
|
||||
|
||||
boost::sort(pkeys, [&s] (const dht::decorated_key& a, const dht::decorated_key& b) {
|
||||
return a.less_compare(*s.schema(), b);
|
||||
});
|
||||
|
||||
auto mut1a = make_mutation_with_key(s, pkeys[0]);
|
||||
auto mut1b = make_mutation_with_key(s, pkeys[0]);
|
||||
auto mut2a = make_mutation_with_key(s, pkeys[1]);
|
||||
@@ -1387,6 +1389,11 @@ SEASTAR_TEST_CASE(test_combined_mutation_source_is_a_mutation_source) {
|
||||
|
||||
// Best run with SMP >= 2
|
||||
SEASTAR_THREAD_TEST_CASE(test_foreign_reader_as_mutation_source) {
|
||||
if (smp::count < 2) {
|
||||
std::cerr << "Cannot run test " << get_name() << " with smp::count < 2" << std::endl;
|
||||
return;
|
||||
}
|
||||
|
||||
do_with_cql_env([] (cql_test_env& env) -> future<> {
|
||||
auto populate = [] (schema_ptr s, const std::vector<mutation>& mutations) {
|
||||
const auto remote_shard = (engine().cpu_id() + 1) % smp::count;
|
||||
@@ -1507,6 +1514,11 @@ dht::token dummy_partitioner::token_for_next_shard(const dht::token& t, shard_id
|
||||
|
||||
// Best run with SMP >= 2
|
||||
SEASTAR_THREAD_TEST_CASE(test_multishard_combining_reader_as_mutation_source) {
|
||||
if (smp::count < 2) {
|
||||
std::cerr << "Cannot run test " << get_name() << " with smp::count < 2" << std::endl;
|
||||
return;
|
||||
}
|
||||
|
||||
do_with_cql_env([] (cql_test_env& env) -> future<> {
|
||||
auto populate = [] (schema_ptr s, const std::vector<mutation>& mutations) {
|
||||
// We need to group mutations that have the same token so they land on the same shard.
|
||||
@@ -1570,6 +1582,11 @@ SEASTAR_THREAD_TEST_CASE(test_multishard_combining_reader_as_mutation_source) {
|
||||
|
||||
// Best run with SMP >= 3
|
||||
SEASTAR_THREAD_TEST_CASE(test_multishard_combining_reader_reading_empty_table) {
|
||||
if (smp::count < 3) {
|
||||
std::cerr << "Cannot run test " << get_name() << " with smp::count < 2" << std::endl;
|
||||
return;
|
||||
}
|
||||
|
||||
do_with_cql_env([] (cql_test_env& env) -> future<> {
|
||||
std::vector<bool> shards_touched(smp::count, false);
|
||||
simple_schema s;
|
||||
@@ -1909,6 +1926,11 @@ SEASTAR_THREAD_TEST_CASE(test_foreign_reader_destroyed_with_pending_read_ahead)
|
||||
//
|
||||
// Best run with smp >= 2
|
||||
SEASTAR_THREAD_TEST_CASE(test_multishard_combining_reader_destroyed_with_pending_read_ahead) {
|
||||
if (smp::count < 2) {
|
||||
std::cerr << "Cannot run test " << get_name() << " with smp::count < 2" << std::endl;
|
||||
return;
|
||||
}
|
||||
|
||||
do_with_cql_env([] (cql_test_env& env) -> future<> {
|
||||
auto remote_controls = std::vector<foreign_ptr<std::unique_ptr<puppet_reader::control>>>();
|
||||
remote_controls.reserve(smp::count);
|
||||
|
||||
@@ -659,6 +659,46 @@ void test_mutation_reader_fragments_have_monotonic_positions(populate_fn populat
|
||||
});
|
||||
}
|
||||
|
||||
static void test_date_tiered_clustering_slicing(populate_fn populate) {
|
||||
BOOST_TEST_MESSAGE(__PRETTY_FUNCTION__);
|
||||
|
||||
simple_schema ss;
|
||||
|
||||
auto s = schema_builder(ss.schema())
|
||||
.set_compaction_strategy(sstables::compaction_strategy_type::date_tiered)
|
||||
.build();
|
||||
|
||||
auto pkey = ss.make_pkey();
|
||||
|
||||
mutation m1(s, pkey);
|
||||
ss.add_static_row(m1, "s");
|
||||
m1.partition().apply(ss.new_tombstone());
|
||||
ss.add_row(m1, ss.make_ckey(0), "v1");
|
||||
|
||||
mutation_source ms = populate(s, {m1});
|
||||
|
||||
// query row outside the range of existing rows to exercise sstable clustering key filter
|
||||
{
|
||||
auto slice = partition_slice_builder(*s)
|
||||
.with_range(ss.make_ckey_range(1, 2))
|
||||
.build();
|
||||
auto prange = dht::partition_range::make_singular(pkey);
|
||||
assert_that(ms.make_reader(s, prange, slice))
|
||||
.produces(m1, slice.row_ranges(*s, pkey.key()))
|
||||
.produces_end_of_stream();
|
||||
}
|
||||
|
||||
{
|
||||
auto slice = partition_slice_builder(*s)
|
||||
.with_range(query::clustering_range::make_singular(ss.make_ckey(0)))
|
||||
.build();
|
||||
auto prange = dht::partition_range::make_singular(pkey);
|
||||
assert_that(ms.make_reader(s, prange, slice))
|
||||
.produces(m1)
|
||||
.produces_end_of_stream();
|
||||
}
|
||||
}
|
||||
|
||||
static void test_clustering_slices(populate_fn populate) {
|
||||
BOOST_TEST_MESSAGE(__PRETTY_FUNCTION__);
|
||||
auto s = schema_builder("ks", "cf")
|
||||
@@ -1012,6 +1052,7 @@ void test_slicing_with_overlapping_range_tombstones(populate_fn populate) {
|
||||
}
|
||||
|
||||
void run_mutation_reader_tests(populate_fn populate) {
|
||||
test_date_tiered_clustering_slicing(populate);
|
||||
test_fast_forwarding_across_partitions_to_empty_range(populate);
|
||||
test_clustering_slices(populate);
|
||||
test_mutation_reader_fragments_have_monotonic_positions(populate);
|
||||
@@ -1284,6 +1325,7 @@ bytes make_blob(size_t blob_size) {
|
||||
class random_mutation_generator::impl {
|
||||
friend class random_mutation_generator;
|
||||
generate_counters _generate_counters;
|
||||
local_shard_only _local_shard_only;
|
||||
const size_t _external_blob_size = 128; // Should be enough to force use of external bytes storage
|
||||
const size_t n_blobs = 1024;
|
||||
const column_id column_count = row::max_vector_size * 2;
|
||||
@@ -1327,7 +1369,7 @@ class random_mutation_generator::impl {
|
||||
: do_make_schema(bytes_type);
|
||||
}
|
||||
public:
|
||||
explicit impl(generate_counters counters) : _generate_counters(counters) {
|
||||
explicit impl(generate_counters counters, local_shard_only lso = local_shard_only::yes) : _generate_counters(counters), _local_shard_only(lso) {
|
||||
std::random_device rd;
|
||||
// In case of errors, replace the seed with a fixed value to get a deterministic run.
|
||||
auto seed = rd();
|
||||
@@ -1336,7 +1378,7 @@ public:
|
||||
|
||||
_schema = make_schema();
|
||||
|
||||
auto keys = make_local_keys(n_blobs, _schema, _external_blob_size);
|
||||
auto keys = _local_shard_only ? make_local_keys(n_blobs, _schema, _external_blob_size) : make_keys(n_blobs, _schema, _external_blob_size);
|
||||
_blobs = boost::copy_range<std::vector<bytes>>(keys | boost::adaptors::transformed([this] (sstring& k) { return to_bytes(k); }));
|
||||
}
|
||||
|
||||
@@ -1555,7 +1597,7 @@ public:
|
||||
}
|
||||
|
||||
std::vector<dht::decorated_key> make_partition_keys(size_t n) {
|
||||
auto local_keys = make_local_keys(n, _schema);
|
||||
auto local_keys = _local_shard_only ? make_local_keys(n, _schema) : make_keys(n, _schema);
|
||||
return boost::copy_range<std::vector<dht::decorated_key>>(local_keys | boost::adaptors::transformed([this] (sstring& key) {
|
||||
auto pkey = partition_key::from_single_value(*_schema, to_bytes(key));
|
||||
return dht::global_partitioner().decorate_key(*_schema, std::move(pkey));
|
||||
@@ -1575,8 +1617,8 @@ public:
|
||||
|
||||
random_mutation_generator::~random_mutation_generator() {}
|
||||
|
||||
random_mutation_generator::random_mutation_generator(generate_counters counters)
|
||||
: _impl(std::make_unique<random_mutation_generator::impl>(counters))
|
||||
random_mutation_generator::random_mutation_generator(generate_counters counters, local_shard_only lso)
|
||||
: _impl(std::make_unique<random_mutation_generator::impl>(counters, lso))
|
||||
{ }
|
||||
|
||||
mutation random_mutation_generator::operator()() {
|
||||
|
||||
@@ -22,6 +22,7 @@
#pragma once

#include "mutation_reader.hh"
#include "tests/sstable_utils.hh"

using populate_fn = std::function<mutation_source(schema_ptr s, const std::vector<mutation>&)>;

@@ -49,7 +50,7 @@ public:
struct generate_counters_tag { };
using generate_counters = bool_class<generate_counters_tag>;

explicit random_mutation_generator(generate_counters);
explicit random_mutation_generator(generate_counters, local_shard_only lso = local_shard_only::yes);
~random_mutation_generator();
mutation operator()();
// Generates n mutations sharing the same schema and sorted by their decorated keys.

@@ -53,7 +53,7 @@
#include "cell_locking.hh"
#include "flat_mutation_reader_assertions.hh"
#include "service/storage_proxy.hh"

#include "random-utils.hh"
#include "simple_schema.hh"

using namespace std::chrono_literals;
@@ -78,7 +78,7 @@ static atomic_cell make_atomic_cell(data_type dt, T value) {

template<typename T>
static atomic_cell make_collection_member(data_type dt, T value) {
return atomic_cell::make_live(*dt, 0, dt->decompose(std::move(value)));
return atomic_cell::make_live(*dt, 0, dt->decompose(std::move(value)), atomic_cell::collection_member::yes);
};

static mutation_partition get_partition(memtable& mt, const partition_key& key) {
@@ -1603,3 +1603,116 @@ SEASTAR_TEST_CASE(test_continuity_merging) {
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
class measuring_allocator final : public allocation_strategy {
|
||||
size_t _allocated_bytes;
|
||||
public:
|
||||
virtual void* alloc(migrate_fn mf, size_t size, size_t alignment) override {
|
||||
_allocated_bytes += size;
|
||||
return standard_allocator().alloc(mf, size, alignment);
|
||||
}
|
||||
virtual void free(void* ptr, size_t size) override {
|
||||
standard_allocator().free(ptr, size);
|
||||
}
|
||||
virtual void free(void* ptr) override {
|
||||
standard_allocator().free(ptr);
|
||||
}
|
||||
virtual size_t object_memory_size_in_allocator(const void* obj) const noexcept override {
|
||||
return standard_allocator().object_memory_size_in_allocator(obj);
|
||||
}
|
||||
size_t allocated_bytes() const { return _allocated_bytes; }
|
||||
};
|
||||
|
||||
SEASTAR_THREAD_TEST_CASE(test_external_memory_usage) {
|
||||
measuring_allocator alloc;
|
||||
auto s = simple_schema();
|
||||
|
||||
auto generate = [&s] {
|
||||
size_t data_size = 0;
|
||||
|
||||
auto m = mutation(s.schema(), s.make_pkey("pk"));
|
||||
|
||||
auto row_count = tests::random::get_int(1, 16);
|
||||
for (auto i = 0; i < row_count; i++) {
|
||||
auto ck_value = to_hex(tests::random::get_bytes(tests::random::get_int(1023) + 1));
|
||||
data_size += ck_value.size();
|
||||
auto ck = s.make_ckey(ck_value);
|
||||
|
||||
auto value = to_hex(tests::random::get_bytes(tests::random::get_int(128 * 1024)));
|
||||
data_size += value.size();
|
||||
s.add_row(m, ck, value);
|
||||
}
|
||||
|
||||
return std::pair(std::move(m), data_size);
|
||||
};
|
||||
|
||||
for (auto i = 0; i < 16; i++) {
|
||||
auto [ m, size ] = generate();
|
||||
|
||||
with_allocator(alloc, [&] {
|
||||
auto before = alloc.allocated_bytes();
|
||||
auto m2 = m;
|
||||
auto after = alloc.allocated_bytes();
|
||||
|
||||
BOOST_CHECK_EQUAL(m.partition().external_memory_usage(*s.schema()),
|
||||
m2.partition().external_memory_usage(*s.schema()));
|
||||
|
||||
BOOST_CHECK_GE(m.partition().external_memory_usage(*s.schema()), size);
|
||||
BOOST_CHECK_EQUAL(m.partition().external_memory_usage(*s.schema()), after - before);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
SEASTAR_THREAD_TEST_CASE(test_cell_external_memory_usage) {
|
||||
measuring_allocator alloc;
|
||||
|
||||
|
||||
auto test_live_atomic_cell = [&] (data_type dt, bytes_view bv) {
|
||||
with_allocator(alloc, [&] {
|
||||
auto before = alloc.allocated_bytes();
|
||||
auto ac = atomic_cell_or_collection(atomic_cell::make_live(*dt, 1, bv));
|
||||
auto after = alloc.allocated_bytes();
|
||||
BOOST_CHECK_GE(ac.external_memory_usage(*dt), bv.size());
|
||||
BOOST_CHECK_EQUAL(ac.external_memory_usage(*dt), after - before);
|
||||
});
|
||||
};
|
||||
|
||||
test_live_atomic_cell(int32_type, { });
|
||||
test_live_atomic_cell(int32_type, int32_type->decompose(int32_t(1)));
|
||||
|
||||
test_live_atomic_cell(bytes_type, { });
|
||||
test_live_atomic_cell(bytes_type, bytes(1, 'a'));
|
||||
test_live_atomic_cell(bytes_type, bytes(16, 'a'));
|
||||
test_live_atomic_cell(bytes_type, bytes(32, 'a'));
|
||||
test_live_atomic_cell(bytes_type, bytes(1024, 'a'));
|
||||
test_live_atomic_cell(bytes_type, bytes(64 * 1024 - 1, 'a'));
|
||||
test_live_atomic_cell(bytes_type, bytes(64 * 1024, 'a'));
|
||||
test_live_atomic_cell(bytes_type, bytes(64 * 1024 + 1, 'a'));
|
||||
test_live_atomic_cell(bytes_type, bytes(1024 * 1024, 'a'));
|
||||
|
||||
auto test_collection = [&] (bytes_view bv) {
|
||||
auto collection_type = map_type_impl::get_instance(int32_type, bytes_type, true);
|
||||
|
||||
auto m = make_collection_mutation({ }, int32_type->decompose(0), make_collection_member(bytes_type, data_value(bytes(bv))));
|
||||
auto cell = atomic_cell_or_collection(collection_type->serialize_mutation_form(m));
|
||||
|
||||
with_allocator(alloc, [&] {
|
||||
auto before = alloc.allocated_bytes();
|
||||
auto cell2 = cell.copy(*collection_type);
|
||||
auto after = alloc.allocated_bytes();
|
||||
BOOST_CHECK_GE(cell2.external_memory_usage(*collection_type), bv.size());
|
||||
BOOST_CHECK_EQUAL(cell2.external_memory_usage(*collection_type), cell.external_memory_usage(*collection_type));
|
||||
BOOST_CHECK_EQUAL(cell2.external_memory_usage(*collection_type), after - before);
|
||||
});
|
||||
};
|
||||
|
||||
test_collection({ });
|
||||
test_collection(bytes(1, 'a'));
|
||||
test_collection(bytes(16, 'a'));
|
||||
test_collection(bytes(32, 'a'));
|
||||
test_collection(bytes(1024, 'a'));
|
||||
test_collection(bytes(64 * 1024 - 1, 'a'));
|
||||
test_collection(bytes(64 * 1024, 'a'));
|
||||
test_collection(bytes(64 * 1024 + 1, 'a'));
|
||||
test_collection(bytes(1024 * 1024, 'a'));
|
||||
}
|
||||
|
||||
@@ -29,6 +29,7 @@

#include "partition_version.hh"
#include "partition_snapshot_row_cursor.hh"
#include "partition_snapshot_reader.hh"

#include "tests/test-utils.hh"
#include "tests/mutation_assertions.hh"
@@ -169,7 +170,7 @@ public:
partition_snapshot::phase_type phase() const { return _phase; }
real_dirty_memory_accounter& accounter() { return _acc; }

mutation_partition squashed(lw_shared_ptr<partition_snapshot>& snp) {
mutation_partition squashed(partition_snapshot_ptr& snp) {
logalloc::allocating_section as;
return as(_tracker.region(), [&] {
return snp->squashed();
@@ -220,7 +221,7 @@ public:
});
}

lw_shared_ptr<partition_snapshot> read() {
partition_snapshot_ptr read() {
logalloc::allocating_section as;
return as(region(), [&] {
return _e.read(region(), _container.cleaner(), schema(), &_container.tracker(), _container.phase());
@@ -466,7 +467,7 @@ SEASTAR_TEST_CASE(test_apply_to_incomplete_respects_continuity) {
m1.partition().make_fully_continuous();
e += m1;

lw_shared_ptr<partition_snapshot> snap;
partition_snapshot_ptr snap;
if (with_active_reader) {
snap = e.read();
}
@@ -869,12 +870,11 @@ SEASTAR_TEST_CASE(test_versions_are_merged_when_snapshots_go_away) {

auto snap2 = e.read(r, cleaner, s, nullptr);

snap1->merge_partition_versions();
snap1 = {};

snap2->merge_partition_versions();
snap2 = {};

cleaner.drain().get();

BOOST_REQUIRE_EQUAL(1, boost::size(e.versions()));
assert_that(s, e.squashed(*s)).is_equal_to((m1 + m2).partition());
}
@@ -890,12 +890,11 @@ SEASTAR_TEST_CASE(test_versions_are_merged_when_snapshots_go_away) {

auto snap2 = e.read(r, cleaner, s, nullptr);

snap2->merge_partition_versions();
snap2 = {};

snap1->merge_partition_versions();
snap1 = {};

cleaner.drain().get();

BOOST_REQUIRE_EQUAL(1, boost::size(e.versions()));
assert_that(s, e.squashed(*s)).is_equal_to((m1 + m2).partition());
}

@@ -148,11 +148,27 @@ void run_test(const sstring& name, schema_ptr s, MutationGenerator&& gen) {
float((prev_compacted - prefill_compacted)) / (prev_allocated - prefill_allocated)
);

// Create a reader which tests the case of memtable snapshots
// going away after memtable was merged to cache.
auto rd = std::make_unique<flat_mutation_reader>(
make_combined_reader(s, cache.make_reader(s), mt->make_flat_reader(s)));
rd->set_max_buffer_size(1);
rd->fill_buffer().get();

scheduling_latency_measurer slm;
slm.start();
auto d = duration_in_seconds([&] {
cache.update([] {}, *mt).get();
});

rd->set_max_buffer_size(1024*1024);
rd->consume_pausable([] (mutation_fragment) {
return stop_iteration::no;
}).get();

mt = {};
rd = {};

slm.stop();

auto compacted = logalloc::memory_compacted() - prev_compacted;
@@ -282,7 +298,9 @@ int main(int argc, char** argv) {
test_small_partitions();
test_partition_with_few_small_rows();
test_partition_with_lots_of_small_rows();
test_partition_with_lots_of_range_tombstones();
// Takes a huge amount of time due to https://github.com/scylladb/scylla/issues/2581#issuecomment-398030186,
// disable until fixed.
// test_partition_with_lots_of_range_tombstones();
});
});
}

Some files were not shown because too many files have changed in this diff.