Compare commits
156 Commits
debug_form
...
scylla-1.0
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
6c4d7223fe | ||
|
|
c1a5488993 | ||
|
|
9c9f62e30b | ||
|
|
c147676ccb | ||
|
|
07adedf28a | ||
|
|
8ca530b6d3 | ||
|
|
e5a123ea80 | ||
|
|
9bfce3255a | ||
|
|
d2251199b2 | ||
|
|
bed6437b38 | ||
|
|
70508734a5 | ||
|
|
60307f62fe | ||
|
|
8006a15e3b | ||
|
|
1cfbc29f01 | ||
|
|
c665455b71 | ||
|
|
b09c91d1c8 | ||
|
|
776ae831e6 | ||
|
|
2ad3c7532f | ||
|
|
91c35c3e19 | ||
|
|
4f0cc195dc | ||
|
|
c9f7986be4 | ||
|
|
4feaf1372b | ||
|
|
3ebfecc88e | ||
|
|
c841d87fe3 | ||
|
|
7a887ea2ea | ||
|
|
bc4d63c802 | ||
|
|
616196b543 | ||
|
|
a04f462904 | ||
|
|
ebf8fb802e | ||
|
|
8d1374e911 | ||
|
|
bacc769328 | ||
|
|
241eb9e199 | ||
|
|
58fdfe5bc9 | ||
|
|
f45cc1b229 | ||
|
|
14f9eeaafd | ||
|
|
05df90ad4b | ||
|
|
5646faba18 | ||
|
|
814df06245 | ||
|
|
5ac9e2501c | ||
|
|
34ddfb4498 | ||
|
|
e4d4d0b31c | ||
|
|
4125f279c0 | ||
|
|
e276e7b1e3 | ||
|
|
a516b24111 | ||
|
|
4642c706c1 | ||
|
|
4666c095bc | ||
|
|
507e6ec75a | ||
|
|
29d6952ddd | ||
|
|
9fae641099 | ||
|
|
ccd1fe4348 | ||
|
|
7b88ba8882 | ||
|
|
46825a5e07 | ||
|
|
740d98901f | ||
|
|
ceff8b9b41 | ||
|
|
1b2dbcc26e | ||
|
|
75b2db7862 | ||
|
|
789c1297dd | ||
|
|
afeaaab034 | ||
|
|
80242ff443 | ||
|
|
0b456578c0 | ||
|
|
3b5a55c6fc | ||
|
|
4f1d37c3c9 | ||
|
|
8422a42381 | ||
|
|
c0f31fac48 | ||
|
|
6fe88a663f | ||
|
|
5f76f3d445 | ||
|
|
6676d126aa | ||
|
|
38343ccbfe | ||
|
|
f1272933fd | ||
|
|
ccd623aa87 | ||
|
|
8176fa8379 | ||
|
|
d03910f46d | ||
|
|
0c75700d8c | ||
|
|
478975b3fa | ||
|
|
5ce76258c8 | ||
|
|
4cf8791d56 | ||
|
|
ccd51010f1 | ||
|
|
8e78cbfc2d | ||
|
|
c6c176b1be | ||
|
|
9795edbe04 | ||
|
|
1539c8b136 | ||
|
|
0396a94eaf | ||
|
|
3c40c1be71 | ||
|
|
de969a5d6f | ||
|
|
0ade2894f7 | ||
|
|
6b36315040 | ||
|
|
edc5f8f2f7 | ||
|
|
066149ad46 | ||
|
|
1f07468195 | ||
|
|
0577ae5a61 | ||
|
|
054cf13cd0 | ||
|
|
71446edc97 | ||
|
|
c1d8a62b5b | ||
|
|
a3baef6b45 | ||
|
|
feaba177e2 | ||
|
|
83a289bdcd | ||
|
|
382e7e63b3 | ||
|
|
deeed904f4 | ||
|
|
d927053b3b | ||
|
|
8b8923b5af | ||
|
|
48ec129595 | ||
|
|
a4757a6737 | ||
|
|
223b73849d | ||
|
|
ba4b1eac45 | ||
|
|
9cf5fabfdf | ||
|
|
5723c664ad | ||
|
|
9635a83edd | ||
|
|
24c68e48a5 | ||
|
|
80cb0a28e1 | ||
|
|
95a9f66b75 | ||
|
|
58448d4b05 | ||
|
|
0a4d0e95f2 | ||
|
|
2c73e1c2e8 | ||
|
|
0ebd1ae62a | ||
|
|
14f616de3f | ||
|
|
827c0f68c3 | ||
|
|
e3607a4c16 | ||
|
|
59270c6d00 | ||
|
|
3be5d3a7c9 | ||
|
|
cd6697b506 | ||
|
|
acc9849e2b | ||
|
|
a445f6a7be | ||
|
|
88ed9c53a6 | ||
|
|
50f98ff90a | ||
|
|
30ffb2917f | ||
|
|
6ef8b45bf4 | ||
|
|
144829606a | ||
|
|
2eb54bb068 | ||
|
|
a133e48515 | ||
|
|
5db0049d99 | ||
|
|
ac80445bd9 | ||
|
|
0c3ffba5c8 | ||
|
|
7ca3d22c7d | ||
|
|
9b1d2dad89 | ||
|
|
7e6a7a6cb5 | ||
|
|
ec7f637384 | ||
|
|
eecfb2e4ef | ||
|
|
1f6476351a | ||
|
|
0d95dd310a | ||
|
|
80d2b72068 | ||
|
|
ac95f04ff9 | ||
|
|
08a8a4a1b4 | ||
|
|
b7e9924299 | ||
|
|
19ed269cc7 | ||
|
|
a223450a56 | ||
|
|
8f4800b30e | ||
|
|
7d13d115c6 | ||
|
|
c9c52235a1 | ||
|
|
52eeab089c | ||
|
|
49af399a2e | ||
|
|
d915370e3f | ||
|
|
a6d5e67923 | ||
|
|
f885750f90 | ||
|
|
36f55e409d | ||
|
|
c436fb5892 | ||
|
|
950bcd3e38 |
2
.gitmodules
vendored
2
.gitmodules
vendored
@@ -1,6 +1,6 @@
|
||||
[submodule "seastar"]
|
||||
path = seastar
|
||||
url = ../seastar
|
||||
url = ../scylla-seastar
|
||||
ignore = dirty
|
||||
[submodule "swagger-ui"]
|
||||
path = swagger-ui
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
#!/bin/sh
|
||||
|
||||
VERSION=666.development
|
||||
VERSION=1.0.3
|
||||
|
||||
if test -f version
|
||||
then
|
||||
|
||||
@@ -54,9 +54,9 @@ class atomic_cell_or_collection;
|
||||
*/
|
||||
class atomic_cell_type final {
|
||||
private:
|
||||
static constexpr int8_t DEAD_FLAGS = 0;
|
||||
static constexpr int8_t LIVE_FLAG = 0x01;
|
||||
static constexpr int8_t EXPIRY_FLAG = 0x02; // When present, expiry field is present. Set only for live cells
|
||||
static constexpr int8_t REVERT_FLAG = 0x04; // transient flag used to efficiently implement ReversiblyMergeable for atomic cells.
|
||||
static constexpr unsigned flags_size = 1;
|
||||
static constexpr unsigned timestamp_offset = flags_size;
|
||||
static constexpr unsigned timestamp_size = 8;
|
||||
@@ -67,14 +67,21 @@ private:
|
||||
static constexpr unsigned ttl_offset = expiry_offset + expiry_size;
|
||||
static constexpr unsigned ttl_size = 4;
|
||||
private:
|
||||
static bool is_revert_set(bytes_view cell) {
|
||||
return cell[0] & REVERT_FLAG;
|
||||
}
|
||||
template<typename BytesContainer>
|
||||
static void set_revert(BytesContainer& cell, bool revert) {
|
||||
cell[0] = (cell[0] & ~REVERT_FLAG) | (revert * REVERT_FLAG);
|
||||
}
|
||||
static bool is_live(const bytes_view& cell) {
|
||||
return cell[0] != DEAD_FLAGS;
|
||||
return cell[0] & LIVE_FLAG;
|
||||
}
|
||||
static bool is_live_and_has_ttl(const bytes_view& cell) {
|
||||
return cell[0] & EXPIRY_FLAG;
|
||||
}
|
||||
static bool is_dead(const bytes_view& cell) {
|
||||
return cell[0] == DEAD_FLAGS;
|
||||
return !is_live(cell);
|
||||
}
|
||||
// Can be called on live and dead cells
|
||||
static api::timestamp_type timestamp(const bytes_view& cell) {
|
||||
@@ -106,7 +113,7 @@ private:
|
||||
}
|
||||
static managed_bytes make_dead(api::timestamp_type timestamp, gc_clock::time_point deletion_time) {
|
||||
managed_bytes b(managed_bytes::initialized_later(), flags_size + timestamp_size + deletion_time_size);
|
||||
b[0] = DEAD_FLAGS;
|
||||
b[0] = 0;
|
||||
set_field(b, timestamp_offset, timestamp);
|
||||
set_field(b, deletion_time_offset, deletion_time.time_since_epoch().count());
|
||||
return b;
|
||||
@@ -140,8 +147,11 @@ protected:
|
||||
ByteContainer _data;
|
||||
protected:
|
||||
atomic_cell_base(ByteContainer&& data) : _data(std::forward<ByteContainer>(data)) { }
|
||||
atomic_cell_base(const ByteContainer& data) : _data(data) { }
|
||||
friend class atomic_cell_or_collection;
|
||||
public:
|
||||
bool is_revert_set() const {
|
||||
return atomic_cell_type::is_revert_set(_data);
|
||||
}
|
||||
bool is_live() const {
|
||||
return atomic_cell_type::is_live(_data);
|
||||
}
|
||||
@@ -187,10 +197,13 @@ public:
|
||||
bytes_view serialize() const {
|
||||
return _data;
|
||||
}
|
||||
void set_revert(bool revert) {
|
||||
atomic_cell_type::set_revert(_data, revert);
|
||||
}
|
||||
};
|
||||
|
||||
class atomic_cell_view final : public atomic_cell_base<bytes_view> {
|
||||
atomic_cell_view(bytes_view data) : atomic_cell_base(data) {}
|
||||
atomic_cell_view(bytes_view data) : atomic_cell_base(std::move(data)) {}
|
||||
public:
|
||||
static atomic_cell_view from_bytes(bytes_view data) { return atomic_cell_view(data); }
|
||||
|
||||
@@ -198,6 +211,11 @@ public:
|
||||
friend std::ostream& operator<<(std::ostream& os, const atomic_cell_view& acv);
|
||||
};
|
||||
|
||||
class atomic_cell_ref final : public atomic_cell_base<managed_bytes&> {
|
||||
public:
|
||||
atomic_cell_ref(managed_bytes& buf) : atomic_cell_base(buf) {}
|
||||
};
|
||||
|
||||
class atomic_cell final : public atomic_cell_base<managed_bytes> {
|
||||
atomic_cell(managed_bytes b) : atomic_cell_base(std::move(b)) {}
|
||||
public:
|
||||
|
||||
@@ -57,3 +57,19 @@ struct appending_hash<atomic_cell_view> {
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template<>
|
||||
struct appending_hash<atomic_cell> {
|
||||
template<typename Hasher>
|
||||
void operator()(Hasher& h, const atomic_cell& cell) const {
|
||||
feed_hash(h, static_cast<atomic_cell_view>(cell));
|
||||
}
|
||||
};
|
||||
|
||||
template<>
|
||||
struct appending_hash<collection_mutation> {
|
||||
template<typename Hasher>
|
||||
void operator()(Hasher& h, const collection_mutation& cm) const {
|
||||
feed_hash(h, static_cast<collection_mutation_view>(cm));
|
||||
}
|
||||
};
|
||||
|
||||
@@ -27,6 +27,8 @@
|
||||
|
||||
// A variant type that can hold either an atomic_cell, or a serialized collection.
|
||||
// Which type is stored is determined by the schema.
|
||||
// Has an "empty" state.
|
||||
// Objects moved-from are left in an empty state.
|
||||
class atomic_cell_or_collection final {
|
||||
managed_bytes _data;
|
||||
private:
|
||||
@@ -36,6 +38,7 @@ public:
|
||||
atomic_cell_or_collection(atomic_cell ac) : _data(std::move(ac._data)) {}
|
||||
static atomic_cell_or_collection from_atomic_cell(atomic_cell data) { return { std::move(data._data) }; }
|
||||
atomic_cell_view as_atomic_cell() const { return atomic_cell_view::from_bytes(_data); }
|
||||
atomic_cell_ref as_atomic_cell_ref() { return { _data }; }
|
||||
atomic_cell_or_collection(collection_mutation cm) : _data(std::move(cm.data)) {}
|
||||
explicit operator bool() const {
|
||||
return !_data.empty();
|
||||
|
||||
16
configure.py
16
configure.py
@@ -845,8 +845,8 @@ with open(buildfile, 'w') as f:
|
||||
for obj in compiles:
|
||||
src = compiles[obj]
|
||||
gen_headers = list(ragels.keys())
|
||||
gen_headers += ['seastar/build/{}/http/request_parser.hh'.format(mode)]
|
||||
gen_headers += ['seastar/build/{}/http/http_response_parser.hh'.format(mode)]
|
||||
gen_headers += ['seastar/build/{}/gen/http/request_parser.hh'.format(mode)]
|
||||
gen_headers += ['seastar/build/{}/gen/http/http_response_parser.hh'.format(mode)]
|
||||
for th in thrifts:
|
||||
gen_headers += th.headers('$builddir/{}/gen'.format(mode))
|
||||
for g in antlr3_grammars:
|
||||
@@ -878,10 +878,10 @@ with open(buildfile, 'w') as f:
|
||||
for cc in grammar.sources('$builddir/{}/gen'.format(mode)):
|
||||
obj = cc.replace('.cpp', '.o')
|
||||
f.write('build {}: cxx.{} {} || {}\n'.format(obj, mode, cc, ' '.join(serializers)))
|
||||
f.write('build seastar/build/{mode}/libseastar.a seastar/build/{mode}/apps/iotune/iotune: ninja {seastar_deps}\n'
|
||||
f.write('build seastar/build/{mode}/libseastar.a seastar/build/{mode}/apps/iotune/iotune seastar/build/{mode}/gen/http/request_parser.hh seastar/build/{mode}/gen/http/http_response_parser.hh: ninja {seastar_deps}\n'
|
||||
.format(**locals()))
|
||||
f.write(' subdir = seastar\n')
|
||||
f.write(' target = build/{mode}/libseastar.a build/{mode}/apps/iotune/iotune\n'.format(**locals()))
|
||||
f.write(' target = build/{mode}/libseastar.a build/{mode}/apps/iotune/iotune build/{mode}/gen/http/request_parser.hh build/{mode}/gen/http/http_response_parser.hh\n'.format(**locals()))
|
||||
f.write(textwrap.dedent('''\
|
||||
build build/{mode}/iotune: copy seastar/build/{mode}/apps/iotune/iotune
|
||||
''').format(**locals()))
|
||||
@@ -895,14 +895,6 @@ with open(buildfile, 'w') as f:
|
||||
command = find -name '*.[chS]' -o -name "*.cc" -o -name "*.hh" | cscope -bq -i-
|
||||
description = CSCOPE
|
||||
build cscope: cscope
|
||||
rule request_parser_hh
|
||||
command = {ninja} -C seastar build/release/gen/http/request_parser.hh build/debug/gen/http/request_parser.hh
|
||||
description = GEN seastar/http/request_parser.hh
|
||||
build seastar/build/release/http/request_parser.hh seastar/build/debug/http/request_parser.hh: request_parser_hh
|
||||
rule http_response_parser_hh
|
||||
command = {ninja} -C seastar build/release/gen/http/http_response_parser.hh build/debug/gen/http/http_response_parser.hh
|
||||
description = GEN seastar/http/http_response_parser.hh
|
||||
build seastar/build/release/http/http_response_parser.hh seastar/build/debug/http/http_response_parser.hh: http_response_parser_hh
|
||||
rule clean
|
||||
command = rm -rf build
|
||||
description = CLEAN
|
||||
|
||||
@@ -52,6 +52,11 @@ selectable::writetime_or_ttl::new_selector_factory(database& db, schema_ptr s, s
|
||||
return writetime_or_ttl_selector::new_factory(def->name_as_text(), add_and_get_index(*def, defs), _is_writetime);
|
||||
}
|
||||
|
||||
sstring
|
||||
selectable::writetime_or_ttl::to_string() const {
|
||||
return sprint("%s(%s)", _is_writetime ? "writetime" : "ttl", _id->to_string());
|
||||
}
|
||||
|
||||
shared_ptr<selectable>
|
||||
selectable::writetime_or_ttl::raw::prepare(schema_ptr s) {
|
||||
return make_shared<writetime_or_ttl>(_id->prepare_column_identifier(s), _is_writetime);
|
||||
@@ -78,6 +83,11 @@ selectable::with_function::new_selector_factory(database& db, schema_ptr s, std:
|
||||
return abstract_function_selector::new_factory(std::move(fun), std::move(factories));
|
||||
}
|
||||
|
||||
sstring
|
||||
selectable::with_function::to_string() const {
|
||||
return sprint("%s(%s)", _function_name.name, join(", ", _args));
|
||||
}
|
||||
|
||||
shared_ptr<selectable>
|
||||
selectable::with_function::raw::prepare(schema_ptr s) {
|
||||
std::vector<shared_ptr<selectable>> prepared_args;
|
||||
@@ -101,7 +111,7 @@ selectable::with_field_selection::new_selector_factory(database& db, schema_ptr
|
||||
if (!ut) {
|
||||
throw exceptions::invalid_request_exception(
|
||||
sprint("Invalid field selection: %s of type %s is not a user type",
|
||||
"FIXME: selectable" /* FIMXME: _selected */, ut->as_cql3_type()));
|
||||
_selected->to_string(), factory->new_instance()->get_type()->as_cql3_type()));
|
||||
}
|
||||
for (size_t i = 0; i < ut->size(); ++i) {
|
||||
if (ut->field_name(i) != _field->bytes_) {
|
||||
@@ -110,7 +120,12 @@ selectable::with_field_selection::new_selector_factory(database& db, schema_ptr
|
||||
return field_selector::new_factory(std::move(ut), i, std::move(factory));
|
||||
}
|
||||
throw exceptions::invalid_request_exception(sprint("%s of type %s has no field %s",
|
||||
"FIXME: selectable" /* FIXME: _selected */, ut->as_cql3_type(), _field));
|
||||
_selected->to_string(), ut->as_cql3_type(), _field));
|
||||
}
|
||||
|
||||
sstring
|
||||
selectable::with_field_selection::to_string() const {
|
||||
return sprint("%s.%s", _selected->to_string(), _field->to_string());
|
||||
}
|
||||
|
||||
shared_ptr<selectable>
|
||||
@@ -126,6 +141,10 @@ selectable::with_field_selection::raw::processes_selection() const {
|
||||
return true;
|
||||
}
|
||||
|
||||
std::ostream & operator<<(std::ostream &os, const selectable& s) {
|
||||
return os << s.to_string();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -55,6 +55,7 @@ class selectable {
|
||||
public:
|
||||
virtual ~selectable() {}
|
||||
virtual ::shared_ptr<selector::factory> new_selector_factory(database& db, schema_ptr schema, std::vector<const column_definition*>& defs) = 0;
|
||||
virtual sstring to_string() const = 0;
|
||||
protected:
|
||||
static size_t add_and_get_index(const column_definition& def, std::vector<const column_definition*>& defs) {
|
||||
auto i = std::find(defs.begin(), defs.end(), &def);
|
||||
@@ -84,6 +85,8 @@ public:
|
||||
class with_field_selection;
|
||||
};
|
||||
|
||||
std::ostream & operator<<(std::ostream &os, const selectable& s);
|
||||
|
||||
class selectable::with_function : public selectable {
|
||||
functions::function_name _function_name;
|
||||
std::vector<shared_ptr<selectable>> _args;
|
||||
@@ -92,17 +95,7 @@ public:
|
||||
: _function_name(std::move(fname)), _args(std::move(args)) {
|
||||
}
|
||||
|
||||
#if 0
|
||||
@Override
|
||||
public String toString()
|
||||
{
|
||||
return new StrBuilder().append(functionName)
|
||||
.append("(")
|
||||
.appendWithSeparators(args, ", ")
|
||||
.append(")")
|
||||
.toString();
|
||||
}
|
||||
#endif
|
||||
virtual sstring to_string() const override;
|
||||
|
||||
virtual shared_ptr<selector::factory> new_selector_factory(database& db, schema_ptr s, std::vector<const column_definition*>& defs) override;
|
||||
class raw : public selectable::raw {
|
||||
|
||||
@@ -59,13 +59,7 @@ public:
|
||||
: _selected(std::move(selected)), _field(std::move(field)) {
|
||||
}
|
||||
|
||||
#if 0
|
||||
@Override
|
||||
public String toString()
|
||||
{
|
||||
return String.format("%s.%s", selected, field);
|
||||
}
|
||||
#endif
|
||||
virtual sstring to_string() const override;
|
||||
|
||||
virtual shared_ptr<selector::factory> new_selector_factory(database& db, schema_ptr s, std::vector<const column_definition*>& defs) override;
|
||||
|
||||
|
||||
@@ -58,13 +58,7 @@ public:
|
||||
: _id(std::move(id)), _is_writetime(is_writetime) {
|
||||
}
|
||||
|
||||
#if 0
|
||||
@Override
|
||||
public String toString()
|
||||
{
|
||||
return (isWritetime ? "writetime" : "ttl") + "(" + id + ")";
|
||||
}
|
||||
#endif
|
||||
virtual sstring to_string() const override;
|
||||
|
||||
virtual shared_ptr<selector::factory> new_selector_factory(database& db, schema_ptr s, std::vector<const column_definition*>& defs) override;
|
||||
|
||||
|
||||
@@ -169,26 +169,21 @@ public:
|
||||
}
|
||||
private:
|
||||
future<std::vector<mutation>> get_mutations(distributed<service::storage_proxy>& storage, const query_options& options, bool local, api::timestamp_type now) {
|
||||
struct collector {
|
||||
std::vector<mutation> _result;
|
||||
std::vector<mutation> get() && { return std::move(_result); }
|
||||
void operator()(std::vector<mutation> more) {
|
||||
std::move(more.begin(), more.end(), std::back_inserter(_result));
|
||||
}
|
||||
};
|
||||
auto get_mutations_for_statement = [this, &storage, &options, now, local] (size_t i) {
|
||||
auto&& statement = _statements[i];
|
||||
auto&& statement_options = options.for_statement(i);
|
||||
auto timestamp = _attrs->get_timestamp(now, statement_options);
|
||||
return statement->get_mutations(storage, statement_options, local, timestamp);
|
||||
};
|
||||
// FIXME: origin tries hard to merge mutations to same keyspace, for
|
||||
// some reason.
|
||||
return map_reduce(
|
||||
boost::make_counting_iterator<size_t>(0),
|
||||
boost::make_counting_iterator<size_t>(_statements.size()),
|
||||
get_mutations_for_statement,
|
||||
collector());
|
||||
// Do not process in parallel because operations like list append/prepend depend on execution order.
|
||||
return do_with(std::vector<mutation>(), [this, &storage, &options, now, local] (auto&& result) {
|
||||
return do_for_each(boost::make_counting_iterator<size_t>(0),
|
||||
boost::make_counting_iterator<size_t>(_statements.size()),
|
||||
[this, &storage, &options, now, local, &result] (size_t i) {
|
||||
auto&& statement = _statements[i];
|
||||
auto&& statement_options = options.for_statement(i);
|
||||
auto timestamp = _attrs->get_timestamp(now, statement_options);
|
||||
return statement->get_mutations(storage, statement_options, local, timestamp).then([&result] (auto&& more) {
|
||||
std::move(more.begin(), more.end(), std::back_inserter(result));
|
||||
});
|
||||
}).then([&result] {
|
||||
return std::move(result);
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
public:
|
||||
|
||||
400
database.cc
400
database.cc
@@ -45,7 +45,9 @@
|
||||
#include <boost/algorithm/cxx11/all_of.hpp>
|
||||
#include <boost/function_output_iterator.hpp>
|
||||
#include <boost/range/algorithm/heap_algorithm.hpp>
|
||||
#include <boost/range/algorithm/remove_if.hpp>
|
||||
#include <boost/range/algorithm/find.hpp>
|
||||
#include <boost/range/adaptor/map.hpp>
|
||||
#include "frozen_mutation.hh"
|
||||
#include "mutation_partition_applier.hh"
|
||||
#include "core/do_with.hh"
|
||||
@@ -85,14 +87,16 @@ public:
|
||||
column_family::column_family(schema_ptr schema, config config, db::commitlog& cl, compaction_manager& compaction_manager)
|
||||
: _schema(std::move(schema))
|
||||
, _config(std::move(config))
|
||||
, _memtables(make_lw_shared(memtable_list{}))
|
||||
, _memtables(make_lw_shared<memtable_list>([this] { return seal_active_memtable(); }, [this] { return new_memtable(); }, _config.max_memtable_size))
|
||||
, _streaming_memtables(_config.enable_disk_writes ?
|
||||
make_lw_shared<memtable_list>([this] { return seal_active_streaming_memtable_delayed(); }, [this] { return new_streaming_memtable(); }, _config.max_memtable_size) :
|
||||
make_lw_shared<memtable_list>([this] { return seal_active_memtable(); }, [this] { return new_memtable(); }, _config.max_memtable_size))
|
||||
, _sstables(make_lw_shared<sstable_list>())
|
||||
, _cache(_schema, sstables_as_mutation_source(), sstables_as_key_source(), global_cache_tracker())
|
||||
, _commitlog(&cl)
|
||||
, _compaction_manager(compaction_manager)
|
||||
, _flush_queue(std::make_unique<memtable_flush_queue>())
|
||||
{
|
||||
add_memtable();
|
||||
if (!_config.enable_disk_writes) {
|
||||
dblog.warn("Writes disabled, column family no durable.");
|
||||
}
|
||||
@@ -101,14 +105,16 @@ column_family::column_family(schema_ptr schema, config config, db::commitlog& cl
|
||||
column_family::column_family(schema_ptr schema, config config, no_commitlog cl, compaction_manager& compaction_manager)
|
||||
: _schema(std::move(schema))
|
||||
, _config(std::move(config))
|
||||
, _memtables(make_lw_shared(memtable_list{}))
|
||||
, _memtables(make_lw_shared<memtable_list>([this] { return seal_active_memtable(); }, [this] { return new_memtable(); }, _config.max_memtable_size))
|
||||
, _streaming_memtables(_config.enable_disk_writes ?
|
||||
make_lw_shared<memtable_list>([this] { return seal_active_streaming_memtable_delayed(); }, [this] { return new_streaming_memtable(); }, _config.max_memtable_size) :
|
||||
make_lw_shared<memtable_list>([this] { return seal_active_memtable(); }, [this] { return new_memtable(); }, _config.max_memtable_size))
|
||||
, _sstables(make_lw_shared<sstable_list>())
|
||||
, _cache(_schema, sstables_as_mutation_source(), sstables_as_key_source(), global_cache_tracker())
|
||||
, _commitlog(nullptr)
|
||||
, _compaction_manager(compaction_manager)
|
||||
, _flush_queue(std::make_unique<memtable_flush_queue>())
|
||||
{
|
||||
add_memtable();
|
||||
if (!_config.enable_disk_writes) {
|
||||
dblog.warn("Writes disabled, column family no durable.");
|
||||
}
|
||||
@@ -140,7 +146,10 @@ column_family::~column_family() {
|
||||
|
||||
logalloc::occupancy_stats column_family::occupancy() const {
|
||||
logalloc::occupancy_stats res;
|
||||
for (auto m : *_memtables.get()) {
|
||||
for (auto m : *_memtables) {
|
||||
res += m->region().occupancy();
|
||||
}
|
||||
for (auto m : *_streaming_memtables) {
|
||||
res += m->region().occupancy();
|
||||
}
|
||||
return res;
|
||||
@@ -483,8 +492,9 @@ future<sstables::entry_descriptor> column_family::probe_file(sstring sstdir, sst
|
||||
}
|
||||
}
|
||||
|
||||
auto fut = sstable::get_sstable_key_range(*_schema, _schema->ks_name(), _schema->cf_name(), sstdir, comps.generation, comps.version, comps.format);
|
||||
return std::move(fut).then([this, sstdir = std::move(sstdir), comps] (range<partition_key> r) {
|
||||
auto sst = std::make_unique<sstables::sstable>(_schema->ks_name(), _schema->cf_name(), sstdir, comps.generation, comps.version, comps.format);
|
||||
auto fut = sst->get_sstable_key_range(*_schema);
|
||||
return std::move(fut).then([this, sst = std::move(sst), sstdir = std::move(sstdir), comps] (range<partition_key> r) mutable {
|
||||
// Checks whether or not sstable belongs to current shard.
|
||||
if (!belongs_to_current_shard(*_schema, std::move(r))) {
|
||||
dblog.debug("sstable {} not relevant for this shard, ignoring",
|
||||
@@ -494,7 +504,6 @@ future<sstables::entry_descriptor> column_family::probe_file(sstring sstdir, sst
|
||||
return make_ready_future<>();
|
||||
}
|
||||
|
||||
auto sst = std::make_unique<sstables::sstable>(_schema->ks_name(), _schema->cf_name(), sstdir, comps.generation, comps.version, comps.format);
|
||||
auto fut = sst->load();
|
||||
return std::move(fut).then([this, sst = std::move(sst)] () mutable {
|
||||
add_sstable(std::move(*sst));
|
||||
@@ -533,12 +542,15 @@ void column_family::add_sstable(lw_shared_ptr<sstables::sstable> sstable) {
|
||||
_sstables->emplace(generation, std::move(sstable));
|
||||
}
|
||||
|
||||
void column_family::add_memtable() {
|
||||
// allow in-progress reads to continue using old list
|
||||
_memtables = make_lw_shared(memtable_list(*_memtables));
|
||||
_memtables->emplace_back(make_lw_shared<memtable>(_schema, _config.dirty_memory_region_group));
|
||||
lw_shared_ptr<memtable> column_family::new_memtable() {
|
||||
return make_lw_shared<memtable>(_schema, _config.dirty_memory_region_group);
|
||||
}
|
||||
|
||||
lw_shared_ptr<memtable> column_family::new_streaming_memtable() {
|
||||
return make_lw_shared<memtable>(_schema, _config.streaming_dirty_memory_region_group);
|
||||
}
|
||||
|
||||
|
||||
future<>
|
||||
column_family::update_cache(memtable& m, lw_shared_ptr<sstable_list> old_sstables) {
|
||||
if (_config.enable_cache) {
|
||||
@@ -550,6 +562,97 @@ column_family::update_cache(memtable& m, lw_shared_ptr<sstable_list> old_sstable
|
||||
}
|
||||
}
|
||||
|
||||
// FIXME: because we are coalescing, it could be that mutations belonging to the same
|
||||
// range end up in two different tables. Technically, we should wait for both. However,
|
||||
// the only way we have to make this happen now is to wait on all previous writes. This
|
||||
// certainly is an overkill, so we won't do it. We can fix this longer term by looking
|
||||
// at the PREPARE messages, and then noting what is the minimum future we should be
|
||||
// waiting for.
|
||||
future<>
|
||||
column_family::seal_active_streaming_memtable_delayed() {
|
||||
auto old = _streaming_memtables->back();
|
||||
if (old->empty()) {
|
||||
return make_ready_future<>();
|
||||
}
|
||||
|
||||
if (_streaming_memtables->should_flush()) {
|
||||
return seal_active_streaming_memtable();
|
||||
}
|
||||
|
||||
if (!_delayed_streaming_flush.armed()) {
|
||||
// We don't want to wait for too long, because the incoming mutations will not be available
|
||||
// until we flush them to SSTables. On top of that, if the sender ran out of messages, it won't
|
||||
// send more until we respond to some - which depends on these futures resolving. Sure enough,
|
||||
// the real fix for that second one is to have better communication between sender and receiver,
|
||||
// but that's not realistic ATM. If we did have better negotiation here, we would not need a timer
|
||||
// at all.
|
||||
_delayed_streaming_flush.arm(2s);
|
||||
}
|
||||
|
||||
return with_gate(_streaming_flush_gate, [this, old] {
|
||||
return _waiting_streaming_flushes.get_shared_future();
|
||||
});
|
||||
}
|
||||
|
||||
future<>
|
||||
column_family::seal_active_streaming_memtable() {
|
||||
auto old = _streaming_memtables->back();
|
||||
if (old->empty()) {
|
||||
return make_ready_future<>();
|
||||
}
|
||||
_streaming_memtables->add_memtable();
|
||||
_streaming_memtables->erase(old);
|
||||
return with_gate(_streaming_flush_gate, [this, old] {
|
||||
_delayed_streaming_flush.cancel();
|
||||
|
||||
auto current_waiters = std::exchange(_waiting_streaming_flushes, shared_promise<>());
|
||||
auto f = current_waiters.get_shared_future(); // for this seal
|
||||
|
||||
with_lock(_sstables_lock.for_read(), [this, old] {
|
||||
auto newtab = make_lw_shared<sstables::sstable>(_schema->ks_name(), _schema->cf_name(),
|
||||
_config.datadir, calculate_generation_for_new_table(),
|
||||
sstables::sstable::version_types::ka,
|
||||
sstables::sstable::format_types::big);
|
||||
|
||||
newtab->set_unshared();
|
||||
|
||||
auto&& priority = service::get_local_streaming_write_priority();
|
||||
// This is somewhat similar to the main memtable flush, but with important differences.
|
||||
//
|
||||
// The first difference, is that we don't keep aggregate collectd statistics about this one.
|
||||
// If we ever need to, we'll keep them separate statistics, but we don't want to polute the
|
||||
// main stats about memtables with streaming memtables.
|
||||
//
|
||||
// Second, we will not bother touching the cache after this flush. The current streaming code
|
||||
// will invalidate the ranges it touches, so we won't do it twice. Even when that changes, the
|
||||
// cache management code in here will have to differ from the main memtable's one. Please see
|
||||
// the comment at flush_streaming_mutations() for details.
|
||||
//
|
||||
// Lastly, we don't have any commitlog RP to update, and we don't need to deal manipulate the
|
||||
// memtable list, since this memtable was not available for reading up until this point.
|
||||
return newtab->write_components(*old, incremental_backups_enabled(), priority).then([this, newtab, old] {
|
||||
return newtab->open_data();
|
||||
}).then([this, old, newtab] () {
|
||||
add_sstable(newtab);
|
||||
trigger_compaction();
|
||||
}).handle_exception([] (auto ep) {
|
||||
dblog.error("failed to write streamed sstable: {}", ep);
|
||||
return make_exception_future<>(ep);
|
||||
});
|
||||
// We will also not have any retry logic. If we fail here, we'll fail the streaming and let
|
||||
// the upper layers know. They can then apply any logic they want here.
|
||||
}).then_wrapped([this, current_waiters = std::move(current_waiters)] (future <> f) mutable {
|
||||
if (f.failed()) {
|
||||
current_waiters.set_exception(f.get_exception());
|
||||
} else {
|
||||
current_waiters.set_value();
|
||||
}
|
||||
});
|
||||
|
||||
return f;
|
||||
});
|
||||
}
|
||||
|
||||
future<>
|
||||
column_family::seal_active_memtable() {
|
||||
auto old = _memtables->back();
|
||||
@@ -563,7 +666,7 @@ column_family::seal_active_memtable() {
|
||||
dblog.debug("Memtable is empty");
|
||||
return make_ready_future<>();
|
||||
}
|
||||
add_memtable();
|
||||
_memtables->add_memtable();
|
||||
|
||||
assert(_highest_flushed_rp < old->replay_position()
|
||||
|| (_highest_flushed_rp == db::replay_position() && old->replay_position() == db::replay_position())
|
||||
@@ -637,7 +740,7 @@ column_family::try_flush_memtable_to_sstable(lw_shared_ptr<memtable> old) {
|
||||
dblog.error("failed to move memtable to cache: {}", std::current_exception());
|
||||
}
|
||||
|
||||
_memtables->erase(boost::range::find(*_memtables, old));
|
||||
_memtables->erase(old);
|
||||
dblog.debug("Memtable replaced");
|
||||
|
||||
return make_ready_future<stop_iteration>(stop_iteration::yes);
|
||||
@@ -660,28 +763,39 @@ column_family::start() {
|
||||
future<>
|
||||
column_family::stop() {
|
||||
seal_active_memtable();
|
||||
seal_active_streaming_memtable();
|
||||
return _compaction_manager.remove(this).then([this] {
|
||||
return _flush_queue->close();
|
||||
// Nest, instead of using when_all, so we don't lose any exceptions.
|
||||
return _flush_queue->close().then([this] {
|
||||
return _streaming_flush_gate.close();
|
||||
});
|
||||
}).then([this] {
|
||||
return _sstable_deletion_gate.close();
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
future<std::vector<sstables::entry_descriptor>>
|
||||
column_family::reshuffle_sstables(int64_t start) {
|
||||
column_family::reshuffle_sstables(std::set<int64_t> all_generations, int64_t start) {
|
||||
struct work {
|
||||
int64_t current_gen;
|
||||
std::set<int64_t> all_generations; // Stores generation of all live sstables in the system.
|
||||
sstable_list sstables;
|
||||
std::unordered_map<int64_t, sstables::entry_descriptor> descriptors;
|
||||
std::vector<sstables::entry_descriptor> reshuffled;
|
||||
work(int64_t start) : current_gen(start ? start : 1) {}
|
||||
work(int64_t start, std::set<int64_t> gens)
|
||||
: current_gen(start ? start : 1)
|
||||
, all_generations(gens) {}
|
||||
};
|
||||
|
||||
return do_with(work(start), [this] (work& work) {
|
||||
return do_with(work(start, std::move(all_generations)), [this] (work& work) {
|
||||
return lister::scan_dir(_config.datadir, { directory_entry_type::regular }, [this, &work] (directory_entry de) {
|
||||
auto comps = sstables::entry_descriptor::make_descriptor(de.name);
|
||||
if (comps.component != sstables::sstable::component_type::TOC) {
|
||||
return make_ready_future<>();
|
||||
} else if (comps.generation < work.current_gen) {
|
||||
}
|
||||
// Skip generations that were already loaded by Scylla at a previous stage.
|
||||
if (work.all_generations.count(comps.generation) != 0) {
|
||||
return make_ready_future<>();
|
||||
}
|
||||
auto sst = make_lw_shared<sstables::sstable>(_schema->ks_name(), _schema->cf_name(),
|
||||
@@ -719,6 +833,21 @@ column_family::reshuffle_sstables(int64_t start) {
|
||||
});
|
||||
}
|
||||
|
||||
void column_family::rebuild_statistics() {
|
||||
// zeroing live_disk_space_used and live_sstable_count because the
|
||||
// sstable list was re-created
|
||||
_stats.live_disk_space_used = 0;
|
||||
_stats.live_sstable_count = 0;
|
||||
|
||||
for (auto&& tab : boost::range::join(_sstables_compacted_but_not_deleted,
|
||||
// this might seem dangerous, but "move" here just avoids constness,
|
||||
// making the two ranges compatible when compiling with boost 1.55.
|
||||
// Noone is actually moving anything...
|
||||
std::move(*_sstables) | boost::adaptors::map_values)) {
|
||||
update_stats_for_new_sstable(tab->data_size());
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
column_family::rebuild_sstable_list(const std::vector<sstables::shared_sstable>& new_sstables,
|
||||
const std::vector<sstables::shared_sstable>& sstables_to_remove) {
|
||||
@@ -727,37 +856,53 @@ column_family::rebuild_sstable_list(const std::vector<sstables::shared_sstable>&
|
||||
// later), and we add the new tables generated by the compaction.
|
||||
// We create a new list rather than modifying it in-place, so that
|
||||
// on-going reads can continue to use the old list.
|
||||
//
|
||||
// We only remove old sstables after they are successfully deleted,
|
||||
// to avoid a new compaction from ignoring data in the old sstables
|
||||
// if the deletion fails (note deletion of shared sstables can take
|
||||
// unbounded time, because all shards must agree on the deletion).
|
||||
auto current_sstables = _sstables;
|
||||
auto new_sstable_list = make_lw_shared<sstable_list>();
|
||||
auto new_compacted_but_not_deleted = _sstables_compacted_but_not_deleted;
|
||||
|
||||
// zeroing live_disk_space_used and live_sstable_count because the
|
||||
// sstable list is re-created below.
|
||||
_stats.live_disk_space_used = 0;
|
||||
_stats.live_sstable_count = 0;
|
||||
|
||||
std::unordered_set<sstables::shared_sstable> s(
|
||||
sstables_to_remove.begin(), sstables_to_remove.end());
|
||||
|
||||
for (const auto& oldtab : *current_sstables) {
|
||||
// First, add the new sstables.
|
||||
|
||||
// this might seem dangerous, but "move" here just avoids constness,
|
||||
// making the two ranges compatible when compiling with boost 1.55.
|
||||
// No one is actually moving anything...
|
||||
for (auto&& tab : boost::range::join(new_sstables, std::move(*current_sstables) | boost::adaptors::map_values)) {
|
||||
// Checks if oldtab is a sstable not being compacted.
|
||||
if (!s.count(oldtab.second)) {
|
||||
update_stats_for_new_sstable(oldtab.second->data_size());
|
||||
new_sstable_list->emplace(oldtab.first, oldtab.second);
|
||||
if (!s.count(tab)) {
|
||||
new_sstable_list->emplace(tab->generation(), tab);
|
||||
} else {
|
||||
new_compacted_but_not_deleted.push_back(tab);
|
||||
}
|
||||
}
|
||||
|
||||
for (const auto& newtab : new_sstables) {
|
||||
// FIXME: rename the new sstable(s). Verify a rename doesn't cause
|
||||
// problems for the sstable object.
|
||||
update_stats_for_new_sstable(newtab->data_size());
|
||||
new_sstable_list->emplace(newtab->generation(), newtab);
|
||||
}
|
||||
|
||||
for (const auto& oldtab : sstables_to_remove) {
|
||||
oldtab->mark_for_deletion();
|
||||
}
|
||||
|
||||
_sstables = std::move(new_sstable_list);
|
||||
_sstables_compacted_but_not_deleted = std::move(new_compacted_but_not_deleted);
|
||||
|
||||
rebuild_statistics();
|
||||
|
||||
// Second, delete the old sstables. This is done in the background, so we can
|
||||
// consider this compaction completed.
|
||||
seastar::with_gate(_sstable_deletion_gate, [this, sstables_to_remove] {
|
||||
return sstables::delete_atomically(sstables_to_remove).then([this, sstables_to_remove] {
|
||||
auto current_sstables = _sstables;
|
||||
auto new_sstable_list = make_lw_shared<sstable_list>();
|
||||
|
||||
std::unordered_set<sstables::shared_sstable> s(
|
||||
sstables_to_remove.begin(), sstables_to_remove.end());
|
||||
auto e = boost::range::remove_if(_sstables_compacted_but_not_deleted, [&] (sstables::shared_sstable sst) -> bool {
|
||||
return s.count(sst);
|
||||
});
|
||||
_sstables_compacted_but_not_deleted.erase(e, _sstables_compacted_but_not_deleted.end());
|
||||
rebuild_statistics();
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
future<>
|
||||
@@ -781,7 +926,7 @@ column_family::compact_sstables(sstables::compaction_descriptor descriptor, bool
|
||||
};
|
||||
return sstables::compact_sstables(*sstables_to_compact, *this, create_sstable, descriptor.max_sstable_bytes, descriptor.level,
|
||||
cleanup).then([this, sstables_to_compact] (auto new_sstables) {
|
||||
this->rebuild_sstable_list(new_sstables, *sstables_to_compact);
|
||||
return this->rebuild_sstable_list(new_sstables, *sstables_to_compact);
|
||||
});
|
||||
});
|
||||
}
|
||||
@@ -912,6 +1057,24 @@ lw_shared_ptr<sstable_list> column_family::get_sstables() {
|
||||
return _sstables;
|
||||
}
|
||||
|
||||
// Gets the list of all sstables in the column family, including ones that are
|
||||
// not used for active queries because they have already been compacted, but are
|
||||
// waiting for delete_atomically() to return.
|
||||
//
|
||||
// As long as we haven't deleted them, compaction needs to ensure it doesn't
|
||||
// garbage-collect a tombstone that covers data in an sstable that may not be
|
||||
// successfully deleted.
|
||||
// Returns the current sstable list extended with the sstables that were
// compacted but not yet deleted. When there are no such sstables the current
// list itself is returned; otherwise a fresh copy is built so that _sstables
// is left untouched.
lw_shared_ptr<sstable_list> column_family::get_sstables_including_compacted_undeleted() {
    const auto& undeleted = _sstables_compacted_but_not_deleted;
    if (undeleted.empty()) {
        return _sstables;
    }

    auto combined = make_lw_shared(*_sstables);
    for (const auto& sst : undeleted) {
        // Keyed by generation; an entry already present for that generation is kept.
        combined->emplace(sst->generation(), sst);
    }
    return combined;
}
|
||||
|
||||
inline bool column_family::manifest_json_filter(const sstring& fname) {
|
||||
using namespace boost::filesystem;
|
||||
|
||||
@@ -1027,14 +1190,24 @@ database::database() : database(db::config())
|
||||
{}
|
||||
|
||||
database::database(const db::config& cfg)
|
||||
: _cfg(std::make_unique<db::config>(cfg))
|
||||
: _streaming_dirty_memory_region_group(&_dirty_memory_region_group)
|
||||
, _cfg(std::make_unique<db::config>(cfg))
|
||||
, _memtable_total_space([this] {
|
||||
auto memtable_total_space = size_t(_cfg->memtable_total_space_in_mb()) << 20;
|
||||
if (!memtable_total_space) {
|
||||
return memory::stats().total_memory() / 2;
|
||||
}
|
||||
return memtable_total_space;
|
||||
}())
|
||||
, _version(empty_version)
|
||||
, _enable_incremental_backups(cfg.incremental_backups())
|
||||
, _memtables_throttler(_memtable_total_space, _dirty_memory_region_group)
|
||||
// We have to be careful here not to set the streaming limit for less than
|
||||
// a memtable maximum size. Allow up to 25 % to be used up by streaming memtables
|
||||
// in the common case
|
||||
, _streaming_throttler(_memtable_total_space * std::min(0.25, cfg.memtable_cleanup_threshold()),
|
||||
_streaming_dirty_memory_region_group, _memtables_throttler)
|
||||
{
|
||||
_memtable_total_space = size_t(_cfg->memtable_total_space_in_mb()) << 20;
|
||||
if (!_memtable_total_space) {
|
||||
_memtable_total_space = memory::stats().total_memory() / 2;
|
||||
}
|
||||
// Start compaction manager with two tasks for handling compaction jobs.
|
||||
_compaction_manager.start(2);
|
||||
setup_collectd();
|
||||
@@ -1424,6 +1597,7 @@ keyspace::make_column_family_config(const schema& s) const {
|
||||
cfg.enable_cache = _config.enable_cache;
|
||||
cfg.max_memtable_size = _config.max_memtable_size;
|
||||
cfg.dirty_memory_region_group = _config.dirty_memory_region_group;
|
||||
cfg.streaming_dirty_memory_region_group = _config.streaming_dirty_memory_region_group;
|
||||
cfg.cf_stats = _config.cf_stats;
|
||||
cfg.enable_incremental_backups = _config.enable_incremental_backups;
|
||||
|
||||
@@ -1599,21 +1773,12 @@ column_family::query(schema_ptr s, const query::read_command& cmd, query::result
|
||||
{
|
||||
return do_until(std::bind(&query_state::done, &qs), [this, &qs] {
|
||||
auto&& range = *qs.current_partition_range++;
|
||||
qs.reader = make_reader(qs.schema, range, service::get_local_sstable_query_read_priority());
|
||||
qs.range_empty = false;
|
||||
return do_until([&qs] { return !qs.limit || qs.range_empty; }, [&qs] {
|
||||
return qs.reader().then([&qs](mutation_opt mo) {
|
||||
if (mo) {
|
||||
auto p_builder = qs.builder.add_partition(*mo->schema(), mo->key());
|
||||
auto is_distinct = qs.cmd.slice.options.contains(query::partition_slice::option::distinct);
|
||||
auto limit = !is_distinct ? qs.limit : 1;
|
||||
auto rows_added = mo->partition().query(p_builder, *qs.schema, qs.cmd.timestamp, limit);
|
||||
qs.limit -= rows_added;
|
||||
} else {
|
||||
qs.range_empty = true;
|
||||
}
|
||||
});
|
||||
});
|
||||
auto add_partition = [&qs] (uint32_t live_rows, mutation&& m) {
|
||||
auto pb = qs.builder.add_partition(*qs.schema, m.key());
|
||||
m.partition().query_compacted(pb, *qs.schema, live_rows);
|
||||
};
|
||||
return do_with(querying_reader(qs.schema, as_mutation_source(), range, qs.cmd.slice, qs.limit, qs.cmd.timestamp, add_partition),
|
||||
[] (auto&& rd) { return rd.read(); });
|
||||
}).then([qs_ptr = std::move(qs_ptr), &qs] {
|
||||
return make_ready_future<lw_shared_ptr<query::result>>(
|
||||
make_lw_shared<query::result>(qs.builder.build()));
|
||||
@@ -1711,8 +1876,8 @@ void
|
||||
column_family::apply(const mutation& m, const db::replay_position& rp) {
|
||||
utils::latency_counter lc;
|
||||
_stats.writes.set_latency(lc);
|
||||
active_memtable().apply(m, rp);
|
||||
seal_on_overflow();
|
||||
_memtables->active_memtable().apply(m, rp);
|
||||
_memtables->seal_on_overflow();
|
||||
_stats.writes.mark(lc);
|
||||
if (lc.is_start()) {
|
||||
_stats.estimated_write.add(lc.latency(), _stats.writes.count);
|
||||
@@ -1724,21 +1889,17 @@ column_family::apply(const frozen_mutation& m, const schema_ptr& m_schema, const
|
||||
utils::latency_counter lc;
|
||||
_stats.writes.set_latency(lc);
|
||||
check_valid_rp(rp);
|
||||
active_memtable().apply(m, m_schema, rp);
|
||||
seal_on_overflow();
|
||||
_memtables->active_memtable().apply(m, m_schema, rp);
|
||||
_memtables->seal_on_overflow();
|
||||
_stats.writes.mark(lc);
|
||||
if (lc.is_start()) {
|
||||
_stats.estimated_write.add(lc.latency(), _stats.writes.count);
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
column_family::seal_on_overflow() {
|
||||
if (active_memtable().occupancy().total_space() >= _config.max_memtable_size) {
|
||||
// FIXME: if sparse, do some in-memory compaction first
|
||||
// FIXME: maybe merge with other in-memory memtables
|
||||
seal_active_memtable();
|
||||
}
|
||||
void column_family::apply_streaming_mutation(schema_ptr m_schema, const frozen_mutation& m) {
|
||||
_streaming_memtables->active_memtable().apply(m, m_schema);
|
||||
_streaming_memtables->seal_on_overflow();
|
||||
}
|
||||
|
||||
void
|
||||
@@ -1787,9 +1948,20 @@ future<> database::do_apply(schema_ptr s, const frozen_mutation& m) {
|
||||
return apply_in_memory(m, s, db::replay_position());
|
||||
}
|
||||
|
||||
future<> database::throttle() {
|
||||
if (_dirty_memory_region_group.memory_used() < _memtable_total_space
|
||||
&& _throttled_requests.empty()) {
|
||||
// Builds a throttler with no parent: only the memory used by `rg`, compared
// against `max_space`, decides whether requests are throttled.
database::throttle_state::throttle_state(size_t max_space, logalloc::region_group& rg)
    : _max_space(max_space), _region_group(rg), _parent(nullptr) {
}
|
||||
|
||||
// Builds a throttler chained to `parent`. Only the address of `parent` is
// stored, so `parent` has to outlive this object.
database::throttle_state::throttle_state(size_t max_space, logalloc::region_group& rg, throttle_state& parent)
    : _max_space(max_space), _region_group(rg), _parent(&parent) {
}
|
||||
|
||||
future<> database::throttle_state::throttle() {
|
||||
if (!should_throttle() && _throttled_requests.empty()) {
|
||||
// All is well, go ahead
|
||||
return make_ready_future<>();
|
||||
}
|
||||
@@ -1801,13 +1973,13 @@ future<> database::throttle() {
|
||||
return _throttled_requests.back().get_future();
|
||||
}
|
||||
|
||||
void database::unthrottle() {
|
||||
void database::throttle_state::unthrottle() {
|
||||
// Release one request per free 1MB we have
|
||||
// FIXME: improve this
|
||||
if (_dirty_memory_region_group.memory_used() >= _memtable_total_space) {
|
||||
if (should_throttle()) {
|
||||
return;
|
||||
}
|
||||
size_t avail = (_memtable_total_space - _dirty_memory_region_group.memory_used()) >> 20;
|
||||
size_t avail = (_max_space - _region_group.memory_used()) >> 20;
|
||||
avail = std::min(_throttled_requests.size(), avail);
|
||||
for (size_t i = 0; i < avail; ++i) {
|
||||
_throttled_requests.front().set_value();
|
||||
@@ -1822,11 +1994,39 @@ future<> database::apply(schema_ptr s, const frozen_mutation& m) {
|
||||
if (dblog.is_enabled(logging::log_level::trace)) {
|
||||
dblog.trace("apply {}", m.pretty_printer(s));
|
||||
}
|
||||
return throttle().then([this, &m, s = std::move(s)] {
|
||||
return _memtables_throttler.throttle().then([this, &m, s = std::move(s)] {
|
||||
return do_apply(std::move(s), m);
|
||||
});
|
||||
}
|
||||
|
||||
// Applies a mutation received through streaming to the column family named by
// the mutation's column family id.
//
// Throws std::runtime_error if the schema `s` is not synced. Otherwise the
// write waits on the streaming throttler and is then handed to
// column_family::apply_streaming_mutation().
//
// NOTE(review): `m` is captured by reference in the continuation, so the
// caller presumably has to keep it alive until the returned future resolves -
// confirm against the callers.
future<> database::apply_streaming_mutation(schema_ptr s, const frozen_mutation& m) {
    if (!s->is_synced()) {
        throw std::runtime_error(sprint("attempted to mutate using not synced schema of %s.%s, version=%s",
                                 s->ks_name(), s->cf_name(), s->version()));
    }

    // Streaming writes are throttled by their own throttle_state, which is
    // built on the streaming dirty memory region group and chained to the
    // memtables throttler: a streaming write is delayed when either the
    // streaming limit or the parent (memtables) limit is exceeded.
    return _streaming_throttler.throttle().then([this, &m, s = std::move(s)] {
        auto uuid = m.column_family_id();
        auto& cf = find_column_family(uuid);
        // `m` is a const reference; it is passed on as-is (moving it would be a no-op).
        cf.apply_streaming_mutation(s, m);
    });
}
|
||||
|
||||
keyspace::config
|
||||
database::make_keyspace_config(const keyspace_metadata& ksm) {
|
||||
// FIXME support multiple directories
|
||||
@@ -1847,6 +2047,7 @@ database::make_keyspace_config(const keyspace_metadata& ksm) {
|
||||
cfg.max_memtable_size = std::numeric_limits<size_t>::max();
|
||||
}
|
||||
cfg.dirty_memory_region_group = &_dirty_memory_region_group;
|
||||
cfg.streaming_dirty_memory_region_group = &_streaming_dirty_memory_region_group;
|
||||
cfg.cf_stats = &_cf_stats;
|
||||
cfg.enable_incremental_backups = _enable_incremental_backups;
|
||||
return cfg;
|
||||
@@ -2299,10 +2500,36 @@ future<> column_family::flush(const db::replay_position& pos) {
|
||||
return seal_active_memtable();
|
||||
}
|
||||
|
||||
// FIXME: We can do much better than this in terms of cache management. Right
|
||||
// now, we only have to flush the touched ranges because of the possibility of
|
||||
// streaming containing token ownership changes.
|
||||
//
|
||||
// Right now we can't differentiate between that and a normal repair process,
|
||||
// so we always flush. When we can differentiate those streams, we should not
|
||||
// be indiscriminately touching the cache during repair. We will just have to
|
||||
// invalidate the entries that are relevant to things we already have in the cache.
|
||||
// Seals the active streaming memtable (through the delayed path) and, once
// that has finished - successfully or not - invalidates the given ranges in
// the row cache when the cache is enabled.
future<> column_family::flush_streaming_mutations(std::vector<query::partition_range> ranges) {
    // This will effectively take the gate twice for this call. The proper way to fix that would
    // be to change seal_active_streaming_memtable_delayed to take a range parameter. However, we
    // need this code to go away as soon as we can (see FIXME above). So the double gate is a better
    // temporary countermeasure.
    //
    // The outer lambda is mutable so that `ranges` can really be moved into the
    // inner continuation; moving out of a const capture would silently copy the vector.
    return with_gate(_streaming_flush_gate, [this, ranges = std::move(ranges)] () mutable {
        return seal_active_streaming_memtable_delayed().finally([this, ranges = std::move(ranges)] {
            if (_config.enable_cache) {
                for (auto& range : ranges) {
                    _cache.invalidate(range);
                }
            }
        });
    });
}
|
||||
|
||||
void column_family::clear() {
|
||||
_cache.clear();
|
||||
_memtables->clear();
|
||||
add_memtable();
|
||||
_memtables->add_memtable();
|
||||
_streaming_memtables->clear();
|
||||
_streaming_memtables->add_memtable();
|
||||
}
|
||||
|
||||
// NOTE: does not need to be futurized, but might eventually, depending on
|
||||
@@ -2316,21 +2543,26 @@ future<db::replay_position> column_family::discard_sstables(db_clock::time_point
|
||||
auto gc_trunc = to_gc_clock(truncated_at);
|
||||
|
||||
auto pruned = make_lw_shared<sstable_list>();
|
||||
std::vector<sstables::shared_sstable> remove;
|
||||
|
||||
for (auto&p : *_sstables) {
|
||||
if (p.second->max_data_age() <= gc_trunc) {
|
||||
rp = std::max(p.second->get_stats_metadata().position, rp);
|
||||
p.second->mark_for_deletion();
|
||||
remove.emplace_back(p.second);
|
||||
continue;
|
||||
}
|
||||
pruned->emplace(p.first, p.second);
|
||||
}
|
||||
|
||||
_sstables = std::move(pruned);
|
||||
|
||||
dblog.debug("cleaning out row cache");
|
||||
_cache.clear();
|
||||
return make_ready_future<db::replay_position>(rp);
|
||||
|
||||
return parallel_for_each(remove, [](sstables::shared_sstable s) {
|
||||
return sstables::delete_atomically({s});
|
||||
}).then([rp] {
|
||||
return make_ready_future<db::replay_position>(rp);
|
||||
}).finally([remove] {}); // keep the objects alive until here.
|
||||
});
|
||||
}
|
||||
|
||||
@@ -2376,6 +2608,10 @@ void column_family::set_schema(schema_ptr s) {
|
||||
m->set_schema(s);
|
||||
}
|
||||
|
||||
for (auto& m : *_streaming_memtables) {
|
||||
m->set_schema(s);
|
||||
}
|
||||
|
||||
_cache.set_schema(s);
|
||||
_schema = std::move(s);
|
||||
}
|
||||
|
||||
206
database.hh
206
database.hh
@@ -41,6 +41,7 @@
|
||||
#include <set>
|
||||
#include <iostream>
|
||||
#include <boost/functional/hash.hpp>
|
||||
#include <boost/range/algorithm/find.hpp>
|
||||
#include <experimental/optional>
|
||||
#include <string.h>
|
||||
#include "types.hh"
|
||||
@@ -70,6 +71,7 @@
|
||||
#include "sstables/compaction.hh"
|
||||
#include "key_reader.hh"
|
||||
#include <seastar/core/rwlock.hh>
|
||||
#include <seastar/core/shared_future.hh>
|
||||
|
||||
class frozen_mutation;
|
||||
class reconcilable_result;
|
||||
@@ -98,7 +100,96 @@ void make(database& db, bool durable, bool volatile_testing_only);
|
||||
|
||||
class replay_position_reordered_exception : public std::exception {};
|
||||
|
||||
using memtable_list = std::vector<lw_shared_ptr<memtable>>;
|
||||
// We could just add all memtables, regardless of types, to a single list, and
|
||||
// then filter them out when we read them. Here's why I have chosen not to do
|
||||
// it:
|
||||
//
|
||||
// First, some of the methods in which a memtable is involved (like seal)
|
||||
// assume a commitlog, and go through great care of updating the replay
|
||||
// position, flushing the log, etc. We want to bypass those, and that has to
|
||||
// be done either by sprinkling the seal code with conditionals, or having a
|
||||
// separate method for each seal.
|
||||
//
|
||||
// Also, if we ever want to put some of the memtables in a separate allocator
|
||||
// region group to provide for extra QoS, having the classes properly wrapped
|
||||
// will make that trivial: just pass a version of new_memtable() that puts it
|
||||
// in a different region, while the list approach would require a lot of
|
||||
// conditionals as well.
|
||||
//
|
||||
// If we are going to have different methods, better have different instances
|
||||
// of a common class.
|
||||
class memtable_list {
|
||||
using shared_memtable = lw_shared_ptr<memtable>;
|
||||
std::vector<shared_memtable> _memtables;
|
||||
std::function<future<> ()> _seal_fn;
|
||||
std::function<shared_memtable ()> _new_memtable;
|
||||
size_t _max_memtable_size;
|
||||
public:
|
||||
memtable_list(std::function<future<> ()> seal_fn, std::function<shared_memtable()> new_mt, size_t max_memtable_size)
|
||||
: _memtables({})
|
||||
, _seal_fn(seal_fn)
|
||||
, _new_memtable(new_mt)
|
||||
, _max_memtable_size(max_memtable_size) {
|
||||
add_memtable();
|
||||
}
|
||||
|
||||
shared_memtable back() {
|
||||
return _memtables.back();
|
||||
}
|
||||
|
||||
// The caller has to make sure the element exist before calling this.
|
||||
void erase(const shared_memtable& element) {
|
||||
_memtables.erase(boost::range::find(_memtables, element));
|
||||
}
|
||||
void clear() {
|
||||
_memtables.clear();
|
||||
}
|
||||
|
||||
size_t size() const {
|
||||
return _memtables.size();
|
||||
}
|
||||
|
||||
future<> seal_active_memtable() {
|
||||
return _seal_fn();
|
||||
}
|
||||
|
||||
auto begin() noexcept {
|
||||
return _memtables.begin();
|
||||
}
|
||||
|
||||
auto begin() const noexcept {
|
||||
return _memtables.begin();
|
||||
}
|
||||
|
||||
auto end() noexcept {
|
||||
return _memtables.end();
|
||||
}
|
||||
|
||||
auto end() const noexcept {
|
||||
return _memtables.end();
|
||||
}
|
||||
|
||||
memtable& active_memtable() {
|
||||
return *_memtables.back();
|
||||
}
|
||||
|
||||
void add_memtable() {
|
||||
_memtables.emplace_back(_new_memtable());
|
||||
}
|
||||
|
||||
bool should_flush() {
|
||||
return active_memtable().occupancy().total_space() >= _max_memtable_size;
|
||||
}
|
||||
|
||||
void seal_on_overflow() {
|
||||
if (should_flush()) {
|
||||
// FIXME: if sparse, do some in-memory compaction first
|
||||
// FIXME: maybe merge with other in-memory memtables
|
||||
_seal_fn();
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
using sstable_list = sstables::sstable_list;
|
||||
|
||||
// The CF has a "stats" structure. But we don't want all fields here,
|
||||
@@ -122,6 +213,7 @@ public:
|
||||
bool enable_incremental_backups = false;
|
||||
size_t max_memtable_size = 5'000'000;
|
||||
logalloc::region_group* dirty_memory_region_group = nullptr;
|
||||
logalloc::region_group* streaming_dirty_memory_region_group = nullptr;
|
||||
::cf_stats* cf_stats = nullptr;
|
||||
};
|
||||
struct no_commitlog {};
|
||||
@@ -153,8 +245,34 @@ private:
|
||||
config _config;
|
||||
stats _stats;
|
||||
lw_shared_ptr<memtable_list> _memtables;
|
||||
|
||||
// In older incarnations, we simply committed the mutations to memtables.
|
||||
// However, doing that makes it harder for us to provide QoS within the
|
||||
// disk subsystem. Keeping them in separate memtables allows us to properly
|
||||
// classify those streams into their own I/O class.
|
||||
//
|
||||
// We could write those directly to disk, but we still want the mutations
|
||||
// coming through the wire to go to a memtable staging area. This has two
|
||||
// major advantages:
|
||||
//
|
||||
// first, it will allow us to properly order the partitions. They are
|
||||
// hopefully sent in order but we can't really guarantee that without
|
||||
// sacrificing sender-side parallelism.
|
||||
//
|
||||
// second, we will be able to coalesce writes from multiple plan_id's and
|
||||
// even multiple senders, as well as automatically tapping into the dirty
|
||||
// memory throttling mechanism, guaranteeing we will not overload the
|
||||
// server.
|
||||
lw_shared_ptr<memtable_list> _streaming_memtables;
|
||||
|
||||
// generation -> sstable. Ordered by key so we can easily get the most recent.
|
||||
lw_shared_ptr<sstable_list> _sstables;
|
||||
// sstables that have been compacted (so don't look up in query) but
|
||||
// have not been deleted yet, so must not GC any tombstones in other sstables
|
||||
// that may delete data in these sstables:
|
||||
std::vector<sstables::shared_sstable> _sstables_compacted_but_not_deleted;
|
||||
// Control background fibers waiting for sstables to be deleted
|
||||
seastar::gate _sstable_deletion_gate;
|
||||
// There are situations in which we need to stop writing sstables. Flushers will take
|
||||
// the read lock, and the ones that wish to stop that process will take the write lock.
|
||||
rwlock _sstables_lock;
|
||||
@@ -171,11 +289,20 @@ private:
|
||||
int _compaction_disabled = 0;
|
||||
class memtable_flush_queue;
|
||||
std::unique_ptr<memtable_flush_queue> _flush_queue;
|
||||
// Because streaming mutations bypass the commitlog, there is
|
||||
// no need for the complications of the flush queue. Besides, it
|
||||
// is easier to just use a common gate than it is to modify the flush_queue
|
||||
// to work both with and without a replay position.
|
||||
//
|
||||
// Last but not least, we seldom need to guarantee any ordering here: as long
|
||||
// as all data is waited for, we're good.
|
||||
seastar::gate _streaming_flush_gate;
|
||||
private:
|
||||
void update_stats_for_new_sstable(uint64_t disk_space_used_by_sstable);
|
||||
void add_sstable(sstables::sstable&& sstable);
|
||||
void add_sstable(lw_shared_ptr<sstables::sstable> sstable);
|
||||
void add_memtable();
|
||||
lw_shared_ptr<memtable> new_memtable();
|
||||
lw_shared_ptr<memtable> new_streaming_memtable();
|
||||
future<stop_iteration> try_flush_memtable_to_sstable(lw_shared_ptr<memtable> memt);
|
||||
future<> update_cache(memtable&, lw_shared_ptr<sstable_list> old_sstables);
|
||||
struct merge_comparator;
|
||||
@@ -198,6 +325,7 @@ private:
|
||||
// Rebuild existing _sstables with new_sstables added to it and sstables_to_remove removed from it.
|
||||
void rebuild_sstable_list(const std::vector<sstables::shared_sstable>& new_sstables,
|
||||
const std::vector<sstables::shared_sstable>& sstables_to_remove);
|
||||
void rebuild_statistics();
|
||||
private:
|
||||
// Creates a mutation reader which covers sstables.
|
||||
// Caller needs to ensure that column_family remains live (FIXME: relax this).
|
||||
@@ -251,7 +379,7 @@ public:
|
||||
// FIXME: in case a query is satisfied from a single memtable, avoid a copy
|
||||
using const_mutation_partition_ptr = std::unique_ptr<const mutation_partition>;
|
||||
using const_row_ptr = std::unique_ptr<const row>;
|
||||
memtable& active_memtable() { return *_memtables->back(); }
|
||||
memtable& active_memtable() { return _memtables->active_memtable(); }
|
||||
const row_cache& get_row_cache() const {
|
||||
return _cache;
|
||||
}
|
||||
@@ -276,6 +404,7 @@ public:
|
||||
// The mutation is always upgraded to current schema.
|
||||
void apply(const frozen_mutation& m, const schema_ptr& m_schema, const db::replay_position& = db::replay_position());
|
||||
void apply(const mutation& m, const db::replay_position& = db::replay_position());
|
||||
void apply_streaming_mutation(schema_ptr, const frozen_mutation&);
|
||||
|
||||
// Returns at most "cmd.limit" rows
|
||||
future<lw_shared_ptr<query::result>> query(schema_ptr,
|
||||
@@ -288,6 +417,7 @@ public:
|
||||
future<> stop();
|
||||
future<> flush();
|
||||
future<> flush(const db::replay_position&);
|
||||
future<> flush_streaming_mutations(std::vector<query::partition_range> ranges = std::vector<query::partition_range>{});
|
||||
void clear(); // discards memtable(s) without flushing them to disk.
|
||||
future<db::replay_position> discard_sstables(db_clock::time_point);
|
||||
|
||||
@@ -298,7 +428,10 @@ public:
|
||||
future<int64_t> disable_sstable_write() {
|
||||
_sstable_writes_disabled_at = std::chrono::steady_clock::now();
|
||||
return _sstables_lock.write_lock().then([this] {
|
||||
return make_ready_future<int64_t>((*_sstables->end()).first);
|
||||
if (_sstables->empty()) {
|
||||
return make_ready_future<int64_t>(0);
|
||||
}
|
||||
return make_ready_future<int64_t>((*_sstables->rbegin()).first);
|
||||
});
|
||||
}
|
||||
|
||||
@@ -321,9 +454,11 @@ public:
|
||||
// very dangerous to do that with live SSTables. This is meant to be used with SSTables
|
||||
// that are not yet managed by the system.
|
||||
//
|
||||
// Parameter all_generations stores the generation of all SSTables in the system, so it
|
||||
// will be easy to determine which SSTable is new.
|
||||
// An example usage would query all shards asking what is the highest SSTable number known
|
||||
// to them, and then pass that + 1 as "start".
|
||||
future<std::vector<sstables::entry_descriptor>> reshuffle_sstables(int64_t start);
|
||||
future<std::vector<sstables::entry_descriptor>> reshuffle_sstables(std::set<int64_t> all_generations, int64_t start);
|
||||
|
||||
// FIXME: this is just an example, should be changed to something more
|
||||
// general. compact_all_sstables() starts a compaction of all sstables.
|
||||
@@ -357,6 +492,7 @@ public:
|
||||
}
|
||||
|
||||
lw_shared_ptr<sstable_list> get_sstables();
|
||||
lw_shared_ptr<sstable_list> get_sstables_including_compacted_undeleted();
|
||||
size_t sstables_count();
|
||||
int64_t get_unleveled_sstables() const;
|
||||
|
||||
@@ -408,6 +544,31 @@ private:
|
||||
// synchronously flush data to disk.
|
||||
future<> seal_active_memtable();
|
||||
|
||||
// I am assuming here that the repair process will potentially send ranges containing
|
||||
// few mutations, definitely not enough to fill a memtable. It wants to know whether or
|
||||
// not each of those ranges individually succeeded or failed, so we need a future for
|
||||
// each.
|
||||
//
|
||||
// One of the ways to fix that, is changing the repair itself to send more mutations at
|
||||
// a single batch. But relying on that is a bad idea for two reasons:
|
||||
//
|
||||
// First, the goals of the SSTable writer and the repair sender are at odds. The SSTable
|
||||
// writer wants to write as few SSTables as possible, while the repair sender wants to
|
||||
// break down the range in pieces as small as it can and checksum them individually, so
|
||||
// it doesn't have to send a lot of mutations for no reason.
|
||||
//
|
||||
// Second, even if the repair process wants to process larger ranges at once, some ranges
|
||||
// themselves may be small. So while most ranges would be large, we would still have
|
||||
// potentially some fairly small SSTables lying around.
|
||||
//
|
||||
// The best course of action in this case is to coalesce the incoming streams write-side.
|
||||
// Repair can now choose whatever strategy - small or big ranges - it wants, rest assured
|
||||
// that the incoming memtables will be coalesced together.
|
||||
shared_promise<> _waiting_streaming_flushes;
|
||||
timer<> _delayed_streaming_flush{[this] { seal_active_streaming_memtable(); }};
|
||||
future<> seal_active_streaming_memtable();
|
||||
future<> seal_active_streaming_memtable_delayed();
|
||||
|
||||
// filter manifest.json files out
|
||||
static bool manifest_json_filter(const sstring& fname);
|
||||
|
||||
@@ -417,7 +578,6 @@ private:
|
||||
template <typename Func>
|
||||
future<bool> for_all_partitions(schema_ptr, Func&& func) const;
|
||||
future<sstables::entry_descriptor> probe_file(sstring sstdir, sstring fname);
|
||||
void seal_on_overflow();
|
||||
void check_valid_rp(const db::replay_position&) const;
|
||||
public:
|
||||
// Iterate over all partitions. Protocol is the same as std::all_of(),
|
||||
@@ -521,6 +681,7 @@ public:
|
||||
bool enable_incremental_backups = false;
|
||||
size_t max_memtable_size = 5'000'000;
|
||||
logalloc::region_group* dirty_memory_region_group = nullptr;
|
||||
logalloc::region_group* streaming_dirty_memory_region_group = nullptr;
|
||||
::cf_stats* cf_stats = nullptr;
|
||||
};
|
||||
private:
|
||||
@@ -582,6 +743,7 @@ public:
|
||||
class database {
|
||||
::cf_stats _cf_stats;
|
||||
logalloc::region_group _dirty_memory_region_group;
|
||||
logalloc::region_group _streaming_dirty_memory_region_group;
|
||||
std::unordered_map<sstring, keyspace> _keyspaces;
|
||||
std::unordered_map<utils::UUID, lw_shared_ptr<column_family>> _column_families;
|
||||
std::unordered_map<std::pair<sstring, sstring>, utils::UUID, utils::tuple_hash> _ks_cf_to_uuid;
|
||||
@@ -592,8 +754,6 @@ class database {
|
||||
// compaction_manager object is referenced by all column families of a database.
|
||||
compaction_manager _compaction_manager;
|
||||
std::vector<scollectd::registration> _collectd;
|
||||
timer<> _throttling_timer{[this] { unthrottle(); }};
|
||||
circular_buffer<promise<>> _throttled_requests;
|
||||
bool _enable_incremental_backups = false;
|
||||
|
||||
future<> init_commitlog();
|
||||
@@ -608,9 +768,34 @@ private:
|
||||
void create_in_memory_keyspace(const lw_shared_ptr<keyspace_metadata>& ksm);
|
||||
friend void db::system_keyspace::make(database& db, bool durable, bool volatile_testing_only);
|
||||
void setup_collectd();
|
||||
future<> throttle();
|
||||
|
||||
// Throttling state bound to one logalloc::region_group and a space limit.
// An instance may be chained to a parent throttle_state; should_throttle()
// then also takes the parent's state into account.
class throttle_state {
|
||||
// Limit compared against _region_group.memory_used() in should_throttle().
size_t _max_space;
|
||||
// Region group whose memory usage is checked against _max_space.
logalloc::region_group& _region_group;
|
||||
// Optional parent consulted by should_throttle(); tested for null before use.
throttle_state* _parent;
|
||||
|
||||
// Promises for throttled requests.
// NOTE(review): presumably fulfilled by unthrottle(); its definition is not visible here - confirm.
circular_buffer<promise<>> _throttled_requests;
|
||||
// Timer whose callback invokes unthrottle().
timer<> _throttling_timer{[this] { unthrottle(); }};
|
||||
void unthrottle();
|
||||
// Returns true when this region group uses more than _max_space; otherwise
// defers to the parent if there is one, and returns false if there is none.
bool should_throttle() const {
|
||||
if (_region_group.memory_used() > _max_space) {
|
||||
return true;
|
||||
}
|
||||
if (_parent) {
|
||||
return _parent->should_throttle();
|
||||
}
|
||||
return false;
|
||||
}
|
||||
public:
|
||||
// Constructor without a parent argument.
throttle_state(size_t max_space, logalloc::region_group& region);
|
||||
// Constructor that additionally takes a parent throttle_state.
throttle_state(size_t max_space, logalloc::region_group& region, throttle_state& parent);
|
||||
// Returns a future<>.
// NOTE(review): presumably resolves once the caller may proceed; definition not visible - confirm.
future<> throttle();
|
||||
};
|
||||
|
||||
throttle_state _memtables_throttler;
|
||||
throttle_state _streaming_throttler;
|
||||
|
||||
future<> do_apply(schema_ptr, const frozen_mutation&);
|
||||
void unthrottle();
|
||||
public:
|
||||
static utils::UUID empty_version;
|
||||
|
||||
@@ -678,6 +863,7 @@ public:
|
||||
future<lw_shared_ptr<query::result>> query(schema_ptr, const query::read_command& cmd, query::result_request request, const std::vector<query::partition_range>& ranges);
|
||||
future<reconcilable_result> query_mutations(schema_ptr, const query::read_command& cmd, const query::partition_range& range);
|
||||
future<> apply(schema_ptr, const frozen_mutation&);
|
||||
future<> apply_streaming_mutation(schema_ptr, const frozen_mutation&);
|
||||
keyspace::config make_keyspace_config(const keyspace_metadata& ksm);
|
||||
const sstring& get_snitch_name() const;
|
||||
future<> clear_snapshot(sstring tag, std::vector<sstring> keyspace_names);
|
||||
|
||||
@@ -1043,7 +1043,9 @@ void db::commitlog::segment_manager::flush_segments(bool force) {
|
||||
|
||||
future<db::commitlog::segment_manager::sseg_ptr> db::commitlog::segment_manager::allocate_segment(bool active) {
|
||||
descriptor d(next_id());
|
||||
return open_file_dma(cfg.commit_log_location + "/" + d.filename(), open_flags::wo | open_flags::create).then([this, d, active](file f) {
|
||||
file_open_options opt;
|
||||
opt.extent_allocation_size_hint = max_size;
|
||||
return open_file_dma(cfg.commit_log_location + "/" + d.filename(), open_flags::wo | open_flags::create, opt).then([this, d, active](file f) {
|
||||
// xfs doesn't like files extended beyond eof, so enlarge the file
|
||||
return f.truncate(max_size).then([this, d, active, f] () mutable {
|
||||
auto s = make_lw_shared<segment>(this->shared_from_this(), d, std::move(f), active);
|
||||
|
||||
@@ -487,7 +487,7 @@ public:
|
||||
val(cas_contention_timeout_in_ms, uint32_t, 5000, Unused, \
|
||||
"The time that the coordinator continues to retry a CAS (compare and set) operation that contends with other proposals for the same row." \
|
||||
) \
|
||||
val(truncate_request_timeout_in_ms, uint32_t, 10000, Unused, \
|
||||
val(truncate_request_timeout_in_ms, uint32_t, 10000, Used, \
|
||||
"The time that the coordinator waits for truncates (remove all data from a table) to complete. The long default value allows for a snapshot to be taken before removing the data. If auto_snapshot is disabled (not recommended), you can reduce this time." \
|
||||
) \
|
||||
val(write_request_timeout_in_ms, uint32_t, 2000, Used, \
|
||||
|
||||
@@ -663,7 +663,7 @@ future<std::set<sstring>> merge_keyspaces(distributed<service::storage_proxy>& p
|
||||
});
|
||||
}
|
||||
|
||||
static void update_column_family(database& db, schema_ptr new_schema) {
|
||||
static future<> update_column_family(database& db, schema_ptr new_schema) {
|
||||
column_family& cfm = db.find_column_family(new_schema->id());
|
||||
|
||||
bool columns_changed = !cfm.schema()->equal_columns(*new_schema);
|
||||
@@ -672,7 +672,7 @@ static void update_column_family(database& db, schema_ptr new_schema) {
|
||||
s->registry_entry()->mark_synced();
|
||||
cfm.set_schema(std::move(s));
|
||||
|
||||
service::get_local_migration_manager().notify_update_column_family(cfm.schema(), columns_changed);
|
||||
return service::get_local_migration_manager().notify_update_column_family(cfm.schema(), columns_changed);
|
||||
}
|
||||
|
||||
// see the comments for merge_keyspaces()
|
||||
@@ -713,15 +713,15 @@ static void merge_tables(distributed<service::storage_proxy>& proxy,
|
||||
auto& cf = db.find_column_family(s);
|
||||
cf.mark_ready_for_writes();
|
||||
ks.make_directory_for_column_family(s->cf_name(), s->id()).get();
|
||||
service::get_local_migration_manager().notify_create_column_family(s);
|
||||
service::get_local_migration_manager().notify_create_column_family(s).get();
|
||||
}
|
||||
for (auto&& gs : altered) {
|
||||
update_column_family(db, gs.get());
|
||||
update_column_family(db, gs.get()).get();
|
||||
}
|
||||
parallel_for_each(dropped.begin(), dropped.end(), [&db, &tsf](auto&& gs) {
|
||||
schema_ptr s = gs.get();
|
||||
return db.drop_column_family(s->ks_name(), s->cf_name(), [&tsf] { return tsf.value(); }).then([s] {
|
||||
service::get_local_migration_manager().notify_drop_column_family(s);
|
||||
return service::get_local_migration_manager().notify_drop_column_family(s);
|
||||
});
|
||||
}).get();
|
||||
});
|
||||
|
||||
90
dist/ami/build_ami.sh
vendored
90
dist/ami/build_ami.sh
vendored
@@ -29,28 +29,74 @@ while [ $# -gt 0 ]; do
|
||||
esac
|
||||
done
|
||||
|
||||
. /etc/os-release
|
||||
case "$ID" in
|
||||
"centos")
|
||||
AMI=ami-f3102499
|
||||
REGION=us-east-1
|
||||
SSH_USERNAME=centos
|
||||
;;
|
||||
"ubuntu")
|
||||
AMI=ami-ff427095
|
||||
REGION=us-east-1
|
||||
SSH_USERNAME=ubuntu
|
||||
;;
|
||||
*)
|
||||
echo "build_ami.sh does not supported this distribution."
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
|
||||
|
||||
if [ $LOCALRPM -eq 1 ]; then
|
||||
rm -rf build/*
|
||||
sudo yum -y install git
|
||||
if [ ! -f dist/ami/files/scylla-server.x86_64.rpm ]; then
|
||||
dist/redhat/build_rpm.sh
|
||||
cp build/rpmbuild/RPMS/x86_64/scylla-server-`cat build/SCYLLA-VERSION-FILE`-`cat build/SCYLLA-RELEASE-FILE`.*.x86_64.rpm dist/ami/files/scylla-server.x86_64.rpm
|
||||
fi
|
||||
if [ ! -f dist/ami/files/scylla-jmx.noarch.rpm ]; then
|
||||
cd build
|
||||
git clone --depth 1 https://github.com/scylladb/scylla-jmx.git
|
||||
cd scylla-jmx
|
||||
sh -x -e dist/redhat/build_rpm.sh $*
|
||||
cd ../..
|
||||
cp build/scylla-jmx/build/rpmbuild/RPMS/noarch/scylla-jmx-`cat build/scylla-jmx/build/SCYLLA-VERSION-FILE`-`cat build/scylla-jmx/build/SCYLLA-RELEASE-FILE`.*.noarch.rpm dist/ami/files/scylla-jmx.noarch.rpm
|
||||
fi
|
||||
if [ ! -f dist/ami/files/scylla-tools.noarch.rpm ]; then
|
||||
cd build
|
||||
git clone --depth 1 https://github.com/scylladb/scylla-tools-java.git
|
||||
cd scylla-tools-java
|
||||
sh -x -e dist/redhat/build_rpm.sh
|
||||
cd ../..
|
||||
cp build/scylla-tools-java/build/rpmbuild/RPMS/noarch/scylla-tools-`cat build/scylla-tools-java/build/SCYLLA-VERSION-FILE`-`cat build/scylla-tools-java/build/SCYLLA-RELEASE-FILE`.*.noarch.rpm dist/ami/files/scylla-tools.noarch.rpm
|
||||
if [ "$ID" = "centos" ]; then
|
||||
rm -rf build/*
|
||||
sudo yum -y install git
|
||||
if [ ! -f dist/ami/files/scylla-server.x86_64.rpm ]; then
|
||||
dist/redhat/build_rpm.sh
|
||||
cp build/rpmbuild/RPMS/x86_64/scylla-server-`cat build/SCYLLA-VERSION-FILE`-`cat build/SCYLLA-RELEASE-FILE`.*.x86_64.rpm dist/ami/files/scylla-server.x86_64.rpm
|
||||
fi
|
||||
if [ ! -f dist/ami/files/scylla-jmx.noarch.rpm ]; then
|
||||
cd build
|
||||
git clone --depth 1 https://github.com/scylladb/scylla-jmx.git
|
||||
cd scylla-jmx
|
||||
sh -x -e dist/redhat/build_rpm.sh $*
|
||||
cd ../..
|
||||
cp build/scylla-jmx/build/rpmbuild/RPMS/noarch/scylla-jmx-`cat build/scylla-jmx/build/SCYLLA-VERSION-FILE`-`cat build/scylla-jmx/build/SCYLLA-RELEASE-FILE`.*.noarch.rpm dist/ami/files/scylla-jmx.noarch.rpm
|
||||
fi
|
||||
if [ ! -f dist/ami/files/scylla-tools.noarch.rpm ]; then
|
||||
cd build
|
||||
git clone --depth 1 https://github.com/scylladb/scylla-tools-java.git
|
||||
cd scylla-tools-java
|
||||
sh -x -e dist/redhat/build_rpm.sh
|
||||
cd ../..
|
||||
cp build/scylla-tools-java/build/rpmbuild/RPMS/noarch/scylla-tools-`cat build/scylla-tools-java/build/SCYLLA-VERSION-FILE`-`cat build/scylla-tools-java/build/SCYLLA-RELEASE-FILE`.*.noarch.rpm dist/ami/files/scylla-tools.noarch.rpm
|
||||
fi
|
||||
else
|
||||
sudo apt-get install -y git
|
||||
if [ ! -f dist/ami/files/scylla-server_amd64.deb ]; then
|
||||
if [ ! -f ../scylla-server_`cat build/SCYLLA-VERSION-FILE | sed 's/\.rc/~rc/'`-`cat build/SCYLLA-RELEASE-FILE`-ubuntu1_amd64.deb ]; then
|
||||
echo "Build .deb before running build_ami.sh"
|
||||
exit 1
|
||||
fi
|
||||
cp ../scylla-server_`cat build/SCYLLA-VERSION-FILE | sed 's/\.rc/~rc/'`-`cat build/SCYLLA-RELEASE-FILE`-ubuntu1_amd64.deb dist/ami/files/scylla-server_amd64.deb
|
||||
fi
|
||||
if [ ! -f dist/ami/files/scylla-jmx_all.deb ]; then
|
||||
cd build
|
||||
git clone --depth 1 https://github.com/scylladb/scylla-jmx.git
|
||||
cd scylla-jmx
|
||||
sh -x -e dist/ubuntu/build_deb.sh $*
|
||||
cd ../..
|
||||
cp build/scylla-jmx_`cat build/scylla-jmx/build/SCYLLA-VERSION-FILE | sed 's/\.rc/~rc/'`-`cat build/scylla-jmx/build/SCYLLA-RELEASE-FILE`-ubuntu1_all.deb dist/ami/files/scylla-jmx_all.deb
|
||||
fi
|
||||
if [ ! -f dist/ami/files/scylla-tools_all.deb ]; then
|
||||
cd build
|
||||
git clone --depth 1 https://github.com/scylladb/scylla-tools-java.git
|
||||
cd scylla-tools-java
|
||||
sh -x -e dist/ubuntu/build_deb.sh $*
|
||||
cd ../..
|
||||
cp build/scylla-tools_`cat build/scylla-tools-java/build/SCYLLA-VERSION-FILE | sed 's/\.rc/~rc/'`-`cat build/scylla-tools-java/build/SCYLLA-RELEASE-FILE`-ubuntu1_all.deb dist/ami/files/scylla-tools_all.deb
|
||||
fi
|
||||
fi
|
||||
fi
|
||||
|
||||
@@ -69,4 +115,4 @@ if [ ! -d packer ]; then
|
||||
cd -
|
||||
fi
|
||||
|
||||
packer/packer build -var-file=variables.json -var install_args="$INSTALL_ARGS" scylla.json
|
||||
packer/packer build -var-file=variables.json -var install_args="$INSTALL_ARGS" -var region="$REGION" -var source_ami="$AMI" -var ssh_username="$SSH_USERNAME" scylla.json
|
||||
|
||||
27
dist/ami/files/.bash_profile
vendored
27
dist/ami/files/.bash_profile
vendored
@@ -30,7 +30,21 @@ echo 'More documentation available at: '
|
||||
echo ' http://www.scylladb.com/doc/'
|
||||
echo
|
||||
|
||||
if [ "`systemctl is-active scylla-server`" = "active" ]; then
|
||||
. /etc/os-release
|
||||
if [ "$ID" = "ubuntu" ]; then
|
||||
if [ "`initctl status ssh|grep "running, process"`" != "" ]; then
|
||||
STARTED=1
|
||||
else
|
||||
STARTED=0
|
||||
fi
|
||||
else
|
||||
if [ "`systemctl is-active scylla-server`" = "active" ]; then
|
||||
STARTED=1
|
||||
else
|
||||
STARTED=0
|
||||
fi
|
||||
fi
|
||||
if [ $STARTED -eq 1 ]; then
|
||||
tput setaf 4
|
||||
tput bold
|
||||
echo " ScyllaDB is active."
|
||||
@@ -42,6 +56,13 @@ else
|
||||
echo " ScyllaDB is not started!"
|
||||
tput sgr0
|
||||
echo "Please wait for startup. To see status of ScyllaDB, run "
|
||||
echo " 'systemctl status scylla-server'"
|
||||
echo
|
||||
if [ "$ID" = "ubuntu" ]; then
|
||||
echo " 'initctl status scylla-server'"
|
||||
echo "and"
|
||||
echo " 'cat /var/log/upstart/scylla-server.log'"
|
||||
echo
|
||||
else
|
||||
echo " 'systemctl status scylla-server'"
|
||||
echo
|
||||
fi
|
||||
fi
|
||||
|
||||
2
dist/ami/files/scylla-ami
vendored
2
dist/ami/files/scylla-ami
vendored
Submodule dist/ami/files/scylla-ami updated: 84bcd0df6d...7019088d7b
14
dist/ami/scylla.json
vendored
14
dist/ami/scylla.json
vendored
@@ -8,10 +8,10 @@
|
||||
"security_group_id": "{{user `security_group_id`}}",
|
||||
"region": "{{user `region`}}",
|
||||
"associate_public_ip_address": "{{user `associate_public_ip_address`}}",
|
||||
"source_ami": "ami-f3102499",
|
||||
"source_ami": "{{user `source_ami`}}",
|
||||
"user_data_file": "user_data.txt",
|
||||
"instance_type": "{{user `instance_type`}}",
|
||||
"ssh_username": "centos",
|
||||
"ssh_username": "{{user `ssh_username`}}",
|
||||
"ssh_timeout": "5m",
|
||||
"ami_name": "{{user `ami_prefix`}}scylla_{{isotime | clean_ami_name}}",
|
||||
"enhanced_networking": true,
|
||||
@@ -62,17 +62,17 @@
|
||||
{
|
||||
"type": "file",
|
||||
"source": "files/",
|
||||
"destination": "/home/centos/"
|
||||
"destination": "/home/{{user `ssh_username`}}/"
|
||||
},
|
||||
{
|
||||
"type": "file",
|
||||
"source": "../../scripts/scylla_install_pkg",
|
||||
"destination": "/home/centos/scylla_install_pkg"
|
||||
"destination": "/home/{{user `ssh_username`}}/scylla_install_pkg"
|
||||
},
|
||||
{
|
||||
"type": "shell",
|
||||
"inline": [
|
||||
"sudo /home/centos/scylla-ami/scylla_install_ami {{ user `install_args` }}"
|
||||
"sudo /home/{{user `ssh_username`}}/scylla-ami/scylla_install_ami {{ user `install_args` }}"
|
||||
]
|
||||
}
|
||||
],
|
||||
@@ -85,6 +85,8 @@
|
||||
"associate_public_ip_address": "",
|
||||
"instance_type": "",
|
||||
"install_args": "",
|
||||
"ami_prefix": ""
|
||||
"ami_prefix": "",
|
||||
"source_ami": "",
|
||||
"ssh_username": ""
|
||||
}
|
||||
}
|
||||
|
||||
7
dist/common/collectd.d/scylla.conf
vendored
7
dist/common/collectd.d/scylla.conf
vendored
@@ -1,5 +1,12 @@
|
||||
LoadPlugin network
|
||||
LoadPlugin unixsock
|
||||
|
||||
# dummy write_graphite to silence noisy warning
|
||||
LoadPlugin network
|
||||
<Plugin "network">
|
||||
Server "127.0.0.1 65534"
|
||||
</Plugin>
|
||||
|
||||
<Plugin network>
|
||||
Listen "127.0.0.1" "25826"
|
||||
</Plugin>
|
||||
|
||||
25
dist/common/scripts/scylla_bootparam_setup
vendored
25
dist/common/scripts/scylla_bootparam_setup
vendored
@@ -2,6 +2,25 @@
|
||||
#
|
||||
# Copyright (C) 2015 ScyllaDB
|
||||
|
||||
print_usage() {
|
||||
echo "scylla_bootparam_setup --ami"
|
||||
echo " --ami setup AMI instance"
|
||||
exit 1
|
||||
}
|
||||
|
||||
AMI_OPT=0
|
||||
while [ $# -gt 0 ]; do
|
||||
case "$1" in
|
||||
"--ami")
|
||||
AMI_OPT=1
|
||||
shift 1
|
||||
;;
|
||||
*)
|
||||
print_usage
|
||||
;;
|
||||
esac
|
||||
done
|
||||
|
||||
. /etc/os-release
|
||||
|
||||
if [ ! -f /etc/default/grub ]; then
|
||||
@@ -14,7 +33,11 @@ if [ "`grep hugepagesz /etc/default/grub`" != "" ] || [ "`grep hugepages /etc/de
|
||||
sed -e "s#hugepages=[0-9]* ##" /etc/default/grub > /tmp/grub
|
||||
mv /tmp/grub /etc/default/grub
|
||||
fi
|
||||
sed -e "s#^GRUB_CMDLINE_LINUX=\"#GRUB_CMDLINE_LINUX=\"hugepagesz=2M hugepages=$NR_HUGEPAGES #" /etc/default/grub > /tmp/grub
|
||||
if [ $AMI_OPT -eq 1 ]; then
|
||||
sed -e "s#^GRUB_CMDLINE_LINUX=\"#GRUB_CMDLINE_LINUX=\"clocksource=tsc tsc=reliable hugepagesz=2M hugepages=$NR_HUGEPAGES #" /etc/default/grub > /tmp/grub
|
||||
else
|
||||
sed -e "s#^GRUB_CMDLINE_LINUX=\"#GRUB_CMDLINE_LINUX=\"hugepagesz=2M hugepages=$NR_HUGEPAGES #" /etc/default/grub > /tmp/grub
|
||||
fi
|
||||
mv /tmp/grub /etc/default/grub
|
||||
if [ "$ID" = "ubuntu" ]; then
|
||||
grub-mkconfig -o /boot/grub/grub.cfg
|
||||
|
||||
31
dist/common/scripts/scylla_dev_mode_setup
vendored
Executable file
31
dist/common/scripts/scylla_dev_mode_setup
vendored
Executable file
@@ -0,0 +1,31 @@
|
||||
#!/bin/sh -e
|
||||
#
|
||||
# Copyright (C) 2015 ScyllaDB
|
||||
|
||||
# Writes the developer-mode setting to /etc/scylla.d/dev-mode.conf as
# DEV_MODE=--developer-mode=<0|1>.

# Print the usage text and exit with status 1.
# NOTE(review): the usage line shows the "--developer-mode=[0|1]" form, but the
# option parser below only matches "--developer-mode" with the value passed as a
# separate argument; "--developer-mode=1" would fall into the "*" branch.
print_usage() {
|
||||
echo "scylla_developer_mode_setup --developer-mode=[0|1]"
|
||||
echo " --developer-mode enable/disable developer mode"
|
||||
exit 1
|
||||
}
|
||||
|
||||
# Empty until set by the option parser.
DEV_MODE=
|
||||
# Parse the command line; any unrecognized argument prints the usage and exits.
while [ $# -gt 0 ]; do
|
||||
case "$1" in
|
||||
"--developer-mode")
|
||||
# The value is taken from the following argument.
DEV_MODE=$2
|
||||
shift 2
|
||||
;;
|
||||
*)
|
||||
print_usage
|
||||
;;
|
||||
esac
|
||||
done
|
||||
|
||||
# The option is mandatory.
if [ "$DEV_MODE" = "" ]; then
|
||||
print_usage
|
||||
fi
|
||||
# Only "0" and "1" are accepted as values.
if [ "$DEV_MODE" != "0" ] && [ "$DEV_MODE" != "1" ]; then
|
||||
print_usage
|
||||
fi
|
||||
|
||||
# Overwrite the configuration file with the validated setting.
echo "DEV_MODE=--developer-mode=$DEV_MODE" > /etc/scylla.d/dev-mode.conf
|
||||
73
dist/common/scripts/scylla_io_setup
vendored
73
dist/common/scripts/scylla_io_setup
vendored
@@ -1,31 +1,53 @@
|
||||
#!/bin/sh
|
||||
|
||||
is_ami() {
|
||||
if [ "`dmidecode --string system-version | grep \.amazon`" != "" ] && \
|
||||
[ "`curl http://169.254.169.254/latest/meta-data/ami-id | grep ami-`" != "" ]; then
|
||||
echo 1
|
||||
else
|
||||
echo 0
|
||||
fi
|
||||
print_usage() {
|
||||
echo "scylla_io_setup --ami"
|
||||
echo " --ami setup AMI instance"
|
||||
exit 1
|
||||
}
|
||||
|
||||
is_supported_instance_type() {
|
||||
TYPE=`curl http://169.254.169.254/latest/meta-data/instance-type|cut -d . -f 1`
|
||||
case $TYPE in
|
||||
"m3"|"c3"|"i2") echo 1;;
|
||||
*) echo 0;;
|
||||
AMI_OPT=0
|
||||
while [ $# -gt 0 ]; do
|
||||
case "$1" in
|
||||
"--ami")
|
||||
AMI_OPT=1
|
||||
shift 1
|
||||
;;
|
||||
*)
|
||||
print_usage
|
||||
;;
|
||||
esac
|
||||
}
|
||||
done
|
||||
|
||||
|
||||
is_developer_mode() {
|
||||
echo $SCYLLA_ARGS|egrep -c "\-\-developer-mode(\s+|=)1"
|
||||
cat /etc/scylla.d/dev-mode.conf|egrep -c "\-\-developer-mode(\s+|=)(1|true)"
|
||||
}
|
||||
|
||||
if [ ! -f /etc/scylla/io_configured ] && [ `is_developer_mode` -eq 0 ]; then
|
||||
if [ `is_ami` -eq 1 ] && [ `is_supported_instance_type` -eq 1 ]; then
|
||||
NR_CPU=`cat /proc/cpuinfo |grep processor|wc -l`
|
||||
NR_DISKS=`curl http://169.254.169.254/latest/meta-data/block-device-mapping/|grep ephemeral|wc -l`
|
||||
output_to_user()
|
||||
{
|
||||
echo "$1"
|
||||
logger -p user.err "$1"
|
||||
}
|
||||
|
||||
. /etc/os-release
|
||||
if [ "$NAME" = "Ubuntu" ]; then
|
||||
. /etc/default/scylla-server
|
||||
else
|
||||
. /etc/sysconfig/scylla-server
|
||||
fi
|
||||
|
||||
if [ `is_developer_mode` -eq 0 ]; then
|
||||
SMP=`echo $SCYLLA_ARGS|grep smp|sed -e "s/^.*smp\(\s\+\|=\)\([0-9]*\).*$/\2/"`
|
||||
CPUSET=`echo $SCYLLA_ARGS|grep cpuset|sed -e "s/^.*\(--cpuset\(\s\+\|=\)[0-9\-]*\).*$/\1/"`
|
||||
if [ $AMI_OPT -eq 1 ]; then
|
||||
NR_CPU=`cat /proc/cpuinfo |grep processor|wc -l`
|
||||
NR_DISKS=`lsblk --list --nodeps --noheadings | grep -v xvda | grep xvd | wc -l`
|
||||
TYPE=`curl http://169.254.169.254/latest/meta-data/instance-type|cut -d . -f 1`
|
||||
|
||||
if [ "$SMP" != "" ]; then
|
||||
NR_CPU=$SMP
|
||||
fi
|
||||
NR_SHARDS=$NR_CPU
|
||||
if [ $NR_CPU -ge 8 ] && [ "$SET_NIC" = "no" ]; then
|
||||
NR_SHARDS=$((NR_CPU - 1))
|
||||
@@ -39,17 +61,20 @@ if [ ! -f /etc/scylla/io_configured ] && [ `is_developer_mode` -eq 0 ]; then
|
||||
NR_IO_QUEUES=$(($NR_REQS / 4))
|
||||
fi
|
||||
|
||||
NR_IO_QUEUES=$((NR_IO_QUEUES>NR_SHARDS?NR_SHARDS:NR_IO_QUEUES))
|
||||
NR_REQS=$(($(($NR_REQS / $NR_IO_QUEUES)) * $NR_IO_QUEUES))
|
||||
if [ "$TYPE" = "i2" ]; then
|
||||
NR_REQS=$(($NR_REQS * 2))
|
||||
fi
|
||||
|
||||
echo "SEASTAR_IO=\"--num-io-queues $NR_IO_QUEUES --max-io-requests $NR_REQS\"" > /etc/scylla.d/io.conf
|
||||
else
|
||||
iotune --evaluation-directory /var/lib/scylla --format envfile --options-file /etc/scylla.d/io.conf
|
||||
iotune --evaluation-directory /var/lib/scylla --format envfile --options-file /etc/scylla.d/io.conf $CPUSET
|
||||
if [ $? -ne 0 ]; then
|
||||
logger -p user.err "/var/lib/scylla did not pass validation tests, it may not be on XFS and/or has limited disk space."
|
||||
logger -p user.err "This is a non-supported setup, and performance is expected to be very bad."
|
||||
logger -p user.err "For better performance, placing your data on XFS-formatted directories is required."
|
||||
logger -p user.err " To override this error, see the developer_mode configuration option."
|
||||
output_to_user "/var/lib/scylla did not pass validation tests, it may not be on XFS and/or has limited disk space."
|
||||
output_to_user "This is a non-supported setup, and performance is expected to be very bad."
|
||||
output_to_user "For better performance, placing your data on XFS-formatted directories is required."
|
||||
output_to_user " To override this error, see the developer_mode configuration option."
|
||||
fi
|
||||
fi
|
||||
touch /etc/scylla/io_configured
|
||||
fi
|
||||
|
||||
2
dist/common/scripts/scylla_raid_setup
vendored
2
dist/common/scripts/scylla_raid_setup
vendored
@@ -49,7 +49,7 @@ fi
|
||||
|
||||
. /etc/os-release
|
||||
if [ "$NAME" = "Ubuntu" ]; then
|
||||
apt-get -y install mdadm xfsprogs
|
||||
env DEBIAN_FRONTEND=noninteractive apt-get -y install mdadm xfsprogs
|
||||
else
|
||||
yum -y install mdadm xfsprogs
|
||||
fi
|
||||
|
||||
38
dist/common/scripts/scylla_setup
vendored
38
dist/common/scripts/scylla_setup
vendored
@@ -8,11 +8,12 @@ if [ "`id -u`" -ne 0 ]; then
|
||||
fi
|
||||
|
||||
print_usage() {
|
||||
echo "scylla_setup --disks /dev/hda,/dev/hdb... --nic eth0 --ntp-domain centos --ami --no-enable-service --no-selinux-setup --no-bootparam-setup --no-ntp-setup --no-raid-setup --no-coredump-setup --no-sysconfig-setup"
|
||||
echo "scylla_setup --disks /dev/hda,/dev/hdb... --nic eth0 --ntp-domain centos --ami --developer-mode --no-enable-service --no-selinux-setup --no-bootparam-setup --no-ntp-setup --no-raid-setup --no-coredump-setup --no-sysconfig-setup"
|
||||
echo " --disks specify disks for RAID"
|
||||
echo " --nic specify NIC"
|
||||
echo " --ntp-domain specify NTP domain"
|
||||
echo " --ami setup AMI instance"
|
||||
echo " --developer-mode enable developer mode"
|
||||
echo " --no-enable-service skip enabling service"
|
||||
echo " --no-selinux-setup skip selinux setup"
|
||||
echo " --no-bootparam-setup skip bootparam setup"
|
||||
@@ -20,6 +21,7 @@ print_usage() {
|
||||
echo " --no-raid-setup skip raid setup"
|
||||
echo " --no-coredump-setup skip coredump setup"
|
||||
echo " --no-sysconfig-setup skip sysconfig setup"
|
||||
echo " --no-io-setup skip IO configuration setup"
|
||||
exit 1
|
||||
}
|
||||
|
||||
@@ -40,6 +42,7 @@ interactive_ask_service() {
|
||||
}
|
||||
|
||||
AMI=0
|
||||
DEV_MODE=0
|
||||
ENABLE_SERVICE=1
|
||||
SELINUX_SETUP=1
|
||||
BOOTPARAM_SETUP=1
|
||||
@@ -47,6 +50,7 @@ NTP_SETUP=1
|
||||
RAID_SETUP=1
|
||||
COREDUMP_SETUP=1
|
||||
SYSCONFIG_SETUP=1
|
||||
IO_SETUP=1
|
||||
|
||||
if [ $# -ne 0 ]; then
|
||||
INTERACTIVE=0
|
||||
@@ -72,6 +76,10 @@ while [ $# -gt 0 ]; do
|
||||
AMI=1
|
||||
shift 1
|
||||
;;
|
||||
"--developer-mode")
|
||||
DEV_MODE=1
|
||||
shift 1
|
||||
;;
|
||||
"--no-enable-service")
|
||||
ENABLE_SERVICE=0
|
||||
shift 1
|
||||
@@ -100,6 +108,10 @@ while [ $# -gt 0 ]; do
|
||||
SYSCONFIG_SETUP=0
|
||||
shift 1
|
||||
;;
|
||||
"--no-io-setup")
|
||||
IO_SETUP=0
|
||||
shift 1
|
||||
;;
|
||||
"-h" | "--help")
|
||||
print_usage
|
||||
shift 1
|
||||
@@ -122,9 +134,9 @@ if [ $INTERACTIVE -eq 1 ]; then
|
||||
fi
|
||||
if [ $ENABLE_SERVICE -eq 1 ]; then
|
||||
if [ "$ID" = "fedora" ] || [ "$ID" = "centos" ]; then
|
||||
systemctl enable scylla-io-setup.service
|
||||
systemctl enable scylla-server.service
|
||||
systemctl enable scylla-jmx.service
|
||||
systemctl enable collectd.service
|
||||
fi
|
||||
fi
|
||||
|
||||
@@ -162,21 +174,21 @@ if [ $INTERACTIVE -eq 1 ]; then
|
||||
if [ $RAID_SETUP -eq 1 ]; then
|
||||
echo "Please select disks from following list: "
|
||||
while true; do
|
||||
lsblk -d -i -n -p -r|awk '{print $1}'|sed -e ':loop;N;$!b loop;s/\n/ /g'
|
||||
lsblk -d -i -n -r|awk '{print $1}'|sed -e ':loop;N;$!b loop;s/\n/ /g'
|
||||
echo "type 'done' to finish selection. selected: $DISKS"
|
||||
echo -n "> "
|
||||
read dsk
|
||||
if [ "$dsk" = "done" ]; then
|
||||
break
|
||||
fi
|
||||
if [ -e $dsk ]; then
|
||||
if [ -e /dev/$dsk ]; then
|
||||
if [ "$DISKS" = "" ]; then
|
||||
DISKS=$dsk
|
||||
DISKS=/dev/$dsk
|
||||
else
|
||||
DISKS="$DISKS,$dsk"
|
||||
DISKS="$DISKS,/dev/$dsk"
|
||||
fi
|
||||
else
|
||||
echo "$dsk not found"
|
||||
echo "/dev/$dsk not found"
|
||||
fi
|
||||
done
|
||||
fi
|
||||
@@ -212,6 +224,18 @@ if [ $INTERACTIVE -eq 1 ]; then
|
||||
done
|
||||
fi
|
||||
fi
|
||||
|
||||
if [ $INTERACTIVE -eq 1 ]; then
|
||||
interactive_ask_service "Do you want to setup IO configuration?" &&:
|
||||
IO_SETUP=$?
|
||||
fi
|
||||
if [ $IO_SETUP -eq 1 ]; then
|
||||
/usr/lib/scylla/scylla_io_setup
|
||||
fi
|
||||
|
||||
if [ $SYSCONFIG_SETUP -eq 1 ]; then
|
||||
/usr/lib/scylla/scylla_sysconfig_setup --nic $NIC
|
||||
fi
|
||||
if [ $DEV_MODE -eq 1 ]; then
|
||||
/usr/lib/scylla/scylla_dev_mode_setup --developer-mode 1
|
||||
fi
|
||||
|
||||
2
dist/common/scripts/scylla_sysconfig_setup
vendored
2
dist/common/scripts/scylla_sysconfig_setup
vendored
@@ -76,7 +76,7 @@ echo Setting parameters on $SYSCONFIG/scylla-server
|
||||
ETHDRV=`/usr/lib/scylla/dpdk_nic_bind.py --status | grep if=$NIC | sed -e "s/^.*drv=//" -e "s/ .*$//"`
|
||||
ETHPCIID=`/usr/lib/scylla/dpdk_nic_bind.py --status | grep if=$NIC | awk '{print $1}'`
|
||||
NR_CPU=`cat /proc/cpuinfo |grep processor|wc -l`
|
||||
if [ $NR_CPU -ge 8 ] && [ "$SET_NIC" = "no" ]; then
|
||||
if [ "$AMI" = "yes" ] && [ $NR_CPU -ge 8 ] && [ "$SET_NIC" = "no" ]; then
|
||||
NR=$((NR_CPU - 1))
|
||||
SET_NIC="yes"
|
||||
SCYLLA_ARGS="$SCYLLA_ARGS --cpuset 1-$NR --smp $NR"
|
||||
|
||||
4
dist/common/scylla.d/dev-mode.conf
vendored
Normal file
4
dist/common/scylla.d/dev-mode.conf
vendored
Normal file
@@ -0,0 +1,4 @@
|
||||
# DO NOT EDIT
|
||||
# This file should be automatically configured by scylla_dev_mode_setup
|
||||
#
|
||||
# DEV_MODE=--developer-mode=0
|
||||
2
dist/common/scylla.d/io.conf
vendored
2
dist/common/scylla.d/io.conf
vendored
@@ -1,4 +1,4 @@
|
||||
# DO NOT EDIT
|
||||
# This file should be automatically configure by scylla-io-setup.service
|
||||
# This file should be automatically configured by scylla_io_setup
|
||||
#
|
||||
# SEASTAR_IO="--max-io-requests=1 --num-io-queues=1"
|
||||
|
||||
2
dist/common/sudoers.d/scylla
vendored
2
dist/common/sudoers.d/scylla
vendored
@@ -1 +1 @@
|
||||
scylla ALL=(ALL) NOPASSWD:SETENV: /usr/lib/scylla/scylla_prepare,/usr/lib/scylla/scylla_stop,/usr/lib/scylla/scylla_io_setup
|
||||
scylla ALL=(ALL) NOPASSWD:SETENV: /usr/lib/scylla/scylla_prepare,/usr/lib/scylla/scylla_stop,/usr/lib/scylla/scylla_io_setup,/usr/lib/scylla/scylla-ami/scylla_ami_setup
|
||||
|
||||
1
dist/docker/Dockerfile
vendored
1
dist/docker/Dockerfile
vendored
@@ -4,6 +4,7 @@ MAINTAINER Avi Kivity <avi@cloudius-systems.com>
|
||||
|
||||
RUN yum -y install epel-release
|
||||
ADD scylla.repo /etc/yum.repos.d/
|
||||
RUN yum -y clean expire-cache
|
||||
RUN yum -y update
|
||||
RUN yum -y remove boost-thread boost-system
|
||||
RUN yum -y install scylla-server hostname
|
||||
|
||||
4
dist/redhat/scylla-server.spec.in
vendored
4
dist/redhat/scylla-server.spec.in
vendored
@@ -113,11 +113,9 @@ if [ -f /etc/systemd/coredump.conf ];then
|
||||
/usr/lib/scylla/scylla_coredump_setup
|
||||
fi
|
||||
%systemd_post scylla-server.service
|
||||
%systemd_post scylla-io-setup.service
|
||||
|
||||
%preun
|
||||
%systemd_preun scylla-server.service
|
||||
%systemd_preun scylla-io-setup.service
|
||||
|
||||
%postun
|
||||
%systemd_postun
|
||||
@@ -151,7 +149,6 @@ rm -rf $RPM_BUILD_ROOT
|
||||
%{_docdir}/scylla/ORIGIN
|
||||
%{_docdir}/scylla/licenses/
|
||||
%{_unitdir}/scylla-server.service
|
||||
%{_unitdir}/scylla-io-setup.service
|
||||
%{_bindir}/scylla
|
||||
%{_bindir}/iotune
|
||||
%{_bindir}/scyllatop
|
||||
@@ -165,6 +162,7 @@ rm -rf $RPM_BUILD_ROOT
|
||||
%{_prefix}/lib/scylla/scylla_ntp_setup
|
||||
%{_prefix}/lib/scylla/scylla_selinux_setup
|
||||
%{_prefix}/lib/scylla/scylla_io_setup
|
||||
%{_prefix}/lib/scylla/scylla_dev_mode_setup
|
||||
%{_prefix}/lib/scylla/posix_net_conf.sh
|
||||
%{_prefix}/lib/scylla/dpdk_nic_bind.py
|
||||
%{_prefix}/lib/scylla/dpdk_nic_bind.pyc
|
||||
|
||||
10
dist/redhat/systemd/scylla-io-setup.service
vendored
10
dist/redhat/systemd/scylla-io-setup.service
vendored
@@ -1,10 +0,0 @@
|
||||
[Unit]
|
||||
Description=Scylla IO Setup
|
||||
After=network.target
|
||||
|
||||
[Service]
|
||||
Type=oneshot
|
||||
EnvironmentFile=/etc/sysconfig/scylla-server
|
||||
ExecStart=/usr/lib/scylla/scylla_io_setup
|
||||
RemainAfterExit=yes
|
||||
TimeoutStartSec=1800
|
||||
4
dist/redhat/systemd/scylla-server.service
vendored
4
dist/redhat/systemd/scylla-server.service
vendored
@@ -1,7 +1,5 @@
|
||||
[Unit]
|
||||
Description=Scylla Server
|
||||
After=scylla-io-setup.service
|
||||
Requires=scylla-io-setup.service
|
||||
|
||||
[Service]
|
||||
Type=notify
|
||||
@@ -14,7 +12,7 @@ Environment="HOME=/var/lib/scylla"
|
||||
EnvironmentFile=/etc/sysconfig/scylla-server
|
||||
EnvironmentFile=/etc/scylla.d/*.conf
|
||||
ExecStartPre=/usr/bin/sudo -E /usr/lib/scylla/scylla_prepare
|
||||
ExecStart=/usr/bin/scylla $SCYLLA_ARGS $SEASTAR_IO
|
||||
ExecStart=/usr/bin/scylla $SCYLLA_ARGS $SEASTAR_IO $DEV_MODE
|
||||
ExecStopPost=/usr/bin/sudo -E /usr/lib/scylla/scylla_stop
|
||||
TimeoutStartSec=900
|
||||
KillMode=process
|
||||
|
||||
2
dist/ubuntu/build_deb.sh
vendored
2
dist/ubuntu/build_deb.sh
vendored
@@ -32,7 +32,7 @@ if [ `grep -c $RELEASE dist/ubuntu/supported_release` -lt 1 ]; then
|
||||
fi
|
||||
|
||||
VERSION=$(./SCYLLA-VERSION-GEN)
|
||||
SCYLLA_VERSION=$(cat build/SCYLLA-VERSION-FILE)
|
||||
SCYLLA_VERSION=$(cat build/SCYLLA-VERSION-FILE | sed 's/\.rc/~rc/')
|
||||
SCYLLA_RELEASE=$(cat build/SCYLLA-RELEASE-FILE)
|
||||
echo $VERSION > version
|
||||
./scripts/git-archive-all --extra version --force-submodules --prefix scylla-server ../scylla-server_$SCYLLA_VERSION-$SCYLLA_RELEASE.orig.tar.gz
|
||||
|
||||
4
dist/ubuntu/debian/scylla-server.init
vendored
4
dist/ubuntu/debian/scylla-server.init
vendored
@@ -37,8 +37,10 @@ eval "`grep -v -e "^\s*#" -e "^$" /etc/scylla.d/*.conf|sed -e 's/^/export /'`"
|
||||
|
||||
do_start()
|
||||
{
|
||||
if [ "$AMI" = "yes" ]; then
|
||||
/usr/lib/scylla/scylla-ami/scylla_ami_setup
|
||||
fi
|
||||
/usr/lib/scylla/scylla_prepare
|
||||
/usr/lib/scylla/scylla_io_setup
|
||||
# Return
|
||||
# 0 if daemon has been started
|
||||
# 1 if daemon was already running
|
||||
|
||||
21
dist/ubuntu/debian/scylla-server.upstart
vendored
21
dist/ubuntu/debian/scylla-server.upstart
vendored
@@ -26,19 +26,30 @@ env HOME=/var/lib/scylla
|
||||
|
||||
pre-start script
|
||||
eval "`grep -v -e "^\s*#" -e "^$" /etc/default/scylla-server|sed -e 's/^/export /'`"
|
||||
eval "`grep -v -e "^\s*#" -e "^$" /etc/scylla.d/*.conf|sed -e 's/^/export /'`"
|
||||
. /etc/scylla.d/dev-mode.conf
|
||||
. /etc/scylla.d/io.conf
|
||||
export DEV_MODE
|
||||
export SEASTAR_IO
|
||||
if [ "$AMI" = "yes" ]; then
|
||||
sudo /usr/lib/scylla/scylla-ami/scylla_ami_setup
|
||||
fi
|
||||
sudo /usr/lib/scylla/scylla_prepare
|
||||
sudo /usr/lib/scylla/scylla_io_setup
|
||||
end script
|
||||
|
||||
script
|
||||
eval "`grep -v -e "^\s*#" -e "^$" /etc/default/scylla-server|sed -e 's/^/export /'`"
|
||||
eval "`grep -v -e "^\s*#" -e "^$" /etc/scylla.d/*.conf|sed -e 's/^/export /'`"
|
||||
exec /usr/bin/scylla $SCYLLA_ARGS $SEASTAR_IO
|
||||
. /etc/scylla.d/dev-mode.conf
|
||||
. /etc/scylla.d/io.conf
|
||||
export DEV_MODE
|
||||
export SEASTAR_IO
|
||||
exec /usr/bin/scylla $SCYLLA_ARGS $SEASTAR_IO $DEV_MODE
|
||||
end script
|
||||
|
||||
post-stop script
|
||||
eval "`grep -v -e "^\s*#" -e "^$" /etc/default/scylla-server|sed -e 's/^/export /'`"
|
||||
eval "`grep -v -e "^\s*#" -e "^$" /etc/scylla.d/*.conf|sed -e 's/^/export /'`"
|
||||
. /etc/scylla.d/dev-mode.conf
|
||||
. /etc/scylla.d/io.conf
|
||||
export DEV_MODE
|
||||
export SEASTAR_IO
|
||||
sudo /usr/lib/scylla/scylla_stop
|
||||
end script
|
||||
|
||||
5
dist/ubuntu/rules.in
vendored
5
dist/ubuntu/rules.in
vendored
@@ -35,7 +35,7 @@ override_dh_auto_install:
|
||||
cp $(CURDIR)/dist/common/collectd.d/scylla.conf $(COLLECTD)
|
||||
|
||||
mkdir -p $(SCYLLAD) && \
|
||||
cp $(CURDIR)/dist/common/scylla.d/io.conf $(SCYLLAD)
|
||||
cp $(CURDIR)/dist/common/scylla.d/*.conf $(SCYLLAD)
|
||||
|
||||
mkdir -p $(CONF) && \
|
||||
cp $(CURDIR)/conf/scylla.yaml $(CONF)
|
||||
@@ -72,6 +72,9 @@ override_dh_auto_install:
|
||||
mkdir -p $(CURDIR)/debian/scylla-server/var/lib/scylla/commitlog
|
||||
mkdir -p $(CURDIR)/debian/scylla-server/var/lib/scylla/coredump
|
||||
|
||||
override_dh_installinit:
|
||||
dh_installinit --no-start
|
||||
|
||||
override_dh_strip:
|
||||
dh_strip --dbg-package=scylla-server-dbg
|
||||
%:
|
||||
|
||||
@@ -62,7 +62,12 @@ static const std::map<application_state, sstring> application_state_names = {
|
||||
};
|
||||
|
||||
std::ostream& operator<<(std::ostream& os, const application_state& m) {
|
||||
os << application_state_names.at(m);
|
||||
auto it = application_state_names.find(m);
|
||||
if (it != application_state_names.end()) {
|
||||
os << application_state_names.at(m);
|
||||
} else {
|
||||
os << "UNKNOWN";
|
||||
}
|
||||
return os;
|
||||
}
|
||||
|
||||
|
||||
@@ -20,7 +20,8 @@
|
||||
*/
|
||||
|
||||
namespace gms {
|
||||
enum class application_state:int {STATUS = 0,
|
||||
enum class application_state:int {
|
||||
STATUS = 0,
|
||||
LOAD,
|
||||
SCHEMA,
|
||||
DC,
|
||||
@@ -29,6 +30,7 @@ enum class application_state:int {STATUS = 0,
|
||||
REMOVAL_COORDINATOR,
|
||||
INTERNAL_IP,
|
||||
RPC_ADDRESS,
|
||||
X_11_PADDING,
|
||||
SEVERITY,
|
||||
NET_VERSION,
|
||||
HOST_ID,
|
||||
|
||||
28
main.cc
28
main.cc
@@ -293,9 +293,19 @@ int main(int ac, char** av) {
|
||||
sstring broadcast_rpc_address = cfg->broadcast_rpc_address();
|
||||
|
||||
if (!broadcast_address.empty()) {
|
||||
utils::fb_utilities::set_broadcast_address(broadcast_address);
|
||||
try {
|
||||
utils::fb_utilities::set_broadcast_address(broadcast_address);
|
||||
} catch (...) {
|
||||
startlog.error("Bad configuration: invalid 'broadcast_address': {}: {}", broadcast_address, std::current_exception());
|
||||
throw bad_configuration_error();
|
||||
}
|
||||
} else if (!listen_address.empty()) {
|
||||
utils::fb_utilities::set_broadcast_address(listen_address);
|
||||
try {
|
||||
utils::fb_utilities::set_broadcast_address(listen_address);
|
||||
} catch (...) {
|
||||
startlog.error("Bad configuration: invalid 'listen_address': {}: {}", listen_address, std::current_exception());
|
||||
throw bad_configuration_error();
|
||||
}
|
||||
} else {
|
||||
startlog.error("Bad configuration: neither listen_address nor broadcast_address are defined\n");
|
||||
throw bad_configuration_error();
|
||||
@@ -352,11 +362,14 @@ int main(int ac, char** av) {
|
||||
print("Scylla API server listening on %s:%s ...\n", api_address, api_port);
|
||||
supervisor_notify("initializing storage service");
|
||||
init_storage_service(db).get();
|
||||
api::set_server_storage_service(ctx).get();
|
||||
supervisor_notify("starting per-shard database core");
|
||||
// Note: changed from using a move here, because we want the config object intact.
|
||||
db.start(std::ref(*cfg)).get();
|
||||
engine().at_exit([&db] {
|
||||
// A shared sstable must be compacted by all shards before it can be deleted.
|
||||
// Since we're stopping, that's not going to happen. Cancel those pending
|
||||
// deletions to let anyone waiting on them to continue.
|
||||
sstables::cancel_atomic_deletions();
|
||||
// #293 - do not stop anything - not even db (for real)
|
||||
//return db.stop();
|
||||
// call stop on each db instance, but leave the sharded<database> pointers alive.
|
||||
@@ -422,14 +435,11 @@ int main(int ac, char** av) {
|
||||
, seed_provider
|
||||
, cluster_name
|
||||
, phi).get();
|
||||
api::set_server_gossip(ctx).get();
|
||||
supervisor_notify("starting messaging service");
|
||||
api::set_server_messaging_service(ctx).get();
|
||||
supervisor_notify("starting storage proxy");
|
||||
proxy.start(std::ref(db)).get();
|
||||
// #293 - do not stop anything
|
||||
// engine().at_exit([&proxy] { return proxy.stop(); });
|
||||
api::set_server_storage_proxy(ctx).get();
|
||||
supervisor_notify("starting migration manager");
|
||||
mm.start().get();
|
||||
// #293 - do not stop anything
|
||||
@@ -458,7 +468,6 @@ int main(int ac, char** av) {
|
||||
}
|
||||
return db.load_sstables(proxy);
|
||||
}).get();
|
||||
api::set_server_load_sstable(ctx).get();
|
||||
supervisor_notify("setting up system keyspace");
|
||||
db::system_keyspace::setup(db, qp).get();
|
||||
supervisor_notify("starting commit log");
|
||||
@@ -479,6 +488,11 @@ int main(int ac, char** av) {
|
||||
}
|
||||
}
|
||||
}
|
||||
api::set_server_storage_service(ctx).get();
|
||||
api::set_server_gossip(ctx).get();
|
||||
api::set_server_messaging_service(ctx).get();
|
||||
api::set_server_storage_proxy(ctx).get();
|
||||
api::set_server_load_sstable(ctx).get();
|
||||
supervisor_notify("initializing migration manager RPC verbs");
|
||||
service::get_migration_manager().invoke_on_all([] (auto& mm) {
|
||||
mm.init_messaging_service();
|
||||
|
||||
@@ -360,6 +360,7 @@ void messaging_service::cache_preferred_ip(gms::inet_address ep, gms::inet_addre
|
||||
}
|
||||
|
||||
shared_ptr<messaging_service::rpc_protocol_client_wrapper> messaging_service::get_rpc_client(messaging_verb verb, msg_addr id) {
|
||||
assert(!_stopping);
|
||||
auto idx = get_rpc_client_idx(verb);
|
||||
auto it = _clients[idx].find(id);
|
||||
|
||||
@@ -409,6 +410,13 @@ shared_ptr<messaging_service::rpc_protocol_client_wrapper> messaging_service::ge
|
||||
}
|
||||
|
||||
void messaging_service::remove_rpc_client_one(clients_map& clients, msg_addr id, bool dead_only) {
|
||||
if (_stopping) {
|
||||
// if messaging service is in the process of being stopped there is no need to
|
||||
// stop and remove connection here since they are being stopped already
|
||||
// and we'll just interfere
|
||||
return;
|
||||
}
|
||||
|
||||
auto it = clients.find(id);
|
||||
if (it != clients.end() && (!dead_only || it->second.rpc_client->error())) {
|
||||
auto client = std::move(it->second.rpc_client);
|
||||
@@ -442,8 +450,12 @@ std::unique_ptr<messaging_service::rpc_protocol_wrapper>& messaging_service::rpc
|
||||
// Send a message for verb
|
||||
template <typename MsgIn, typename... MsgOut>
|
||||
auto send_message(messaging_service* ms, messaging_verb verb, msg_addr id, MsgOut&&... msg) {
|
||||
auto rpc_client_ptr = ms->get_rpc_client(verb, id);
|
||||
auto rpc_handler = ms->rpc()->make_client<MsgIn(MsgOut...)>(verb);
|
||||
if (ms->is_stopping()) {
|
||||
using futurator = futurize<std::result_of_t<decltype(rpc_handler)(rpc_protocol::client&, MsgOut...)>>;
|
||||
return futurator::make_exception_future(rpc::closed_error());
|
||||
}
|
||||
auto rpc_client_ptr = ms->get_rpc_client(verb, id);
|
||||
auto& rpc_client = *rpc_client_ptr;
|
||||
return rpc_handler(rpc_client, std::forward<MsgOut>(msg)...).then_wrapped([ms = ms->shared_from_this(), id, verb, rpc_client_ptr = std::move(rpc_client_ptr)] (auto&& f) {
|
||||
try {
|
||||
@@ -467,8 +479,12 @@ auto send_message(messaging_service* ms, messaging_verb verb, msg_addr id, MsgOu
|
||||
// TODO: Remove duplicated code in send_message
|
||||
template <typename MsgIn, typename Timeout, typename... MsgOut>
|
||||
auto send_message_timeout(messaging_service* ms, messaging_verb verb, msg_addr id, Timeout timeout, MsgOut&&... msg) {
|
||||
auto rpc_client_ptr = ms->get_rpc_client(verb, id);
|
||||
auto rpc_handler = ms->rpc()->make_client<MsgIn(MsgOut...)>(verb);
|
||||
if (ms->is_stopping()) {
|
||||
using futurator = futurize<std::result_of_t<decltype(rpc_handler)(rpc_protocol::client&, MsgOut...)>>;
|
||||
return futurator::make_exception_future(rpc::closed_error());
|
||||
}
|
||||
auto rpc_client_ptr = ms->get_rpc_client(verb, id);
|
||||
auto& rpc_client = *rpc_client_ptr;
|
||||
return rpc_handler(rpc_client, timeout, std::forward<MsgOut>(msg)...).then_wrapped([ms = ms->shared_from_this(), id, verb, rpc_client_ptr = std::move(rpc_client_ptr)] (auto&& f) {
|
||||
try {
|
||||
@@ -534,7 +550,7 @@ auto send_message_timeout_and_retry(messaging_service* ms, messaging_verb verb,
|
||||
throw;
|
||||
}
|
||||
});
|
||||
}).then([] (MsgInTuple result) {
|
||||
}).then([ms = ms->shared_from_this()] (MsgInTuple result) {
|
||||
return futurize<MsgIn>::from_tuple(std::move(result));
|
||||
});
|
||||
});
|
||||
|
||||
33
mutation.cc
33
mutation.cc
@@ -126,16 +126,37 @@ bool mutation::operator!=(const mutation& m) const {
|
||||
return !(*this == m);
|
||||
}
|
||||
|
||||
void
|
||||
mutation::query(query::result::builder& builder,
|
||||
const query::partition_slice& slice,
|
||||
gc_clock::time_point now,
|
||||
uint32_t row_limit) &&
|
||||
{
|
||||
auto pb = builder.add_partition(*schema(), key());
|
||||
auto is_reversed = slice.options.contains<query::partition_slice::option::reversed>();
|
||||
mutation_partition& p = partition();
|
||||
p.compact_for_query(*schema(), now, slice.row_ranges(*schema(), key()), is_reversed, row_limit);
|
||||
p.query_compacted(pb, *schema(), row_limit);
|
||||
}
|
||||
|
||||
query::result
|
||||
mutation::query(const query::partition_slice& slice, query::result_request request,
|
||||
gc_clock::time_point now, uint32_t row_limit) const
|
||||
mutation::query(const query::partition_slice& slice,
|
||||
query::result_request request,
|
||||
gc_clock::time_point now, uint32_t row_limit) &&
|
||||
{
|
||||
query::result::builder builder(slice, request);
|
||||
auto pb = builder.add_partition(*schema(), key());
|
||||
partition().query(pb, *schema(), now, row_limit);
|
||||
std::move(*this).query(builder, slice, now, row_limit);
|
||||
return builder.build();
|
||||
}
|
||||
|
||||
query::result
|
||||
mutation::query(const query::partition_slice& slice,
|
||||
query::result_request request,
|
||||
gc_clock::time_point now, uint32_t row_limit) const&
|
||||
{
|
||||
return mutation(*this).query(slice, request, now, row_limit);
|
||||
}
|
||||
|
||||
size_t
|
||||
mutation::live_row_count(gc_clock::time_point query_time) const {
|
||||
return partition().live_row_count(*schema(), query_time);
|
||||
@@ -186,3 +207,7 @@ void mutation::apply(mutation&& m) {
|
||||
void mutation::apply(const mutation& m) {
|
||||
partition().apply(*schema(), m.partition(), *m.schema());
|
||||
}
|
||||
|
||||
mutation& mutation::operator=(const mutation& m) {
|
||||
return *this = mutation(m);
|
||||
}
|
||||
|
||||
21
mutation.hh
21
mutation.hh
@@ -60,9 +60,9 @@ public:
|
||||
mutation(const mutation& m)
|
||||
: _ptr(std::make_unique<data>(schema_ptr(m.schema()), dht::decorated_key(m.decorated_key()), m.partition()))
|
||||
{ }
|
||||
|
||||
mutation(mutation&&) = default;
|
||||
mutation& operator=(mutation&& x) = default;
|
||||
mutation& operator=(const mutation& m);
|
||||
|
||||
void set_static_cell(const column_definition& def, atomic_cell_or_collection&& value);
|
||||
void set_static_cell(const bytes& name, const data_value& value, api::timestamp_type timestamp, ttl_opt ttl = {});
|
||||
@@ -104,8 +104,23 @@ public:
|
||||
bool operator!=(const mutation&) const;
|
||||
public:
|
||||
// The supplied partition_slice must be governed by this mutation's schema
|
||||
query::result query(const query::partition_slice&, query::result_request request = query::result_request::only_result,
|
||||
gc_clock::time_point now = gc_clock::now(), uint32_t row_limit = query::max_rows) const;
|
||||
query::result query(const query::partition_slice&,
|
||||
query::result_request request = query::result_request::only_result,
|
||||
gc_clock::time_point now = gc_clock::now(),
|
||||
uint32_t row_limit = query::max_rows) &&;
|
||||
|
||||
// The supplied partition_slice must be governed by this mutation's schema
|
||||
// FIXME: Slower than the r-value version
|
||||
query::result query(const query::partition_slice&,
|
||||
query::result_request request = query::result_request::only_result,
|
||||
gc_clock::time_point now = gc_clock::now(),
|
||||
uint32_t row_limit = query::max_rows) const&;
|
||||
|
||||
// The supplied partition_slice must be governed by this mutation's schema
|
||||
void query(query::result::builder& builder,
|
||||
const query::partition_slice& slice,
|
||||
gc_clock::time_point now = gc_clock::now(),
|
||||
uint32_t row_limit = query::max_rows) &&;
|
||||
|
||||
// See mutation_partition::live_row_count()
|
||||
size_t live_row_count(gc_clock::time_point query_time = gc_clock::time_point::min()) const;
|
||||
|
||||
@@ -20,12 +20,14 @@
|
||||
*/
|
||||
|
||||
#include <boost/range/adaptor/reversed.hpp>
|
||||
#include <seastar/util/defer.hh>
|
||||
#include "mutation_partition.hh"
|
||||
#include "mutation_partition_applier.hh"
|
||||
#include "converting_mutation_partition_applier.hh"
|
||||
#include "partition_builder.hh"
|
||||
#include "query-result-writer.hh"
|
||||
#include "atomic_cell_hash.hh"
|
||||
#include "reversibly_mergeable.hh"
|
||||
|
||||
template<bool reversed>
|
||||
struct reversal_traits;
|
||||
@@ -57,6 +59,11 @@ struct reversal_traits<false> {
|
||||
{
|
||||
return r;
|
||||
}
|
||||
|
||||
template <typename Container>
|
||||
static typename Container::iterator maybe_reverse(Container&, typename Container::iterator r) {
|
||||
return r;
|
||||
}
|
||||
};
|
||||
|
||||
template<>
|
||||
@@ -89,8 +96,116 @@ struct reversal_traits<true> {
|
||||
using reverse_iterator = typename Container::reverse_iterator;
|
||||
return boost::make_iterator_range(reverse_iterator(r.end()), reverse_iterator(r.begin()));
|
||||
}
|
||||
|
||||
template <typename Container>
|
||||
static typename Container::reverse_iterator maybe_reverse(Container&, typename Container::iterator r) {
|
||||
return typename Container::reverse_iterator(r);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
//
|
||||
// apply_reversibly_intrusive_set() and revert_intrusive_set() implement ReversiblyMergeable
|
||||
// for a boost::intrusive_set<> container of ReversiblyMergeable entries.
|
||||
//
|
||||
// See reversibly_mergeable.hh
|
||||
//
|
||||
// Requirements:
|
||||
// - entry has distinct key and value states
|
||||
// - entries are ordered only by key in the container
|
||||
// - entry can have an empty value
|
||||
// - presence of an entry with an empty value doesn't affect equality of the containers
|
||||
// - E::empty() returns true iff the value is empty
|
||||
// - E(e.key()) creates an entry with empty value but the same key as that of e.
|
||||
//
|
||||
// Implementation of ReversiblyMergeable for the entry's value is provided via Apply and Revert functors.
|
||||
//
|
||||
// ReversiblyMergeable is constructed assuming the following properties of the 'apply' operation
|
||||
// on containers:
|
||||
//
|
||||
// apply([{k1, v1}], [{k1, v2}]) = [{k1, apply(v1, v2)}]
|
||||
// apply([{k1, v1}], [{k2, v2}]) = [{k1, v1}, {k2, v2}]
|
||||
//
|
||||
|
||||
// revert for apply_reversibly_intrusive_set()
|
||||
template<typename Container, typename Revert = default_reverter<typename Container::value_type>>
|
||||
void revert_intrusive_set_range(Container& dst, Container& src,
|
||||
typename Container::iterator start,
|
||||
typename Container::iterator end,
|
||||
Revert&& revert = Revert()) noexcept
|
||||
{
|
||||
using value_type = typename Container::value_type;
|
||||
auto deleter = current_deleter<value_type>();
|
||||
while (start != end) {
|
||||
auto& e = *start;
|
||||
// lower_bound() can allocate if linearization is required but it should have
|
||||
// been already performed by the lower_bound() invocation in apply_reversibly_intrusive_set() and
|
||||
// stored in the linearization context.
|
||||
auto i = dst.find(e);
|
||||
assert(i != dst.end());
|
||||
value_type& dst_e = *i;
|
||||
|
||||
if (e.empty()) {
|
||||
dst.erase(i);
|
||||
start = src.erase_and_dispose(start, deleter);
|
||||
start = src.insert_before(start, dst_e);
|
||||
} else {
|
||||
revert(dst_e, e);
|
||||
}
|
||||
|
||||
++start;
|
||||
}
|
||||
}
|
||||
|
||||
template<typename Container, typename Revert = default_reverter<typename Container::value_type>>
|
||||
void revert_intrusive_set(Container& dst, Container& src, Revert&& revert = Revert()) noexcept {
|
||||
revert_intrusive_set_range(dst, src, src.begin(), src.end(), std::forward<Revert>(revert));
|
||||
}
|
||||
|
||||
// Applies src onto dst. See comment above revert_intrusive_set_range() for more details.
|
||||
//
|
||||
// Returns an object which upon going out of scope, unless cancel() is called on it,
|
||||
// reverts the application by calling revert_intrusive_set(). The references to containers
|
||||
// must be stable as long as the returned object is live.
|
||||
template<typename Container,
|
||||
typename Apply = default_reversible_applier<typename Container::value_type>,
|
||||
typename Revert = default_reverter<typename Container::value_type>>
|
||||
auto apply_reversibly_intrusive_set(Container& dst, Container& src, Apply&& apply = Apply(), Revert&& revert = Revert()) {
|
||||
using value_type = typename Container::value_type;
|
||||
auto src_i = src.begin();
|
||||
try {
|
||||
while (src_i != src.end()) {
|
||||
value_type& src_e = *src_i;
|
||||
|
||||
// neutral entries will be given special meaning for the purpose of revert, so
|
||||
// get rid of empty rows from the input as if they were not there. This doesn't change
|
||||
// the value of src.
|
||||
if (src_e.empty()) {
|
||||
src_i = src.erase_and_dispose(src_i, current_deleter<value_type>());
|
||||
continue;
|
||||
}
|
||||
|
||||
auto i = dst.lower_bound(src_e);
|
||||
if (i == dst.end() || dst.key_comp()(src_e, *i)) {
|
||||
// Construct neutral entry which will represent missing dst entry for revert.
|
||||
value_type* empty_e = current_allocator().construct<value_type>(src_e.key());
|
||||
[&] () noexcept {
|
||||
src_i = src.erase(src_i);
|
||||
src_i = src.insert_before(src_i, *empty_e);
|
||||
dst.insert_before(i, src_e);
|
||||
}();
|
||||
} else {
|
||||
apply(*i, src_e);
|
||||
}
|
||||
++src_i;
|
||||
}
|
||||
return defer([&dst, &src, revert] { revert_intrusive_set(dst, src, revert); });
|
||||
} catch (...) {
|
||||
revert_intrusive_set_range(dst, src, src.begin(), src_i, revert);
|
||||
throw;
|
||||
}
|
||||
}
|
||||
|
||||
mutation_partition::mutation_partition(const mutation_partition& x)
|
||||
: _tombstone(x._tombstone)
|
||||
, _static_row(x._static_row)
|
||||
@@ -134,29 +249,12 @@ mutation_partition::apply(const schema& s, const mutation_partition& p, const sc
|
||||
if (s.version() != p_schema.version()) {
|
||||
auto p2 = p;
|
||||
p2.upgrade(p_schema, s);
|
||||
apply(s, std::move(p2), s);
|
||||
apply(s, std::move(p2));
|
||||
return;
|
||||
}
|
||||
|
||||
_tombstone.apply(p._tombstone);
|
||||
|
||||
for (auto&& e : p._row_tombstones) {
|
||||
apply_row_tombstone(s, e.prefix(), e.t());
|
||||
}
|
||||
|
||||
_static_row.merge(s, column_kind::static_column, p._static_row);
|
||||
|
||||
for (auto&& entry : p._rows) {
|
||||
auto i = _rows.find(entry);
|
||||
if (i == _rows.end()) {
|
||||
auto e = current_allocator().construct<rows_entry>(entry);
|
||||
_rows.insert(i, *e);
|
||||
} else {
|
||||
i->row().apply(entry.row().deleted_at());
|
||||
i->row().apply(entry.row().marker());
|
||||
i->row().cells().merge(s, column_kind::regular_column, entry.row().cells());
|
||||
}
|
||||
}
|
||||
mutation_partition tmp(p);
|
||||
apply(s, std::move(tmp));
|
||||
}
|
||||
|
||||
void
|
||||
@@ -167,42 +265,42 @@ mutation_partition::apply(const schema& s, mutation_partition&& p, const schema&
|
||||
return;
|
||||
}
|
||||
|
||||
_tombstone.apply(p._tombstone);
|
||||
apply(s, std::move(p));
|
||||
}
|
||||
|
||||
p._row_tombstones.clear_and_dispose([this, &s] (row_tombstones_entry* e) {
|
||||
apply_row_tombstone(s, e);
|
||||
void
|
||||
mutation_partition::apply(const schema& s, mutation_partition&& p) {
|
||||
auto revert_row_tombstones = apply_reversibly_intrusive_set(_row_tombstones, p._row_tombstones);
|
||||
|
||||
_static_row.apply_reversibly(s, column_kind::static_column, p._static_row);
|
||||
auto revert_static_row = defer([&] {
|
||||
_static_row.revert(s, column_kind::static_column, p._static_row);
|
||||
});
|
||||
|
||||
_static_row.merge(s, column_kind::static_column, std::move(p._static_row));
|
||||
auto revert_rows = apply_reversibly_intrusive_set(_rows, p._rows,
|
||||
[&s] (rows_entry& dst, rows_entry& src) { dst.apply_reversibly(s, src); },
|
||||
[&s] (rows_entry& dst, rows_entry& src) noexcept { dst.revert(s, src); });
|
||||
|
||||
auto p_i = p._rows.begin();
|
||||
auto p_end = p._rows.end();
|
||||
while (p_i != p_end) {
|
||||
rows_entry& entry = *p_i;
|
||||
auto i = _rows.find(entry);
|
||||
if (i == _rows.end()) {
|
||||
p_i = p._rows.erase(p_i);
|
||||
_rows.insert(i, entry);
|
||||
} else {
|
||||
i->row().apply(entry.row().deleted_at());
|
||||
i->row().apply(entry.row().marker());
|
||||
i->row().cells().merge(s, column_kind::regular_column, std::move(entry.row().cells()));
|
||||
p_i = p._rows.erase_and_dispose(p_i, current_deleter<rows_entry>());
|
||||
}
|
||||
}
|
||||
_tombstone.apply(p._tombstone); // noexcept
|
||||
|
||||
revert_rows.cancel();
|
||||
revert_row_tombstones.cancel();
|
||||
revert_static_row.cancel();
|
||||
}
|
||||
|
||||
void
|
||||
mutation_partition::apply(const schema& s, mutation_partition_view p, const schema& p_schema) {
|
||||
if (p_schema.version() == s.version()) {
|
||||
mutation_partition_applier applier(s, *this);
|
||||
p.accept(s, applier);
|
||||
mutation_partition p2(*this, copy_comparators_only{});
|
||||
partition_builder b(s, p2);
|
||||
p.accept(s, b);
|
||||
apply(s, std::move(p2));
|
||||
} else {
|
||||
mutation_partition p2(*this, copy_comparators_only{});
|
||||
partition_builder b(p_schema, p2);
|
||||
p.accept(p_schema, b);
|
||||
p2.upgrade(p_schema, s);
|
||||
apply(s, std::move(p2), s);
|
||||
apply(s, std::move(p2));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -350,16 +448,25 @@ mutation_partition::clustered_row(const schema& s, const clustering_key_view& ke
|
||||
return i->row();
|
||||
}
|
||||
|
||||
boost::iterator_range<mutation_partition::rows_type::const_iterator>
|
||||
mutation_partition::range(const schema& schema, const query::range<clustering_key_prefix>& r) const {
|
||||
mutation_partition::rows_type::const_iterator
|
||||
mutation_partition::lower_bound(const schema& schema, const query::range<clustering_key_prefix>& r) const {
|
||||
auto cmp = rows_entry::key_comparator(clustering_key_prefix::prefix_equality_less_compare(schema));
|
||||
auto i1 = r.start() ? (r.start()->is_inclusive()
|
||||
return r.start() ? (r.start()->is_inclusive()
|
||||
? _rows.lower_bound(r.start()->value(), cmp)
|
||||
: _rows.upper_bound(r.start()->value(), cmp)) : _rows.cbegin();
|
||||
auto i2 = r.end() ? (r.end()->is_inclusive()
|
||||
? _rows.upper_bound(r.end()->value(), cmp)
|
||||
: _rows.lower_bound(r.end()->value(), cmp)) : _rows.cend();
|
||||
return boost::make_iterator_range(i1, i2);
|
||||
}
|
||||
|
||||
mutation_partition::rows_type::const_iterator
|
||||
mutation_partition::upper_bound(const schema& schema, const query::range<clustering_key_prefix>& r) const {
|
||||
auto cmp = rows_entry::key_comparator(clustering_key_prefix::prefix_equality_less_compare(schema));
|
||||
return r.end() ? (r.end()->is_inclusive()
|
||||
? _rows.upper_bound(r.end()->value(), cmp)
|
||||
: _rows.lower_bound(r.end()->value(), cmp)) : _rows.cend();
|
||||
}
|
||||
|
||||
boost::iterator_range<mutation_partition::rows_type::const_iterator>
|
||||
mutation_partition::range(const schema& schema, const query::range<clustering_key_prefix>& r) const {
|
||||
return boost::make_iterator_range(lower_bound(schema, r), upper_bound(schema, r));
|
||||
}
|
||||
|
||||
template <typename Container>
|
||||
@@ -371,11 +478,27 @@ unconst(Container& c, boost::iterator_range<typename Container::const_iterator>
|
||||
);
|
||||
}
|
||||
|
||||
template <typename Container>
|
||||
typename Container::iterator
|
||||
unconst(Container& c, typename Container::const_iterator i) {
|
||||
return c.erase(i, i);
|
||||
}
|
||||
|
||||
boost::iterator_range<mutation_partition::rows_type::iterator>
|
||||
mutation_partition::range(const schema& schema, const query::range<clustering_key_prefix>& r) {
|
||||
return unconst(_rows, static_cast<const mutation_partition*>(this)->range(schema, r));
|
||||
}
|
||||
|
||||
mutation_partition::rows_type::iterator
|
||||
mutation_partition::lower_bound(const schema& schema, const query::range<clustering_key_prefix>& r) {
|
||||
return unconst(_rows, static_cast<const mutation_partition*>(this)->lower_bound(schema, r));
|
||||
}
|
||||
|
||||
mutation_partition::rows_type::iterator
|
||||
mutation_partition::upper_bound(const schema& schema, const query::range<clustering_key_prefix>& r) {
|
||||
return unconst(_rows, static_cast<const mutation_partition*>(this)->upper_bound(schema, r));
|
||||
}
|
||||
|
||||
template<typename Func>
|
||||
void mutation_partition::for_each_row(const schema& schema, const query::range<clustering_key_prefix>& row_range, bool reversed, Func&& func) const
|
||||
{
|
||||
@@ -450,13 +573,11 @@ static void hash_row_slice(md5_hasher& hasher,
|
||||
}
|
||||
|
||||
template<typename RowWriter>
|
||||
static void get_row_slice(const schema& s,
|
||||
static void get_compacted_row_slice(const schema& s,
|
||||
const query::partition_slice& slice,
|
||||
column_kind kind,
|
||||
const row& cells,
|
||||
const std::vector<column_id>& columns,
|
||||
tombstone tomb,
|
||||
gc_clock::time_point now,
|
||||
RowWriter& writer)
|
||||
{
|
||||
for (auto id : columns) {
|
||||
@@ -467,7 +588,7 @@ static void get_row_slice(const schema& s,
|
||||
auto&& def = s.column_at(kind, id);
|
||||
if (def.is_atomic()) {
|
||||
auto c = cell->as_atomic_cell();
|
||||
if (!c.is_live(tomb, now)) {
|
||||
if (!c.is_live()) {
|
||||
writer.add().skip();
|
||||
} else {
|
||||
write_cell(writer, slice, cell->as_atomic_cell());
|
||||
@@ -475,21 +596,18 @@ static void get_row_slice(const schema& s,
|
||||
} else {
|
||||
auto&& mut = cell->as_collection_mutation();
|
||||
auto&& ctype = static_pointer_cast<const collection_type_impl>(def.type);
|
||||
auto m_view = ctype->deserialize_mutation_form(mut);
|
||||
m_view.tomb.apply(tomb);
|
||||
// FIXME: Instead of this, write optimistically and retract if empty
|
||||
auto m_ser = ctype->serialize_mutation_form_only_live(m_view, now);
|
||||
if (ctype->is_empty(m_ser)) {
|
||||
if (!ctype->is_any_live(mut)) {
|
||||
writer.add().skip();
|
||||
} else {
|
||||
write_cell(writer, slice, def.type, m_ser);
|
||||
write_cell(writer, slice, def.type, mut);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
bool has_any_live_data(const schema& s, column_kind kind, const row& cells, tombstone tomb, gc_clock::time_point now) {
|
||||
bool has_any_live_data(const schema& s, column_kind kind, const row& cells, tombstone tomb = tombstone(),
|
||||
gc_clock::time_point now = gc_clock::time_point::min()) {
|
||||
bool any_live = false;
|
||||
cells.for_each_cell_until([&] (column_id id, const atomic_cell_or_collection& cell_or_collection) {
|
||||
const column_definition& def = s.column_at(kind, id);
|
||||
@@ -512,25 +630,27 @@ bool has_any_live_data(const schema& s, column_kind kind, const row& cells, tomb
|
||||
return any_live;
|
||||
}
|
||||
|
||||
uint32_t
|
||||
mutation_partition::query(query::result::partition_writer& pw,
|
||||
const schema& s,
|
||||
gc_clock::time_point now,
|
||||
uint32_t limit) const
|
||||
{
|
||||
static bool has_ck_selector(const query::clustering_row_ranges& ranges) {
|
||||
// Like PK range, an empty row range, should be considered an "exclude all" restriction
|
||||
return ranges.empty() || std::any_of(ranges.begin(), ranges.end(), [](auto& r) {
|
||||
return !r.is_full();
|
||||
});
|
||||
}
|
||||
|
||||
void
|
||||
mutation_partition::query_compacted(query::result::partition_writer& pw, const schema& s, uint32_t limit) const {
|
||||
const query::partition_slice& slice = pw.slice();
|
||||
|
||||
if (limit == 0) {
|
||||
pw.retract();
|
||||
return 0;
|
||||
return;
|
||||
}
|
||||
|
||||
auto static_cells_wr = pw.start().start_static_row().start_cells();
|
||||
|
||||
if (!slice.static_columns.empty()) {
|
||||
if (pw.requested_result()) {
|
||||
get_row_slice(s, slice, column_kind::static_column, static_row(), slice.static_columns, partition_tombstone(),
|
||||
now, static_cells_wr);
|
||||
get_compacted_row_slice(s, slice, column_kind::static_column, static_row(), slice.static_columns, static_cells_wr);
|
||||
}
|
||||
if (pw.requested_digest()) {
|
||||
::feed_hash(pw.digest(), partition_tombstone());
|
||||
@@ -544,52 +664,37 @@ mutation_partition::query(query::result::partition_writer& pw,
|
||||
|
||||
uint32_t row_count = 0;
|
||||
|
||||
// Like PK range, an empty row range, should be considered an "exclude all" restriction
|
||||
bool has_ck_selector = pw.ranges().empty();
|
||||
|
||||
auto is_reversed = slice.options.contains(query::partition_slice::option::reversed);
|
||||
auto send_ck = slice.options.contains(query::partition_slice::option::send_clustering_key);
|
||||
for (auto&& row_range : pw.ranges()) {
|
||||
if (limit == 0) {
|
||||
break;
|
||||
for_each_row(s, query::clustering_range::make_open_ended_both_sides(), is_reversed, [&] (const rows_entry& e) {
|
||||
auto& row = e.row();
|
||||
auto row_tombstone = tombstone_for_row(s, e);
|
||||
|
||||
if (pw.requested_digest()) {
|
||||
e.key().feed_hash(pw.digest(), s);
|
||||
::feed_hash(pw.digest(), row_tombstone);
|
||||
hash_row_slice(pw.digest(), s, column_kind::regular_column, row.cells(), slice.regular_columns);
|
||||
}
|
||||
|
||||
has_ck_selector |= !row_range.is_full();
|
||||
|
||||
// FIXME: Optimize for a full-tuple singular range. mutation_partition::range()
|
||||
// does two lookups to form a range, even for singular range. We need
|
||||
// only one lookup for a full-tuple singular range though.
|
||||
for_each_row(s, row_range, is_reversed, [&] (const rows_entry& e) {
|
||||
auto& row = e.row();
|
||||
auto row_tombstone = tombstone_for_row(s, e);
|
||||
|
||||
if (pw.requested_digest()) {
|
||||
e.key().feed_hash(pw.digest(), s);
|
||||
::feed_hash(pw.digest(), row_tombstone);
|
||||
hash_row_slice(pw.digest(), s, column_kind::regular_column, row.cells(), slice.regular_columns);
|
||||
if (row.is_live(s)) {
|
||||
if (pw.requested_result()) {
|
||||
auto cells_wr = [&] {
|
||||
if (send_ck) {
|
||||
return rows_wr.add().write_key(e.key()).start_cells().start_cells();
|
||||
} else {
|
||||
return rows_wr.add().skip_key().start_cells().start_cells();
|
||||
}
|
||||
}();
|
||||
get_compacted_row_slice(s, slice, column_kind::regular_column, row.cells(), slice.regular_columns, cells_wr);
|
||||
std::move(cells_wr).end_cells().end_cells().end_qr_clustered_row();
|
||||
}
|
||||
|
||||
if (row.is_live(s, row_tombstone, now)) {
|
||||
if (pw.requested_result()) {
|
||||
auto cells_wr = [&] {
|
||||
if (send_ck) {
|
||||
return rows_wr.add().write_key(e.key()).start_cells().start_cells();
|
||||
} else {
|
||||
return rows_wr.add().skip_key().start_cells().start_cells();
|
||||
}
|
||||
}();
|
||||
get_row_slice(s, slice, column_kind::regular_column, row.cells(), slice.regular_columns, row_tombstone,
|
||||
now, cells_wr);
|
||||
std::move(cells_wr).end_cells().end_cells().end_qr_clustered_row();
|
||||
}
|
||||
++row_count;
|
||||
if (--limit == 0) {
|
||||
return stop_iteration::yes;
|
||||
}
|
||||
++row_count;
|
||||
if (--limit == 0) {
|
||||
return stop_iteration::yes;
|
||||
}
|
||||
return stop_iteration::no;
|
||||
});
|
||||
}
|
||||
}
|
||||
return stop_iteration::no;
|
||||
});
|
||||
|
||||
// If we got no rows, but have live static columns, we should only
|
||||
// give them back IFF we did not have any CK restrictions.
|
||||
@@ -597,17 +702,11 @@ mutation_partition::query(query::result::partition_writer& pw,
|
||||
// If ck:s exist, and we do a restriction on them, we either have matching
|
||||
// rows, or return nothing, since cql does not allow "is null".
|
||||
if (row_count == 0
|
||||
&& (has_ck_selector
|
||||
|| !has_any_live_data(s, column_kind::static_column,
|
||||
static_row(), _tombstone, now))) {
|
||||
&& (has_ck_selector(pw.ranges())
|
||||
|| !has_any_live_data(s, column_kind::static_column, static_row()))) {
|
||||
pw.retract();
|
||||
return 0;
|
||||
} else {
|
||||
std::move(rows_wr).end_rows().end_qr_partition();
|
||||
|
||||
// The partition is live. If there are no clustered rows, there
|
||||
// must be something live in the static row, which counts as one row.
|
||||
return std::max<uint32_t>(row_count, 1);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -667,7 +766,7 @@ operator<<(std::ostream& os, const mutation_partition& mp) {
|
||||
constexpr gc_clock::duration row_marker::no_ttl;
|
||||
constexpr gc_clock::duration row_marker::dead;
|
||||
|
||||
int compare_row_marker_for_merge(const row_marker& left, const row_marker& right) {
|
||||
int compare_row_marker_for_merge(const row_marker& left, const row_marker& right) noexcept {
|
||||
if (left.timestamp() != right.timestamp()) {
|
||||
return left.timestamp() > right.timestamp() ? 1 : -1;
|
||||
}
|
||||
@@ -703,6 +802,18 @@ deletable_row::equal(column_kind kind, const schema& s, const deletable_row& oth
|
||||
return _cells.equal(kind, s, other._cells, other_schema);
|
||||
}
|
||||
|
||||
// Merges `src` into this row in three steps: the regular-column cells, the
// row tombstone (_deleted_at) and the row marker (_marker).
//
// The cell merge runs first. The original author annotated the two later
// steps as noexcept, so a failure can only surface before the tombstone or
// the marker has been touched.
//
// `src` is taken by non-const reference because each member's
// apply_reversibly() receives the matching member of `src` mutably.
void deletable_row::apply_reversibly(const schema& s, deletable_row& src) {
    _cells.apply_reversibly(s, column_kind::regular_column, src._cells);

    // Annotated as noexcept by the original author.
    _deleted_at.apply_reversibly(src._deleted_at);
    _marker.apply_reversibly(src._marker);
}
|
||||
|
||||
// Counterpart of deletable_row::apply_reversibly(): forwards the revert to
// each member (cells, row tombstone, row marker), pairing it with the
// matching member of `src`.
void deletable_row::revert(const schema& s, deletable_row& src) {
    _cells.revert(s, column_kind::regular_column, src._cells);

    _deleted_at.revert(src._deleted_at);
    _marker.revert(src._marker);
}
|
||||
|
||||
bool
|
||||
rows_entry::equal(const schema& s, const rows_entry& other) const {
|
||||
return equal(s, other, s);
|
||||
@@ -747,42 +858,123 @@ bool mutation_partition::equal(const schema& this_schema, const mutation_partiti
|
||||
}
|
||||
|
||||
void
|
||||
merge_column(const column_definition& def,
|
||||
atomic_cell_or_collection& old,
|
||||
atomic_cell_or_collection&& neww) {
|
||||
apply_reversibly(const column_definition& def, atomic_cell_or_collection& dst, atomic_cell_or_collection& src) {
|
||||
// Must be run via with_linearized_managed_bytes() context, but assume it is
|
||||
// provided via an upper layer
|
||||
if (def.is_atomic()) {
|
||||
if (compare_atomic_cell_for_merge(old.as_atomic_cell(), neww.as_atomic_cell()) < 0) {
|
||||
old = std::move(neww);
|
||||
auto&& src_ac = src.as_atomic_cell_ref();
|
||||
if (compare_atomic_cell_for_merge(dst.as_atomic_cell(), src.as_atomic_cell()) < 0) {
|
||||
std::swap(dst, src);
|
||||
src_ac.set_revert(true);
|
||||
} else {
|
||||
src_ac.set_revert(false);
|
||||
}
|
||||
} else {
|
||||
auto ct = static_pointer_cast<const collection_type_impl>(def.type);
|
||||
old = ct->merge(old.as_collection_mutation(), neww.as_collection_mutation());
|
||||
src = ct->merge(dst.as_collection_mutation(), src.as_collection_mutation());
|
||||
std::swap(dst, src);
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
revert(const column_definition& def, atomic_cell_or_collection& dst, atomic_cell_or_collection& src) noexcept {
|
||||
static_assert(std::is_nothrow_move_constructible<atomic_cell_or_collection>::value
|
||||
&& std::is_nothrow_move_assignable<atomic_cell_or_collection>::value,
|
||||
"for std::swap() to be noexcept");
|
||||
if (def.is_atomic()) {
|
||||
auto&& ac = src.as_atomic_cell_ref();
|
||||
if (ac.is_revert_set()) {
|
||||
ac.set_revert(false);
|
||||
std::swap(dst, src);
|
||||
}
|
||||
} else {
|
||||
std::swap(dst, src);
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
row::apply(const column_definition& column, const atomic_cell_or_collection& value) {
|
||||
// FIXME: Optimize
|
||||
atomic_cell_or_collection tmp(value);
|
||||
apply(column, std::move(tmp));
|
||||
}
|
||||
|
||||
void
|
||||
row::apply(const column_definition& column, atomic_cell_or_collection&& value) {
|
||||
apply_reversibly(column, value);
|
||||
}
|
||||
|
||||
template<typename Func, typename Rollback>
|
||||
void row::for_each_cell(Func&& func, Rollback&& rollback) {
|
||||
static_assert(noexcept(rollback(std::declval<column_id>(), std::declval<atomic_cell_or_collection&>())),
|
||||
"rollback must be noexcept");
|
||||
|
||||
if (_type == storage_type::vector) {
|
||||
unsigned i = 0;
|
||||
try {
|
||||
for (; i < _storage.vector.v.size(); i++) {
|
||||
if (_storage.vector.present.test(i)) {
|
||||
func(i, _storage.vector.v[i]);
|
||||
}
|
||||
}
|
||||
} catch (...) {
|
||||
while (i) {
|
||||
--i;
|
||||
if (_storage.vector.present.test(i)) {
|
||||
rollback(i, _storage.vector.v[i]);
|
||||
}
|
||||
}
|
||||
throw;
|
||||
}
|
||||
} else {
|
||||
auto i = _storage.set.begin();
|
||||
try {
|
||||
while (i != _storage.set.end()) {
|
||||
func(i->id(), i->cell());
|
||||
++i;
|
||||
}
|
||||
} catch (...) {
|
||||
while (i != _storage.set.begin()) {
|
||||
--i;
|
||||
rollback(i->id(), i->cell());
|
||||
}
|
||||
throw;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<typename Func>
|
||||
void row::for_each_cell(Func&& func) {
|
||||
if (_type == storage_type::vector) {
|
||||
for (auto i : bitsets::for_each_set(_storage.vector.present)) {
|
||||
func(i, _storage.vector.v[i]);
|
||||
}
|
||||
} else {
|
||||
for (auto& cell : _storage.set) {
|
||||
func(cell.id(), cell.cell());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
row::apply_reversibly(const column_definition& column, atomic_cell_or_collection& value) {
|
||||
static_assert(std::is_nothrow_move_constructible<atomic_cell_or_collection>::value
|
||||
&& std::is_nothrow_move_assignable<atomic_cell_or_collection>::value,
|
||||
"noexcept required for atomicity");
|
||||
|
||||
// our mutations are not yet immutable
|
||||
auto id = column.id;
|
||||
if (_type == storage_type::vector && id < max_vector_size) {
|
||||
if (id >= _storage.vector.size()) {
|
||||
_storage.vector.resize(id);
|
||||
_storage.vector.emplace_back(std::move(value));
|
||||
if (id >= _storage.vector.v.size()) {
|
||||
_storage.vector.v.resize(id);
|
||||
_storage.vector.v.emplace_back(std::move(value));
|
||||
_storage.vector.present.set(id);
|
||||
_size++;
|
||||
} else if (!bool(_storage.vector[id])) {
|
||||
_storage.vector[id] = std::move(value);
|
||||
} else if (!bool(_storage.vector.v[id])) {
|
||||
_storage.vector.v[id] = std::move(value);
|
||||
_storage.vector.present.set(id);
|
||||
_size++;
|
||||
} else {
|
||||
merge_column(column, _storage.vector[id], std::move(value));
|
||||
::apply_reversibly(column, _storage.vector.v[id], value);
|
||||
}
|
||||
} else {
|
||||
if (_type == storage_type::vector) {
|
||||
@@ -790,11 +982,37 @@ row::apply(const column_definition& column, atomic_cell_or_collection&& value) {
|
||||
}
|
||||
auto i = _storage.set.lower_bound(id, cell_entry::compare());
|
||||
if (i == _storage.set.end() || i->id() != id) {
|
||||
auto e = current_allocator().construct<cell_entry>(id, std::move(value));
|
||||
cell_entry* e = current_allocator().construct<cell_entry>(id);
|
||||
std::swap(e->_cell, value);
|
||||
_storage.set.insert(i, *e);
|
||||
_size++;
|
||||
} else {
|
||||
merge_column(column, i->cell(), std::move(value));
|
||||
::apply_reversibly(column, i->cell(), value);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
row::revert(const column_definition& column, atomic_cell_or_collection& src) noexcept {
|
||||
auto id = column.id;
|
||||
if (_type == storage_type::vector) {
|
||||
auto& dst = _storage.vector.v[id];
|
||||
if (!src) {
|
||||
std::swap(dst, src);
|
||||
_storage.vector.present.reset(id);
|
||||
--_size;
|
||||
} else {
|
||||
::revert(column, dst, src);
|
||||
}
|
||||
} else {
|
||||
auto i = _storage.set.find(id, cell_entry::compare());
|
||||
auto& dst = i->cell();
|
||||
if (!src) {
|
||||
std::swap(dst, src);
|
||||
_storage.set.erase_and_dispose(i, current_deleter<cell_entry>());
|
||||
--_size;
|
||||
} else {
|
||||
::revert(column, dst, src);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -802,8 +1020,9 @@ row::apply(const column_definition& column, atomic_cell_or_collection&& value) {
|
||||
void
|
||||
row::append_cell(column_id id, atomic_cell_or_collection value) {
|
||||
if (_type == storage_type::vector && id < max_vector_size) {
|
||||
_storage.vector.resize(id);
|
||||
_storage.vector.emplace_back(std::move(value));
|
||||
_storage.vector.v.resize(id);
|
||||
_storage.vector.v.emplace_back(std::move(value));
|
||||
_storage.vector.present.set(id);
|
||||
} else {
|
||||
if (_type == storage_type::vector) {
|
||||
vector_to_set();
|
||||
@@ -817,10 +1036,10 @@ row::append_cell(column_id id, atomic_cell_or_collection value) {
|
||||
const atomic_cell_or_collection*
|
||||
row::find_cell(column_id id) const {
|
||||
if (_type == storage_type::vector) {
|
||||
if (id >= _storage.vector.size() || !bool(_storage.vector[id])) {
|
||||
if (id >= _storage.vector.v.size() || !_storage.vector.present.test(id)) {
|
||||
return nullptr;
|
||||
}
|
||||
return &_storage.vector[id];
|
||||
return &_storage.vector.v[id];
|
||||
} else {
|
||||
auto i = _storage.set.find(id, cell_entry::compare());
|
||||
if (i == _storage.set.end()) {
|
||||
@@ -841,15 +1060,24 @@ void mutation_partition::trim_rows(const schema& s,
|
||||
auto last = reversal_traits<reversed>::begin(_rows);
|
||||
auto deleter = current_deleter<rows_entry>();
|
||||
|
||||
auto range_begin = [this, &s] (const query::clustering_range& range) {
|
||||
return reversed ? upper_bound(s, range) : lower_bound(s, range);
|
||||
};
|
||||
|
||||
auto range_end = [this, &s] (const query::clustering_range& range) {
|
||||
return reversed ? lower_bound(s, range) : upper_bound(s, range);
|
||||
};
|
||||
|
||||
for (auto&& row_range : row_ranges) {
|
||||
if (stop) {
|
||||
break;
|
||||
}
|
||||
|
||||
auto it_range = reversal_traits<reversed>::maybe_reverse(_rows, range(s, row_range));
|
||||
last = reversal_traits<reversed>::erase_and_dispose(_rows, last, it_range.begin(), deleter);
|
||||
last = reversal_traits<reversed>::erase_and_dispose(_rows, last,
|
||||
reversal_traits<reversed>::maybe_reverse(_rows, range_begin(row_range)), deleter);
|
||||
|
||||
while (last != it_range.end()) {
|
||||
auto end = reversal_traits<reversed>::maybe_reverse(_rows, range_end(row_range));
|
||||
while (last != end) {
|
||||
rows_entry& e = *last;
|
||||
if (func(e) == stop_iteration::yes) {
|
||||
stop = true;
|
||||
@@ -921,10 +1149,7 @@ uint32_t mutation_partition::do_compact(const schema& s,
|
||||
|
||||
// #589 - Do not add extra row for statics unless we did a CK range-less query.
|
||||
// See comment in query
|
||||
if (row_count == 0 && static_row_live
|
||||
&& std::any_of(row_ranges.begin(), row_ranges.end(), [](auto& r) {
|
||||
return r.is_full();
|
||||
})) {
|
||||
if (row_count == 0 && static_row_live && !has_ck_selector(row_ranges)) {
|
||||
++row_count;
|
||||
}
|
||||
|
||||
@@ -977,7 +1202,7 @@ bool mutation_partition::empty() const
|
||||
}
|
||||
|
||||
bool
|
||||
deletable_row::is_live(const schema& s, tombstone base_tombstone, gc_clock::time_point query_time = gc_clock::time_point::min()) const {
|
||||
deletable_row::is_live(const schema& s, tombstone base_tombstone, gc_clock::time_point query_time) const {
|
||||
// _created_at corresponds to the row marker cell, present for rows
|
||||
// created with the 'insert' statement. If row marker is live, we know the
|
||||
// row is live. Otherwise, a row is considered live if it has any cell
|
||||
@@ -1034,7 +1259,7 @@ row::row(const row& o)
|
||||
, _size(o._size)
|
||||
{
|
||||
if (_type == storage_type::vector) {
|
||||
new (&_storage.vector) vector_type(o._storage.vector);
|
||||
new (&_storage.vector) vector_storage(o._storage.vector);
|
||||
} else {
|
||||
auto cloner = [] (const auto& x) {
|
||||
return current_allocator().construct<std::remove_const_t<std::remove_reference_t<decltype(x)>>>(x);
|
||||
@@ -1051,14 +1276,14 @@ row::row(const row& o)
|
||||
|
||||
row::~row() {
|
||||
if (_type == storage_type::vector) {
|
||||
_storage.vector.~vector_type();
|
||||
_storage.vector.~vector_storage();
|
||||
} else {
|
||||
_storage.set.clear_and_dispose(current_deleter<cell_entry>());
|
||||
_storage.set.~map_type();
|
||||
}
|
||||
}
|
||||
|
||||
row::cell_entry::cell_entry(const cell_entry& o) noexcept
|
||||
row::cell_entry::cell_entry(const cell_entry& o)
|
||||
: _id(o._id)
|
||||
, _cell(o._cell)
|
||||
{ }
|
||||
@@ -1085,15 +1310,20 @@ void row::vector_to_set()
|
||||
{
|
||||
assert(_type == storage_type::vector);
|
||||
map_type set;
|
||||
for (unsigned i = 0; i < _storage.vector.size(); i++) {
|
||||
auto& c = _storage.vector[i];
|
||||
if (!bool(c)) {
|
||||
continue;
|
||||
}
|
||||
try {
|
||||
for (auto i : bitsets::for_each_set(_storage.vector.present)) {
|
||||
auto& c = _storage.vector.v[i];
|
||||
auto e = current_allocator().construct<cell_entry>(i, std::move(c));
|
||||
set.insert(set.end(), *e);
|
||||
}
|
||||
_storage.vector.~vector_type();
|
||||
} catch (...) {
|
||||
set.clear_and_dispose([this, del = current_deleter<cell_entry>()] (cell_entry* ce) noexcept {
|
||||
_storage.vector.v[ce->id()] = std::move(ce->cell());
|
||||
del(ce);
|
||||
});
|
||||
throw;
|
||||
}
|
||||
_storage.vector.~vector_storage();
|
||||
new (&_storage.set) map_type(std::move(set));
|
||||
_type = storage_type::set;
|
||||
}
|
||||
@@ -1104,7 +1334,7 @@ void row::reserve(column_id last_column)
|
||||
if (last_column >= max_vector_size) {
|
||||
vector_to_set();
|
||||
} else {
|
||||
_storage.vector.reserve(last_column);
|
||||
_storage.vector.v.reserve(last_column);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1157,13 +1387,13 @@ bool row::equal(column_kind kind, const schema& this_schema, const row& other, c
|
||||
}
|
||||
|
||||
row::row() {
|
||||
new (&_storage.vector) vector_type;
|
||||
new (&_storage.vector) vector_storage;
|
||||
}
|
||||
|
||||
row::row(row&& other)
|
||||
: _type(other._type), _size(other._size) {
|
||||
if (_type == storage_type::vector) {
|
||||
new (&_storage.vector) vector_type(std::move(other._storage.vector));
|
||||
new (&_storage.vector) vector_storage(std::move(other._storage.vector));
|
||||
} else {
|
||||
new (&_storage.set) map_type(std::move(other._storage.set));
|
||||
}
|
||||
@@ -1177,27 +1407,25 @@ row& row::operator=(row&& other) {
|
||||
return *this;
|
||||
}
|
||||
|
||||
void row::merge(const schema& s, column_kind kind, const row& other) {
|
||||
void row::apply_reversibly(const schema& s, column_kind kind, row& other) {
|
||||
if (other.empty()) {
|
||||
return;
|
||||
}
|
||||
if (other._type == storage_type::vector) {
|
||||
reserve(other._storage.vector.size() - 1);
|
||||
reserve(other._storage.vector.v.size() - 1);
|
||||
} else {
|
||||
reserve(other._storage.set.rbegin()->id());
|
||||
}
|
||||
other.for_each_cell([&] (column_id id, const atomic_cell_or_collection& cell) {
|
||||
apply(s.column_at(kind, id), cell);
|
||||
other.for_each_cell([&] (column_id id, atomic_cell_or_collection& cell) {
|
||||
apply_reversibly(s.column_at(kind, id), cell);
|
||||
}, [&] (column_id id, atomic_cell_or_collection& cell) noexcept {
|
||||
revert(s.column_at(kind, id), cell);
|
||||
});
|
||||
}
|
||||
|
||||
void row::merge(const schema& s, column_kind kind, row&& other) {
|
||||
if (other._type == storage_type::vector) {
|
||||
reserve(other._storage.vector.size() - 1);
|
||||
} else {
|
||||
reserve(other._storage.set.rbegin()->id());
|
||||
}
|
||||
// FIXME: Optimize when 'other' is a set. We could move whole entries, not only cells.
|
||||
other.for_each_cell_until([&] (column_id id, atomic_cell_or_collection& cell) {
|
||||
apply(s.column_at(kind, id), std::move(cell));
|
||||
return stop_iteration::no;
|
||||
// Row-level revert: walks every cell of `other` and reverts it against this
// row, resolving each cell's column definition through the schema.
void row::revert(const schema& s, column_kind kind, row& other) noexcept {
    other.for_each_cell([&] (column_id id, atomic_cell_or_collection& cell) noexcept {
        const auto& def = s.column_at(kind, id);
        revert(def, cell);
    });
}
|
||||
|
||||
@@ -1348,3 +1576,15 @@ mutation_partition::upgrade(const schema& old_schema, const schema& new_schema)
|
||||
accept(old_schema, v);
|
||||
*this = std::move(tmp);
|
||||
}
|
||||
|
||||
// Merges `rm` into this marker, leaving in `rm` what revert() needs.
//
// When `rm` wins the merge comparison the two markers are swapped, so `rm`
// ends up holding this marker's previous value. Otherwise this marker is
// kept and copied into `rm`, which makes the later swap in revert() a no-op
// in terms of value.
void row_marker::apply_reversibly(row_marker& rm) noexcept {
    const bool incoming_wins = compare_row_marker_for_merge(*this, rm) < 0;
    if (!incoming_wins) {
        rm = *this;
        return;
    }
    std::swap(*this, rm);
}
|
||||
|
||||
// Undoes apply_reversibly(rm) by swapping the two markers back.
// apply_reversibly() leaves in `rm` either the previous value of this marker
// or a copy of the current one, so a plain swap restores the original value.
void row_marker::revert(row_marker& rm) noexcept {
    std::swap(*this, rm);
}
|
||||
|
||||
@@ -28,6 +28,8 @@
|
||||
#include <boost/range/adaptor/indexed.hpp>
|
||||
#include <boost/range/adaptor/filtered.hpp>
|
||||
|
||||
#include <seastar/core/bitset-iter.hh>
|
||||
|
||||
#include "schema.hh"
|
||||
#include "tombstone.hh"
|
||||
#include "keys.hh"
|
||||
@@ -58,8 +60,11 @@ class row {
|
||||
: _id(id)
|
||||
, _cell(std::move(cell))
|
||||
{ }
|
||||
cell_entry(column_id id)
|
||||
: _id(id)
|
||||
{ }
|
||||
cell_entry(cell_entry&&) noexcept;
|
||||
cell_entry(const cell_entry&) noexcept;
|
||||
cell_entry(const cell_entry&);
|
||||
|
||||
column_id id() const { return _id; }
|
||||
const atomic_cell_or_collection& cell() const { return _cell; }
|
||||
@@ -96,11 +101,16 @@ public:
|
||||
private:
|
||||
using vector_type = managed_vector<atomic_cell_or_collection, internal_count, size_type>;
|
||||
|
||||
struct vector_storage {
|
||||
std::bitset<max_vector_size> present;
|
||||
vector_type v;
|
||||
};
|
||||
|
||||
union storage {
|
||||
storage() { }
|
||||
~storage() { }
|
||||
map_type set;
|
||||
vector_type vector;
|
||||
vector_storage vector;
|
||||
} _storage;
|
||||
public:
|
||||
row();
|
||||
@@ -109,6 +119,7 @@ public:
|
||||
row(row&& other);
|
||||
row& operator=(row&& other);
|
||||
size_t size() const { return _size; }
|
||||
bool empty() const { return _size == 0; }
|
||||
|
||||
void reserve(column_id);
|
||||
|
||||
@@ -120,13 +131,14 @@ private:
|
||||
template<typename Func>
|
||||
void remove_if(Func&& func) {
|
||||
if (_type == storage_type::vector) {
|
||||
for (unsigned i = 0; i < _storage.vector.size(); i++) {
|
||||
auto& c = _storage.vector[i];
|
||||
if (!bool(c)) {
|
||||
for (unsigned i = 0; i < _storage.vector.v.size(); i++) {
|
||||
if (!_storage.vector.present.test(i)) {
|
||||
continue;
|
||||
}
|
||||
auto& c = _storage.vector.v[i];
|
||||
if (func(i, c)) {
|
||||
c = atomic_cell_or_collection();
|
||||
_storage.vector.present.reset(i);
|
||||
_size--;
|
||||
}
|
||||
}
|
||||
@@ -146,11 +158,12 @@ private:
|
||||
|
||||
private:
|
||||
auto get_range_vector() const {
|
||||
auto range = boost::make_iterator_range(_storage.vector.begin(), _storage.vector.end());
|
||||
return range | boost::adaptors::filtered([] (const atomic_cell_or_collection& c) { return bool(c); })
|
||||
| boost::adaptors::transformed([this] (const atomic_cell_or_collection& c) {
|
||||
auto id = &c - _storage.vector.data();
|
||||
return std::pair<column_id, const atomic_cell_or_collection&>(id, std::cref(c));
|
||||
auto id_range = boost::irange<column_id>(0, _storage.vector.v.size());
|
||||
return boost::combine(id_range, _storage.vector.v)
|
||||
| boost::adaptors::filtered([this] (const boost::tuple<const column_id&, const atomic_cell_or_collection&>& t) {
|
||||
return _storage.vector.present.test(t.get<0>());
|
||||
}) | boost::adaptors::transformed([] (const boost::tuple<const column_id&, const atomic_cell_or_collection&>& t) {
|
||||
return std::pair<column_id, const atomic_cell_or_collection&>(t.get<0>(), t.get<1>());
|
||||
});
|
||||
}
|
||||
auto get_range_set() const {
|
||||
@@ -163,7 +176,23 @@ private:
|
||||
auto with_both_ranges(const row& other, Func&& func) const;
|
||||
|
||||
void vector_to_set();
|
||||
|
||||
// Calls Func(column_id, atomic_cell_or_collection&) for each cell in this row.
|
||||
//
|
||||
// Func() is allowed to modify the cell. Emptying a cell makes it still
|
||||
// visible to for_each().
|
||||
//
|
||||
// In case of exception, calls Rollback(column_id, atomic_cell_or_collection&) on
|
||||
// all cells on which Func() was successfully invoked in reverse order.
|
||||
//
|
||||
template<typename Func, typename Rollback>
|
||||
void for_each_cell(Func&&, Rollback&&);
|
||||
public:
|
||||
// Calls Func(column_id, atomic_cell_or_collection&) for each cell in this row.
|
||||
// noexcept if Func doesn't throw.
|
||||
template<typename Func>
|
||||
void for_each_cell(Func&&);
|
||||
|
||||
template<typename Func>
|
||||
void for_each_cell(Func&& func) const {
|
||||
for_each_cell_until([func = std::forward<Func>(func)] (column_id id, const atomic_cell_or_collection& c) {
|
||||
@@ -175,11 +204,8 @@ public:
|
||||
template<typename Func>
|
||||
void for_each_cell_until(Func&& func) const {
|
||||
if (_type == storage_type::vector) {
|
||||
for (unsigned i = 0; i < _storage.vector.size(); i++) {
|
||||
auto& cell = _storage.vector[i];
|
||||
if (!bool(cell)) {
|
||||
continue;
|
||||
}
|
||||
for (auto i : bitsets::for_each_set(_storage.vector.present)) {
|
||||
auto& cell = _storage.vector.v[i];
|
||||
if (func(i, cell) == stop_iteration::yes) {
|
||||
break;
|
||||
}
|
||||
@@ -187,29 +213,7 @@ public:
|
||||
} else {
|
||||
for (auto& cell : _storage.set) {
|
||||
const auto& c = cell.cell();
|
||||
if (c && func(cell.id(), c) == stop_iteration::yes) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<typename Func>
|
||||
void for_each_cell_until(Func&& func) {
|
||||
if (_type == storage_type::vector) {
|
||||
for (unsigned i = 0; i < _storage.vector.size(); i++) {
|
||||
auto& cell = _storage.vector[i];
|
||||
if (!bool(cell)) {
|
||||
continue;
|
||||
}
|
||||
if (func(i, cell) == stop_iteration::yes) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (auto& cell : _storage.set) {
|
||||
auto& c = cell.cell();
|
||||
if (c && func(cell.id(), c) == stop_iteration::yes) {
|
||||
if (func(cell.id(), c) == stop_iteration::yes) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
@@ -222,21 +226,26 @@ public:
|
||||
//
|
||||
// Merges cell's value into the row.
|
||||
//
|
||||
// In case of exception the current object and external object (moved-from)
|
||||
// are both left in some valid states, such that they still will commute to
|
||||
// a state the current object would have should the exception had not occurred.
|
||||
// In case of exception the current object is left with a value equivalent to the original state.
|
||||
//
|
||||
// The external cell is left in a valid state, such that it will commute with
|
||||
// current object to the same value as if the exception had not occurred.
|
||||
//
|
||||
void apply(const column_definition& column, atomic_cell_or_collection&& cell);
|
||||
|
||||
// Equivalent to calling apply_reversibly() with a row containing only given cell.
|
||||
// See reversibly_mergeable.hh
|
||||
void apply_reversibly(const column_definition& column, atomic_cell_or_collection& cell);
|
||||
// See reversibly_mergeable.hh
|
||||
void revert(const column_definition& column, atomic_cell_or_collection& cell) noexcept;
|
||||
|
||||
// Adds cell to the row. The column must not be already set.
|
||||
void append_cell(column_id id, atomic_cell_or_collection cell);
|
||||
|
||||
void merge(const schema& s, column_kind kind, const row& other);
|
||||
|
||||
// In case of exception the current object and external object (moved-from)
|
||||
// are both left in some valid states, such that they still will commute to
|
||||
// a state the current object would have should the exception had not occurred.
|
||||
void merge(const schema& s, column_kind kind, row&& other);
|
||||
// See reversibly_mergeable.hh
|
||||
void apply_reversibly(const schema&, column_kind, row& src);
|
||||
// See reversibly_mergeable.hh
|
||||
void revert(const schema&, column_kind, row& src) noexcept;
|
||||
|
||||
// Expires cells based on query_time. Expires tombstones based on gc_before
|
||||
// and max_purgeable. Removes cells covered by tomb.
|
||||
@@ -258,7 +267,7 @@ public:
|
||||
std::ostream& operator<<(std::ostream& os, const std::pair<column_id, const atomic_cell_or_collection&>& c);
|
||||
|
||||
class row_marker;
|
||||
int compare_row_marker_for_merge(const row_marker& left, const row_marker& right);
|
||||
int compare_row_marker_for_merge(const row_marker& left, const row_marker& right) noexcept;
|
||||
|
||||
class row_marker {
|
||||
static constexpr gc_clock::duration no_ttl { 0 };
|
||||
@@ -321,6 +330,10 @@ public:
|
||||
*this = rm;
|
||||
}
|
||||
}
|
||||
// See reversibly_mergeable.hh
|
||||
void apply_reversibly(row_marker& rm) noexcept;
|
||||
// See reversibly_mergeable.hh
|
||||
void revert(row_marker& rm) noexcept;
|
||||
// Expires cells and tombstones. Removes items covered by higher level
|
||||
// tombstones.
|
||||
// Returns true if row marker is live.
|
||||
@@ -398,6 +411,11 @@ public:
|
||||
void remove_tombstone() {
|
||||
_deleted_at = tombstone();
|
||||
}
|
||||
|
||||
// See reversibly_mergeable.hh
|
||||
void apply_reversibly(const schema& s, deletable_row& src);
|
||||
// See reversibly_mergeable.hh
|
||||
void revert(const schema& s, deletable_row& src);
|
||||
public:
|
||||
tombstone deleted_at() const { return _deleted_at; }
|
||||
api::timestamp_type created_at() const { return _marker.timestamp(); }
|
||||
@@ -407,7 +425,7 @@ public:
|
||||
row& cells() { return _cells; }
|
||||
friend std::ostream& operator<<(std::ostream& os, const deletable_row& dr);
|
||||
bool equal(column_kind, const schema& s, const deletable_row& other, const schema& other_schema) const;
|
||||
bool is_live(const schema& s, tombstone base_tombstone, gc_clock::time_point query_time) const;
|
||||
bool is_live(const schema& s, tombstone base_tombstone = tombstone(), gc_clock::time_point query_time = gc_clock::time_point::min()) const;
|
||||
bool empty() const { return !_deleted_at && _marker.is_missing() && !_cells.size(); }
|
||||
deletable_row difference(const schema&, column_kind, const deletable_row& other) const;
|
||||
};
|
||||
@@ -422,6 +440,9 @@ public:
|
||||
: _prefix(std::move(prefix))
|
||||
, _t(std::move(t))
|
||||
{ }
|
||||
row_tombstones_entry(const clustering_key_prefix& prefix)
|
||||
: _prefix(prefix)
|
||||
{ }
|
||||
row_tombstones_entry(row_tombstones_entry&& o) noexcept;
|
||||
row_tombstones_entry(const row_tombstones_entry&) = default;
|
||||
clustering_key_prefix& prefix() {
|
||||
@@ -430,6 +451,9 @@ public:
|
||||
const clustering_key_prefix& prefix() const {
|
||||
return _prefix;
|
||||
}
|
||||
const clustering_key_prefix& key() const {
|
||||
return _prefix;
|
||||
}
|
||||
tombstone& t() {
|
||||
return _t;
|
||||
}
|
||||
@@ -439,6 +463,14 @@ public:
|
||||
void apply(tombstone t) {
|
||||
_t.apply(t);
|
||||
}
|
||||
// See reversibly_mergeable.hh
|
||||
void apply_reversibly(row_tombstones_entry& e) {
|
||||
_t.apply_reversibly(e._t);
|
||||
}
|
||||
// See reversibly_mergeable.hh
|
||||
void revert(row_tombstones_entry& e) noexcept {
|
||||
_t.revert(e._t);
|
||||
}
|
||||
struct compare {
|
||||
clustering_key_prefix::less_compare _c;
|
||||
compare(const schema& s) : _c(s) {}
|
||||
@@ -472,6 +504,9 @@ public:
|
||||
|
||||
friend std::ostream& operator<<(std::ostream& os, const row_tombstones_entry& rte);
|
||||
bool equal(const schema& s, const row_tombstones_entry& other) const;
|
||||
bool empty() const {
|
||||
return !_t;
|
||||
}
|
||||
};
|
||||
|
||||
class rows_entry {
|
||||
@@ -512,6 +547,14 @@ public:
|
||||
void apply(tombstone t) {
|
||||
_row.apply(t);
|
||||
}
|
||||
// See reversibly_mergeable.hh
|
||||
void apply_reversibly(const schema& s, rows_entry& e) {
|
||||
_row.apply_reversibly(s, e._row);
|
||||
}
|
||||
// See reversibly_mergeable.hh
|
||||
void revert(const schema& s, rows_entry& e) noexcept {
|
||||
_row.revert(s, e._row);
|
||||
}
|
||||
bool empty() const {
|
||||
return _row.empty();
|
||||
}
|
||||
@@ -570,8 +613,8 @@ class mutation_partition final {
|
||||
using row_tombstones_type = boost::intrusive::set<row_tombstones_entry,
|
||||
boost::intrusive::member_hook<row_tombstones_entry, boost::intrusive::set_member_hook<>, &row_tombstones_entry::_link>,
|
||||
boost::intrusive::compare<row_tombstones_entry::compare>>;
|
||||
friend rows_entry;
|
||||
friend row_tombstones_entry;
|
||||
friend class rows_entry;
|
||||
friend class row_tombstones_entry;
|
||||
friend class size_calculator;
|
||||
private:
|
||||
tombstone _tombstone;
|
||||
@@ -626,19 +669,21 @@ public:
|
||||
// Commutative when this_schema == p_schema. If schemas differ, data in p which
|
||||
// is not representable in this_schema is dropped, thus apply() loses commutativity.
|
||||
//
|
||||
// Basic exception guarantees. If apply() throws after being called in
|
||||
// some entry state p0, the object is left in some consistent state p1 and
|
||||
// it's possible that p1 != p0 + p. It holds though that p1 + p = p0 + p.
|
||||
//
|
||||
// FIXME: make stronger exception guarantees (p1 = p0).
|
||||
// Strong exception guarantees.
|
||||
void apply(const schema& this_schema, const mutation_partition& p, const schema& p_schema);
|
||||
//
|
||||
// Same guarantees as for apply(const schema&, const mutation_partition&).
|
||||
// Applies p to current object.
|
||||
//
|
||||
// In case of exception the current object and external object (moved-from)
|
||||
// are both left in some valid states, such that they still will commute to
|
||||
// a state the current object would have should the exception had not occurred.
|
||||
// Commutative when this_schema == p_schema. If schemas differ, data in p which
|
||||
// is not representable in this_schema is dropped, thus apply() loses commutativity.
|
||||
//
|
||||
// If exception is thrown, this object will be left in a state equivalent to the entry state
|
||||
// and p will be left in a state which will commute with current object to the same value
|
||||
// as if the exception had not occurred.
|
||||
void apply(const schema& this_schema, mutation_partition&& p, const schema& p_schema);
|
||||
// Use in case this instance and p share the same schema.
|
||||
// Same guarantees as apply(const schema&, mutation_partition&&, const schema&);
|
||||
void apply(const schema& s, mutation_partition&& p);
|
||||
// Same guarantees and constraints as for apply(const schema&, const mutation_partition&, const schema&).
|
||||
void apply(const schema& this_schema, mutation_partition_view p, const schema& p_schema);
|
||||
|
||||
@@ -717,9 +762,16 @@ public:
|
||||
tombstone tombstone_for_row(const schema& schema, const clustering_key& key) const;
|
||||
tombstone tombstone_for_row(const schema& schema, const rows_entry& e) const;
|
||||
boost::iterator_range<rows_type::const_iterator> range(const schema& schema, const query::range<clustering_key_prefix>& r) const;
|
||||
rows_type::const_iterator lower_bound(const schema& schema, const query::range<clustering_key_prefix>& r) const;
|
||||
rows_type::const_iterator upper_bound(const schema& schema, const query::range<clustering_key_prefix>& r) const;
|
||||
rows_type::iterator lower_bound(const schema& schema, const query::range<clustering_key_prefix>& r);
|
||||
rows_type::iterator upper_bound(const schema& schema, const query::range<clustering_key_prefix>& r);
|
||||
boost::iterator_range<rows_type::iterator> range(const schema& schema, const query::range<clustering_key_prefix>& r);
|
||||
// Returns the number of live CQL rows written. No more than limit.
|
||||
uint32_t query(query::result::partition_writer& pw, const schema& s, gc_clock::time_point now, uint32_t limit = query::max_rows) const;
|
||||
// Writes this partition using supplied query result writer.
|
||||
// The partition should be first compacted with compact_for_query(), otherwise
|
||||
// results may include data which is deleted/expired.
|
||||
// At most row_limit CQL rows will be written and digested.
|
||||
void query_compacted(query::result::partition_writer& pw, const schema& s, uint32_t row_limit) const;
|
||||
void accept(const schema&, mutation_partition_visitor&) const;
|
||||
|
||||
// Returns the number of live CQL rows in this partition.
|
||||
|
||||
@@ -57,75 +57,96 @@ query::result
|
||||
to_data_query_result(const reconcilable_result& r, schema_ptr s, const query::partition_slice& slice) {
|
||||
query::result::builder builder(slice, query::result_request::only_result);
|
||||
for (const partition& p : r.partitions()) {
|
||||
auto pb = builder.add_partition(*s, p._m.key(*s));
|
||||
p.mut().unfreeze(s).partition().query(pb, *s, gc_clock::time_point::min(), query::max_rows);
|
||||
p.mut().unfreeze(s).query(builder, slice, gc_clock::time_point::min(), query::max_rows);
|
||||
}
|
||||
return builder.build();
|
||||
}
|
||||
|
||||
|
||||
// Constructs a querying_reader.
//
// s and consumer are taken by value and moved into the object. source, range
// and slice are stored as references (see the member initializers below), so
// the referenced objects have to outlive this reader.
// row_limit initializes both _requested_limit and the running _limit.
// No reader is created here; that happens in read().
querying_reader::querying_reader(schema_ptr s,
|
||||
const mutation_source& source,
|
||||
const query::partition_range& range,
|
||||
const query::partition_slice& slice,
|
||||
uint32_t row_limit,
|
||||
gc_clock::time_point query_time,
|
||||
std::function<void(uint32_t, mutation&&)> consumer)
|
||||
: _schema(std::move(s))
|
||||
, _range(range)
|
||||
|
|
||||
, _slice(slice)
|
||||
, _requested_limit(row_limit)
|
||||
, _query_time(query_time)
|
||||
, _limit(row_limit)
|
||||
, _source(source)
|
||||
, _consumer(std::move(consumer))
|
||||
{ }
|
||||
|
||||
// Runs the query.
//
// Creates a reader over _range from _source, then passes it to consume()
// together with a per-mutation callback. The callback compacts each mutation
// for the query, hands non-empty results to _consumer and asks consume() to
// stop once _limit has dropped to zero.
// Returns the future produced by consume().
future<> querying_reader::read() {
|
||||
// _reader is an optional member; it is only populated here.
_reader = _source(_schema, _range, service::get_local_sstable_query_read_priority());
|
||||
return consume(*_reader, [this](mutation&& m) {
|
||||
// FIXME: Make data sources respect row_ranges so that we don't have to filter them out here.
|
||||
auto is_distinct = _slice.options.contains(query::partition_slice::option::distinct);
|
||||
auto is_reversed = _slice.options.contains(query::partition_slice::option::reversed);
|
||||
// With the distinct option set, the per-partition limit is 1 instead of
// the remaining overall limit.
auto limit = !is_distinct ? _limit : 1;
|
||||
// NOTE(review): despite its name, rows_left is subtracted from the remaining
// limit and forwarded to the consumer as a row count, so it presumably is the
// number of live rows kept by compact_for_query() - confirm against that function.
auto rows_left = m.partition().compact_for_query(*m.schema(), _query_time,
|
||||
_slice.row_ranges(*m.schema(), m.key()),
|
||||
is_reversed, limit);
|
||||
_limit -= rows_left;
|
||||
|
|
||||
// Forward the mutation if rows were counted or the compacted partition
// is not empty.
if (rows_left || !m.partition().empty()) {
|
||||
// NOTE: We must return all columns, regardless of what's in
|
||||
// partition_slice, for the results to be reconcilable with tombstones.
|
||||
// That's because row's presence depends on existence of any
|
||||
// column in a row (See mutation_partition::query). We could
|
||||
// optimize this case and only send cell timestamps, without data,
|
||||
// for the cells which are not queried for (TODO).
|
||||
_consumer(rows_left, std::move(m));
|
||||
}
|
||||
|
|
||||
// Keep iterating only while some of the row limit is left.
return _limit ? stop_iteration::no : stop_iteration::yes;
|
||||
});
|
||||
}
|
||||
|
||||
// Collects the output of a querying_reader into a reconcilable_result.
//
// The consumer passed to the embedded querying_reader captures `this`, which
// is why the move constructor is deleted: the object must stay at a fixed
// address for as long as the reader may call back into it.
class reconcilable_result_builder {
|
||||
querying_reader _reader;
|
||||
// One entry per mutation delivered by the reader, stored frozen.
std::vector<partition> _result;
|
||||
// Sum of the live_rows values reported by the reader so far.
uint32_t _total = 0;
|
||||
public:
|
||||
// Forwards all arguments to querying_reader and installs a consumer that
// freezes each mutation into _result and adds its live_rows to _total.
reconcilable_result_builder(schema_ptr s,
|
||||
const mutation_source& source,
|
||||
const query::partition_range& range,
|
||||
const query::partition_slice& slice,
|
||||
uint32_t row_limit,
|
||||
gc_clock::time_point query_time)
|
||||
: _reader(std::move(s), source, range, slice, row_limit, query_time, [this] (uint32_t live_rows, mutation&& m) {
|
||||
_result.emplace_back(partition{live_rows, freeze(m)});
|
||||
_total += live_rows;
|
||||
})
|
||||
{ }
|
||||
|
|
||||
reconcilable_result_builder(reconcilable_result_builder&&) = delete; // this captured
|
||||
|
|
||||
// Runs the reader and, once it finishes, returns a reconcilable_result built
// from _total and the collected partitions. _result is moved into the
// returned value. The continuation captures `this`, so the builder has to
// stay alive until the returned future resolves.
future<reconcilable_result> build() {
|
||||
return _reader.read().then([this] {
|
||||
return make_ready_future<reconcilable_result>(reconcilable_result(_total, std::move(_result)));
|
||||
});
|
||||
}
|
||||
};
|
||||
|
||||
future<reconcilable_result>
|
||||
mutation_query(schema_ptr s,
|
||||
const mutation_source& source,
|
||||
const query::partition_range& range,
|
||||
const query::partition_slice& slice,
|
||||
uint32_t row_limit,
|
||||
gc_clock::time_point query_time)
|
||||
const mutation_source& source,
|
||||
const query::partition_range& range,
|
||||
const query::partition_slice& slice,
|
||||
uint32_t row_limit,
|
||||
gc_clock::time_point query_time)
|
||||
{
|
||||
struct query_state {
|
||||
const query::partition_range& range;
|
||||
const query::partition_slice& slice;
|
||||
uint32_t requested_limit;
|
||||
gc_clock::time_point query_time;
|
||||
uint32_t limit;
|
||||
mutation_reader reader;
|
||||
std::vector<partition> result;
|
||||
|
||||
query_state(
|
||||
const query::partition_range& range,
|
||||
const query::partition_slice& slice,
|
||||
uint32_t requested_limit,
|
||||
gc_clock::time_point query_time
|
||||
)
|
||||
: range(range)
|
||||
, slice(slice)
|
||||
, requested_limit(requested_limit)
|
||||
, query_time(query_time)
|
||||
, limit(requested_limit)
|
||||
{ }
|
||||
};
|
||||
|
||||
if (row_limit == 0) {
|
||||
return make_ready_future<reconcilable_result>(reconcilable_result());
|
||||
}
|
||||
|
||||
return do_with(query_state(range, slice, row_limit, query_time),
|
||||
[&source, s = std::move(s)] (query_state& state) -> future<reconcilable_result> {
|
||||
state.reader = source(std::move(s), state.range, service::get_local_sstable_query_read_priority());
|
||||
return consume(state.reader, [&state] (mutation&& m) {
|
||||
// FIXME: Make data sources respect row_ranges so that we don't have to filter them out here.
|
||||
auto is_distinct = state.slice.options.contains(query::partition_slice::option::distinct);
|
||||
auto is_reversed = state.slice.options.contains(query::partition_slice::option::reversed);
|
||||
auto limit = !is_distinct ? state.limit : 1;
|
||||
auto rows_left = m.partition().compact_for_query(*m.schema(), state.query_time, state.slice.row_ranges(*m.schema(), m.key()),
|
||||
is_reversed, limit);
|
||||
state.limit -= rows_left;
|
||||
|
||||
if (rows_left || !m.partition().empty()) {
|
||||
// NOTE: We must return all columns, regardless of what's in
|
||||
// partition_slice, for the results to be reconcilable with tombstones.
|
||||
// That's because row's presence depends on existence of any
|
||||
// column in a row (See mutation_partition::query). We could
|
||||
// optimize this case and only send cell timestamps, without data,
|
||||
// for the cells which are not queried for (TODO).
|
||||
state.result.emplace_back(partition{rows_left, freeze(m)});
|
||||
}
|
||||
|
||||
return state.limit ? stop_iteration::no : stop_iteration::yes;
|
||||
}).then([&state] {
|
||||
return make_ready_future<reconcilable_result>(
|
||||
reconcilable_result(state.requested_limit - state.limit, std::move(state.result)));
|
||||
});
|
||||
});
|
||||
auto b_ptr = std::make_unique<reconcilable_result_builder>(std::move(s), source, range, slice, row_limit, query_time);
|
||||
auto& b = *b_ptr;
|
||||
return b.build().finally([keep = std::move(b_ptr)] {});
|
||||
}
|
||||
|
||||
std::ostream& operator<<(std::ostream& out, const reconcilable_result::printer& pr) {
|
||||
|
||||
@@ -114,3 +114,26 @@ future<reconcilable_result> mutation_query(
|
||||
const query::partition_slice& slice,
|
||||
uint32_t row_limit,
|
||||
gc_clock::time_point query_time);
|
||||
|
||||
|
||||
// Reads mutations from a mutation_source for a given partition range and
// slice, and passes them to a caller-supplied consumer callback.
//
// _range, _slice and _source are held by reference; the objects they refer to
// must outlive the querying_reader.
class querying_reader {
|
||||
schema_ptr _schema;
|
||||
const query::partition_range& _range;
|
||||
const query::partition_slice& _slice;
|
||||
// Row limit as originally requested by the caller.
uint32_t _requested_limit;
|
||||
gc_clock::time_point _query_time;
|
||||
// Running row limit; starts at the requested limit.
uint32_t _limit;
|
||||
const mutation_source& _source;
|
||||
// Callback receiving a row count and the mutation (by rvalue reference).
std::function<void(uint32_t, mutation&&)> _consumer;
|
||||
// Optional, so the reader does not have to exist at construction time.
std::experimental::optional<mutation_reader> _reader;
|
||||
public:
|
||||
querying_reader(schema_ptr s,
|
||||
const mutation_source& source,
|
||||
const query::partition_range& range,
|
||||
const query::partition_slice& slice,
|
||||
uint32_t row_limit,
|
||||
gc_clock::time_point query_time,
|
||||
std::function<void(uint32_t, mutation&&)> consumer);
|
||||
|
|
||||
// Performs the read; the returned future signals completion.
future<> read();
|
||||
};
|
||||
|
||||
@@ -127,3 +127,15 @@ partition_slice_builder::reversed() {
|
||||
_options.set<query::partition_slice::option::reversed>();
|
||||
return *this;
|
||||
}
|
||||
|
||||
// Removes the send_partition_key option from the options of the slice being
// built. Returns *this so that builder calls can be chained.
partition_slice_builder&
|
||||
partition_slice_builder::without_partition_key_columns() {
|
||||
_options.remove<query::partition_slice::option::send_partition_key>();
|
||||
return *this;
|
||||
}
|
||||
|
||||
// Removes the send_clustering_key option from the options of the slice being
// built. Returns *this so that builder calls can be chained.
partition_slice_builder&
|
||||
partition_slice_builder::without_clustering_key_columns() {
|
||||
_options.remove<query::partition_slice::option::send_clustering_key>();
|
||||
return *this;
|
||||
}
|
||||
|
||||
@@ -50,6 +50,8 @@ public:
|
||||
partition_slice_builder& with_regular_column(bytes name);
|
||||
partition_slice_builder& with_no_regular_columns();
|
||||
partition_slice_builder& with_range(query::clustering_range range);
|
||||
partition_slice_builder& without_partition_key_columns();
|
||||
partition_slice_builder& without_clustering_key_columns();
|
||||
partition_slice_builder& reversed();
|
||||
|
||||
query::partition_slice build();
|
||||
|
||||
@@ -201,7 +201,7 @@ result_set::from_raw_result(schema_ptr s, const partition_slice& slice, const re
|
||||
|
||||
result_set::result_set(const mutation& m) : result_set([&m] {
|
||||
auto slice = partition_slice_builder(*m.schema()).build();
|
||||
auto qr = m.query(slice, result_request::only_result);
|
||||
auto qr = mutation(m).query(slice, result_request::only_result);
|
||||
return result_set::from_raw_result(m.schema(), slice, qr);
|
||||
}())
|
||||
{ }
|
||||
|
||||
@@ -83,6 +83,7 @@ public:
|
||||
}
|
||||
throw null_column_value(column_name);
|
||||
}
|
||||
const std::unordered_map<sstring, data_value>& cells() const { return _cells; }
|
||||
friend inline bool operator==(const result_set_row& x, const result_set_row& y);
|
||||
friend inline bool operator!=(const result_set_row& x, const result_set_row& y);
|
||||
friend std::ostream& operator<<(std::ostream& out, const result_set_row& row);
|
||||
|
||||
@@ -33,6 +33,7 @@
|
||||
#include <boost/algorithm/string/classification.hpp>
|
||||
|
||||
#include <cryptopp/sha.h>
|
||||
#include <seastar/core/gate.hh>
|
||||
|
||||
static logging::logger logger("repair");
|
||||
|
||||
@@ -326,7 +327,7 @@ static future<partition_checksum> checksum_range_shard(database &db,
|
||||
const ::range<dht::token>& range) {
|
||||
auto& cf = db.find_column_family(keyspace_name, cf_name);
|
||||
return do_with(query::to_partition_range(range), [&cf] (const auto& partition_range) {
|
||||
return do_with(cf.make_reader(cf.schema(), partition_range, service::get_local_mutation_stream_priority()), partition_checksum(),
|
||||
return do_with(cf.make_reader(cf.schema(), partition_range, service::get_local_streaming_read_priority()), partition_checksum(),
|
||||
[] (auto& reader, auto& checksum) {
|
||||
return repeat([&reader, &checksum] () {
|
||||
return reader().then([&checksum] (auto mopt) {
|
||||
@@ -415,6 +416,21 @@ static void split_and_add(std::vector<::range<dht::token>>& ranges,
|
||||
ranges.push_back(halves.first);
|
||||
ranges.push_back(halves.second);
|
||||
}
|
||||
// We don't need to wait for one checksum to finish before we start the
|
||||
// next, but doing too many of these operations in parallel also doesn't
|
||||
// make sense, so we limit the number of concurrent ongoing checksum
|
||||
// requests with a semaphore.
|
||||
//
|
||||
// FIXME: We shouldn't use a magic number here, but rather bind it to
|
||||
// some resource. Otherwise we'll be doing too little in some machines,
|
||||
// and too much in others.
|
||||
//
|
||||
// FIXME: This would be better off in a repair service, or even a per-shard
|
||||
// repair instance holding all repair state. However, since we are anyway
|
||||
// considering ditching those semaphores for a more fine grained resource-based
|
||||
// solution, let's do the simplest thing here and change it later
|
||||
constexpr int parallelism = 100;
|
||||
static thread_local semaphore parallelism_semaphore(parallelism);
|
||||
|
||||
// Repair a single cf in a single local range.
|
||||
// Comparable to RepairJob in Origin.
|
||||
@@ -461,21 +477,14 @@ static future<> repair_cf_range(seastar::sharded<database>& db,
|
||||
split_and_add(ranges, range, estimated_partitions, 100);
|
||||
}
|
||||
|
||||
// We don't need to wait for one checksum to finish before we start the
|
||||
// next, but doing too many of these operations in parallel also doesn't
|
||||
// make sense, so we limit the number of concurrent ongoing checksum
|
||||
// requests with a semaphore.
|
||||
//
|
||||
// FIXME: We shouldn't use a magic number here, but rather bind it to
|
||||
// some resource. Otherwise we'll be doing too little in some machines,
|
||||
// and too much in others.
|
||||
constexpr int parallelism = 10;
|
||||
return do_with(semaphore(parallelism), true, std::move(keyspace), std::move(cf), std::move(ranges),
|
||||
[&db, &neighbors, parallelism] (auto& sem, auto& success, const auto& keyspace, const auto& cf, const auto& ranges) {
|
||||
return do_for_each(ranges, [&sem, &success, &db, &neighbors, &keyspace, &cf]
|
||||
return do_with(seastar::gate(), true, std::move(keyspace), std::move(cf), std::move(ranges),
|
||||
[&db, &neighbors] (auto& completion, auto& success, const auto& keyspace, const auto& cf, const auto& ranges) {
|
||||
return do_for_each(ranges, [&completion, &success, &db, &neighbors, &keyspace, &cf]
|
||||
(const auto& range) {
|
||||
|
||||
check_in_shutdown();
|
||||
return sem.wait(1).then([&sem, &success, &db, &neighbors, &keyspace, &cf, &range] {
|
||||
return parallelism_semaphore.wait(1).then([&completion, &success, &db, &neighbors, &keyspace, &cf, &range] {
|
||||
|
||||
// Ask this node, and all neighbors, to calculate checksums in
|
||||
// this range. When all are done, compare the results, and if
|
||||
// there are any differences, sync the content of this range.
|
||||
@@ -487,6 +496,8 @@ static future<> repair_cf_range(seastar::sharded<database>& db,
|
||||
net::get_local_messaging_service().send_repair_checksum_range(
|
||||
net::msg_addr{neighbor},keyspace, cf, range));
|
||||
}
|
||||
|
||||
completion.enter();
|
||||
when_all(checksums.begin(), checksums.end()).then(
|
||||
[&db, &keyspace, &cf, &range, &neighbors, &success]
|
||||
(std::vector<future<partition_checksum>> checksums) {
|
||||
@@ -532,10 +543,13 @@ static future<> repair_cf_range(seastar::sharded<database>& db,
|
||||
// tell the caller.
|
||||
success = false;
|
||||
logger.warn("Failed sync of range {}: {}", range, eptr);
|
||||
}).finally([&sem] { sem.signal(1); });
|
||||
}).finally([&completion] {
|
||||
parallelism_semaphore.signal(1);
|
||||
completion.leave(); // notify do_for_each that we're done
|
||||
});
|
||||
});
|
||||
}).finally([&sem, &success, parallelism] {
|
||||
return sem.wait(parallelism).then([&success] {
|
||||
}).finally([&success, &completion] {
|
||||
return completion.close().then([&success] {
|
||||
return success ? make_ready_future<>() :
|
||||
make_exception_future<>(std::runtime_error("Checksum or sync of partial range failed"));
|
||||
});
|
||||
|
||||
69
reversibly_mergeable.hh
Normal file
69
reversibly_mergeable.hh
Normal file
@@ -0,0 +1,69 @@
|
||||
/*
|
||||
* Copyright (C) 2016 Cloudius Systems, Ltd.
|
||||
*/
|
||||
|
||||
/*
|
||||
* This file is part of Scylla.
|
||||
*
|
||||
* Scylla is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Affero General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* Scylla is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "utils/allocation_strategy.hh"
|
||||
#include <seastar/util/defer.hh>
|
||||
|
||||
//
|
||||
// ~~ Definitions ~~
|
||||
//
|
||||
// Mergeable type is a type which has an associated "apply" binary operation (T x T -> T)
|
||||
// which forms a commutative semigroup with instances of that type.
|
||||
//
|
||||
// ReversiblyMergeable type is a Mergeable type which has two binary operations associated,
|
||||
// "apply_reversibly" and "revert", both working on objects of that type (T x T -> T x T)
|
||||
// with the following properties:
|
||||
//
|
||||
// apply_reversibly(x, y) = (x', y')
|
||||
// revert(x', y') = (x'', y'')
|
||||
//
|
||||
// x' = apply(x, y)
|
||||
// x'' = x
|
||||
// apply(x'', y'') = apply(x, y)
|
||||
//
|
||||
// Note that it is not guaranteed that y'' = y and the state of y' is unspecified.
|
||||
//
|
||||
// ~~ API ~~
|
||||
//
|
||||
// "apply_reversibly" and "revert" are usually implemented as instance methods or functions
|
||||
// mutating both arguments to store the result of the operation in them.
|
||||
//
|
||||
// "revert" is not allowed to throw. If "apply_reversibly" throws the objects on which it operates
|
||||
// are left in valid states, with guarantees the same as if a successful apply_reversibly() was
|
||||
// followed by revert().
|
||||
//
|
||||
|
||||
|
||||
// Function object which merges src into dst by calling the member function
// dst.apply_reversibly(src). Both arguments are taken by non-const reference.
// The call operator is not declared noexcept.
template<typename T>
|
||||
struct default_reversible_applier {
|
||||
void operator()(T& dst, T& src) const {
|
||||
dst.apply_reversibly(src);
|
||||
}
|
||||
};
|
||||
|
||||
// Function object which undoes a reversible merge by calling the member
// function dst.revert(src). Both arguments are taken by non-const reference.
// The call operator is declared noexcept, so T::revert() must not throw.
template<typename T>
|
||||
struct default_reverter {
|
||||
void operator()(T& dst, T& src) const noexcept {
|
||||
dst.revert(src);
|
||||
}
|
||||
};
|
||||
@@ -36,14 +36,29 @@ done
|
||||
. /etc/os-release
|
||||
|
||||
if [ "$ID" = "ubuntu" ]; then
|
||||
if [ "$LOCAL_PKG" = "" ]; then
|
||||
echo "#!/bin/sh" >> /usr/sbin/policy-rc.d
|
||||
echo "exit 101" >> /usr/sbin/policy-rc.d
|
||||
chmod +x /usr/sbin/policy-rc.d
|
||||
cp /etc/hosts /etc/hosts.orig
|
||||
echo 127.0.0.1 `hostname` >> /etc/hosts
|
||||
if [ $UNSTABLE -eq 0 ]; then
|
||||
echo "deb http://s3.amazonaws.com/downloads.scylladb.com/deb/ubuntu trusty/scylladb multiverse" > /etc/apt/sources.list.d/scylla.list
|
||||
apt-get update
|
||||
else
|
||||
echo "deb https://s3.amazonaws.com/downloads.scylladb.com/deb/unstable/ubuntu/master/latest trusty/scylladb multiverse" > /etc/apt/sources.list.d/scylla.list
|
||||
fi
|
||||
apt-get update
|
||||
if [ "$LOCAL_PKG" = "" ]; then
|
||||
apt-get install -y --force-yes scylla-server scylla-jmx scylla-tools
|
||||
else
|
||||
apt-get install -y --force-yes gdebi-core
|
||||
gdebi $LOCAL_PKG/scylla-server*.deb $LOCAL_PKG/scylla-jmx*.deb $LOCAL_PKG/scylla-tools*.deb
|
||||
if [ ! -f /usr/bin/gdebi ]; then
|
||||
apt-get install -y --force-yes gdebi-core
|
||||
fi
|
||||
echo Y | gdebi $LOCAL_PKG/scylla-server*.deb
|
||||
echo Y | gdebi $LOCAL_PKG/scylla-jmx*.deb
|
||||
echo Y | gdebi $LOCAL_PKG/scylla-tools*.deb
|
||||
fi
|
||||
mv /etc/hosts.orig /etc/hosts
|
||||
rm /usr/sbin/policy-rc.d
|
||||
else
|
||||
if [ "$ID" = "fedora" ]; then
|
||||
if [ $UNSTABLE -eq 0 ]; then
|
||||
|
||||
2
seastar
2
seastar
Submodule seastar updated: 6a207e11b1...0225940222
@@ -26,7 +26,8 @@ namespace service {
|
||||
class priority_manager {
|
||||
::io_priority_class _commitlog_priority;
|
||||
::io_priority_class _mt_flush_priority;
|
||||
::io_priority_class _mut_stream_priority;
|
||||
::io_priority_class _stream_read_priority;
|
||||
::io_priority_class _stream_write_priority;
|
||||
::io_priority_class _sstable_query_read;
|
||||
::io_priority_class _compaction_priority;
|
||||
|
||||
@@ -42,8 +43,13 @@ public:
|
||||
}
|
||||
|
||||
const ::io_priority_class&
|
||||
mutation_stream_priority() {
|
||||
return _mut_stream_priority;
|
||||
streaming_read_priority() {
|
||||
return _stream_read_priority;
|
||||
}
|
||||
|
||||
const ::io_priority_class&
|
||||
streaming_write_priority() {
|
||||
return _stream_write_priority;
|
||||
}
|
||||
|
||||
const ::io_priority_class&
|
||||
@@ -59,7 +65,8 @@ public:
|
||||
priority_manager()
|
||||
: _commitlog_priority(engine().register_one_priority_class("commitlog", 100))
|
||||
, _mt_flush_priority(engine().register_one_priority_class("memtable_flush", 100))
|
||||
, _mut_stream_priority(engine().register_one_priority_class("streaming", 100))
|
||||
, _stream_read_priority(engine().register_one_priority_class("streaming_read", 20))
|
||||
, _stream_write_priority(engine().register_one_priority_class("streaming_write", 20))
|
||||
, _sstable_query_read(engine().register_one_priority_class("query", 100))
|
||||
, _compaction_priority(engine().register_one_priority_class("compaction", 100))
|
||||
|
||||
@@ -78,8 +85,13 @@ get_local_memtable_flush_priority() {
|
||||
}
|
||||
|
||||
const inline ::io_priority_class&
|
||||
get_local_mutation_stream_priority() {
|
||||
return get_local_priority_manager().mutation_stream_priority();
|
||||
get_local_streaming_read_priority() {
|
||||
return get_local_priority_manager().streaming_read_priority();
|
||||
}
|
||||
|
||||
const inline ::io_priority_class&
|
||||
get_local_streaming_write_priority() {
|
||||
return get_local_priority_manager().streaming_write_priority();
|
||||
}
|
||||
|
||||
const inline ::io_priority_class&
|
||||
|
||||
@@ -835,6 +835,15 @@ storage_proxy::mutate_locally(std::vector<mutation> mutations) {
|
||||
});
|
||||
}
|
||||
|
||||
// Applies the frozen mutation m on the shard returned by shard_of(m), by
// calling database::apply_streaming_mutation() on that shard's database.
//
// m is captured by reference in the lambda sent to the other shard, so the
// caller must keep it alive until the returned future resolves. The schema is
// carried into the lambda wrapped in a global_schema_ptr.
future<>
|
||||
storage_proxy::mutate_streaming_mutation(const schema_ptr& s, const frozen_mutation& m) {
|
||||
auto shard = _db.local().shard_of(m);
|
||||
return _db.invoke_on(shard, [&m, gs = global_schema_ptr(s)] (database& db) mutable -> future<> {
|
||||
return db.apply_streaming_mutation(gs, m);
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Helper for create_write_response_handler, shared across mutate/mutate_atomically.
|
||||
* Both methods do roughly the same thing, with the latter intermixing batch log ops
|
||||
|
||||
@@ -181,6 +181,8 @@ public:
|
||||
future<> mutate_locally(const schema_ptr&, const frozen_mutation& m);
|
||||
future<> mutate_locally(std::vector<mutation> mutations);
|
||||
|
||||
future<> mutate_streaming_mutation(const schema_ptr&, const frozen_mutation& m);
|
||||
|
||||
/**
|
||||
* Use this method to have these Mutations applied
|
||||
* across all replicas. This method will take care
|
||||
|
||||
@@ -749,14 +749,14 @@ void storage_service::on_join(gms::inet_address endpoint, gms::endpoint_state ep
|
||||
on_change(endpoint, e.first, e.second);
|
||||
}
|
||||
get_local_migration_manager().schedule_schema_pull(endpoint, ep_state).handle_exception([endpoint] (auto ep) {
|
||||
logger.warn("Fail to pull schmea from {}: {}", endpoint, ep);
|
||||
logger.warn("Fail to pull schema from {}: {}", endpoint, ep);
|
||||
});
|
||||
}
|
||||
|
||||
void storage_service::on_alive(gms::inet_address endpoint, gms::endpoint_state state) {
|
||||
logger.debug("endpoint={} on_alive", endpoint);
|
||||
get_local_migration_manager().schedule_schema_pull(endpoint, state).handle_exception([endpoint] (auto ep) {
|
||||
logger.warn("Fail to pull schmea from {}: {}", endpoint, ep);
|
||||
logger.warn("Fail to pull schema from {}: {}", endpoint, ep);
|
||||
});
|
||||
if (_token_metadata.is_member(endpoint)) {
|
||||
#if 0
|
||||
@@ -813,7 +813,7 @@ void storage_service::on_change(inet_address endpoint, application_state state,
|
||||
do_update_system_peers_table(endpoint, state, value);
|
||||
if (state == application_state::SCHEMA) {
|
||||
get_local_migration_manager().schedule_schema_pull(endpoint, *ep_state).handle_exception([endpoint] (auto ep) {
|
||||
logger.warn("Fail to pull schmea from {}: {}", endpoint, ep);
|
||||
logger.warn("Failed to pull schema from {}: {}", endpoint, ep);
|
||||
});
|
||||
}
|
||||
}
|
||||
@@ -2481,7 +2481,7 @@ void storage_service::add_expire_time_if_found(inet_address endpoint, int64_t ex
|
||||
// in there.
|
||||
future<> storage_service::load_new_sstables(sstring ks_name, sstring cf_name) {
|
||||
class max_element {
|
||||
int64_t _result = 1;
|
||||
int64_t _result = 0;
|
||||
public:
|
||||
future<> operator()(int64_t value) {
|
||||
_result = std::max(value, _result);
|
||||
@@ -2514,18 +2514,37 @@ future<> storage_service::load_new_sstables(sstring ks_name, sstring cf_name) {
|
||||
auto& cf = db.find_column_family(ks_name, cf_name);
|
||||
return cf.disable_sstable_write();
|
||||
}).then([this, cf_name, ks_name] (int64_t max_seen_sstable) {
|
||||
logger.debug("Loading new sstables with generation numbers larger or equal than {}", max_seen_sstable);
|
||||
// Then, we will reshuffle the tables to make sure that the generation numbers don't go too high.
|
||||
// We will do all of it the same CPU, to make sure that we won't have two parallel shufflers stepping
|
||||
// onto each other.
|
||||
//
|
||||
// Note that this will reshuffle all tables, including existing ones. Figuring out which of the tables
|
||||
// are new would require coordination between all shards, so it is simpler this way. Renaming an existing
|
||||
// SSTable shouldn't be that bad, and we are assuming empty directory for normal operation anyway.
|
||||
auto shard = std::hash<sstring>()(cf_name) % smp::count;
|
||||
return _db.invoke_on(shard, [ks_name, cf_name, max_seen_sstable] (database& db) {
|
||||
|
||||
class all_generations {
|
||||
std::set<int64_t> _result;
|
||||
public:
|
||||
future<> operator()(std::set<int64_t> value) {
|
||||
_result.insert(value.begin(), value.end());
|
||||
return make_ready_future<>();
|
||||
}
|
||||
std::set<int64_t> get() && {
|
||||
return _result;
|
||||
}
|
||||
};
|
||||
|
||||
// We provide to reshuffle_sstables() the generation of all existing sstables, such that it will
|
||||
// easily know which sstables are new.
|
||||
return _db.map_reduce(all_generations(), [ks_name, cf_name] (database& db) {
|
||||
auto& cf = db.find_column_family(ks_name, cf_name);
|
||||
return cf.reshuffle_sstables(max_seen_sstable);
|
||||
std::set<int64_t> generations;
|
||||
for (auto& p : *(cf.get_sstables())) {
|
||||
generations.insert(p.second->generation());
|
||||
}
|
||||
return make_ready_future<std::set<int64_t>>(std::move(generations));
|
||||
}).then([this, max_seen_sstable, ks_name, cf_name] (std::set<int64_t> all_generations) {
|
||||
auto shard = std::hash<sstring>()(cf_name) % smp::count;
|
||||
return _db.invoke_on(shard, [ks_name, cf_name, max_seen_sstable, all_generations = std::move(all_generations)] (database& db) {
|
||||
auto& cf = db.find_column_family(ks_name, cf_name);
|
||||
return cf.reshuffle_sstables(std::move(all_generations), max_seen_sstable + 1);
|
||||
});
|
||||
});
|
||||
}).then_wrapped([this, ks_name, cf_name] (future<std::vector<sstables::entry_descriptor>> f) {
|
||||
std::vector<sstables::entry_descriptor> new_tables;
|
||||
|
||||
@@ -139,7 +139,7 @@ compact_sstables(std::vector<shared_sstable> sstables, column_family& cf, std::f
|
||||
|
||||
db::replay_position rp;
|
||||
|
||||
auto all_sstables = cf.get_sstables();
|
||||
auto all_sstables = cf.get_sstables_including_compacted_undeleted();
|
||||
std::sort(sstables.begin(), sstables.end(), [] (const shared_sstable& x, const shared_sstable& y) {
|
||||
return x->generation() < y->generation();
|
||||
});
|
||||
|
||||
@@ -229,9 +229,14 @@ public:
|
||||
: _compression_metadata(cm)
|
||||
{
|
||||
_beg_pos = pos;
|
||||
if (pos >= _compression_metadata->data_len) {
|
||||
if (pos > _compression_metadata->data_len) {
|
||||
throw std::runtime_error("attempt to uncompress beyond end");
|
||||
}
|
||||
if (len == 0 || pos == _compression_metadata->data_len) {
|
||||
// Nothing to read
|
||||
_end_pos = _pos = _beg_pos;
|
||||
return;
|
||||
}
|
||||
if (len <= _compression_metadata->data_len - pos) {
|
||||
_end_pos = pos + len;
|
||||
} else {
|
||||
|
||||
@@ -42,10 +42,16 @@ public:
|
||||
}
|
||||
};
|
||||
|
||||
class index_consume_entry_context: public data_consumer::continuous_data_consumer<index_consume_entry_context> {
|
||||
// IndexConsumer is a concept that implements:
|
||||
//
|
||||
// bool should_continue();
|
||||
// void consume_entry(index_entry&& ie);
|
||||
template <class IndexConsumer>
|
||||
class index_consume_entry_context: public data_consumer::continuous_data_consumer<index_consume_entry_context<IndexConsumer>> {
|
||||
using proceed = data_consumer::proceed;
|
||||
using continuous_data_consumer = data_consumer::continuous_data_consumer<index_consume_entry_context<IndexConsumer>>;
|
||||
private:
|
||||
index_consumer& _consumer;
|
||||
IndexConsumer& _consumer;
|
||||
|
||||
enum class state {
|
||||
START,
|
||||
@@ -66,7 +72,7 @@ public:
|
||||
|
||||
bool non_consuming() const {
|
||||
return ((_state == state::CONSUME_ENTRY) || (_state == state::START) ||
|
||||
((_state == state::PROMOTED_BYTES) && (_prestate == prestate::NONE)));
|
||||
((_state == state::PROMOTED_BYTES) && (continuous_data_consumer::_prestate == continuous_data_consumer::prestate::NONE)));
|
||||
}
|
||||
|
||||
proceed process_state(temporary_buffer<char>& data) {
|
||||
@@ -79,32 +85,32 @@ public:
|
||||
_state = state::KEY_SIZE;
|
||||
break;
|
||||
case state::KEY_SIZE:
|
||||
if (read_16(data) != read_status::ready) {
|
||||
if (this->read_16(data) != continuous_data_consumer::read_status::ready) {
|
||||
_state = state::KEY_BYTES;
|
||||
break;
|
||||
}
|
||||
case state::KEY_BYTES:
|
||||
if (read_bytes(data, _u16, _key) != read_status::ready) {
|
||||
if (this->read_bytes(data, this->_u16, _key) != continuous_data_consumer::read_status::ready) {
|
||||
_state = state::POSITION;
|
||||
break;
|
||||
}
|
||||
case state::POSITION:
|
||||
if (read_64(data) != read_status::ready) {
|
||||
if (this->read_64(data) != continuous_data_consumer::read_status::ready) {
|
||||
_state = state::PROMOTED_SIZE;
|
||||
break;
|
||||
}
|
||||
case state::PROMOTED_SIZE:
|
||||
if (read_32(data) != read_status::ready) {
|
||||
if (this->read_32(data) != continuous_data_consumer::read_status::ready) {
|
||||
_state = state::PROMOTED_BYTES;
|
||||
break;
|
||||
}
|
||||
case state::PROMOTED_BYTES:
|
||||
if (read_bytes(data, _u32, _promoted) != read_status::ready) {
|
||||
if (this->read_bytes(data, this->_u32, _promoted) != continuous_data_consumer::read_status::ready) {
|
||||
_state = state::CONSUME_ENTRY;
|
||||
break;
|
||||
}
|
||||
case state::CONSUME_ENTRY:
|
||||
_consumer.consume_entry(index_entry(std::move(_key), _u64, std::move(_promoted)));
|
||||
_consumer.consume_entry(index_entry(std::move(_key), this->_u64, std::move(_promoted)));
|
||||
_state = state::START;
|
||||
break;
|
||||
default:
|
||||
@@ -113,7 +119,7 @@ public:
|
||||
return proceed::yes;
|
||||
}
|
||||
|
||||
index_consume_entry_context(index_consumer& consumer,
|
||||
index_consume_entry_context(IndexConsumer& consumer,
|
||||
input_stream<char>&& input, uint64_t maxlen)
|
||||
: continuous_data_consumer(std::move(input), maxlen)
|
||||
, _consumer(consumer)
|
||||
|
||||
@@ -57,21 +57,18 @@ enum class composite_marker : bytes::value_type {
|
||||
end_range = 1,
|
||||
};
|
||||
|
||||
// Sanity-checks the marker carried in the last byte of a serialized composite
// component.
//
// Any of the three known markers (none, start_range, end_range) is accepted;
// which one is present is deliberately not checked. Throws runtime_exception
// when the component is empty or its last byte is not a known marker.
inline void check_marker(bytes_view component) {
    if (component.empty()) {
        // back() on an empty view is undefined; report it like any other bad marker.
        throw runtime_exception(sstring("Unexpected marker. Component is empty\n"));
    }
    auto found = composite_marker(component.back());
    switch (found) {
    case composite_marker::none:
    case composite_marker::start_range:
    case composite_marker::end_range:
        break;
    default:
        // One directive for the one argument: the previous text also had an
        // "expected %d" directive with nothing to fill it.
        // The uint16_t cast makes the value format as a number, not a character.
        throw runtime_exception(sprint("Unexpected marker. Found %d\n", uint16_t(uint8_t(found))));
    }
}
|
||||
|
||||
// Checks that the marker in the last byte of `component` is either `expected`
// or `alternative`; throws runtime_exception naming all three values otherwise.
inline void check_marker(bytes_view component, composite_marker expected, composite_marker alternative) {
    const auto found = composite_marker(component.back());
    if ((found != expected) && (found != alternative)) {
        // The message has three directives, so all three values are passed
        // (the alternative was previously missing). Values are widened to
        // uint16_t, as the single-argument check_marker does, so they format
        // as numbers.
        throw runtime_exception(sprint("Unexpected marker. Found %d, expected %d or %d\n",
                uint16_t(uint8_t(found)), uint16_t(uint8_t(expected)), uint16_t(uint8_t(alternative))));
    }
}
|
||||
|
||||
// Our internal representation differs slightly (in the way it serializes) from Origin.
|
||||
// In order to be able to achieve read and write compatibility for sstables - so they can
|
||||
// be imported and exported - we need to always convert a key to this representation.
|
||||
|
||||
@@ -249,6 +249,139 @@ class mp_row_consumer : public row_consumer {
|
||||
_pending_collection = {};
|
||||
}
|
||||
}
|
||||
|
||||
class range_merger {
|
||||
bytes _data;
|
||||
bytes _end;
|
||||
sstables::deletion_time _deletion_time;
|
||||
public:
|
||||
bytes&& data() {
|
||||
return std::move(_data);
|
||||
}
|
||||
explicit operator bool() const noexcept {
|
||||
return !_data.empty();
|
||||
}
|
||||
explicit operator sstring() const {
|
||||
if (*this) {
|
||||
return to_hex(_data) + sprint(" deletion (%x,%lx)", _deletion_time.local_deletion_time, _deletion_time.marked_for_delete_at);
|
||||
} else {
|
||||
return sstring("(null)");
|
||||
}
|
||||
}
|
||||
explicit operator bytes_view() const {
|
||||
return _data;
|
||||
}
|
||||
|
||||
bool operator==(const range_merger& candidate) {
|
||||
if (!candidate) {
|
||||
return false;
|
||||
}
|
||||
bytes_view a(_data);
|
||||
bytes_view b(candidate._data);
|
||||
a.remove_suffix(1);
|
||||
b.remove_suffix(1);
|
||||
return ((a == b) && (_deletion_time == candidate._deletion_time));
|
||||
}
|
||||
|
||||
bool operator!=(const range_merger& candidate) {
|
||||
return !(*this == candidate);
|
||||
}
|
||||
|
||||
bool is_prefix_of(const range_merger& candidate) {
|
||||
bytes_view a(_data);
|
||||
bytes_view b(candidate._data);
|
||||
a.remove_suffix(1);
|
||||
b.remove_suffix(1);
|
||||
return b.compare(0, a.size(), a) == 0;
|
||||
}
|
||||
|
||||
bool end_matches(bytes_view candidate, sstables::deletion_time deltime) {
|
||||
if (_deletion_time != deltime) {
|
||||
return false;
|
||||
}
|
||||
bytes_view my_end(_end);
|
||||
my_end.remove_suffix(1);
|
||||
candidate.remove_suffix(1);
|
||||
return my_end == candidate;
|
||||
}
|
||||
|
||||
void set_end(bytes_view end) {
|
||||
_end = to_bytes(end);
|
||||
}
|
||||
|
||||
range_merger(bytes_view start, bytes_view end, sstables::deletion_time d)
|
||||
: _data(to_bytes(start))
|
||||
, _end(to_bytes(end))
|
||||
, _deletion_time(d)
|
||||
{}
|
||||
range_merger() : _data(), _end(), _deletion_time() {}
|
||||
};
|
||||
|
||||
// Variables for tracking tombstone merging in consume_range_tombstone().
|
||||
// All of these hold serialized composites.
|
||||
std::stack<range_merger> _starts;
|
||||
|
||||
void reset_range_tombstone_merger() {
|
||||
// Will throw if there is a current merger that hasn't finished.
|
||||
// This will be called at the start and end of any row.
|
||||
// This check is crucial to our goal of not falsely reporting a real range tombstone as a
|
||||
// merger.
|
||||
if (!_starts.empty()) {
|
||||
auto msg = sstring("RANGE DELETE not implemented. Tried to merge, but row finished before we could finish the merge. Starts found: (");
|
||||
while (!_starts.empty()) {
|
||||
msg += sstring(_starts.top());
|
||||
_starts.pop();
|
||||
if (!_starts.empty()) {
|
||||
msg += sstring(" , ");
|
||||
}
|
||||
}
|
||||
msg += sstring(")");
|
||||
throw malformed_sstable_exception(msg);
|
||||
}
|
||||
}
|
||||
|
||||
// Pops the innermost open range and returns its serialized start composite.
// The caller must ensure the stack is not empty.
bytes close_merger_range() {
    // data() yields an rvalue, so the bytes are moved out before the pop
    // destroys the entry.
    bytes finished(_starts.top().data());
    _starts.pop();
    return finished;
}
|
||||
|
||||
// Feeds one range tombstone (serialized start/end composites plus deletion
// time) into the stack of open ranges.
//
// Returns the serialized start composite of the range this call closed, or
// empty bytes when no range was closed. Throws malformed_sstable_exception
// when a range is already open and it is not a prefix of the new one.
bytes update_range_tombstone_merger(bytes_view _start, bytes_view end,
        sstables::deletion_time deltime) {
    range_merger start(_start, end, deltime);

    // While a range is open, a new one may only start if it nests inside it,
    // i.e. the open range is a prefix of the new range.
    const bool range_open = !_starts.empty();
    if (range_open && !_starts.top().is_prefix_of(start)) {
        auto msg = sstring("RANGE DELETE not implemented. Tried to merge, but existing range not a prefix of new one. Current range: ");
        msg += sstring(_starts.top());
        msg += ". new range: " + sstring(start);
        throw malformed_sstable_exception(msg);
    }

    // NOTE(review): `prev` is a *copy* of the innermost open range (or an
    // unset merger when nothing is open). The set_end() call below therefore
    // updates only this local copy, not the entry on the stack. That looks
    // unintended, but the intended merge semantics cannot be confirmed from
    // this code alone, so the behaviour is kept as-is.
    range_merger prev;
    if (range_open) {
        prev = _starts.top();
    }
    _starts.push(start);

    if (prev.end_matches(bytes_view(start), deltime)) {
        // The new tombstone begins where the previous one ended, with the
        // same deletion time: treat it as a continuation rather than a new
        // range, and drop the entry that was just pushed.
        prev.set_end(end);
        _starts.pop();
    }

    if (_starts.top().end_matches(end, deltime)) {
        return close_merger_range();
    }
    return {};
}
|
||||
public:
|
||||
mutation_opt mut;
|
||||
|
||||
@@ -366,39 +499,77 @@ public:
|
||||
}
|
||||
}
|
||||
// End-of-row hook. First checks that no range-tombstone merge is left open
// (reset_range_tombstone_merger() throws if one is), then, when a mutation is
// being built, calls flush_pending_collection() on it. Always returns
// proceed::no.
virtual proceed consume_row_end() override {
    reset_range_tombstone_merger();
    if (!mut) {
        return proceed::no;
    }
    flush_pending_collection(*_schema, *mut);
    return proceed::no;
}
|
||||
|
||||
// Partial support for range tombstones read from sstables:
|
||||
//
|
||||
// Currently, Scylla does not support generic range tombstones: Only
|
||||
// ranges which are a complete clustering-key prefix are supported because
|
||||
// our in-memory data structure only allows deleted rows (prefixes).
|
||||
// In principle, this is good enough because in Cassandra 2 (whose
|
||||
// sstables we support) and using only CQL, there is no way to delete a
|
||||
// generic range, because the DELETE and UPDATE statement's "WHERE" only
|
||||
// takes the "=" operator, leading to a deletion of entire rows.
|
||||
//
|
||||
// However, in one important case the sstable written by Cassandra does
|
||||
// have a generic range tombstone, which we can and must handle:
|
||||
// Consider two tombstones, one deleting a bigger prefix than the other:
|
||||
//
|
||||
// create table tab (pk text, ck1 text, ck2 text, data text, primary key(pk, ck1, ck2));
|
||||
// delete from tab where pk = 'pk' and ck1 = 'aaa';
|
||||
// delete from tab where pk = 'pk' and ck1 = 'aaa' and ck2 = 'bbb';
|
||||
//
|
||||
// The first deletion covers the second, but nevertheless we cannot drop the
|
||||
// smaller one because the two deletions have different timestamps.
|
||||
// Currently in Scylla, we simply keep both tombstones separately.
|
||||
// But Cassandra does something different: Cassandra does not want to have
|
||||
// overlapping range tombstones, so it converts them into non-overlapping
|
||||
// range tombstones (see RangeTombstoneList.java). In the above example,
|
||||
// the resulting sstable is (sstable2json format)
|
||||
//
|
||||
// {"key": "pk",
|
||||
// "cells": [["aaa:_","aaa:bbb:_",1459334681228103,"t",1459334681],
|
||||
// ["aaa:bbb:_","aaa:bbb:!",1459334681244989,"t",1459334681],
|
||||
// ["aaa:bbb:!","aaa:!",1459334681228103,"t",1459334681]]}
|
||||
// ]
|
||||
//
|
||||
// In this sstable, the first and third tombstones look like "generic" ranges,
|
||||
// not covering an entire prefix, so we cannot represent these three
|
||||
// tombstones in our in-memory data structure. Instead, we need to convert the
|
||||
// three non-overlapping tombstones to two overlapping whole-prefix tombstones,
|
||||
// the two we started with in the "delete" commands above.
|
||||
// This is what the code below does. If after trying to recombine split
|
||||
// tombstones we are still left with a generic range we cannot represent,
|
||||
// we fail the read.
|
||||
|
||||
virtual void consume_range_tombstone(
|
||||
bytes_view start_col, bytes_view end_col,
|
||||
sstables::deletion_time deltime) override {
|
||||
check_marker(end_col, composite_marker::end_range);
|
||||
// Some versions of Cassandra will write a 0 to mark the start of the range.
|
||||
// CASSANDRA-7593 discusses that.
|
||||
check_marker(start_col, composite_marker::start_range, composite_marker::none);
|
||||
// We used to check that start_col has composite_marker:start_range
|
||||
// and end_col has composite_marker::end_range. But this check is
|
||||
// incorrect. start_col may have composite_marker::none in sstables
|
||||
// from older versions of Cassandra (see CASSANDRA-7593) and we also
|
||||
// saw composite_marker::none in end_col. Also, when a larger range
|
||||
// tombstone was split (see explanation above), we can have a
|
||||
// start_range in end_col or end_range in start_col.
|
||||
// So we don't check the markers' content at all here, only if they
|
||||
// are sane.
|
||||
check_marker(start_col);
|
||||
check_marker(end_col);
|
||||
|
||||
// FIXME: CASSANDRA-6237 says support will be added to things like this.
|
||||
//
|
||||
// The check below represents a range with a different start and end
|
||||
// clustering key. Cassandra-generated files (to the moment) will
|
||||
// generate multi-row deletes, but they always have the same clustering
|
||||
// key. This is basically because one can't (yet) write delete
|
||||
// statements in which the WHERE clause looks like WHERE clustering_key >= x.
|
||||
//
|
||||
// We don't really have it in our model ATM, so let's just mark this unimplemented.
|
||||
//
|
||||
// The only expected difference between them, is the final marker. We
|
||||
// will remove it from end_col to ease the comparison, but will leave
|
||||
// start_col untouched to make sure explode() still works.
|
||||
end_col.remove_suffix(1);
|
||||
if (start_col.compare(0, end_col.size(), end_col)) {
|
||||
fail(unimplemented::cause::RANGE_DELETES);
|
||||
bytes new_start = {};
|
||||
new_start = update_range_tombstone_merger(start_col, end_col, deltime);
|
||||
if (new_start.empty()) {
|
||||
return;
|
||||
}
|
||||
|
||||
start_col = bytes_view(new_start);
|
||||
auto start = composite_view(column::fix_static_name(start_col)).explode();
|
||||
|
||||
// Note how this is slightly different from the check in is_collection. Collection tombstones
|
||||
// do not have extra data.
|
||||
//
|
||||
@@ -466,6 +637,13 @@ sstables::sstable::read_row(schema_ptr schema, const sstables::key& key, const i
|
||||
auto token = partitioner.get_token(key_view(key));
|
||||
|
||||
auto& summary = _summary;
|
||||
|
||||
if (token < partitioner.get_token(key_view(summary.first_key.value))
|
||||
|| token > partitioner.get_token(key_view(summary.last_key.value))) {
|
||||
_filter_tracker.add_false_positive();
|
||||
return make_ready_future<mutation_opt>();
|
||||
}
|
||||
|
||||
auto summary_idx = adjust_binary_search_index(binary_search(summary.entries, key, token));
|
||||
if (summary_idx < 0) {
|
||||
_filter_tracker.add_false_positive();
|
||||
@@ -495,52 +673,59 @@ class mutation_reader::impl {
|
||||
private:
|
||||
mp_row_consumer _consumer;
|
||||
std::experimental::optional<data_consume_context> _context;
|
||||
std::experimental::optional<future<data_consume_context>> _context_future;
|
||||
std::function<future<data_consume_context> ()> _get_context;
|
||||
public:
|
||||
impl(sstable& sst, schema_ptr schema, uint64_t start, uint64_t end,
|
||||
const io_priority_class &pc)
|
||||
: _consumer(schema, pc)
|
||||
, _context(sst.data_consume_rows(_consumer, start, end)) { }
|
||||
, _get_context([&sst, this, start, end] {
|
||||
return make_ready_future<data_consume_context>(sst.data_consume_rows(_consumer, start, end));
|
||||
}) { }
|
||||
impl(sstable& sst, schema_ptr schema,
|
||||
const io_priority_class &pc)
|
||||
: _consumer(schema, pc)
|
||||
, _context(sst.data_consume_rows(_consumer)) { }
|
||||
impl(sstable& sst, schema_ptr schema, future<uint64_t> start, future<uint64_t> end, const io_priority_class& pc)
|
||||
, _get_context([this, &sst] {
|
||||
return make_ready_future<data_consume_context>(sst.data_consume_rows(_consumer));
|
||||
}) { }
|
||||
impl(sstable& sst, schema_ptr schema, std::function<future<uint64_t>()> start, std::function<future<uint64_t>()> end, const io_priority_class& pc)
|
||||
: _consumer(schema, pc)
|
||||
, _context_future(start.then([this, &sst, end = std::move(end)] (uint64_t start) mutable {
|
||||
return end.then([this, &sst, start] (uint64_t end) mutable {
|
||||
return sst.data_consume_rows(_consumer, start, end);
|
||||
});
|
||||
})) { }
|
||||
impl() : _consumer() { }
|
||||
, _get_context([this, &sst, start = std::move(start), end = std::move(end)] () {
|
||||
return start().then([this, &sst, end = std::move(end)] (uint64_t start) {
|
||||
return end().then([this, &sst, start] (uint64_t end) {
|
||||
return make_ready_future<data_consume_context>(sst.data_consume_rows(_consumer, start, end));
|
||||
});
|
||||
});
|
||||
}) { }
|
||||
impl() : _consumer(), _get_context() { }
|
||||
|
||||
// Reference to _consumer is passed to data_consume_rows() in the constructor so we must not allow move/copy
|
||||
impl(impl&&) = delete;
|
||||
impl(const impl&) = delete;
|
||||
|
||||
// Returns the next mutation, or a disengaged optional at EOF.
//
// The data_consume_context is created on first use through _get_context and
// cached in _context. A reader without a factory (the default-constructed
// one) is empty and reports EOF right away. (The branches on `_context` /
// `_context_future` with inlined read continuations that were interleaved
// here are the superseded form; the shared continuation is do_read().)
future<mutation_opt> read() {
    if (!_get_context) {
        // empty mutation reader returns EOF immediately
        return make_ready_future<mutation_opt>();
    }

    if (_context) {
        return do_read();
    }
    return (_get_context)().then([this] (data_consume_context context) {
        _context = std::move(context);
        return do_read();
    });
}
|
||||
private:
|
||||
// Runs one read on the existing context and hands back whatever mutation the
// consumer produced. Requires _context to be engaged.
future<mutation_opt> do_read() {
    return _context->read().then([this] {
        // Moving out of _consumer.mut is not enough to leave it unengaged,
        // so it is reset explicitly; that way a later EOF yields an
        // unengaged optional.
        mutation_opt result = std::move(_consumer.mut);
        _consumer.mut = {};
        return result;
    });
}
|
||||
};
|
||||
|
||||
@@ -649,17 +834,19 @@ sstable::read_range_rows(schema_ptr schema, const query::partition_range& range,
|
||||
fail(unimplemented::cause::WRAP_AROUND);
|
||||
}
|
||||
|
||||
future<uint64_t> start = range.start()
|
||||
? (range.start()->is_inclusive()
|
||||
auto start = [this, range, schema, &pc] {
|
||||
return range.start() ? (range.start()->is_inclusive()
|
||||
? lower_bound(schema, range.start()->value(), pc)
|
||||
: upper_bound(schema, range.start()->value(), pc))
|
||||
: make_ready_future<uint64_t>(0);
|
||||
};
|
||||
|
||||
future<uint64_t> end = range.end()
|
||||
? (range.end()->is_inclusive()
|
||||
auto end = [this, range, schema, &pc] {
|
||||
return range.end() ? (range.end()->is_inclusive()
|
||||
? upper_bound(schema, range.end()->value(), pc)
|
||||
: lower_bound(schema, range.end()->value(), pc))
|
||||
: make_ready_future<uint64_t>(data_size());
|
||||
};
|
||||
|
||||
return std::make_unique<mutation_reader::impl>(
|
||||
*this, std::move(schema), std::move(start), std::move(end), pc);
|
||||
|
||||
@@ -30,6 +30,7 @@
|
||||
#include "core/shared_ptr.hh"
|
||||
#include "core/do_with.hh"
|
||||
#include "core/thread.hh"
|
||||
#include <seastar/core/shared_future.hh>
|
||||
#include <iterator>
|
||||
|
||||
#include "types.hh"
|
||||
@@ -44,6 +45,9 @@
|
||||
#include <boost/filesystem/operations.hpp>
|
||||
#include <boost/algorithm/string.hpp>
|
||||
#include <boost/range/adaptor/map.hpp>
|
||||
#include <boost/range/algorithm_ext/insert.hpp>
|
||||
#include <boost/range/algorithm_ext/push_back.hpp>
|
||||
#include <boost/range/algorithm/set_algorithm.hpp>
|
||||
#include <regex>
|
||||
#include <core/align.hh>
|
||||
#include "utils/phased_barrier.hh"
|
||||
@@ -59,7 +63,12 @@ future<file> new_sstable_component_file(sstring name, open_flags flags) {
|
||||
});
|
||||
}
|
||||
|
||||
thread_local std::unordered_map<sstring, std::unordered_set<unsigned>> sstable::_shards_agreeing_to_remove_sstable;
|
||||
// Opens `name` with the given flags and open options. A failure is logged at
// error level and then passed on to the caller unchanged.
future<file> new_sstable_component_file(sstring name, open_flags flags, file_open_options options) {
    auto log_and_forward = [name] (auto ep) {
        sstlog.error("Could not create SSTable component {}. Found exception: {}", name, ep);
        return make_exception_future<file>(ep);
    };
    return open_file_dma(name, flags, options).handle_exception(std::move(log_and_forward));
}
|
||||
|
||||
static utils::phased_barrier& background_jobs() {
|
||||
static thread_local utils::phased_barrier gate;
|
||||
@@ -682,6 +691,10 @@ inline void write(file_writer& out, estimated_histogram& eh) {
|
||||
// This is small enough, and well-defined. Easier to just read it all
|
||||
// at once
|
||||
future<> sstable::read_toc() {
|
||||
if (_components.size()) {
|
||||
return make_ready_future<>();
|
||||
}
|
||||
|
||||
auto file_path = filename(sstable::component_type::TOC);
|
||||
|
||||
sstlog.debug("Reading TOC file {} ", file_path);
|
||||
@@ -712,6 +725,7 @@ future<> sstable::read_toc() {
|
||||
try {
|
||||
_components.insert(reverse_map(c, _component_map));
|
||||
} catch (std::out_of_range& oor) {
|
||||
_components.clear(); // so subsequent read_toc will be forced to fail again
|
||||
throw malformed_sstable_exception("Unrecognized TOC component: " + c);
|
||||
}
|
||||
}
|
||||
@@ -862,7 +876,7 @@ future<index_list> sstable::read_indexes(uint64_t summary_idx, const io_priority
|
||||
auto stream = make_file_input_stream(this->_index_file, position, end - position, std::move(options));
|
||||
// TODO: it's redundant to constrain the consumer here to stop at
|
||||
// index_size()-position, the input stream is already constrained.
|
||||
auto ctx = make_lw_shared<index_consume_entry_context>(ic, std::move(stream), this->index_size() - position);
|
||||
auto ctx = make_lw_shared<index_consume_entry_context<index_consumer>>(ic, std::move(stream), this->index_size() - position);
|
||||
return ctx->consume_input(*ctx).then([ctx, &ic] {
|
||||
return make_ready_future<index_list>(std::move(ic.indexes));
|
||||
});
|
||||
@@ -934,6 +948,25 @@ void sstable::write_statistics(const io_priority_class& pc) {
|
||||
write_simple<component_type::Statistics>(_statistics, pc);
|
||||
}
|
||||
|
||||
// Makes sure _summary is populated.
//
// Does nothing if the summary is already loaded. Otherwise reads the TOC and
// then either loads the Summary component or, when the TOC does not list one
// or loading it fails, rebuilds it with generate_summary().
//
// `pc` is captured by reference in the continuations, so it must stay alive
// until the returned future resolves.
future<> sstable::read_summary(const io_priority_class& pc) {
    if (_summary) {
        return make_ready_future<>();
    }

    return read_toc().then([this, &pc] {
        // We'll try to keep the main code path exception free, but if an exception does happen
        // we can try to regenerate the Summary.
        if (has_component(sstable::component_type::Summary)) {
            return read_simple<component_type::Summary>(_summary, pc).handle_exception([this, &pc] (auto ep) {
                // "{}" placeholders, as in every other sstlog call in this file;
                // the earlier "%s" form does not match that convention.
                sstlog.warn("Couldn't read summary file {}: {}. Recreating it.", this->filename(component_type::Summary), ep);
                return this->generate_summary(pc);
            });
        } else {
            return generate_summary(pc);
        }
    });
}
|
||||
|
||||
future<> sstable::open_data() {
|
||||
return when_all(open_file_dma(filename(component_type::Index), open_flags::ro),
|
||||
open_file_dma(filename(component_type::Data), open_flags::ro)).then([this] (auto files) {
|
||||
@@ -964,8 +997,10 @@ future<> sstable::open_data() {
|
||||
|
||||
future<> sstable::create_data() {
|
||||
auto oflags = open_flags::wo | open_flags::create | open_flags::exclusive;
|
||||
file_open_options opt;
|
||||
opt.extent_allocation_size_hint = 32 << 20;
|
||||
return when_all(new_sstable_component_file(filename(component_type::Index), oflags),
|
||||
new_sstable_component_file(filename(component_type::Data), oflags)).then([this] (auto files) {
|
||||
new_sstable_component_file(filename(component_type::Data), oflags, opt)).then([this] (auto files) {
|
||||
// FIXME: If both files could not be created, the first get below will
|
||||
// throw an exception, and second get() will not be attempted, and
|
||||
// we'll get a warning about the second future being destructed
|
||||
@@ -1202,10 +1237,9 @@ static void write_index_entry(file_writer& out, disk_string_view<uint16_t>& key,
|
||||
write(out, key, pos, promoted_index_size);
|
||||
}
|
||||
|
||||
static void prepare_summary(summary& s, uint64_t expected_partition_count, const schema& schema) {
|
||||
static void prepare_summary(summary& s, uint64_t expected_partition_count, uint32_t min_index_interval) {
|
||||
assert(expected_partition_count >= 1);
|
||||
|
||||
auto min_index_interval = schema.min_index_interval();
|
||||
s.header.min_index_interval = min_index_interval;
|
||||
s.header.sampling_level = downsampling::BASE_SAMPLING_LEVEL;
|
||||
uint64_t max_expected_entries =
|
||||
@@ -1222,8 +1256,7 @@ static void prepare_summary(summary& s, uint64_t expected_partition_count, const
|
||||
|
||||
static void seal_summary(summary& s,
|
||||
std::experimental::optional<key>&& first_key,
|
||||
std::experimental::optional<key>&& last_key,
|
||||
const schema& schema) {
|
||||
std::experimental::optional<key>&& last_key) {
|
||||
s.header.size = s.entries.size();
|
||||
s.header.size_at_full_sampling = s.header.size;
|
||||
|
||||
@@ -1312,7 +1345,7 @@ void sstable::do_write_components(::mutation_reader mr,
|
||||
auto filter_fp_chance = schema->bloom_filter_fp_chance();
|
||||
_filter = utils::i_filter::get_filter(estimated_partitions, filter_fp_chance);
|
||||
|
||||
prepare_summary(_summary, estimated_partitions, *schema);
|
||||
prepare_summary(_summary, estimated_partitions, schema->min_index_interval());
|
||||
|
||||
// FIXME: we may need to set repaired_at stats at this point.
|
||||
|
||||
@@ -1392,7 +1425,7 @@ void sstable::do_write_components(::mutation_reader mr,
|
||||
}
|
||||
|
||||
}
|
||||
seal_summary(_summary, std::move(first_key), std::move(last_key), *schema);
|
||||
seal_summary(_summary, std::move(first_key), std::move(last_key));
|
||||
|
||||
index->close().get();
|
||||
_index_file = file(); // index->close() closed _index_file
|
||||
@@ -1465,6 +1498,60 @@ future<> sstable::write_components(::mutation_reader mr,
|
||||
});
|
||||
}
|
||||
|
||||
// Rebuilds _summary by streaming through the Index component.
//
// Does nothing if the summary is already present. Otherwise the Index file is
// opened read-only, every index entry is fed to a local consumer that adds
// summary entries and records the first and last key, the summary is sealed,
// and the file is closed.
//
// `pc` is captured by reference and must outlive the returned future.
future<> sstable::generate_summary(const io_priority_class& pc) {
    if (_summary) {
        return make_ready_future<>();
    }

    sstlog.info("Summary file {} not found. Generating Summary...", filename(sstable::component_type::Summary));

    // Index consumer that feeds each entry into the summary under construction.
    class summary_generator {
        summary& _summary;
    public:
        std::experimental::optional<key> first_key, last_key;

        summary_generator(summary& s) : _summary(s) {}
        bool should_continue() {
            return true;
        }
        void consume_entry(index_entry&& ie) {
            maybe_add_summary_entry(_summary, ie.get_key_bytes(), ie.position());
            // The first entry seen fills first_key; every later one overwrites last_key.
            auto& slot = first_key ? last_key : first_key;
            slot = key(to_bytes(ie.get_key_bytes()));
        }
    };

    return open_file_dma(filename(component_type::Index), open_flags::ro).then([this, &pc] (file opened) {
        return do_with(std::move(opened), [this, &pc] (file idx) {
            return idx.size().then([this, &pc, idx] (auto idx_bytes) {
                // an upper bound. Surely to be less than this.
                auto estimated_partitions = idx_bytes / sizeof(uint64_t);
                // No summary means no stored min_index_interval: use a default
                // (0x80) and resample later if needed.
                prepare_summary(_summary, estimated_partitions, 0x80);

                file_input_stream_options opts;
                opts.buffer_size = sstable_buffer_size;
                opts.io_priority_class = pc;
                auto stream = make_file_input_stream(idx, 0, idx_bytes, std::move(opts));
                return do_with(summary_generator(_summary), [this, &pc, stream = std::move(stream), idx_bytes] (summary_generator& gen) mutable {
                    auto ctx = make_lw_shared<index_consume_entry_context<summary_generator>>(gen, std::move(stream), idx_bytes);
                    return ctx->consume_input(*ctx).then([this, ctx, &gen] {
                        seal_summary(_summary, std::move(gen.first_key), std::move(gen.last_key));
                    });
                });
            }).then([idx] () mutable {
                // Close failures are logged and still propagated to the caller.
                return idx.close().handle_exception([] (auto ep) {
                    sstlog.warn("sstable close index_file failed: {}", ep);
                    return make_exception_future<>(std::move(ep));
                });
            });
        });
    });
}
|
||||
|
||||
uint64_t sstable::data_size() const {
|
||||
if (has_component(sstable::component_type::CompressionInfo)) {
|
||||
return _compression.data_len;
|
||||
@@ -1730,7 +1817,7 @@ sstable::~sstable() {
|
||||
// clean up unused sstables, and because we'll never reuse the same
|
||||
// generation number anyway.
|
||||
try {
|
||||
shared_remove_by_toc_name(filename(component_type::TOC), _shared).handle_exception(
|
||||
delete_atomically({sstable_to_delete(filename(component_type::TOC), _shared)}).handle_exception(
|
||||
[op = background_jobs().start()] (std::exception_ptr eptr) {
|
||||
sstlog.warn("Exception when deleting sstable file: {}", eptr);
|
||||
});
|
||||
@@ -1746,26 +1833,6 @@ dirname(sstring fname) {
|
||||
return boost::filesystem::canonical(std::string(fname)).parent_path().string();
|
||||
}
|
||||
|
||||
// Removes the sstable identified by `toc_name`.
//
// An unshared sstable is removed right away. For a shared one, the request is
// sent to the shard chosen by hashing the TOC name; that shard records the
// calling shard and removes the files only once smp::count distinct shards
// have asked for it.
future<>
sstable::shared_remove_by_toc_name(sstring toc_name, bool shared) {
    if (!shared) {
        return remove_by_toc_name(toc_name);
    }

    auto shard = std::hash<sstring>()(toc_name) % smp::count;
    return smp::submit_to(shard, [toc_name, src_shard = engine().cpu_id()] {
        auto& agreeing = _shards_agreeing_to_remove_sstable[toc_name];
        agreeing.insert(src_shard);
        if (agreeing.size() != smp::count) {
            // still waiting for the remaining shards
            return make_ready_future<>();
        }
        _shards_agreeing_to_remove_sstable.erase(toc_name);
        return remove_by_toc_name(toc_name);
    });
}
|
||||
|
||||
future<>
|
||||
fsync_directory(sstring fname) {
|
||||
return open_directory(dirname(fname)).then([] (file f) {
|
||||
@@ -1778,16 +1845,23 @@ fsync_directory(sstring fname) {
|
||||
future<>
|
||||
remove_by_toc_name(sstring sstable_toc_name) {
|
||||
return seastar::async([sstable_toc_name] {
|
||||
auto dir = dirname(sstable_toc_name);
|
||||
auto toc_file = open_file_dma(sstable_toc_name, open_flags::ro).get0();
|
||||
sstring prefix = sstable_toc_name.substr(0, sstable_toc_name.size() - TOC_SUFFIX.size());
|
||||
auto new_toc_name = prefix + TEMPORARY_TOC_SUFFIX;
|
||||
sstring dir;
|
||||
|
||||
if (file_exists(sstable_toc_name).get0()) {
|
||||
dir = dirname(sstable_toc_name);
|
||||
rename_file(sstable_toc_name, new_toc_name).get();
|
||||
fsync_directory(dir).get();
|
||||
} else {
|
||||
dir = dirname(new_toc_name);
|
||||
}
|
||||
|
||||
auto toc_file = open_file_dma(new_toc_name, open_flags::ro).get0();
|
||||
auto in = make_file_input_stream(toc_file);
|
||||
auto size = toc_file.size().get0();
|
||||
auto text = in.read_exactly(size).get0();
|
||||
in.close().get();
|
||||
sstring prefix = sstable_toc_name.substr(0, sstable_toc_name.size() - TOC_SUFFIX.size());
|
||||
auto new_toc_name = prefix + TEMPORARY_TOC_SUFFIX;
|
||||
rename_file(sstable_toc_name, new_toc_name).get();
|
||||
fsync_directory(dir).get();
|
||||
std::vector<sstring> components;
|
||||
sstring all(text.begin(), text.end());
|
||||
boost::split(components, all, boost::is_any_of("\n"));
|
||||
@@ -1800,13 +1874,58 @@ remove_by_toc_name(sstring sstable_toc_name) {
|
||||
// already deleted
|
||||
return make_ready_future<>();
|
||||
}
|
||||
return remove_file(prefix + component);
|
||||
auto fname = prefix + component;
|
||||
return remove_file(prefix + component).then_wrapped([fname = std::move(fname)] (future<> f) {
|
||||
// forgive ENOENT, since the component may not have been written;
|
||||
try {
|
||||
f.get();
|
||||
} catch (std::system_error& e) {
|
||||
if (e.code() != std::error_code(ENOENT, std::system_category())) {
|
||||
throw;
|
||||
}
|
||||
sstlog.debug("Forgiving ENOENT when deleting file {}", fname);
|
||||
}
|
||||
return make_ready_future<>();
|
||||
});
|
||||
}).get();
|
||||
fsync_directory(dir).get();
|
||||
remove_file(new_toc_name).get();
|
||||
});
|
||||
}
|
||||
|
||||
// Marks the sstable for deletion and renames its TOC file to the temporary
// TOC name on disk, then syncs the containing directory.
//
// The rename runs on the shard chosen by hashing the TOC name. While a rename
// of the same TOC is already in progress on that shard, further requests
// return a ready future without waiting for it. A TOC that no longer exists is
// treated as already handled.
future<>
sstable::mark_for_deletion_on_disk() {
    mark_for_deletion();

    auto toc_name = filename(component_type::TOC);
    auto shard = std::hash<sstring>()(toc_name) % smp::count;

    return smp::submit_to(shard, [toc_name] {
        // Per-shard set of TOC names whose rename is currently in flight.
        static thread_local std::unordered_set<sstring> renaming;

        if (renaming.count(toc_name) > 0) {
            return make_ready_future<>();
        }

        renaming.emplace(toc_name);

        return seastar::async([toc_name] {
            if (!file_exists(toc_name).get0()) {
                return; // already gone
            }

            // The TOC used to be opened read-only here, but the handle was
            // never read from or closed; only the rename is needed.
            auto dir = dirname(toc_name);
            sstring prefix = toc_name.substr(0, toc_name.size() - TOC_SUFFIX.size());
            auto new_toc_name = prefix + TEMPORARY_TOC_SUFFIX;
            rename_file(toc_name, new_toc_name).get();
            fsync_directory(dir).get();
        }).finally([toc_name] {
            renaming.erase(toc_name);
        });
    });
}
|
||||
|
||||
future<>
|
||||
sstable::remove_sstable_with_temp_toc(sstring ks, sstring cf, sstring dir, int64_t generation, version_types v, format_types f) {
|
||||
return seastar::async([ks, cf, dir, generation, v, f] {
|
||||
@@ -1849,12 +1968,11 @@ sstable::remove_sstable_with_temp_toc(sstring ks, sstring cf, sstring dir, int64
|
||||
}
|
||||
|
||||
// Returns the range spanned by this sstable's first and last partition keys,
// loading the summary first if needed.
//
// This is the member-function form: it works on `this` rather than building a
// temporary sstable from keyspace/table/directory/generation arguments (the
// superseded form that was interleaved here). `this` and `s` are captured by
// reference and must stay alive until the returned future resolves.
future<range<partition_key>>
sstable::get_sstable_key_range(const schema& s) {
    return read_summary(default_priority_class()).then([this, &s] {
        auto first = get_first_partition_key(s);
        auto last = get_last_partition_key(s);
        return make_ready_future<range<partition_key>>(range<partition_key>::make(first, last));
    });
}
|
||||
@@ -1864,4 +1982,170 @@ void sstable::mark_sstable_for_deletion(sstring ks, sstring cf, sstring dir, int
|
||||
sst.mark_for_deletion();
|
||||
}
|
||||
|
||||
std::ostream&
|
||||
operator<<(std::ostream& os, const sstable_to_delete& std) {
|
||||
return os << std.name << "(" << (std.shared ? "shared" : "unshared") << ")";
|
||||
}
|
||||
|
||||
// Set of shards that have agreed to delete one particular sstable.
using shards_agreeing_to_delete_sstable_type = std::unordered_set<shard_id>;
|
||||
// Ordered set of sstable names that are to be deleted together.
using sstables_to_delete_atomically_type = std::set<sstring>;
|
||||
// One group of sstables awaiting deletion, plus the promises to complete when the group is done.
struct pending_deletion {
|
||||
// Names of the sstables in this group (do_delete_atomically() inserts sstable_to_delete::name values).
sstables_to_delete_atomically_type names;
|
||||
// One promise per request folded into this group; they are all completed together.
std::vector<lw_shared_ptr<promise<>>> completions;
|
||||
};
|
||||
|
||||
// Set to true by cancel_atomic_deletions(); do_delete_atomically() throws while it is set.
static thread_local bool g_atomic_deletions_cancelled = false;
|
||||
// Groups still waiting for deletion; rebuilt by do_delete_atomically() and emptied by cancel_atomic_deletions().
static thread_local std::list<lw_shared_ptr<pending_deletion>> g_atomic_deletion_sets;
|
||||
// For each sstable name, the shards that have agreed to its deletion so far.
static thread_local std::unordered_map<sstring, shards_agreeing_to_delete_sstable_type> g_shards_agreeing_to_delete_sstable;
|
||||
|
||||
// Logger used by the atomic-deletion code below.
static logging::logger deletion_logger("sstable-deletion");
|
||||
|
||||
static
|
||||
future<>
|
||||
do_delete_atomically(std::vector<sstable_to_delete> atomic_deletion_set, unsigned deleting_shard) {
|
||||
// runs on shard 0 only
|
||||
deletion_logger.debug("shard {} atomically deleting {}", deleting_shard, atomic_deletion_set);
|
||||
|
||||
if (g_atomic_deletions_cancelled) {
|
||||
deletion_logger.debug("atomic deletions disabled, erroring out");
|
||||
throw std::runtime_error(sprint("atomic deletions disabled; not deleting %s", atomic_deletion_set));
|
||||
}
|
||||
|
||||
// Insert atomic_deletion_set into the list of sets pending deletion. If the new set
|
||||
// overlaps with an existing set, merge them (the merged set will be deleted atomically).
|
||||
std::list<lw_shared_ptr<pending_deletion>> new_atomic_deletion_sets;
|
||||
auto merged_set = make_lw_shared(pending_deletion());
|
||||
for (auto&& sst_to_delete : atomic_deletion_set) {
|
||||
merged_set->names.insert(sst_to_delete.name);
|
||||
if (!sst_to_delete.shared) {
|
||||
for (auto shard : boost::irange<shard_id>(0, smp::count)) {
|
||||
g_shards_agreeing_to_delete_sstable[sst_to_delete.name].insert(shard);
|
||||
}
|
||||
}
|
||||
}
|
||||
merged_set->completions.push_back(make_lw_shared<promise<>>());
|
||||
auto ret = merged_set->completions.back()->get_future();
|
||||
for (auto&& old_set : g_atomic_deletion_sets) {
|
||||
auto intersection = sstables_to_delete_atomically_type();
|
||||
boost::set_intersection(merged_set->names, old_set->names, std::inserter(intersection, intersection.end()));
|
||||
if (intersection.empty()) {
|
||||
// We copy old_set to avoid corrupting g_atomic_deletion_sets if we fail
|
||||
// further on.
|
||||
new_atomic_deletion_sets.push_back(old_set);
|
||||
} else {
|
||||
deletion_logger.debug("merging with {}", old_set->names);
|
||||
merged_set->names.insert(old_set->names.begin(), old_set->names.end());
|
||||
boost::push_back(merged_set->completions, old_set->completions);
|
||||
}
|
||||
}
|
||||
deletion_logger.debug("new atomic set: {}", merged_set->names);
|
||||
new_atomic_deletion_sets.push_back(merged_set);
|
||||
// can now exception-safely commit:
|
||||
g_atomic_deletion_sets = std::move(new_atomic_deletion_sets);
|
||||
|
||||
// Mark each sstable as being deleted from deleting_shard. We have to do
|
||||
// this in a separate pass, so the consideration whether we can delete or not
|
||||
// sees all the data from this pass.
|
||||
for (auto&& sst : atomic_deletion_set) {
|
||||
g_shards_agreeing_to_delete_sstable[sst.name].insert(deleting_shard);
|
||||
}
|
||||
|
||||
// Figure out if the (possibly merged) set can be deleted
|
||||
for (auto&& sst : merged_set->names) {
|
||||
if (g_shards_agreeing_to_delete_sstable[sst].size() != smp::count) {
|
||||
// Not everyone agrees, leave the set pending
|
||||
deletion_logger.debug("deferring deletion until all shards agree");
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
|
||||
// Cannot recover from a failed deletion
|
||||
g_atomic_deletion_sets.pop_back();
|
||||
for (auto&& name : merged_set->names) {
|
||||
g_shards_agreeing_to_delete_sstable.erase(name);
|
||||
}
|
||||
|
||||
// Everyone agrees, let's delete
|
||||
// FIXME: this needs to be done atomically (using a log file of sstables we intend to delete)
|
||||
parallel_for_each(merged_set->names, [] (sstring name) {
|
||||
deletion_logger.debug("deleting {}", name);
|
||||
return remove_by_toc_name(name);
|
||||
}).then_wrapped([merged_set] (future<> result) {
|
||||
deletion_logger.debug("atomic deletion completed: {}", merged_set->names);
|
||||
shared_future<> sf(std::move(result));
|
||||
for (auto&& comp : merged_set->completions) {
|
||||
sf.get_future().forward_to(std::move(*comp));
|
||||
}
|
||||
});
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
// Bookkeeping for atomic-deletion requests issued through delete_atomically().
struct pending_shard_deletes {
|
||||
// Promise for each outstanding request, keyed by the id taken from idgen.
std::unordered_map<int, promise<>> pending_deletes;
|
||||
// Next request id; post-incremented for every delete_atomically() call.
int idgen = 0;
|
||||
// Forwards the set to shard 0 and returns a future that acknowledge() completes.
future<> delete_atomically(std::vector<sstable_to_delete> atomic_deletion_set);
|
||||
// Completes request `id`: fails it with `ex` if `ex` is set, otherwise resolves it.
void acknowledge(int id, std::exception_ptr ex);
|
||||
};
|
||||
|
||||
// One instance per thread (thread_local).
static thread_local pending_shard_deletes this_shard_deletes;
|
||||
|
||||
// Registers a new request, forwards it to shard 0 and returns a future that
// acknowledge() completes once shard 0 reports the outcome back.
future<>
pending_shard_deletes::delete_atomically(std::vector<sstable_to_delete> atomic_deletion_set) {
    auto slot = pending_deletes.emplace(idgen++, promise<>()).first;
    auto id = slot->first;
    auto completion = slot->second.get_future();
    auto origin_shard = engine().cpu_id();
    // The future returned by submit_to() is not waited on; the caller only
    // waits on `completion`.
    smp::submit_to(0, [atomic_deletion_set, origin_shard, id] {
        // futurize<>::apply turns an exception thrown by do_delete_atomically() into a failed future.
        futurize<void>::apply(do_delete_atomically, atomic_deletion_set, origin_shard).then_wrapped([origin_shard, id] (future<> outcome) {
            std::exception_ptr error = outcome.failed() ? outcome.get_exception() : std::exception_ptr();
            // Report back on the shard that owns the promise.
            return smp::submit_to(origin_shard, [id, error] () mutable {
                this_shard_deletes.acknowledge(id, error);
            });
        });
    });
    return completion;
}
|
||||
|
||||
void
|
||||
pending_shard_deletes::acknowledge(int idx, std::exception_ptr ex) {
|
||||
auto i = pending_deletes.find(idx);
|
||||
auto& pr = i->second;
|
||||
if (ex) {
|
||||
pr.set_exception(ex);
|
||||
} else {
|
||||
pr.set_value();
|
||||
}
|
||||
pending_deletes.erase(i);
|
||||
}
|
||||
|
||||
// Queues `ssts` for atomic deletion through the calling thread's pending_shard_deletes instance.
future<>
|
||||
delete_atomically(std::vector<sstable_to_delete> ssts) {
|
||||
return this_shard_deletes.delete_atomically(std::move(ssts));
|
||||
}
|
||||
|
||||
// Convenience overload: describes each sstable by its TOC file name and its
// shared flag, then queues the whole set for atomic deletion.
future<>
delete_atomically(std::vector<shared_sstable> ssts) {
    std::vector<sstable_to_delete> sstables_to_delete_atomically;
    // The final size is known up front, so allocate once.
    sstables_to_delete_atomically.reserve(ssts.size());
    for (auto&& sst : ssts) {
        sstables_to_delete_atomically.emplace_back(sst->toc_filename(), sst->is_shared());
    }
    return delete_atomically(std::move(sstables_to_delete_atomically));
}
|
||||
|
||||
void
|
||||
cancel_atomic_deletions() {
|
||||
g_atomic_deletions_cancelled = true;
|
||||
for (auto&& pd : g_atomic_deletion_sets) {
|
||||
for (auto&& c : pd->completions) {
|
||||
c->set_exception(std::runtime_error(sprint("Atomic sstable deletions cancelled; not deleting %s", pd->names)));
|
||||
}
|
||||
}
|
||||
g_atomic_deletion_sets.clear();
|
||||
g_shards_agreeing_to_delete_sstable.clear();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -245,6 +245,8 @@ public:
|
||||
_marked_for_deletion = true;
|
||||
}
|
||||
|
||||
future<> mark_for_deletion_on_disk();
|
||||
|
||||
bool marked_for_deletion() const {
|
||||
return _marked_for_deletion;
|
||||
}
|
||||
@@ -339,11 +341,9 @@ private:
|
||||
void prepare_write_components(::mutation_reader mr,
|
||||
uint64_t estimated_partitions, schema_ptr schema, uint64_t max_sstable_size,
|
||||
const io_priority_class& pc);
|
||||
static future<> shared_remove_by_toc_name(sstring toc_name, bool shared);
|
||||
static std::unordered_map<version_types, sstring, enum_hash<version_types>> _version_string;
|
||||
static std::unordered_map<format_types, sstring, enum_hash<format_types>> _format_string;
|
||||
static std::unordered_map<component_type, sstring, enum_hash<component_type>> _component_map;
|
||||
static thread_local std::unordered_map<sstring, std::unordered_set<unsigned>> _shards_agreeing_to_remove_sstable;
|
||||
|
||||
std::unordered_set<component_type, enum_hash<component_type>> _components;
|
||||
|
||||
@@ -397,13 +397,16 @@ private:
|
||||
|
||||
void write_filter(const io_priority_class& pc);
|
||||
|
||||
future<> read_summary(const io_priority_class& pc) {
|
||||
return read_simple<component_type::Summary>(_summary, pc);
|
||||
}
|
||||
future<> read_summary(const io_priority_class& pc);
|
||||
|
||||
void write_summary(const io_priority_class& pc) {
|
||||
write_simple<component_type::Summary>(_summary, pc);
|
||||
}
|
||||
|
||||
// To be called when we try to load an SSTable that lacks a Summary. Could
|
||||
// happen if old tools are being used.
|
||||
future<> generate_summary(const io_priority_class& pc);
|
||||
|
||||
future<> read_statistics(const io_priority_class& pc);
|
||||
void write_statistics(const io_priority_class& pc);
|
||||
|
||||
@@ -533,8 +536,8 @@ public:
|
||||
}
|
||||
|
||||
// Return sstable key range as range<partition_key> reading only the summary component.
|
||||
static future<range<partition_key>>
|
||||
get_sstable_key_range(const schema& s, sstring ks, sstring cf, sstring dir, int64_t generation, version_types v, format_types f);
|
||||
future<range<partition_key>>
|
||||
get_sstable_key_range(const schema& s);
|
||||
|
||||
// Used to mark a sstable for deletion that is not relevant to the current shard.
|
||||
// It doesn't mean that the sstable will be deleted, but that the sstable is not
|
||||
@@ -581,4 +584,31 @@ future<> await_background_jobs();
|
||||
// Invokes await_background_jobs() on all shards
|
||||
future<> await_background_jobs_on_all_shards();
|
||||
|
||||
// Describes one sstable handed to delete_atomically().
struct sstable_to_delete {
|
||||
sstable_to_delete(sstring name, bool shared) : name(std::move(name)), shared(shared) {}
|
||||
// Identifies the sstable; the shared_sstable overload of delete_atomically() fills it from toc_filename().
sstring name;
|
||||
// Filled from is_shared() by that overload; unshared sstables are recorded as agreed to by every shard.
bool shared = false;
|
||||
// Prints "<name>(shared)" or "<name>(unshared)".
friend std::ostream& operator<<(std::ostream& os, const sstable_to_delete& std);
|
||||
};
|
||||
|
||||
|
||||
// When we compact sstables, we have to atomically instantiate the new
|
||||
// sstable and delete the old ones. Otherwise, if we compact A+B into C,
|
||||
// and if A contained some data that was tombstoned by B, and if B was
|
||||
// deleted but A survived, then data from A will be resurrected.
|
||||
//
|
||||
// There are two violators of the requirement to atomically delete
|
||||
// sstables: first sstable instantiation and deletion on disk is atomic
|
||||
// only wrt. itself, not other sstables, and second when an sstable is
|
||||
// shared among shards, so actual on-disk deletion of an sstable is deferred
|
||||
// until all shards agree it can be deleted.
|
||||
//
|
||||
// This function only solves the second problem for now.
|
||||
future<> delete_atomically(std::vector<shared_sstable> ssts);
|
||||
future<> delete_atomically(std::vector<sstable_to_delete> ssts);
|
||||
|
||||
// Cancel any deletions scheduled by delete_atomically() and make their
|
||||
// futures complete
|
||||
void cancel_atomic_deletions();
|
||||
|
||||
}
|
||||
|
||||
@@ -144,6 +144,10 @@ struct summary_ka {
|
||||
uint64_t memory_footprint() const {
|
||||
return sizeof(summary_entry) * entries.size() + sizeof(uint32_t) * positions.size() + sizeof(*this);
|
||||
}
|
||||
|
||||
// True when the summary holds at least one entry.
explicit operator bool() const {
    return entries.size() != 0;
}
|
||||
};
|
||||
using summary = summary_ka;
|
||||
|
||||
@@ -262,6 +266,13 @@ struct deletion_time {
|
||||
(marked_for_delete_at == std::numeric_limits<int64_t>::min());
|
||||
}
|
||||
|
||||
bool operator==(const deletion_time& d) {
|
||||
return local_deletion_time == d.local_deletion_time &&
|
||||
marked_for_delete_at == d.marked_for_delete_at;
|
||||
}
|
||||
bool operator!=(const deletion_time& d) {
|
||||
return !(*this == d);
|
||||
}
|
||||
// Builds a tombstone from marked_for_delete_at (passed as its first argument)
// and local_deletion_time (wrapped in a gc_clock duration and time_point).
// Only reads members, hence const.
explicit operator tombstone() const {
    return tombstone(marked_for_delete_at, gc_clock::time_point(gc_clock::duration(local_deletion_time)));
}
|
||||
|
||||
@@ -103,8 +103,6 @@ void stream_session::init_messaging_service_handler() {
|
||||
auto session = get_session(plan_id, from, "PREPARE_MESSAGE");
|
||||
session->init(sr);
|
||||
session->dst_cpu_id = src_cpu_id;
|
||||
sslog.debug("[Stream #{}] GOT PREPARE_MESSAGE from {}: get session peer={}, dst_cpu_id={}",
|
||||
session->plan_id(), from, session->peer, session->dst_cpu_id);
|
||||
return session->prepare(std::move(msg.requests), std::move(msg.summaries));
|
||||
});
|
||||
});
|
||||
@@ -123,13 +121,27 @@ void stream_session::init_messaging_service_handler() {
|
||||
get_local_stream_manager().update_progress(plan_id, from.addr, progress_info::direction::IN, fm_size);
|
||||
return service::get_schema_for_write(fm.schema_version(), from).then([plan_id, from, &fm] (schema_ptr s) {
|
||||
auto cf_id = fm.column_family_id();
|
||||
sslog.debug("[Stream #{}] GOT STREAM_MUTATION from {}: cf_id={}", plan_id, from.addr, cf_id);
|
||||
|
||||
auto& db = service::get_local_storage_proxy().get_db().local();
|
||||
if (!db.column_family_exists(cf_id)) {
|
||||
sslog.debug("[Stream #{}] STREAM_MUTATION from {}: cf_id={} is missing, assume the table is dropped",
|
||||
sslog.warn("[Stream #{}] STREAM_MUTATION from {}: cf_id={} is missing, assume the table is dropped",
|
||||
plan_id, from.addr, cf_id);
|
||||
return make_ready_future<>();
|
||||
}
|
||||
return service::get_storage_proxy().local().mutate_locally(std::move(s), fm);
|
||||
return service::get_storage_proxy().local().mutate_streaming_mutation(std::move(s), fm).then_wrapped([plan_id, cf_id, from] (auto&& f) {
|
||||
try {
|
||||
f.get();
|
||||
return make_ready_future<>();
|
||||
} catch (no_such_column_family) {
|
||||
sslog.warn("[Stream #{}] STREAM_MUTATION from {}: cf_id={} is missing, assume the table is dropped",
|
||||
plan_id, from.addr, cf_id);
|
||||
return make_ready_future<>();
|
||||
} catch (...) {
|
||||
throw;
|
||||
}
|
||||
return make_ready_future<>();
|
||||
});
|
||||
});
|
||||
});
|
||||
});
|
||||
@@ -137,18 +149,29 @@ void stream_session::init_messaging_service_handler() {
|
||||
const auto& from = cinfo.retrieve_auxiliary<gms::inet_address>("baddr");
|
||||
return smp::submit_to(dst_cpu_id, [ranges = std::move(ranges), plan_id, cf_id, from] () mutable {
|
||||
auto session = get_session(plan_id, from, "STREAM_MUTATION_DONE", cf_id);
|
||||
session->receive_task_completed(cf_id);
|
||||
return session->get_db().invoke_on_all([ranges = std::move(ranges), plan_id, from, cf_id] (database& db) {
|
||||
if (!db.column_family_exists(cf_id)) {
|
||||
sslog.debug("[Stream #{}] STREAM_MUTATION_DONE from {}: cf_id={} is missing, assume the table is dropped",
|
||||
sslog.warn("[Stream #{}] STREAM_MUTATION_DONE from {}: cf_id={} is missing, assume the table is dropped",
|
||||
plan_id, from, cf_id);
|
||||
return make_ready_future<>();
|
||||
}
|
||||
auto& cf = db.find_column_family(cf_id);
|
||||
for (auto& range : ranges) {
|
||||
cf.get_row_cache().invalidate(query::to_partition_range(range));
|
||||
std::vector<query::partition_range> query_ranges;
|
||||
try {
|
||||
auto& cf = db.find_column_family(cf_id);
|
||||
query_ranges.reserve(ranges.size());
|
||||
for (auto& range : ranges) {
|
||||
query_ranges.push_back(query::to_partition_range(range));
|
||||
}
|
||||
return cf.flush_streaming_mutations(std::move(query_ranges));
|
||||
} catch (no_such_column_family) {
|
||||
sslog.warn("[Stream #{}] STREAM_MUTATION_DONE from {}: cf_id={} is missing, assume the table is dropped",
|
||||
plan_id, from, cf_id);
|
||||
return make_ready_future<>();
|
||||
} catch (...) {
|
||||
throw;
|
||||
}
|
||||
return make_ready_future<>();
|
||||
}).then([session, cf_id] {
|
||||
session->receive_task_completed(cf_id);
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
@@ -109,7 +109,7 @@ future<stop_iteration> do_send_mutations(auto si, auto fm) {
|
||||
|
||||
future<> send_mutations(auto si) {
|
||||
auto& cf = si->db.find_column_family(si->cf_id);
|
||||
auto& priority = service::get_local_mutation_stream_priority();
|
||||
auto& priority = service::get_local_streaming_read_priority();
|
||||
return do_with(cf.make_reader(cf.schema(), si->pr, priority), [si] (auto& reader) {
|
||||
return repeat([si, &reader] () {
|
||||
return reader().then([si] (auto mopt) {
|
||||
|
||||
@@ -2098,6 +2098,24 @@ SEASTAR_TEST_CASE(test_alter_table) {
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
// Inserts a single-entry map, checks that it reads back, deletes that entry by
// key and checks that the row then reads back with an empty cell.
SEASTAR_TEST_CASE(test_map_query) {
    return do_with_cql_env([] (auto& e) {
        return seastar::async([&e] {
            // The same query is issued before and after the delete.
            const auto select_m = "select m from xx where k = 0;";

            e.execute_cql("CREATE TABLE xx (k int PRIMARY KEY, m map<text, int>);").get();
            e.execute_cql("insert into xx (k, m) values (0, {'v2': 1});").get();

            auto m_type = map_type_impl::get_instance(utf8_type, int32_type, true);
            assert_that(e.execute_cql(select_m).get0())
                .is_rows().with_rows({
                    { make_map_value(m_type, map_type_impl::native_type({{sstring("v2"), 1}})).serialize() }
                });

            e.execute_cql("delete m['v2'] from xx where k = 0;").get();
            assert_that(e.execute_cql(select_m).get0())
                .is_rows().with_rows({{{}}});
        });
    });
}
|
||||
|
||||
SEASTAR_TEST_CASE(test_drop_table) {
|
||||
return do_with_cql_env([] (auto& e) {
|
||||
return seastar::async([&e] {
|
||||
@@ -2109,6 +2127,40 @@ SEASTAR_TEST_CASE(test_drop_table) {
|
||||
});
|
||||
}
|
||||
|
||||
// Fills one partition with 16 clustering rows (b = 0..15) and checks that a
// reversed, limited slice selecting b < 0 is empty, while the reversed
// partition read and the full scan both return all 16 rows.
SEASTAR_TEST_CASE(test_reversed_slice_with_empty_range_before_all_rows) {
    return do_with_cql_env([] (auto& e) {
        return seastar::async([&e] {
            e.execute_cql("CREATE TABLE test (a int, b int, c int, s1 int static, s2 int static, PRIMARY KEY (a, b));").get();

            // Executed one after another, in this order.
            static const char* const inserts[] = {
                "INSERT INTO test (a, b, c, s1, s2) VALUES (99, 0, 0, 17, 42);",
                "INSERT INTO test (a, b, c, s1, s2) VALUES (99, 1, 1, 17, 42);",
                "INSERT INTO test (a, b, c, s1, s2) VALUES (99, 2, 2, 17, 42);",
                "INSERT INTO test (a, b, c, s1, s2) VALUES (99, 3, 3, 17, 42);",
                "INSERT INTO test (a, b, c, s1, s2) VALUES (99, 4, 4, 17, 42);",
                "INSERT INTO test (a, b, c, s1, s2) VALUES (99, 5, 5, 17, 42);",
                "INSERT INTO test (a, b, c, s1, s2) VALUES (99, 6, 6, 17, 42);",
                "INSERT INTO test (a, b, c, s1, s2) VALUES (99, 7, 7, 17, 42);",
                "INSERT INTO test (a, b, c, s1, s2) VALUES (99, 8, 8, 17, 42);",
                "INSERT INTO test (a, b, c, s1, s2) VALUES (99, 9, 9, 17, 42);",
                "INSERT INTO test (a, b, c, s1, s2) VALUES (99, 10, 10, 17, 42);",
                "INSERT INTO test (a, b, c, s1, s2) VALUES (99, 11, 11, 17, 42);",
                "INSERT INTO test (a, b, c, s1, s2) VALUES (99, 12, 12, 17, 42);",
                "INSERT INTO test (a, b, c, s1, s2) VALUES (99, 13, 13, 17, 42);",
                "INSERT INTO test (a, b, c, s1, s2) VALUES (99, 14, 14, 17, 42);",
                "INSERT INTO test (a, b, c, s1, s2) VALUES (99, 15, 15, 17, 42);",
            };
            for (auto statement : inserts) {
                e.execute_cql(statement).get();
            }

            assert_that(e.execute_cql("select * from test WHERE a = 99 and b < 0 ORDER BY b DESC limit 2;").get0())
                .is_rows().is_empty();

            assert_that(e.execute_cql("select * from test WHERE a = 99 order by b desc;").get0())
                .is_rows().with_size(16);

            assert_that(e.execute_cql("select * from test;").get0())
                .is_rows().with_size(16);
        });
    });
}
|
||||
|
||||
SEASTAR_TEST_CASE(test_alter_table_validation) {
|
||||
return do_with_cql_env([] (auto& e) {
|
||||
return e.execute_cql("create table tatv (p1 int, c1 int, c2 int, r1 int, r2 set<int>, PRIMARY KEY (p1, c1, c2));").discard_result().then_wrapped([&e] (auto f) {
|
||||
|
||||
@@ -333,7 +333,6 @@ public:
|
||||
|
||||
gms::get_gossiper().stop().get();
|
||||
gms::get_failure_detector().stop().get();
|
||||
net::get_messaging_service().stop().get();
|
||||
|
||||
_db->stop().get();
|
||||
|
||||
@@ -343,6 +342,8 @@ public:
|
||||
|
||||
sstables::await_background_jobs_on_all_shards().get();
|
||||
|
||||
net::get_messaging_service().stop().get();
|
||||
|
||||
bool old_active = true;
|
||||
assert(active.compare_exchange_strong(old_active, false));
|
||||
});
|
||||
|
||||
@@ -46,9 +46,9 @@ SEASTAR_TEST_CASE(test_boot_shutdown){
|
||||
gms::get_gossiper().start().get();
|
||||
gms::get_gossiper().stop().get();
|
||||
gms::get_failure_detector().stop().get();
|
||||
net::get_messaging_service().stop().get();
|
||||
db.stop().get();
|
||||
service::get_storage_service().stop().get();
|
||||
net::get_messaging_service().stop().get();
|
||||
locator::i_endpoint_snitch::stop_snitch().get();
|
||||
});
|
||||
}
|
||||
|
||||
@@ -108,7 +108,7 @@ SEASTAR_TEST_CASE(test_compaction_with_multiple_regions) {
|
||||
}
|
||||
});
|
||||
|
||||
size_t quarter = shard_tracker().occupancy().total_space() / 4;
|
||||
size_t quarter = shard_tracker().region_occupancy().total_space() / 4;
|
||||
|
||||
shard_tracker().reclaim_all_free_segments();
|
||||
|
||||
|
||||
@@ -298,6 +298,15 @@ static mutation_sets generate_mutation_sets() {
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
random_mutation_generator gen;
|
||||
for (int i = 0; i < 10; ++i) {
|
||||
auto m = gen();
|
||||
result.unequal.emplace_back(mutations{m, gen()}); // collision unlikely
|
||||
result.equal.emplace_back(mutations{m, m});
|
||||
}
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
@@ -341,3 +350,145 @@ void for_each_mutation(std::function<void(const mutation&)> callback) {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Returns a blob of `blob_size` pseudo-random bytes.
//
// The byte source is a default-seeded, thread-local engine, so blobs differ
// from call to call within a thread.
bytes make_blob(size_t blob_size) {
    // The standard only defines independent_bits_engine for the result types
    // unsigned short / int / long / long long, so 8 bits are drawn into an
    // unsigned short rather than instantiating the engine with uint8_t.
    static thread_local std::independent_bits_engine<std::default_random_engine, 8, unsigned short> random_bytes;
    bytes big_blob(bytes::initialized_later(), blob_size);
    for (auto&& b : big_blob) {
        b = random_bytes();
    }
    return big_blob;
}
|
||||
|
||||
// Implementation behind random_mutation_generator.
//
// Owns the schema, a pool of pre-generated blobs and the random engine, and
// builds one random mutation per call to operator()().
class random_mutation_generator::impl {
    friend class random_mutation_generator;
    const size_t _external_blob_size = 128; // Should be enough to force use of external bytes storage
    const column_id column_count = row::max_vector_size * 2;
    std::mt19937 _gen;
    schema_ptr _schema;
    std::vector<bytes> _blobs;

    // Picks one of three time points: the gc_clock epoch plus 0, 1 or 2 seconds.
    static gc_clock::time_point expiry_dist(auto& gen) {
        static thread_local std::uniform_int_distribution<int> dist(0, 2);
        return gc_clock::time_point() + std::chrono::seconds(dist(gen));
    }

public:
    // Builds a schema with one partition key column, two clustering key columns
    // and column_count regular plus column_count static columns.
    schema_ptr make_schema() {
        auto builder = schema_builder("ks", "cf")
                .with_column("pk", bytes_type, column_kind::partition_key)
                .with_column("ck1", bytes_type, column_kind::clustering_key)
                .with_column("ck2", bytes_type, column_kind::clustering_key);

        // Create enough columns so that row can overflow its vector storage
        for (column_id n = 0; n < column_count; ++n) {
            builder.with_column(to_bytes(sprint("v%d", n)), bytes_type, column_kind::regular_column);
            builder.with_column(to_bytes(sprint("s%d", n)), bytes_type, column_kind::static_column);
        }

        return builder.build();
    }

    // Prepares the schema and 1024 blobs, then seeds the engine from
    // std::random_device and reports the seed through BOOST_TEST_MESSAGE.
    impl() {
        _schema = make_schema();

        for (int n = 0; n < 1024; ++n) {
            _blobs.emplace_back(make_blob(_external_blob_size));
        }

        std::random_device entropy;
        // In case of errors, replace the seed with a fixed value to get a deterministic run.
        auto seed = entropy();
        BOOST_TEST_MESSAGE(sprint("Random seed: %s", seed));
        _gen = std::mt19937(seed);
    }

    // Generates one random mutation. Every mutation uses the same partition
    // key, built from the first blob.
    mutation operator()() {
        std::uniform_int_distribution<column_id> column_count_dist(1, column_count);
        std::uniform_int_distribution<column_id> column_id_dist(0, column_count - 1);
        std::uniform_int_distribution<size_t> value_blob_index_dist(0, 2);
        std::normal_distribution<> ck_index_dist(_blobs.size() / 2, 1.5);
        std::uniform_int_distribution<int> bool_dist(0, 1);

        std::uniform_int_distribution<api::timestamp_type> timestamp_dist(api::min_timestamp, api::min_timestamp + 2); // 3 values

        auto pk = partition_key::from_single_value(*_schema, _blobs[0]);
        mutation mut(pk, _schema);

        // Applies a random number of live or dead cells to randomly chosen
        // columns of the given kind.
        auto set_random_cells = [&] (row& target, column_kind kind) {
            auto how_many = column_count_dist(_gen);
            for (column_id n = 0; n < how_many; ++n) {
                // FIXME: generate expiring cells
                auto cell = bool_dist(_gen)
                    ? atomic_cell::make_live(timestamp_dist(_gen), _blobs[value_blob_index_dist(_gen)])
                    : atomic_cell::make_dead(timestamp_dist(_gen), expiry_dist(_gen));
                target.apply(_schema->column_at(kind, column_id_dist(_gen)), std::move(cell));
            }
        };

        auto random_tombstone = [&] {
            return tombstone(timestamp_dist(_gen), expiry_dist(_gen));
        };

        // Picks one of the four row_marker constructions below with equal probability.
        auto random_row_marker = [&] {
            static thread_local std::uniform_int_distribution<int> dist(0, 3);
            switch (dist(_gen)) {
                case 0: return row_marker();
                case 1: return row_marker(random_tombstone());
                case 2: return row_marker(timestamp_dist(_gen));
                case 3: return row_marker(timestamp_dist(_gen), std::chrono::seconds(1), expiry_dist(_gen));
                default: assert(0);
            }
        };

        if (bool_dist(_gen)) {
            mut.partition().apply(random_tombstone());
        }

        set_random_cells(mut.partition().static_row(), column_kind::static_column);

        // Blob index drawn from a narrow normal distribution around the middle
        // of the pool and clamped to the valid index range.
        auto random_blob = [&] {
            return _blobs[std::min(_blobs.size() - 1, static_cast<size_t>(std::max(0.0, ck_index_dist(_gen))))];
        };

        // Count drawn from a normal distribution around 32, clamped to [0, 100].
        auto row_count_dist = [&] (auto& gen) {
            static thread_local std::normal_distribution<> dist(32, 1.5);
            return static_cast<size_t>(std::min(100.0, std::max(0.0, dist(gen))));
        };

        size_t row_count = row_count_dist(_gen);
        for (size_t n = 0; n < row_count; ++n) {
            auto ck = clustering_key::from_exploded(*_schema, {random_blob(), random_blob()});
            deletable_row& entry = mut.partition().clustered_row(ck);
            set_random_cells(entry.cells(), column_kind::regular_column);
            entry.marker() = random_row_marker();
        }

        size_t range_tombstone_count = row_count_dist(_gen);
        for (size_t n = 0; n < range_tombstone_count; ++n) {
            // Single-component key: only the first clustering column is given.
            auto prefix = clustering_key::from_exploded(*_schema, {random_blob()});
            mut.partition().apply_row_tombstone(*_schema, prefix, random_tombstone());
        }
        return mut;
    }
};
|
||||
|
||||
// Defined out of line, after impl is complete, so that std::unique_ptr<impl> can destroy it.
random_mutation_generator::~random_mutation_generator() = default;
|
||||
|
||||
// Creates the generator together with its private implementation object.
random_mutation_generator::random_mutation_generator()
|
||||
: _impl(std::make_unique<random_mutation_generator::impl>())
|
||||
{ }
|
||||
|
||||
// Produces the next random mutation by delegating to the implementation object.
mutation random_mutation_generator::operator()() {
    return _impl->operator()();
}
|
||||
|
||||
// Returns the schema the implementation object builds its mutations against.
schema_ptr random_mutation_generator::schema() const {
|
||||
return _impl->_schema;
|
||||
}
|
||||
@@ -36,3 +36,15 @@ void for_each_mutation_pair(std::function<void(const mutation&, const mutation&,
|
||||
|
||||
// Calls the provided function on mutations. Is supposed to exercise as many differences as possible.
|
||||
void for_each_mutation(std::function<void(const mutation&)>);
|
||||
|
||||
// Generates mutations for tests; the implementation is hidden behind _impl.
class random_mutation_generator {
|
||||
class impl;
|
||||
std::unique_ptr<impl> _impl;
|
||||
public:
|
||||
random_mutation_generator();
|
||||
~random_mutation_generator();
|
||||
// Produces a mutation.
mutation operator()();
|
||||
// Returns a schema_ptr for this generator.
schema_ptr schema() const;
|
||||
};
|
||||
|
||||
// Returns a blob of blob_size bytes.
bytes make_blob(size_t blob_size);
|
||||
|
||||
@@ -25,6 +25,7 @@
|
||||
#include <boost/range/adaptor/transformed.hpp>
|
||||
#include <boost/range/algorithm/copy.hpp>
|
||||
#include <boost/range/algorithm_ext/push_back.hpp>
|
||||
#include "mutation_query.hh"
|
||||
#include "md5_hasher.hh"
|
||||
|
||||
#include "core/sstring.hh"
|
||||
@@ -270,6 +271,7 @@ SEASTAR_TEST_CASE(test_list_mutations) {
|
||||
}
|
||||
|
||||
SEASTAR_TEST_CASE(test_multiple_memtables_one_partition) {
|
||||
return seastar::async([] {
|
||||
auto s = make_lw_shared(schema({}, some_keyspace, some_column_family,
|
||||
{{"p1", utf8_type}}, {{"c1", int32_type}}, {{"r1", int32_type}}, {}, utf8_type));
|
||||
|
||||
@@ -280,7 +282,7 @@ SEASTAR_TEST_CASE(test_multiple_memtables_one_partition) {
|
||||
cfg.enable_incremental_backups = false;
|
||||
cfg.cf_stats = &*cf_stats;
|
||||
|
||||
return with_column_family(s, cfg, [s] (column_family& cf) {
|
||||
with_column_family(s, cfg, [s] (column_family& cf) {
|
||||
const column_definition& r1_col = *s->get_column_definition("r1");
|
||||
auto key = partition_key::from_exploded(*s, {to_bytes("key1")});
|
||||
|
||||
@@ -291,26 +293,30 @@ SEASTAR_TEST_CASE(test_multiple_memtables_one_partition) {
|
||||
cf.apply(std::move(m));
|
||||
return cf.flush();
|
||||
};
|
||||
return when_all(
|
||||
insert_row(1001, 2001),
|
||||
insert_row(1002, 2002),
|
||||
insert_row(1003, 2003)).discard_result().then([s, &r1_col, &cf, key] {
|
||||
insert_row(1001, 2001).get();
|
||||
insert_row(1002, 2002).get();
|
||||
insert_row(1003, 2003).get();
|
||||
{
|
||||
auto verify_row = [&] (int32_t c1, int32_t r1) {
|
||||
auto c_key = clustering_key::from_exploded(*s, {int32_type->decompose(c1)});
|
||||
return cf.find_row(cf.schema(), dht::global_partitioner().decorate_key(*s, key), std::move(c_key)).then([r1, r1_col] (auto r) {
|
||||
auto p_key = dht::global_partitioner().decorate_key(*s, key);
|
||||
auto r = cf.find_row(cf.schema(), p_key, c_key).get0();
|
||||
{
|
||||
BOOST_REQUIRE(r);
|
||||
auto i = r->find_cell(r1_col.id);
|
||||
BOOST_REQUIRE(i);
|
||||
auto cell = i->as_atomic_cell();
|
||||
BOOST_REQUIRE(cell.is_live());
|
||||
BOOST_REQUIRE(int32_type->equal(cell.value(), int32_type->decompose(r1)));
|
||||
});
|
||||
}
|
||||
};
|
||||
verify_row(1001, 2001);
|
||||
verify_row(1002, 2002);
|
||||
verify_row(1003, 2003);
|
||||
});
|
||||
}).then([cf_stats] {});
|
||||
}
|
||||
return make_ready_future<>();
|
||||
}).get();
|
||||
});
|
||||
}
|
||||
|
||||
SEASTAR_TEST_CASE(test_flush_in_the_middle_of_a_scan) {
|
||||
@@ -690,6 +696,165 @@ SEASTAR_TEST_CASE(test_row_counting) {
|
||||
});
|
||||
}
|
||||
|
||||
// Applying a mutation that carries a partition tombstone to an empty mutation
// of the same partition must carry the tombstone over.
SEASTAR_TEST_CASE(test_tombstone_apply) {
    auto table = schema_builder("ks", "cf")
            .with_column("pk", bytes_type, column_kind::partition_key)
            .with_column("v", bytes_type, column_kind::regular_column)
            .build();

    auto pk = partition_key::from_single_value(*table, "key1");

    // A fresh mutation starts with the default (empty) tombstone.
    mutation target(pk, table);
    BOOST_REQUIRE_EQUAL(target.partition().partition_tombstone(), tombstone());

    mutation source(pk, table);
    auto tomb = tombstone(api::new_timestamp(), gc_clock::now());
    source.partition().apply(tomb);
    BOOST_REQUIRE_EQUAL(source.partition().partition_tombstone(), tomb);

    target.apply(source);
    BOOST_REQUIRE_EQUAL(target.partition().partition_tombstone(), tomb);

    return make_ready_future<>();
}
|
||||
|
||||
// Applying a mutation whose clustered row carries a marker to an empty
// mutation must carry the marker over. Checked for a marker built from a
// timestamp only and for one built from a timestamp, a one-second duration and
// a time point.
SEASTAR_TEST_CASE(test_marker_apply) {
    auto table = schema_builder("ks", "cf")
            .with_column("pk", bytes_type, column_kind::partition_key)
            .with_column("ck", bytes_type, column_kind::clustering_key)
            .with_column("v", bytes_type, column_kind::regular_column)
            .build();

    auto pk = partition_key::from_single_value(*table, "pk1");
    auto ck = clustering_key::from_single_value(*table, "ck1");

    // Builds a mutation whose only content is the given marker on row `ck`.
    auto make_marked = [&] (row_marker wanted) {
        mutation marked(pk, table);
        marked.partition().clustered_row(ck).marker() = wanted;
        return marked;
    };

    {
        // Timestamp-only marker, applied from a named mutation.
        mutation target(pk, table);
        auto marker = row_marker(api::new_timestamp());
        auto source = make_marked(marker);
        target.apply(source);
        BOOST_REQUIRE_EQUAL(target.partition().clustered_row(ck).marker(), marker);
    }

    {
        // Three-argument marker, applied from a temporary mutation.
        mutation target(pk, table);
        auto marker = row_marker(api::new_timestamp(), std::chrono::seconds(1), gc_clock::now());
        target.apply(make_marked(marker));
        BOOST_REQUIRE_EQUAL(target.partition().clustered_row(ck).marker(), marker);
    }

    return make_ready_future<>();
}
|
||||
|
||||
class failure_injecting_allocation_strategy : public allocation_strategy {
|
||||
allocation_strategy& _delegate;
|
||||
uint64_t _alloc_count;
|
||||
uint64_t _fail_at = std::numeric_limits<uint64_t>::max();
|
||||
public:
|
||||
failure_injecting_allocation_strategy(allocation_strategy& delegate) : _delegate(delegate) {}
|
||||
|
||||
virtual void* alloc(migrate_fn mf, size_t size, size_t alignment) override {
|
||||
if (_alloc_count >= _fail_at) {
|
||||
stop_failing();
|
||||
throw std::bad_alloc();
|
||||
}
|
||||
++_alloc_count;
|
||||
return _delegate.alloc(mf, size, alignment);
|
||||
}
|
||||
|
||||
virtual void free(void* ptr) override {
|
||||
_delegate.free(ptr);
|
||||
}
|
||||
|
||||
// Counts allocation attempts which are not failed due to fail_at().
|
||||
uint64_t alloc_count() const {
|
||||
return _alloc_count;
|
||||
}
|
||||
|
||||
void fail_after(uint64_t count) {
|
||||
_fail_at = _alloc_count + count;
|
||||
}
|
||||
|
||||
void stop_failing() {
|
||||
_fail_at = std::numeric_limits<uint64_t>::max();
|
||||
}
|
||||
};
|
||||
|
||||
// Exercises mutation::apply() under injected allocation failures. For ten
// randomly generated mutations, an allocation failure is injected at offset
// 0, 1, 2, ... until apply() finally completes. Whenever apply() throws
// std::bad_alloc the target must be unchanged; once it succeeds the target
// must equal the result computed without any injected failure.
SEASTAR_TEST_CASE(test_apply_is_atomic_in_case_of_allocation_failures) {
    random_mutation_generator generator;

    failure_injecting_allocation_strategy injector(standard_allocator());
    with_allocator(injector, [&] {
        auto base = generator();

        BOOST_TEST_MESSAGE(sprint("Target: %s", base));

        for (int round = 0; round != 10; ++round) {
            auto incoming = generator();

            BOOST_TEST_MESSAGE(sprint("Second: %s", incoming));

            // Reference result, computed while no failure is armed.
            auto merged = base;
            merged.apply(incoming);

            BOOST_TEST_MESSAGE(sprint("Expected: %s", merged));

            // Test the apply(const mutation&) variant
            {
                // The same object is retried across failures, so every failed
                // attempt must leave it exactly equal to `base`.
                auto m = base;

                // Try to fail at every possible allocation point during apply()
                for (size_t fail_offset = 0; ; ++fail_offset) {
                    BOOST_TEST_MESSAGE(sprint("Failing allocation at %d", fail_offset));
                    injector.fail_after(fail_offset);
                    try {
                        m.apply(incoming);
                    } catch (const std::bad_alloc&) {
                        BOOST_TEST_MESSAGE("Checking that apply was reverted");
                        assert_that(m).is_equal_to(base);
                        continue;
                    }
                    // apply() got through, so we exhausted all allocation points
                    injector.stop_failing();
                    BOOST_TEST_MESSAGE("Checking that apply has expected result");
                    assert_that(m).is_equal_to(merged);
                    break;
                }
            }

            // Test the apply(mutation&&) variant
            {
                for (size_t fail_offset = 0; ; ++fail_offset) {
                    // Fresh copies are taken before arming the injector so
                    // that the copies themselves can never be failed.
                    auto incoming_copy = incoming;
                    auto m = base;
                    injector.fail_after(fail_offset);
                    try {
                        m.apply(std::move(incoming_copy));
                    } catch (const std::bad_alloc&) {
                        assert_that(m).is_equal_to(base);
                        // they should still commute
                        m.apply(incoming_copy);
                        assert_that(m).is_equal_to(merged);
                        continue;
                    }
                    // apply() got through, so we exhausted all allocation points
                    injector.stop_failing();
                    assert_that(m).is_equal_to(merged);
                    break;
                }
            }
        }
    });

    return make_ready_future<>();
}
|
||||
|
||||
SEASTAR_TEST_CASE(test_mutation_diff) {
|
||||
return seastar::async([] {
|
||||
auto my_set_type = set_type_impl::get_instance(int32_type, true);
|
||||
@@ -805,15 +970,6 @@ SEASTAR_TEST_CASE(test_large_blobs) {
|
||||
|
||||
auto mt = make_lw_shared<memtable>(s);
|
||||
|
||||
auto make_blob = [] (size_t blob_size) -> bytes {
|
||||
bytes big_blob(bytes::initialized_later(), blob_size);
|
||||
std::independent_bits_engine<std::default_random_engine, 8, uint8_t> random_bytes;
|
||||
for (auto&& b : big_blob) {
|
||||
b = random_bytes();
|
||||
}
|
||||
return big_blob;
|
||||
};
|
||||
|
||||
auto blob1 = make_blob(1234567);
|
||||
auto blob2 = make_blob(2345678);
|
||||
|
||||
@@ -884,6 +1040,55 @@ SEASTAR_TEST_CASE(test_mutation_hash) {
|
||||
});
|
||||
}
|
||||
|
||||
// Returns a copy of `m` whose partition has had compact_for_compaction() run
// on it with api::max_timestamp and the current gc_clock time. The argument
// itself is left untouched.
static mutation compacted(const mutation& m) {
    mutation copy(m);
    auto& part = copy.partition();
    part.compact_for_compaction(*copy.schema(), api::max_timestamp, gc_clock::now());
    return copy;
}
|
||||
|
||||
// Checks query digests over pairs of mutations sharing a schema version:
//  - mutations reported as equal must produce the same digest even when one
//    side has been compacted;
//  - mutations reported as different must produce the same digest once each
//    has had its difference from the other applied.
SEASTAR_TEST_CASE(test_query_digest) {
    return seastar::async([] {
        // Digest of a digest-only query over `m` using a default-built slice.
        auto digest_of = [] (const mutation& m) {
            auto slice = partition_slice_builder(*m.schema()).build();
            return *m.query(slice, query::result_request::only_digest).digest();
        };

        auto check_digests_equal = [&digest_of] (const mutation& lhs, const mutation& rhs) {
            auto lhs_digest = digest_of(lhs);
            auto rhs_digest = digest_of(rhs);
            if (lhs_digest != rhs_digest) {
                BOOST_FAIL(sprint("Digest should be the same for %s and %s", lhs, rhs));
            }
        };

        for_each_mutation_pair([&] (const mutation& m1, const mutation& m2, are_equal eq) {
            // Pairs with different schema versions are not compared here.
            if (m1.schema()->version() != m2.schema()->version()) {
                return;
            }

            if (eq) {
                check_digests_equal(compacted(m1), m2);
                check_digests_equal(m1, compacted(m2));
                return;
            }

            BOOST_TEST_MESSAGE("If not equal, they should become so after applying diffs mutually");

            schema_ptr s = m1.schema();

            // Copy of `base` with everything `other` has on top of it applied.
            auto patched_with = [&s] (const mutation& base, const mutation& other) {
                auto out = base;
                out.partition().apply(*out.schema(), other.partition().difference(s, base.partition()));
                return out;
            };

            auto m2_plus_m1 = patched_with(m2, m1);
            auto m1_plus_m2 = patched_with(m1, m2);

            check_digests_equal(m2_plus_m1, m1_plus_m2);
        });
    });
}
|
||||
|
||||
SEASTAR_TEST_CASE(test_mutation_upgrade_of_equal_mutations) {
|
||||
return seastar::async([] {
|
||||
for_each_mutation_pair([](auto&& m1, auto&& m2, are_equal eq) {
|
||||
@@ -995,6 +1200,95 @@ SEASTAR_TEST_CASE(test_mutation_upgrade) {
|
||||
});
|
||||
}
|
||||
|
||||
// Checks that cells written with an expiry point stop showing up in query
// results once the query time reaches that point, for both regular and
// static columns, and that a row with nothing live left is not returned.
SEASTAR_TEST_CASE(test_querying_expired_cells) {
    return seastar::async([] {
        auto s = schema_builder("ks", "cf")
                .with_column("pk", bytes_type, column_kind::partition_key)
                .with_column("ck", bytes_type, column_kind::clustering_key)
                .with_column("s1", bytes_type, column_kind::static_column)
                .with_column("s2", bytes_type, column_kind::static_column)
                .with_column("s3", bytes_type, column_kind::static_column)
                .with_column("v1", bytes_type)
                .with_column("v2", bytes_type)
                .with_column("v3", bytes_type)
                .build();

        auto pk = partition_key::from_singular(*s, data_value(bytes("key1")));
        auto ckey1 = clustering_key::from_singular(*s, data_value(bytes("A")));

        // Four query/expiry points spaced one second apart.
        auto ttl = std::chrono::seconds(1);
        auto t1 = gc_clock::now();
        auto t2 = t1 + std::chrono::seconds(1);
        auto t3 = t2 + std::chrono::seconds(1);
        auto t4 = t3 + std::chrono::seconds(1);

        auto v1 = data_value(bytes("1"));
        auto v2 = data_value(bytes("2"));
        auto v3 = data_value(bytes("3"));

        // Column definitions looked up once and reused below.
        const auto& col_v1 = *s->get_column_definition("v1");
        const auto& col_v2 = *s->get_column_definition("v2");
        const auto& col_v3 = *s->get_column_definition("v3");
        const auto& col_s1 = *s->get_column_definition("s1");
        const auto& col_s2 = *s->get_column_definition("s2");
        const auto& col_s3 = *s->get_column_definition("s3");

        // Live cell holding `value`, stamped with a fresh timestamp and
        // created with expiry point `expiry` and the common ttl.
        auto expiring_cell = [ttl] (const data_value& value, gc_clock::time_point expiry) {
            return atomic_cell::make_live(api::new_timestamp(), value.serialize(), expiry, ttl);
        };

        // Queries all regular and static columns of `m` as of time `t`.
        auto results_at_time = [s] (const mutation& m, gc_clock::time_point t) {
            auto slice = partition_slice_builder(*s)
                    .with_regular_column("v1")
                    .with_regular_column("v2")
                    .with_regular_column("v3")
                    .with_static_column("s1")
                    .with_static_column("s2")
                    .with_static_column("s3")
                    .without_clustering_key_columns()
                    .without_partition_key_columns()
                    .build();
            return query::result_set::from_raw_result(s, slice, m.query(slice, query::result_request::result_and_digest, t));
        };

        // Columns 1, 2 and 3 expire at t1, t2 and t3 respectively; each later
        // query time must see one fewer column of each kind.
        {
            mutation m(pk, s);
            m.set_clustered_cell(ckey1, col_v1, expiring_cell(v1, t1));
            m.set_clustered_cell(ckey1, col_v2, expiring_cell(v2, t2));
            m.set_clustered_cell(ckey1, col_v3, expiring_cell(v3, t3));
            m.set_static_cell(col_s1, expiring_cell(v1, t1));
            m.set_static_cell(col_s2, expiring_cell(v2, t2));
            m.set_static_cell(col_s3, expiring_cell(v3, t3));

            assert_that(results_at_time(m, t1))
                    .has_only(a_row()
                        .with_column("s1", v1)
                        .with_column("s2", v2)
                        .with_column("s3", v3)
                        .with_column("v1", v1)
                        .with_column("v2", v2)
                        .with_column("v3", v3)
                        .and_only_that());

            assert_that(results_at_time(m, t2))
                    .has_only(a_row()
                        .with_column("s2", v2)
                        .with_column("s3", v3)
                        .with_column("v2", v2)
                        .with_column("v3", v3)
                        .and_only_that());

            assert_that(results_at_time(m, t3))
                    .has_only(a_row()
                        .with_column("s3", v3)
                        .with_column("v3", v3)
                        .and_only_that());

            assert_that(results_at_time(m, t4)).is_empty();
        }

        // The regular cell expires at t1 while the static cell lasts until
        // t3: at t2 only the static column remains, at t4 nothing is left.
        {
            mutation m(pk, s);
            m.set_clustered_cell(ckey1, col_v1, expiring_cell(v1, t1));
            m.set_static_cell(col_s1, expiring_cell(v1, t3));

            assert_that(results_at_time(m, t2))
                    .has_only(a_row().with_column("s1", v1).and_only_that());

            assert_that(results_at_time(m, t4)).is_empty();
        }
    });
}
|
||||
|
||||
SEASTAR_TEST_CASE(test_tombstone_purge) {
|
||||
auto builder = schema_builder("tests", "tombstone_purge")
|
||||
.with_column("id", utf8_type, column_kind::partition_key)
|
||||
|
||||
@@ -50,6 +50,7 @@ struct test_config {
|
||||
unsigned partitions;
|
||||
unsigned concurrency;
|
||||
bool query_single_key;
|
||||
unsigned duration_in_seconds;
|
||||
};
|
||||
|
||||
std::ostream& operator<<(std::ostream& os, const test_config::run_mode& m) {
|
||||
@@ -79,7 +80,7 @@ future<> test_read(cql_test_env& env, test_config& cfg) {
|
||||
return time_parallel([&env, &cfg, id] {
|
||||
bytes key = make_key(cfg.query_single_key ? 0 : std::rand() % cfg.partitions);
|
||||
return env.execute_prepared(id, {{std::move(key)}}).discard_result();
|
||||
}, cfg.concurrency);
|
||||
}, cfg.concurrency, cfg.duration_in_seconds);
|
||||
});
|
||||
}
|
||||
|
||||
@@ -95,7 +96,7 @@ future<> test_write(cql_test_env& env, test_config& cfg) {
|
||||
return time_parallel([&env, &cfg, id] {
|
||||
bytes key = make_key(cfg.query_single_key ? 0 : std::rand() % cfg.partitions);
|
||||
return env.execute_prepared(id, {{std::move(key)}}).discard_result();
|
||||
}, cfg.concurrency);
|
||||
}, cfg.concurrency, cfg.duration_in_seconds);
|
||||
});
|
||||
}
|
||||
|
||||
@@ -125,6 +126,7 @@ int main(int argc, char** argv) {
|
||||
app.add_options()
|
||||
("partitions", bpo::value<unsigned>()->default_value(10000), "number of partitions")
|
||||
("write", "test write path instead of read path")
|
||||
("duration", bpo::value<unsigned>()->default_value(5), "test duration in seconds")
|
||||
("query-single-key", "test write path instead of read path")
|
||||
("concurrency", bpo::value<unsigned>()->default_value(100), "workers per core");
|
||||
|
||||
@@ -132,6 +134,7 @@ int main(int argc, char** argv) {
|
||||
make_env_for_test().then([&app] (auto env) {
|
||||
auto cfg = make_lw_shared<test_config>();
|
||||
cfg->partitions = app.configuration()["partitions"].as<unsigned>();
|
||||
cfg->duration_in_seconds = app.configuration()["duration"].as<unsigned>();
|
||||
cfg->concurrency = app.configuration()["concurrency"].as<unsigned>();
|
||||
cfg->mode = app.configuration().count("write") ? test_config::run_mode::write : test_config::run_mode::read;
|
||||
cfg->query_single_key = app.configuration().count("query-single-key");
|
||||
|
||||
@@ -49,6 +49,14 @@ row_assertion::matches(const query::result_set_row& row) const {
|
||||
}
|
||||
}
|
||||
}
|
||||
if (_only_that) {
|
||||
for (auto&& e : row.cells()) {
|
||||
auto name = to_bytes(e.first);
|
||||
if (!_expected_values.count(name)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
@@ -36,11 +36,16 @@
|
||||
|
||||
class row_assertion {
|
||||
std::map<bytes, data_value> _expected_values;
|
||||
bool _only_that = false;
|
||||
public:
|
||||
row_assertion& with_column(bytes name, data_value value) {
|
||||
_expected_values.emplace(name, value);
|
||||
return *this;
|
||||
}
|
||||
row_assertion& and_only_that() {
|
||||
_only_that = true;
|
||||
return *this;
|
||||
}
|
||||
private:
|
||||
friend class result_set_assertions;
|
||||
bool matches(const query::result_set_row& row) const;
|
||||
|
||||
@@ -106,13 +106,17 @@ int main(int argc, char** argv) {
|
||||
keys.push_back(key);
|
||||
}
|
||||
|
||||
auto reclaimable_memory = [] {
|
||||
return memory::stats().free_memory() + logalloc::shard_tracker().occupancy().free_space();
|
||||
};
|
||||
|
||||
std::cout << "memtable occupancy: " << mt->occupancy() << "\n";
|
||||
std::cout << "Cache occupancy: " << tracker.region().occupancy() << "\n";
|
||||
std::cout << "Free memory: " << memory::stats().free_memory() << "\n";
|
||||
std::cout << "Reclaimable memory: " << reclaimable_memory() << "\n";
|
||||
|
||||
// We need to have enough Free memory to copy memtable into cache
|
||||
// When this assertion fails, increase amount of memory
|
||||
assert(mt->occupancy().used_space() < memory::stats().free_memory());
|
||||
assert(mt->occupancy().used_space() < reclaimable_memory());
|
||||
|
||||
auto checker = [](const partition_key& key) {
|
||||
return partition_presence_checker_result::maybe_exists;
|
||||
@@ -146,13 +150,14 @@ int main(int argc, char** argv) {
|
||||
for (auto&& key : keys) {
|
||||
cache.touch(key);
|
||||
}
|
||||
std::cout << "Free memory: " << memory::stats().free_memory() << "\n";
|
||||
std::cout << "Reclaimable memory: " << reclaimable_memory() << "\n";
|
||||
std::cout << "Cache occupancy: " << tracker.region().occupancy() << "\n";
|
||||
};
|
||||
|
||||
std::deque<std::unique_ptr<char[]>> stuffing;
|
||||
auto fragment_free_space = [&] {
|
||||
stuffing.clear();
|
||||
std::cout << "Reclaimable memory: " << reclaimable_memory() << "\n";
|
||||
std::cout << "Free memory: " << memory::stats().free_memory() << "\n";
|
||||
std::cout << "Cache occupancy: " << tracker.region().occupancy() << "\n";
|
||||
|
||||
@@ -165,6 +170,7 @@ int main(int argc, char** argv) {
|
||||
}
|
||||
|
||||
std::cout << "After fragmenting:\n";
|
||||
std::cout << "Reclaimable memory: " << reclaimable_memory() << "\n";
|
||||
std::cout << "Free memory: " << memory::stats().free_memory() << "\n";
|
||||
std::cout << "Cache occupancy: " << tracker.region().occupancy() << "\n";
|
||||
};
|
||||
|
||||
@@ -1031,7 +1031,7 @@ SEASTAR_TEST_CASE(compaction_manager_test) {
|
||||
}).then([cf, cm] {
|
||||
// remove cf from compaction manager; this will wait for the
|
||||
// ongoing compaction to finish.
|
||||
return cm->remove(&*cf).then([cf, cm] {
|
||||
return cf->stop().then([cf, cm] {
|
||||
// expect sstables of cf to be compacted.
|
||||
BOOST_REQUIRE(cf->sstables_count() == 1);
|
||||
// stop all compaction manager tasks.
|
||||
|
||||
@@ -474,3 +474,109 @@ SEASTAR_TEST_CASE(broken_ranges_collection) {
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
// Scylla does not currently support generic range-tombstone - only ranges
|
||||
// which are a complete clustering-key prefix are supported because our
|
||||
// row_tombstone only works on whole rows. This is good enough because
|
||||
// in Cassandra 2 (whose sstables we support) there is no way using CQL to
|
||||
// create a generic range, because the DELETE and UPDATE statement's "WHERE"
|
||||
// only takes the "=" operator, leading to a deletion of entire rows.
|
||||
//
|
||||
// However, in one important case the sstable written by Cassandra might look
|
||||
// like it has generic range tombstone: consider two overlapping tombstones,
|
||||
// one deleting a bigger prefix than the other:
|
||||
//
|
||||
// create COLUMNFAMILY tab (pk text, ck1 text, ck2 text, data text, primary key(pk, ck1, ck2));
|
||||
// delete from tab where pk = 'pk' and ck1 = 'aaa';
|
||||
// delete from tab where pk = 'pk' and ck1 = 'aaa' and ck2 = 'bbb';
|
||||
//
|
||||
// The first deletion covers the second, but nevertheless we cannot drop the
|
||||
// smaller one because the two deletions have different timestamps. But while
|
||||
// it is not allowed to drop the smaller deletion, it is possible to split the
|
||||
// larger range to three ranges where one of them is the smaller range
|
||||
// and then we have two range tombstones with identical ranges - and can keep
|
||||
// only the newer one. This splitting is what Cassandra does: Cassandra does
|
||||
// not want to have overlapping range tombstones, so it converts them (see
|
||||
// RangeTombstoneList.java) into non-overlapping range-tombstones, as described
|
||||
// above. In the above example, the resulting sstable is (sstable2json format)
|
||||
//
|
||||
// {"key": "pk",
|
||||
// "cells": [["aaa:_","aaa:bbb:_",1459334681228103,"t",1459334681],
|
||||
// ["aaa:bbb:_","aaa:bbb:!",1459334681244989,"t",1459334681],
|
||||
// ["aaa:bbb:!","aaa:!",1459334681228103,"t",1459334681]]}
|
||||
// ]
|
||||
//
|
||||
// Note that the middle tombstone has a different timestamp than the other.
|
||||
//
|
||||
// In this sstable, the first and third tombstones look like "generic" ranges,
|
||||
// not covering an entire prefix, so we cannot represent these three
|
||||
// tombstones in our in-memory data structure. Instead, we need to convert the
|
||||
// three non-overlapping tombstones to two overlapping whole-prefix tombstones,
|
||||
// the two we started with.
|
||||
// That is what this test tests - we read an sstable as above and verify that
|
||||
// our sstable reading code converted it to two overlapping tombstones.
|
||||
|
||||
// Schema of the "try1"."tab" table used by the tombstone overlap test:
// partition key pk, clustering key (ck1, ck2), no regular or static columns.
// Built once per thread on first use and handed out from the cached copy.
static schema_ptr tombstone_overlap_schema() {
    static thread_local schema_ptr cached = [] {
        auto raw = make_lw_shared(schema(generate_legacy_id("try1", "tab"), "try1", "tab",
            // partition key
            {{"pk", utf8_type}},
            // clustering key
            {{"ck1", utf8_type}, {"ck2", utf8_type}},
            // regular columns
            {},
            // static columns
            {},
            // regular column name type
            utf8_type,
            // comment
            ""
        ));
        schema_builder builder(std::move(raw));
        return builder.build(schema_builder::compact_storage::no);
    }();
    return cached;
}
|
||||
|
||||
|
||||
// Creates an sstable object for the given keyspace, table, directory and
// generation using the "ka" version and "big" format, loads it, and resolves
// to the loaded sstable.
static future<sstable_ptr> ka_sst(sstring ks, sstring cf, sstring dir, unsigned long generation) {
    auto table = make_lw_shared<sstable>(ks, cf, dir, generation, sstables::sstable::version_types::ka, big);
    // The continuation holds its own reference, keeping the sstable alive
    // until load() has resolved.
    return table->load().then([table] {
        return make_ready_future<sstable_ptr>(table);
    });
}
|
||||
|
||||
// Reads the tombstone_overlap sstable and checks, for every mutation it
// yields, that the range tombstones stored in the file were converted into
// exactly one row tombstone (prefix "aaa") and one deleted clustering row
// ("aaa", "bbb"), each carrying its own timestamp.
SEASTAR_TEST_CASE(tombstone_in_tombstone) {
    return ka_sst("try1", "tab", "tests/sstables/tombstone_overlap", 1).then([] (auto sstp) {
        auto s = tombstone_overlap_schema();
        // sstp is captured by the continuations below to keep the sstable
        // alive for as long as the reader is in use.
        return do_with(sstp->read_rows(s), [sstp, s] (auto& reader) {
            return repeat([sstp, s, &reader] {
                return reader.read().then([s] (mutation_opt mut) {
                    // A disengaged mutation_opt ends the iteration.
                    if (!mut) {
                        return stop_iteration::yes;
                    }
                    BOOST_REQUIRE((bytes_view(mut->key()) == bytes{'\x00','\x02','p','k'}));
                    // We expect to see two overlapping deletions, as explained
                    // above. Somewhat counterintuitively, scylla represents
                    // deleting a small row with all clustering keys set - not
                    // as a "row tombstone" but rather as a deleted clustering row.
                    // So we expect to see one row tombstone and one deleted row.
                    auto& rts = mut->partition().row_tombstones();
                    BOOST_REQUIRE(rts.size() == 1);
                    // Entries are only inspected, so bind by reference
                    // instead of copying each one.
                    for (const auto& e : rts) {
                        BOOST_REQUIRE((bytes_view(e.prefix()) == bytes{'\x00','\x03','a','a','a'}));
                        BOOST_REQUIRE(e.t().timestamp == 1459334681228103LL);
                    }
                    auto& rows = mut->partition().clustered_rows();
                    BOOST_REQUIRE(rows.size() == 1);
                    for (const auto& e : rows) {
                        BOOST_REQUIRE((bytes_view(e.key()) == bytes{'\x00','\x03','a','a','a', '\x00', '\x03', 'b', 'b', 'b'}));
                        BOOST_REQUIRE(e.row().deleted_at().timestamp == 1459334681244989LL);
                    }

                    return stop_iteration::no;
                });
            });
        });
    });
}
|
||||
|
||||
@@ -170,6 +170,33 @@ SEASTAR_TEST_CASE(big_summary_query_32) {
|
||||
return summary_query<32, 0xc4000, 182>("tests/sstables/bigsummary", 76);
|
||||
}
|
||||
|
||||
// The following two files are just a copy of uncompressed's 1. But the Summary
|
||||
// is removed (and removed from the TOC as well). We should reconstruct it
|
||||
// in this case, so the queries should still go through
|
||||
// Generation 2 of the "uncompressed" sstable ships without a Summary
// component; this summary query is expected to succeed regardless.
SEASTAR_TEST_CASE(missing_summary_query_ok) {
    return summary_query<0, 0, 5>("tests/sstables/uncompressed", 2);
}
|
||||
|
||||
// Same sstable as missing_summary_query_ok, but with a first template
// argument of 2, for which the query is expected to fail.
SEASTAR_TEST_CASE(missing_summary_query_fail) {
    return summary_query_fail<2, 0, 5>("tests/sstables/uncompressed", 2);
}
|
||||
|
||||
// Same sstable as missing_summary_query_ok, but with a negative first
// template argument, for which the query is expected to fail.
SEASTAR_TEST_CASE(missing_summary_query_negative_fail) {
    return summary_query_fail<-2, 0, 5>("tests/sstables/uncompressed", 2);
}
|
||||
|
||||
// Loads generation 2 of the "uncompressed" sstable and checks the summary
// obtained for it: a single entry and position, and the expected first and
// last keys.
SEASTAR_TEST_CASE(missing_summary_first_last_sane) {
    return reusable_sst("tests/sstables/uncompressed", 2).then([] (sstable_ptr ptr) {
        auto& rebuilt = sstables::test(ptr).get_summary();

        // Exactly one entry, consistently reported by header and vectors.
        BOOST_REQUIRE(rebuilt.header.size == 1);
        BOOST_REQUIRE(rebuilt.positions.size() == 1);
        BOOST_REQUIRE(rebuilt.entries.size() == 1);

        // Key range covered by the sstable.
        BOOST_REQUIRE(bytes_view(rebuilt.first_key) == as_bytes("vinna"));
        BOOST_REQUIRE(bytes_view(rebuilt.last_key) == as_bytes("finna"));

        return make_ready_future<>();
    });
}
|
||||
|
||||
static future<sstable_ptr> do_write_sst(sstring load_dir, sstring write_dir, unsigned long generation) {
|
||||
auto sst = make_lw_shared<sstable>("ks", "cf", load_dir, generation, la, big);
|
||||
return sst->load().then([sst, write_dir, generation] {
|
||||
@@ -864,16 +891,17 @@ SEASTAR_TEST_CASE(reshuffle) {
|
||||
auto cf = make_lw_shared<column_family>(uncompressed_schema(), cfg, column_family::no_commitlog(), *cm);
|
||||
cf->start();
|
||||
cf->mark_ready_for_writes();
|
||||
return cf->reshuffle_sstables(3).then([cm, cf] (std::vector<sstables::entry_descriptor> reshuffled) {
|
||||
BOOST_REQUIRE(reshuffled.size() == 2);
|
||||
BOOST_REQUIRE(reshuffled[0].generation == 3);
|
||||
BOOST_REQUIRE(reshuffled[1].generation == 4);
|
||||
std::set<int64_t> existing_sstables = { 1, 5 };
|
||||
return cf->reshuffle_sstables(existing_sstables, 6).then([cm, cf] (std::vector<sstables::entry_descriptor> reshuffled) {
|
||||
BOOST_REQUIRE(reshuffled.size() == 1);
|
||||
BOOST_REQUIRE(reshuffled[0].generation == 6);
|
||||
return when_all(
|
||||
test_sstable_exists("tests/sstables/generation", 1, true),
|
||||
test_sstable_exists("tests/sstables/generation", 2, false),
|
||||
test_sstable_exists("tests/sstables/generation", 3, true),
|
||||
test_sstable_exists("tests/sstables/generation", 4, true),
|
||||
test_sstable_exists("tests/sstables/generation", 5, false),
|
||||
test_sstable_exists("tests/sstables/generation", 3, false),
|
||||
test_sstable_exists("tests/sstables/generation", 4, false),
|
||||
test_sstable_exists("tests/sstables/generation", 5, true),
|
||||
test_sstable_exists("tests/sstables/generation", 6, true),
|
||||
test_sstable_exists("tests/sstables/generation", 10, false)
|
||||
).discard_result().then([cm] {
|
||||
return cm->stop();
|
||||
|
||||
Binary file not shown.
BIN
tests/sstables/tombstone_overlap/try1-tab-ka-1-Data.db
Normal file
BIN
tests/sstables/tombstone_overlap/try1-tab-ka-1-Data.db
Normal file
Binary file not shown.
@@ -0,0 +1 @@
|
||||
4178122188
|
||||
BIN
tests/sstables/tombstone_overlap/try1-tab-ka-1-Filter.db
Normal file
BIN
tests/sstables/tombstone_overlap/try1-tab-ka-1-Filter.db
Normal file
Binary file not shown.
BIN
tests/sstables/tombstone_overlap/try1-tab-ka-1-Index.db
Normal file
BIN
tests/sstables/tombstone_overlap/try1-tab-ka-1-Index.db
Normal file
Binary file not shown.
BIN
tests/sstables/tombstone_overlap/try1-tab-ka-1-Statistics.db
Normal file
BIN
tests/sstables/tombstone_overlap/try1-tab-ka-1-Statistics.db
Normal file
Binary file not shown.
BIN
tests/sstables/tombstone_overlap/try1-tab-ka-1-Summary.db
Normal file
BIN
tests/sstables/tombstone_overlap/try1-tab-ka-1-Summary.db
Normal file
Binary file not shown.
8
tests/sstables/tombstone_overlap/try1-tab-ka-1-TOC.txt
Normal file
8
tests/sstables/tombstone_overlap/try1-tab-ka-1-TOC.txt
Normal file
@@ -0,0 +1,8 @@
|
||||
TOC.txt
|
||||
Filter.db
|
||||
CompressionInfo.db
|
||||
Index.db
|
||||
Digest.sha1
|
||||
Summary.db
|
||||
Data.db
|
||||
Statistics.db
|
||||
BIN
tests/sstables/uncompressed/la-2-big-CRC.db
Normal file
BIN
tests/sstables/uncompressed/la-2-big-CRC.db
Normal file
Binary file not shown.
BIN
tests/sstables/uncompressed/la-2-big-Data.db
Normal file
BIN
tests/sstables/uncompressed/la-2-big-Data.db
Normal file
Binary file not shown.
1
tests/sstables/uncompressed/la-2-big-Digest.sha1
Normal file
1
tests/sstables/uncompressed/la-2-big-Digest.sha1
Normal file
@@ -0,0 +1 @@
|
||||
748507322
|
||||
BIN
tests/sstables/uncompressed/la-2-big-Filter.db
Normal file
BIN
tests/sstables/uncompressed/la-2-big-Filter.db
Normal file
Binary file not shown.
BIN
tests/sstables/uncompressed/la-2-big-Index.db
Normal file
BIN
tests/sstables/uncompressed/la-2-big-Index.db
Normal file
Binary file not shown.
BIN
tests/sstables/uncompressed/la-2-big-Statistics.db
Normal file
BIN
tests/sstables/uncompressed/la-2-big-Statistics.db
Normal file
Binary file not shown.
7
tests/sstables/uncompressed/la-2-big-TOC.txt
Normal file
7
tests/sstables/uncompressed/la-2-big-TOC.txt
Normal file
@@ -0,0 +1,7 @@
|
||||
Data.db
|
||||
Filter.db
|
||||
CRC.db
|
||||
Statistics.db
|
||||
Digest.sha1
|
||||
Index.db
|
||||
TOC.txt
|
||||
@@ -40,8 +40,7 @@ static query::result to_data_query_result(mutation_reader& reader, const query::
|
||||
if (!mo) {
|
||||
break;
|
||||
}
|
||||
auto pb = builder.add_partition(*mo->schema(), mo->key());
|
||||
mo->partition().query(pb, *mo->schema(), now);
|
||||
std::move(*mo).query(builder, slice, now);
|
||||
}
|
||||
return builder.build();
|
||||
}
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user