Compare commits
104 Commits
next
...
scylla-4.2
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
4fb8ebccff | ||
|
|
d1fa0adcbe | ||
|
|
46b56d885e | ||
|
|
94597e38e2 | ||
|
|
c74ba1bc36 | ||
|
|
06cfc63c59 | ||
|
|
56d25930ec | ||
|
|
fa1cd048d7 | ||
|
|
94b754eee5 | ||
|
|
e02defb3bf | ||
|
|
95e712e244 | ||
|
|
a9109f068c | ||
|
|
2e71779970 | ||
|
|
5ed7de81ad | ||
|
|
5e21c9bd8a | ||
|
|
8e22fddc9e | ||
|
|
6cf2d998e3 | ||
|
|
5fcc1f205c | ||
|
|
08c35c1aad | ||
|
|
54a913d452 | ||
|
|
8f9cd98c45 | ||
|
|
2893f6e43b | ||
|
|
18d6c27b05 | ||
|
|
97d7f6990c | ||
|
|
9855e18c0d | ||
|
|
5d5ddd3539 | ||
|
|
0a72893fef | ||
|
|
1e45557d2a | ||
|
|
96e1e95c1d | ||
|
|
338196eab6 | ||
|
|
71cbec966b | ||
|
|
067a065553 | ||
|
|
e00bdc4f57 | ||
|
|
ad40f9222c | ||
|
|
5d90fa17d6 | ||
|
|
bf0c493c28 | ||
|
|
26cb0935f0 | ||
|
|
4e97d562eb | ||
|
|
3f1b932c04 | ||
|
|
b9498ab947 | ||
|
|
e1b3d6d0a2 | ||
|
|
67378cda03 | ||
|
|
6ab3965465 | ||
|
|
ca22461a9b | ||
|
|
b3d83ad073 | ||
|
|
7e6f47fbce | ||
|
|
9ca49cba6b | ||
|
|
6da8ba2d3f | ||
|
|
366c1c2c59 | ||
|
|
05cdb173f3 | ||
|
|
8c929a96cf | ||
|
|
a9aa10e8de | ||
|
|
989d8fe636 | ||
|
|
48d79a1d9f | ||
|
|
4c65413413 | ||
|
|
e931d28673 | ||
|
|
ec71688ff2 | ||
|
|
9710a91100 | ||
|
|
b052f3f5ce | ||
|
|
d70cab0444 | ||
|
|
0ce3799187 | ||
|
|
ee113eca52 | ||
|
|
be11514985 | ||
|
|
ec874bdc31 | ||
|
|
43169ffa2c | ||
|
|
c5ed14bff6 | ||
|
|
8366eda943 | ||
|
|
5d0b0dd4c4 | ||
|
|
6f259be5f1 | ||
|
|
16e512e21c | ||
|
|
c61dc4e87d | ||
|
|
af76a3ba79 | ||
|
|
8fb5ebb2c6 | ||
|
|
bfb11defdd | ||
|
|
2d1ddcbb6a | ||
|
|
4c560b63f0 | ||
|
|
00155e32b1 | ||
|
|
b06dffcc19 | ||
|
|
508e58ef9e | ||
|
|
776faa809f | ||
|
|
7037f43a17 | ||
|
|
bd713959ce | ||
|
|
b7c5a918cb | ||
|
|
fb2ae9e66b | ||
|
|
7a7ed8c65d | ||
|
|
7b9be752ec | ||
|
|
903e967a16 | ||
|
|
b84946895c | ||
|
|
a27188886a | ||
|
|
51d4efc321 | ||
|
|
0847eea8d6 | ||
|
|
35ad57cb9c | ||
|
|
42b0b9ad08 | ||
|
|
68b95bf2ac | ||
|
|
fea83f6ae0 | ||
|
|
76618a7e06 | ||
|
|
189a08ac72 | ||
|
|
a3e9915a83 | ||
|
|
e4bc14ec1a | ||
|
|
972acb6d56 | ||
|
|
7fbfedf025 | ||
|
|
5f175f8103 | ||
|
|
674ad6656a | ||
|
|
58498b4b6c |
2
.gitmodules
vendored
2
.gitmodules
vendored
@@ -1,6 +1,6 @@
|
||||
[submodule "seastar"]
|
||||
path = seastar
|
||||
url = ../seastar
|
||||
url = ../scylla-seastar
|
||||
ignore = dirty
|
||||
[submodule "swagger-ui"]
|
||||
path = swagger-ui
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
#!/bin/sh
|
||||
|
||||
PRODUCT=scylla
|
||||
VERSION=666.development
|
||||
VERSION=4.2.1
|
||||
|
||||
if test -f version
|
||||
then
|
||||
|
||||
@@ -129,7 +129,7 @@ future<std::string> get_key_from_roles(cql3::query_processor& qp, std::string us
|
||||
auth::meta::roles_table::qualified_name(), auth::meta::roles_table::role_col_name);
|
||||
|
||||
auto cl = auth::password_authenticator::consistency_for_user(username);
|
||||
auto timeout = auth::internal_distributed_timeout_config();
|
||||
auto& timeout = auth::internal_distributed_timeout_config();
|
||||
return qp.execute_internal(query, cl, timeout, {sstring(username)}, true).then_wrapped([username = std::move(username)] (future<::shared_ptr<cql3::untyped_result_set>> f) {
|
||||
auto res = f.get0();
|
||||
auto salted_hash = std::optional<sstring>();
|
||||
|
||||
@@ -98,6 +98,11 @@ struct nonempty : public size_check {
|
||||
|
||||
// Check that array has the expected number of elements
|
||||
static void verify_operand_count(const rjson::value* array, const size_check& expected, const rjson::value& op) {
|
||||
if (!array && expected(0)) {
|
||||
// If expected() allows an empty AttributeValueList, it is also fine
|
||||
// that it is missing.
|
||||
return;
|
||||
}
|
||||
if (!array || !array->IsArray()) {
|
||||
throw api_error("ValidationException", "With ComparisonOperator, AttributeValueList must be given and an array");
|
||||
}
|
||||
|
||||
@@ -626,11 +626,8 @@ void rmw_operation::set_default_write_isolation(std::string_view value) {
|
||||
default_write_isolation = parse_write_isolation(value);
|
||||
}
|
||||
|
||||
// FIXME: Updating tags currently relies on updating schema, which may be subject
|
||||
// to races during concurrent updates of the same table. Once Scylla schema updates
|
||||
// are fixed, this issue will automatically get fixed as well.
|
||||
enum class update_tags_action { add_tags, delete_tags };
|
||||
static future<> update_tags(service::migration_manager& mm, const rjson::value& tags, schema_ptr schema, std::map<sstring, sstring>&& tags_map, update_tags_action action) {
|
||||
static void update_tags_map(const rjson::value& tags, std::map<sstring, sstring>& tags_map, update_tags_action action) {
|
||||
if (action == update_tags_action::add_tags) {
|
||||
for (auto it = tags.Begin(); it != tags.End(); ++it) {
|
||||
const rjson::value& key = (*it)["Key"];
|
||||
@@ -652,28 +649,20 @@ static future<> update_tags(service::migration_manager& mm, const rjson::value&
|
||||
}
|
||||
|
||||
if (tags_map.size() > 50) {
|
||||
return make_exception_future<>(api_error("ValidationException", "Number of Tags exceed the current limit for the provided ResourceArn"));
|
||||
throw api_error("ValidationException", "Number of Tags exceed the current limit for the provided ResourceArn");
|
||||
}
|
||||
validate_tags(tags_map);
|
||||
}
|
||||
|
||||
// FIXME: Updating tags currently relies on updating schema, which may be subject
|
||||
// to races during concurrent updates of the same table. Once Scylla schema updates
|
||||
// are fixed, this issue will automatically get fixed as well.
|
||||
static future<> update_tags(service::migration_manager& mm, schema_ptr schema, std::map<sstring, sstring>&& tags_map) {
|
||||
schema_builder builder(schema);
|
||||
builder.set_extensions(schema::extensions_map{{sstring(tags_extension::NAME), ::make_shared<tags_extension>(std::move(tags_map))}});
|
||||
return mm.announce_column_family_update(builder.build(), false, std::vector<view_ptr>(), false);
|
||||
}
|
||||
|
||||
static future<> add_tags(service::migration_manager& mm, service::storage_proxy& proxy, schema_ptr schema, rjson::value& request_info) {
|
||||
const rjson::value* tags = rjson::find(request_info, "Tags");
|
||||
if (!tags || !tags->IsArray()) {
|
||||
return make_exception_future<>(api_error("ValidationException", format("Cannot parse tags")));
|
||||
}
|
||||
if (tags->Size() < 1) {
|
||||
return make_exception_future<>(api_error("ValidationException", "The number of tags must be at least 1"));
|
||||
}
|
||||
|
||||
std::map<sstring, sstring> tags_map = get_tags_of_table(schema);
|
||||
return update_tags(mm, rjson::copy(*tags), schema, std::move(tags_map), update_tags_action::add_tags);
|
||||
}
|
||||
|
||||
future<executor::request_return_type> executor::tag_resource(client_state& client_state, service_permit permit, rjson::value request) {
|
||||
_stats.api_operations.tag_resource++;
|
||||
|
||||
@@ -683,7 +672,16 @@ future<executor::request_return_type> executor::tag_resource(client_state& clien
|
||||
return api_error("AccessDeniedException", "Incorrect resource identifier");
|
||||
}
|
||||
schema_ptr schema = get_table_from_arn(_proxy, rjson::to_string_view(*arn));
|
||||
add_tags(_mm, _proxy, schema, request).get();
|
||||
std::map<sstring, sstring> tags_map = get_tags_of_table(schema);
|
||||
const rjson::value* tags = rjson::find(request, "Tags");
|
||||
if (!tags || !tags->IsArray()) {
|
||||
return api_error("ValidationException", format("Cannot parse tags"));
|
||||
}
|
||||
if (tags->Size() < 1) {
|
||||
return api_error("ValidationException", "The number of tags must be at least 1") ;
|
||||
}
|
||||
update_tags_map(*tags, tags_map, update_tags_action::add_tags);
|
||||
update_tags(_mm, schema, std::move(tags_map)).get();
|
||||
return json_string("");
|
||||
});
|
||||
}
|
||||
@@ -704,7 +702,8 @@ future<executor::request_return_type> executor::untag_resource(client_state& cli
|
||||
schema_ptr schema = get_table_from_arn(_proxy, rjson::to_string_view(*arn));
|
||||
|
||||
std::map<sstring, sstring> tags_map = get_tags_of_table(schema);
|
||||
update_tags(_mm, *tags, schema, std::move(tags_map), update_tags_action::delete_tags).get();
|
||||
update_tags_map(*tags, tags_map, update_tags_action::delete_tags);
|
||||
update_tags(_mm, schema, std::move(tags_map)).get();
|
||||
return json_string("");
|
||||
});
|
||||
}
|
||||
@@ -891,9 +890,22 @@ future<executor::request_return_type> executor::create_table(client_state& clien
|
||||
view_builders.emplace_back(std::move(view_builder));
|
||||
}
|
||||
}
|
||||
if (rjson::find(request, "SSESpecification")) {
|
||||
return make_ready_future<request_return_type>(api_error("ValidationException", "SSESpecification: configuring encryption-at-rest is not yet supported."));
|
||||
|
||||
// We don't yet support configuring server-side encryption (SSE) via the
|
||||
// SSESpecifiction attribute, but an SSESpecification with Enabled=false
|
||||
// is simply the default, and should be accepted:
|
||||
rjson::value* sse_specification = rjson::find(request, "SSESpecification");
|
||||
if (sse_specification && sse_specification->IsObject()) {
|
||||
rjson::value* enabled = rjson::find(*sse_specification, "Enabled");
|
||||
if (!enabled || !enabled->IsBool()) {
|
||||
return make_ready_future<request_return_type>(api_error("ValidationException", "SSESpecification needs boolean Enabled"));
|
||||
}
|
||||
if (enabled->GetBool()) {
|
||||
// TODO: full support for SSESpecification
|
||||
return make_ready_future<request_return_type>(api_error("ValidationException", "SSESpecification: configuring encryption-at-rest is not yet supported."));
|
||||
}
|
||||
}
|
||||
|
||||
// We don't yet support streams (CDC), but a StreamSpecification asking
|
||||
// *not* to use streams should be accepted:
|
||||
rjson::value* stream_specification = rjson::find(request, "StreamSpecification");
|
||||
@@ -908,6 +920,14 @@ future<executor::request_return_type> executor::create_table(client_state& clien
|
||||
}
|
||||
}
|
||||
|
||||
// Parse the "Tags" parameter early, so we can avoid creating the table
|
||||
// at all if this parsing failed.
|
||||
const rjson::value* tags = rjson::find(request, "Tags");
|
||||
std::map<sstring, sstring> tags_map;
|
||||
if (tags && tags->IsArray()) {
|
||||
update_tags_map(*tags, tags_map, update_tags_action::add_tags);
|
||||
}
|
||||
|
||||
builder.set_extensions(schema::extensions_map{{sstring(tags_extension::NAME), ::make_shared<tags_extension>()}});
|
||||
schema_ptr schema = builder.build();
|
||||
auto where_clause_it = where_clauses.begin();
|
||||
@@ -928,14 +948,14 @@ future<executor::request_return_type> executor::create_table(client_state& clien
|
||||
|
||||
return create_keyspace(keyspace_name).handle_exception_type([] (exceptions::already_exists_exception&) {
|
||||
// Ignore the fact that the keyspace may already exist. See discussion in #6340
|
||||
}).then([this, table_name, request = std::move(request), schema, view_builders = std::move(view_builders)] () mutable {
|
||||
return futurize_invoke([&] { return _mm.announce_new_column_family(schema, false); }).then([this, table_info = std::move(request), schema, view_builders = std::move(view_builders)] () mutable {
|
||||
}).then([this, table_name, request = std::move(request), schema, view_builders = std::move(view_builders), tags_map = std::move(tags_map)] () mutable {
|
||||
return futurize_invoke([&] { return _mm.announce_new_column_family(schema, false); }).then([this, table_info = std::move(request), schema, view_builders = std::move(view_builders), tags_map = std::move(tags_map)] () mutable {
|
||||
return parallel_for_each(std::move(view_builders), [this, schema] (schema_builder builder) {
|
||||
return _mm.announce_new_view(view_ptr(builder.build()));
|
||||
}).then([this, table_info = std::move(table_info), schema] () mutable {
|
||||
}).then([this, table_info = std::move(table_info), schema, tags_map = std::move(tags_map)] () mutable {
|
||||
future<> f = make_ready_future<>();
|
||||
if (rjson::find(table_info, "Tags")) {
|
||||
f = add_tags(_mm, _proxy, schema, table_info);
|
||||
if (!tags_map.empty()) {
|
||||
f = update_tags(_mm, schema, std::move(tags_map));
|
||||
}
|
||||
return f.then([this] {
|
||||
return wait_for_schema_agreement(_mm, db::timeout_clock::now() + 10s);
|
||||
@@ -963,15 +983,24 @@ class attribute_collector {
|
||||
void add(bytes&& name, atomic_cell&& cell) {
|
||||
collected.emplace(std::move(name), std::move(cell));
|
||||
}
|
||||
void add(const bytes& name, atomic_cell&& cell) {
|
||||
collected.emplace(name, std::move(cell));
|
||||
}
|
||||
public:
|
||||
attribute_collector() : collected(attrs_type()->get_keys_type()->as_less_comparator()) { }
|
||||
void put(bytes&& name, bytes&& val, api::timestamp_type ts) {
|
||||
add(std::move(name), atomic_cell::make_live(*bytes_type, ts, std::move(val), atomic_cell::collection_member::yes));
|
||||
void put(bytes&& name, const bytes& val, api::timestamp_type ts) {
|
||||
add(std::move(name), atomic_cell::make_live(*bytes_type, ts, val, atomic_cell::collection_member::yes));
|
||||
|
||||
}
|
||||
void put(const bytes& name, const bytes& val, api::timestamp_type ts) {
|
||||
add(name, atomic_cell::make_live(*bytes_type, ts, val, atomic_cell::collection_member::yes));
|
||||
}
|
||||
void del(bytes&& name, api::timestamp_type ts) {
|
||||
add(std::move(name), atomic_cell::make_dead(ts, gc_clock::now()));
|
||||
}
|
||||
void del(const bytes& name, api::timestamp_type ts) {
|
||||
add(name, atomic_cell::make_dead(ts, gc_clock::now()));
|
||||
}
|
||||
collection_mutation_description to_mut() {
|
||||
collection_mutation_description ret;
|
||||
for (auto&& e : collected) {
|
||||
@@ -1048,7 +1077,7 @@ public:
|
||||
put_or_delete_item(const rjson::value& item, schema_ptr schema, put_item);
|
||||
// put_or_delete_item doesn't keep a reference to schema (so it can be
|
||||
// moved between shards for LWT) so it needs to be given again to build():
|
||||
mutation build(schema_ptr schema, api::timestamp_type ts);
|
||||
mutation build(schema_ptr schema, api::timestamp_type ts) const;
|
||||
const partition_key& pk() const { return _pk; }
|
||||
const clustering_key& ck() const { return _ck; }
|
||||
};
|
||||
@@ -1077,7 +1106,7 @@ put_or_delete_item::put_or_delete_item(const rjson::value& item, schema_ptr sche
|
||||
}
|
||||
}
|
||||
|
||||
mutation put_or_delete_item::build(schema_ptr schema, api::timestamp_type ts) {
|
||||
mutation put_or_delete_item::build(schema_ptr schema, api::timestamp_type ts) const {
|
||||
mutation m(schema, _pk);
|
||||
// If there's no clustering key, a tombstone should be created directly
|
||||
// on a partition, not on a clustering row - otherwise it will look like
|
||||
@@ -1099,7 +1128,7 @@ mutation put_or_delete_item::build(schema_ptr schema, api::timestamp_type ts) {
|
||||
for (auto& c : *_cells) {
|
||||
const column_definition* cdef = schema->get_column_definition(c.column_name);
|
||||
if (!cdef) {
|
||||
attrs_collector.put(std::move(c.column_name), std::move(c.value), ts);
|
||||
attrs_collector.put(c.column_name, c.value, ts);
|
||||
} else {
|
||||
row.cells().apply(*cdef, atomic_cell::make_live(*cdef->type, ts, std::move(c.value)));
|
||||
}
|
||||
@@ -1410,7 +1439,7 @@ public:
|
||||
check_needs_read_before_write(_condition_expression) ||
|
||||
_returnvalues == returnvalues::ALL_OLD;
|
||||
}
|
||||
virtual std::optional<mutation> apply(std::unique_ptr<rjson::value> previous_item, api::timestamp_type ts) override {
|
||||
virtual std::optional<mutation> apply(std::unique_ptr<rjson::value> previous_item, api::timestamp_type ts) const override {
|
||||
if (!verify_expected(_request, previous_item.get()) ||
|
||||
!verify_condition_expression(_condition_expression, previous_item.get())) {
|
||||
// If the update is to be cancelled because of an unfulfilled Expected
|
||||
@@ -1420,6 +1449,8 @@ public:
|
||||
}
|
||||
if (_returnvalues == returnvalues::ALL_OLD && previous_item) {
|
||||
_return_attributes = std::move(*previous_item);
|
||||
} else {
|
||||
_return_attributes = {};
|
||||
}
|
||||
return _mutation_builder.build(_schema, ts);
|
||||
}
|
||||
@@ -1493,7 +1524,7 @@ public:
|
||||
check_needs_read_before_write(_condition_expression) ||
|
||||
_returnvalues == returnvalues::ALL_OLD;
|
||||
}
|
||||
virtual std::optional<mutation> apply(std::unique_ptr<rjson::value> previous_item, api::timestamp_type ts) override {
|
||||
virtual std::optional<mutation> apply(std::unique_ptr<rjson::value> previous_item, api::timestamp_type ts) const override {
|
||||
if (!verify_expected(_request, previous_item.get()) ||
|
||||
!verify_condition_expression(_condition_expression, previous_item.get())) {
|
||||
// If the update is to be cancelled because of an unfulfilled Expected
|
||||
@@ -1503,6 +1534,8 @@ public:
|
||||
}
|
||||
if (_returnvalues == returnvalues::ALL_OLD && previous_item) {
|
||||
_return_attributes = std::move(*previous_item);
|
||||
} else {
|
||||
_return_attributes = {};
|
||||
}
|
||||
return _mutation_builder.build(_schema, ts);
|
||||
}
|
||||
@@ -1577,7 +1610,7 @@ public:
|
||||
virtual ~put_or_delete_item_cas_request() = default;
|
||||
virtual std::optional<mutation> apply(foreign_ptr<lw_shared_ptr<query::result>> qr, const query::partition_slice& slice, api::timestamp_type ts) override {
|
||||
std::optional<mutation> ret;
|
||||
for (put_or_delete_item& mutation_builder : _mutation_builders) {
|
||||
for (const put_or_delete_item& mutation_builder : _mutation_builders) {
|
||||
// We assume all these builders have the same partition.
|
||||
if (ret) {
|
||||
ret->apply(mutation_builder.build(schema, ts));
|
||||
@@ -1906,7 +1939,7 @@ public:
|
||||
|
||||
update_item_operation(service::storage_proxy& proxy, rjson::value&& request);
|
||||
virtual ~update_item_operation() = default;
|
||||
virtual std::optional<mutation> apply(std::unique_ptr<rjson::value> previous_item, api::timestamp_type ts) override;
|
||||
virtual std::optional<mutation> apply(std::unique_ptr<rjson::value> previous_item, api::timestamp_type ts) const override;
|
||||
bool needs_read_before_write() const;
|
||||
};
|
||||
|
||||
@@ -1984,7 +2017,7 @@ update_item_operation::needs_read_before_write() const {
|
||||
}
|
||||
|
||||
std::optional<mutation>
|
||||
update_item_operation::apply(std::unique_ptr<rjson::value> previous_item, api::timestamp_type ts) {
|
||||
update_item_operation::apply(std::unique_ptr<rjson::value> previous_item, api::timestamp_type ts) const {
|
||||
if (!verify_expected(_request, previous_item.get()) ||
|
||||
!verify_condition_expression(_condition_expression, previous_item.get())) {
|
||||
// If the update is to be cancelled because of an unfulfilled
|
||||
|
||||
@@ -87,7 +87,11 @@ protected:
|
||||
// When _returnvalues != NONE, apply() should store here, in JSON form,
|
||||
// the values which are to be returned in the "Attributes" field.
|
||||
// The default null JSON means do not return an Attributes field at all.
|
||||
rjson::value _return_attributes;
|
||||
// This field is marked "mutable" so that the const apply() can modify
|
||||
// it (see explanation below), but note that because apply() may be
|
||||
// called more than once, if apply() will sometimes set this field it
|
||||
// must set it (even if just to the default empty value) every time.
|
||||
mutable rjson::value _return_attributes;
|
||||
public:
|
||||
// The constructor of a rmw_operation subclass should parse the request
|
||||
// and try to discover as many input errors as it can before really
|
||||
@@ -100,7 +104,12 @@ public:
|
||||
// conditional expression, apply() should return an empty optional.
|
||||
// apply() may throw if it encounters input errors not discovered during
|
||||
// the constructor.
|
||||
virtual std::optional<mutation> apply(std::unique_ptr<rjson::value> previous_item, api::timestamp_type ts) = 0;
|
||||
// apply() may be called more than once in case of contention, so it must
|
||||
// not change the state saved in the object (issue #7218 was caused by
|
||||
// violating this). We mark apply() "const" to let the compiler validate
|
||||
// this for us. The output-only field _return_attributes is marked
|
||||
// "mutable" above so that apply() can still write to it.
|
||||
virtual std::optional<mutation> apply(std::unique_ptr<rjson::value> previous_item, api::timestamp_type ts) const = 0;
|
||||
// Convert the above apply() into the signature needed by cas_request:
|
||||
virtual std::optional<mutation> apply(foreign_ptr<lw_shared_ptr<query::result>> qr, const query::partition_slice& slice, api::timestamp_type ts) override;
|
||||
virtual ~rmw_operation() = default;
|
||||
|
||||
@@ -322,8 +322,8 @@ void set_storage_service(http_context& ctx, routes& r) {
|
||||
for (auto cf : column_families) {
|
||||
column_families_vec.push_back(&db.find_column_family(keyspace, cf));
|
||||
}
|
||||
return parallel_for_each(column_families_vec, [&cm] (column_family* cf) {
|
||||
return cm.perform_cleanup(cf);
|
||||
return parallel_for_each(column_families_vec, [&cm, &db] (column_family* cf) {
|
||||
return cm.perform_cleanup(db, cf);
|
||||
});
|
||||
}).then([]{
|
||||
return make_ready_future<json::json_return_type>(0);
|
||||
|
||||
@@ -386,6 +386,7 @@ scylla_tests = set([
|
||||
'test/boost/view_schema_ckey_test',
|
||||
'test/boost/vint_serialization_test',
|
||||
'test/boost/virtual_reader_test',
|
||||
'test/boost/stall_free_test',
|
||||
'test/manual/ec2_snitch_test',
|
||||
'test/manual/gce_snitch_test',
|
||||
'test/manual/gossip',
|
||||
@@ -573,6 +574,7 @@ scylla_core = (['database.cc',
|
||||
'cql3/sets.cc',
|
||||
'cql3/tuples.cc',
|
||||
'cql3/maps.cc',
|
||||
'cql3/values.cc',
|
||||
'cql3/functions/user_function.cc',
|
||||
'cql3/functions/functions.cc',
|
||||
'cql3/functions/aggregate_fcts.cc',
|
||||
|
||||
@@ -267,10 +267,13 @@ public:
|
||||
}
|
||||
};
|
||||
|
||||
/// The same as `impl_max_function_for' but without knowledge of `Type'.
|
||||
/// The same as `impl_max_function_for' but without compile-time dependency on `Type'.
|
||||
class impl_max_dynamic_function final : public aggregate_function::aggregate {
|
||||
data_type _io_type;
|
||||
opt_bytes _max;
|
||||
public:
|
||||
impl_max_dynamic_function(data_type io_type) : _io_type(std::move(io_type)) {}
|
||||
|
||||
virtual void reset() override {
|
||||
_max = {};
|
||||
}
|
||||
@@ -278,12 +281,11 @@ public:
|
||||
return _max.value_or(bytes{});
|
||||
}
|
||||
virtual void add_input(cql_serialization_format sf, const std::vector<opt_bytes>& values) override {
|
||||
if (!values[0]) {
|
||||
if (values.empty() || !values[0]) {
|
||||
return;
|
||||
}
|
||||
const auto val = *values[0];
|
||||
if (!_max || *_max < val) {
|
||||
_max = val;
|
||||
if (!_max || _io_type->less(*_max, *values[0])) {
|
||||
_max = values[0];
|
||||
}
|
||||
}
|
||||
};
|
||||
@@ -298,10 +300,13 @@ public:
|
||||
};
|
||||
|
||||
class max_dynamic_function final : public native_aggregate_function {
|
||||
data_type _io_type;
|
||||
public:
|
||||
max_dynamic_function(data_type io_type) : native_aggregate_function("max", io_type, { io_type }) {}
|
||||
max_dynamic_function(data_type io_type)
|
||||
: native_aggregate_function("max", io_type, { io_type })
|
||||
, _io_type(std::move(io_type)) {}
|
||||
virtual std::unique_ptr<aggregate> new_aggregate() override {
|
||||
return std::make_unique<impl_max_dynamic_function>();
|
||||
return std::make_unique<impl_max_dynamic_function>(_io_type);
|
||||
}
|
||||
};
|
||||
|
||||
@@ -358,10 +363,13 @@ public:
|
||||
}
|
||||
};
|
||||
|
||||
/// The same as `impl_min_function_for' but without knowledge of `Type'.
|
||||
/// The same as `impl_min_function_for' but without compile-time dependency on `Type'.
|
||||
class impl_min_dynamic_function final : public aggregate_function::aggregate {
|
||||
data_type _io_type;
|
||||
opt_bytes _min;
|
||||
public:
|
||||
impl_min_dynamic_function(data_type io_type) : _io_type(std::move(io_type)) {}
|
||||
|
||||
virtual void reset() override {
|
||||
_min = {};
|
||||
}
|
||||
@@ -369,12 +377,11 @@ public:
|
||||
return _min.value_or(bytes{});
|
||||
}
|
||||
virtual void add_input(cql_serialization_format sf, const std::vector<opt_bytes>& values) override {
|
||||
if (!values[0]) {
|
||||
if (values.empty() || !values[0]) {
|
||||
return;
|
||||
}
|
||||
const auto val = *values[0];
|
||||
if (!_min || val < *_min) {
|
||||
_min = val;
|
||||
if (!_min || _io_type->less(*values[0], *_min)) {
|
||||
_min = values[0];
|
||||
}
|
||||
}
|
||||
};
|
||||
@@ -389,10 +396,13 @@ public:
|
||||
};
|
||||
|
||||
class min_dynamic_function final : public native_aggregate_function {
|
||||
data_type _io_type;
|
||||
public:
|
||||
min_dynamic_function(data_type io_type) : native_aggregate_function("min", io_type, { io_type }) {}
|
||||
min_dynamic_function(data_type io_type)
|
||||
: native_aggregate_function("min", io_type, { io_type })
|
||||
, _io_type(std::move(io_type)) {}
|
||||
virtual std::unique_ptr<aggregate> new_aggregate() override {
|
||||
return std::make_unique<impl_min_dynamic_function>();
|
||||
return std::make_unique<impl_min_dynamic_function>(_io_type);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -88,16 +88,13 @@ static data_value castas_fctn_simple(data_value from) {
|
||||
template<typename ToType>
|
||||
static data_value castas_fctn_from_decimal_to_float(data_value from) {
|
||||
auto val_from = value_cast<big_decimal>(from);
|
||||
boost::multiprecision::cpp_int ten(10);
|
||||
boost::multiprecision::cpp_rational r = val_from.unscaled_value();
|
||||
r /= boost::multiprecision::pow(ten, val_from.scale());
|
||||
return static_cast<ToType>(r);
|
||||
return static_cast<ToType>(val_from.as_rational());
|
||||
}
|
||||
|
||||
static utils::multiprecision_int from_decimal_to_cppint(const data_value& from) {
|
||||
const auto& val_from = value_cast<big_decimal>(from);
|
||||
boost::multiprecision::cpp_int ten(10);
|
||||
return boost::multiprecision::cpp_int(val_from.unscaled_value() / boost::multiprecision::pow(ten, val_from.scale()));
|
||||
auto r = val_from.as_rational();
|
||||
return utils::multiprecision_int(numerator(r)/denominator(r));
|
||||
}
|
||||
|
||||
template<typename ToType>
|
||||
|
||||
@@ -445,7 +445,7 @@ function_call::bind_and_get(const query_options& options) {
|
||||
buffers.push_back(std::move(to_bytes_opt(val)));
|
||||
}
|
||||
auto result = execute_internal(options.get_cql_serialization_format(), *_fun, std::move(buffers));
|
||||
return options.make_temporary(cql3::raw_value::make_value(result));
|
||||
return cql3::raw_value_view::make_temporary(cql3::raw_value::make_value(result));
|
||||
}
|
||||
|
||||
bytes_opt
|
||||
|
||||
@@ -357,7 +357,12 @@ lists::setter_by_uuid::execute(mutation& m, const clustering_key_prefix& prefix,
|
||||
|
||||
collection_mutation_description mut;
|
||||
mut.cells.reserve(1);
|
||||
mut.cells.emplace_back(to_bytes(*index), params.make_cell(*ltype->value_comparator(), *value, atomic_cell::collection_member::yes));
|
||||
|
||||
if (!value) {
|
||||
mut.cells.emplace_back(to_bytes(*index), params.make_dead_cell());
|
||||
} else {
|
||||
mut.cells.emplace_back(to_bytes(*index), params.make_cell(*ltype->value_comparator(), *value, atomic_cell::collection_member::yes));
|
||||
}
|
||||
|
||||
m.set_cell(prefix, column, mut.serialize(*ltype));
|
||||
}
|
||||
|
||||
@@ -161,17 +161,6 @@ query_options::query_options(std::vector<cql3::raw_value> values)
|
||||
db::consistency_level::ONE, infinite_timeout_config, std::move(values))
|
||||
{}
|
||||
|
||||
cql3::raw_value_view query_options::make_temporary(cql3::raw_value value) const
|
||||
{
|
||||
if (value) {
|
||||
auto value_view = *value;
|
||||
auto ptr = _temporaries.write_place_holder(value_view.size());
|
||||
std::copy_n(value_view.data(), value_view.size(), ptr);
|
||||
return cql3::raw_value_view::make_value(fragmented_temporary_buffer::view(bytes_view{ptr, value_view.size()}));
|
||||
}
|
||||
return cql3::raw_value_view::make_null();
|
||||
}
|
||||
|
||||
bytes_view query_options::linearize(fragmented_temporary_buffer::view view) const
|
||||
{
|
||||
if (view.empty()) {
|
||||
|
||||
@@ -178,7 +178,6 @@ public:
|
||||
return _value_views.size();
|
||||
}
|
||||
|
||||
cql3::raw_value_view make_temporary(cql3::raw_value value) const;
|
||||
bytes_view linearize(fragmented_temporary_buffer::view) const;
|
||||
|
||||
bool skip_metadata() const {
|
||||
|
||||
@@ -417,7 +417,7 @@ std::vector<const column_definition*> statement_restrictions::get_column_defs_fo
|
||||
_clustering_columns_restrictions->num_prefix_columns_that_need_not_be_filtered();
|
||||
for (auto&& cdef : _clustering_columns_restrictions->get_column_defs()) {
|
||||
::shared_ptr<single_column_restriction> restr;
|
||||
if (single_pk_restrs) {
|
||||
if (single_ck_restrs) {
|
||||
auto it = single_ck_restrs->restrictions().find(cdef);
|
||||
if (it != single_ck_restrs->restrictions().end()) {
|
||||
restr = dynamic_pointer_cast<single_column_restriction>(it->second);
|
||||
@@ -688,6 +688,11 @@ static query::range<bytes_view> to_range(const term_slice& slice, const query_op
|
||||
extract_bound(statements::bound::END));
|
||||
}
|
||||
|
||||
static bool contains_without_wraparound(
|
||||
const query::range<bytes_view>& range, bytes_view value, const serialized_tri_compare& cmp) {
|
||||
return !range.is_wrap_around(cmp) && range.contains(value, cmp);
|
||||
}
|
||||
|
||||
bool single_column_restriction::slice::is_satisfied_by(const schema& schema,
|
||||
const partition_key& key,
|
||||
const clustering_key_prefix& ckey,
|
||||
@@ -702,13 +707,13 @@ bool single_column_restriction::slice::is_satisfied_by(const schema& schema,
|
||||
return false;
|
||||
}
|
||||
return cell_value->with_linearized([&] (bytes_view cell_value_bv) {
|
||||
return to_range(_slice, options, _column_def.name_as_text()).contains(
|
||||
return contains_without_wraparound(to_range(_slice, options, _column_def.name_as_text()),
|
||||
cell_value_bv, _column_def.type->as_tri_comparator());
|
||||
});
|
||||
}
|
||||
|
||||
bool single_column_restriction::slice::is_satisfied_by(bytes_view data, const query_options& options) const {
|
||||
return to_range(_slice, options, _column_def.name_as_text()).contains(
|
||||
return contains_without_wraparound(to_range(_slice, options, _column_def.name_as_text()),
|
||||
data, _column_def.type->underlying_type()->as_tri_comparator());
|
||||
}
|
||||
|
||||
|
||||
@@ -207,6 +207,9 @@ void alter_table_statement::add_column(const schema& schema, const table& cf, sc
|
||||
"because a collection with the same name and a different type has already been used in the past", column_name));
|
||||
}
|
||||
}
|
||||
if (type->is_counter() && !schema.is_counter()) {
|
||||
throw exceptions::configuration_exception(format("Cannot add a counter column ({}) in a non counter column family", column_name));
|
||||
}
|
||||
|
||||
cfm.with_column(column_name.name(), type, is_static ? column_kind::static_column : column_kind::regular_column);
|
||||
|
||||
@@ -222,7 +225,7 @@ void alter_table_statement::add_column(const schema& schema, const table& cf, sc
|
||||
schema_builder builder(view);
|
||||
if (view->view_info()->include_all_columns()) {
|
||||
builder.with_column(column_name.name(), type);
|
||||
} else if (view->view_info()->base_non_pk_columns_in_view_pk().empty()) {
|
||||
} else if (!view->view_info()->has_base_non_pk_columns_in_view_pk()) {
|
||||
db::view::create_virtual_column(builder, column_name.name(), type);
|
||||
}
|
||||
view_updates.push_back(view_ptr(builder.build()));
|
||||
|
||||
@@ -190,7 +190,7 @@ public:
|
||||
virtual cql3::raw_value get(const query_options& options) = 0;
|
||||
|
||||
virtual cql3::raw_value_view bind_and_get(const query_options& options) override {
|
||||
return options.make_temporary(get(options));
|
||||
return raw_value_view::make_temporary(get(options));
|
||||
}
|
||||
|
||||
virtual sstring to_string() const = 0;
|
||||
@@ -227,7 +227,7 @@ public:
|
||||
virtual cql3::raw_value_view bind_and_get(const query_options& options) override {
|
||||
auto t = bind(options);
|
||||
if (t) {
|
||||
return options.make_temporary(t->get(options));
|
||||
return cql3::raw_value_view::make_temporary(t->get(options));
|
||||
}
|
||||
return cql3::raw_value_view::make_null();
|
||||
};
|
||||
|
||||
@@ -184,7 +184,7 @@ public:
|
||||
|
||||
virtual cql3::raw_value_view bind_and_get(const query_options& options) override {
|
||||
// We don't "need" that override but it saves us the allocation of a Value object if used
|
||||
return options.make_temporary(cql3::raw_value::make_value(_type->build_value(bind_internal(options))));
|
||||
return cql3::raw_value_view::make_temporary(cql3::raw_value::make_value(_type->build_value(bind_internal(options))));
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -227,7 +227,7 @@ shared_ptr<terminal> user_types::delayed_value::bind(const query_options& option
|
||||
}
|
||||
|
||||
cql3::raw_value_view user_types::delayed_value::bind_and_get(const query_options& options) {
|
||||
return options.make_temporary(cql3::raw_value::make_value(user_type_impl::build_value(bind_internal(options))));
|
||||
return cql3::raw_value_view::make_temporary(cql3::raw_value::make_value(user_type_impl::build_value(bind_internal(options))));
|
||||
}
|
||||
|
||||
shared_ptr<terminal> user_types::marker::bind(const query_options& options) {
|
||||
|
||||
70
cql3/values.cc
Normal file
70
cql3/values.cc
Normal file
@@ -0,0 +1,70 @@
|
||||
/*
|
||||
* Copyright (C) 2020 ScyllaDB
|
||||
*/
|
||||
|
||||
/*
|
||||
* This file is part of Scylla.
|
||||
*
|
||||
* Scylla is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Affero General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* Scylla is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#include "cql3/values.hh"
|
||||
|
||||
namespace cql3 {
|
||||
|
||||
std::ostream& operator<<(std::ostream& os, const raw_value_view& value) {
|
||||
seastar::visit(value._data, [&] (fragmented_temporary_buffer::view v) {
|
||||
os << "{ value: ";
|
||||
using boost::range::for_each;
|
||||
for_each(v, [&os] (bytes_view bv) { os << bv; });
|
||||
os << " }";
|
||||
}, [&] (null_value) {
|
||||
os << "{ null }";
|
||||
}, [&] (unset_value) {
|
||||
os << "{ unset }";
|
||||
});
|
||||
return os;
|
||||
}
|
||||
|
||||
raw_value_view raw_value::to_view() const {
|
||||
switch (_data.index()) {
|
||||
case 0: return raw_value_view::make_value(fragmented_temporary_buffer::view(bytes_view{std::get<bytes>(_data)}));
|
||||
case 1: return raw_value_view::make_null();
|
||||
default: return raw_value_view::make_unset_value();
|
||||
}
|
||||
}
|
||||
|
||||
raw_value raw_value::make_value(const raw_value_view& view) {
|
||||
if (view.is_null()) {
|
||||
return make_null();
|
||||
}
|
||||
if (view.is_unset_value()) {
|
||||
return make_unset_value();
|
||||
}
|
||||
return make_value(linearized(*view));
|
||||
}
|
||||
|
||||
raw_value_view raw_value_view::make_temporary(raw_value&& value) {
|
||||
if (!value) {
|
||||
return raw_value_view::make_null();
|
||||
}
|
||||
return raw_value_view(std::move(value).extract_value());
|
||||
}
|
||||
|
||||
raw_value_view::raw_value_view(bytes&& tmp) {
|
||||
_temporary_storage = make_lw_shared<bytes>(std::move(tmp));
|
||||
_data = fragmented_temporary_buffer::view(bytes_view(*_temporary_storage));
|
||||
}
|
||||
|
||||
}
|
||||
@@ -39,11 +39,22 @@ struct null_value {
|
||||
struct unset_value {
|
||||
};
|
||||
|
||||
class raw_value;
|
||||
/// \brief View to a raw CQL protocol value.
|
||||
///
|
||||
/// \see raw_value
|
||||
struct raw_value_view {
|
||||
std::variant<fragmented_temporary_buffer::view, null_value, unset_value> _data;
|
||||
// Temporary storage is only useful if a raw_value_view needs to be instantiated
|
||||
// with a value which lifetime is bounded only to the view itself.
|
||||
// This hack is introduced in order to avoid storing temporary storage
|
||||
// in an external container, which may cause memory leaking problems.
|
||||
// This pointer is disengaged for regular raw_value_view instances.
|
||||
// Data is stored in a shared pointer for two reasons:
|
||||
// - pointers are cheap to copy
|
||||
// - it makes the view keep its semantics - it's safe to copy a view multiple times
|
||||
// and all copies still refer to the same underlying data.
|
||||
lw_shared_ptr<bytes> _temporary_storage = nullptr;
|
||||
|
||||
raw_value_view(null_value&& data)
|
||||
: _data{std::move(data)}
|
||||
@@ -54,6 +65,9 @@ struct raw_value_view {
|
||||
raw_value_view(fragmented_temporary_buffer::view data)
|
||||
: _data{data}
|
||||
{}
|
||||
// This constructor is only used by make_temporary() and it acquires ownership
|
||||
// of the given buffer. The view created that way refers to its own temporary storage.
|
||||
explicit raw_value_view(bytes&& temporary_storage);
|
||||
public:
|
||||
static raw_value_view make_null() {
|
||||
return raw_value_view{std::move(null_value{})};
|
||||
@@ -64,6 +78,7 @@ public:
|
||||
static raw_value_view make_value(fragmented_temporary_buffer::view view) {
|
||||
return raw_value_view{view};
|
||||
}
|
||||
static raw_value_view make_temporary(raw_value&& value);
|
||||
bool is_null() const {
|
||||
return std::holds_alternative<null_value>(_data);
|
||||
}
|
||||
@@ -102,19 +117,7 @@ public:
|
||||
return !(*this == other);
|
||||
}
|
||||
|
||||
friend std::ostream& operator<<(std::ostream& os, const raw_value_view& value) {
|
||||
seastar::visit(value._data, [&] (fragmented_temporary_buffer::view v) {
|
||||
os << "{ value: ";
|
||||
using boost::range::for_each;
|
||||
for_each(v, [&os] (bytes_view bv) { os << bv; });
|
||||
os << " }";
|
||||
}, [&] (null_value) {
|
||||
os << "{ null }";
|
||||
}, [&] (unset_value) {
|
||||
os << "{ unset }";
|
||||
});
|
||||
return os;
|
||||
}
|
||||
friend std::ostream& operator<<(std::ostream& os, const raw_value_view& value);
|
||||
};
|
||||
|
||||
/// \brief Raw CQL protocol value.
|
||||
@@ -144,15 +147,7 @@ public:
|
||||
static raw_value make_unset_value() {
|
||||
return raw_value{std::move(unset_value{})};
|
||||
}
|
||||
static raw_value make_value(const raw_value_view& view) {
|
||||
if (view.is_null()) {
|
||||
return make_null();
|
||||
}
|
||||
if (view.is_unset_value()) {
|
||||
return make_unset_value();
|
||||
}
|
||||
return make_value(linearized(*view));
|
||||
}
|
||||
static raw_value make_value(const raw_value_view& view);
|
||||
static raw_value make_value(bytes&& bytes) {
|
||||
return raw_value{std::move(bytes)};
|
||||
}
|
||||
@@ -189,13 +184,12 @@ public:
|
||||
const bytes& operator*() const {
|
||||
return std::get<bytes>(_data);
|
||||
}
|
||||
raw_value_view to_view() const {
|
||||
switch (_data.index()) {
|
||||
case 0: return raw_value_view::make_value(fragmented_temporary_buffer::view(bytes_view{std::get<bytes>(_data)}));
|
||||
case 1: return raw_value_view::make_null();
|
||||
default: return raw_value_view::make_unset_value();
|
||||
}
|
||||
bytes&& extract_value() && {
|
||||
auto b = std::get_if<bytes>(&_data);
|
||||
assert(b);
|
||||
return std::move(*b);
|
||||
}
|
||||
raw_value_view to_view() const;
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
37
database.cc
37
database.cc
@@ -552,14 +552,14 @@ database::~database() {
|
||||
}
|
||||
|
||||
void database::update_version(const utils::UUID& version) {
|
||||
if (_version != version) {
|
||||
if (_version.get() != version) {
|
||||
_schema_change_count++;
|
||||
}
|
||||
_version = version;
|
||||
_version.set(version);
|
||||
}
|
||||
|
||||
const utils::UUID& database::get_version() const {
|
||||
return _version;
|
||||
return _version.get();
|
||||
}
|
||||
|
||||
static future<>
|
||||
@@ -1851,7 +1851,11 @@ future<> database::truncate(const keyspace& ks, column_family& cf, timestamp_fun
|
||||
// TODO: indexes.
|
||||
// Note: since discard_sstables was changed to only count tables owned by this shard,
|
||||
// we can get zero rp back. Changed assert, and ensure we save at least low_mark.
|
||||
assert(low_mark <= rp || rp == db::replay_position());
|
||||
// #6995 - the assert below was broken in c2c6c71 and remained so for many years.
|
||||
// We nowadays do not flush tables with sstables but autosnapshot=false. This means
|
||||
// the low_mark assertion does not hold, because we maybe/probably never got around to
|
||||
// creating the sstables that would create them.
|
||||
assert(!should_flush || low_mark <= rp || rp == db::replay_position());
|
||||
rp = std::max(low_mark, rp);
|
||||
return truncate_views(cf, truncated_at, should_flush).then([&cf, truncated_at, rp] {
|
||||
// save_truncation_record() may actually fail after we cached the truncation time
|
||||
@@ -1957,31 +1961,6 @@ future<> database::clear_snapshot(sstring tag, std::vector<sstring> keyspace_nam
|
||||
});
|
||||
}
|
||||
|
||||
future<utils::UUID> update_schema_version(distributed<service::storage_proxy>& proxy, schema_features features)
|
||||
{
|
||||
return db::schema_tables::calculate_schema_digest(proxy, features).then([&proxy] (utils::UUID uuid) {
|
||||
return proxy.local().get_db().invoke_on_all([uuid] (database& db) {
|
||||
db.update_version(uuid);
|
||||
}).then([uuid] {
|
||||
return db::system_keyspace::update_schema_version(uuid);
|
||||
}).then([uuid] {
|
||||
dblog.info("Schema version changed to {}", uuid);
|
||||
return uuid;
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
future<> announce_schema_version(utils::UUID schema_version) {
|
||||
return service::migration_manager::passive_announce(schema_version);
|
||||
}
|
||||
|
||||
future<> update_schema_version_and_announce(distributed<service::storage_proxy>& proxy, schema_features features)
|
||||
{
|
||||
return update_schema_version(proxy, features).then([] (utils::UUID uuid) {
|
||||
return announce_schema_version(uuid);
|
||||
});
|
||||
}
|
||||
|
||||
std::ostream& operator<<(std::ostream& os, const user_types_metadata& m) {
|
||||
os << "org.apache.cassandra.config.UTMetaData@" << &m;
|
||||
return os;
|
||||
|
||||
35
database.hh
35
database.hh
@@ -55,6 +55,7 @@
|
||||
#include <limits>
|
||||
#include <cstddef>
|
||||
#include "schema_fwd.hh"
|
||||
#include "db/view/view.hh"
|
||||
#include "db/schema_features.hh"
|
||||
#include "gms/feature.hh"
|
||||
#include "timestamp.hh"
|
||||
@@ -95,6 +96,7 @@
|
||||
#include "user_types_metadata.hh"
|
||||
#include "query_class_config.hh"
|
||||
#include "absl-flat_hash_map.hh"
|
||||
#include "utils/updateable_value.hh"
|
||||
|
||||
class cell_locker;
|
||||
class cell_locker_stats;
|
||||
@@ -901,7 +903,7 @@ public:
|
||||
lw_shared_ptr<const sstable_list> get_sstables_including_compacted_undeleted() const;
|
||||
const std::vector<sstables::shared_sstable>& compacted_undeleted_sstables() const;
|
||||
std::vector<sstables::shared_sstable> select_sstables(const dht::partition_range& range) const;
|
||||
std::vector<sstables::shared_sstable> candidates_for_compaction() const;
|
||||
std::vector<sstables::shared_sstable> non_staging_sstables() const;
|
||||
std::vector<sstables::shared_sstable> sstables_need_rewrite() const;
|
||||
size_t sstables_count() const;
|
||||
std::vector<uint64_t> sstable_count_per_level() const;
|
||||
@@ -1008,8 +1010,9 @@ public:
|
||||
return *_config.sstables_manager;
|
||||
}
|
||||
|
||||
// Reader's schema must be the same as the base schema of each of the views.
|
||||
future<> populate_views(
|
||||
std::vector<view_ptr>,
|
||||
std::vector<db::view::view_and_base>,
|
||||
dht::token base_token,
|
||||
flat_mutation_reader&&,
|
||||
gc_clock::time_point);
|
||||
@@ -1027,7 +1030,7 @@ private:
|
||||
tracing::trace_state_ptr tr_state, reader_concurrency_semaphore& sem, const io_priority_class& io_priority, query::partition_slice::option_set custom_opts) const;
|
||||
std::vector<view_ptr> affected_views(const schema_ptr& base, const mutation& update, gc_clock::time_point now) const;
|
||||
future<> generate_and_propagate_view_updates(const schema_ptr& base,
|
||||
std::vector<view_ptr>&& views,
|
||||
std::vector<db::view::view_and_base>&& views,
|
||||
mutation&& m,
|
||||
flat_mutation_reader_opt existings,
|
||||
tracing::trace_state_ptr tr_state,
|
||||
@@ -1099,6 +1102,10 @@ flat_mutation_reader make_local_shard_sstable_reader(schema_ptr s,
|
||||
mutation_reader::forwarding fwd_mr,
|
||||
sstables::read_monitor_generator& monitor_generator = sstables::default_read_monitor_generator());
|
||||
|
||||
/// Read a range from the passed-in sstables.
|
||||
///
|
||||
/// The reader is unrestricted, but will account its resource usage on the
|
||||
/// semaphore belonging to the passed-in permit.
|
||||
flat_mutation_reader make_range_sstable_reader(schema_ptr s,
|
||||
reader_permit permit,
|
||||
lw_shared_ptr<sstables::sstable_set> sstables,
|
||||
@@ -1110,6 +1117,21 @@ flat_mutation_reader make_range_sstable_reader(schema_ptr s,
|
||||
mutation_reader::forwarding fwd_mr,
|
||||
sstables::read_monitor_generator& monitor_generator = sstables::default_read_monitor_generator());
|
||||
|
||||
/// Read a range from the passed-in sstables.
|
||||
///
|
||||
/// The reader is restricted, that is it will wait for admission on the semaphore
|
||||
/// belonging to the passed-in permit, before starting to read.
|
||||
flat_mutation_reader make_restricted_range_sstable_reader(schema_ptr s,
|
||||
reader_permit permit,
|
||||
lw_shared_ptr<sstables::sstable_set> sstables,
|
||||
const dht::partition_range& pr,
|
||||
const query::partition_slice& slice,
|
||||
const io_priority_class& pc,
|
||||
tracing::trace_state_ptr trace_state,
|
||||
streamed_mutation::forwarding fwd,
|
||||
mutation_reader::forwarding fwd_mr,
|
||||
sstables::read_monitor_generator& monitor_generator = sstables::default_read_monitor_generator());
|
||||
|
||||
class user_types_metadata;
|
||||
|
||||
class keyspace_metadata final {
|
||||
@@ -1376,7 +1398,7 @@ private:
|
||||
flat_hash_map<std::pair<sstring, sstring>, utils::UUID, utils::tuple_hash, string_pair_eq>;
|
||||
ks_cf_to_uuid_t _ks_cf_to_uuid;
|
||||
std::unique_ptr<db::commitlog> _commitlog;
|
||||
utils::UUID _version;
|
||||
utils::updateable_value_source<utils::UUID> _version;
|
||||
uint32_t _schema_change_count = 0;
|
||||
// compaction_manager object is referenced by all column families of a database.
|
||||
std::unique_ptr<compaction_manager> _compaction_manager;
|
||||
@@ -1447,6 +1469,7 @@ public:
|
||||
void update_version(const utils::UUID& version);
|
||||
|
||||
const utils::UUID& get_version() const;
|
||||
utils::observable<utils::UUID>& observable_schema_version() const { return _version.as_observable(); }
|
||||
|
||||
db::commitlog* commitlog() const {
|
||||
return _commitlog.get();
|
||||
@@ -1664,10 +1687,6 @@ future<> stop_database(sharded<database>& db);
|
||||
flat_mutation_reader make_multishard_streaming_reader(distributed<database>& db, schema_ptr schema,
|
||||
std::function<std::optional<dht::partition_range>()> range_generator);
|
||||
|
||||
future<utils::UUID> update_schema_version(distributed<service::storage_proxy>& proxy, db::schema_features);
|
||||
future<> announce_schema_version(utils::UUID schema_version);
|
||||
future<> update_schema_version_and_announce(distributed<service::storage_proxy>& proxy, db::schema_features);
|
||||
|
||||
bool is_internal_keyspace(const sstring& name);
|
||||
|
||||
#endif /* DATABASE_HH_ */
|
||||
|
||||
@@ -290,7 +290,7 @@ future<> db::batchlog_manager::replay_all_failed_batches() {
|
||||
mutation m(schema, key);
|
||||
auto now = service::client_state(service::client_state::internal_tag()).get_timestamp();
|
||||
m.partition().apply_delete(*schema, clustering_key_prefix::make_empty(), tombstone(now, gc_clock::now()));
|
||||
return _qp.proxy().mutate_locally(m, tracing::trace_state_ptr());
|
||||
return _qp.proxy().mutate_locally(m, tracing::trace_state_ptr(), db::commitlog::force_sync::no);
|
||||
});
|
||||
};
|
||||
|
||||
|
||||
@@ -521,7 +521,7 @@ public:
|
||||
_segment_manager->totals.total_size_on_disk -= size_on_disk();
|
||||
_segment_manager->totals.total_size -= (size_on_disk() + _buffer.size_bytes());
|
||||
_segment_manager->add_file_to_delete(_file_name, _desc);
|
||||
} else {
|
||||
} else if (_segment_manager->cfg.warn_about_segments_left_on_disk_after_shutdown) {
|
||||
clogger.warn("Segment {} is dirty and is left on disk.", *this);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -137,6 +137,7 @@ public:
|
||||
|
||||
bool reuse_segments = true;
|
||||
bool use_o_dsync = false;
|
||||
bool warn_about_segments_left_on_disk_after_shutdown = true;
|
||||
|
||||
const db::extensions * extensions = nullptr;
|
||||
};
|
||||
|
||||
@@ -304,7 +304,7 @@ future<> db::commitlog_replayer::impl::process(stats* s, commitlog::buffer_and_r
|
||||
mutation m(cf.schema(), fm.decorated_key(*cf.schema()));
|
||||
converting_mutation_partition_applier v(cm, *cf.schema(), m.partition());
|
||||
fm.partition().accept(cm, v);
|
||||
return do_with(std::move(m), [&db, &cf] (mutation m) {
|
||||
return do_with(std::move(m), [&db, &cf] (const mutation& m) {
|
||||
return db.apply_in_memory(m, cf, db::rp_handle(), db::no_timeout);
|
||||
});
|
||||
} else {
|
||||
|
||||
11
db/config.cc
11
db/config.cc
@@ -662,6 +662,15 @@ db::config::config(std::shared_ptr<db::extensions> exts)
|
||||
"\tpriority_string : GnuTLS priority string controlling TLS algorithms used/allowed.\n"
|
||||
"\trequire_client_auth : (Default: false ) Enables or disables certificate authentication.\n"
|
||||
"Related information: Client-to-node encryption")
|
||||
, alternator_encryption_options(this, "alternator_encryption_options", value_status::Used, {/*none*/},
|
||||
"When Alternator via HTTPS is enabled with alternator_https_port, where to take the key and certificate. The available options are:\n"
|
||||
"\n"
|
||||
"\tcertificate: (Default: conf/scylla.crt) The location of a PEM-encoded x509 certificate used to identify and encrypt the client/server communication.\n"
|
||||
"\tkeyfile: (Default: conf/scylla.key) PEM Key file associated with certificate.\n"
|
||||
"\n"
|
||||
"The advanced settings are:\n"
|
||||
"\n"
|
||||
"\tpriority_string : GnuTLS priority string controlling TLS algorithms used/allowed.")
|
||||
, ssl_storage_port(this, "ssl_storage_port", value_status::Used, 7001,
|
||||
"The SSL port for encrypted communication. Unused unless enabled in encryption_options.")
|
||||
, enable_in_memory_data_store(this, "enable_in_memory_data_store", value_status::Used, false, "Enable in memory mode (system tables are always persisted)")
|
||||
@@ -681,7 +690,7 @@ db::config::config(std::shared_ptr<db::extensions> exts)
|
||||
, replace_address(this, "replace_address", value_status::Used, "", "The listen_address or broadcast_address of the dead node to replace. Same as -Dcassandra.replace_address.")
|
||||
, replace_address_first_boot(this, "replace_address_first_boot", value_status::Used, "", "Like replace_address option, but if the node has been bootstrapped successfully it will be ignored. Same as -Dcassandra.replace_address_first_boot.")
|
||||
, override_decommission(this, "override_decommission", value_status::Used, false, "Set true to force a decommissioned node to join the cluster")
|
||||
, enable_repair_based_node_ops(this, "enable_repair_based_node_ops", liveness::LiveUpdate, value_status::Used, true, "Set true to use enable repair based node operations instead of streaming based")
|
||||
, enable_repair_based_node_ops(this, "enable_repair_based_node_ops", liveness::LiveUpdate, value_status::Used, false, "Set true to use enable repair based node operations instead of streaming based")
|
||||
, ring_delay_ms(this, "ring_delay_ms", value_status::Used, 30 * 1000, "Time a node waits to hear from other nodes before joining the ring in milliseconds. Same as -Dcassandra.ring_delay_ms in cassandra.")
|
||||
, shadow_round_ms(this, "shadow_round_ms", value_status::Used, 300 * 1000, "The maximum gossip shadow round time. Can be used to reduce the gossip feature check time during node boot up.")
|
||||
, fd_max_interval_ms(this, "fd_max_interval_ms", value_status::Used, 2 * 1000, "The maximum failure_detector interval time in milliseconds. Interval larger than the maximum will be ignored. Larger cluster may need to increase the default.")
|
||||
|
||||
@@ -252,6 +252,7 @@ public:
|
||||
named_value<uint32_t> permissions_cache_max_entries;
|
||||
named_value<string_map> server_encryption_options;
|
||||
named_value<string_map> client_encryption_options;
|
||||
named_value<string_map> alternator_encryption_options;
|
||||
named_value<uint32_t> ssl_storage_port;
|
||||
named_value<bool> enable_in_memory_data_store;
|
||||
named_value<bool> enable_cache;
|
||||
|
||||
@@ -224,7 +224,9 @@ future<> manager::end_point_hints_manager::stop(drain should_drain) noexcept {
|
||||
with_lock(file_update_mutex(), [this] {
|
||||
if (_hints_store_anchor) {
|
||||
hints_store_ptr tmp = std::exchange(_hints_store_anchor, nullptr);
|
||||
return tmp->shutdown().finally([tmp] {});
|
||||
return tmp->shutdown().finally([tmp] {
|
||||
return tmp->release();
|
||||
}).finally([tmp] {});
|
||||
}
|
||||
return make_ready_future<>();
|
||||
}).handle_exception([&eptr] (auto e) { eptr = std::move(e); }).get();
|
||||
@@ -290,7 +292,7 @@ inline bool manager::have_ep_manager(ep_key_type ep) const noexcept {
|
||||
}
|
||||
|
||||
bool manager::store_hint(ep_key_type ep, schema_ptr s, lw_shared_ptr<const frozen_mutation> fm, tracing::trace_state_ptr tr_state) noexcept {
|
||||
if (stopping() || !started() || !can_hint_for(ep)) {
|
||||
if (stopping() || draining_all() || !started() || !can_hint_for(ep)) {
|
||||
manager_logger.trace("Can't store a hint to {}", ep);
|
||||
++_stats.dropped;
|
||||
return false;
|
||||
@@ -326,6 +328,10 @@ future<db::commitlog> manager::end_point_hints_manager::add_store() noexcept {
|
||||
// HH doesn't utilize the flow that benefits from reusing segments.
|
||||
// Therefore let's simply disable it to avoid any possible confusion.
|
||||
cfg.reuse_segments = false;
|
||||
// HH leaves segments on disk after commitlog shutdown, and later reads
|
||||
// them when commitlog is re-created. This is expected to happen regularly
|
||||
// during standard HH workload, so no need to print a warning about it.
|
||||
cfg.warn_about_segments_left_on_disk_after_shutdown = false;
|
||||
|
||||
return commitlog::create_commitlog(std::move(cfg)).then([this] (commitlog l) {
|
||||
// add_store() is triggered every time hint files are forcefully flushed to I/O (every hints_flush_period).
|
||||
@@ -352,7 +358,9 @@ future<> manager::end_point_hints_manager::flush_current_hints() noexcept {
|
||||
return futurize_invoke([this] {
|
||||
return with_lock(file_update_mutex(), [this]() -> future<> {
|
||||
return get_or_load().then([] (hints_store_ptr cptr) {
|
||||
return cptr->shutdown();
|
||||
return cptr->shutdown().finally([cptr] {
|
||||
return cptr->release();
|
||||
}).finally([cptr] {});
|
||||
}).then([this] {
|
||||
// Un-hold the commitlog object. Since we are under the exclusive _file_update_mutex lock there are no
|
||||
// other hints_store_ptr copies and this would destroy the commitlog shared value.
|
||||
@@ -529,7 +537,7 @@ bool manager::check_dc_for(ep_key_type ep) const noexcept {
|
||||
}
|
||||
|
||||
void manager::drain_for(gms::inet_address endpoint) {
|
||||
if (stopping()) {
|
||||
if (stopping() || draining_all()) {
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -540,6 +548,7 @@ void manager::drain_for(gms::inet_address endpoint) {
|
||||
return with_semaphore(drain_lock(), 1, [this, endpoint] {
|
||||
return futurize_invoke([this, endpoint] () {
|
||||
if (utils::fb_utilities::is_me(endpoint)) {
|
||||
set_draining_all();
|
||||
return parallel_for_each(_ep_managers, [] (auto& pair) {
|
||||
return pair.second.stop(drain::yes).finally([&pair] {
|
||||
return with_file_update_mutex(pair.second, [&pair] {
|
||||
|
||||
@@ -424,12 +424,14 @@ public:
|
||||
enum class state {
|
||||
started, // hinting is currently allowed (start() call is complete)
|
||||
replay_allowed, // replaying (hints sending) is allowed
|
||||
draining_all, // hinting is not allowed - all ep managers are being stopped because this node is leaving the cluster
|
||||
stopping // hinting is not allowed - stopping is in progress (stop() method has been called)
|
||||
};
|
||||
|
||||
using state_set = enum_set<super_enum<state,
|
||||
state::started,
|
||||
state::replay_allowed,
|
||||
state::draining_all,
|
||||
state::stopping>>;
|
||||
|
||||
private:
|
||||
@@ -690,6 +692,14 @@ private:
|
||||
return _state.contains(state::replay_allowed);
|
||||
}
|
||||
|
||||
void set_draining_all() noexcept {
|
||||
_state.set(state::draining_all);
|
||||
}
|
||||
|
||||
bool draining_all() noexcept {
|
||||
return _state.contains(state::draining_all);
|
||||
}
|
||||
|
||||
public:
|
||||
ep_managers_map_type::iterator find_ep_manager(ep_key_type ep_key) noexcept {
|
||||
return _ep_managers.find(ep_key);
|
||||
|
||||
@@ -801,6 +801,19 @@ future<> merge_unlock() {
|
||||
return smp::submit_to(0, [] { the_merge_lock.signal(); });
|
||||
}
|
||||
|
||||
static
|
||||
future<> update_schema_version_and_announce(distributed<service::storage_proxy>& proxy, schema_features features) {
|
||||
return calculate_schema_digest(proxy, features).then([&proxy] (utils::UUID uuid) {
|
||||
return db::system_keyspace::update_schema_version(uuid).then([&proxy, uuid] {
|
||||
return proxy.local().get_db().invoke_on_all([uuid] (database& db) {
|
||||
db.update_version(uuid);
|
||||
});
|
||||
}).then([uuid] {
|
||||
slogger.info("Schema version changed to {}", uuid);
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* Merge remote schema in form of mutations with local and mutate ks/cf metadata objects
|
||||
* (which also involves fs operations on add/drop ks/cf)
|
||||
@@ -821,6 +834,14 @@ future<> merge_schema(distributed<service::storage_proxy>& proxy, gms::feature_s
|
||||
});
|
||||
}
|
||||
|
||||
future<> recalculate_schema_version(distributed<service::storage_proxy>& proxy, gms::feature_service& feat) {
|
||||
return merge_lock().then([&proxy, &feat] {
|
||||
return update_schema_version_and_announce(proxy, feat.cluster_schema_features());
|
||||
}).finally([] {
|
||||
return merge_unlock();
|
||||
});
|
||||
}
|
||||
|
||||
future<> merge_schema(distributed<service::storage_proxy>& proxy, std::vector<mutation> mutations, bool do_flush)
|
||||
{
|
||||
return merge_lock().then([&proxy, mutations = std::move(mutations), do_flush] () mutable {
|
||||
@@ -2904,10 +2925,6 @@ future<> maybe_update_legacy_secondary_index_mv_schema(service::migration_manage
|
||||
// format, where "token" is not marked as computed. Once we're sure that all indexes have their
|
||||
// columns marked as computed (because they were either created on a node that supports computed
|
||||
// columns or were fixed by this utility function), it's safe to remove this function altogether.
|
||||
if (!db.features().cluster_supports_computed_columns()) {
|
||||
return make_ready_future<>();
|
||||
}
|
||||
|
||||
if (v->clustering_key_size() == 0) {
|
||||
return make_ready_future<>();
|
||||
}
|
||||
|
||||
@@ -170,6 +170,13 @@ future<> merge_schema(distributed<service::storage_proxy>& proxy, gms::feature_s
|
||||
|
||||
future<> merge_schema(distributed<service::storage_proxy>& proxy, std::vector<mutation> mutations, bool do_flush);
|
||||
|
||||
// Recalculates the local schema version.
|
||||
//
|
||||
// It is safe to call concurrently with recalculate_schema_version() and merge_schema() in which case it
|
||||
// is guaranteed that the schema version we end up with after all calls will reflect the most recent state
|
||||
// of feature_service and schema tables.
|
||||
future<> recalculate_schema_version(distributed<service::storage_proxy>& proxy, gms::feature_service& feat);
|
||||
|
||||
future<std::set<sstring>> merge_keyspaces(distributed<service::storage_proxy>& proxy, schema_result&& before, schema_result&& after);
|
||||
|
||||
std::vector<mutation> make_create_keyspace_mutations(lw_shared_ptr<keyspace_metadata> keyspace, api::timestamp_type timestamp, bool with_tables_and_types_and_functions = true);
|
||||
|
||||
310
db/view/view.cc
310
db/view/view.cc
@@ -58,6 +58,7 @@
|
||||
#include "cql3/util.hh"
|
||||
#include "db/view/view.hh"
|
||||
#include "db/view/view_builder.hh"
|
||||
#include "db/view/view_updating_consumer.hh"
|
||||
#include "db/system_keyspace_view_types.hh"
|
||||
#include "db/system_keyspace.hh"
|
||||
#include "frozen_mutation.hh"
|
||||
@@ -136,17 +137,90 @@ const column_definition* view_info::view_column(const column_definition& base_de
|
||||
return _schema.get_column_definition(base_def.name());
|
||||
}
|
||||
|
||||
const std::vector<column_id>& view_info::base_non_pk_columns_in_view_pk() const {
|
||||
void view_info::set_base_info(db::view::base_info_ptr base_info) {
|
||||
_base_info = std::move(base_info);
|
||||
}
|
||||
|
||||
// A constructor for a base info that can facilitate reads and writes from the materialized view.
|
||||
db::view::base_dependent_view_info::base_dependent_view_info(schema_ptr base_schema, std::vector<column_id>&& base_non_pk_columns_in_view_pk)
|
||||
: _base_schema{std::move(base_schema)}
|
||||
, _base_non_pk_columns_in_view_pk{std::move(base_non_pk_columns_in_view_pk)}
|
||||
, has_base_non_pk_columns_in_view_pk{!_base_non_pk_columns_in_view_pk.empty()}
|
||||
, use_only_for_reads{false} {
|
||||
|
||||
}
|
||||
|
||||
// A constructor for a base info that can facilitate only reads from the materialized view.
|
||||
db::view::base_dependent_view_info::base_dependent_view_info(bool has_base_non_pk_columns_in_view_pk)
|
||||
: _base_schema{nullptr}
|
||||
, has_base_non_pk_columns_in_view_pk{has_base_non_pk_columns_in_view_pk}
|
||||
, use_only_for_reads{true} {
|
||||
}
|
||||
|
||||
const std::vector<column_id>& db::view::base_dependent_view_info::base_non_pk_columns_in_view_pk() const {
|
||||
if (use_only_for_reads) {
|
||||
on_internal_error(vlogger, "base_non_pk_columns_in_view_pk(): operation unsupported when initialized only for view reads.");
|
||||
}
|
||||
return _base_non_pk_columns_in_view_pk;
|
||||
}
|
||||
|
||||
void view_info::initialize_base_dependent_fields(const schema& base) {
|
||||
const schema_ptr& db::view::base_dependent_view_info::base_schema() const {
|
||||
if (use_only_for_reads) {
|
||||
on_internal_error(vlogger, "base_schema(): operation unsupported when initialized only for view reads.");
|
||||
}
|
||||
return _base_schema;
|
||||
}
|
||||
|
||||
db::view::base_info_ptr view_info::make_base_dependent_view_info(const schema& base) const {
|
||||
std::vector<column_id> base_non_pk_columns_in_view_pk;
|
||||
bool has_base_non_pk_columns_in_view_pk = false;
|
||||
bool can_only_read_from_view = false;
|
||||
|
||||
for (auto&& view_col : boost::range::join(_schema.partition_key_columns(), _schema.clustering_key_columns())) {
|
||||
if (view_col.is_computed()) {
|
||||
// we are not going to find it in the base table...
|
||||
continue;
|
||||
}
|
||||
auto* base_col = base.get_column_definition(view_col.name());
|
||||
if (base_col && !base_col->is_primary_key()) {
|
||||
_base_non_pk_columns_in_view_pk.push_back(base_col->id);
|
||||
base_non_pk_columns_in_view_pk.push_back(base_col->id);
|
||||
has_base_non_pk_columns_in_view_pk = true;
|
||||
} else if (!base_col) {
|
||||
// If we didn't find the column in the base column then it must have been deleted
|
||||
// or not yet added (by alter command), this means it is for sure not a pk column
|
||||
// in the base table. This can happen if the version of the base schema is not the
|
||||
// one that the view was created with. Seting this schema as the base can't harm since
|
||||
// if we got to such a situation then it means it is only going to be used for reading
|
||||
// (computation of shadowable tombstones) and in that case the existence of such a column
|
||||
// is the only thing that is of interest to us.
|
||||
has_base_non_pk_columns_in_view_pk = true;
|
||||
can_only_read_from_view = true;
|
||||
|
||||
// We can break the loop here since we have the info we wanted and the list
|
||||
// of columns is not going to be reliable anyhow.
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (can_only_read_from_view) {
|
||||
return make_lw_shared<db::view::base_dependent_view_info>(has_base_non_pk_columns_in_view_pk);
|
||||
} else {
|
||||
return make_lw_shared<db::view::base_dependent_view_info>(base.shared_from_this(), std::move(base_non_pk_columns_in_view_pk));
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
bool view_info::has_base_non_pk_columns_in_view_pk() const {
|
||||
// The base info is not always available, this is because
|
||||
// the base info initialization is separate from the view
|
||||
// info construction. If we are trying to get this info without
|
||||
// initializing the base information it means that we have a
|
||||
// schema integrity problem as the creator of owning view schema
|
||||
// didn't make sure to initialize it with base information.
|
||||
if (!_base_info) {
|
||||
on_internal_error(vlogger, "Tried to perform a view query which is base info dependant without initializing it");
|
||||
}
|
||||
return _base_info->has_base_non_pk_columns_in_view_pk;
|
||||
}
|
||||
|
||||
namespace db {
|
||||
@@ -194,12 +268,12 @@ bool may_be_affected_by(const schema& base, const view_info& view, const dht::de
|
||||
}
|
||||
|
||||
static bool update_requires_read_before_write(const schema& base,
|
||||
const std::vector<view_ptr>& views,
|
||||
const std::vector<view_and_base>& views,
|
||||
const dht::decorated_key& key,
|
||||
const rows_entry& update,
|
||||
gc_clock::time_point now) {
|
||||
for (auto&& v : views) {
|
||||
view_info& vf = *v->view_info();
|
||||
view_info& vf = *v.view->view_info();
|
||||
if (may_be_affected_by(base, vf, key, update, now)) {
|
||||
return true;
|
||||
}
|
||||
@@ -246,12 +320,14 @@ class view_updates final {
|
||||
view_ptr _view;
|
||||
const view_info& _view_info;
|
||||
schema_ptr _base;
|
||||
base_info_ptr _base_info;
|
||||
std::unordered_map<partition_key, mutation_partition, partition_key::hashing, partition_key::equality> _updates;
|
||||
public:
|
||||
explicit view_updates(view_ptr view, schema_ptr base)
|
||||
: _view(std::move(view))
|
||||
explicit view_updates(view_and_base vab)
|
||||
: _view(std::move(vab.view))
|
||||
, _view_info(*_view->view_info())
|
||||
, _base(std::move(base))
|
||||
, _base(vab.base->base_schema())
|
||||
, _base_info(vab.base)
|
||||
, _updates(8, partition_key::hashing(*_view), partition_key::equality(*_view)) {
|
||||
}
|
||||
|
||||
@@ -313,7 +389,7 @@ row_marker view_updates::compute_row_marker(const clustering_row& base_row) cons
|
||||
// they share liveness information. It's true especially in the only case currently allowed by CQL,
|
||||
// which assumes there's up to one non-pk column in the view key. It's also true in alternator,
|
||||
// which does not carry TTL information.
|
||||
const auto& col_ids = _view_info.base_non_pk_columns_in_view_pk();
|
||||
const auto& col_ids = _base_info->base_non_pk_columns_in_view_pk();
|
||||
if (!col_ids.empty()) {
|
||||
auto& def = _base->regular_column_at(col_ids[0]);
|
||||
// Note: multi-cell columns can't be part of the primary key.
|
||||
@@ -544,7 +620,7 @@ void view_updates::delete_old_entry(const partition_key& base_key, const cluster
|
||||
|
||||
void view_updates::do_delete_old_entry(const partition_key& base_key, const clustering_row& existing, const clustering_row& update, gc_clock::time_point now) {
|
||||
auto& r = get_view_row(base_key, existing);
|
||||
const auto& col_ids = _view_info.base_non_pk_columns_in_view_pk();
|
||||
const auto& col_ids = _base_info->base_non_pk_columns_in_view_pk();
|
||||
if (!col_ids.empty()) {
|
||||
// We delete the old row using a shadowable row tombstone, making sure that
|
||||
// the tombstone deletes everything in the row (or it might still show up).
|
||||
@@ -685,7 +761,7 @@ void view_updates::generate_update(
|
||||
return;
|
||||
}
|
||||
|
||||
const auto& col_ids = _view_info.base_non_pk_columns_in_view_pk();
|
||||
const auto& col_ids = _base_info->base_non_pk_columns_in_view_pk();
|
||||
if (col_ids.empty()) {
|
||||
// The view key is necessarily the same pre and post update.
|
||||
if (existing && existing->is_live(*_base)) {
|
||||
@@ -940,12 +1016,17 @@ future<stop_iteration> view_update_builder::on_results() {
|
||||
|
||||
future<std::vector<frozen_mutation_and_schema>> generate_view_updates(
|
||||
const schema_ptr& base,
|
||||
std::vector<view_ptr>&& views_to_update,
|
||||
std::vector<view_and_base>&& views_to_update,
|
||||
flat_mutation_reader&& updates,
|
||||
flat_mutation_reader_opt&& existings,
|
||||
gc_clock::time_point now) {
|
||||
auto vs = boost::copy_range<std::vector<view_updates>>(views_to_update | boost::adaptors::transformed([&] (auto&& v) {
|
||||
return view_updates(std::move(v), base);
|
||||
auto vs = boost::copy_range<std::vector<view_updates>>(views_to_update | boost::adaptors::transformed([&] (view_and_base v) {
|
||||
if (base->version() != v.base->base_schema()->version()) {
|
||||
on_internal_error(vlogger, format("Schema version used for view updates ({}) does not match the current"
|
||||
" base schema version of the view ({}) for view {}.{} of {}.{}",
|
||||
base->version(), v.base->base_schema()->version(), v.view->ks_name(), v.view->cf_name(), base->ks_name(), base->cf_name()));
|
||||
}
|
||||
return view_updates(std::move(v));
|
||||
}));
|
||||
auto builder = std::make_unique<view_update_builder>(base, std::move(vs), std::move(updates), std::move(existings), now);
|
||||
auto f = builder->build();
|
||||
@@ -955,7 +1036,7 @@ future<std::vector<frozen_mutation_and_schema>> generate_view_updates(
|
||||
query::clustering_row_ranges calculate_affected_clustering_ranges(const schema& base,
|
||||
const dht::decorated_key& key,
|
||||
const mutation_partition& mp,
|
||||
const std::vector<view_ptr>& views,
|
||||
const std::vector<view_and_base>& views,
|
||||
gc_clock::time_point now) {
|
||||
std::vector<nonwrapping_range<clustering_key_prefix_view>> row_ranges;
|
||||
std::vector<nonwrapping_range<clustering_key_prefix_view>> view_row_ranges;
|
||||
@@ -963,11 +1044,11 @@ query::clustering_row_ranges calculate_affected_clustering_ranges(const schema&
|
||||
if (mp.partition_tombstone() || !mp.row_tombstones().empty()) {
|
||||
for (auto&& v : views) {
|
||||
// FIXME: #2371
|
||||
if (v->view_info()->select_statement().get_restrictions()->has_unrestricted_clustering_columns()) {
|
||||
if (v.view->view_info()->select_statement().get_restrictions()->has_unrestricted_clustering_columns()) {
|
||||
view_row_ranges.push_back(nonwrapping_range<clustering_key_prefix_view>::make_open_ended_both_sides());
|
||||
break;
|
||||
}
|
||||
for (auto&& r : v->view_info()->partition_slice().default_row_ranges()) {
|
||||
for (auto&& r : v.view->view_info()->partition_slice().default_row_ranges()) {
|
||||
view_row_ranges.push_back(r.transform(std::mem_fn(&clustering_key_prefix::view)));
|
||||
}
|
||||
}
|
||||
@@ -1210,25 +1291,61 @@ view_builder::view_builder(database& db, db::system_distributed_keyspace& sys_di
|
||||
}
|
||||
|
||||
future<> view_builder::start(service::migration_manager& mm) {
|
||||
_started = seastar::async([this, &mm] {
|
||||
// Guard the whole startup routine with a semaphore,
|
||||
// so that it's not intercepted by `on_drop_view`, `on_create_view`
|
||||
// or `on_update_view` events.
|
||||
auto units = get_units(_sem, 1).get0();
|
||||
// Wait for schema agreement even if we're a seed node.
|
||||
while (!mm.have_schema_agreement()) {
|
||||
if (_as.abort_requested()) {
|
||||
return;
|
||||
_started = do_with(view_builder_init_state{}, [this, &mm] (view_builder_init_state& vbi) {
|
||||
return seastar::async([this, &mm, &vbi] {
|
||||
// Guard the whole startup routine with a semaphore,
|
||||
// so that it's not intercepted by `on_drop_view`, `on_create_view`
|
||||
// or `on_update_view` events.
|
||||
auto units = get_units(_sem, 1).get0();
|
||||
// Wait for schema agreement even if we're a seed node.
|
||||
while (!mm.have_schema_agreement()) {
|
||||
seastar::sleep_abortable(500ms, _as).get();
|
||||
}
|
||||
seastar::sleep(500ms).get();
|
||||
}
|
||||
auto built = system_keyspace::load_built_views().get0();
|
||||
auto in_progress = system_keyspace::load_view_build_progress().get0();
|
||||
calculate_shard_build_step(std::move(built), std::move(in_progress)).get();
|
||||
_mnotifier.register_listener(this);
|
||||
_current_step = _base_to_build_step.begin();
|
||||
// Waited on indirectly in stop().
|
||||
(void)_build_step.trigger();
|
||||
auto built = system_keyspace::load_built_views().get0();
|
||||
auto in_progress = system_keyspace::load_view_build_progress().get0();
|
||||
setup_shard_build_step(vbi, std::move(built), std::move(in_progress));
|
||||
}).then_wrapped([this] (future<>&& f) {
|
||||
// All shards need to arrive at the same decisions on whether or not to
|
||||
// restart a view build at some common token (reshard), and which token
|
||||
// to restart at. So we need to wait until all shards have read the view
|
||||
// build statuses before they can all proceed to make the (same) decision.
|
||||
// If we don't synchronize here, a fast shard may make a decision, start
|
||||
// building and finish a build step - before the slowest shard even read
|
||||
// the view build information.
|
||||
std::exception_ptr eptr;
|
||||
if (f.failed()) {
|
||||
eptr = f.get_exception();
|
||||
}
|
||||
|
||||
return container().invoke_on(0, [eptr = std::move(eptr)] (view_builder& builder) {
|
||||
// The &builder is alive, because it can only be destroyed in
|
||||
// sharded<view_builder>::stop(), which, in turn, waits for all
|
||||
// view_builder::stop()-s to finish, and each stop() waits for
|
||||
// the shard's current future (called _started) to resolve.
|
||||
if (!eptr) {
|
||||
if (++builder._shards_finished_read == smp::count) {
|
||||
builder._shards_finished_read_promise.set_value();
|
||||
}
|
||||
} else {
|
||||
if (builder._shards_finished_read < smp::count) {
|
||||
builder._shards_finished_read = smp::count;
|
||||
builder._shards_finished_read_promise.set_exception(std::move(eptr));
|
||||
}
|
||||
}
|
||||
return builder._shards_finished_read_promise.get_shared_future();
|
||||
});
|
||||
}).then([this, &vbi] {
|
||||
return calculate_shard_build_step(vbi);
|
||||
}).then([this] {
|
||||
_mnotifier.register_listener(this);
|
||||
_current_step = _base_to_build_step.begin();
|
||||
// Waited on indirectly in stop().
|
||||
(void)_build_step.trigger();
|
||||
return make_ready_future<>();
|
||||
});
|
||||
}).handle_exception([] (std::exception_ptr eptr) {
|
||||
vlogger.error("start failed: {}", eptr);
|
||||
return make_ready_future<>();
|
||||
});
|
||||
return make_ready_future<>();
|
||||
}
|
||||
@@ -1236,7 +1353,7 @@ future<> view_builder::start(service::migration_manager& mm) {
|
||||
future<> view_builder::stop() {
|
||||
vlogger.info("Stopping view builder");
|
||||
_as.request_abort();
|
||||
return _started.finally([this] {
|
||||
return _started.then([this] {
|
||||
return _mnotifier.unregister_listener(this).then([this] {
|
||||
return _sem.wait();
|
||||
}).then([this] {
|
||||
@@ -1370,12 +1487,12 @@ void view_builder::reshard(
|
||||
}
|
||||
}
|
||||
|
||||
future<> view_builder::calculate_shard_build_step(
|
||||
void view_builder::setup_shard_build_step(
|
||||
view_builder_init_state& vbi,
|
||||
std::vector<system_keyspace::view_name> built,
|
||||
std::vector<system_keyspace::view_build_progress> in_progress) {
|
||||
// Shard 0 makes cleanup changes to the system tables, but none that could conflict
|
||||
// with the other shards; everyone is thus able to proceed independently.
|
||||
auto bookkeeping_ops = std::make_unique<std::vector<future<>>>();
|
||||
auto base_table_exists = [this] (const view_ptr& view) {
|
||||
// This is a safety check in case this node missed a create MV statement
|
||||
// but got a drop table for the base, and another node didn't get the
|
||||
@@ -1402,9 +1519,9 @@ future<> view_builder::calculate_shard_build_step(
|
||||
// Fall-through
|
||||
}
|
||||
if (this_shard_id() == 0) {
|
||||
bookkeeping_ops->push_back(_sys_dist_ks.remove_view(name.first, name.second));
|
||||
bookkeeping_ops->push_back(system_keyspace::remove_built_view(name.first, name.second));
|
||||
bookkeeping_ops->push_back(
|
||||
vbi.bookkeeping_ops.push_back(_sys_dist_ks.remove_view(name.first, name.second));
|
||||
vbi.bookkeeping_ops.push_back(system_keyspace::remove_built_view(name.first, name.second));
|
||||
vbi.bookkeeping_ops.push_back(
|
||||
system_keyspace::remove_view_build_progress_across_all_shards(
|
||||
std::move(name.first),
|
||||
std::move(name.second)));
|
||||
@@ -1412,50 +1529,48 @@ future<> view_builder::calculate_shard_build_step(
|
||||
return view_ptr(nullptr);
|
||||
};
|
||||
|
||||
auto built_views = boost::copy_range<std::unordered_set<utils::UUID>>(built
|
||||
vbi.built_views = boost::copy_range<std::unordered_set<utils::UUID>>(built
|
||||
| boost::adaptors::transformed(maybe_fetch_view)
|
||||
| boost::adaptors::filtered([] (const view_ptr& v) { return bool(v); })
|
||||
| boost::adaptors::transformed([] (const view_ptr& v) { return v->id(); }));
|
||||
|
||||
std::vector<std::vector<view_build_status>> view_build_status_per_shard;
|
||||
for (auto& [view_name, first_token, next_token_opt, cpu_id] : in_progress) {
|
||||
if (auto view = maybe_fetch_view(view_name)) {
|
||||
if (built_views.find(view->id()) != built_views.end()) {
|
||||
if (vbi.built_views.find(view->id()) != vbi.built_views.end()) {
|
||||
if (this_shard_id() == 0) {
|
||||
auto f = _sys_dist_ks.finish_view_build(std::move(view_name.first), std::move(view_name.second)).then([view = std::move(view)] {
|
||||
return system_keyspace::remove_view_build_progress_across_all_shards(view->cf_name(), view->ks_name());
|
||||
});
|
||||
bookkeeping_ops->push_back(std::move(f));
|
||||
vbi.bookkeeping_ops.push_back(std::move(f));
|
||||
}
|
||||
continue;
|
||||
}
|
||||
view_build_status_per_shard.resize(std::max(view_build_status_per_shard.size(), size_t(cpu_id + 1)));
|
||||
view_build_status_per_shard[cpu_id].emplace_back(view_build_status{
|
||||
vbi.status_per_shard.resize(std::max(vbi.status_per_shard.size(), size_t(cpu_id + 1)));
|
||||
vbi.status_per_shard[cpu_id].emplace_back(view_build_status{
|
||||
std::move(view),
|
||||
std::move(first_token),
|
||||
std::move(next_token_opt)});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// All shards need to arrive at the same decisions on whether or not to
|
||||
// restart a view build at some common token (reshard), and which token
|
||||
// to restart at. So we need to wait until all shards have read the view
|
||||
// build statuses before they can all proceed to make the (same) decision.
|
||||
// If we don't synchronoize here, a fast shard may make a decision, start
|
||||
// building and finish a build step - before the slowest shard even read
|
||||
// the view build information.
|
||||
container().invoke_on(0, [] (view_builder& builder) {
|
||||
if (++builder._shards_finished_read == smp::count) {
|
||||
builder._shards_finished_read_promise.set_value();
|
||||
future<> view_builder::calculate_shard_build_step(view_builder_init_state& vbi) {
|
||||
auto base_table_exists = [this] (const view_ptr& view) {
|
||||
// This is a safety check in case this node missed a create MV statement
|
||||
// but got a drop table for the base, and another node didn't get the
|
||||
// drop notification and sent us the view schema.
|
||||
try {
|
||||
_db.find_schema(view->view_info()->base_id());
|
||||
return true;
|
||||
} catch (const no_such_column_family&) {
|
||||
return false;
|
||||
}
|
||||
return builder._shards_finished_read_promise.get_shared_future();
|
||||
}).get();
|
||||
|
||||
};
|
||||
std::unordered_set<utils::UUID> loaded_views;
|
||||
if (view_build_status_per_shard.size() != smp::count) {
|
||||
reshard(std::move(view_build_status_per_shard), loaded_views);
|
||||
} else if (!view_build_status_per_shard.empty()) {
|
||||
for (auto& status : view_build_status_per_shard[this_shard_id()]) {
|
||||
if (vbi.status_per_shard.size() != smp::count) {
|
||||
reshard(std::move(vbi.status_per_shard), loaded_views);
|
||||
} else if (!vbi.status_per_shard.empty()) {
|
||||
for (auto& status : vbi.status_per_shard[this_shard_id()]) {
|
||||
load_view_status(std::move(status), loaded_views);
|
||||
}
|
||||
}
|
||||
@@ -1472,18 +1587,18 @@ future<> view_builder::calculate_shard_build_step(
|
||||
auto all_views = _db.get_views();
|
||||
auto is_new = [&] (const view_ptr& v) {
|
||||
return base_table_exists(v) && loaded_views.find(v->id()) == loaded_views.end()
|
||||
&& built_views.find(v->id()) == built_views.end();
|
||||
&& vbi.built_views.find(v->id()) == vbi.built_views.end();
|
||||
};
|
||||
for (auto&& view : all_views | boost::adaptors::filtered(is_new)) {
|
||||
bookkeeping_ops->push_back(add_new_view(view, get_or_create_build_step(view->view_info()->base_id())));
|
||||
vbi.bookkeeping_ops.push_back(add_new_view(view, get_or_create_build_step(view->view_info()->base_id())));
|
||||
}
|
||||
|
||||
for (auto& [_, build_step] : _base_to_build_step) {
|
||||
initialize_reader_at_current_token(build_step);
|
||||
}
|
||||
|
||||
auto f = seastar::when_all_succeed(bookkeeping_ops->begin(), bookkeeping_ops->end());
|
||||
return f.handle_exception([this, bookkeeping_ops = std::move(bookkeeping_ops)] (std::exception_ptr ep) {
|
||||
auto f = seastar::when_all_succeed(vbi.bookkeeping_ops.begin(), vbi.bookkeeping_ops.end());
|
||||
return f.handle_exception([this] (std::exception_ptr ep) {
|
||||
log_level severity = _as.abort_requested() ? log_level::warn : log_level::error;
|
||||
vlogger.log(severity, "Failed to update materialized view bookkeeping ({}), continuing anyway.", ep);
|
||||
});
|
||||
@@ -1732,7 +1847,7 @@ public:
|
||||
return stop_iteration::yes;
|
||||
}
|
||||
|
||||
_fragments_memory_usage += cr.memory_usage(*_step.base->schema());
|
||||
_fragments_memory_usage += cr.memory_usage(*_step.reader.schema());
|
||||
_fragments.push_back(std::move(cr));
|
||||
if (_fragments_memory_usage > batch_memory_max) {
|
||||
// Although we have not yet completed the batch of base rows that
|
||||
@@ -1754,10 +1869,14 @@ public:
|
||||
_builder._as.check();
|
||||
if (!_fragments.empty()) {
|
||||
_fragments.push_front(partition_start(_step.current_key, tombstone()));
|
||||
auto base_schema = _step.base->schema();
|
||||
auto views = with_base_info_snapshot(_views_to_build);
|
||||
auto reader = make_flat_mutation_reader_from_fragments(_step.reader.schema(), std::move(_fragments));
|
||||
reader.upgrade_schema(base_schema);
|
||||
_step.base->populate_views(
|
||||
_views_to_build,
|
||||
std::move(views),
|
||||
_step.current_token(),
|
||||
make_flat_mutation_reader_from_fragments(_step.base->schema(), std::move(_fragments)),
|
||||
std::move(reader),
|
||||
_now).get();
|
||||
_fragments.clear();
|
||||
_fragments_memory_usage = 0;
|
||||
@@ -1909,5 +2028,54 @@ future<bool> check_needs_view_update_path(db::system_distributed_keyspace& sys_d
|
||||
});
|
||||
}
|
||||
|
||||
const size_t view_updating_consumer::buffer_size_soft_limit{1 * 1024 * 1024};
|
||||
const size_t view_updating_consumer::buffer_size_hard_limit{2 * 1024 * 1024};
|
||||
|
||||
void view_updating_consumer::do_flush_buffer() {
|
||||
_staging_reader_handle.pause();
|
||||
|
||||
if (_buffer.front().partition().empty()) {
|
||||
// If we flushed mid-partition we can have an empty mutation if we
|
||||
// flushed right before getting the end-of-partition fragment.
|
||||
_buffer.pop_front();
|
||||
}
|
||||
|
||||
while (!_buffer.empty()) {
|
||||
try {
|
||||
auto lock_holder = _view_update_pusher(std::move(_buffer.front())).get();
|
||||
} catch (...) {
|
||||
vlogger.warn("Failed to push replica updates for table {}.{}: {}", _schema->ks_name(), _schema->cf_name(), std::current_exception());
|
||||
}
|
||||
_buffer.pop_front();
|
||||
}
|
||||
|
||||
_buffer_size = 0;
|
||||
_m = nullptr;
|
||||
}
|
||||
|
||||
void view_updating_consumer::maybe_flush_buffer_mid_partition() {
|
||||
if (_buffer_size >= buffer_size_hard_limit) {
|
||||
auto m = mutation(_schema, _m->decorated_key(), mutation_partition(_schema));
|
||||
do_flush_buffer();
|
||||
_buffer.emplace_back(std::move(m));
|
||||
_m = &_buffer.back();
|
||||
}
|
||||
}
|
||||
|
||||
view_updating_consumer::view_updating_consumer(schema_ptr schema, table& table, std::vector<sstables::shared_sstable> excluded_sstables, const seastar::abort_source& as,
|
||||
evictable_reader_handle& staging_reader_handle)
|
||||
: view_updating_consumer(std::move(schema), as, staging_reader_handle,
|
||||
[table = table.shared_from_this(), excluded_sstables = std::move(excluded_sstables)] (mutation m) mutable {
|
||||
auto s = m.schema();
|
||||
return table->stream_view_replica_updates(std::move(s), std::move(m), db::no_timeout, excluded_sstables);
|
||||
})
|
||||
{ }
|
||||
|
||||
std::vector<db::view::view_and_base> with_base_info_snapshot(std::vector<view_ptr> vs) {
|
||||
return boost::copy_range<std::vector<db::view::view_and_base>>(vs | boost::adaptors::transformed([] (const view_ptr& v) {
|
||||
return db::view::view_and_base{v, v->view_info()->base_info()};
|
||||
}));
|
||||
}
|
||||
|
||||
} // namespace view
|
||||
} // namespace db
|
||||
|
||||
@@ -43,6 +43,46 @@ namespace db {
|
||||
|
||||
namespace view {
|
||||
|
||||
// Part of the view description which depends on the base schema version.
|
||||
//
|
||||
// This structure may change even though the view schema doesn't change, so
|
||||
// it needs to live outside view_ptr.
|
||||
struct base_dependent_view_info {
|
||||
private:
|
||||
schema_ptr _base_schema;
|
||||
// Id of a regular base table column included in the view's PK, if any.
|
||||
// Scylla views only allow one such column, alternator can have up to two.
|
||||
std::vector<column_id> _base_non_pk_columns_in_view_pk;
|
||||
public:
|
||||
const std::vector<column_id>& base_non_pk_columns_in_view_pk() const;
|
||||
const schema_ptr& base_schema() const;
|
||||
|
||||
// Indicates if the view hase pk columns which are not part of the base
|
||||
// pk, it seems that !base_non_pk_columns_in_view_pk.empty() is the same,
|
||||
// but actually there are cases where we can compute this boolean without
|
||||
// succeeding to reliably build the former.
|
||||
const bool has_base_non_pk_columns_in_view_pk;
|
||||
|
||||
// If base_non_pk_columns_in_view_pk couldn't reliably be built, this base
|
||||
// info can't be used for computing view updates, only for reading the materialized
|
||||
// view.
|
||||
const bool use_only_for_reads;
|
||||
|
||||
// A constructor for a base info that can facilitate reads and writes from the materialized view.
|
||||
base_dependent_view_info(schema_ptr base_schema, std::vector<column_id>&& base_non_pk_columns_in_view_pk);
|
||||
// A constructor for a base info that can facilitate only reads from the materialized view.
|
||||
base_dependent_view_info(bool has_base_non_pk_columns_in_view_pk);
|
||||
};
|
||||
|
||||
// Immutable snapshot of view's base-schema-dependent part.
|
||||
using base_info_ptr = lw_shared_ptr<const base_dependent_view_info>;
|
||||
|
||||
// Snapshot of the view schema and its base-schema-dependent part.
|
||||
struct view_and_base {
|
||||
view_ptr view;
|
||||
base_info_ptr base;
|
||||
};
|
||||
|
||||
/**
|
||||
* Whether the view filter considers the specified partition key.
|
||||
*
|
||||
@@ -94,7 +134,7 @@ bool clustering_prefix_matches(const schema& base, const partition_key& key, con
|
||||
|
||||
future<std::vector<frozen_mutation_and_schema>> generate_view_updates(
|
||||
const schema_ptr& base,
|
||||
std::vector<view_ptr>&& views_to_update,
|
||||
std::vector<view_and_base>&& views_to_update,
|
||||
flat_mutation_reader&& updates,
|
||||
flat_mutation_reader_opt&& existings,
|
||||
gc_clock::time_point now);
|
||||
@@ -103,7 +143,7 @@ query::clustering_row_ranges calculate_affected_clustering_ranges(
|
||||
const schema& base,
|
||||
const dht::decorated_key& key,
|
||||
const mutation_partition& mp,
|
||||
const std::vector<view_ptr>& views,
|
||||
const std::vector<view_and_base>& views,
|
||||
gc_clock::time_point now);
|
||||
|
||||
struct wait_for_all_updates_tag {};
|
||||
@@ -133,6 +173,13 @@ future<> mutate_MV(
|
||||
*/
|
||||
void create_virtual_column(schema_builder& builder, const bytes& name, const data_type& type);
|
||||
|
||||
/**
|
||||
* Converts a collection of view schema snapshots into a collection of
|
||||
* view_and_base objects, which are snapshots of both the view schema
|
||||
* and the base-schema-dependent part of view description.
|
||||
*/
|
||||
std::vector<view_and_base> with_base_info_snapshot(std::vector<view_ptr>);
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -165,6 +165,12 @@ class view_builder final : public service::migration_listener::only_view_notific
|
||||
// Used for testing.
|
||||
std::unordered_map<std::pair<sstring, sstring>, seastar::shared_promise<>, utils::tuple_hash> _build_notifiers;
|
||||
|
||||
struct view_builder_init_state {
|
||||
std::vector<future<>> bookkeeping_ops;
|
||||
std::vector<std::vector<view_build_status>> status_per_shard;
|
||||
std::unordered_set<utils::UUID> built_views;
|
||||
};
|
||||
|
||||
public:
|
||||
// The view builder processes the base table in steps of batch_size rows.
|
||||
// However, if the individual rows are large, there is no real need to
|
||||
@@ -201,7 +207,8 @@ private:
|
||||
void initialize_reader_at_current_token(build_step&);
|
||||
void load_view_status(view_build_status, std::unordered_set<utils::UUID>&);
|
||||
void reshard(std::vector<std::vector<view_build_status>>, std::unordered_set<utils::UUID>&);
|
||||
future<> calculate_shard_build_step(std::vector<system_keyspace::view_name>, std::vector<system_keyspace::view_build_progress>);
|
||||
void setup_shard_build_step(view_builder_init_state& vbi, std::vector<system_keyspace::view_name>, std::vector<system_keyspace::view_build_progress>);
|
||||
future<> calculate_shard_build_step(view_builder_init_state& vbi);
|
||||
future<> add_new_view(view_ptr, build_step&);
|
||||
future<> do_build_step();
|
||||
void execute(build_step&, exponential_backoff_retry);
|
||||
|
||||
@@ -42,35 +42,52 @@ future<> view_update_generator::start() {
|
||||
_pending_sstables.wait().get();
|
||||
}
|
||||
|
||||
// To ensure we don't race with updates, move the entire content
|
||||
// into a local variable.
|
||||
auto sstables_with_tables = std::exchange(_sstables_with_tables, {});
|
||||
|
||||
// If we got here, we will process all tables we know about so far eventually so there
|
||||
// is no starvation
|
||||
for (auto& t : _sstables_with_tables | boost::adaptors::map_keys) {
|
||||
for (auto table_it = sstables_with_tables.begin(); table_it != sstables_with_tables.end(); table_it = sstables_with_tables.erase(table_it)) {
|
||||
auto& [t, sstables] = *table_it;
|
||||
schema_ptr s = t->schema();
|
||||
|
||||
// Copy what we have so far so we don't miss new updates
|
||||
auto sstables = std::exchange(_sstables_with_tables[t], {});
|
||||
vug_logger.trace("Processing {}.{}: {} sstables", s->ks_name(), s->cf_name(), sstables.size());
|
||||
|
||||
const auto num_sstables = sstables.size();
|
||||
|
||||
try {
|
||||
// temporary: need an sstable set for the flat mutation reader, but the
|
||||
// compaction_descriptor takes a vector. Soon this will become a compaction
|
||||
// so the transformation to the SSTable set will not be needed.
|
||||
auto ssts = make_lw_shared(t->get_compaction_strategy().make_sstable_set(s));
|
||||
// Exploit the fact that sstables in the staging directory
|
||||
// are usually non-overlapping and use a partitioned set for
|
||||
// the read.
|
||||
auto ssts = make_lw_shared(sstables::make_partitioned_sstable_set(s, make_lw_shared<sstable_list>(sstable_list{}), false));
|
||||
for (auto& sst : sstables) {
|
||||
ssts->insert(sst);
|
||||
}
|
||||
|
||||
flat_mutation_reader staging_sstable_reader = ::make_range_sstable_reader(s,
|
||||
auto ms = mutation_source([this, ssts] (
|
||||
schema_ptr s,
|
||||
reader_permit permit,
|
||||
const dht::partition_range& pr,
|
||||
const query::partition_slice& ps,
|
||||
const io_priority_class& pc,
|
||||
tracing::trace_state_ptr ts,
|
||||
streamed_mutation::forwarding fwd_ms,
|
||||
mutation_reader::forwarding fwd_mr) {
|
||||
return ::make_restricted_range_sstable_reader(s, std::move(permit), std::move(ssts), pr, ps, pc, std::move(ts), fwd_ms, fwd_mr);
|
||||
});
|
||||
auto [staging_sstable_reader, staging_sstable_reader_handle] = make_manually_paused_evictable_reader(
|
||||
std::move(ms),
|
||||
s,
|
||||
_db.make_query_class_config().semaphore.make_permit(),
|
||||
std::move(ssts),
|
||||
query::full_partition_range,
|
||||
s->full_slice(),
|
||||
service::get_local_streaming_priority(),
|
||||
nullptr,
|
||||
::streamed_mutation::forwarding::no,
|
||||
::mutation_reader::forwarding::no);
|
||||
|
||||
inject_failure("view_update_generator_consume_staging_sstable");
|
||||
auto result = staging_sstable_reader.consume_in_thread(view_updating_consumer(s, *t, sstables, _as), db::no_timeout);
|
||||
auto result = staging_sstable_reader.consume_in_thread(view_updating_consumer(s, *t, sstables, _as, staging_sstable_reader_handle), db::no_timeout);
|
||||
if (result == stop_iteration::yes) {
|
||||
break;
|
||||
}
|
||||
@@ -89,7 +106,7 @@ future<> view_update_generator::start() {
|
||||
// Move from staging will be retried upon restart.
|
||||
vug_logger.warn("Moving {} from staging failed: {}:{}. Ignoring...", s->ks_name(), s->cf_name(), std::current_exception());
|
||||
}
|
||||
_registration_sem.signal();
|
||||
_registration_sem.signal(num_sstables);
|
||||
}
|
||||
// For each table, move the processed staging sstables into the table's base dir.
|
||||
for (auto it = _sstables_to_move.begin(); it != _sstables_to_move.end(); ) {
|
||||
|
||||
@@ -32,7 +32,10 @@
|
||||
namespace db::view {
|
||||
|
||||
class view_update_generator {
|
||||
public:
|
||||
static constexpr size_t registration_queue_size = 5;
|
||||
|
||||
private:
|
||||
database& _db;
|
||||
seastar::abort_source _as;
|
||||
future<> _started = make_ready_future<>();
|
||||
@@ -51,6 +54,8 @@ public:
|
||||
future<> start();
|
||||
future<> stop();
|
||||
future<> register_staging_sstable(sstables::shared_sstable sst, lw_shared_ptr<table> table);
|
||||
|
||||
ssize_t available_register_units() const { return _registration_sem.available_units(); }
|
||||
private:
|
||||
bool should_throttle() const;
|
||||
};
|
||||
|
||||
@@ -27,6 +27,8 @@
|
||||
#include "sstables/shared_sstable.hh"
|
||||
#include "database.hh"
|
||||
|
||||
class evictable_reader_handle;
|
||||
|
||||
namespace db::view {
|
||||
|
||||
/*
|
||||
@@ -34,22 +36,46 @@ namespace db::view {
|
||||
* It is expected to be run in seastar::async threaded context through consume_in_thread()
|
||||
*/
|
||||
class view_updating_consumer {
|
||||
schema_ptr _schema;
|
||||
lw_shared_ptr<table> _table;
|
||||
std::vector<sstables::shared_sstable> _excluded_sstables;
|
||||
const seastar::abort_source* _as;
|
||||
std::optional<mutation> _m;
|
||||
public:
|
||||
view_updating_consumer(schema_ptr schema, table& table, std::vector<sstables::shared_sstable> excluded_sstables, const seastar::abort_source& as)
|
||||
// We prefer flushing on partition boundaries, so at the end of a partition,
|
||||
// we flush on reaching the soft limit. Otherwise we continue accumulating
|
||||
// data. We flush mid-partition if we reach the hard limit.
|
||||
static const size_t buffer_size_soft_limit;
|
||||
static const size_t buffer_size_hard_limit;
|
||||
|
||||
private:
|
||||
schema_ptr _schema;
|
||||
const seastar::abort_source* _as;
|
||||
evictable_reader_handle& _staging_reader_handle;
|
||||
circular_buffer<mutation> _buffer;
|
||||
mutation* _m{nullptr};
|
||||
size_t _buffer_size{0};
|
||||
noncopyable_function<future<row_locker::lock_holder>(mutation)> _view_update_pusher;
|
||||
|
||||
private:
|
||||
void do_flush_buffer();
|
||||
void maybe_flush_buffer_mid_partition();
|
||||
|
||||
public:
|
||||
// Push updates with a custom pusher. Mainly for tests.
|
||||
view_updating_consumer(schema_ptr schema, const seastar::abort_source& as, evictable_reader_handle& staging_reader_handle,
|
||||
noncopyable_function<future<row_locker::lock_holder>(mutation)> view_update_pusher)
|
||||
: _schema(std::move(schema))
|
||||
, _table(table.shared_from_this())
|
||||
, _excluded_sstables(std::move(excluded_sstables))
|
||||
, _as(&as)
|
||||
, _m()
|
||||
, _staging_reader_handle(staging_reader_handle)
|
||||
, _view_update_pusher(std::move(view_update_pusher))
|
||||
{ }
|
||||
|
||||
view_updating_consumer(schema_ptr schema, table& table, std::vector<sstables::shared_sstable> excluded_sstables, const seastar::abort_source& as,
|
||||
evictable_reader_handle& staging_reader_handle);
|
||||
|
||||
view_updating_consumer(view_updating_consumer&&) = default;
|
||||
|
||||
view_updating_consumer& operator=(view_updating_consumer&&) = delete;
|
||||
|
||||
void consume_new_partition(const dht::decorated_key& dk) {
|
||||
_m = mutation(_schema, dk, mutation_partition(_schema));
|
||||
_buffer.emplace_back(_schema, dk, mutation_partition(_schema));
|
||||
_m = &_buffer.back();
|
||||
}
|
||||
|
||||
void consume(tombstone t) {
|
||||
@@ -60,7 +86,9 @@ public:
|
||||
if (_as->abort_requested()) {
|
||||
return stop_iteration::yes;
|
||||
}
|
||||
_buffer_size += sr.memory_usage(*_schema);
|
||||
_m->partition().apply(*_schema, std::move(sr));
|
||||
maybe_flush_buffer_mid_partition();
|
||||
return stop_iteration::no;
|
||||
}
|
||||
|
||||
@@ -68,7 +96,9 @@ public:
|
||||
if (_as->abort_requested()) {
|
||||
return stop_iteration::yes;
|
||||
}
|
||||
_buffer_size += cr.memory_usage(*_schema);
|
||||
_m->partition().apply(*_schema, std::move(cr));
|
||||
maybe_flush_buffer_mid_partition();
|
||||
return stop_iteration::no;
|
||||
}
|
||||
|
||||
@@ -76,14 +106,27 @@ public:
|
||||
if (_as->abort_requested()) {
|
||||
return stop_iteration::yes;
|
||||
}
|
||||
_buffer_size += rt.memory_usage(*_schema);
|
||||
_m->partition().apply(*_schema, std::move(rt));
|
||||
maybe_flush_buffer_mid_partition();
|
||||
return stop_iteration::no;
|
||||
}
|
||||
|
||||
// Expected to be run in seastar::async threaded context (consume_in_thread())
|
||||
stop_iteration consume_end_of_partition();
|
||||
stop_iteration consume_end_of_partition() {
|
||||
if (_as->abort_requested()) {
|
||||
return stop_iteration::yes;
|
||||
}
|
||||
if (_buffer_size >= buffer_size_soft_limit) {
|
||||
do_flush_buffer();
|
||||
}
|
||||
return stop_iteration::no;
|
||||
}
|
||||
|
||||
stop_iteration consume_end_of_stream() {
|
||||
if (!_buffer.empty()) {
|
||||
do_flush_buffer();
|
||||
}
|
||||
return stop_iteration(_as->abort_requested());
|
||||
}
|
||||
};
|
||||
|
||||
@@ -59,7 +59,12 @@ future<> boot_strapper::bootstrap(streaming::stream_reason reason) {
|
||||
return make_exception_future<>(std::runtime_error("Wrong stream_reason provided: it can only be replace or bootstrap"));
|
||||
}
|
||||
auto streamer = make_lw_shared<range_streamer>(_db, _token_metadata, _abort_source, _tokens, _address, description, reason);
|
||||
streamer->add_source_filter(std::make_unique<range_streamer::failure_detector_source_filter>(gms::get_local_gossiper().get_unreachable_members()));
|
||||
auto nodes_to_filter = gms::get_local_gossiper().get_unreachable_members();
|
||||
if (reason == streaming::stream_reason::replace && _db.local().get_replace_address()) {
|
||||
nodes_to_filter.insert(_db.local().get_replace_address().value());
|
||||
}
|
||||
blogger.debug("nodes_to_filter={}", nodes_to_filter);
|
||||
streamer->add_source_filter(std::make_unique<range_streamer::failure_detector_source_filter>(nodes_to_filter));
|
||||
auto keyspaces = make_lw_shared<std::vector<sstring>>(_db.local().get_non_system_keyspaces());
|
||||
return do_for_each(*keyspaces, [this, keyspaces, streamer] (sstring& keyspace_name) {
|
||||
auto& ks = _db.local().find_keyspace(keyspace_name);
|
||||
|
||||
@@ -28,7 +28,8 @@ namespace query {
|
||||
enum class digest_algorithm : uint8_t {
|
||||
none = 0, // digest not required
|
||||
MD5 = 1,
|
||||
xxHash = 2,// default algorithm
|
||||
legacy_xxHash_without_null_digest = 2,
|
||||
xxHash = 3, // default algorithm
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
@@ -36,7 +36,7 @@ struct noop_hasher {
|
||||
};
|
||||
|
||||
class digester final {
|
||||
std::variant<noop_hasher, md5_hasher, xx_hasher> _impl;
|
||||
std::variant<noop_hasher, md5_hasher, xx_hasher, legacy_xx_hasher_without_null_digest> _impl;
|
||||
|
||||
public:
|
||||
explicit digester(digest_algorithm algo) {
|
||||
@@ -47,6 +47,9 @@ public:
|
||||
case digest_algorithm::xxHash:
|
||||
_impl = xx_hasher();
|
||||
break;
|
||||
case digest_algorithm::legacy_xxHash_without_null_digest:
|
||||
_impl = legacy_xx_hasher_without_null_digest();
|
||||
break;
|
||||
case digest_algorithm ::none:
|
||||
_impl = noop_hasher();
|
||||
break;
|
||||
|
||||
15
dist/common/scripts/scylla-housekeeping
vendored
15
dist/common/scripts/scylla-housekeeping
vendored
@@ -61,7 +61,15 @@ def sh_command(*args):
|
||||
return out
|
||||
|
||||
def get_url(path):
|
||||
return urllib.request.urlopen(path).read().decode('utf-8')
|
||||
# If server returns any error, like 403, or 500 urllib.request throws exception, which is not serializable.
|
||||
# When multiprocessing routines fail to serialize it, it throws ambiguous serialization exception
|
||||
# from get_json_from_url.
|
||||
# In order to see legit error we catch it from the inside of process, covert to string and
|
||||
# pass it as part of return value
|
||||
try:
|
||||
return 0, urllib.request.urlopen(path).read().decode('utf-8')
|
||||
except Exception as exc:
|
||||
return 1, str(exc)
|
||||
|
||||
def get_json_from_url(path):
|
||||
pool = mp.Pool(processes=1)
|
||||
@@ -71,13 +79,16 @@ def get_json_from_url(path):
|
||||
# to enforce a wallclock timeout.
|
||||
result = pool.apply_async(get_url, args=(path,))
|
||||
try:
|
||||
retval = result.get(timeout=5)
|
||||
status, retval = result.get(timeout=5)
|
||||
except mp.TimeoutError as err:
|
||||
pool.terminate()
|
||||
pool.join()
|
||||
raise
|
||||
if status == 1:
|
||||
raise RuntimeError(f'Failed to get "{path}" due to the following error: {retval}')
|
||||
return json.loads(retval)
|
||||
|
||||
|
||||
def get_api(path):
|
||||
return get_json_from_url("http://" + api_address + path)
|
||||
|
||||
|
||||
13
dist/common/scripts/scylla_setup
vendored
13
dist/common/scripts/scylla_setup
vendored
@@ -27,6 +27,7 @@ import glob
|
||||
import shutil
|
||||
import io
|
||||
import stat
|
||||
import distro
|
||||
from scylla_util import *
|
||||
|
||||
interactive = False
|
||||
@@ -385,6 +386,9 @@ if __name__ == '__main__':
|
||||
if not stat.S_ISBLK(os.stat(dsk).st_mode):
|
||||
print('{} is not block device'.format(dsk))
|
||||
continue
|
||||
if dsk in selected:
|
||||
print(f'{dsk} is already added')
|
||||
continue
|
||||
selected.append(dsk)
|
||||
devices.remove(dsk)
|
||||
disks = ','.join(selected)
|
||||
@@ -468,5 +472,10 @@ if __name__ == '__main__':
|
||||
print('Please restart your machine before using ScyllaDB, as you have disabled')
|
||||
print(' SELinux.')
|
||||
|
||||
if dist_name() == 'Ubuntu':
|
||||
run('apt-get install -y hugepages')
|
||||
if distro.id() == 'ubuntu':
|
||||
# Ubuntu version is 20.04 or later
|
||||
if int(distro.major_version()) >= 20:
|
||||
hugepkg = 'libhugetlbfs-bin'
|
||||
else:
|
||||
hugepkg = 'hugepages'
|
||||
run(f'apt-get install -y {hugepkg}')
|
||||
|
||||
4
dist/common/scripts/scylla_swap_setup
vendored
4
dist/common/scripts/scylla_swap_setup
vendored
@@ -40,6 +40,10 @@ if __name__ == '__main__':
|
||||
sys.exit(1)
|
||||
|
||||
memtotal = get_memtotal_gb()
|
||||
if memtotal == 0:
|
||||
print('memory too small: {} KB'.format(get_memtotal()))
|
||||
sys.exit(1)
|
||||
|
||||
# Scylla document says 'swap size should be set to either total_mem/3 or
|
||||
# 16GB - lower of the two', so we need to compare 16g vs memtotal/3 and
|
||||
# choose lower one
|
||||
|
||||
20
dist/common/scripts/scylla_util.py
vendored
20
dist/common/scripts/scylla_util.py
vendored
@@ -184,7 +184,7 @@ class aws_instance:
|
||||
instance_size = self.instance_size()
|
||||
if instance_class in ['c3', 'c4', 'd2', 'i2', 'r3']:
|
||||
return 'ixgbevf'
|
||||
if instance_class in ['a1', 'c5', 'c5d', 'f1', 'g3', 'g4', 'h1', 'i3', 'i3en', 'inf1', 'm5', 'm5a', 'm5ad', 'm5d', 'm5dn', 'm5n', 'm6g', 'p2', 'p3', 'r4', 'r5', 'r5a', 'r5ad', 'r5d', 'r5dn', 'r5n', 't3', 't3a', 'u-6tb1', 'u-9tb1', 'u-12tb1', 'u-18tn1', 'u-24tb1', 'x1', 'x1e', 'z1d']:
|
||||
if instance_class in ['a1', 'c5', 'c5a', 'c5d', 'c5n', 'c6g', 'c6gd', 'f1', 'g3', 'g4', 'h1', 'i3', 'i3en', 'inf1', 'm5', 'm5a', 'm5ad', 'm5d', 'm5dn', 'm5n', 'm6g', 'm6gd', 'p2', 'p3', 'r4', 'r5', 'r5a', 'r5ad', 'r5d', 'r5dn', 'r5n', 't3', 't3a', 'u-6tb1', 'u-9tb1', 'u-12tb1', 'u-18tn1', 'u-24tb1', 'x1', 'x1e', 'z1d']:
|
||||
return 'ena'
|
||||
if instance_class == 'm4':
|
||||
if instance_size == '16xlarge':
|
||||
@@ -331,7 +331,7 @@ class scylla_cpuinfo:
|
||||
|
||||
# When a CLI tool is not installed, use relocatable CLI tool provided by Scylla
|
||||
scylla_env = os.environ.copy()
|
||||
scylla_env['PATH'] = '{}:{}'.format(scylla_env['PATH'], scyllabindir())
|
||||
scylla_env['PATH'] = '{}:{}'.format(scyllabindir(), scylla_env['PATH'])
|
||||
|
||||
def run(cmd, shell=False, silent=False, exception=True):
|
||||
stdout = subprocess.DEVNULL if silent else None
|
||||
@@ -446,6 +446,19 @@ def dist_ver():
|
||||
return distro.version()
|
||||
|
||||
|
||||
SYSTEM_PARTITION_UUIDS = [
|
||||
'21686148-6449-6e6f-744e-656564454649', # BIOS boot partition
|
||||
'c12a7328-f81f-11d2-ba4b-00a0c93ec93b', # EFI system partition
|
||||
'024dee41-33e7-11d3-9d69-0008c781f39f' # MBR partition scheme
|
||||
]
|
||||
|
||||
def get_partition_uuid(dev):
|
||||
return out(f'lsblk -n -oPARTTYPE {dev}')
|
||||
|
||||
def is_system_partition(dev):
|
||||
uuid = get_partition_uuid(dev)
|
||||
return (uuid in SYSTEM_PARTITION_UUIDS)
|
||||
|
||||
def is_unused_disk(dev):
|
||||
# dev is not in /sys/class/block/, like /dev/nvme[0-9]+
|
||||
if not os.path.isdir('/sys/class/block/{dev}'.format(dev=dev.replace('/dev/', ''))):
|
||||
@@ -453,7 +466,8 @@ def is_unused_disk(dev):
|
||||
try:
|
||||
fd = os.open(dev, os.O_EXCL)
|
||||
os.close(fd)
|
||||
return True
|
||||
# dev is not reserved for system
|
||||
return not is_system_partition(dev)
|
||||
except OSError:
|
||||
return False
|
||||
|
||||
|
||||
1
dist/debian/debian/rules
vendored
1
dist/debian/debian/rules
vendored
@@ -39,6 +39,7 @@ override_dh_strip:
|
||||
# The binaries (ethtool...patchelf) don't pass dh_strip after going through patchelf. Since they are
|
||||
# already stripped, nothing is lost if we exclude them, so that's what we do.
|
||||
dh_strip -Xlibprotobuf.so.15 -Xld.so -Xethtool -Xgawk -Xgzip -Xhwloc-calc -Xhwloc-distrib -Xifconfig -Xlscpu -Xnetstat -Xpatchelf --dbg-package=$(product)-server-dbg
|
||||
find $(CURDIR)/debian/$(product)-server-dbg/usr/lib/debug/.build-id/ -name "*.debug" -exec objcopy --decompress-debug-sections {} \;
|
||||
|
||||
override_dh_makeshlibs:
|
||||
|
||||
|
||||
4
dist/docker/redhat/Dockerfile
vendored
4
dist/docker/redhat/Dockerfile
vendored
@@ -5,8 +5,8 @@ MAINTAINER Avi Kivity <avi@cloudius-systems.com>
|
||||
ENV container docker
|
||||
|
||||
# The SCYLLA_REPO_URL argument specifies the URL to the RPM repository this Docker image uses to install Scylla. The default value is the Scylla's unstable RPM repository, which contains the daily build.
|
||||
ARG SCYLLA_REPO_URL=http://downloads.scylladb.com/rpm/unstable/centos/master/latest/scylla.repo
|
||||
ARG VERSION=666.development
|
||||
ARG SCYLLA_REPO_URL=http://downloads.scylladb.com/rpm/unstable/centos/scylla-4.2/latest/scylla.repo
|
||||
ARG VERSION=4.2
|
||||
|
||||
ADD scylla_bashrc /scylla_bashrc
|
||||
|
||||
|
||||
@@ -25,6 +25,15 @@ By default, Scylla listens on this port on all network interfaces.
|
||||
To listen only on a specific interface, pass also an "`alternator-address`"
|
||||
option.
|
||||
|
||||
In addition to (or instead of) serving HTTP requests on `alternator-port`,
|
||||
Scylla can accept DynamoDB API requests over HTTPS (encrypted), on the port
|
||||
specified by `alternator-https-port`. As usual for HTTPS servers, the
|
||||
operator must specify certificate and key files. By default these should
|
||||
be placed in `/etc/scylla/scylla.crt` and `/etc/scylla/scylla.key`, but
|
||||
these default locations can overridden by specifying
|
||||
`--alternator-encryption-options keyfile="..."` and
|
||||
`--alternator-encryption-options certificate="..."`.
|
||||
|
||||
As we explain below in the "Write isolation policies", Alternator has
|
||||
four different choices for the implementation of writes, each with
|
||||
different advantages. You should consider which of the options makes
|
||||
|
||||
@@ -137,10 +137,9 @@ TODO: is there an SSL version of Thrift?
|
||||
|
||||
# DynamoDB client protocol
|
||||
|
||||
Scylla also supports, as an experimental feature, Amazon's DynamoDB API.
|
||||
The DynamoDB API is a JSON over HTTP (unencrypted) or HTTPS (encrypted)
|
||||
protocol. Because Scylla's support for this protocol is experimental,
|
||||
it is not turned on by default, and must be turned on manually by setting
|
||||
Scylla also supports Amazon's DynamoDB API. The DynamoDB API is a JSON over
|
||||
HTTP (unencrypted) or HTTPS (encrypted) protocol. Support for this protocol
|
||||
is not turned on by default, and must be turned on manually by setting
|
||||
the `alternator_port` and/or `alternator_https_port` configuration option.
|
||||
"Alternator" is the codename of Scylla's DynamoDB API support, and is
|
||||
documented in more detail in [alternator.md](alternator/alternator.md).
|
||||
@@ -153,6 +152,13 @@ There is also an `alternator_address` configuration option to set the IP
|
||||
address (and therefore network interface) on which Scylla should listen
|
||||
for the DynamoDB protocol. This address defaults to 0.0.0.0.
|
||||
|
||||
When the HTTPS-based protocol is enabled, the server also needs to know
|
||||
the certificate and key files to use. The default locations of these files
|
||||
are `/etc/scylla/scylla.crt` and `/etc/scylla/scylla.key` respectively, but
|
||||
can be overridden by specifying in `alternator_encryption_options` the
|
||||
`keyfile` and `certificate` options. For example,
|
||||
`--alternator-encryption-options keyfile="..."`.
|
||||
|
||||
# Redis client protocol
|
||||
|
||||
Scylla also has partial and experimental support for the Redis API.
|
||||
|
||||
@@ -468,6 +468,9 @@ public:
|
||||
size_t buffer_size() const {
|
||||
return _impl->buffer_size();
|
||||
}
|
||||
const circular_buffer<mutation_fragment>& buffer() const {
|
||||
return _impl->buffer();
|
||||
}
|
||||
// Detach the internal buffer of the reader.
|
||||
// Roughly equivalent to depleting it by calling pop_mutation_fragment()
|
||||
// until is_buffer_empty() returns true.
|
||||
|
||||
@@ -141,6 +141,7 @@ extern const std::string_view HINTED_HANDOFF_SEPARATE_CONNECTION;
|
||||
extern const std::string_view LWT;
|
||||
extern const std::string_view PER_TABLE_PARTITIONERS;
|
||||
extern const std::string_view PER_TABLE_CACHING;
|
||||
extern const std::string_view DIGEST_FOR_NULL_VALUES;
|
||||
|
||||
}
|
||||
|
||||
|
||||
@@ -56,6 +56,7 @@ constexpr std::string_view features::HINTED_HANDOFF_SEPARATE_CONNECTION = "HINTE
|
||||
constexpr std::string_view features::LWT = "LWT";
|
||||
constexpr std::string_view features::PER_TABLE_PARTITIONERS = "PER_TABLE_PARTITIONERS";
|
||||
constexpr std::string_view features::PER_TABLE_CACHING = "PER_TABLE_CACHING";
|
||||
constexpr std::string_view features::DIGEST_FOR_NULL_VALUES = "DIGEST_FOR_NULL_VALUES";
|
||||
|
||||
static logging::logger logger("features");
|
||||
|
||||
@@ -90,8 +91,9 @@ feature_service::feature_service(feature_config cfg) : _config(cfg)
|
||||
, _hinted_handoff_separate_connection(*this, features::HINTED_HANDOFF_SEPARATE_CONNECTION)
|
||||
, _lwt_feature(*this, features::LWT)
|
||||
, _per_table_partitioners_feature(*this, features::PER_TABLE_PARTITIONERS)
|
||||
, _per_table_caching_feature(*this, features::PER_TABLE_CACHING) {
|
||||
}
|
||||
, _per_table_caching_feature(*this, features::PER_TABLE_CACHING)
|
||||
, _digest_for_null_values_feature(*this, features::DIGEST_FOR_NULL_VALUES)
|
||||
{}
|
||||
|
||||
feature_config feature_config_from_db_config(db::config& cfg, std::set<sstring> disabled) {
|
||||
feature_config fcfg;
|
||||
@@ -179,6 +181,7 @@ std::set<std::string_view> feature_service::known_feature_set() {
|
||||
gms::features::MC_SSTABLE,
|
||||
gms::features::UDF,
|
||||
gms::features::CDC,
|
||||
gms::features::DIGEST_FOR_NULL_VALUES,
|
||||
};
|
||||
|
||||
for (const sstring& s : _config._disabled_features) {
|
||||
@@ -269,6 +272,7 @@ void feature_service::enable(const std::set<std::string_view>& list) {
|
||||
std::ref(_lwt_feature),
|
||||
std::ref(_per_table_partitioners_feature),
|
||||
std::ref(_per_table_caching_feature),
|
||||
std::ref(_digest_for_null_values_feature),
|
||||
})
|
||||
{
|
||||
if (list.count(f.name())) {
|
||||
|
||||
@@ -103,6 +103,7 @@ private:
|
||||
gms::feature _lwt_feature;
|
||||
gms::feature _per_table_partitioners_feature;
|
||||
gms::feature _per_table_caching_feature;
|
||||
gms::feature _digest_for_null_values_feature;
|
||||
|
||||
public:
|
||||
bool cluster_supports_range_tombstones() const {
|
||||
@@ -177,6 +178,10 @@ public:
|
||||
return _per_table_caching_feature;
|
||||
}
|
||||
|
||||
const feature& cluster_supports_digest_for_null_values() const {
|
||||
return _digest_for_null_values_feature;
|
||||
}
|
||||
|
||||
bool cluster_supports_row_level_repair() const {
|
||||
return bool(_row_level_repair_feature);
|
||||
}
|
||||
|
||||
@@ -428,6 +428,7 @@ future<> gossiper::handle_shutdown_msg(inet_address from) {
|
||||
return make_ready_future<>();
|
||||
}
|
||||
return seastar::async([this, from] {
|
||||
auto permit = this->lock_endpoint(from).get0();
|
||||
this->mark_as_shutdown(from);
|
||||
});
|
||||
}
|
||||
|
||||
@@ -98,6 +98,7 @@ fedora_packages=(
|
||||
debhelper
|
||||
fakeroot
|
||||
file
|
||||
dpkg-dev
|
||||
)
|
||||
|
||||
centos_packages=(
|
||||
|
||||
@@ -132,6 +132,7 @@ relocate_python3() {
|
||||
cp "$script" "$relocateddir"
|
||||
cat > "$install"<<EOF
|
||||
#!/usr/bin/env bash
|
||||
export LC_ALL=en_US.UTF-8
|
||||
x="\$(readlink -f "\$0")"
|
||||
b="\$(basename "\$x")"
|
||||
d="\$(dirname "\$x")"
|
||||
|
||||
@@ -168,15 +168,33 @@ insert_token_range_to_sorted_container_while_unwrapping(
|
||||
|
||||
dht::token_range_vector
|
||||
abstract_replication_strategy::get_ranges(inet_address ep) const {
|
||||
return get_ranges(ep, _token_metadata);
|
||||
return do_get_ranges(ep, _token_metadata, false);
|
||||
}
|
||||
|
||||
dht::token_range_vector
|
||||
abstract_replication_strategy::get_ranges_in_thread(inet_address ep) const {
|
||||
return do_get_ranges(ep, _token_metadata, true);
|
||||
}
|
||||
|
||||
dht::token_range_vector
|
||||
abstract_replication_strategy::get_ranges(inet_address ep, token_metadata& tm) const {
|
||||
return do_get_ranges(ep, tm, false);
|
||||
}
|
||||
|
||||
dht::token_range_vector
|
||||
abstract_replication_strategy::get_ranges_in_thread(inet_address ep, token_metadata& tm) const {
|
||||
return do_get_ranges(ep, tm, true);
|
||||
}
|
||||
|
||||
dht::token_range_vector
|
||||
abstract_replication_strategy::do_get_ranges(inet_address ep, token_metadata& tm, bool can_yield) const {
|
||||
dht::token_range_vector ret;
|
||||
auto prev_tok = tm.sorted_tokens().back();
|
||||
for (auto tok : tm.sorted_tokens()) {
|
||||
for (inet_address a : calculate_natural_endpoints(tok, tm)) {
|
||||
if (can_yield) {
|
||||
seastar::thread::maybe_yield();
|
||||
}
|
||||
if (a == ep) {
|
||||
insert_token_range_to_sorted_container_while_unwrapping(prev_tok, tok, ret);
|
||||
break;
|
||||
|
||||
@@ -113,10 +113,15 @@ public:
|
||||
// It the analogue of Origin's getAddressRanges().get(endpoint).
|
||||
// This function is not efficient, and not meant for the fast path.
|
||||
dht::token_range_vector get_ranges(inet_address ep) const;
|
||||
dht::token_range_vector get_ranges_in_thread(inet_address ep) const;
|
||||
|
||||
// Use the token_metadata provided by the caller instead of _token_metadata
|
||||
dht::token_range_vector get_ranges(inet_address ep, token_metadata& tm) const;
|
||||
dht::token_range_vector get_ranges_in_thread(inet_address ep, token_metadata& tm) const;
|
||||
private:
|
||||
dht::token_range_vector do_get_ranges(inet_address ep, token_metadata& tm, bool can_yield) const;
|
||||
|
||||
public:
|
||||
// get_primary_ranges() returns the list of "primary ranges" for the given
|
||||
// endpoint. "Primary ranges" are the ranges that the node is responsible
|
||||
// for storing replica primarily, which means this is the first node
|
||||
|
||||
10
lua.cc
10
lua.cc
@@ -262,14 +262,12 @@ static auto visit_lua_raw_value(lua_State* l, int index, Func&& f) {
|
||||
|
||||
template <typename Func>
|
||||
static auto visit_decimal(const big_decimal &v, Func&& f) {
|
||||
boost::multiprecision::cpp_int ten(10);
|
||||
const auto& dividend = v.unscaled_value();
|
||||
auto divisor = boost::multiprecision::pow(ten, v.scale());
|
||||
boost::multiprecision::cpp_rational r = v.as_rational();
|
||||
const boost::multiprecision::cpp_int& dividend = numerator(r);
|
||||
const boost::multiprecision::cpp_int& divisor = denominator(r);
|
||||
if (dividend % divisor == 0) {
|
||||
return f(utils::multiprecision_int(boost::multiprecision::cpp_int(dividend/divisor)));
|
||||
return f(utils::multiprecision_int(dividend/divisor));
|
||||
}
|
||||
boost::multiprecision::cpp_rational r = dividend;
|
||||
r /= divisor;
|
||||
return f(r.convert_to<double>());
|
||||
}
|
||||
|
||||
|
||||
42
main.cc
42
main.cc
@@ -830,6 +830,7 @@ int main(int ac, char** av) {
|
||||
storage_proxy_smp_service_group_config.max_nonlocal_requests = 5000;
|
||||
spcfg.read_smp_service_group = create_smp_service_group(storage_proxy_smp_service_group_config).get0();
|
||||
spcfg.write_smp_service_group = create_smp_service_group(storage_proxy_smp_service_group_config).get0();
|
||||
spcfg.hints_write_smp_service_group = create_smp_service_group(storage_proxy_smp_service_group_config).get0();
|
||||
spcfg.write_ack_smp_service_group = create_smp_service_group(storage_proxy_smp_service_group_config).get0();
|
||||
static db::view::node_update_backlog node_backlog(smp::count, 10ms);
|
||||
scheduling_group_key_config storage_proxy_stats_cfg =
|
||||
@@ -967,12 +968,16 @@ int main(int ac, char** av) {
|
||||
mm.init_messaging_service();
|
||||
}).get();
|
||||
supervisor::notify("initializing storage proxy RPC verbs");
|
||||
proxy.invoke_on_all([] (service::storage_proxy& p) {
|
||||
p.init_messaging_service();
|
||||
}).get();
|
||||
proxy.invoke_on_all(&service::storage_proxy::init_messaging_service).get();
|
||||
auto stop_proxy_handlers = defer_verbose_shutdown("storage proxy RPC verbs", [&proxy] {
|
||||
proxy.invoke_on_all(&service::storage_proxy::uninit_messaging_service).get();
|
||||
});
|
||||
|
||||
supervisor::notify("starting streaming service");
|
||||
streaming::stream_session::init_streaming_service(db, sys_dist_ks, view_update_generator).get();
|
||||
auto stop_streaming_service = defer_verbose_shutdown("streaming service", [] {
|
||||
streaming::stream_session::uninit_streaming_service().get();
|
||||
});
|
||||
api::set_server_stream_manager(ctx).get();
|
||||
|
||||
supervisor::notify("starting hinted handoff manager");
|
||||
@@ -1005,6 +1010,9 @@ int main(int ac, char** av) {
|
||||
rs.stop().get();
|
||||
});
|
||||
repair_init_messaging_service_handler(rs, sys_dist_ks, view_update_generator).get();
|
||||
auto stop_repair_messages = defer_verbose_shutdown("repair message handlers", [] {
|
||||
repair_uninit_messaging_service_handler().get();
|
||||
});
|
||||
supervisor::notify("starting storage service", true);
|
||||
auto& ss = service::get_local_storage_service();
|
||||
ss.init_messaging_service_part().get();
|
||||
@@ -1196,22 +1204,30 @@ int main(int ac, char** av) {
|
||||
std::optional<uint16_t> alternator_https_port;
|
||||
std::optional<tls::credentials_builder> creds;
|
||||
if (cfg->alternator_https_port()) {
|
||||
creds.emplace();
|
||||
alternator_https_port = cfg->alternator_https_port();
|
||||
creds->set_dh_level(tls::dh_params::level::MEDIUM);
|
||||
creds->set_x509_key_file(cert, key, tls::x509_crt_format::PEM).get();
|
||||
if (trust_store.empty()) {
|
||||
creds->set_system_trust().get();
|
||||
} else {
|
||||
creds->set_x509_trust_file(trust_store, tls::x509_crt_format::PEM).get();
|
||||
creds.emplace();
|
||||
auto opts = cfg->alternator_encryption_options();
|
||||
if (opts.empty()) {
|
||||
// Earlier versions mistakenly configured Alternator's
|
||||
// HTTPS parameters via the "server_encryption_option"
|
||||
// configuration parameter. We *temporarily* continue
|
||||
// to allow this, for backward compatibility.
|
||||
opts = cfg->server_encryption_options();
|
||||
if (!opts.empty()) {
|
||||
startlog.warn("Setting server_encryption_options to configure "
|
||||
"Alternator's HTTPS encryption is deprecated. Please "
|
||||
"switch to setting alternator_encryption_options instead.");
|
||||
}
|
||||
}
|
||||
creds->set_dh_level(tls::dh_params::level::MEDIUM);
|
||||
auto cert = get_or_default(opts, "certificate", db::config::get_conf_sub("scylla.crt").string());
|
||||
auto key = get_or_default(opts, "keyfile", db::config::get_conf_sub("scylla.key").string());
|
||||
creds->set_x509_key_file(cert, key, tls::x509_crt_format::PEM).get();
|
||||
auto prio = get_or_default(opts, "priority_string", sstring());
|
||||
creds->set_priority_string(db::config::default_tls_priority);
|
||||
if (!prio.empty()) {
|
||||
creds->set_priority_string(prio);
|
||||
}
|
||||
if (clauth) {
|
||||
creds->set_client_auth(seastar::tls::client_auth::REQUIRE);
|
||||
}
|
||||
}
|
||||
bool alternator_enforce_authorization = cfg->alternator_enforce_authorization();
|
||||
with_scheduling_group(dbcfg.statement_scheduling_group,
|
||||
|
||||
@@ -572,7 +572,12 @@ messaging_service::initial_scheduling_info() const {
|
||||
|
||||
scheduling_group
|
||||
messaging_service::scheduling_group_for_verb(messaging_verb verb) const {
|
||||
return _scheduling_info_for_connection_index[get_rpc_client_idx(verb)].sched_group;
|
||||
// We are not using get_rpc_client_idx() because it figures out the client
|
||||
// index based on the current scheduling group, which is relevant when
|
||||
// selecting the right client for sending a message, but is not relevant
|
||||
// when registering handlers.
|
||||
const auto idx = s_rpc_client_idx_table[static_cast<size_t>(verb)];
|
||||
return _scheduling_info_for_connection_index[idx].sched_group;
|
||||
}
|
||||
|
||||
scheduling_group
|
||||
@@ -791,6 +796,10 @@ void messaging_service::register_stream_mutation_fragments(std::function<future<
|
||||
register_handler(this, messaging_verb::STREAM_MUTATION_FRAGMENTS, std::move(func));
|
||||
}
|
||||
|
||||
future<> messaging_service::unregister_stream_mutation_fragments() {
|
||||
return unregister_handler(messaging_verb::STREAM_MUTATION_FRAGMENTS);
|
||||
}
|
||||
|
||||
template<class SinkType, class SourceType>
|
||||
future<rpc::sink<SinkType>, rpc::source<SourceType>>
|
||||
do_make_sink_source(messaging_verb verb, uint32_t repair_meta_id, shared_ptr<messaging_service::rpc_protocol_client_wrapper> rpc_client, std::unique_ptr<messaging_service::rpc_protocol_wrapper>& rpc) {
|
||||
@@ -822,6 +831,9 @@ rpc::sink<repair_row_on_wire_with_cmd> messaging_service::make_sink_for_repair_g
|
||||
void messaging_service::register_repair_get_row_diff_with_rpc_stream(std::function<future<rpc::sink<repair_row_on_wire_with_cmd>> (const rpc::client_info& cinfo, uint32_t repair_meta_id, rpc::source<repair_hash_with_cmd> source)>&& func) {
|
||||
register_handler(this, messaging_verb::REPAIR_GET_ROW_DIFF_WITH_RPC_STREAM, std::move(func));
|
||||
}
|
||||
future<> messaging_service::unregister_repair_get_row_diff_with_rpc_stream() {
|
||||
return unregister_handler(messaging_verb::REPAIR_GET_ROW_DIFF_WITH_RPC_STREAM);
|
||||
}
|
||||
|
||||
// Wrapper for REPAIR_PUT_ROW_DIFF_WITH_RPC_STREAM
|
||||
future<rpc::sink<repair_row_on_wire_with_cmd>, rpc::source<repair_stream_cmd>>
|
||||
@@ -841,6 +853,9 @@ rpc::sink<repair_stream_cmd> messaging_service::make_sink_for_repair_put_row_dif
|
||||
void messaging_service::register_repair_put_row_diff_with_rpc_stream(std::function<future<rpc::sink<repair_stream_cmd>> (const rpc::client_info& cinfo, uint32_t repair_meta_id, rpc::source<repair_row_on_wire_with_cmd> source)>&& func) {
|
||||
register_handler(this, messaging_verb::REPAIR_PUT_ROW_DIFF_WITH_RPC_STREAM, std::move(func));
|
||||
}
|
||||
future<> messaging_service::unregister_repair_put_row_diff_with_rpc_stream() {
|
||||
return unregister_handler(messaging_verb::REPAIR_PUT_ROW_DIFF_WITH_RPC_STREAM);
|
||||
}
|
||||
|
||||
// Wrapper for REPAIR_GET_FULL_ROW_HASHES_WITH_RPC_STREAM
|
||||
future<rpc::sink<repair_stream_cmd>, rpc::source<repair_hash_with_cmd>>
|
||||
@@ -860,6 +875,9 @@ rpc::sink<repair_hash_with_cmd> messaging_service::make_sink_for_repair_get_full
|
||||
void messaging_service::register_repair_get_full_row_hashes_with_rpc_stream(std::function<future<rpc::sink<repair_hash_with_cmd>> (const rpc::client_info& cinfo, uint32_t repair_meta_id, rpc::source<repair_stream_cmd> source)>&& func) {
|
||||
register_handler(this, messaging_verb::REPAIR_GET_FULL_ROW_HASHES_WITH_RPC_STREAM, std::move(func));
|
||||
}
|
||||
future<> messaging_service::unregister_repair_get_full_row_hashes_with_rpc_stream() {
|
||||
return unregister_handler(messaging_verb::REPAIR_GET_FULL_ROW_HASHES_WITH_RPC_STREAM);
|
||||
}
|
||||
|
||||
// Send a message for verb
|
||||
template <typename MsgIn, typename... MsgOut>
|
||||
@@ -943,6 +961,9 @@ future<streaming::prepare_message> messaging_service::send_prepare_message(msg_a
|
||||
return send_message<streaming::prepare_message>(this, messaging_verb::PREPARE_MESSAGE, id,
|
||||
std::move(msg), plan_id, std::move(description), reason);
|
||||
}
|
||||
future<> messaging_service::unregister_prepare_message() {
|
||||
return unregister_handler(messaging_verb::PREPARE_MESSAGE);
|
||||
}
|
||||
|
||||
// PREPARE_DONE_MESSAGE
|
||||
void messaging_service::register_prepare_done_message(std::function<future<> (const rpc::client_info& cinfo, UUID plan_id, unsigned dst_cpu_id)>&& func) {
|
||||
@@ -952,6 +973,9 @@ future<> messaging_service::send_prepare_done_message(msg_addr id, UUID plan_id,
|
||||
return send_message<void>(this, messaging_verb::PREPARE_DONE_MESSAGE, id,
|
||||
plan_id, dst_cpu_id);
|
||||
}
|
||||
future<> messaging_service::unregister_prepare_done_message() {
|
||||
return unregister_handler(messaging_verb::PREPARE_DONE_MESSAGE);
|
||||
}
|
||||
|
||||
// STREAM_MUTATION
|
||||
void messaging_service::register_stream_mutation(std::function<future<> (const rpc::client_info& cinfo, UUID plan_id, frozen_mutation fm, unsigned dst_cpu_id, rpc::optional<bool> fragmented, rpc::optional<streaming::stream_reason> reason)>&& func) {
|
||||
@@ -976,6 +1000,9 @@ future<> messaging_service::send_stream_mutation_done(msg_addr id, UUID plan_id,
|
||||
return send_message<void>(this, messaging_verb::STREAM_MUTATION_DONE, id,
|
||||
plan_id, std::move(ranges), cf_id, dst_cpu_id);
|
||||
}
|
||||
future<> messaging_service::unregister_stream_mutation_done() {
|
||||
return unregister_handler(messaging_verb::STREAM_MUTATION_DONE);
|
||||
}
|
||||
|
||||
// COMPLETE_MESSAGE
|
||||
void messaging_service::register_complete_message(std::function<future<> (const rpc::client_info& cinfo, UUID plan_id, unsigned dst_cpu_id, rpc::optional<bool> failed)>&& func) {
|
||||
@@ -985,6 +1012,9 @@ future<> messaging_service::send_complete_message(msg_addr id, UUID plan_id, uns
|
||||
return send_message<void>(this, messaging_verb::COMPLETE_MESSAGE, id,
|
||||
plan_id, dst_cpu_id, failed);
|
||||
}
|
||||
future<> messaging_service::unregister_complete_message() {
|
||||
return unregister_handler(messaging_verb::COMPLETE_MESSAGE);
|
||||
}
|
||||
|
||||
void messaging_service::register_gossip_echo(std::function<future<> ()>&& func) {
|
||||
register_handler(this, messaging_verb::GOSSIP_ECHO, std::move(func));
|
||||
@@ -1199,14 +1229,14 @@ future<partition_checksum> messaging_service::send_repair_checksum_range(
|
||||
}
|
||||
|
||||
// Wrapper for REPAIR_GET_FULL_ROW_HASHES
|
||||
void messaging_service::register_repair_get_full_row_hashes(std::function<future<std::unordered_set<repair_hash>> (const rpc::client_info& cinfo, uint32_t repair_meta_id)>&& func) {
|
||||
void messaging_service::register_repair_get_full_row_hashes(std::function<future<repair_hash_set> (const rpc::client_info& cinfo, uint32_t repair_meta_id)>&& func) {
|
||||
register_handler(this, messaging_verb::REPAIR_GET_FULL_ROW_HASHES, std::move(func));
|
||||
}
|
||||
future<> messaging_service::unregister_repair_get_full_row_hashes() {
|
||||
return unregister_handler(messaging_verb::REPAIR_GET_FULL_ROW_HASHES);
|
||||
}
|
||||
future<std::unordered_set<repair_hash>> messaging_service::send_repair_get_full_row_hashes(msg_addr id, uint32_t repair_meta_id) {
|
||||
return send_message<future<std::unordered_set<repair_hash>>>(this, messaging_verb::REPAIR_GET_FULL_ROW_HASHES, std::move(id), repair_meta_id);
|
||||
future<repair_hash_set> messaging_service::send_repair_get_full_row_hashes(msg_addr id, uint32_t repair_meta_id) {
|
||||
return send_message<future<repair_hash_set>>(this, messaging_verb::REPAIR_GET_FULL_ROW_HASHES, std::move(id), repair_meta_id);
|
||||
}
|
||||
|
||||
// Wrapper for REPAIR_GET_COMBINED_ROW_HASH
|
||||
@@ -1231,13 +1261,13 @@ future<get_sync_boundary_response> messaging_service::send_repair_get_sync_bound
|
||||
}
|
||||
|
||||
// Wrapper for REPAIR_GET_ROW_DIFF
|
||||
void messaging_service::register_repair_get_row_diff(std::function<future<repair_rows_on_wire> (const rpc::client_info& cinfo, uint32_t repair_meta_id, std::unordered_set<repair_hash> set_diff, bool needs_all_rows)>&& func) {
|
||||
void messaging_service::register_repair_get_row_diff(std::function<future<repair_rows_on_wire> (const rpc::client_info& cinfo, uint32_t repair_meta_id, repair_hash_set set_diff, bool needs_all_rows)>&& func) {
|
||||
register_handler(this, messaging_verb::REPAIR_GET_ROW_DIFF, std::move(func));
|
||||
}
|
||||
future<> messaging_service::unregister_repair_get_row_diff() {
|
||||
return unregister_handler(messaging_verb::REPAIR_GET_ROW_DIFF);
|
||||
}
|
||||
future<repair_rows_on_wire> messaging_service::send_repair_get_row_diff(msg_addr id, uint32_t repair_meta_id, std::unordered_set<repair_hash> set_diff, bool needs_all_rows) {
|
||||
future<repair_rows_on_wire> messaging_service::send_repair_get_row_diff(msg_addr id, uint32_t repair_meta_id, repair_hash_set set_diff, bool needs_all_rows) {
|
||||
return send_message<future<repair_rows_on_wire>>(this, messaging_verb::REPAIR_GET_ROW_DIFF, std::move(id), repair_meta_id, std::move(set_diff), needs_all_rows);
|
||||
}
|
||||
|
||||
|
||||
@@ -297,10 +297,12 @@ public:
|
||||
streaming::prepare_message msg, UUID plan_id, sstring description, rpc::optional<streaming::stream_reason> reason)>&& func);
|
||||
future<streaming::prepare_message> send_prepare_message(msg_addr id, streaming::prepare_message msg, UUID plan_id,
|
||||
sstring description, streaming::stream_reason);
|
||||
future<> unregister_prepare_message();
|
||||
|
||||
// Wrapper for PREPARE_DONE_MESSAGE verb
|
||||
void register_prepare_done_message(std::function<future<> (const rpc::client_info& cinfo, UUID plan_id, unsigned dst_cpu_id)>&& func);
|
||||
future<> send_prepare_done_message(msg_addr id, UUID plan_id, unsigned dst_cpu_id);
|
||||
future<> unregister_prepare_done_message();
|
||||
|
||||
// Wrapper for STREAM_MUTATION verb
|
||||
void register_stream_mutation(std::function<future<> (const rpc::client_info& cinfo, UUID plan_id, frozen_mutation fm, unsigned dst_cpu_id, rpc::optional<bool>, rpc::optional<streaming::stream_reason>)>&& func);
|
||||
@@ -309,6 +311,7 @@ public:
|
||||
// Wrapper for STREAM_MUTATION_FRAGMENTS
|
||||
// The receiver of STREAM_MUTATION_FRAGMENTS sends status code to the sender to notify any error on the receiver side. The status code is of type int32_t. 0 means successful, -1 means error, other status code value are reserved for future use.
|
||||
void register_stream_mutation_fragments(std::function<future<rpc::sink<int32_t>> (const rpc::client_info& cinfo, UUID plan_id, UUID schema_id, UUID cf_id, uint64_t estimated_partitions, rpc::optional<streaming::stream_reason> reason_opt, rpc::source<frozen_mutation_fragment, rpc::optional<streaming::stream_mutation_fragments_cmd>> source)>&& func);
|
||||
future<> unregister_stream_mutation_fragments();
|
||||
rpc::sink<int32_t> make_sink_for_stream_mutation_fragments(rpc::source<frozen_mutation_fragment, rpc::optional<streaming::stream_mutation_fragments_cmd>>& source);
|
||||
future<rpc::sink<frozen_mutation_fragment, streaming::stream_mutation_fragments_cmd>, rpc::source<int32_t>> make_sink_and_source_for_stream_mutation_fragments(utils::UUID schema_id, utils::UUID plan_id, utils::UUID cf_id, uint64_t estimated_partitions, streaming::stream_reason reason, msg_addr id);
|
||||
|
||||
@@ -316,22 +319,27 @@ public:
|
||||
future<rpc::sink<repair_hash_with_cmd>, rpc::source<repair_row_on_wire_with_cmd>> make_sink_and_source_for_repair_get_row_diff_with_rpc_stream(uint32_t repair_meta_id, msg_addr id);
|
||||
rpc::sink<repair_row_on_wire_with_cmd> make_sink_for_repair_get_row_diff_with_rpc_stream(rpc::source<repair_hash_with_cmd>& source);
|
||||
void register_repair_get_row_diff_with_rpc_stream(std::function<future<rpc::sink<repair_row_on_wire_with_cmd>> (const rpc::client_info& cinfo, uint32_t repair_meta_id, rpc::source<repair_hash_with_cmd> source)>&& func);
|
||||
future<> unregister_repair_get_row_diff_with_rpc_stream();
|
||||
|
||||
// Wrapper for REPAIR_PUT_ROW_DIFF_WITH_RPC_STREAM
|
||||
future<rpc::sink<repair_row_on_wire_with_cmd>, rpc::source<repair_stream_cmd>> make_sink_and_source_for_repair_put_row_diff_with_rpc_stream(uint32_t repair_meta_id, msg_addr id);
|
||||
rpc::sink<repair_stream_cmd> make_sink_for_repair_put_row_diff_with_rpc_stream(rpc::source<repair_row_on_wire_with_cmd>& source);
|
||||
void register_repair_put_row_diff_with_rpc_stream(std::function<future<rpc::sink<repair_stream_cmd>> (const rpc::client_info& cinfo, uint32_t repair_meta_id, rpc::source<repair_row_on_wire_with_cmd> source)>&& func);
|
||||
future<> unregister_repair_put_row_diff_with_rpc_stream();
|
||||
|
||||
// Wrapper for REPAIR_GET_FULL_ROW_HASHES_WITH_RPC_STREAM
|
||||
future<rpc::sink<repair_stream_cmd>, rpc::source<repair_hash_with_cmd>> make_sink_and_source_for_repair_get_full_row_hashes_with_rpc_stream(uint32_t repair_meta_id, msg_addr id);
|
||||
rpc::sink<repair_hash_with_cmd> make_sink_for_repair_get_full_row_hashes_with_rpc_stream(rpc::source<repair_stream_cmd>& source);
|
||||
void register_repair_get_full_row_hashes_with_rpc_stream(std::function<future<rpc::sink<repair_hash_with_cmd>> (const rpc::client_info& cinfo, uint32_t repair_meta_id, rpc::source<repair_stream_cmd> source)>&& func);
|
||||
future<> unregister_repair_get_full_row_hashes_with_rpc_stream();
|
||||
|
||||
void register_stream_mutation_done(std::function<future<> (const rpc::client_info& cinfo, UUID plan_id, dht::token_range_vector ranges, UUID cf_id, unsigned dst_cpu_id)>&& func);
|
||||
future<> send_stream_mutation_done(msg_addr id, UUID plan_id, dht::token_range_vector ranges, UUID cf_id, unsigned dst_cpu_id);
|
||||
future<> unregister_stream_mutation_done();
|
||||
|
||||
void register_complete_message(std::function<future<> (const rpc::client_info& cinfo, UUID plan_id, unsigned dst_cpu_id, rpc::optional<bool> failed)>&& func);
|
||||
future<> send_complete_message(msg_addr id, UUID plan_id, unsigned dst_cpu_id, bool failed = false);
|
||||
future<> unregister_complete_message();
|
||||
|
||||
// Wrapper for REPAIR_CHECKSUM_RANGE verb
|
||||
void register_repair_checksum_range(std::function<future<partition_checksum> (sstring keyspace, sstring cf, dht::token_range range, rpc::optional<repair_checksum> hash_version)>&& func);
|
||||
@@ -339,9 +347,9 @@ public:
|
||||
future<partition_checksum> send_repair_checksum_range(msg_addr id, sstring keyspace, sstring cf, dht::token_range range, repair_checksum hash_version);
|
||||
|
||||
// Wrapper for REPAIR_GET_FULL_ROW_HASHES
|
||||
void register_repair_get_full_row_hashes(std::function<future<std::unordered_set<repair_hash>> (const rpc::client_info& cinfo, uint32_t repair_meta_id)>&& func);
|
||||
void register_repair_get_full_row_hashes(std::function<future<repair_hash_set> (const rpc::client_info& cinfo, uint32_t repair_meta_id)>&& func);
|
||||
future<> unregister_repair_get_full_row_hashes();
|
||||
future<std::unordered_set<repair_hash>> send_repair_get_full_row_hashes(msg_addr id, uint32_t repair_meta_id);
|
||||
future<repair_hash_set> send_repair_get_full_row_hashes(msg_addr id, uint32_t repair_meta_id);
|
||||
|
||||
// Wrapper for REPAIR_GET_COMBINED_ROW_HASH
|
||||
void register_repair_get_combined_row_hash(std::function<future<get_combined_row_hash_response> (const rpc::client_info& cinfo, uint32_t repair_meta_id, std::optional<repair_sync_boundary> common_sync_boundary)>&& func);
|
||||
@@ -354,9 +362,9 @@ public:
|
||||
future<get_sync_boundary_response> send_repair_get_sync_boundary(msg_addr id, uint32_t repair_meta_id, std::optional<repair_sync_boundary> skipped_sync_boundary);
|
||||
|
||||
// Wrapper for REPAIR_GET_ROW_DIFF
|
||||
void register_repair_get_row_diff(std::function<future<repair_rows_on_wire> (const rpc::client_info& cinfo, uint32_t repair_meta_id, std::unordered_set<repair_hash> set_diff, bool needs_all_rows)>&& func);
|
||||
void register_repair_get_row_diff(std::function<future<repair_rows_on_wire> (const rpc::client_info& cinfo, uint32_t repair_meta_id, repair_hash_set set_diff, bool needs_all_rows)>&& func);
|
||||
future<> unregister_repair_get_row_diff();
|
||||
future<repair_rows_on_wire> send_repair_get_row_diff(msg_addr id, uint32_t repair_meta_id, std::unordered_set<repair_hash> set_diff, bool needs_all_rows);
|
||||
future<repair_rows_on_wire> send_repair_get_row_diff(msg_addr id, uint32_t repair_meta_id, repair_hash_set set_diff, bool needs_all_rows);
|
||||
|
||||
// Wrapper for REPAIR_PUT_ROW_DIFF
|
||||
void register_repair_put_row_diff(std::function<future<> (const rpc::client_info& cinfo, uint32_t repair_meta_id, repair_rows_on_wire row_diff)>&& func);
|
||||
|
||||
@@ -300,10 +300,9 @@ flat_mutation_reader read_context::create_reader(
|
||||
}
|
||||
|
||||
auto& table = _db.local().find_column_family(schema);
|
||||
auto class_config = _db.local().make_query_class_config();
|
||||
|
||||
if (!rm.rparts) {
|
||||
rm.rparts = make_foreign(std::make_unique<reader_meta::remote_parts>(class_config.semaphore));
|
||||
rm.rparts = make_foreign(std::make_unique<reader_meta::remote_parts>(semaphore()));
|
||||
}
|
||||
|
||||
rm.rparts->range = std::make_unique<const dht::partition_range>(pr);
|
||||
@@ -513,18 +512,28 @@ future<> read_context::lookup_readers() {
|
||||
}
|
||||
|
||||
return parallel_for_each(boost::irange(0u, smp::count), [this] (shard_id shard) {
|
||||
return _db.invoke_on(shard, [shard, cmd = &_cmd, ranges = &_ranges, gs = global_schema_ptr(_schema),
|
||||
return _db.invoke_on(shard, [this, shard, cmd = &_cmd, ranges = &_ranges, gs = global_schema_ptr(_schema),
|
||||
gts = tracing::global_trace_state_ptr(_trace_state)] (database& db) mutable {
|
||||
auto schema = gs.get();
|
||||
auto querier_opt = db.get_querier_cache().lookup_shard_mutation_querier(cmd->query_uuid, *schema, *ranges, cmd->slice, gts.get());
|
||||
auto& table = db.find_column_family(schema);
|
||||
auto& semaphore = db.make_query_class_config().semaphore;
|
||||
auto& semaphore = this->semaphore();
|
||||
|
||||
if (!querier_opt) {
|
||||
return reader_meta(reader_state::inexistent, reader_meta::remote_parts(semaphore));
|
||||
}
|
||||
|
||||
auto& q = *querier_opt;
|
||||
|
||||
if (&q.permit().semaphore() != &semaphore) {
|
||||
on_internal_error(mmq_log, format("looked-up reader belongs to different semaphore than the one appropriate for this query class: "
|
||||
"looked-up reader belongs to {} (0x{:x}) the query class appropriate is {} (0x{:x})",
|
||||
q.permit().semaphore().name(),
|
||||
reinterpret_cast<uintptr_t>(&q.permit().semaphore()),
|
||||
semaphore.name(),
|
||||
reinterpret_cast<uintptr_t>(&semaphore)));
|
||||
}
|
||||
|
||||
auto handle = pause(semaphore, std::move(q).reader());
|
||||
return reader_meta(
|
||||
reader_state::successful_lookup,
|
||||
|
||||
@@ -734,56 +734,78 @@ void write_counter_cell(RowWriter& w, const query::partition_slice& slice, ::ato
|
||||
});
|
||||
}
|
||||
|
||||
// Used to return the timestamp of the latest update to the row
|
||||
struct max_timestamp {
|
||||
api::timestamp_type max = api::missing_timestamp;
|
||||
|
||||
void update(api::timestamp_type ts) {
|
||||
max = std::max(max, ts);
|
||||
}
|
||||
};
|
||||
|
||||
template<>
|
||||
struct appending_hash<row> {
|
||||
template<typename Hasher>
|
||||
void operator()(Hasher& h, const row& cells, const schema& s, column_kind kind, const query::column_id_vector& columns, max_timestamp& max_ts) const {
|
||||
for (auto id : columns) {
|
||||
const cell_and_hash* cell_and_hash = cells.find_cell_and_hash(id);
|
||||
if (!cell_and_hash) {
|
||||
return;
|
||||
}
|
||||
auto&& def = s.column_at(kind, id);
|
||||
if (def.is_atomic()) {
|
||||
max_ts.update(cell_and_hash->cell.as_atomic_cell(def).timestamp());
|
||||
if constexpr (query::using_hash_of_hash_v<Hasher>) {
|
||||
if (cell_and_hash->hash) {
|
||||
feed_hash(h, *cell_and_hash->hash);
|
||||
} else {
|
||||
query::default_hasher cellh;
|
||||
feed_hash(cellh, cell_and_hash->cell.as_atomic_cell(def), def);
|
||||
feed_hash(h, cellh.finalize_uint64());
|
||||
}
|
||||
template<typename Hasher>
|
||||
void appending_hash<row>::operator()(Hasher& h, const row& cells, const schema& s, column_kind kind, const query::column_id_vector& columns, max_timestamp& max_ts) const {
|
||||
for (auto id : columns) {
|
||||
const cell_and_hash* cell_and_hash = cells.find_cell_and_hash(id);
|
||||
if (!cell_and_hash) {
|
||||
feed_hash(h, appending_hash<row>::null_hash_value);
|
||||
continue;
|
||||
}
|
||||
auto&& def = s.column_at(kind, id);
|
||||
if (def.is_atomic()) {
|
||||
max_ts.update(cell_and_hash->cell.as_atomic_cell(def).timestamp());
|
||||
if constexpr (query::using_hash_of_hash_v<Hasher>) {
|
||||
if (cell_and_hash->hash) {
|
||||
feed_hash(h, *cell_and_hash->hash);
|
||||
} else {
|
||||
feed_hash(h, cell_and_hash->cell.as_atomic_cell(def), def);
|
||||
query::default_hasher cellh;
|
||||
feed_hash(cellh, cell_and_hash->cell.as_atomic_cell(def), def);
|
||||
feed_hash(h, cellh.finalize_uint64());
|
||||
}
|
||||
} else {
|
||||
auto cm = cell_and_hash->cell.as_collection_mutation();
|
||||
max_ts.update(cm.last_update(*def.type));
|
||||
if constexpr (query::using_hash_of_hash_v<Hasher>) {
|
||||
if (cell_and_hash->hash) {
|
||||
feed_hash(h, *cell_and_hash->hash);
|
||||
} else {
|
||||
query::default_hasher cellh;
|
||||
feed_hash(cellh, cm, def);
|
||||
feed_hash(h, cellh.finalize_uint64());
|
||||
}
|
||||
feed_hash(h, cell_and_hash->cell.as_atomic_cell(def), def);
|
||||
}
|
||||
} else {
|
||||
auto cm = cell_and_hash->cell.as_collection_mutation();
|
||||
max_ts.update(cm.last_update(*def.type));
|
||||
if constexpr (query::using_hash_of_hash_v<Hasher>) {
|
||||
if (cell_and_hash->hash) {
|
||||
feed_hash(h, *cell_and_hash->hash);
|
||||
} else {
|
||||
feed_hash(h, cm, def);
|
||||
query::default_hasher cellh;
|
||||
feed_hash(cellh, cm, def);
|
||||
feed_hash(h, cellh.finalize_uint64());
|
||||
}
|
||||
} else {
|
||||
feed_hash(h, cm, def);
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
}
|
||||
// Instantiation for mutation_test.cc
|
||||
template void appending_hash<row>::operator()<xx_hasher>(xx_hasher& h, const row& cells, const schema& s, column_kind kind, const query::column_id_vector& columns, max_timestamp& max_ts) const;
|
||||
|
||||
template<>
|
||||
void appending_hash<row>::operator()<legacy_xx_hasher_without_null_digest>(legacy_xx_hasher_without_null_digest& h, const row& cells, const schema& s, column_kind kind, const query::column_id_vector& columns, max_timestamp& max_ts) const {
|
||||
for (auto id : columns) {
|
||||
const cell_and_hash* cell_and_hash = cells.find_cell_and_hash(id);
|
||||
if (!cell_and_hash) {
|
||||
return;
|
||||
}
|
||||
auto&& def = s.column_at(kind, id);
|
||||
if (def.is_atomic()) {
|
||||
max_ts.update(cell_and_hash->cell.as_atomic_cell(def).timestamp());
|
||||
if (cell_and_hash->hash) {
|
||||
feed_hash(h, *cell_and_hash->hash);
|
||||
} else {
|
||||
query::default_hasher cellh;
|
||||
feed_hash(cellh, cell_and_hash->cell.as_atomic_cell(def), def);
|
||||
feed_hash(h, cellh.finalize_uint64());
|
||||
}
|
||||
} else {
|
||||
auto cm = cell_and_hash->cell.as_collection_mutation();
|
||||
max_ts.update(cm.last_update(*def.type));
|
||||
if (cell_and_hash->hash) {
|
||||
feed_hash(h, *cell_and_hash->hash);
|
||||
} else {
|
||||
query::default_hasher cellh;
|
||||
feed_hash(cellh, cm, def);
|
||||
feed_hash(h, cellh.finalize_uint64());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
cell_hash_opt row::cell_hash_for(column_id id) const {
|
||||
if (_type == storage_type::vector) {
|
||||
@@ -1721,7 +1743,7 @@ void row::apply_monotonically(const schema& s, column_kind kind, row&& other) {
|
||||
// we erase the live cells according to the shadowable_tombstone rules.
|
||||
static bool dead_marker_shadows_row(const schema& s, column_kind kind, const row_marker& marker) {
|
||||
return s.is_view()
|
||||
&& !s.view_info()->base_non_pk_columns_in_view_pk().empty()
|
||||
&& s.view_info()->has_base_non_pk_columns_in_view_pk()
|
||||
&& !marker.is_live()
|
||||
&& kind == column_kind::regular_column; // not applicable to static rows
|
||||
}
|
||||
|
||||
@@ -649,6 +649,22 @@ public:
|
||||
};
|
||||
};
|
||||
|
||||
// Used to return the timestamp of the latest update to the row
|
||||
struct max_timestamp {
|
||||
api::timestamp_type max = api::missing_timestamp;
|
||||
|
||||
void update(api::timestamp_type ts) {
|
||||
max = std::max(max, ts);
|
||||
}
|
||||
};
|
||||
|
||||
template<>
|
||||
struct appending_hash<row> {
|
||||
static constexpr int null_hash_value = 0xbeefcafe;
|
||||
template<typename Hasher>
|
||||
void operator()(Hasher& h, const row& cells, const schema& s, column_kind kind, const query::column_id_vector& columns, max_timestamp& max_ts) const;
|
||||
};
|
||||
|
||||
class row_marker;
|
||||
int compare_row_marker_for_merge(const row_marker& left, const row_marker& right) noexcept;
|
||||
|
||||
|
||||
@@ -114,9 +114,6 @@ class reconcilable_result_builder {
|
||||
const schema& _schema;
|
||||
const query::partition_slice& _slice;
|
||||
|
||||
utils::chunked_vector<partition> _result;
|
||||
uint32_t _live_rows{};
|
||||
|
||||
bool _return_static_content_on_partition_with_no_rows{};
|
||||
bool _static_row_is_alive{};
|
||||
uint32_t _total_live_rows = 0;
|
||||
@@ -124,6 +121,10 @@ class reconcilable_result_builder {
|
||||
stop_iteration _stop;
|
||||
bool _short_read_allowed;
|
||||
std::optional<streamed_mutation_freezer> _mutation_consumer;
|
||||
|
||||
uint32_t _live_rows{};
|
||||
// make this the last member so it is destroyed first. #7240
|
||||
utils::chunked_vector<partition> _result;
|
||||
public:
|
||||
reconcilable_result_builder(const schema& s, const query::partition_slice& slice,
|
||||
query::result_memory_accounter&& accounter)
|
||||
|
||||
@@ -30,6 +30,7 @@
|
||||
#include "schema_registry.hh"
|
||||
#include "mutation_compactor.hh"
|
||||
|
||||
logging::logger mrlog("mutation_reader");
|
||||
|
||||
static constexpr size_t merger_small_vector_size = 4;
|
||||
|
||||
@@ -659,6 +660,8 @@ flat_mutation_reader make_combined_reader(schema_ptr schema,
|
||||
return make_combined_reader(std::move(schema), std::move(v), fwd_sm, fwd_mr);
|
||||
}
|
||||
|
||||
const ssize_t new_reader_base_cost{16 * 1024};
|
||||
|
||||
class restricting_mutation_reader : public flat_mutation_reader::impl {
|
||||
struct mutation_source_and_params {
|
||||
mutation_source _ms;
|
||||
@@ -685,8 +688,6 @@ class restricting_mutation_reader : public flat_mutation_reader::impl {
|
||||
};
|
||||
std::variant<pending_state, admitted_state> _state;
|
||||
|
||||
static const ssize_t new_reader_base_cost{16 * 1024};
|
||||
|
||||
template<typename Function>
|
||||
requires std::is_move_constructible<Function>::value
|
||||
&& requires(Function fn, flat_mutation_reader& reader) {
|
||||
@@ -1026,6 +1027,13 @@ private:
|
||||
bool _reader_created = false;
|
||||
bool _drop_partition_start = false;
|
||||
bool _drop_static_row = false;
|
||||
// Trim range tombstones on the start of the buffer to the start of the read
|
||||
// range (_next_position_in_partition). Set after reader recreation.
|
||||
// Also validate the first not-trimmed mutation fragment's position.
|
||||
bool _trim_range_tombstones = false;
|
||||
// Validate the partition key of the first emitted partition, set after the
|
||||
// reader was recreated.
|
||||
bool _validate_partition_key = false;
|
||||
position_in_partition::tri_compare _tri_cmp;
|
||||
|
||||
std::optional<dht::decorated_key> _last_pkey;
|
||||
@@ -1047,7 +1055,10 @@ private:
|
||||
void adjust_partition_slice();
|
||||
flat_mutation_reader recreate_reader();
|
||||
flat_mutation_reader resume_or_create_reader();
|
||||
void maybe_validate_partition_start(const circular_buffer<mutation_fragment>& buffer);
|
||||
void validate_position_in_partition(position_in_partition_view pos) const;
|
||||
bool should_drop_fragment(const mutation_fragment& mf);
|
||||
bool maybe_trim_range_tombstone(mutation_fragment& mf) const;
|
||||
future<> do_fill_buffer(flat_mutation_reader& reader, db::timeout_clock::time_point timeout);
|
||||
future<> fill_buffer(flat_mutation_reader& reader, db::timeout_clock::time_point timeout);
|
||||
|
||||
@@ -1120,16 +1131,11 @@ void evictable_reader::update_next_position(flat_mutation_reader& reader) {
|
||||
_next_position_in_partition = position_in_partition::before_all_clustered_rows();
|
||||
break;
|
||||
case partition_region::clustered:
|
||||
if (reader.is_buffer_empty()) {
|
||||
_next_position_in_partition = position_in_partition::after_key(last_pos);
|
||||
} else {
|
||||
const auto& next_frag = reader.peek_buffer();
|
||||
if (next_frag.is_end_of_partition()) {
|
||||
if (!reader.is_buffer_empty() && reader.peek_buffer().is_end_of_partition()) {
|
||||
push_mutation_fragment(reader.pop_mutation_fragment());
|
||||
_next_position_in_partition = position_in_partition::for_partition_start();
|
||||
} else {
|
||||
_next_position_in_partition = position_in_partition(next_frag.position());
|
||||
}
|
||||
} else {
|
||||
_next_position_in_partition = position_in_partition::after_key(last_pos);
|
||||
}
|
||||
break;
|
||||
case partition_region::partition_end:
|
||||
@@ -1154,6 +1160,9 @@ flat_mutation_reader evictable_reader::recreate_reader() {
|
||||
const dht::partition_range* range = _pr;
|
||||
const query::partition_slice* slice = &_ps;
|
||||
|
||||
_range_override.reset();
|
||||
_slice_override.reset();
|
||||
|
||||
if (_last_pkey) {
|
||||
bool partition_range_is_inclusive = true;
|
||||
|
||||
@@ -1190,6 +1199,9 @@ flat_mutation_reader evictable_reader::recreate_reader() {
|
||||
range = &*_range_override;
|
||||
}
|
||||
|
||||
_trim_range_tombstones = true;
|
||||
_validate_partition_key = true;
|
||||
|
||||
return _ms.make_reader(
|
||||
_schema,
|
||||
_permit,
|
||||
@@ -1216,6 +1228,78 @@ flat_mutation_reader evictable_reader::resume_or_create_reader() {
|
||||
return recreate_reader();
|
||||
}
|
||||
|
||||
template <typename... Arg>
|
||||
static void require(bool condition, const char* msg, const Arg&... arg) {
|
||||
if (!condition) {
|
||||
on_internal_error(mrlog, format(msg, arg...));
|
||||
}
|
||||
}
|
||||
|
||||
void evictable_reader::maybe_validate_partition_start(const circular_buffer<mutation_fragment>& buffer) {
|
||||
if (!_validate_partition_key || buffer.empty()) {
|
||||
return;
|
||||
}
|
||||
|
||||
// If this is set we can assume the first fragment is a partition-start.
|
||||
const auto& ps = buffer.front().as_partition_start();
|
||||
const auto tri_cmp = dht::ring_position_comparator(*_schema);
|
||||
// If we recreated the reader after fast-forwarding it we won't have
|
||||
// _last_pkey set. In this case it is enough to check if the partition
|
||||
// is in range.
|
||||
if (_last_pkey) {
|
||||
const auto cmp_res = tri_cmp(*_last_pkey, ps.key());
|
||||
if (_drop_partition_start) { // should be the same partition
|
||||
require(
|
||||
cmp_res == 0,
|
||||
"{}(): validation failed, expected partition with key equal to _last_pkey {} due to _drop_partition_start being set, but got {}",
|
||||
__FUNCTION__,
|
||||
*_last_pkey,
|
||||
ps.key());
|
||||
} else { // should be a larger partition
|
||||
require(
|
||||
cmp_res < 0,
|
||||
"{}(): validation failed, expected partition with key larger than _last_pkey {} due to _drop_partition_start being unset, but got {}",
|
||||
__FUNCTION__,
|
||||
*_last_pkey,
|
||||
ps.key());
|
||||
}
|
||||
}
|
||||
const auto& prange = _range_override ? *_range_override : *_pr;
|
||||
require(
|
||||
// TODO: somehow avoid this copy
|
||||
prange.contains(ps.key(), tri_cmp),
|
||||
"{}(): validation failed, expected partition with key that falls into current range {}, but got {}",
|
||||
__FUNCTION__,
|
||||
prange,
|
||||
ps.key());
|
||||
|
||||
_validate_partition_key = false;
|
||||
}
|
||||
|
||||
void evictable_reader::validate_position_in_partition(position_in_partition_view pos) const {
|
||||
require(
|
||||
_tri_cmp(_next_position_in_partition, pos) <= 0,
|
||||
"{}(): validation failed, expected position in partition that is larger-than-equal than _next_position_in_partition {}, but got {}",
|
||||
__FUNCTION__,
|
||||
_next_position_in_partition,
|
||||
pos);
|
||||
|
||||
if (_slice_override && pos.region() == partition_region::clustered) {
|
||||
const auto ranges = _slice_override->row_ranges(*_schema, _last_pkey->key());
|
||||
const bool any_contains = std::any_of(ranges.begin(), ranges.end(), [this, &pos] (const query::clustering_range& cr) {
|
||||
// TODO: somehow avoid this copy
|
||||
auto range = position_range(cr);
|
||||
return range.contains(*_schema, pos);
|
||||
});
|
||||
require(
|
||||
any_contains,
|
||||
"{}(): validation failed, expected clustering fragment that is included in the slice {}, but got {}",
|
||||
__FUNCTION__,
|
||||
*_slice_override,
|
||||
pos);
|
||||
}
|
||||
}
|
||||
|
||||
bool evictable_reader::should_drop_fragment(const mutation_fragment& mf) {
|
||||
if (_drop_partition_start && mf.is_partition_start()) {
|
||||
_drop_partition_start = false;
|
||||
@@ -1228,12 +1312,50 @@ bool evictable_reader::should_drop_fragment(const mutation_fragment& mf) {
|
||||
return false;
|
||||
}
|
||||
|
||||
bool evictable_reader::maybe_trim_range_tombstone(mutation_fragment& mf) const {
|
||||
// We either didn't read a partition yet (evicted after fast-forwarding) or
|
||||
// didn't stop in a clustering region. We don't need to trim range
|
||||
// tombstones in either case.
|
||||
if (!_last_pkey || _next_position_in_partition.region() != partition_region::clustered) {
|
||||
return false;
|
||||
}
|
||||
if (!mf.is_range_tombstone()) {
|
||||
validate_position_in_partition(mf.position());
|
||||
return false;
|
||||
}
|
||||
|
||||
if (_tri_cmp(mf.position(), _next_position_in_partition) >= 0) {
|
||||
validate_position_in_partition(mf.position());
|
||||
return false; // rt in range, no need to trim
|
||||
}
|
||||
|
||||
auto& rt = mf.as_mutable_range_tombstone();
|
||||
|
||||
require(
|
||||
_tri_cmp(_next_position_in_partition, rt.end_position()) <= 0,
|
||||
"{}(): validation failed, expected range tombstone with end pos larger than _next_position_in_partition {}, but got {}",
|
||||
__FUNCTION__,
|
||||
_next_position_in_partition,
|
||||
rt.end_position());
|
||||
|
||||
rt.set_start(*_schema, position_in_partition_view::before_key(_next_position_in_partition));
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
future<> evictable_reader::do_fill_buffer(flat_mutation_reader& reader, db::timeout_clock::time_point timeout) {
|
||||
if (!_drop_partition_start && !_drop_static_row) {
|
||||
return reader.fill_buffer(timeout);
|
||||
auto fill_buf_fut = reader.fill_buffer(timeout);
|
||||
if (_validate_partition_key) {
|
||||
fill_buf_fut = fill_buf_fut.then([this, &reader] {
|
||||
maybe_validate_partition_start(reader.buffer());
|
||||
});
|
||||
}
|
||||
return fill_buf_fut;
|
||||
}
|
||||
return repeat([this, &reader, timeout] {
|
||||
return reader.fill_buffer(timeout).then([this, &reader] {
|
||||
maybe_validate_partition_start(reader.buffer());
|
||||
while (!reader.is_buffer_empty() && should_drop_fragment(reader.peek_buffer())) {
|
||||
reader.pop_mutation_fragment();
|
||||
}
|
||||
@@ -1247,6 +1369,11 @@ future<> evictable_reader::fill_buffer(flat_mutation_reader& reader, db::timeout
|
||||
if (reader.is_buffer_empty()) {
|
||||
return make_ready_future<>();
|
||||
}
|
||||
while (_trim_range_tombstones && !reader.is_buffer_empty()) {
|
||||
auto mf = reader.pop_mutation_fragment();
|
||||
_trim_range_tombstones = maybe_trim_range_tombstone(mf);
|
||||
push_mutation_fragment(std::move(mf));
|
||||
}
|
||||
reader.move_buffer_content_to(*this);
|
||||
auto stop = [this, &reader] {
|
||||
// The only problematic fragment kind is the range tombstone.
|
||||
@@ -1287,7 +1414,13 @@ future<> evictable_reader::fill_buffer(flat_mutation_reader& reader, db::timeout
|
||||
if (reader.is_buffer_empty()) {
|
||||
return do_fill_buffer(reader, timeout);
|
||||
}
|
||||
push_mutation_fragment(reader.pop_mutation_fragment());
|
||||
if (_trim_range_tombstones) {
|
||||
auto mf = reader.pop_mutation_fragment();
|
||||
_trim_range_tombstones = maybe_trim_range_tombstone(mf);
|
||||
push_mutation_fragment(std::move(mf));
|
||||
} else {
|
||||
push_mutation_fragment(reader.pop_mutation_fragment());
|
||||
}
|
||||
return make_ready_future<>();
|
||||
});
|
||||
}).then([this, &reader] {
|
||||
|
||||
@@ -304,6 +304,8 @@ public:
|
||||
mutation_source make_empty_mutation_source();
|
||||
snapshot_source make_empty_snapshot_source();
|
||||
|
||||
extern const ssize_t new_reader_base_cost;
|
||||
|
||||
// Creates a restricted reader whose resource usages will be tracked
|
||||
// during it's lifetime. If there are not enough resources (dues to
|
||||
// existing readers) to create the new reader, it's construction will
|
||||
|
||||
@@ -163,6 +163,11 @@ public:
|
||||
return {partition_region::clustered, bound_weight::before_all_prefixed, &ck};
|
||||
}
|
||||
|
||||
// Returns a view to before_key(pos._ck) if pos.is_clustering_row() else returns pos as-is.
|
||||
static position_in_partition_view before_key(position_in_partition_view pos) {
|
||||
return {partition_region::clustered, pos._bound_weight == bound_weight::equal ? bound_weight::before_all_prefixed : pos._bound_weight, pos._ck};
|
||||
}
|
||||
|
||||
partition_region region() const { return _type; }
|
||||
bound_weight get_bound_weight() const { return _bound_weight; }
|
||||
bool is_partition_start() const { return _type == partition_region::partition_start; }
|
||||
|
||||
@@ -28,6 +28,7 @@
|
||||
|
||||
reader_permit::resource_units::resource_units(reader_concurrency_semaphore& semaphore, reader_resources res) noexcept
|
||||
: _semaphore(&semaphore), _resources(res) {
|
||||
_semaphore->consume(res);
|
||||
}
|
||||
|
||||
reader_permit::resource_units::resource_units(resource_units&& o) noexcept
|
||||
@@ -75,7 +76,6 @@ reader_permit::resource_units reader_permit::consume_memory(size_t memory) {
|
||||
}
|
||||
|
||||
reader_permit::resource_units reader_permit::consume_resources(reader_resources res) {
|
||||
_semaphore->consume(res);
|
||||
return resource_units(*_semaphore, res);
|
||||
}
|
||||
|
||||
@@ -83,7 +83,6 @@ void reader_concurrency_semaphore::signal(const resources& r) noexcept {
|
||||
_resources += r;
|
||||
while (!_wait_list.empty() && has_available_units(_wait_list.front().res)) {
|
||||
auto& x = _wait_list.front();
|
||||
_resources -= x.res;
|
||||
try {
|
||||
x.pr.set_value(reader_permit::resource_units(*this, x.res));
|
||||
} catch (...) {
|
||||
@@ -104,7 +103,7 @@ reader_concurrency_semaphore::inactive_read_handle reader_concurrency_semaphore:
|
||||
const auto [it, _] = _inactive_reads.emplace(_next_id++, std::move(ir));
|
||||
(void)_;
|
||||
++_inactive_read_stats.population;
|
||||
return inactive_read_handle(it->first);
|
||||
return inactive_read_handle(*this, it->first);
|
||||
}
|
||||
|
||||
// The evicted reader will release its permit, hopefully allowing us to
|
||||
@@ -115,6 +114,17 @@ reader_concurrency_semaphore::inactive_read_handle reader_concurrency_semaphore:
|
||||
}
|
||||
|
||||
std::unique_ptr<reader_concurrency_semaphore::inactive_read> reader_concurrency_semaphore::unregister_inactive_read(inactive_read_handle irh) {
|
||||
if (irh && irh._sem != this) {
|
||||
throw std::runtime_error(fmt::format(
|
||||
"reader_concurrency_semaphore::unregister_inactive_read(): "
|
||||
"attempted to unregister an inactive read with a handle belonging to another semaphore: "
|
||||
"this is {} (0x{:x}) but the handle belongs to {} (0x{:x})",
|
||||
name(),
|
||||
reinterpret_cast<uintptr_t>(this),
|
||||
irh._sem->name(),
|
||||
reinterpret_cast<uintptr_t>(irh._sem)));
|
||||
}
|
||||
|
||||
if (auto it = _inactive_reads.find(irh._id); it != _inactive_reads.end()) {
|
||||
auto ir = std::move(it->second);
|
||||
_inactive_reads.erase(it);
|
||||
@@ -158,7 +168,6 @@ future<reader_permit::resource_units> reader_concurrency_semaphore::do_wait_admi
|
||||
--_inactive_read_stats.population;
|
||||
}
|
||||
if (may_proceed(r)) {
|
||||
_resources -= r;
|
||||
return make_ready_future<reader_permit::resource_units>(reader_permit::resource_units(*this, r));
|
||||
}
|
||||
promise<reader_permit::resource_units> pr;
|
||||
|
||||
@@ -60,18 +60,20 @@ public:
|
||||
};
|
||||
|
||||
class inactive_read_handle {
|
||||
reader_concurrency_semaphore* _sem = nullptr;
|
||||
uint64_t _id = 0;
|
||||
|
||||
friend class reader_concurrency_semaphore;
|
||||
|
||||
explicit inactive_read_handle(uint64_t id)
|
||||
: _id(id) {
|
||||
explicit inactive_read_handle(reader_concurrency_semaphore& sem, uint64_t id)
|
||||
: _sem(&sem), _id(id) {
|
||||
}
|
||||
public:
|
||||
inactive_read_handle() = default;
|
||||
inactive_read_handle(inactive_read_handle&& o) : _id(std::exchange(o._id, 0)) {
|
||||
inactive_read_handle(inactive_read_handle&& o) : _sem(std::exchange(o._sem, nullptr)), _id(std::exchange(o._id, 0)) {
|
||||
}
|
||||
inactive_read_handle& operator=(inactive_read_handle&& o) {
|
||||
_sem = std::exchange(o._sem, nullptr);
|
||||
_id = std::exchange(o._id, 0);
|
||||
return *this;
|
||||
}
|
||||
@@ -105,6 +107,7 @@ private:
|
||||
};
|
||||
|
||||
private:
|
||||
const resources _initial_resources;
|
||||
resources _resources;
|
||||
|
||||
expiring_fifo<entry, expiry_handler, db::timeout_clock> _wait_list;
|
||||
@@ -135,7 +138,8 @@ public:
|
||||
sstring name,
|
||||
size_t max_queue_length = std::numeric_limits<size_t>::max(),
|
||||
std::function<void()> prethrow_action = nullptr)
|
||||
: _resources(count, memory)
|
||||
: _initial_resources(count, memory)
|
||||
, _resources(count, memory)
|
||||
, _wait_list(expiry_handler(name))
|
||||
, _name(std::move(name))
|
||||
, _max_queue_length(max_queue_length)
|
||||
@@ -144,11 +148,11 @@ public:
|
||||
/// Create a semaphore with practically unlimited count and memory.
|
||||
///
|
||||
/// And conversely, no queue limit either.
|
||||
explicit reader_concurrency_semaphore(no_limits)
|
||||
explicit reader_concurrency_semaphore(no_limits, sstring name = "unlimited reader_concurrency_semaphore")
|
||||
: reader_concurrency_semaphore(
|
||||
std::numeric_limits<int>::max(),
|
||||
std::numeric_limits<ssize_t>::max(),
|
||||
"unlimited reader_concurrency_semaphore") {}
|
||||
std::move(name)) {}
|
||||
|
||||
~reader_concurrency_semaphore();
|
||||
|
||||
@@ -158,6 +162,13 @@ public:
|
||||
reader_concurrency_semaphore(reader_concurrency_semaphore&&) = delete;
|
||||
reader_concurrency_semaphore& operator=(reader_concurrency_semaphore&&) = delete;
|
||||
|
||||
/// Returns the name of the semaphore
|
||||
///
|
||||
/// If the semaphore has no name, "unnamed reader concurrency semaphore" is returned.
|
||||
std::string_view name() const {
|
||||
return _name.empty() ? "unnamed reader concurrency semaphore" : std::string_view(_name);
|
||||
}
|
||||
|
||||
/// Register an inactive read.
|
||||
///
|
||||
/// The semaphore will evict this read when there is a shortage of
|
||||
@@ -193,6 +204,10 @@ public:
|
||||
|
||||
reader_permit make_permit();
|
||||
|
||||
const resources initial_resources() const {
|
||||
return _initial_resources;
|
||||
}
|
||||
|
||||
const resources available_resources() const {
|
||||
return _resources;
|
||||
}
|
||||
|
||||
@@ -42,12 +42,20 @@ struct reader_resources {
|
||||
return count >= other.count && memory >= other.memory;
|
||||
}
|
||||
|
||||
reader_resources operator-(const reader_resources& other) const {
|
||||
return reader_resources{count - other.count, memory - other.memory};
|
||||
}
|
||||
|
||||
reader_resources& operator-=(const reader_resources& other) {
|
||||
count -= other.count;
|
||||
memory -= other.memory;
|
||||
return *this;
|
||||
}
|
||||
|
||||
reader_resources operator+(const reader_resources& other) const {
|
||||
return reader_resources{count + other.count, memory + other.memory};
|
||||
}
|
||||
|
||||
reader_resources& operator+=(const reader_resources& other) {
|
||||
count += other.count;
|
||||
memory += other.memory;
|
||||
|
||||
@@ -62,7 +62,7 @@ shared_ptr<abstract_command> exists::prepare(service::storage_proxy& proxy, requ
|
||||
}
|
||||
|
||||
future<redis_message> exists::execute(service::storage_proxy& proxy, redis::redis_options& options, service_permit permit) {
|
||||
return seastar::do_for_each(_keys, [&proxy, &options, &permit, this] (bytes key) {
|
||||
return seastar::do_for_each(_keys, [&proxy, &options, permit, this] (bytes& key) {
|
||||
return redis::read_strings(proxy, options, key, permit).then([this] (lw_shared_ptr<strings_result> result) {
|
||||
if (result->has_result()) {
|
||||
_count++;
|
||||
|
||||
@@ -44,15 +44,15 @@ mkdir -p $BUILDDIR/scylla-package
|
||||
tar -C $BUILDDIR/scylla-package -xpf $RELOC_PKG
|
||||
cd $BUILDDIR/scylla-package
|
||||
|
||||
PRODUCT=$(cat scylla/SCYLLA-PRODUCT-FILE)
|
||||
SCYLLA_VERSION=$(cat scylla/SCYLLA-VERSION-FILE)
|
||||
SCYLLA_RELEASE=$(cat scylla/SCYLLA-RELEASE-FILE)
|
||||
|
||||
ln -fv $RELOC_PKG ../$PRODUCT-server_$SCYLLA_VERSION-$SCYLLA_RELEASE.orig.tar.gz
|
||||
|
||||
if $DIST; then
|
||||
export DEB_BUILD_OPTIONS="housekeeping"
|
||||
fi
|
||||
|
||||
mv scylla/debian debian
|
||||
|
||||
PKG_NAME=$(dpkg-parsechangelog --show-field Source)
|
||||
# XXX: Drop revision number from version string.
|
||||
# Since it always '1', this should be okay for now.
|
||||
PKG_VERSION=$(dpkg-parsechangelog --show-field Version |sed -e 's/-1$//')
|
||||
ln -fv $RELOC_PKG ../"$PKG_NAME"_"$PKG_VERSION".orig.tar.gz
|
||||
debuild -rfakeroot -us -uc
|
||||
|
||||
@@ -1633,6 +1633,7 @@ future<> bootstrap_with_repair(seastar::sharded<database>& db, locator::token_me
|
||||
auto& ks = db.local().find_keyspace(keyspace_name);
|
||||
auto& strat = ks.get_replication_strategy();
|
||||
dht::token_range_vector desired_ranges = strat.get_pending_address_ranges(tm, tokens, myip);
|
||||
bool find_node_in_local_dc_only = strat.get_type() == locator::replication_strategy_type::network_topology;
|
||||
|
||||
//Active ranges
|
||||
auto metadata_clone = tm.clone_only_token_map();
|
||||
@@ -1719,6 +1720,9 @@ future<> bootstrap_with_repair(seastar::sharded<database>& db, locator::token_me
|
||||
mandatory_neighbors = get_node_losing_the_ranges(old_endpoints, new_endpoints);
|
||||
neighbors = mandatory_neighbors;
|
||||
} else if (old_endpoints.size() < strat.get_replication_factor()) {
|
||||
if (!find_node_in_local_dc_only) {
|
||||
neighbors = old_endpoints;
|
||||
} else {
|
||||
if (old_endpoints_in_local_dc.size() == rf_in_local_dc) {
|
||||
// Local DC has enough replica nodes.
|
||||
mandatory_neighbors = get_node_losing_the_ranges(old_endpoints_in_local_dc, new_endpoints);
|
||||
@@ -1746,6 +1750,7 @@ future<> bootstrap_with_repair(seastar::sharded<database>& db, locator::token_me
|
||||
throw std::runtime_error(format("bootstrap_with_repair: keyspace={}, range={}, wrong number of old_endpoints_in_local_dc={}, rf_in_local_dc={}",
|
||||
keyspace_name, desired_range, old_endpoints_in_local_dc.size(), rf_in_local_dc));
|
||||
}
|
||||
}
|
||||
} else {
|
||||
throw std::runtime_error(format("bootstrap_with_repair: keyspace={}, range={}, wrong number of old_endpoints={}, rf={}",
|
||||
keyspace_name, desired_range, old_endpoints, strat.get_replication_factor()));
|
||||
|
||||
@@ -23,6 +23,7 @@
|
||||
|
||||
#include <unordered_map>
|
||||
#include <exception>
|
||||
#include <absl/container/btree_set.h>
|
||||
|
||||
#include <seastar/core/sstring.hh>
|
||||
#include <seastar/core/sharded.hh>
|
||||
@@ -339,6 +340,8 @@ public:
|
||||
}
|
||||
};
|
||||
|
||||
using repair_hash_set = absl::btree_set<repair_hash>;
|
||||
|
||||
enum class repair_row_level_start_status: uint8_t {
|
||||
ok,
|
||||
no_such_column_family,
|
||||
|
||||
@@ -47,6 +47,7 @@
|
||||
#include "gms/gossiper.hh"
|
||||
#include "repair/row_level.hh"
|
||||
#include "mutation_source_metadata.hh"
|
||||
#include "utils/stall_free.hh"
|
||||
|
||||
extern logging::logger rlogger;
|
||||
|
||||
@@ -529,7 +530,7 @@ public:
|
||||
sstables::shared_sstable sst = use_view_update_path ? t->make_streaming_staging_sstable() : t->make_streaming_sstable_for_write();
|
||||
schema_ptr s = reader.schema();
|
||||
auto& pc = service::get_local_streaming_priority();
|
||||
return sst->write_components(std::move(reader), std::max(1ul, adjusted_estimated_partitions), s,
|
||||
return sst->write_components(std::move(reader), adjusted_estimated_partitions, s,
|
||||
t->get_sstables_manager().configure_writer(),
|
||||
encoding_stats{}, pc).then([sst] {
|
||||
return sst->open_data();
|
||||
@@ -666,7 +667,7 @@ private:
|
||||
// Tracks current sync boundary
|
||||
std::optional<repair_sync_boundary> _current_sync_boundary;
|
||||
// Contains the hashes of rows in the _working_row_buffor for all peer nodes
|
||||
std::vector<std::unordered_set<repair_hash>> _peer_row_hash_sets;
|
||||
std::vector<repair_hash_set> _peer_row_hash_sets;
|
||||
// Gate used to make sure pending operation of meta data is done
|
||||
seastar::gate _gate;
|
||||
sink_source_for_get_full_row_hashes _sink_source_for_get_full_row_hashes;
|
||||
@@ -754,11 +755,12 @@ public:
|
||||
public:
|
||||
future<> stop() {
|
||||
auto gate_future = _gate.close();
|
||||
auto writer_future = _repair_writer.wait_for_writer_done();
|
||||
auto f1 = _sink_source_for_get_full_row_hashes.close();
|
||||
auto f2 = _sink_source_for_get_row_diff.close();
|
||||
auto f3 = _sink_source_for_put_row_diff.close();
|
||||
return when_all_succeed(std::move(gate_future), std::move(writer_future), std::move(f1), std::move(f2), std::move(f3)).discard_result();
|
||||
return when_all_succeed(std::move(gate_future), std::move(f1), std::move(f2), std::move(f3)).discard_result().finally([this] {
|
||||
return _repair_writer.wait_for_writer_done();
|
||||
});
|
||||
}
|
||||
|
||||
static std::unordered_map<node_repair_meta_id, lw_shared_ptr<repair_meta>>& repair_meta_map() {
|
||||
@@ -886,9 +888,9 @@ public:
|
||||
}
|
||||
|
||||
// Must run inside a seastar thread
|
||||
static std::unordered_set<repair_hash>
|
||||
get_set_diff(const std::unordered_set<repair_hash>& x, const std::unordered_set<repair_hash>& y) {
|
||||
std::unordered_set<repair_hash> set_diff;
|
||||
static repair_hash_set
|
||||
get_set_diff(const repair_hash_set& x, const repair_hash_set& y) {
|
||||
repair_hash_set set_diff;
|
||||
// Note std::set_difference needs x and y are sorted.
|
||||
std::copy_if(x.begin(), x.end(), std::inserter(set_diff, set_diff.end()),
|
||||
[&y] (auto& item) { thread::maybe_yield(); return y.find(item) == y.end(); });
|
||||
@@ -906,14 +908,14 @@ public:
|
||||
|
||||
}
|
||||
|
||||
std::unordered_set<repair_hash>& peer_row_hash_sets(unsigned node_idx) {
|
||||
repair_hash_set& peer_row_hash_sets(unsigned node_idx) {
|
||||
return _peer_row_hash_sets[node_idx];
|
||||
}
|
||||
|
||||
// Get a list of row hashes in _working_row_buf
|
||||
future<std::unordered_set<repair_hash>>
|
||||
future<repair_hash_set>
|
||||
working_row_hashes() {
|
||||
return do_with(std::unordered_set<repair_hash>(), [this] (std::unordered_set<repair_hash>& hashes) {
|
||||
return do_with(repair_hash_set(), [this] (repair_hash_set& hashes) {
|
||||
return do_for_each(_working_row_buf, [&hashes] (repair_row& r) {
|
||||
hashes.emplace(r.hash());
|
||||
}).then([&hashes] {
|
||||
@@ -1090,24 +1092,32 @@ private:
|
||||
});
|
||||
}
|
||||
|
||||
future<> clear_row_buf() {
|
||||
return utils::clear_gently(_row_buf);
|
||||
}
|
||||
|
||||
future<> clear_working_row_buf() {
|
||||
return utils::clear_gently(_working_row_buf).then([this] {
|
||||
_working_row_buf_combined_hash.clear();
|
||||
});
|
||||
}
|
||||
|
||||
// Read rows from disk until _max_row_buf_size of rows are filled into _row_buf.
|
||||
// Calculate the combined checksum of the rows
|
||||
// Calculate the total size of the rows in _row_buf
|
||||
future<get_sync_boundary_response>
|
||||
get_sync_boundary(std::optional<repair_sync_boundary> skipped_sync_boundary) {
|
||||
auto f = make_ready_future<>();
|
||||
if (skipped_sync_boundary) {
|
||||
_current_sync_boundary = skipped_sync_boundary;
|
||||
_row_buf.clear();
|
||||
_working_row_buf.clear();
|
||||
_working_row_buf_combined_hash.clear();
|
||||
} else {
|
||||
_working_row_buf.clear();
|
||||
_working_row_buf_combined_hash.clear();
|
||||
f = clear_row_buf();
|
||||
}
|
||||
// Here is the place we update _last_sync_boundary
|
||||
rlogger.trace("SET _last_sync_boundary from {} to {}", _last_sync_boundary, _current_sync_boundary);
|
||||
_last_sync_boundary = _current_sync_boundary;
|
||||
return row_buf_size().then([this, sb = std::move(skipped_sync_boundary)] (size_t cur_size) {
|
||||
return f.then([this, sb = std::move(skipped_sync_boundary)] () mutable {
|
||||
return clear_working_row_buf().then([this, sb = sb] () mutable {
|
||||
return row_buf_size().then([this, sb = std::move(sb)] (size_t cur_size) {
|
||||
return read_rows_from_disk(cur_size).then([this, sb = std::move(sb)] (std::list<repair_row> new_rows, size_t new_rows_size) mutable {
|
||||
size_t new_rows_nr = new_rows.size();
|
||||
_row_buf.splice(_row_buf.end(), new_rows);
|
||||
@@ -1124,6 +1134,8 @@ private:
|
||||
});
|
||||
});
|
||||
});
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
future<> move_row_buf_to_working_row_buf() {
|
||||
@@ -1199,9 +1211,9 @@ private:
|
||||
}
|
||||
|
||||
future<std::list<repair_row>>
|
||||
copy_rows_from_working_row_buf_within_set_diff(std::unordered_set<repair_hash> set_diff) {
|
||||
copy_rows_from_working_row_buf_within_set_diff(repair_hash_set set_diff) {
|
||||
return do_with(std::list<repair_row>(), std::move(set_diff),
|
||||
[this] (std::list<repair_row>& rows, std::unordered_set<repair_hash>& set_diff) {
|
||||
[this] (std::list<repair_row>& rows, repair_hash_set& set_diff) {
|
||||
return do_for_each(_working_row_buf, [this, &set_diff, &rows] (const repair_row& r) {
|
||||
if (set_diff.count(r.hash()) > 0) {
|
||||
rows.push_back(r);
|
||||
@@ -1216,7 +1228,7 @@ private:
|
||||
// Give a set of row hashes, return the corresponding rows
|
||||
// If needs_all_rows is set, return all the rows in _working_row_buf, ignore the set_diff
|
||||
future<std::list<repair_row>>
|
||||
get_row_diff(std::unordered_set<repair_hash> set_diff, needs_all_rows_t needs_all_rows = needs_all_rows_t::no) {
|
||||
get_row_diff(repair_hash_set set_diff, needs_all_rows_t needs_all_rows = needs_all_rows_t::no) {
|
||||
if (needs_all_rows) {
|
||||
if (!_repair_master || _nr_peer_nodes == 1) {
|
||||
return make_ready_future<std::list<repair_row>>(std::move(_working_row_buf));
|
||||
@@ -1227,19 +1239,28 @@ private:
|
||||
}
|
||||
}
|
||||
|
||||
future<> do_apply_rows(std::list<repair_row>& row_diff, unsigned node_idx, update_working_row_buf update_buf) {
|
||||
return with_semaphore(_repair_writer.sem(), 1, [this, node_idx, update_buf, &row_diff] {
|
||||
_repair_writer.create_writer(_db, node_idx);
|
||||
return do_for_each(row_diff, [this, node_idx, update_buf] (repair_row& r) {
|
||||
if (update_buf) {
|
||||
_working_row_buf_combined_hash.add(r.hash());
|
||||
}
|
||||
// The repair_row here is supposed to have
|
||||
// mutation_fragment attached because we have stored it in
|
||||
// to_repair_rows_list above where the repair_row is created.
|
||||
mutation_fragment mf = std::move(r.get_mutation_fragment());
|
||||
auto dk_with_hash = r.get_dk_with_hash();
|
||||
return _repair_writer.do_write(node_idx, std::move(dk_with_hash), std::move(mf));
|
||||
future<> do_apply_rows(std::list<repair_row>&& row_diff, unsigned node_idx, update_working_row_buf update_buf) {
|
||||
return do_with(std::move(row_diff), [this, node_idx, update_buf] (std::list<repair_row>& row_diff) {
|
||||
return with_semaphore(_repair_writer.sem(), 1, [this, node_idx, update_buf, &row_diff] {
|
||||
_repair_writer.create_writer(_db, node_idx);
|
||||
return repeat([this, node_idx, update_buf, &row_diff] () mutable {
|
||||
if (row_diff.empty()) {
|
||||
return make_ready_future<stop_iteration>(stop_iteration::yes);
|
||||
}
|
||||
repair_row& r = row_diff.front();
|
||||
if (update_buf) {
|
||||
_working_row_buf_combined_hash.add(r.hash());
|
||||
}
|
||||
// The repair_row here is supposed to have
|
||||
// mutation_fragment attached because we have stored it in
|
||||
// to_repair_rows_list above where the repair_row is created.
|
||||
mutation_fragment mf = std::move(r.get_mutation_fragment());
|
||||
auto dk_with_hash = r.get_dk_with_hash();
|
||||
return _repair_writer.do_write(node_idx, std::move(dk_with_hash), std::move(mf)).then([&row_diff] {
|
||||
row_diff.pop_front();
|
||||
return make_ready_future<stop_iteration>(stop_iteration::no);
|
||||
});
|
||||
});
|
||||
});
|
||||
});
|
||||
}
|
||||
@@ -1257,19 +1278,17 @@ private:
|
||||
stats().rx_row_nr += row_diff.size();
|
||||
stats().rx_row_nr_peer[from] += row_diff.size();
|
||||
if (update_buf) {
|
||||
std::list<repair_row> tmp;
|
||||
tmp.swap(_working_row_buf);
|
||||
// Both row_diff and _working_row_buf and are ordered, merging
|
||||
// two sored list to make sure the combination of row_diff
|
||||
// and _working_row_buf are ordered.
|
||||
std::merge(tmp.begin(), tmp.end(), row_diff.begin(), row_diff.end(), std::back_inserter(_working_row_buf),
|
||||
[this] (const repair_row& x, const repair_row& y) { thread::maybe_yield(); return _cmp(x.boundary(), y.boundary()) < 0; });
|
||||
utils::merge_to_gently(_working_row_buf, row_diff,
|
||||
[this] (const repair_row& x, const repair_row& y) { return _cmp(x.boundary(), y.boundary()) < 0; });
|
||||
}
|
||||
if (update_hash_set) {
|
||||
_peer_row_hash_sets[node_idx] = boost::copy_range<std::unordered_set<repair_hash>>(row_diff |
|
||||
_peer_row_hash_sets[node_idx] = boost::copy_range<repair_hash_set>(row_diff |
|
||||
boost::adaptors::transformed([] (repair_row& r) { thread::maybe_yield(); return r.hash(); }));
|
||||
}
|
||||
do_apply_rows(row_diff, node_idx, update_buf).get();
|
||||
do_apply_rows(std::move(row_diff), node_idx, update_buf).get();
|
||||
}
|
||||
|
||||
future<>
|
||||
@@ -1277,11 +1296,9 @@ private:
|
||||
if (rows.empty()) {
|
||||
return make_ready_future<>();
|
||||
}
|
||||
return to_repair_rows_list(rows).then([this] (std::list<repair_row> row_diff) {
|
||||
return do_with(std::move(row_diff), [this] (std::list<repair_row>& row_diff) {
|
||||
unsigned node_idx = 0;
|
||||
return do_apply_rows(row_diff, node_idx, update_working_row_buf::no);
|
||||
});
|
||||
return to_repair_rows_list(std::move(rows)).then([this] (std::list<repair_row> row_diff) {
|
||||
unsigned node_idx = 0;
|
||||
return do_apply_rows(std::move(row_diff), node_idx, update_working_row_buf::no);
|
||||
});
|
||||
}
|
||||
|
||||
@@ -1360,13 +1377,13 @@ private:
|
||||
public:
|
||||
// RPC API
|
||||
// Return the hashes of the rows in _working_row_buf
|
||||
future<std::unordered_set<repair_hash>>
|
||||
future<repair_hash_set>
|
||||
get_full_row_hashes(gms::inet_address remote_node) {
|
||||
if (remote_node == _myip) {
|
||||
return get_full_row_hashes_handler();
|
||||
}
|
||||
return netw::get_local_messaging_service().send_repair_get_full_row_hashes(msg_addr(remote_node),
|
||||
_repair_meta_id).then([this, remote_node] (std::unordered_set<repair_hash> hashes) {
|
||||
_repair_meta_id).then([this, remote_node] (repair_hash_set hashes) {
|
||||
rlogger.debug("Got full hashes from peer={}, nr_hashes={}", remote_node, hashes.size());
|
||||
_metrics.rx_hashes_nr += hashes.size();
|
||||
stats().rx_hashes_nr += hashes.size();
|
||||
@@ -1377,7 +1394,7 @@ public:
|
||||
|
||||
private:
|
||||
future<> get_full_row_hashes_source_op(
|
||||
lw_shared_ptr<std::unordered_set<repair_hash>> current_hashes,
|
||||
lw_shared_ptr<repair_hash_set> current_hashes,
|
||||
gms::inet_address remote_node,
|
||||
unsigned node_idx,
|
||||
rpc::source<repair_hash_with_cmd>& source) {
|
||||
@@ -1415,12 +1432,12 @@ private:
|
||||
}
|
||||
|
||||
public:
|
||||
future<std::unordered_set<repair_hash>>
|
||||
future<repair_hash_set>
|
||||
get_full_row_hashes_with_rpc_stream(gms::inet_address remote_node, unsigned node_idx) {
|
||||
if (remote_node == _myip) {
|
||||
return get_full_row_hashes_handler();
|
||||
}
|
||||
auto current_hashes = make_lw_shared<std::unordered_set<repair_hash>>();
|
||||
auto current_hashes = make_lw_shared<repair_hash_set>();
|
||||
return _sink_source_for_get_full_row_hashes.get_sink_source(remote_node, node_idx).then(
|
||||
[this, current_hashes, remote_node, node_idx]
|
||||
(rpc::sink<repair_stream_cmd>& sink, rpc::source<repair_hash_with_cmd>& source) mutable {
|
||||
@@ -1435,7 +1452,7 @@ public:
|
||||
}
|
||||
|
||||
// RPC handler
|
||||
future<std::unordered_set<repair_hash>>
|
||||
future<repair_hash_set>
|
||||
get_full_row_hashes_handler() {
|
||||
return with_gate(_gate, [this] {
|
||||
return working_row_hashes();
|
||||
@@ -1585,7 +1602,7 @@ public:
|
||||
// RPC API
|
||||
// Return rows in the _working_row_buf with hash within the given sef_diff
|
||||
// Must run inside a seastar thread
|
||||
void get_row_diff(std::unordered_set<repair_hash> set_diff, needs_all_rows_t needs_all_rows, gms::inet_address remote_node, unsigned node_idx) {
|
||||
void get_row_diff(repair_hash_set set_diff, needs_all_rows_t needs_all_rows, gms::inet_address remote_node, unsigned node_idx) {
|
||||
if (needs_all_rows || !set_diff.empty()) {
|
||||
if (remote_node == _myip) {
|
||||
return;
|
||||
@@ -1654,11 +1671,11 @@ private:
|
||||
}
|
||||
|
||||
future<> get_row_diff_sink_op(
|
||||
std::unordered_set<repair_hash> set_diff,
|
||||
repair_hash_set set_diff,
|
||||
needs_all_rows_t needs_all_rows,
|
||||
rpc::sink<repair_hash_with_cmd>& sink,
|
||||
gms::inet_address remote_node) {
|
||||
return do_with(std::move(set_diff), [needs_all_rows, remote_node, &sink] (std::unordered_set<repair_hash>& set_diff) mutable {
|
||||
return do_with(std::move(set_diff), [needs_all_rows, remote_node, &sink] (repair_hash_set& set_diff) mutable {
|
||||
if (inject_rpc_stream_error) {
|
||||
return make_exception_future<>(std::runtime_error("get_row_diff: Inject sender error in sink loop"));
|
||||
}
|
||||
@@ -1685,7 +1702,7 @@ private:
|
||||
public:
|
||||
// Must run inside a seastar thread
|
||||
void get_row_diff_with_rpc_stream(
|
||||
std::unordered_set<repair_hash> set_diff,
|
||||
repair_hash_set set_diff,
|
||||
needs_all_rows_t needs_all_rows,
|
||||
update_peer_row_hash_sets update_hash_set,
|
||||
gms::inet_address remote_node,
|
||||
@@ -1711,7 +1728,7 @@ public:
|
||||
}
|
||||
|
||||
// RPC handler
|
||||
future<repair_rows_on_wire> get_row_diff_handler(std::unordered_set<repair_hash> set_diff, needs_all_rows_t needs_all_rows) {
|
||||
future<repair_rows_on_wire> get_row_diff_handler(repair_hash_set set_diff, needs_all_rows_t needs_all_rows) {
|
||||
return with_gate(_gate, [this, set_diff = std::move(set_diff), needs_all_rows] () mutable {
|
||||
return get_row_diff(std::move(set_diff), needs_all_rows).then([this] (std::list<repair_row> row_diff) {
|
||||
return to_repair_rows_on_wire(std::move(row_diff));
|
||||
@@ -1721,15 +1738,16 @@ public:
|
||||
|
||||
// RPC API
|
||||
// Send rows in the _working_row_buf with hash within the given sef_diff
|
||||
future<> put_row_diff(std::unordered_set<repair_hash> set_diff, needs_all_rows_t needs_all_rows, gms::inet_address remote_node) {
|
||||
future<> put_row_diff(repair_hash_set set_diff, needs_all_rows_t needs_all_rows, gms::inet_address remote_node) {
|
||||
if (!set_diff.empty()) {
|
||||
if (remote_node == _myip) {
|
||||
return make_ready_future<>();
|
||||
}
|
||||
auto sz = set_diff.size();
|
||||
size_t sz = set_diff.size();
|
||||
return get_row_diff(std::move(set_diff), needs_all_rows).then([this, remote_node, sz] (std::list<repair_row> row_diff) {
|
||||
if (row_diff.size() != sz) {
|
||||
throw std::runtime_error("row_diff.size() != set_diff.size()");
|
||||
rlogger.warn("Hash conflict detected, keyspace={}, table={}, range={}, row_diff.size={}, set_diff.size={}. It is recommended to compact the table and rerun repair for the range.",
|
||||
_schema->ks_name(), _schema->cf_name(), _range, row_diff.size(), sz);
|
||||
}
|
||||
return do_with(std::move(row_diff), [this, remote_node] (std::list<repair_row>& row_diff) {
|
||||
return get_repair_rows_size(row_diff).then([this, remote_node, &row_diff] (size_t row_bytes) mutable {
|
||||
@@ -1796,17 +1814,18 @@ private:
|
||||
|
||||
public:
|
||||
future<> put_row_diff_with_rpc_stream(
|
||||
std::unordered_set<repair_hash> set_diff,
|
||||
repair_hash_set set_diff,
|
||||
needs_all_rows_t needs_all_rows,
|
||||
gms::inet_address remote_node, unsigned node_idx) {
|
||||
if (!set_diff.empty()) {
|
||||
if (remote_node == _myip) {
|
||||
return make_ready_future<>();
|
||||
}
|
||||
auto sz = set_diff.size();
|
||||
size_t sz = set_diff.size();
|
||||
return get_row_diff(std::move(set_diff), needs_all_rows).then([this, remote_node, node_idx, sz] (std::list<repair_row> row_diff) {
|
||||
if (row_diff.size() != sz) {
|
||||
throw std::runtime_error("row_diff.size() != set_diff.size()");
|
||||
rlogger.warn("Hash conflict detected, keyspace={}, table={}, range={}, row_diff.size={}, set_diff.size={}. It is recommended to compact the table and rerun repair for the range.",
|
||||
_schema->ks_name(), _schema->cf_name(), _range, row_diff.size(), sz);
|
||||
}
|
||||
return do_with(std::move(row_diff), [this, remote_node, node_idx] (std::list<repair_row>& row_diff) {
|
||||
return get_repair_rows_size(row_diff).then([this, remote_node, node_idx, &row_diff] (size_t row_bytes) mutable {
|
||||
@@ -1845,7 +1864,7 @@ static future<stop_iteration> repair_get_row_diff_with_rpc_stream_process_op(
|
||||
rpc::sink<repair_row_on_wire_with_cmd> sink,
|
||||
rpc::source<repair_hash_with_cmd> source,
|
||||
bool &error,
|
||||
std::unordered_set<repair_hash>& current_set_diff,
|
||||
repair_hash_set& current_set_diff,
|
||||
std::optional<std::tuple<repair_hash_with_cmd>> hash_cmd_opt) {
|
||||
repair_hash_with_cmd hash_cmd = std::get<0>(hash_cmd_opt.value());
|
||||
rlogger.trace("Got repair_hash_with_cmd from peer={}, hash={}, cmd={}", from, hash_cmd.hash, int(hash_cmd.cmd));
|
||||
@@ -1858,7 +1877,7 @@ static future<stop_iteration> repair_get_row_diff_with_rpc_stream_process_op(
|
||||
}
|
||||
bool needs_all_rows = hash_cmd.cmd == repair_stream_cmd::needs_all_rows;
|
||||
_metrics.rx_hashes_nr += current_set_diff.size();
|
||||
auto fp = make_foreign(std::make_unique<std::unordered_set<repair_hash>>(std::move(current_set_diff)));
|
||||
auto fp = make_foreign(std::make_unique<repair_hash_set>(std::move(current_set_diff)));
|
||||
return smp::submit_to(src_cpu_id % smp::count, [from, repair_meta_id, needs_all_rows, fp = std::move(fp)] {
|
||||
auto rm = repair_meta::get_repair_meta(from, repair_meta_id);
|
||||
if (fp.get_owner_shard() == this_shard_id()) {
|
||||
@@ -1936,12 +1955,12 @@ static future<stop_iteration> repair_get_full_row_hashes_with_rpc_stream_process
|
||||
if (status == repair_stream_cmd::get_full_row_hashes) {
|
||||
return smp::submit_to(src_cpu_id % smp::count, [from, repair_meta_id] {
|
||||
auto rm = repair_meta::get_repair_meta(from, repair_meta_id);
|
||||
return rm->get_full_row_hashes_handler().then([] (std::unordered_set<repair_hash> hashes) {
|
||||
return rm->get_full_row_hashes_handler().then([] (repair_hash_set hashes) {
|
||||
_metrics.tx_hashes_nr += hashes.size();
|
||||
return hashes;
|
||||
});
|
||||
}).then([sink] (std::unordered_set<repair_hash> hashes) mutable {
|
||||
return do_with(std::move(hashes), [sink] (std::unordered_set<repair_hash>& hashes) mutable {
|
||||
}).then([sink] (repair_hash_set hashes) mutable {
|
||||
return do_with(std::move(hashes), [sink] (repair_hash_set& hashes) mutable {
|
||||
return do_for_each(hashes, [sink] (const repair_hash& hash) mutable {
|
||||
return sink(repair_hash_with_cmd{repair_stream_cmd::hash_data, hash});
|
||||
}).then([sink] () mutable {
|
||||
@@ -1964,7 +1983,7 @@ static future<> repair_get_row_diff_with_rpc_stream_handler(
|
||||
uint32_t repair_meta_id,
|
||||
rpc::sink<repair_row_on_wire_with_cmd> sink,
|
||||
rpc::source<repair_hash_with_cmd> source) {
|
||||
return do_with(false, std::unordered_set<repair_hash>(), [from, src_cpu_id, repair_meta_id, sink, source] (bool& error, std::unordered_set<repair_hash>& current_set_diff) mutable {
|
||||
return do_with(false, repair_hash_set(), [from, src_cpu_id, repair_meta_id, sink, source] (bool& error, repair_hash_set& current_set_diff) mutable {
|
||||
return repeat([from, src_cpu_id, repair_meta_id, sink, source, &error, ¤t_set_diff] () mutable {
|
||||
return source().then([from, src_cpu_id, repair_meta_id, sink, source, &error, ¤t_set_diff] (std::optional<std::tuple<repair_hash_with_cmd>> hash_cmd_opt) mutable {
|
||||
if (hash_cmd_opt) {
|
||||
@@ -2107,7 +2126,7 @@ future<> repair_init_messaging_service_handler(repair_service& rs, distributed<d
|
||||
auto from = cinfo.retrieve_auxiliary<gms::inet_address>("baddr");
|
||||
return smp::submit_to(src_cpu_id % smp::count, [from, repair_meta_id] {
|
||||
auto rm = repair_meta::get_repair_meta(from, repair_meta_id);
|
||||
return rm->get_full_row_hashes_handler().then([] (std::unordered_set<repair_hash> hashes) {
|
||||
return rm->get_full_row_hashes_handler().then([] (repair_hash_set hashes) {
|
||||
_metrics.tx_hashes_nr += hashes.size();
|
||||
return hashes;
|
||||
});
|
||||
@@ -2135,11 +2154,11 @@ future<> repair_init_messaging_service_handler(repair_service& rs, distributed<d
|
||||
});
|
||||
});
|
||||
ms.register_repair_get_row_diff([] (const rpc::client_info& cinfo, uint32_t repair_meta_id,
|
||||
std::unordered_set<repair_hash> set_diff, bool needs_all_rows) {
|
||||
repair_hash_set set_diff, bool needs_all_rows) {
|
||||
auto src_cpu_id = cinfo.retrieve_auxiliary<uint32_t>("src_cpu_id");
|
||||
auto from = cinfo.retrieve_auxiliary<gms::inet_address>("baddr");
|
||||
_metrics.rx_hashes_nr += set_diff.size();
|
||||
auto fp = make_foreign(std::make_unique<std::unordered_set<repair_hash>>(std::move(set_diff)));
|
||||
auto fp = make_foreign(std::make_unique<repair_hash_set>(std::move(set_diff)));
|
||||
return smp::submit_to(src_cpu_id % smp::count, [from, repair_meta_id, fp = std::move(fp), needs_all_rows] () mutable {
|
||||
auto rm = repair_meta::get_repair_meta(from, repair_meta_id);
|
||||
if (fp.get_owner_shard() == this_shard_id()) {
|
||||
@@ -2207,6 +2226,25 @@ future<> repair_init_messaging_service_handler(repair_service& rs, distributed<d
|
||||
});
|
||||
}
|
||||
|
||||
future<> repair_uninit_messaging_service_handler() {
|
||||
return netw::get_messaging_service().invoke_on_all([] (auto& ms) {
|
||||
return when_all_succeed(
|
||||
ms.unregister_repair_get_row_diff_with_rpc_stream(),
|
||||
ms.unregister_repair_put_row_diff_with_rpc_stream(),
|
||||
ms.unregister_repair_get_full_row_hashes_with_rpc_stream(),
|
||||
ms.unregister_repair_get_full_row_hashes(),
|
||||
ms.unregister_repair_get_combined_row_hash(),
|
||||
ms.unregister_repair_get_sync_boundary(),
|
||||
ms.unregister_repair_get_row_diff(),
|
||||
ms.unregister_repair_put_row_diff(),
|
||||
ms.unregister_repair_row_level_start(),
|
||||
ms.unregister_repair_row_level_stop(),
|
||||
ms.unregister_repair_get_estimated_partitions(),
|
||||
ms.unregister_repair_set_estimated_partitions(),
|
||||
ms.unregister_repair_get_diff_algorithms()).discard_result();
|
||||
});
|
||||
}
|
||||
|
||||
class row_level_repair {
|
||||
repair_info& _ri;
|
||||
sstring _cf_name;
|
||||
@@ -2439,7 +2477,7 @@ private:
|
||||
// sequentially because the rows from repair follower 1 to
|
||||
// repair master might reduce the amount of missing data
|
||||
// between repair master and repair follower 2.
|
||||
std::unordered_set<repair_hash> set_diff = repair_meta::get_set_diff(master.peer_row_hash_sets(node_idx), master.working_row_hashes().get0());
|
||||
repair_hash_set set_diff = repair_meta::get_set_diff(master.peer_row_hash_sets(node_idx), master.working_row_hashes().get0());
|
||||
// Request missing sets from peer node
|
||||
rlogger.debug("Before get_row_diff to node {}, local={}, peer={}, set_diff={}",
|
||||
node, master.working_row_hashes().get0().size(), master.peer_row_hash_sets(node_idx).size(), set_diff.size());
|
||||
@@ -2462,9 +2500,9 @@ private:
|
||||
// So we can figure out which rows peer node are missing and send the missing rows to them
|
||||
check_in_shutdown();
|
||||
_ri.check_in_abort();
|
||||
std::unordered_set<repair_hash> local_row_hash_sets = master.working_row_hashes().get0();
|
||||
repair_hash_set local_row_hash_sets = master.working_row_hashes().get0();
|
||||
auto sz = _all_live_peer_nodes.size();
|
||||
std::vector<std::unordered_set<repair_hash>> set_diffs(sz);
|
||||
std::vector<repair_hash_set> set_diffs(sz);
|
||||
for (size_t idx : boost::irange(size_t(0), sz)) {
|
||||
set_diffs[idx] = repair_meta::get_set_diff(local_row_hash_sets, master.peer_row_hash_sets(idx));
|
||||
}
|
||||
|
||||
@@ -45,6 +45,7 @@ private:
|
||||
};
|
||||
|
||||
future<> repair_init_messaging_service_handler(repair_service& rs, distributed<db::system_distributed_keyspace>& sys_dist_ks, distributed<db::view::view_update_generator>& view_update_generator);
|
||||
future<> repair_uninit_messaging_service_handler();
|
||||
|
||||
class repair_info;
|
||||
|
||||
|
||||
@@ -19,6 +19,7 @@
|
||||
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#include <seastar/core/on_internal_error.hh>
|
||||
#include <map>
|
||||
#include "utils/UUID_gen.hh"
|
||||
#include "cql3/column_identifier.hh"
|
||||
@@ -43,6 +44,8 @@
|
||||
|
||||
constexpr int32_t schema::NAME_LENGTH;
|
||||
|
||||
extern logging::logger dblog;
|
||||
|
||||
sstring to_sstring(column_kind k) {
|
||||
switch (k) {
|
||||
case column_kind::partition_key: return "PARTITION_KEY";
|
||||
@@ -592,11 +595,15 @@ schema::get_column_definition(const bytes& name) const {
|
||||
|
||||
const column_definition&
|
||||
schema::column_at(column_kind kind, column_id id) const {
|
||||
return _raw._columns.at(column_offset(kind) + id);
|
||||
return column_at(static_cast<ordinal_column_id>(column_offset(kind) + id));
|
||||
}
|
||||
|
||||
const column_definition&
|
||||
schema::column_at(ordinal_column_id ordinal_id) const {
|
||||
if (size_t(ordinal_id) >= _raw._columns.size()) [[unlikely]] {
|
||||
on_internal_error(dblog, format("{}.{}@{}: column id {:d} >= {:d}",
|
||||
ks_name(), cf_name(), version(), size_t(ordinal_id), _raw._columns.size()));
|
||||
}
|
||||
return _raw._columns.at(static_cast<column_count_type>(ordinal_id));
|
||||
}
|
||||
|
||||
|
||||
@@ -92,7 +92,8 @@ executables = ['build/{}/scylla'.format(args.mode),
|
||||
'/usr/sbin/ethtool',
|
||||
'/usr/bin/netstat',
|
||||
'/usr/bin/hwloc-distrib',
|
||||
'/usr/bin/hwloc-calc']
|
||||
'/usr/bin/hwloc-calc',
|
||||
'/usr/bin/lsblk']
|
||||
|
||||
output = args.dest
|
||||
|
||||
|
||||
@@ -597,7 +597,7 @@ def current_shard():
|
||||
|
||||
|
||||
def find_db(shard=None):
|
||||
if not shard:
|
||||
if shard is None:
|
||||
shard = current_shard()
|
||||
return gdb.parse_and_eval('::debug::db')['_instances']['_M_impl']['_M_start'][shard]['service']['_p']
|
||||
|
||||
|
||||
@@ -63,6 +63,17 @@ MemoryHigh=1200M
|
||||
MemoryMax=1400M
|
||||
MemoryLimit=1400M
|
||||
EOS
|
||||
|
||||
# On CentOS7, systemd does not support percentage-based parameter.
|
||||
# To apply memory parameter on CentOS7, we need to override the parameter
|
||||
# in bytes, instead of percentage.
|
||||
elif [ "$RHEL" -a "$VERSION_ID" = "7" ]; then
|
||||
MEMORY_LIMIT=$((MEMTOTAL_BYTES / 100 * 5))
|
||||
mkdir -p /etc/systemd/system/scylla-helper.slice.d/
|
||||
cat << EOS > /etc/systemd/system/scylla-helper.slice.d/memory.conf
|
||||
[Slice]
|
||||
MemoryLimit=$MEMORY_LIMIT
|
||||
EOS
|
||||
fi
|
||||
|
||||
systemctl --system daemon-reload >/dev/null || true
|
||||
|
||||
2
seastar
2
seastar
Submodule seastar updated: 11e86172ba...f760efe0a0
@@ -25,6 +25,7 @@
|
||||
#include <seastar/util/bool_class.hh>
|
||||
#include <boost/range/algorithm/for_each.hpp>
|
||||
#include "utils/small_vector.hh"
|
||||
#include <absl/container/btree_set.h>
|
||||
|
||||
namespace ser {
|
||||
|
||||
@@ -81,6 +82,17 @@ static inline void serialize_array(Output& out, const Container& v) {
|
||||
template<typename Container>
|
||||
struct container_traits;
|
||||
|
||||
template<typename T>
|
||||
struct container_traits<absl::btree_set<T>> {
|
||||
struct back_emplacer {
|
||||
absl::btree_set<T>& c;
|
||||
back_emplacer(absl::btree_set<T>& c_) : c(c_) {}
|
||||
void operator()(T&& v) {
|
||||
c.emplace(std::move(v));
|
||||
}
|
||||
};
|
||||
};
|
||||
|
||||
template<typename T>
|
||||
struct container_traits<std::unordered_set<T>> {
|
||||
struct back_emplacer {
|
||||
@@ -253,6 +265,27 @@ struct serializer<std::list<T>> {
|
||||
}
|
||||
};
|
||||
|
||||
template<typename T>
|
||||
struct serializer<absl::btree_set<T>> {
|
||||
template<typename Input>
|
||||
static absl::btree_set<T> read(Input& in) {
|
||||
auto sz = deserialize(in, boost::type<uint32_t>());
|
||||
absl::btree_set<T> v;
|
||||
deserialize_array_helper<false, T>::doit(in, v, sz);
|
||||
return v;
|
||||
}
|
||||
template<typename Output>
|
||||
static void write(Output& out, const absl::btree_set<T>& v) {
|
||||
safe_serialize_as_uint32(out, v.size());
|
||||
serialize_array_helper<false, T>::doit(out, v);
|
||||
}
|
||||
template<typename Input>
|
||||
static void skip(Input& in) {
|
||||
auto sz = deserialize(in, boost::type<uint32_t>());
|
||||
skip_array<T>(in, sz);
|
||||
}
|
||||
};
|
||||
|
||||
template<typename T>
|
||||
struct serializer<std::unordered_set<T>> {
|
||||
template<typename Input>
|
||||
|
||||
@@ -92,7 +92,7 @@ void migration_manager::init_messaging_service()
|
||||
//FIXME: future discarded.
|
||||
(void)with_gate(_background_tasks, [this] {
|
||||
mlogger.debug("features changed, recalculating schema version");
|
||||
return update_schema_version_and_announce(get_storage_proxy(), _feat.cluster_schema_features());
|
||||
return db::schema_tables::recalculate_schema_version(get_storage_proxy(), _feat);
|
||||
});
|
||||
};
|
||||
|
||||
@@ -1104,6 +1104,20 @@ future<schema_ptr> get_schema_definition(table_schema_version v, netw::messaging
|
||||
mlogger.debug("Requesting schema {} from {}", v, dst);
|
||||
auto& ms = netw::get_local_messaging_service();
|
||||
return ms.send_get_schema_version(dst, v);
|
||||
}).then([] (schema_ptr s) {
|
||||
// If this is a view so this schema also needs a reference to the base
|
||||
// table.
|
||||
if (s->is_view()) {
|
||||
if (!s->view_info()->base_info()) {
|
||||
auto& db = service::get_local_storage_proxy().get_db().local();
|
||||
// This line might throw a no_such_column_family
|
||||
// It should be fine since if we tried to register a view for which
|
||||
// we don't know the base table, our registry is broken.
|
||||
schema_ptr base_schema = db.find_schema(s->view_info()->base_id());
|
||||
s->view_info()->set_base_info(s->view_info()->make_base_dependent_view_info(*base_schema));
|
||||
}
|
||||
}
|
||||
return s;
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
@@ -120,9 +120,11 @@ using fbu = utils::fb_utilities;
|
||||
|
||||
static inline
|
||||
query::digest_algorithm digest_algorithm(service::storage_proxy& proxy) {
|
||||
return proxy.features().cluster_supports_xxhash_digest_algorithm()
|
||||
? query::digest_algorithm::xxHash
|
||||
: query::digest_algorithm::MD5;
|
||||
return proxy.features().cluster_supports_digest_for_null_values()
|
||||
? query::digest_algorithm::xxHash
|
||||
: proxy.features().cluster_supports_xxhash_digest_algorithm()
|
||||
? query::digest_algorithm::legacy_xxHash_without_null_digest
|
||||
: query::digest_algorithm::MD5;
|
||||
}
|
||||
|
||||
static inline
|
||||
@@ -1760,6 +1762,7 @@ storage_proxy::storage_proxy(distributed<database>& db, storage_proxy::config cf
|
||||
, _token_metadata(tm)
|
||||
, _read_smp_service_group(cfg.read_smp_service_group)
|
||||
, _write_smp_service_group(cfg.write_smp_service_group)
|
||||
, _hints_write_smp_service_group(cfg.hints_write_smp_service_group)
|
||||
, _write_ack_smp_service_group(cfg.write_ack_smp_service_group)
|
||||
, _next_response_id(std::chrono::system_clock::now().time_since_epoch()/1ms)
|
||||
, _hints_resource_manager(cfg.available_memory / 10)
|
||||
@@ -1803,39 +1806,48 @@ storage_proxy::response_id_type storage_proxy::unique_response_handler::release(
|
||||
}
|
||||
|
||||
future<>
|
||||
storage_proxy::mutate_locally(const mutation& m, tracing::trace_state_ptr tr_state, clock_type::time_point timeout) {
|
||||
storage_proxy::mutate_locally(const mutation& m, tracing::trace_state_ptr tr_state, db::commitlog::force_sync sync, clock_type::time_point timeout, smp_service_group smp_grp) {
|
||||
auto shard = _db.local().shard_of(m);
|
||||
get_stats().replica_cross_shard_ops += shard != this_shard_id();
|
||||
return _db.invoke_on(shard, {_write_smp_service_group, timeout},
|
||||
[s = global_schema_ptr(m.schema()), m = freeze(m), gtr = tracing::global_trace_state_ptr(std::move(tr_state)), timeout] (database& db) mutable -> future<> {
|
||||
return db.apply(s, m, gtr.get(), db::commitlog::force_sync::no, timeout);
|
||||
return _db.invoke_on(shard, {smp_grp, timeout},
|
||||
[s = global_schema_ptr(m.schema()),
|
||||
m = freeze(m),
|
||||
gtr = tracing::global_trace_state_ptr(std::move(tr_state)),
|
||||
timeout,
|
||||
sync] (database& db) mutable -> future<> {
|
||||
return db.apply(s, m, gtr.get(), sync, timeout);
|
||||
});
|
||||
}
|
||||
|
||||
future<>
|
||||
storage_proxy::mutate_locally(const schema_ptr& s, const frozen_mutation& m, tracing::trace_state_ptr tr_state, db::commitlog::force_sync sync, clock_type::time_point timeout) {
|
||||
storage_proxy::mutate_locally(const schema_ptr& s, const frozen_mutation& m, tracing::trace_state_ptr tr_state, db::commitlog::force_sync sync, clock_type::time_point timeout,
|
||||
smp_service_group smp_grp) {
|
||||
auto shard = _db.local().shard_of(m);
|
||||
get_stats().replica_cross_shard_ops += shard != this_shard_id();
|
||||
return _db.invoke_on(shard, {_write_smp_service_group, timeout},
|
||||
return _db.invoke_on(shard, {smp_grp, timeout},
|
||||
[&m, gs = global_schema_ptr(s), gtr = tracing::global_trace_state_ptr(std::move(tr_state)), timeout, sync] (database& db) mutable -> future<> {
|
||||
return db.apply(gs, m, gtr.get(), sync, timeout);
|
||||
});
|
||||
}
|
||||
|
||||
future<>
|
||||
storage_proxy::mutate_locally(std::vector<mutation> mutations, tracing::trace_state_ptr tr_state, clock_type::time_point timeout) {
|
||||
return do_with(std::move(mutations), [this, timeout, tr_state = std::move(tr_state)] (std::vector<mutation>& pmut) mutable {
|
||||
return parallel_for_each(pmut.begin(), pmut.end(), [this, tr_state = std::move(tr_state), timeout] (const mutation& m) mutable {
|
||||
return mutate_locally(m, tr_state, timeout);
|
||||
storage_proxy::mutate_locally(std::vector<mutation> mutations, tracing::trace_state_ptr tr_state, clock_type::time_point timeout, smp_service_group smp_grp) {
|
||||
return do_with(std::move(mutations), [this, timeout, tr_state = std::move(tr_state), smp_grp] (std::vector<mutation>& pmut) mutable {
|
||||
return parallel_for_each(pmut.begin(), pmut.end(), [this, tr_state = std::move(tr_state), timeout, smp_grp] (const mutation& m) mutable {
|
||||
return mutate_locally(m, tr_state, db::commitlog::force_sync::no, timeout, smp_grp);
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
future<>
|
||||
storage_proxy::mutate_locally(std::vector<mutation> mutation, tracing::trace_state_ptr tr_state, clock_type::time_point timeout) {
|
||||
return mutate_locally(std::move(mutation), tr_state, timeout, _write_smp_service_group);
|
||||
}
|
||||
future<>
|
||||
storage_proxy::mutate_hint(const schema_ptr& s, const frozen_mutation& m, tracing::trace_state_ptr tr_state, clock_type::time_point timeout) {
|
||||
auto shard = _db.local().shard_of(m);
|
||||
get_stats().replica_cross_shard_ops += shard != this_shard_id();
|
||||
return _db.invoke_on(shard, {_write_smp_service_group, timeout}, [&m, gs = global_schema_ptr(s), tr_state = std::move(tr_state), timeout] (database& db) mutable -> future<> {
|
||||
return _db.invoke_on(shard, {_hints_write_smp_service_group, timeout}, [&m, gs = global_schema_ptr(s), tr_state = std::move(tr_state), timeout] (database& db) mutable -> future<> {
|
||||
return db.apply_hint(gs, m, std::move(tr_state), timeout);
|
||||
});
|
||||
}
|
||||
@@ -4488,6 +4500,12 @@ future<bool> storage_proxy::cas(schema_ptr schema, shared_ptr<cas_request> reque
|
||||
paxos::paxos_state::logger.debug("CAS[{}] successful", handler->id());
|
||||
tracing::trace(handler->tr_state, "CAS successful");
|
||||
return std::optional<bool>(condition_met);
|
||||
}).handle_exception_type([handler] (unavailable_exception& e) {
|
||||
// if learning stage encountered unavailablity error lets re-map it to a write error
|
||||
// since unavailable error means that operation has never ever started which is not the case here
|
||||
schema_ptr schema = handler->schema();
|
||||
return make_exception_future<std::optional<bool>>(mutation_write_timeout_exception(schema->ks_name(), schema->cf_name(),
|
||||
e.consistency, e.alive, e.required, db::write_type::CAS));
|
||||
});
|
||||
}
|
||||
paxos::paxos_state::logger.debug("CAS[{}] PAXOS proposal not accepted (pre-empted by a higher ballot)",
|
||||
@@ -4849,7 +4867,7 @@ void storage_proxy::init_messaging_service() {
|
||||
});
|
||||
};
|
||||
|
||||
auto receive_mutation_handler = [] (const rpc::client_info& cinfo, rpc::opt_time_point t, frozen_mutation in, std::vector<gms::inet_address> forward,
|
||||
auto receive_mutation_handler = [] (smp_service_group smp_grp, const rpc::client_info& cinfo, rpc::opt_time_point t, frozen_mutation in, std::vector<gms::inet_address> forward,
|
||||
gms::inet_address reply_to, unsigned shard, storage_proxy::response_id_type response_id, rpc::optional<std::optional<tracing::trace_info>> trace_info) {
|
||||
tracing::trace_state_ptr trace_state_ptr;
|
||||
auto src_addr = netw::messaging_service::get_source(cinfo);
|
||||
@@ -4857,9 +4875,9 @@ void storage_proxy::init_messaging_service() {
|
||||
utils::UUID schema_version = in.schema_version();
|
||||
return handle_write(src_addr, t, schema_version, std::move(in), std::move(forward), reply_to, shard, response_id,
|
||||
trace_info ? *trace_info : std::nullopt,
|
||||
/* apply_fn */ [] (shared_ptr<storage_proxy>& p, tracing::trace_state_ptr tr_state, schema_ptr s, const frozen_mutation& m,
|
||||
/* apply_fn */ [smp_grp] (shared_ptr<storage_proxy>& p, tracing::trace_state_ptr tr_state, schema_ptr s, const frozen_mutation& m,
|
||||
clock_type::time_point timeout) {
|
||||
return p->mutate_locally(std::move(s), m, std::move(tr_state), db::commitlog::force_sync::no, timeout);
|
||||
return p->mutate_locally(std::move(s), m, std::move(tr_state), db::commitlog::force_sync::no, timeout, smp_grp);
|
||||
},
|
||||
/* forward_fn */ [] (netw::messaging_service::msg_addr addr, clock_type::time_point timeout, const frozen_mutation& m,
|
||||
gms::inet_address reply_to, unsigned shard, response_id_type response_id,
|
||||
@@ -4868,8 +4886,8 @@ void storage_proxy::init_messaging_service() {
|
||||
return ms.send_mutation(addr, timeout, m, {}, reply_to, shard, response_id, std::move(trace_info));
|
||||
});
|
||||
};
|
||||
ms.register_mutation(receive_mutation_handler);
|
||||
ms.register_hint_mutation(receive_mutation_handler);
|
||||
ms.register_mutation(std::bind_front<>(receive_mutation_handler, _write_smp_service_group));
|
||||
ms.register_hint_mutation(std::bind_front<>(receive_mutation_handler, _hints_write_smp_service_group));
|
||||
|
||||
ms.register_paxos_learn([] (const rpc::client_info& cinfo, rpc::opt_time_point t, paxos::proposal decision,
|
||||
std::vector<gms::inet_address> forward, gms::inet_address reply_to, unsigned shard,
|
||||
@@ -5112,18 +5130,22 @@ void storage_proxy::init_messaging_service() {
|
||||
future<> storage_proxy::uninit_messaging_service() {
|
||||
auto& ms = netw::get_local_messaging_service();
|
||||
return when_all_succeed(
|
||||
ms.unregister_counter_mutation(),
|
||||
ms.unregister_mutation(),
|
||||
ms.unregister_hint_mutation(),
|
||||
ms.unregister_mutation_done(),
|
||||
ms.unregister_mutation_failed(),
|
||||
ms.unregister_read_data(),
|
||||
ms.unregister_read_mutation_data(),
|
||||
ms.unregister_read_digest(),
|
||||
ms.unregister_truncate(),
|
||||
ms.unregister_get_schema_version(),
|
||||
ms.unregister_paxos_prepare(),
|
||||
ms.unregister_paxos_accept(),
|
||||
ms.unregister_paxos_learn(),
|
||||
ms.unregister_paxos_prune()
|
||||
).discard_result();
|
||||
|
||||
}
|
||||
|
||||
future<rpc::tuple<foreign_ptr<lw_shared_ptr<reconcilable_result>>, cache_temperature>>
|
||||
@@ -5217,8 +5239,7 @@ future<> storage_proxy::drain_on_shutdown() {
|
||||
|
||||
future<>
|
||||
storage_proxy::stop() {
|
||||
// FIXME: hints manager should be stopped here but it seems like this function is never called
|
||||
return uninit_messaging_service();
|
||||
return make_ready_future<>();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -166,6 +166,7 @@ public:
|
||||
size_t available_memory;
|
||||
smp_service_group read_smp_service_group = default_smp_service_group();
|
||||
smp_service_group write_smp_service_group = default_smp_service_group();
|
||||
smp_service_group hints_write_smp_service_group = default_smp_service_group();
|
||||
// Write acknowledgments might not be received on the correct shard, and
|
||||
// they need a separate smp_service_group to prevent an ABBA deadlock
|
||||
// with writes.
|
||||
@@ -256,6 +257,7 @@ private:
|
||||
locator::token_metadata& _token_metadata;
|
||||
smp_service_group _read_smp_service_group;
|
||||
smp_service_group _write_smp_service_group;
|
||||
smp_service_group _hints_write_smp_service_group;
|
||||
smp_service_group _write_ack_smp_service_group;
|
||||
response_id_type _next_response_id;
|
||||
response_handlers_map _response_handlers;
|
||||
@@ -314,7 +316,6 @@ private:
|
||||
|
||||
cdc_stats _cdc_stats;
|
||||
private:
|
||||
future<> uninit_messaging_service();
|
||||
future<coordinator_query_result> query_singular(lw_shared_ptr<query::read_command> cmd,
|
||||
dht::partition_range_vector&& partition_ranges,
|
||||
db::consistency_level cl,
|
||||
@@ -469,13 +470,31 @@ public:
|
||||
return next;
|
||||
}
|
||||
void init_messaging_service();
|
||||
future<> uninit_messaging_service();
|
||||
|
||||
private:
|
||||
// Applies mutation on this node.
|
||||
// Resolves with timed_out_error when timeout is reached.
|
||||
future<> mutate_locally(const mutation& m, tracing::trace_state_ptr tr_state, clock_type::time_point timeout = clock_type::time_point::max());
|
||||
future<> mutate_locally(const mutation& m, tracing::trace_state_ptr tr_state, db::commitlog::force_sync sync, clock_type::time_point timeout, smp_service_group smp_grp);
|
||||
// Applies mutation on this node.
|
||||
// Resolves with timed_out_error when timeout is reached.
|
||||
future<> mutate_locally(const schema_ptr&, const frozen_mutation& m, tracing::trace_state_ptr tr_state, db::commitlog::force_sync sync, clock_type::time_point timeout = clock_type::time_point::max());
|
||||
future<> mutate_locally(const schema_ptr&, const frozen_mutation& m, tracing::trace_state_ptr tr_state, db::commitlog::force_sync sync, clock_type::time_point timeout,
|
||||
smp_service_group smp_grp);
|
||||
// Applies mutations on this node.
|
||||
// Resolves with timed_out_error when timeout is reached.
|
||||
future<> mutate_locally(std::vector<mutation> mutation, tracing::trace_state_ptr tr_state, clock_type::time_point timeout, smp_service_group smp_grp);
|
||||
|
||||
public:
|
||||
// Applies mutation on this node.
|
||||
// Resolves with timed_out_error when timeout is reached.
|
||||
future<> mutate_locally(const mutation& m, tracing::trace_state_ptr tr_state, db::commitlog::force_sync sync, clock_type::time_point timeout = clock_type::time_point::max()) {
|
||||
return mutate_locally(m, tr_state, sync, timeout, _write_smp_service_group);
|
||||
}
|
||||
// Applies mutation on this node.
|
||||
// Resolves with timed_out_error when timeout is reached.
|
||||
future<> mutate_locally(const schema_ptr& s, const frozen_mutation& m, tracing::trace_state_ptr tr_state, db::commitlog::force_sync sync, clock_type::time_point timeout = clock_type::time_point::max()) {
|
||||
return mutate_locally(s, m, tr_state, sync, timeout, _write_smp_service_group);
|
||||
}
|
||||
// Applies mutations on this node.
|
||||
// Resolves with timed_out_error when timeout is reached.
|
||||
future<> mutate_locally(std::vector<mutation> mutation, tracing::trace_state_ptr tr_state, clock_type::time_point timeout = clock_type::time_point::max());
|
||||
|
||||
@@ -108,7 +108,8 @@ storage_service::storage_service(abort_source& abort_source, distributed<databas
|
||||
, _replicate_action([this] { return do_replicate_to_all_cores(); })
|
||||
, _update_pending_ranges_action([this] { return do_update_pending_ranges(); })
|
||||
, _sys_dist_ks(sys_dist_ks)
|
||||
, _view_update_generator(view_update_generator) {
|
||||
, _view_update_generator(view_update_generator)
|
||||
, _schema_version_publisher([this] { return publish_schema_version(); }) {
|
||||
register_metrics();
|
||||
sstable_read_error.connect([this] { do_isolate_on_error(disk_error::regular); });
|
||||
sstable_write_error.connect([this] { do_isolate_on_error(disk_error::regular); });
|
||||
@@ -209,6 +210,16 @@ bool storage_service::should_bootstrap() const {
|
||||
return is_auto_bootstrap() && !db::system_keyspace::bootstrap_complete() && !_gossiper.get_seeds().count(get_broadcast_address());
|
||||
}
|
||||
|
||||
void storage_service::install_schema_version_change_listener() {
|
||||
_listeners.emplace_back(make_lw_shared(_db.local().observable_schema_version().observe([this] (utils::UUID schema_version) {
|
||||
(void)_schema_version_publisher.trigger();
|
||||
})));
|
||||
}
|
||||
|
||||
future<> storage_service::publish_schema_version() {
|
||||
return get_local_migration_manager().passive_announce(_db.local().get_version());
|
||||
}
|
||||
|
||||
// Runs inside seastar::async context
|
||||
void storage_service::prepare_to_join(std::vector<inet_address> loaded_endpoints, const std::unordered_map<gms::inet_address, sstring>& loaded_peer_features, bind_messaging_port do_bind) {
|
||||
std::map<gms::application_state, gms::versioned_value> app_states;
|
||||
@@ -351,7 +362,7 @@ void storage_service::prepare_to_join(std::vector<inet_address> loaded_endpoints
|
||||
auto broadcast_rpc_address = utils::fb_utilities::get_broadcast_rpc_address();
|
||||
auto& proxy = service::get_storage_proxy();
|
||||
// Ensure we know our own actual Schema UUID in preparation for updates
|
||||
auto schema_version = update_schema_version(proxy, _feature_service.cluster_schema_features()).get0();
|
||||
db::schema_tables::recalculate_schema_version(proxy, _feature_service).get0();
|
||||
app_states.emplace(gms::application_state::NET_VERSION, versioned_value::network_version());
|
||||
app_states.emplace(gms::application_state::HOST_ID, versioned_value::host_id(local_host_id));
|
||||
app_states.emplace(gms::application_state::RPC_ADDRESS, versioned_value::rpcaddress(broadcast_rpc_address));
|
||||
@@ -361,7 +372,7 @@ void storage_service::prepare_to_join(std::vector<inet_address> loaded_endpoints
|
||||
app_states.emplace(gms::application_state::SCHEMA_TABLES_VERSION, versioned_value(db::schema_tables::version));
|
||||
app_states.emplace(gms::application_state::RPC_READY, versioned_value::cql_ready(false));
|
||||
app_states.emplace(gms::application_state::VIEW_BACKLOG, versioned_value(""));
|
||||
app_states.emplace(gms::application_state::SCHEMA, versioned_value::schema(schema_version));
|
||||
app_states.emplace(gms::application_state::SCHEMA, versioned_value::schema(_db.local().get_version()));
|
||||
if (restarting_normal_node) {
|
||||
// Order is important: both the CDC streams timestamp and tokens must be known when a node handles our status.
|
||||
// Exception: there might be no CDC streams timestamp proposed by us if we're upgrading from a non-CDC version.
|
||||
@@ -369,11 +380,16 @@ void storage_service::prepare_to_join(std::vector<inet_address> loaded_endpoints
|
||||
app_states.emplace(gms::application_state::CDC_STREAMS_TIMESTAMP, versioned_value::cdc_streams_timestamp(_cdc_streams_ts));
|
||||
app_states.emplace(gms::application_state::STATUS, versioned_value::normal(my_tokens));
|
||||
}
|
||||
if (replacing_a_node_with_same_ip || replacing_a_node_with_diff_ip) {
|
||||
app_states.emplace(gms::application_state::TOKENS, versioned_value::tokens(_bootstrap_tokens));
|
||||
}
|
||||
slogger.info("Starting up server gossip");
|
||||
|
||||
auto generation_number = db::system_keyspace::increment_and_get_generation().get0();
|
||||
_gossiper.start_gossiping(generation_number, app_states, gms::bind_messaging_port(bool(do_bind))).get();
|
||||
|
||||
install_schema_version_change_listener();
|
||||
|
||||
// gossip snitch infos (local DC and rack)
|
||||
gossip_snitch_info().get();
|
||||
|
||||
@@ -698,6 +714,8 @@ bool storage_service::do_handle_cdc_generation_intercept_nonfatal_errors(db_cloc
|
||||
throw cdc_generation_handling_nonfatal_exception(e.what());
|
||||
} catch (exceptions::unavailable_exception& e) {
|
||||
throw cdc_generation_handling_nonfatal_exception(e.what());
|
||||
} catch (exceptions::read_failure_exception& e) {
|
||||
throw cdc_generation_handling_nonfatal_exception(e.what());
|
||||
} catch (...) {
|
||||
const auto ep = std::current_exception();
|
||||
if (is_timeout_exception(ep)) {
|
||||
@@ -890,12 +908,14 @@ future<> storage_service::check_and_repair_cdc_streams() {
|
||||
cdc_log.error("Aborting CDC generation repair due to missing STATUS");
|
||||
return;
|
||||
}
|
||||
// Update _cdc_streams_ts first, so that do_handle_cdc_generation (which will get called due to the status update)
|
||||
// won't try to update the gossiper, which would result in a deadlock inside add_local_application_state
|
||||
_cdc_streams_ts = new_streams_ts;
|
||||
_gossiper.add_local_application_state({
|
||||
{ gms::application_state::CDC_STREAMS_TIMESTAMP, versioned_value::cdc_streams_timestamp(new_streams_ts) },
|
||||
{ gms::application_state::STATUS, *status }
|
||||
}).get();
|
||||
db::system_keyspace::update_cdc_streams_timestamp(new_streams_ts).get();
|
||||
_cdc_streams_ts = new_streams_ts;
|
||||
});
|
||||
}
|
||||
|
||||
@@ -1728,6 +1748,9 @@ future<> storage_service::gossip_sharder() {
|
||||
future<> storage_service::stop() {
|
||||
return uninit_messaging_service().then([this] {
|
||||
return _service_memory_limiter.wait(_service_memory_total); // make sure nobody uses the semaphore
|
||||
}).finally([this] {
|
||||
_listeners.clear();
|
||||
return _schema_version_publisher.join();
|
||||
});
|
||||
}
|
||||
|
||||
@@ -1884,9 +1907,11 @@ future<std::map<gms::inet_address, float>> storage_service::effective_ownership(
|
||||
return do_with(dht::token::describe_ownership(ss._token_metadata.sorted_tokens()),
|
||||
ss._token_metadata.get_topology().get_datacenter_endpoints(),
|
||||
std::map<gms::inet_address, float>(),
|
||||
[&ss, keyspace_name](const std::map<token, float>& token_ownership, std::unordered_map<sstring,
|
||||
std::move(keyspace_name),
|
||||
[&ss](const std::map<token, float>& token_ownership, std::unordered_map<sstring,
|
||||
std::unordered_set<gms::inet_address>>& datacenter_endpoints,
|
||||
std::map<gms::inet_address, float>& final_ownership) {
|
||||
std::map<gms::inet_address, float>& final_ownership,
|
||||
sstring& keyspace_name) {
|
||||
return do_for_each(datacenter_endpoints, [&ss, &keyspace_name, &final_ownership, &token_ownership](std::pair<sstring,std::unordered_set<inet_address>>&& endpoints) mutable {
|
||||
return do_with(std::unordered_set<inet_address>(endpoints.second), [&ss, &keyspace_name, &final_ownership, &token_ownership](const std::unordered_set<inet_address>& endpoints_map) mutable {
|
||||
return do_for_each(endpoints_map, [&ss, &keyspace_name, &final_ownership, &token_ownership](const gms::inet_address& endpoint) mutable {
|
||||
|
||||
@@ -150,6 +150,7 @@ private:
|
||||
semaphore _service_memory_limiter;
|
||||
using client_shutdown_hook = noncopyable_function<void()>;
|
||||
std::vector<std::pair<std::string, client_shutdown_hook>> _client_shutdown_hooks;
|
||||
std::vector<std::any> _listeners;
|
||||
|
||||
/* For unit tests only.
|
||||
*
|
||||
@@ -170,7 +171,8 @@ public:
|
||||
private:
|
||||
future<> do_update_pending_ranges();
|
||||
void register_metrics();
|
||||
|
||||
future<> publish_schema_version();
|
||||
void install_schema_version_change_listener();
|
||||
public:
|
||||
future<> keyspace_changed(const sstring& ks_name);
|
||||
future<> update_pending_ranges();
|
||||
@@ -545,6 +547,7 @@ private:
|
||||
serialized_action _update_pending_ranges_action;
|
||||
sharded<db::system_distributed_keyspace>& _sys_dist_ks;
|
||||
sharded<db::view::view_update_generator>& _view_update_generator;
|
||||
serialized_action _schema_version_publisher;
|
||||
private:
|
||||
/**
|
||||
* Replicates token_metadata contents on shard0 instance to other shards.
|
||||
|
||||
@@ -602,7 +602,7 @@ private:
|
||||
// - add support to merge summary (message: Partition merge counts were {%s}.).
|
||||
// - there is no easy way, currently, to know the exact number of total partitions.
|
||||
// By the time being, using estimated key count.
|
||||
sstring formatted_msg = fmt::format("{} sstables to [{}]. {} to {} (~{} of original) in {}ms = {}. " \
|
||||
sstring formatted_msg = fmt::format("{} sstables to [{}]. {} to {} (~{}% of original) in {}ms = {}. " \
|
||||
"~{} total partitions merged to {}.",
|
||||
_info->sstables, new_sstables_msg, pretty_printed_data_size(_info->start_size), pretty_printed_data_size(_info->end_size), int(ratio * 100),
|
||||
std::chrono::duration_cast<std::chrono::milliseconds>(duration).count(), pretty_printed_throughput(_info->end_size, duration),
|
||||
@@ -1236,11 +1236,8 @@ private:
|
||||
// return estimated partitions per sstable for a given shard
|
||||
uint64_t partitions_per_sstable(shard_id s) const {
|
||||
uint64_t estimated_sstables = std::max(uint64_t(1), uint64_t(ceil(double(_estimation_per_shard[s].estimated_size) / _max_sstable_size)));
|
||||
// As we adjust this estimate downwards from the compaction strategy, it can get to 0 so
|
||||
// make sure we're returning at least 1.
|
||||
return std::max(uint64_t(1),
|
||||
std::min(uint64_t(ceil(double(_estimation_per_shard[s].estimated_partitions) / estimated_sstables)),
|
||||
_cf.get_compaction_strategy().adjust_partition_estimate(_ms_metadata, _estimation_per_shard[s].estimated_partitions)));
|
||||
return std::min(uint64_t(ceil(double(_estimation_per_shard[s].estimated_partitions) / estimated_sstables)),
|
||||
_cf.get_compaction_strategy().adjust_partition_estimate(_ms_metadata, _estimation_per_shard[s].estimated_partitions));
|
||||
}
|
||||
public:
|
||||
resharding_compaction(column_family& cf, sstables::compaction_descriptor descriptor)
|
||||
|
||||
@@ -92,6 +92,9 @@ public:
|
||||
void transfer_ongoing_charges(compaction_backlog_tracker& new_bt, bool move_read_charges = true);
|
||||
void revert_charges(sstables::shared_sstable sst);
|
||||
private:
|
||||
// Returns true if this SSTable can be added or removed from the tracker.
|
||||
bool sstable_belongs_to_tracker(const sstables::shared_sstable& sst);
|
||||
|
||||
void disable() {
|
||||
_disabled = true;
|
||||
_ongoing_writes = {};
|
||||
|
||||
@@ -218,7 +218,7 @@ std::vector<sstables::shared_sstable> compaction_manager::get_candidates(const c
|
||||
auto& cs = cf.get_compaction_strategy();
|
||||
|
||||
// Filter out sstables that are being compacted.
|
||||
for (auto& sst : cf.candidates_for_compaction()) {
|
||||
for (auto& sst : cf.non_staging_sstables()) {
|
||||
if (_compacting_sstables.count(sst)) {
|
||||
continue;
|
||||
}
|
||||
@@ -708,8 +708,8 @@ future<> compaction_manager::rewrite_sstables(column_family* cf, sstables::compa
|
||||
return task->compaction_done.get_future().then([task] {});
|
||||
}
|
||||
|
||||
static bool needs_cleanup(const sstables::shared_sstable& sst,
|
||||
const dht::token_range_vector& owned_ranges,
|
||||
bool needs_cleanup(const sstables::shared_sstable& sst,
|
||||
const dht::token_range_vector& sorted_owned_ranges,
|
||||
schema_ptr s) {
|
||||
auto first = sst->get_first_partition_key();
|
||||
auto last = sst->get_last_partition_key();
|
||||
@@ -717,29 +717,40 @@ static bool needs_cleanup(const sstables::shared_sstable& sst,
|
||||
auto last_token = dht::get_token(*s, last);
|
||||
dht::token_range sst_token_range = dht::token_range::make(first_token, last_token);
|
||||
|
||||
auto r = std::lower_bound(sorted_owned_ranges.begin(), sorted_owned_ranges.end(), first_token,
|
||||
[] (const range<dht::token>& a, const dht::token& b) {
|
||||
// check that range a is before token b.
|
||||
return a.after(b, dht::token_comparator());
|
||||
});
|
||||
|
||||
// return true iff sst partition range isn't fully contained in any of the owned ranges.
|
||||
for (auto& r : owned_ranges) {
|
||||
if (r.contains(sst_token_range, dht::token_comparator())) {
|
||||
if (r != sorted_owned_ranges.end()) {
|
||||
if (r->contains(sst_token_range, dht::token_comparator())) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
future<> compaction_manager::perform_cleanup(column_family* cf) {
|
||||
future<> compaction_manager::perform_cleanup(database& db, column_family* cf) {
|
||||
if (check_for_cleanup(cf)) {
|
||||
throw std::runtime_error(format("cleanup request failed: there is an ongoing cleanup on {}.{}",
|
||||
cf->schema()->ks_name(), cf->schema()->cf_name()));
|
||||
}
|
||||
return rewrite_sstables(cf, sstables::compaction_options::make_cleanup(), [this] (const table& table) {
|
||||
auto schema = table.schema();
|
||||
auto owned_ranges = service::get_local_storage_service().get_local_ranges(schema->ks_name());
|
||||
return seastar::async([this, cf, &db] {
|
||||
auto schema = cf->schema();
|
||||
auto& rs = db.find_keyspace(schema->ks_name()).get_replication_strategy();
|
||||
auto sorted_owned_ranges = rs.get_ranges_in_thread(utils::fb_utilities::get_broadcast_address());
|
||||
auto sstables = std::vector<sstables::shared_sstable>{};
|
||||
const auto candidates = table.candidates_for_compaction();
|
||||
std::copy_if(candidates.begin(), candidates.end(), std::back_inserter(sstables), [&owned_ranges, schema] (const sstables::shared_sstable& sst) {
|
||||
return owned_ranges.empty() || needs_cleanup(sst, owned_ranges, schema);
|
||||
const auto candidates = get_candidates(*cf);
|
||||
std::copy_if(candidates.begin(), candidates.end(), std::back_inserter(sstables), [&sorted_owned_ranges, schema] (const sstables::shared_sstable& sst) {
|
||||
seastar::thread::maybe_yield();
|
||||
return sorted_owned_ranges.empty() || needs_cleanup(sst, sorted_owned_ranges, schema);
|
||||
});
|
||||
return sstables;
|
||||
}).then([this, cf] (std::vector<sstables::shared_sstable> sstables) {
|
||||
return rewrite_sstables(cf, sstables::compaction_options::make_cleanup(),
|
||||
[sstables = std::move(sstables)] (const table&) { return sstables; });
|
||||
});
|
||||
}
|
||||
|
||||
@@ -754,7 +765,7 @@ future<> compaction_manager::perform_sstable_upgrade(column_family* cf, bool exc
|
||||
return cf->run_with_compaction_disabled([this, cf, &tables, exclude_current_version] {
|
||||
auto last_version = cf->get_sstables_manager().get_highest_supported_format();
|
||||
|
||||
for (auto& sst : cf->candidates_for_compaction()) {
|
||||
for (auto& sst : get_candidates(*cf)) {
|
||||
// if we are a "normal" upgrade, we only care about
|
||||
// tables with older versions, but potentially
|
||||
// we are to actually rewrite everything. (-a)
|
||||
@@ -779,8 +790,8 @@ future<> compaction_manager::perform_sstable_upgrade(column_family* cf, bool exc
|
||||
|
||||
// Submit a column family to be scrubbed and wait for its termination.
|
||||
future<> compaction_manager::perform_sstable_scrub(column_family* cf, bool skip_corrupted) {
|
||||
return rewrite_sstables(cf, sstables::compaction_options::make_scrub(skip_corrupted), [] (const table& cf) {
|
||||
return cf.candidates_for_compaction();
|
||||
return rewrite_sstables(cf, sstables::compaction_options::make_scrub(skip_corrupted), [this] (const table& cf) {
|
||||
return get_candidates(cf);
|
||||
});
|
||||
}
|
||||
|
||||
@@ -857,7 +868,7 @@ double compaction_backlog_tracker::backlog() const {
|
||||
}
|
||||
|
||||
void compaction_backlog_tracker::add_sstable(sstables::shared_sstable sst) {
|
||||
if (_disabled) {
|
||||
if (_disabled || !sstable_belongs_to_tracker(sst)) {
|
||||
return;
|
||||
}
|
||||
_ongoing_writes.erase(sst);
|
||||
@@ -870,7 +881,7 @@ void compaction_backlog_tracker::add_sstable(sstables::shared_sstable sst) {
|
||||
}
|
||||
|
||||
void compaction_backlog_tracker::remove_sstable(sstables::shared_sstable sst) {
|
||||
if (_disabled) {
|
||||
if (_disabled || !sstable_belongs_to_tracker(sst)) {
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -883,6 +894,10 @@ void compaction_backlog_tracker::remove_sstable(sstables::shared_sstable sst) {
|
||||
}
|
||||
}
|
||||
|
||||
bool compaction_backlog_tracker::sstable_belongs_to_tracker(const sstables::shared_sstable& sst) {
|
||||
return !sst->requires_view_building();
|
||||
}
|
||||
|
||||
void compaction_backlog_tracker::register_partially_written_sstable(sstables::shared_sstable sst, backlog_write_progress_manager& wp) {
|
||||
if (_disabled) {
|
||||
return;
|
||||
|
||||
@@ -205,7 +205,7 @@ public:
|
||||
// Cleanup is about discarding keys that are no longer relevant for a
|
||||
// given sstable, e.g. after node loses part of its token range because
|
||||
// of a newly added node.
|
||||
future<> perform_cleanup(column_family* cf);
|
||||
future<> perform_cleanup(database& db, column_family* cf);
|
||||
|
||||
// Submit a column family to be upgraded and wait for its termination.
|
||||
future<> perform_sstable_upgrade(column_family* cf, bool exclude_current_version);
|
||||
@@ -271,3 +271,5 @@ public:
|
||||
friend class compaction_weight_registration;
|
||||
};
|
||||
|
||||
bool needs_cleanup(const sstables::shared_sstable& sst, const dht::token_range_vector& owned_ranges, schema_ptr s);
|
||||
|
||||
|
||||
@@ -438,8 +438,8 @@ std::unique_ptr<sstable_set_impl> leveled_compaction_strategy::make_sstable_set(
|
||||
return std::make_unique<partitioned_sstable_set>(std::move(schema));
|
||||
}
|
||||
|
||||
std::unique_ptr<sstable_set_impl> make_partitioned_sstable_set(schema_ptr schema, bool use_level_metadata) {
|
||||
return std::make_unique<partitioned_sstable_set>(std::move(schema), use_level_metadata);
|
||||
sstable_set make_partitioned_sstable_set(schema_ptr schema, lw_shared_ptr<sstable_list> all, bool use_level_metadata) {
|
||||
return sstables::sstable_set(std::make_unique<partitioned_sstable_set>(schema, use_level_metadata), schema, std::move(all));
|
||||
}
|
||||
|
||||
compaction_descriptor compaction_strategy_impl::get_major_compaction_job(column_family& cf, std::vector<sstables::shared_sstable> candidates) {
|
||||
|
||||
@@ -453,9 +453,16 @@ private:
|
||||
auto indexes = std::move(entries_reader->_consumer.indexes);
|
||||
return entries_reader->_context.close().then([indexes = std::move(indexes), ex = std::move(ex)] () mutable {
|
||||
if (ex) {
|
||||
std::rethrow_exception(std::move(ex));
|
||||
return do_with(std::move(indexes), [ex = std::move(ex)] (index_list& indexes) mutable {
|
||||
return parallel_for_each(indexes, [] (index_entry& ie) mutable {
|
||||
return ie.close_pi_stream();
|
||||
}).then_wrapped([ex = std::move(ex)] (future<>&& fut) mutable {
|
||||
fut.ignore_ready_future();
|
||||
return make_exception_future<index_list>(std::move(ex));
|
||||
});
|
||||
});
|
||||
}
|
||||
return std::move(indexes);
|
||||
return make_ready_future<index_list>(std::move(indexes));
|
||||
});
|
||||
|
||||
});
|
||||
|
||||
@@ -178,7 +178,13 @@ leveled_compaction_strategy::get_reshaping_job(std::vector<shared_sstable> input
|
||||
|
||||
size_t offstrategy_threshold = std::max(schema->min_compaction_threshold(), 4);
|
||||
size_t max_sstables = std::max(schema->max_compaction_threshold(), int(offstrategy_threshold));
|
||||
unsigned tolerance = mode == reshape_mode::strict ? 0 : leveled_manifest::leveled_fan_out * 2;
|
||||
auto tolerance = [mode] (unsigned level) -> unsigned {
|
||||
if (mode == reshape_mode::strict) {
|
||||
return 0;
|
||||
}
|
||||
constexpr unsigned fan_out = leveled_manifest::leveled_fan_out;
|
||||
return std::max(double(fan_out), std::ceil(std::pow(fan_out, level) * 0.1));
|
||||
};
|
||||
|
||||
if (level_info[0].size() > offstrategy_threshold) {
|
||||
level_info[0].resize(std::min(level_info[0].size(), max_sstables));
|
||||
@@ -193,7 +199,7 @@ leveled_compaction_strategy::get_reshaping_job(std::vector<shared_sstable> input
|
||||
}
|
||||
max_filled_level = std::max(max_filled_level, level);
|
||||
|
||||
if (!is_disjoint(level_info[level], tolerance)) {
|
||||
if (!is_disjoint(level_info[level], tolerance(level))) {
|
||||
leveled_manifest::logger.warn("Turns out that level {} is not disjoint, so compacting everything on behalf of {}.{}", level, schema->ks_name(), schema->cf_name());
|
||||
// Unfortunately no good limit to limit input size to max_sstables for LCS major
|
||||
compaction_descriptor desc(std::move(input), std::optional<sstables::sstable_set>(), iop, max_filled_level, _max_sstable_size_in_mb * 1024 * 1024);
|
||||
|
||||
@@ -741,6 +741,11 @@ public:
|
||||
, _run_identifier(cfg.run_identifier)
|
||||
, _write_regular_as_static(cfg.correctly_serialize_static_compact_in_mc && s.is_static_compact_table())
|
||||
{
|
||||
// This can be 0 in some cases, which is albeit benign, can wreak havoc
|
||||
// in lower-level writer code, so clamp it to [1, +inf) here, which is
|
||||
// exactly what callers used to do anyway.
|
||||
estimated_partitions = std::max(uint64_t(1), estimated_partitions);
|
||||
|
||||
_sst.generate_toc(_schema.get_compressor_params().get_compressor(), _schema.bloom_filter_fp_chance());
|
||||
_sst.write_toc(_pc);
|
||||
_sst.create_data().get();
|
||||
|
||||
@@ -27,7 +27,7 @@
|
||||
namespace sstables {
|
||||
|
||||
std::vector<std::pair<sstables::shared_sstable, uint64_t>>
|
||||
size_tiered_compaction_strategy::create_sstable_and_length_pairs(const std::vector<sstables::shared_sstable>& sstables) const {
|
||||
size_tiered_compaction_strategy::create_sstable_and_length_pairs(const std::vector<sstables::shared_sstable>& sstables) {
|
||||
|
||||
std::vector<std::pair<sstables::shared_sstable, uint64_t>> sstable_length_pairs;
|
||||
sstable_length_pairs.reserve(sstables.size());
|
||||
@@ -43,7 +43,7 @@ size_tiered_compaction_strategy::create_sstable_and_length_pairs(const std::vect
|
||||
}
|
||||
|
||||
std::vector<std::vector<sstables::shared_sstable>>
|
||||
size_tiered_compaction_strategy::get_buckets(const std::vector<sstables::shared_sstable>& sstables) const {
|
||||
size_tiered_compaction_strategy::get_buckets(const std::vector<sstables::shared_sstable>& sstables, size_tiered_compaction_strategy_options options) {
|
||||
// sstables sorted by size of its data file.
|
||||
auto sorted_sstables = create_sstable_and_length_pairs(sstables);
|
||||
|
||||
@@ -64,8 +64,8 @@ size_tiered_compaction_strategy::get_buckets(const std::vector<sstables::shared_
|
||||
for (auto it = buckets.begin(); it != buckets.end(); it++) {
|
||||
size_t old_average_size = it->first;
|
||||
|
||||
if ((size > (old_average_size * _options.bucket_low) && size < (old_average_size * _options.bucket_high)) ||
|
||||
(size < _options.min_sstable_size && old_average_size < _options.min_sstable_size)) {
|
||||
if ((size > (old_average_size * options.bucket_low) && size < (old_average_size * options.bucket_high)) ||
|
||||
(size < options.min_sstable_size && old_average_size < options.min_sstable_size)) {
|
||||
auto bucket = std::move(it->second);
|
||||
size_t total_size = bucket.size() * old_average_size;
|
||||
size_t new_average_size = (total_size + size) / (bucket.size() + 1);
|
||||
@@ -97,6 +97,11 @@ size_tiered_compaction_strategy::get_buckets(const std::vector<sstables::shared_
|
||||
return bucket_list;
|
||||
}
|
||||
|
||||
std::vector<std::vector<sstables::shared_sstable>>
|
||||
size_tiered_compaction_strategy::get_buckets(const std::vector<sstables::shared_sstable>& sstables) const {
|
||||
return get_buckets(sstables, _options);
|
||||
}
|
||||
|
||||
std::vector<sstables::shared_sstable>
|
||||
size_tiered_compaction_strategy::most_interesting_bucket(std::vector<std::vector<sstables::shared_sstable>> buckets,
|
||||
unsigned min_threshold, unsigned max_threshold)
|
||||
@@ -176,23 +181,28 @@ size_tiered_compaction_strategy::get_sstables_for_compaction(column_family& cfs,
|
||||
return sstables::compaction_descriptor();
|
||||
}
|
||||
|
||||
int64_t size_tiered_compaction_strategy::estimated_pending_compactions(const std::vector<sstables::shared_sstable>& sstables,
|
||||
int min_threshold, int max_threshold, size_tiered_compaction_strategy_options options) {
|
||||
int64_t n = 0;
|
||||
for (auto& bucket : get_buckets(sstables, options)) {
|
||||
if (bucket.size() >= size_t(min_threshold)) {
|
||||
n += std::ceil(double(bucket.size()) / max_threshold);
|
||||
}
|
||||
}
|
||||
return n;
|
||||
}
|
||||
|
||||
int64_t size_tiered_compaction_strategy::estimated_pending_compactions(column_family& cf) const {
|
||||
int min_threshold = cf.min_compaction_threshold();
|
||||
int max_threshold = cf.schema()->max_compaction_threshold();
|
||||
std::vector<sstables::shared_sstable> sstables;
|
||||
int64_t n = 0;
|
||||
|
||||
sstables.reserve(cf.sstables_count());
|
||||
for (auto& entry : *cf.get_sstables()) {
|
||||
sstables.push_back(entry);
|
||||
}
|
||||
|
||||
for (auto& bucket : get_buckets(sstables)) {
|
||||
if (bucket.size() >= size_t(min_threshold)) {
|
||||
n += std::ceil(double(bucket.size()) / max_threshold);
|
||||
}
|
||||
}
|
||||
return n;
|
||||
return estimated_pending_compactions(sstables, min_threshold, max_threshold, _options);
|
||||
}
|
||||
|
||||
std::vector<sstables::shared_sstable>
|
||||
|
||||
@@ -116,9 +116,11 @@ class size_tiered_compaction_strategy : public compaction_strategy_impl {
|
||||
compaction_backlog_tracker _backlog_tracker;
|
||||
|
||||
// Return a list of pair of shared_sstable and its respective size.
|
||||
std::vector<std::pair<sstables::shared_sstable, uint64_t>> create_sstable_and_length_pairs(const std::vector<sstables::shared_sstable>& sstables) const;
|
||||
static std::vector<std::pair<sstables::shared_sstable, uint64_t>> create_sstable_and_length_pairs(const std::vector<sstables::shared_sstable>& sstables);
|
||||
|
||||
// Group files of similar size into buckets.
|
||||
static std::vector<std::vector<sstables::shared_sstable>> get_buckets(const std::vector<sstables::shared_sstable>& sstables, size_tiered_compaction_strategy_options options);
|
||||
|
||||
std::vector<std::vector<sstables::shared_sstable>> get_buckets(const std::vector<sstables::shared_sstable>& sstables) const;
|
||||
|
||||
// Maybe return a bucket of sstables to compact
|
||||
@@ -154,6 +156,8 @@ public:
|
||||
|
||||
virtual compaction_descriptor get_sstables_for_compaction(column_family& cfs, std::vector<sstables::shared_sstable> candidates) override;
|
||||
|
||||
static int64_t estimated_pending_compactions(const std::vector<sstables::shared_sstable>& sstables,
|
||||
int min_threshold, int max_threshold, size_tiered_compaction_strategy_options options);
|
||||
virtual int64_t estimated_pending_compactions(column_family& cf) const override;
|
||||
|
||||
virtual compaction_strategy_type type() const {
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user