Compare commits
177 Commits
copilot/up
...
scylla-4.0
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
4ae9a56466 | ||
|
|
0374c1d040 | ||
|
|
9cb0fe3b33 | ||
|
|
a813ff4da2 | ||
|
|
d5936147f4 | ||
|
|
a3d3b4e185 | ||
|
|
4ca2576c98 | ||
|
|
e99a0c7b89 | ||
|
|
f8c7605657 | ||
|
|
7b9e33dcd4 | ||
|
|
d86a31097a | ||
|
|
bd9d6f8e45 | ||
|
|
11ef23e97a | ||
|
|
2c0eac09ae | ||
|
|
713a7269d0 | ||
|
|
1724301d4d | ||
|
|
9971f2f5db | ||
|
|
ee328c22ca | ||
|
|
3a9c9a8a12 | ||
|
|
c03445871a | ||
|
|
565ac1b092 | ||
|
|
7d1180b98f | ||
|
|
f258e6f6ee | ||
|
|
2708b0d664 | ||
|
|
e31ffbf2e6 | ||
|
|
801994e299 | ||
|
|
3b932078bf | ||
|
|
608f62a0e9 | ||
|
|
d8619d3320 | ||
|
|
4f0c99a187 | ||
|
|
ada79df082 | ||
|
|
1935f2b480 | ||
|
|
44a76ed231 | ||
|
|
aeb49f4915 | ||
|
|
8d6b35ad20 | ||
|
|
b123700ebe | ||
|
|
6786b521f9 | ||
|
|
fda0d1ae8e | ||
|
|
e7cffb978a | ||
|
|
79a1c74921 | ||
|
|
3ee854f9fc | ||
|
|
2b65984d14 | ||
|
|
52d1099d09 | ||
|
|
3a03906377 | ||
|
|
2395a240b4 | ||
|
|
d182c595a1 | ||
|
|
fe9c4611b3 | ||
|
|
29df416720 | ||
|
|
1d3c00572c | ||
|
|
9d6e2c5a71 | ||
|
|
386741e3b7 | ||
|
|
d0fdc3960a | ||
|
|
4035cf4f9f | ||
|
|
09367742b1 | ||
|
|
a18ff57b29 | ||
|
|
4734ba21a7 | ||
|
|
425af4c543 | ||
|
|
55f096d01b | ||
|
|
fc79da5912 | ||
|
|
da9e7080ca | ||
|
|
01b0195c22 | ||
|
|
d05b567a40 | ||
|
|
2c11efbbae | ||
|
|
c60d71dc69 | ||
|
|
79930048db | ||
|
|
82b4f4a6c2 | ||
|
|
5b99195d21 | ||
|
|
edde256228 | ||
|
|
3cf28ac18e | ||
|
|
58b65f61c0 | ||
|
|
466cfb0ca6 | ||
|
|
1cd6f50806 | ||
|
|
3f6fe7328a | ||
|
|
f9dd8608eb | ||
|
|
24a80cbf47 | ||
|
|
6e4edc97ad | ||
|
|
81df28b6f3 | ||
|
|
ea6620e9eb | ||
|
|
19be84dafd | ||
|
|
2ff897d351 | ||
|
|
8fc3300739 | ||
|
|
d2ac7d4b18 | ||
|
|
61706a6789 | ||
|
|
65aa531010 | ||
|
|
4bffd0f522 | ||
|
|
9409fc7290 | ||
|
|
86faf1b3ca | ||
|
|
426295bda9 | ||
|
|
c6fde0e562 | ||
|
|
d9f9e7455b | ||
|
|
e95bcd0f8f | ||
|
|
2ff6e2e122 | ||
|
|
1fcf38abd9 | ||
|
|
3375b8b86c | ||
|
|
586546ab32 | ||
|
|
e1d558cb01 | ||
|
|
b0a8f396b4 | ||
|
|
48e7ee374a | ||
|
|
3e85ecd1bd | ||
|
|
930a4af8b3 | ||
|
|
6a6d36058a | ||
|
|
ce57d0174d | ||
|
|
cd11f210ad | ||
|
|
1e2e203cf0 | ||
|
|
1a98c93a25 | ||
|
|
4f4845c94c | ||
|
|
ef745e1ce7 | ||
|
|
ae32aa970a | ||
|
|
a3eb12c5f1 | ||
|
|
b5cedfc177 | ||
|
|
8d9bc57aca | ||
|
|
1cbda629a2 | ||
|
|
baf0201a6e | ||
|
|
7dcffb963c | ||
|
|
dcfaf4d035 | ||
|
|
f974a54cbd | ||
|
|
30a96cc592 | ||
|
|
faf300382a | ||
|
|
55400598ff | ||
|
|
c177295bce | ||
|
|
d95aa77b62 | ||
|
|
fe54009855 | ||
|
|
bbe82236be | ||
|
|
abd73cab78 | ||
|
|
8fd7cf5cd1 | ||
|
|
dd88b2dd18 | ||
|
|
eee4c00e29 | ||
|
|
85071ceeb1 | ||
|
|
4cf201fc24 | ||
|
|
c6ad5cf556 | ||
|
|
51e3e6c655 | ||
|
|
8ac6579b30 | ||
|
|
3744e66244 | ||
|
|
d3bf349484 | ||
|
|
3e6a8ba5bd | ||
|
|
5f1785b9cf | ||
|
|
e1fd6cf989 | ||
|
|
b7328ff1e4 | ||
|
|
602ed43ac7 | ||
|
|
c42c91c5bb | ||
|
|
cf017b320a | ||
|
|
89e79023ae | ||
|
|
bc67da1a21 | ||
|
|
0c7643f1fe | ||
|
|
c563234f40 | ||
|
|
77b7a48a02 | ||
|
|
b2b1bfb159 | ||
|
|
d72cbe37aa | ||
|
|
9f7b560771 | ||
|
|
06af9c028c | ||
|
|
c74ab3ae80 | ||
|
|
32cd3a070a | ||
|
|
bb1554f09e | ||
|
|
2037d7550e | ||
|
|
c320c3f6da | ||
|
|
0ed70944aa | ||
|
|
89f860d409 | ||
|
|
0819d221f4 | ||
|
|
53f47d4e67 | ||
|
|
21ad12669a | ||
|
|
c812359383 | ||
|
|
1bd79705fb | ||
|
|
7e2ef386cc | ||
|
|
51bad7e72c | ||
|
|
0379d0c031 | ||
|
|
a8ef820f27 | ||
|
|
9908f009a4 | ||
|
|
48d8a075b4 | ||
|
|
e3ddd607bc | ||
|
|
511773d466 | ||
|
|
121cd383fa | ||
|
|
90639f48e5 | ||
|
|
8d029a04aa | ||
|
|
67995db899 | ||
|
|
282cd0df7c | ||
|
|
ce58994d30 | ||
|
|
78f5afec30 |
5
.gitmodules
vendored
5
.gitmodules
vendored
@@ -1,6 +1,6 @@
|
||||
[submodule "seastar"]
|
||||
path = seastar
|
||||
url = ../seastar
|
||||
url = ../scylla-seastar
|
||||
ignore = dirty
|
||||
[submodule "swagger-ui"]
|
||||
path = swagger-ui
|
||||
@@ -15,3 +15,6 @@
|
||||
[submodule "zstd"]
|
||||
path = zstd
|
||||
url = ../zstd
|
||||
[submodule "abseil"]
|
||||
path = abseil
|
||||
url = ../abseil-cpp
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
#!/bin/sh
|
||||
|
||||
PRODUCT=scylla
|
||||
VERSION=666.development
|
||||
VERSION=4.0.11
|
||||
|
||||
if test -f version
|
||||
then
|
||||
|
||||
1
abseil
Submodule
1
abseil
Submodule
Submodule abseil added at 2069dc796a
@@ -66,8 +66,9 @@ static std::string format_time_point(db_clock::time_point tp) {
|
||||
time_t time_point_repr = db_clock::to_time_t(tp);
|
||||
std::string time_point_str;
|
||||
time_point_str.resize(17);
|
||||
::tm time_buf;
|
||||
// strftime prints the terminating null character as well
|
||||
std::strftime(time_point_str.data(), time_point_str.size(), "%Y%m%dT%H%M%SZ", std::gmtime(&time_point_repr));
|
||||
std::strftime(time_point_str.data(), time_point_str.size(), "%Y%m%dT%H%M%SZ", ::gmtime_r(&time_point_repr, &time_buf));
|
||||
time_point_str.resize(16);
|
||||
return time_point_str;
|
||||
}
|
||||
|
||||
@@ -365,31 +365,35 @@ bool check_compare(const rjson::value* v1, const rjson::value& v2, const Compara
|
||||
|
||||
struct cmp_lt {
|
||||
template <typename T> bool operator()(const T& lhs, const T& rhs) const { return lhs < rhs; }
|
||||
// We cannot use the normal comparison operators like "<" on the bytes
|
||||
// type, because they treat individual bytes as signed but we need to
|
||||
// compare them as *unsigned*. So we need a specialization for bytes.
|
||||
bool operator()(const bytes& lhs, const bytes& rhs) const { return compare_unsigned(lhs, rhs) < 0; }
|
||||
static constexpr const char* diagnostic = "LT operator";
|
||||
};
|
||||
|
||||
struct cmp_le {
|
||||
// bytes only has <, so we cannot use <=.
|
||||
template <typename T> bool operator()(const T& lhs, const T& rhs) const { return lhs < rhs || lhs == rhs; }
|
||||
template <typename T> bool operator()(const T& lhs, const T& rhs) const { return lhs <= rhs; }
|
||||
bool operator()(const bytes& lhs, const bytes& rhs) const { return compare_unsigned(lhs, rhs) <= 0; }
|
||||
static constexpr const char* diagnostic = "LE operator";
|
||||
};
|
||||
|
||||
struct cmp_ge {
|
||||
// bytes only has <, so we cannot use >=.
|
||||
template <typename T> bool operator()(const T& lhs, const T& rhs) const { return rhs < lhs || lhs == rhs; }
|
||||
template <typename T> bool operator()(const T& lhs, const T& rhs) const { return lhs >= rhs; }
|
||||
bool operator()(const bytes& lhs, const bytes& rhs) const { return compare_unsigned(lhs, rhs) >= 0; }
|
||||
static constexpr const char* diagnostic = "GE operator";
|
||||
};
|
||||
|
||||
struct cmp_gt {
|
||||
// bytes only has <, so we cannot use >.
|
||||
template <typename T> bool operator()(const T& lhs, const T& rhs) const { return rhs < lhs; }
|
||||
template <typename T> bool operator()(const T& lhs, const T& rhs) const { return lhs > rhs; }
|
||||
bool operator()(const bytes& lhs, const bytes& rhs) const { return compare_unsigned(lhs, rhs) > 0; }
|
||||
static constexpr const char* diagnostic = "GT operator";
|
||||
};
|
||||
|
||||
// True if v is between lb and ub, inclusive. Throws if lb > ub.
|
||||
template <typename T>
|
||||
bool check_BETWEEN(const T& v, const T& lb, const T& ub) {
|
||||
if (ub < lb) {
|
||||
if (cmp_lt()(ub, lb)) {
|
||||
throw api_error("ValidationException",
|
||||
format("BETWEEN operator requires lower_bound <= upper_bound, but {} > {}", lb, ub));
|
||||
}
|
||||
|
||||
@@ -208,12 +208,11 @@ get_table_or_view(service::storage_proxy& proxy, const rjson::value& request) {
|
||||
throw api_error("ValidationException",
|
||||
format("Non-string IndexName '{}'", index_name->GetString()));
|
||||
}
|
||||
}
|
||||
|
||||
// If no tables for global indexes were found, the index may be local
|
||||
if (!proxy.get_db().local().has_schema(keyspace_name, table_name)) {
|
||||
type = table_or_view_type::lsi;
|
||||
table_name = lsi_name(orig_table_name, index_name->GetString());
|
||||
// If no tables for global indexes were found, the index may be local
|
||||
if (!proxy.get_db().local().has_schema(keyspace_name, table_name)) {
|
||||
type = table_or_view_type::lsi;
|
||||
table_name = lsi_name(orig_table_name, index_name->GetString());
|
||||
}
|
||||
}
|
||||
|
||||
try {
|
||||
@@ -566,7 +565,7 @@ static void validate_tags(const std::map<sstring, sstring>& tags) {
|
||||
// to races during concurrent updates of the same table. Once Scylla schema updates
|
||||
// are fixed, this issue will automatically get fixed as well.
|
||||
enum class update_tags_action { add_tags, delete_tags };
|
||||
static future<> update_tags(const rjson::value& tags, schema_ptr schema, std::map<sstring, sstring>&& tags_map, update_tags_action action) {
|
||||
static future<> update_tags(service::migration_manager& mm, const rjson::value& tags, schema_ptr schema, std::map<sstring, sstring>&& tags_map, update_tags_action action) {
|
||||
if (action == update_tags_action::add_tags) {
|
||||
for (auto it = tags.Begin(); it != tags.End(); ++it) {
|
||||
const rjson::value& key = (*it)["Key"];
|
||||
@@ -593,24 +592,12 @@ static future<> update_tags(const rjson::value& tags, schema_ptr schema, std::ma
|
||||
}
|
||||
validate_tags(tags_map);
|
||||
|
||||
std::stringstream serialized_tags;
|
||||
serialized_tags << '{';
|
||||
for (auto& tag_entry : tags_map) {
|
||||
serialized_tags << format("'{}':'{}',", tag_entry.first, tag_entry.second);
|
||||
}
|
||||
std::string serialized_tags_str = serialized_tags.str();
|
||||
if (!tags_map.empty()) {
|
||||
serialized_tags_str[serialized_tags_str.size() - 1] = '}'; // trims the last ',' delimiter
|
||||
} else {
|
||||
serialized_tags_str.push_back('}');
|
||||
}
|
||||
|
||||
sstring req = format("ALTER TABLE \"{}\".\"{}\" WITH {} = {}",
|
||||
schema->ks_name(), schema->cf_name(), tags_extension::NAME, serialized_tags_str);
|
||||
return db::execute_cql(std::move(req)).discard_result();
|
||||
schema_builder builder(schema);
|
||||
builder.set_extensions(schema::extensions_map{{sstring(tags_extension::NAME), ::make_shared<tags_extension>(std::move(tags_map))}});
|
||||
return mm.announce_column_family_update(builder.build(), false, std::vector<view_ptr>(), false);
|
||||
}
|
||||
|
||||
static future<> add_tags(service::storage_proxy& proxy, schema_ptr schema, rjson::value& request_info) {
|
||||
static future<> add_tags(service::migration_manager& mm, service::storage_proxy& proxy, schema_ptr schema, rjson::value& request_info) {
|
||||
const rjson::value* tags = rjson::find(request_info, "Tags");
|
||||
if (!tags || !tags->IsArray()) {
|
||||
return make_exception_future<>(api_error("ValidationException", format("Cannot parse tags")));
|
||||
@@ -620,7 +607,7 @@ static future<> add_tags(service::storage_proxy& proxy, schema_ptr schema, rjson
|
||||
}
|
||||
|
||||
std::map<sstring, sstring> tags_map = get_tags_of_table(schema);
|
||||
return update_tags(rjson::copy(*tags), schema, std::move(tags_map), update_tags_action::add_tags);
|
||||
return update_tags(mm, rjson::copy(*tags), schema, std::move(tags_map), update_tags_action::add_tags);
|
||||
}
|
||||
|
||||
future<executor::request_return_type> executor::tag_resource(client_state& client_state, service_permit permit, rjson::value request) {
|
||||
@@ -632,7 +619,7 @@ future<executor::request_return_type> executor::tag_resource(client_state& clien
|
||||
return api_error("AccessDeniedException", "Incorrect resource identifier");
|
||||
}
|
||||
schema_ptr schema = get_table_from_arn(_proxy, std::string_view(arn->GetString(), arn->GetStringLength()));
|
||||
add_tags(_proxy, schema, request).get();
|
||||
add_tags(_mm, _proxy, schema, request).get();
|
||||
return json_string("");
|
||||
});
|
||||
}
|
||||
@@ -653,7 +640,7 @@ future<executor::request_return_type> executor::untag_resource(client_state& cli
|
||||
schema_ptr schema = get_table_from_arn(_proxy, std::string_view(arn->GetString(), arn->GetStringLength()));
|
||||
|
||||
std::map<sstring, sstring> tags_map = get_tags_of_table(schema);
|
||||
update_tags(*tags, schema, std::move(tags_map), update_tags_action::delete_tags).get();
|
||||
update_tags(_mm, *tags, schema, std::move(tags_map), update_tags_action::delete_tags).get();
|
||||
return json_string("");
|
||||
});
|
||||
}
|
||||
@@ -870,7 +857,7 @@ future<executor::request_return_type> executor::create_table(client_state& clien
|
||||
}).then([this, table_info = std::move(table_info), schema] () mutable {
|
||||
future<> f = make_ready_future<>();
|
||||
if (rjson::find(table_info, "Tags")) {
|
||||
f = add_tags(_proxy, schema, table_info);
|
||||
f = add_tags(_mm, _proxy, schema, table_info);
|
||||
}
|
||||
return f.then([table_info = std::move(table_info), schema] () mutable {
|
||||
rjson::value status = rjson::empty_object();
|
||||
@@ -900,15 +887,24 @@ class attribute_collector {
|
||||
void add(bytes&& name, atomic_cell&& cell) {
|
||||
collected.emplace(std::move(name), std::move(cell));
|
||||
}
|
||||
void add(const bytes& name, atomic_cell&& cell) {
|
||||
collected.emplace(name, std::move(cell));
|
||||
}
|
||||
public:
|
||||
attribute_collector() : collected(attrs_type()->get_keys_type()->as_less_comparator()) { }
|
||||
void put(bytes&& name, bytes&& val, api::timestamp_type ts) {
|
||||
add(std::move(name), atomic_cell::make_live(*bytes_type, ts, std::move(val), atomic_cell::collection_member::yes));
|
||||
void put(bytes&& name, const bytes& val, api::timestamp_type ts) {
|
||||
add(std::move(name), atomic_cell::make_live(*bytes_type, ts, val, atomic_cell::collection_member::yes));
|
||||
|
||||
}
|
||||
void put(const bytes& name, const bytes& val, api::timestamp_type ts) {
|
||||
add(name, atomic_cell::make_live(*bytes_type, ts, val, atomic_cell::collection_member::yes));
|
||||
}
|
||||
void del(bytes&& name, api::timestamp_type ts) {
|
||||
add(std::move(name), atomic_cell::make_dead(ts, gc_clock::now()));
|
||||
}
|
||||
void del(const bytes& name, api::timestamp_type ts) {
|
||||
add(name, atomic_cell::make_dead(ts, gc_clock::now()));
|
||||
}
|
||||
collection_mutation_description to_mut() {
|
||||
collection_mutation_description ret;
|
||||
for (auto&& e : collected) {
|
||||
@@ -988,7 +984,7 @@ public:
|
||||
put_or_delete_item(const rjson::value& item, schema_ptr schema, put_item);
|
||||
// put_or_delete_item doesn't keep a reference to schema (so it can be
|
||||
// moved between shards for LWT) so it needs to be given again to build():
|
||||
mutation build(schema_ptr schema, api::timestamp_type ts);
|
||||
mutation build(schema_ptr schema, api::timestamp_type ts) const;
|
||||
const partition_key& pk() const { return _pk; }
|
||||
const clustering_key& ck() const { return _ck; }
|
||||
};
|
||||
@@ -1017,20 +1013,29 @@ put_or_delete_item::put_or_delete_item(const rjson::value& item, schema_ptr sche
|
||||
}
|
||||
}
|
||||
|
||||
mutation put_or_delete_item::build(schema_ptr schema, api::timestamp_type ts) {
|
||||
mutation put_or_delete_item::build(schema_ptr schema, api::timestamp_type ts) const {
|
||||
mutation m(schema, _pk);
|
||||
auto& row = m.partition().clustered_row(*schema, _ck);
|
||||
// If there's no clustering key, a tombstone should be created directly
|
||||
// on a partition, not on a clustering row - otherwise it will look like
|
||||
// an open-ended range tombstone, which will crash on KA/LA sstable format.
|
||||
// Ref: #6035
|
||||
const bool use_partition_tombstone = schema->clustering_key_size() == 0;
|
||||
if (!_cells) {
|
||||
// a DeleteItem operation:
|
||||
row.apply(tombstone(ts, gc_clock::now()));
|
||||
if (use_partition_tombstone) {
|
||||
m.partition().apply(tombstone(ts, gc_clock::now()));
|
||||
} else {
|
||||
// a DeleteItem operation:
|
||||
m.partition().clustered_row(*schema, _ck).apply(tombstone(ts, gc_clock::now()));
|
||||
}
|
||||
return m;
|
||||
}
|
||||
// else, a PutItem operation:
|
||||
auto& row = m.partition().clustered_row(*schema, _ck);
|
||||
attribute_collector attrs_collector;
|
||||
for (auto& c : *_cells) {
|
||||
const column_definition* cdef = schema->get_column_definition(c.column_name);
|
||||
if (!cdef) {
|
||||
attrs_collector.put(std::move(c.column_name), std::move(c.value), ts);
|
||||
attrs_collector.put(c.column_name, c.value, ts);
|
||||
} else {
|
||||
row.cells().apply(*cdef, atomic_cell::make_live(*cdef->type, ts, std::move(c.value)));
|
||||
}
|
||||
@@ -1048,7 +1053,11 @@ mutation put_or_delete_item::build(schema_ptr schema, api::timestamp_type ts) {
|
||||
// Scylla proper, to implement the operation to replace an entire
|
||||
// collection ("UPDATE .. SET x = ..") - see
|
||||
// cql3::update_parameters::make_tombstone_just_before().
|
||||
row.apply(tombstone(ts-1, gc_clock::now()));
|
||||
if (use_partition_tombstone) {
|
||||
m.partition().apply(tombstone(ts-1, gc_clock::now()));
|
||||
} else {
|
||||
row.apply(tombstone(ts-1, gc_clock::now()));
|
||||
}
|
||||
return m;
|
||||
}
|
||||
|
||||
@@ -1202,11 +1211,6 @@ std::optional<shard_id> rmw_operation::shard_for_execute(bool needs_read_before_
|
||||
// PutItem, DeleteItem). All these return nothing by default, but can
|
||||
// optionally return Attributes if requested via the ReturnValues option.
|
||||
static future<executor::request_return_type> rmw_operation_return(rjson::value&& attributes) {
|
||||
// As an optimization, in the simple and common case that nothing is to be
|
||||
// returned, quickly return an empty result:
|
||||
if (attributes.IsNull()) {
|
||||
return make_ready_future<executor::request_return_type>(json_string(""));
|
||||
}
|
||||
rjson::value ret = rjson::empty_object();
|
||||
if (!attributes.IsNull()) {
|
||||
rjson::set(ret, "Attributes", std::move(attributes));
|
||||
@@ -1331,7 +1335,7 @@ public:
|
||||
check_needs_read_before_write(_condition_expression) ||
|
||||
_returnvalues == returnvalues::ALL_OLD;
|
||||
}
|
||||
virtual std::optional<mutation> apply(std::unique_ptr<rjson::value> previous_item, api::timestamp_type ts) override {
|
||||
virtual std::optional<mutation> apply(std::unique_ptr<rjson::value> previous_item, api::timestamp_type ts) const override {
|
||||
std::unordered_set<std::string> used_attribute_values;
|
||||
std::unordered_set<std::string> used_attribute_names;
|
||||
if (!verify_expected(_request, previous_item) ||
|
||||
@@ -1343,6 +1347,7 @@ public:
|
||||
// efficient than throwing an exception.
|
||||
return {};
|
||||
}
|
||||
_return_attributes = {};
|
||||
if (_returnvalues == returnvalues::ALL_OLD && previous_item) {
|
||||
// previous_item is supposed to have been created with
|
||||
// describe_item(), so has the "Item" attribute:
|
||||
@@ -1409,7 +1414,7 @@ public:
|
||||
check_needs_read_before_write(_condition_expression) ||
|
||||
_returnvalues == returnvalues::ALL_OLD;
|
||||
}
|
||||
virtual std::optional<mutation> apply(std::unique_ptr<rjson::value> previous_item, api::timestamp_type ts) override {
|
||||
virtual std::optional<mutation> apply(std::unique_ptr<rjson::value> previous_item, api::timestamp_type ts) const override {
|
||||
std::unordered_set<std::string> used_attribute_values;
|
||||
std::unordered_set<std::string> used_attribute_names;
|
||||
if (!verify_expected(_request, previous_item) ||
|
||||
@@ -1421,6 +1426,7 @@ public:
|
||||
// efficient than throwing an exception.
|
||||
return {};
|
||||
}
|
||||
_return_attributes = {};
|
||||
if (_returnvalues == returnvalues::ALL_OLD && previous_item) {
|
||||
rjson::value* item = rjson::find(*previous_item, "Item");
|
||||
if (item) {
|
||||
@@ -1504,7 +1510,7 @@ public:
|
||||
virtual ~put_or_delete_item_cas_request() = default;
|
||||
virtual std::optional<mutation> apply(query::result& qr, const query::partition_slice& slice, api::timestamp_type ts) override {
|
||||
std::optional<mutation> ret;
|
||||
for (put_or_delete_item& mutation_builder : _mutation_builders) {
|
||||
for (const put_or_delete_item& mutation_builder : _mutation_builders) {
|
||||
// We assume all these builders have the same partition.
|
||||
if (ret) {
|
||||
ret->apply(mutation_builder.build(schema, ts));
|
||||
@@ -2329,7 +2335,7 @@ public:
|
||||
|
||||
update_item_operation(service::storage_proxy& proxy, rjson::value&& request);
|
||||
virtual ~update_item_operation() = default;
|
||||
virtual std::optional<mutation> apply(std::unique_ptr<rjson::value> previous_item, api::timestamp_type ts) override;
|
||||
virtual std::optional<mutation> apply(std::unique_ptr<rjson::value> previous_item, api::timestamp_type ts) const override;
|
||||
bool needs_read_before_write() const;
|
||||
};
|
||||
|
||||
@@ -2393,7 +2399,7 @@ update_item_operation::needs_read_before_write() const {
|
||||
}
|
||||
|
||||
std::optional<mutation>
|
||||
update_item_operation::apply(std::unique_ptr<rjson::value> previous_item, api::timestamp_type ts) {
|
||||
update_item_operation::apply(std::unique_ptr<rjson::value> previous_item, api::timestamp_type ts) const {
|
||||
std::unordered_set<std::string> used_attribute_values;
|
||||
std::unordered_set<std::string> used_attribute_names;
|
||||
if (!verify_expected(_request, previous_item) ||
|
||||
@@ -2773,6 +2779,7 @@ future<executor::request_return_type> executor::batch_get_item(client_state& cli
|
||||
[] (std::vector<std::tuple<std::string, std::optional<rjson::value>>> responses) {
|
||||
rjson::value response = rjson::empty_object();
|
||||
rjson::set(response, "Responses", rjson::empty_object());
|
||||
rjson::set(response, "UnprocessedKeys", rjson::empty_object());
|
||||
for (auto& t : responses) {
|
||||
if (!response["Responses"].HasMember(std::get<0>(t).c_str())) {
|
||||
rjson::set_with_string_name(response["Responses"], std::get<0>(t), rjson::empty_array());
|
||||
@@ -2889,6 +2896,7 @@ static future<executor::request_return_type> do_query(schema_ptr schema,
|
||||
uint32_t limit,
|
||||
db::consistency_level cl,
|
||||
::shared_ptr<cql3::restrictions::statement_restrictions> filtering_restrictions,
|
||||
query::partition_slice::option_set custom_opts,
|
||||
service::client_state& client_state,
|
||||
cql3::cql_stats& cql_stats,
|
||||
tracing::trace_state_ptr trace_state,
|
||||
@@ -2909,7 +2917,9 @@ static future<executor::request_return_type> do_query(schema_ptr schema,
|
||||
auto regular_columns = boost::copy_range<query::column_id_vector>(
|
||||
schema->regular_columns() | boost::adaptors::transformed([] (const column_definition& cdef) { return cdef.id; }));
|
||||
auto selection = cql3::selection::selection::wildcard(schema);
|
||||
auto partition_slice = query::partition_slice(std::move(ck_bounds), {}, std::move(regular_columns), selection->get_query_options());
|
||||
query::partition_slice::option_set opts = selection->get_query_options();
|
||||
opts.add(custom_opts);
|
||||
auto partition_slice = query::partition_slice(std::move(ck_bounds), {}, std::move(regular_columns), opts);
|
||||
auto command = ::make_lw_shared<query::read_command>(schema->id(), schema->version(), partition_slice, query::max_partitions);
|
||||
|
||||
auto query_state_ptr = std::make_unique<service::query_state>(client_state, trace_state, std::move(permit));
|
||||
@@ -2939,11 +2949,38 @@ static future<executor::request_return_type> do_query(schema_ptr schema,
|
||||
});
|
||||
}
|
||||
|
||||
// Computes the token at which segment number `segment` begins when the
// token space is cut into `total_segments` equally sized pieces.
//
// Preconditions (enforced by the assert below): total_segments > 1 and
// 0 <= segment < total_segments.
//
// The width of one piece is max(uint64_t) / total_segments (integer
// division), and the result is min(int64_t) advanced by `segment` such
// widths.
static dht::token token_for_segment(int segment, int total_segments) {
|
||||
assert(total_segments > 1 && segment >= 0 && segment < total_segments);
|
||||
uint64_t delta = std::numeric_limits<uint64_t>::max() / total_segments;
|
||||
// The sum mixes int64_t and uint64_t operands, so by the usual arithmetic
// conversions it is evaluated as unsigned (modular) arithmetic and cannot
// trigger signed overflow.
// NOTE(review): presumably from_int64() takes an int64_t, so the unsigned
// result is converted back to signed at the call - confirm against dht::token.
return dht::token::from_int64(std::numeric_limits<int64_t>::min() + delta * segment);
|
||||
}
|
||||
|
||||
// Builds the partition range covered by segment number `segment` out of
// `total_segments`, using token_for_segment() for the segment boundaries:
//  - a single segment maps to a range open on both sides;
//  - segment 0 has no lower bound and ends at the start token of segment 1;
//  - the last segment starts at its own start token and has no upper bound;
//  - any other segment runs from its own start token to the start token of
//    the following segment.
// NOTE(review): the `false` passed as the second bound argument presumably
// marks the upper bound as exclusive, so adjacent segments do not overlap -
// confirm against dht::partition_range::bound.
static dht::partition_range get_range_for_segment(int segment, int total_segments) {
|
||||
// Handled first: token_for_segment() asserts total_segments > 1, so it
// must not be called for the single-segment case.
if (total_segments == 1) {
|
||||
return dht::partition_range::make_open_ended_both_sides();
|
||||
}
|
||||
if (segment == 0) {
|
||||
// First segment: only an upper bound, taken from the start of segment 1.
dht::token ending_token = token_for_segment(1, total_segments);
|
||||
return dht::partition_range::make_ending_with(
|
||||
dht::partition_range::bound(dht::ring_position::ending_at(ending_token), false));
|
||||
} else if (segment == total_segments - 1) {
|
||||
// Last segment: only a lower bound.
dht::token starting_token = token_for_segment(segment, total_segments);
|
||||
return dht::partition_range::make_starting_with(
|
||||
dht::partition_range::bound(dht::ring_position::starting_at(starting_token)));
|
||||
} else {
|
||||
// Middle segment: bounded on both sides.
dht::token starting_token = token_for_segment(segment, total_segments);
|
||||
dht::token ending_token = token_for_segment(segment + 1, total_segments);
|
||||
return dht::partition_range::make(
|
||||
dht::partition_range::bound(dht::ring_position::starting_at(starting_token)),
|
||||
dht::partition_range::bound(dht::ring_position::ending_at(ending_token), false)
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
// TODO(sarna):
|
||||
// 1. Paging must have 1MB boundary according to the docs. IIRC we do have a replica-side reply size limit though - verify.
|
||||
// 2. Filtering - by passing appropriately created restrictions to pager as a last parameter
|
||||
// 3. Proper timeouts instead of gc_clock::now() and db::no_timeout
|
||||
// 4. Implement parallel scanning via Segments
|
||||
future<executor::request_return_type> executor::scan(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request) {
|
||||
_stats.api_operations.scan++;
|
||||
elogger.trace("Scanning {}", request);
|
||||
@@ -2954,10 +2991,21 @@ future<executor::request_return_type> executor::scan(client_state& client_state,
|
||||
return make_ready_future<request_return_type>(api_error("ValidationException",
|
||||
"FilterExpression is not yet implemented in alternator"));
|
||||
}
|
||||
if (get_int_attribute(request, "Segment") || get_int_attribute(request, "TotalSegments")) {
|
||||
// FIXME: need to support parallel scan. See issue #5059.
|
||||
return make_ready_future<request_return_type>(api_error("ValidationException",
|
||||
"Scan Segment/TotalSegments is not yet implemented in alternator"));
|
||||
auto segment = get_int_attribute(request, "Segment");
|
||||
auto total_segments = get_int_attribute(request, "TotalSegments");
|
||||
if (segment || total_segments) {
|
||||
if (!segment || !total_segments) {
|
||||
return make_ready_future<request_return_type>(api_error("ValidationException",
|
||||
"Both Segment and TotalSegments attributes need to be present for a parallel scan"));
|
||||
}
|
||||
if (*segment < 0 || *segment >= *total_segments) {
|
||||
return make_ready_future<request_return_type>(api_error("ValidationException",
|
||||
"Segment must be non-negative and less than TotalSegments"));
|
||||
}
|
||||
if (*total_segments < 0 || *total_segments > 1000000) {
|
||||
return make_ready_future<request_return_type>(api_error("ValidationException",
|
||||
"TotalSegments must be non-negative and less or equal to 1000000"));
|
||||
}
|
||||
}
|
||||
|
||||
rjson::value* exclusive_start_key = rjson::find(request, "ExclusiveStartKey");
|
||||
@@ -2976,7 +3024,12 @@ future<executor::request_return_type> executor::scan(client_state& client_state,
|
||||
|
||||
auto attrs_to_get = calculate_attrs_to_get(request);
|
||||
|
||||
dht::partition_range_vector partition_ranges{dht::partition_range::make_open_ended_both_sides()};
|
||||
dht::partition_range_vector partition_ranges;
|
||||
if (segment) {
|
||||
partition_ranges.push_back(get_range_for_segment(*segment, *total_segments));
|
||||
} else {
|
||||
partition_ranges.push_back(dht::partition_range::make_open_ended_both_sides());
|
||||
}
|
||||
std::vector<query::clustering_range> ck_bounds{query::clustering_range::make_open_ended_both_sides()};
|
||||
|
||||
::shared_ptr<cql3::restrictions::statement_restrictions> filtering_restrictions;
|
||||
@@ -2986,14 +3039,15 @@ future<executor::request_return_type> executor::scan(client_state& client_state,
|
||||
partition_ranges = filtering_restrictions->get_partition_key_ranges(query_options);
|
||||
ck_bounds = filtering_restrictions->get_clustering_bounds(query_options);
|
||||
}
|
||||
return do_query(schema, exclusive_start_key, std::move(partition_ranges), std::move(ck_bounds), std::move(attrs_to_get), limit, cl, std::move(filtering_restrictions), client_state, _stats.cql_stats, trace_state, std::move(permit));
|
||||
return do_query(schema, exclusive_start_key, std::move(partition_ranges), std::move(ck_bounds), std::move(attrs_to_get), limit, cl,
|
||||
std::move(filtering_restrictions), query::partition_slice::option_set(), client_state, _stats.cql_stats, trace_state, std::move(permit));
|
||||
}
|
||||
|
||||
static dht::partition_range calculate_pk_bound(schema_ptr schema, const column_definition& pk_cdef, comparison_operator_type op, const rjson::value& attrs) {
|
||||
if (attrs.Size() != 1) {
|
||||
throw api_error("ValidationException", format("Only a single attribute is allowed for a hash key restriction: {}", attrs));
|
||||
}
|
||||
bytes raw_value = pk_cdef.type->from_string(attrs[0][type_to_string(pk_cdef.type)].GetString());
|
||||
bytes raw_value = get_key_from_typed_value(attrs[0], pk_cdef);
|
||||
partition_key pk = partition_key::from_singular(*schema, pk_cdef.type->deserialize(raw_value));
|
||||
auto decorated_key = dht::decorate_key(*schema, pk);
|
||||
if (op != comparison_operator_type::EQ) {
|
||||
@@ -3018,7 +3072,7 @@ static query::clustering_range calculate_ck_bound(schema_ptr schema, const colum
|
||||
if (attrs.Size() != expected_attrs_size) {
|
||||
throw api_error("ValidationException", format("{} arguments expected for a sort key restriction: {}", expected_attrs_size, attrs));
|
||||
}
|
||||
bytes raw_value = ck_cdef.type->from_string(attrs[0][type_to_string(ck_cdef.type)].GetString());
|
||||
bytes raw_value = get_key_from_typed_value(attrs[0], ck_cdef);
|
||||
clustering_key ck = clustering_key::from_single_value(*schema, raw_value);
|
||||
switch (op) {
|
||||
case comparison_operator_type::EQ:
|
||||
@@ -3032,7 +3086,7 @@ static query::clustering_range calculate_ck_bound(schema_ptr schema, const colum
|
||||
case comparison_operator_type::GT:
|
||||
return query::clustering_range::make_starting_with(query::clustering_range::bound(ck, false));
|
||||
case comparison_operator_type::BETWEEN: {
|
||||
bytes raw_upper_limit = ck_cdef.type->from_string(attrs[1][type_to_string(ck_cdef.type)].GetString());
|
||||
bytes raw_upper_limit = get_key_from_typed_value(attrs[1], ck_cdef);
|
||||
clustering_key upper_limit = clustering_key::from_single_value(*schema, raw_upper_limit);
|
||||
return query::clustering_range::make(query::clustering_range::bound(ck), query::clustering_range::bound(upper_limit));
|
||||
}
|
||||
@@ -3045,9 +3099,7 @@ static query::clustering_range calculate_ck_bound(schema_ptr schema, const colum
|
||||
if (!ck_cdef.type->is_compatible_with(*utf8_type)) {
|
||||
throw api_error("ValidationException", format("BEGINS_WITH operator cannot be applied to type {}", type_to_string(ck_cdef.type)));
|
||||
}
|
||||
std::string raw_upper_limit_str = attrs[0][type_to_string(ck_cdef.type)].GetString();
|
||||
bytes raw_upper_limit = ck_cdef.type->from_string(raw_upper_limit_str);
|
||||
return get_clustering_range_for_begins_with(std::move(raw_upper_limit), ck, schema, ck_cdef.type);
|
||||
return get_clustering_range_for_begins_with(std::move(raw_value), ck, schema, ck_cdef.type);
|
||||
}
|
||||
default:
|
||||
throw api_error("ValidationException", format("Unknown primary key bound passed: {}", int(op)));
|
||||
@@ -3429,11 +3481,7 @@ future<executor::request_return_type> executor::query(client_state& client_state
|
||||
if (rjson::find(request, "FilterExpression")) {
|
||||
return make_ready_future<request_return_type>(api_error("ValidationException", "FilterExpression is not yet implemented in alternator"));
|
||||
}
|
||||
bool forward = get_bool_attribute(request, "ScanIndexForward", true);
|
||||
if (!forward) {
|
||||
// FIXME: need to support the !forward (i.e., reverse sort order) case. See issue #5153.
|
||||
return make_ready_future<request_return_type>(api_error("ValidationException", "ScanIndexForward=false is not yet implemented in alternator"));
|
||||
}
|
||||
const bool forward = get_bool_attribute(request, "ScanIndexForward", true);
|
||||
|
||||
rjson::value* key_conditions = rjson::find(request, "KeyConditions");
|
||||
rjson::value* key_condition_expression = rjson::find(request, "KeyConditionExpression");
|
||||
@@ -3476,7 +3524,10 @@ future<executor::request_return_type> executor::query(client_state& client_state
|
||||
}
|
||||
verify_all_are_used(request, "ExpressionAttributeValues", used_attribute_values, "KeyConditionExpression");
|
||||
verify_all_are_used(request, "ExpressionAttributeNames", used_attribute_names, "KeyConditionExpression");
|
||||
return do_query(schema, exclusive_start_key, std::move(partition_ranges), std::move(ck_bounds), std::move(attrs_to_get), limit, cl, std::move(filtering_restrictions), client_state, _stats.cql_stats, std::move(trace_state), std::move(permit));
|
||||
query::partition_slice::option_set opts;
|
||||
opts.set_if<query::partition_slice::option::reversed>(!forward);
|
||||
return do_query(schema, exclusive_start_key, std::move(partition_ranges), std::move(ck_bounds), std::move(attrs_to_get), limit, cl,
|
||||
std::move(filtering_restrictions), opts, client_state, _stats.cql_stats, std::move(trace_state), std::move(permit));
|
||||
}
|
||||
|
||||
future<executor::request_return_type> executor::list_tables(client_state& client_state, service_permit permit, rjson::value request) {
|
||||
@@ -3567,12 +3618,12 @@ static std::map<sstring, sstring> get_network_topology_options(int rf) {
|
||||
// manually create the keyspace to override this predefined behavior.
|
||||
future<> executor::create_keyspace(std::string_view keyspace_name) {
|
||||
sstring keyspace_name_str(keyspace_name);
|
||||
return gms::get_up_endpoint_count().then([this, keyspace_name_str = std::move(keyspace_name_str)] (int up_endpoint_count) {
|
||||
return gms::get_all_endpoint_count().then([this, keyspace_name_str = std::move(keyspace_name_str)] (int endpoint_count) {
|
||||
int rf = 3;
|
||||
if (up_endpoint_count < rf) {
|
||||
if (endpoint_count < rf) {
|
||||
rf = 1;
|
||||
elogger.warn("Creating keyspace '{}' for Alternator with unsafe RF={} because cluster only has {} live nodes.",
|
||||
keyspace_name_str, rf, up_endpoint_count);
|
||||
elogger.warn("Creating keyspace '{}' for Alternator with unsafe RF={} because cluster only has {} nodes.",
|
||||
keyspace_name_str, rf, endpoint_count);
|
||||
}
|
||||
auto opts = get_network_topology_options(rf);
|
||||
auto ksm = keyspace_metadata::new_keyspace(keyspace_name_str, "org.apache.cassandra.locator.NetworkTopologyStrategy", std::move(opts), true);
|
||||
|
||||
@@ -83,7 +83,11 @@ protected:
|
||||
// When _returnvalues != NONE, apply() should store here, in JSON form,
|
||||
// the values which are to be returned in the "Attributes" field.
|
||||
// The default null JSON means do not return an Attributes field at all.
|
||||
rjson::value _return_attributes;
|
||||
// This field is marked "mutable" so that the const apply() can modify
|
||||
// it (see explanation below), but note that because apply() may be
|
||||
// called more than once, if apply() will sometimes set this field it
|
||||
// must set it (even if just to the default empty value) every time.
|
||||
mutable rjson::value _return_attributes;
|
||||
public:
|
||||
// The constructor of a rmw_operation subclass should parse the request
|
||||
// and try to discover as many input errors as it can before really
|
||||
@@ -96,7 +100,12 @@ public:
|
||||
// conditional expression, apply() should return an empty optional.
|
||||
// apply() may throw if it encounters input errors not discovered during
|
||||
// the constructor.
|
||||
virtual std::optional<mutation> apply(std::unique_ptr<rjson::value> previous_item, api::timestamp_type ts) = 0;
|
||||
// apply() may be called more than once in case of contention, so it must
|
||||
// not change the state saved in the object (issue #7218 was caused by
|
||||
// violating this). We mark apply() "const" to let the compiler validate
|
||||
// this for us. The output-only field _return_attributes is marked
|
||||
// "mutable" above so that apply() can still write to it.
|
||||
virtual std::optional<mutation> apply(std::unique_ptr<rjson::value> previous_item, api::timestamp_type ts) const = 0;
|
||||
// Convert the above apply() into the signature needed by cas_request:
|
||||
virtual std::optional<mutation> apply(query::result& qr, const query::partition_slice& slice, api::timestamp_type ts) override;
|
||||
virtual ~rmw_operation() = default;
|
||||
|
||||
@@ -54,26 +54,22 @@ static sstring validate_keyspace(http_context& ctx, const parameters& param) {
|
||||
throw bad_param_exception("Keyspace " + param["keyspace"] + " Does not exist");
|
||||
}
|
||||
|
||||
static std::vector<ss::token_range> describe_ring(const sstring& keyspace) {
|
||||
std::vector<ss::token_range> res;
|
||||
for (auto d : service::get_local_storage_service().describe_ring(keyspace)) {
|
||||
ss::token_range r;
|
||||
r.start_token = d._start_token;
|
||||
r.end_token = d._end_token;
|
||||
r.endpoints = d._endpoints;
|
||||
r.rpc_endpoints = d._rpc_endpoints;
|
||||
for (auto det : d._endpoint_details) {
|
||||
ss::endpoint_detail ed;
|
||||
ed.host = det._host;
|
||||
ed.datacenter = det._datacenter;
|
||||
if (det._rack != "") {
|
||||
ed.rack = det._rack;
|
||||
}
|
||||
r.endpoint_details.push(ed);
|
||||
static ss::token_range token_range_endpoints_to_json(const dht::token_range_endpoints& d) {
|
||||
ss::token_range r;
|
||||
r.start_token = d._start_token;
|
||||
r.end_token = d._end_token;
|
||||
r.endpoints = d._endpoints;
|
||||
r.rpc_endpoints = d._rpc_endpoints;
|
||||
for (auto det : d._endpoint_details) {
|
||||
ss::endpoint_detail ed;
|
||||
ed.host = det._host;
|
||||
ed.datacenter = det._datacenter;
|
||||
if (det._rack != "") {
|
||||
ed.rack = det._rack;
|
||||
}
|
||||
res.push_back(r);
|
||||
r.endpoint_details.push(ed);
|
||||
}
|
||||
return res;
|
||||
return r;
|
||||
}
|
||||
|
||||
using ks_cf_func = std::function<future<json::json_return_type>(http_context&, std::unique_ptr<request>, sstring, std::vector<sstring>)>;
|
||||
@@ -175,13 +171,13 @@ void set_storage_service(http_context& ctx, routes& r) {
|
||||
return make_ready_future<json::json_return_type>(res);
|
||||
});
|
||||
|
||||
ss::describe_any_ring.set(r, [&ctx](const_req req) {
|
||||
return describe_ring("");
|
||||
ss::describe_any_ring.set(r, [&ctx](std::unique_ptr<request> req) {
|
||||
return make_ready_future<json::json_return_type>(stream_range_as_array(service::get_local_storage_service().describe_ring(""), token_range_endpoints_to_json));
|
||||
});
|
||||
|
||||
ss::describe_ring.set(r, [&ctx](const_req req) {
|
||||
auto keyspace = validate_keyspace(ctx, req.param);
|
||||
return describe_ring(keyspace);
|
||||
ss::describe_ring.set(r, [&ctx](std::unique_ptr<request> req) {
|
||||
auto keyspace = validate_keyspace(ctx, req->param);
|
||||
return make_ready_future<json::json_return_type>(stream_range_as_array(service::get_local_storage_service().describe_ring(keyspace), token_range_endpoints_to_json));
|
||||
});
|
||||
|
||||
ss::get_host_id_map.set(r, [&ctx](const_req req) {
|
||||
@@ -256,8 +252,8 @@ void set_storage_service(http_context& ctx, routes& r) {
|
||||
for (auto cf : column_families) {
|
||||
column_families_vec.push_back(&db.find_column_family(keyspace, cf));
|
||||
}
|
||||
return parallel_for_each(column_families_vec, [&cm] (column_family* cf) {
|
||||
return cm.perform_cleanup(cf);
|
||||
return parallel_for_each(column_families_vec, [&cm, &db] (column_family* cf) {
|
||||
return cm.perform_cleanup(db, cf);
|
||||
});
|
||||
}).then([]{
|
||||
return make_ready_future<json::json_return_type>(0);
|
||||
@@ -1000,6 +996,9 @@ void set_snapshot(http_context& ctx, routes& r) {
|
||||
if (column_family.empty()) {
|
||||
resp = service::get_local_storage_service().take_snapshot(tag, keynames);
|
||||
} else {
|
||||
if (keynames.empty()) {
|
||||
throw httpd::bad_param_exception("The keyspace of column families must be specified");
|
||||
}
|
||||
if (keynames.size() > 1) {
|
||||
throw httpd::bad_param_exception("Only one keyspace allowed when specifying a column family");
|
||||
}
|
||||
|
||||
@@ -33,6 +33,7 @@
|
||||
|
||||
#include "auth/resource.hh"
|
||||
#include "seastarx.hh"
|
||||
#include "exceptions/exceptions.hh"
|
||||
|
||||
namespace auth {
|
||||
|
||||
@@ -52,9 +53,9 @@ struct role_config_update final {
|
||||
///
|
||||
/// A logical argument error for a role-management operation.
|
||||
///
|
||||
class roles_argument_exception : public std::invalid_argument {
|
||||
class roles_argument_exception : public exceptions::invalid_request_exception {
|
||||
public:
|
||||
using std::invalid_argument::invalid_argument;
|
||||
using exceptions::invalid_request_exception::invalid_request_exception;
|
||||
};
|
||||
|
||||
class role_already_exists : public roles_argument_exception {
|
||||
|
||||
@@ -1135,7 +1135,7 @@ public:
|
||||
if (r.row().deleted_at()) {
|
||||
touched_parts.set<stats::part_type::ROW_DELETE>();
|
||||
cdc_op = operation::row_delete;
|
||||
if (pirow) {
|
||||
if (pirow && pikey) {
|
||||
for (const column_definition& column: _schema->regular_columns()) {
|
||||
assert(pirow->has(column.name_as_text()));
|
||||
auto& cdef = *_log_schema->get_column_definition(log_data_column_name_bytes(column.name()));
|
||||
|
||||
@@ -30,10 +30,12 @@ std::atomic<int64_t> clocks_offset;
|
||||
|
||||
std::ostream& operator<<(std::ostream& os, db_clock::time_point tp) {
|
||||
auto t = db_clock::to_time_t(tp);
|
||||
return os << std::put_time(std::gmtime(&t), "%Y/%m/%d %T");
|
||||
::tm t_buf;
|
||||
return os << std::put_time(::gmtime_r(&t, &t_buf), "%Y/%m/%d %T");
|
||||
}
|
||||
|
||||
// Formats an api timestamp as a UTC calendar time, "YYYY/MM/DD HH:MM:SS".
// The timestamp is first truncated to whole seconds.
std::string format_timestamp(api::timestamp_type ts) {
    auto t = std::time_t(std::chrono::duration_cast<std::chrono::seconds>(api::timestamp_clock::duration(ts)).count());
    // gmtime_r() fills a caller-owned buffer; std::gmtime() returns a pointer
    // to shared storage that a concurrent caller may overwrite.
    ::tm t_buf;
    return format("{}", std::put_time(::gmtime_r(&t, &t_buf), "%Y/%m/%d %T"));
}
|
||||
|
||||
59
configure.py
59
configure.py
@@ -381,6 +381,7 @@ scylla_tests = set([
|
||||
'test/boost/view_schema_ckey_test',
|
||||
'test/boost/vint_serialization_test',
|
||||
'test/boost/virtual_reader_test',
|
||||
'test/boost/stall_free_test',
|
||||
'test/manual/ec2_snitch_test',
|
||||
'test/manual/gce_snitch_test',
|
||||
'test/manual/gossip',
|
||||
@@ -1265,9 +1266,9 @@ def query_seastar_flags(pc_file, link_static_cxx=False):
|
||||
return cflags, libs
|
||||
|
||||
# Query pkg-config once per build mode and record the seastar compile and link
# flags in that mode's settings.
# The loop variables deliberately carry a `_pc_` infix: configure_abseil()
# reads a module-level name `seastar_cflags`, and binding the per-mode result
# to that same name here would overwrite it with the last mode's flags.
for mode in build_modes:
    seastar_pc_cflags, seastar_pc_libs = query_seastar_flags(pc[mode], link_static_cxx=args.staticcxx)
    modes[mode]['seastar_cflags'] = seastar_pc_cflags
    modes[mode]['seastar_libs'] = seastar_pc_libs
|
||||
|
||||
# We need to use experimental features of the zstd library (to use our own allocators for the (de)compression context),
|
||||
# which are available only when the library is linked statically.
|
||||
@@ -1288,6 +1289,46 @@ def configure_zstd(build_dir, mode):
|
||||
os.makedirs(zstd_build_dir, exist_ok=True)
|
||||
subprocess.check_call(zstd_cmd, shell=False, cwd=zstd_build_dir)
|
||||
|
||||
def configure_abseil(build_dir, mode):
    """Run the CMake configure step for the bundled abseil tree.

    The Ninja build files are generated into ``<build_dir>/<mode>/abseil``
    (created if missing) by invoking ``cmake`` from inside that directory.

    build_dir -- root of the build output tree.
    mode      -- build mode name; used to index ``modes`` and
                 ``MODE_TO_CMAKE_BUILD_TYPE``.

    Raises subprocess.CalledProcessError if cmake exits with a non-zero status.
    """
    out_dir = os.path.join(build_dir, mode, 'abseil')

    cxx_flags = seastar_cflags + ' ' + modes[mode]['cxx_ld_flags']
    build_type = MODE_TO_CMAKE_BUILD_TYPE[mode]
    # (name, value) pairs, each rendered below as a -D<name>=<value> argument.
    defines = [
        ('CMAKE_BUILD_TYPE', build_type),
        ('CMAKE_INSTALL_PREFIX', build_dir + '/inst'),  # just to avoid a warning from absl
        ('CMAKE_C_COMPILER', args.cc),
        ('CMAKE_CXX_COMPILER', args.cxx),
        ('CMAKE_CXX_FLAGS_' + build_type.upper(), cxx_flags),
    ]

    # The source directory is given relative to out_dir because cmake is run
    # with out_dir as its working directory.
    command = ['cmake', '-G', 'Ninja', os.path.relpath('abseil', out_dir)]
    command += ['-D{}={}'.format(key, value) for key, value in defines]

    os.makedirs(out_dir, exist_ok=True)
    subprocess.check_call(command, shell=False, cwd=out_dir)
|
||||
|
||||
# Static abseil archives that are built and linked, as paths relative to the
# abseil build directory. They are grouped by abseil component; the resulting
# list keeps the components, and the libraries within each, in the order
# written here.
abseil_libs = [
    'absl/{}/libabsl_{}.a'.format(component, library)
    for component, libraries in [
        ('container', ['hashtablez_sampler', 'raw_hash_set']),
        ('synchronization', ['synchronization', 'graphcycles_internal']),
        ('debugging', ['stacktrace', 'symbolize', 'debugging_internal', 'demangle_internal']),
        ('time', ['time', 'time_zone']),
        ('numeric', ['int128']),
        ('hash', ['city', 'hash']),
        ('base', ['malloc_internal', 'spinlock_wait', 'base', 'dynamic_annotations',
                  'raw_logging_internal', 'exponential_biased', 'throw_delegate']),
    ]
    for library in libraries
]
|
||||
|
||||
args.user_cflags += " " + pkg_config('jsoncpp', '--cflags')
|
||||
args.user_cflags += ' -march=' + args.target
|
||||
libs = ' '.join([maybe_static(args.staticyamlcpp, '-lyaml-cpp'), '-latomic', '-llz4', '-lz', '-lsnappy', pkg_config('jsoncpp', '--libs'),
|
||||
@@ -1316,6 +1357,7 @@ if any(filter(thrift_version.startswith, thrift_boost_versions)):
|
||||
for pkg in pkgs:
|
||||
args.user_cflags += ' ' + pkg_config(pkg, '--cflags')
|
||||
libs += ' ' + pkg_config(pkg, '--libs')
|
||||
# Add the bundled abseil tree to the include path. The leading space keeps
# '-I' from fusing with whatever flag currently ends user_cflags.
args.user_cflags += ' -I abseil'
|
||||
user_cflags = args.user_cflags + ' -fvisibility=hidden'
|
||||
user_ldflags = args.user_ldflags + ' -fvisibility=hidden'
|
||||
if args.staticcxx:
|
||||
@@ -1346,6 +1388,9 @@ else:
|
||||
for mode in build_modes:
|
||||
configure_zstd(outdir, mode)
|
||||
|
||||
for mode in build_modes:
|
||||
configure_abseil(outdir, mode)
|
||||
|
||||
# configure.py may run automatically from an already-existing build.ninja.
|
||||
# If the user interrupts configure.py in the middle, we need build.ninja
|
||||
# to remain in a valid state. So we write our output to a temporary
|
||||
@@ -1480,6 +1525,8 @@ with open(buildfile_tmp, 'w') as f:
|
||||
objs.extend(['$builddir/' + mode + '/' + artifact for artifact in [
|
||||
'libdeflate/libdeflate.a',
|
||||
'zstd/lib/libzstd.a',
|
||||
] + [
|
||||
'abseil/' + x for x in abseil_libs
|
||||
]])
|
||||
objs.append('$builddir/' + mode + '/gen/utils/gz/crc_combine_table.o')
|
||||
if binary in tests:
|
||||
@@ -1621,6 +1668,12 @@ with open(buildfile_tmp, 'w') as f:
|
||||
f.write(' subdir = build/{mode}/zstd\n'.format(**locals()))
|
||||
f.write(' target = libzstd.a\n'.format(**locals()))
|
||||
|
||||
for lib in abseil_libs:
|
||||
f.write('build build/{mode}/abseil/{lib}: ninja\n'.format(**locals()))
|
||||
f.write(' pool = submodule_pool\n')
|
||||
f.write(' subdir = build/{mode}/abseil\n'.format(**locals()))
|
||||
f.write(' target = {lib}\n'.format(**locals()))
|
||||
|
||||
mode = 'dev' if 'dev' in modes else modes[0]
|
||||
f.write('build checkheaders: phony || {}\n'.format(' '.join(['$builddir/{}/{}.o'.format(mode, hh) for hh in headers])))
|
||||
|
||||
|
||||
@@ -267,10 +267,13 @@ public:
|
||||
}
|
||||
};
|
||||
|
||||
/// The same as `impl_max_function_for' but without knowledge of `Type'.
|
||||
/// The same as `impl_max_function_for' but without compile-time dependency on `Type'.
|
||||
class impl_max_dynamic_function final : public aggregate_function::aggregate {
|
||||
data_type _io_type;
|
||||
opt_bytes _max;
|
||||
public:
|
||||
impl_max_dynamic_function(data_type io_type) : _io_type(std::move(io_type)) {}
|
||||
|
||||
virtual void reset() override {
|
||||
_max = {};
|
||||
}
|
||||
@@ -278,12 +281,11 @@ public:
|
||||
return _max.value_or(bytes{});
|
||||
}
|
||||
// Feeds one row into the running maximum. Only values[0] is examined.
// A missing argument or a null value leaves the current maximum untouched.
virtual void add_input(cql_serialization_format sf, const std::vector<opt_bytes>& values) override {
    if (values.empty() || !values[0]) {
        return;
    }
    // Order the serialized values through the column type's own less()
    // rather than operator< on the raw bytes.
    // NOTE(review): presumably raw byte order disagrees with the type's
    // order for some types - confirm which ones motivated this.
    if (!_max || _io_type->less(*_max, *values[0])) {
        _max = values[0];
    }
}
|
||||
};
|
||||
@@ -298,10 +300,13 @@ public:
|
||||
};
|
||||
|
||||
class max_dynamic_function final : public native_aggregate_function {
|
||||
data_type _io_type;
|
||||
public:
|
||||
max_dynamic_function(data_type io_type) : native_aggregate_function("max", io_type, { io_type }) {}
|
||||
max_dynamic_function(data_type io_type)
|
||||
: native_aggregate_function("max", io_type, { io_type })
|
||||
, _io_type(std::move(io_type)) {}
|
||||
virtual std::unique_ptr<aggregate> new_aggregate() override {
|
||||
return std::make_unique<impl_max_dynamic_function>();
|
||||
return std::make_unique<impl_max_dynamic_function>(_io_type);
|
||||
}
|
||||
};
|
||||
|
||||
@@ -358,10 +363,13 @@ public:
|
||||
}
|
||||
};
|
||||
|
||||
/// The same as `impl_min_function_for' but without knowledge of `Type'.
|
||||
/// The same as `impl_min_function_for' but without compile-time dependency on `Type'.
|
||||
class impl_min_dynamic_function final : public aggregate_function::aggregate {
|
||||
data_type _io_type;
|
||||
opt_bytes _min;
|
||||
public:
|
||||
impl_min_dynamic_function(data_type io_type) : _io_type(std::move(io_type)) {}
|
||||
|
||||
virtual void reset() override {
|
||||
_min = {};
|
||||
}
|
||||
@@ -369,12 +377,11 @@ public:
|
||||
return _min.value_or(bytes{});
|
||||
}
|
||||
// Feeds one row into the running minimum. Only values[0] is examined.
// A missing argument or a null value leaves the current minimum untouched.
virtual void add_input(cql_serialization_format sf, const std::vector<opt_bytes>& values) override {
    if (values.empty() || !values[0]) {
        return;
    }
    // Order the serialized values through the column type's own less()
    // rather than operator< on the raw bytes.
    // NOTE(review): presumably raw byte order disagrees with the type's
    // order for some types - confirm which ones motivated this.
    if (!_min || _io_type->less(*values[0], *_min)) {
        _min = values[0];
    }
}
|
||||
};
|
||||
@@ -389,10 +396,13 @@ public:
|
||||
};
|
||||
|
||||
class min_dynamic_function final : public native_aggregate_function {
|
||||
data_type _io_type;
|
||||
public:
|
||||
min_dynamic_function(data_type io_type) : native_aggregate_function("min", io_type, { io_type }) {}
|
||||
min_dynamic_function(data_type io_type)
|
||||
: native_aggregate_function("min", io_type, { io_type })
|
||||
, _io_type(std::move(io_type)) {}
|
||||
virtual std::unique_ptr<aggregate> new_aggregate() override {
|
||||
return std::make_unique<impl_min_dynamic_function>();
|
||||
return std::make_unique<impl_min_dynamic_function>(_io_type);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -87,17 +87,14 @@ template<typename ToType>
|
||||
std::function<data_value(data_value)> make_castas_fctn_from_decimal_to_float() {
|
||||
return [](data_value from) -> data_value {
|
||||
auto val_from = value_cast<big_decimal>(from);
|
||||
boost::multiprecision::cpp_int ten(10);
|
||||
boost::multiprecision::cpp_rational r = val_from.unscaled_value();
|
||||
r /= boost::multiprecision::pow(ten, val_from.scale());
|
||||
return static_cast<ToType>(r);
|
||||
return static_cast<ToType>(val_from.as_rational());
|
||||
};
|
||||
}
|
||||
|
||||
// Converts a decimal data_value to an integer by dividing the numerator of
// its rational form by the denominator.
// NOTE(review): the quotient is presumably truncated by integer division -
// confirm the rounding direction for negative values.
static utils::multiprecision_int from_decimal_to_cppint(const data_value& from) {
    const auto& val_from = value_cast<big_decimal>(from);
    auto r = val_from.as_rational();
    return utils::multiprecision_int(numerator(r)/denominator(r));
}
|
||||
|
||||
template<typename ToType>
|
||||
|
||||
@@ -357,7 +357,12 @@ lists::setter_by_uuid::execute(mutation& m, const clustering_key_prefix& prefix,
|
||||
|
||||
collection_mutation_description mut;
|
||||
mut.cells.reserve(1);
|
||||
mut.cells.emplace_back(to_bytes(*index), params.make_cell(*ltype->value_comparator(), *value, atomic_cell::collection_member::yes));
|
||||
|
||||
if (!value) {
|
||||
mut.cells.emplace_back(to_bytes(*index), params.make_dead_cell());
|
||||
} else {
|
||||
mut.cells.emplace_back(to_bytes(*index), params.make_cell(*ltype->value_comparator(), *value, atomic_cell::collection_member::yes));
|
||||
}
|
||||
|
||||
m.set_cell(prefix, column, mut.serialize(*ltype));
|
||||
}
|
||||
|
||||
@@ -49,7 +49,7 @@ relation::to_column_definition(const schema& schema, const column_identifier::ra
|
||||
auto id = entity.prepare_column_identifier(schema);
|
||||
auto def = get_column_definition(schema, *id);
|
||||
if (!def || def->is_hidden_from_cql()) {
|
||||
throw exceptions::unrecognized_entity_exception(id, shared_from_this());
|
||||
throw exceptions::unrecognized_entity_exception(*id, to_string());
|
||||
}
|
||||
return *def;
|
||||
}
|
||||
|
||||
@@ -417,7 +417,7 @@ std::vector<const column_definition*> statement_restrictions::get_column_defs_fo
|
||||
_clustering_columns_restrictions->num_prefix_columns_that_need_not_be_filtered();
|
||||
for (auto&& cdef : _clustering_columns_restrictions->get_column_defs()) {
|
||||
::shared_ptr<single_column_restriction> restr;
|
||||
if (single_pk_restrs) {
|
||||
if (single_ck_restrs) {
|
||||
auto it = single_ck_restrs->restrictions().find(cdef);
|
||||
if (it != single_ck_restrs->restrictions().end()) {
|
||||
restr = dynamic_pointer_cast<single_column_restriction>(it->second);
|
||||
@@ -624,9 +624,6 @@ bool single_column_restriction::EQ::is_satisfied_by(const schema& schema,
|
||||
const row& cells,
|
||||
const query_options& options,
|
||||
gc_clock::time_point now) const {
|
||||
if (_column_def.type->is_counter()) {
|
||||
fail(unimplemented::cause::COUNTERS);
|
||||
}
|
||||
auto operand = value(options);
|
||||
if (operand) {
|
||||
auto cell_value = get_value(schema, key, ckey, cells, now);
|
||||
@@ -641,9 +638,6 @@ bool single_column_restriction::EQ::is_satisfied_by(const schema& schema,
|
||||
}
|
||||
|
||||
// Checks one serialized value against this EQ restriction. It is true only
// when the restriction's operand is set and compares equal to `data` under
// the column type's compare().
// Counter columns are no longer rejected up front: the enclosing hunk drops
// that guard, and the range-based overloads of this change set do the same.
bool single_column_restriction::EQ::is_satisfied_by(bytes_view data, const query_options& options) const {
    auto operand = value(options);
    return operand && _column_def.type->compare(*operand, data) == 0;
}
|
||||
@@ -654,9 +648,6 @@ bool single_column_restriction::IN::is_satisfied_by(const schema& schema,
|
||||
const row& cells,
|
||||
const query_options& options,
|
||||
gc_clock::time_point now) const {
|
||||
if (_column_def.type->is_counter()) {
|
||||
fail(unimplemented::cause::COUNTERS);
|
||||
}
|
||||
auto cell_value = get_value(schema, key, ckey, cells, now);
|
||||
if (!cell_value) {
|
||||
return false;
|
||||
@@ -670,9 +661,6 @@ bool single_column_restriction::IN::is_satisfied_by(const schema& schema,
|
||||
}
|
||||
|
||||
bool single_column_restriction::IN::is_satisfied_by(bytes_view data, const query_options& options) const {
|
||||
if (_column_def.type->is_counter()) {
|
||||
fail(unimplemented::cause::COUNTERS);
|
||||
}
|
||||
auto operands = values(options);
|
||||
return boost::algorithm::any_of(operands, [this, &data] (const bytes_opt& operand) {
|
||||
return operand && _column_def.type->compare(*operand, data) == 0;
|
||||
@@ -697,6 +685,11 @@ static query::range<bytes_view> to_range(const term_slice& slice, const query_op
|
||||
extract_bound(statements::bound::END));
|
||||
}
|
||||
|
||||
static bool contains_without_wraparound(
|
||||
const query::range<bytes_view>& range, bytes_view value, const serialized_tri_compare& cmp) {
|
||||
return !range.is_wrap_around(cmp) && range.contains(value, cmp);
|
||||
}
|
||||
|
||||
bool single_column_restriction::slice::is_satisfied_by(const schema& schema,
|
||||
const partition_key& key,
|
||||
const clustering_key_prefix& ckey,
|
||||
@@ -711,15 +704,14 @@ bool single_column_restriction::slice::is_satisfied_by(const schema& schema,
|
||||
return false;
|
||||
}
|
||||
return cell_value->with_linearized([&] (bytes_view cell_value_bv) {
|
||||
return to_range(_slice, options).contains(cell_value_bv, _column_def.type->as_tri_comparator());
|
||||
return contains_without_wraparound(to_range(_slice, options),
|
||||
cell_value_bv, _column_def.type->as_tri_comparator());
|
||||
});
|
||||
}
|
||||
|
||||
// Checks one serialized value against this slice restriction. The bounds are
// built from `options` by to_range() and compared using the tri-comparator
// of the column's underlying type.
// The containment test goes through contains_without_wraparound(), so bounds
// that wrap around match no value at all.
bool single_column_restriction::slice::is_satisfied_by(bytes_view data, const query_options& options) const {
    return contains_without_wraparound(to_range(_slice, options),
            data, _column_def.type->underlying_type()->as_tri_comparator());
}
|
||||
|
||||
bool single_column_restriction::contains::is_satisfied_by(const schema& schema,
|
||||
@@ -728,9 +720,6 @@ bool single_column_restriction::contains::is_satisfied_by(const schema& schema,
|
||||
const row& cells,
|
||||
const query_options& options,
|
||||
gc_clock::time_point now) const {
|
||||
if (_column_def.type->is_counter()) {
|
||||
fail(unimplemented::cause::COUNTERS);
|
||||
}
|
||||
if (!_column_def.type->is_collection()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -207,6 +207,9 @@ void alter_table_statement::add_column(const schema& schema, const table& cf, sc
|
||||
"because a collection with the same name and a different type has already been used in the past", column_name));
|
||||
}
|
||||
}
|
||||
if (type->is_counter() && !schema.is_counter()) {
|
||||
throw exceptions::configuration_exception(format("Cannot add a counter column ({}) in a non counter column family", column_name));
|
||||
}
|
||||
|
||||
cfm.with_column(column_name.name(), type, is_static ? column_kind::static_column : column_kind::regular_column);
|
||||
|
||||
@@ -222,7 +225,7 @@ void alter_table_statement::add_column(const schema& schema, const table& cf, sc
|
||||
schema_builder builder(view);
|
||||
if (view->view_info()->include_all_columns()) {
|
||||
builder.with_column(column_name.name(), type);
|
||||
} else if (view->view_info()->base_non_pk_columns_in_view_pk().empty()) {
|
||||
} else if (!view->view_info()->has_base_non_pk_columns_in_view_pk()) {
|
||||
db::view::create_virtual_column(builder, column_name.name(), type);
|
||||
}
|
||||
view_updates.push_back(view_ptr(builder.build()));
|
||||
|
||||
@@ -68,6 +68,7 @@ batch_statement::batch_statement(int bound_terms, type type_,
|
||||
, _has_conditions(boost::algorithm::any_of(_statements, [] (auto&& s) { return s.statement->has_conditions(); }))
|
||||
, _stats(stats)
|
||||
{
|
||||
validate();
|
||||
if (has_conditions()) {
|
||||
// A batch can be created not only by raw::batch_statement::prepare, but also by
|
||||
// cql_server::connection::process_batch, which doesn't call any methods of
|
||||
@@ -448,7 +449,6 @@ batch_statement::prepare(database& db, cql_stats& stats) {
|
||||
prep_attrs->collect_marker_specification(bound_names);
|
||||
|
||||
cql3::statements::batch_statement batch_statement_(bound_names.size(), _type, std::move(statements), std::move(prep_attrs), stats);
|
||||
batch_statement_.validate();
|
||||
|
||||
std::vector<uint16_t> partition_key_bind_indices;
|
||||
if (!have_multiple_cfs && batch_statement_.get_statements().size() > 0) {
|
||||
|
||||
@@ -255,7 +255,9 @@ void cf_prop_defs::apply_to_builder(schema_builder& builder, schema::extensions_
|
||||
}
|
||||
}
|
||||
|
||||
builder.set_default_time_to_live(gc_clock::duration(get_int(KW_DEFAULT_TIME_TO_LIVE, DEFAULT_DEFAULT_TIME_TO_LIVE)));
|
||||
if (has_property(KW_DEFAULT_TIME_TO_LIVE)) {
|
||||
builder.set_default_time_to_live(gc_clock::duration(get_int(KW_DEFAULT_TIME_TO_LIVE, DEFAULT_DEFAULT_TIME_TO_LIVE)));
|
||||
}
|
||||
|
||||
if (has_property(KW_SPECULATIVE_RETRY)) {
|
||||
builder.set_speculative_retry(get_string(KW_SPECULATIVE_RETRY, builder.get_speculative_retry().to_sstring()));
|
||||
|
||||
@@ -434,6 +434,12 @@ GCC6_CONCEPT(
|
||||
static KeyType
|
||||
generate_base_key_from_index_pk(const partition_key& index_pk, const std::optional<clustering_key>& index_ck, const schema& base_schema, const schema& view_schema) {
|
||||
const auto& base_columns = std::is_same_v<KeyType, partition_key> ? base_schema.partition_key_columns() : base_schema.clustering_key_columns();
|
||||
|
||||
// An empty key in the index paging state translates to an empty base key
|
||||
if (index_pk.is_empty() && !index_ck) {
|
||||
return KeyType::make_empty();
|
||||
}
|
||||
|
||||
std::vector<bytes_view> exploded_base_key;
|
||||
exploded_base_key.reserve(base_columns.size());
|
||||
|
||||
@@ -507,8 +513,7 @@ indexed_table_select_statement::do_execute_base_query(
|
||||
if (old_paging_state && concurrency == 1) {
|
||||
auto base_pk = generate_base_key_from_index_pk<partition_key>(old_paging_state->get_partition_key(),
|
||||
old_paging_state->get_clustering_key(), *_schema, *_view_schema);
|
||||
if (_schema->clustering_key_size() > 0) {
|
||||
assert(old_paging_state->get_clustering_key().has_value());
|
||||
if (old_paging_state->get_clustering_key() && _schema->clustering_key_size() > 0) {
|
||||
auto base_ck = generate_base_key_from_index_pk<clustering_key>(old_paging_state->get_partition_key(),
|
||||
old_paging_state->get_clustering_key(), *_schema, *_view_schema);
|
||||
command->slice.set_range(*_schema, base_pk,
|
||||
@@ -1362,8 +1367,8 @@ select_statement::prepare_restrictions(database& db,
|
||||
return ::make_shared<restrictions::statement_restrictions>(db, schema, statement_type::SELECT, std::move(_where_clause), bound_names,
|
||||
selection->contains_only_static_columns(), selection->contains_a_collection(), for_view, allow_filtering);
|
||||
} catch (const exceptions::unrecognized_entity_exception& e) {
|
||||
if (contains_alias(*e.entity)) {
|
||||
throw exceptions::invalid_request_exception(format("Aliases aren't allowed in the where clause ('{}')", e.relation->to_string()));
|
||||
if (contains_alias(e.entity)) {
|
||||
throw exceptions::invalid_request_exception(format("Aliases aren't allowed in the where clause ('{}')", e.relation_str));
|
||||
}
|
||||
throw;
|
||||
}
|
||||
|
||||
21
database.cc
21
database.cc
@@ -1323,7 +1323,7 @@ future<mutation> database::do_apply_counter_update(column_family& cf, const froz
|
||||
// counter state for each modified cell...
|
||||
|
||||
tracing::trace(trace_state, "Reading counter values from the CF");
|
||||
return counter_write_query(m_schema, cf.as_mutation_source(), m.decorated_key(), slice, trace_state)
|
||||
return counter_write_query(m_schema, cf.as_mutation_source(), m.decorated_key(), slice, trace_state, timeout)
|
||||
.then([this, &cf, &m, m_schema, timeout, trace_state] (auto mopt) {
|
||||
// ...now, that we got existing state of all affected counter
|
||||
// cells we can look for our shard in each of them, increment
|
||||
@@ -1827,7 +1827,11 @@ future<> database::truncate(const keyspace& ks, column_family& cf, timestamp_fun
|
||||
// TODO: indexes.
|
||||
// Note: since discard_sstables was changed to only count tables owned by this shard,
|
||||
// we can get zero rp back. Changed assert, and ensure we save at least low_mark.
|
||||
assert(low_mark <= rp || rp == db::replay_position());
|
||||
// #6995 - the assert below was broken in c2c6c71 and remained so for many years.
|
||||
// We nowadays do not flush tables with sstables but autosnapshot=false. This means
|
||||
// the low_mark assertion does not hold, because we maybe/probably never got around to
|
||||
// creating the sstables that would create them.
|
||||
assert(!should_flush || low_mark <= rp || rp == db::replay_position());
|
||||
rp = std::max(low_mark, rp);
|
||||
return truncate_views(cf, truncated_at, should_flush).then([&cf, truncated_at, rp] {
|
||||
// save_truncation_record() may actually fail after we cached the truncation time
|
||||
@@ -2005,9 +2009,10 @@ flat_mutation_reader make_multishard_streaming_reader(distributed<database>& db,
|
||||
reader_concurrency_semaphore* semaphore;
|
||||
};
|
||||
distributed<database>& _db;
|
||||
utils::UUID _table_id;
|
||||
std::vector<reader_context> _contexts;
|
||||
public:
|
||||
explicit streaming_reader_lifecycle_policy(distributed<database>& db) : _db(db), _contexts(smp::count) {
|
||||
streaming_reader_lifecycle_policy(distributed<database>& db, utils::UUID table_id) : _db(db), _table_id(table_id), _contexts(smp::count) {
|
||||
}
|
||||
virtual flat_mutation_reader create_reader(
|
||||
schema_ptr schema,
|
||||
@@ -2036,7 +2041,12 @@ flat_mutation_reader make_multishard_streaming_reader(distributed<database>& db,
|
||||
});
|
||||
}
|
||||
virtual reader_concurrency_semaphore& semaphore() override {
|
||||
return *_contexts[engine().cpu_id()].semaphore;
|
||||
const auto shard = engine().cpu_id();
|
||||
if (!_contexts[shard].semaphore) {
|
||||
auto& cf = _db.local().find_column_family(_table_id);
|
||||
_contexts[shard].semaphore = &cf.streaming_read_concurrency_semaphore();
|
||||
}
|
||||
return *_contexts[shard].semaphore;
|
||||
}
|
||||
};
|
||||
auto ms = mutation_source([&db] (schema_ptr s,
|
||||
@@ -2047,7 +2057,8 @@ flat_mutation_reader make_multishard_streaming_reader(distributed<database>& db,
|
||||
tracing::trace_state_ptr trace_state,
|
||||
streamed_mutation::forwarding,
|
||||
mutation_reader::forwarding fwd_mr) {
|
||||
return make_multishard_combining_reader(make_shared<streaming_reader_lifecycle_policy>(db), std::move(s), pr, ps, pc,
|
||||
auto table_id = s->id();
|
||||
return make_multishard_combining_reader(make_shared<streaming_reader_lifecycle_policy>(db, table_id), std::move(s), pr, ps, pc,
|
||||
std::move(trace_state), fwd_mr);
|
||||
});
|
||||
auto&& full_slice = schema->full_slice();
|
||||
|
||||
@@ -55,6 +55,7 @@
|
||||
#include <limits>
|
||||
#include <cstddef>
|
||||
#include "schema_fwd.hh"
|
||||
#include "db/view/view.hh"
|
||||
#include "db/schema_features.hh"
|
||||
#include "gms/feature.hh"
|
||||
#include "timestamp.hh"
|
||||
@@ -885,7 +886,7 @@ public:
|
||||
lw_shared_ptr<const sstable_list> get_sstables_including_compacted_undeleted() const;
|
||||
const std::vector<sstables::shared_sstable>& compacted_undeleted_sstables() const;
|
||||
std::vector<sstables::shared_sstable> select_sstables(const dht::partition_range& range) const;
|
||||
std::vector<sstables::shared_sstable> candidates_for_compaction() const;
|
||||
std::vector<sstables::shared_sstable> non_staging_sstables() const;
|
||||
std::vector<sstables::shared_sstable> sstables_need_rewrite() const;
|
||||
size_t sstables_count() const;
|
||||
std::vector<uint64_t> sstable_count_per_level() const;
|
||||
@@ -981,8 +982,9 @@ public:
|
||||
return *_config.sstables_manager;
|
||||
}
|
||||
|
||||
// Reader's schema must be the same as the base schema of each of the views.
|
||||
future<> populate_views(
|
||||
std::vector<view_ptr>,
|
||||
std::vector<db::view::view_and_base>,
|
||||
dht::token base_token,
|
||||
flat_mutation_reader&&);
|
||||
|
||||
@@ -998,7 +1000,7 @@ private:
|
||||
future<row_locker::lock_holder> do_push_view_replica_updates(const schema_ptr& s, mutation&& m, db::timeout_clock::time_point timeout, mutation_source&& source, const io_priority_class& io_priority) const;
|
||||
std::vector<view_ptr> affected_views(const schema_ptr& base, const mutation& update) const;
|
||||
future<> generate_and_propagate_view_updates(const schema_ptr& base,
|
||||
std::vector<view_ptr>&& views,
|
||||
std::vector<db::view::view_and_base>&& views,
|
||||
mutation&& m,
|
||||
flat_mutation_reader_opt existings) const;
|
||||
|
||||
|
||||
@@ -520,7 +520,7 @@ public:
|
||||
_segment_manager->totals.total_size_on_disk -= size_on_disk();
|
||||
_segment_manager->totals.total_size -= (size_on_disk() + _buffer.size_bytes());
|
||||
_segment_manager->add_file_to_delete(_file_name, _desc);
|
||||
} else {
|
||||
} else if (_segment_manager->cfg.warn_about_segments_left_on_disk_after_shutdown) {
|
||||
clogger.warn("Segment {} is dirty and is left on disk.", *this);
|
||||
}
|
||||
}
|
||||
@@ -614,11 +614,17 @@ public:
|
||||
future<sseg_ptr> terminate() {
|
||||
assert(_closed);
|
||||
if (!std::exchange(_terminated, true)) {
|
||||
clogger.trace("{} is closed but not terminated.", *this);
|
||||
if (_buffer.empty()) {
|
||||
new_buffer(0);
|
||||
// write a terminating zero block iff we are ending (a reused)
|
||||
// block before actual file end.
|
||||
// we should only get here when all actual data is
|
||||
// already flushed (see below, close()).
|
||||
if (size_on_disk() < _segment_manager->max_size) {
|
||||
clogger.trace("{} is closed but not terminated.", *this);
|
||||
if (_buffer.empty()) {
|
||||
new_buffer(0);
|
||||
}
|
||||
return cycle(true, true);
|
||||
}
|
||||
return cycle(true, true);
|
||||
}
|
||||
return make_ready_future<sseg_ptr>(shared_from_this());
|
||||
}
|
||||
@@ -1287,7 +1293,7 @@ future<db::commitlog::segment_manager::sseg_ptr> db::commitlog::segment_manager:
|
||||
auto fut = open_file_dma(filename, flags, opt);
|
||||
if (cfg.extensions && !cfg.extensions->commitlog_file_extensions().empty()) {
|
||||
for (auto * ext : cfg.extensions->commitlog_file_extensions()) {
|
||||
fut = fut.then([ext, filename, flags](file f) {
|
||||
fut = close_on_failure(std::move(fut), [ext, filename, flags](file f) {
|
||||
return ext->wrap_file(filename, f, flags).then([f](file nf) mutable {
|
||||
return nf ? nf : std::move(f);
|
||||
});
|
||||
@@ -2127,8 +2133,9 @@ db::commitlog::read_log_file(const sstring& filename, const sstring& pfx, seasta
|
||||
}).handle_exception([w](auto ep) {
|
||||
w->s.set_exception(ep);
|
||||
});
|
||||
|
||||
return ret.done();
|
||||
// #6265 - must keep subscription alive.
|
||||
auto res = ret.done();
|
||||
return res.finally([ret = std::move(ret)] {});
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
@@ -137,6 +137,7 @@ public:
|
||||
|
||||
bool reuse_segments = true;
|
||||
bool use_o_dsync = false;
|
||||
bool warn_about_segments_left_on_disk_after_shutdown = true;
|
||||
|
||||
const db::extensions * extensions = nullptr;
|
||||
};
|
||||
|
||||
@@ -299,7 +299,7 @@ future<> db::commitlog_replayer::impl::process(stats* s, commitlog::buffer_and_r
|
||||
mutation m(cf.schema(), fm.decorated_key(*cf.schema()));
|
||||
converting_mutation_partition_applier v(cm, *cf.schema(), m.partition());
|
||||
fm.partition().accept(cm, v);
|
||||
return do_with(std::move(m), [&db, &cf] (mutation m) {
|
||||
return do_with(std::move(m), [&db, &cf] (const mutation& m) {
|
||||
return db.apply_in_memory(m, cf, db::rp_handle(), db::no_timeout);
|
||||
});
|
||||
} else {
|
||||
|
||||
11
db/config.cc
11
db/config.cc
@@ -681,7 +681,7 @@ db::config::config(std::shared_ptr<db::extensions> exts)
|
||||
, replace_address(this, "replace_address", value_status::Used, "", "The listen_address or broadcast_address of the dead node to replace. Same as -Dcassandra.replace_address.")
|
||||
, replace_address_first_boot(this, "replace_address_first_boot", value_status::Used, "", "Like replace_address option, but if the node has been bootstrapped successfully it will be ignored. Same as -Dcassandra.replace_address_first_boot.")
|
||||
, override_decommission(this, "override_decommission", value_status::Used, false, "Set true to force a decommissioned node to join the cluster")
|
||||
, enable_repair_based_node_ops(this, "enable_repair_based_node_ops", liveness::LiveUpdate, value_status::Used, true, "Set true to use enable repair based node operations instead of streaming based")
|
||||
, enable_repair_based_node_ops(this, "enable_repair_based_node_ops", liveness::LiveUpdate, value_status::Used, false, "Set true to use enable repair based node operations instead of streaming based")
|
||||
, ring_delay_ms(this, "ring_delay_ms", value_status::Used, 30 * 1000, "Time a node waits to hear from other nodes before joining the ring in milliseconds. Same as -Dcassandra.ring_delay_ms in cassandra.")
|
||||
, shadow_round_ms(this, "shadow_round_ms", value_status::Used, 300 * 1000, "The maximum gossip shadow round time. Can be used to reduce the gossip feature check time during node boot up.")
|
||||
, fd_max_interval_ms(this, "fd_max_interval_ms", value_status::Used, 2 * 1000, "The maximum failure_detector interval time in milliseconds. Interval larger than the maximum will be ignored. Larger cluster may need to increase the default.")
|
||||
@@ -689,6 +689,7 @@ db::config::config(std::shared_ptr<db::extensions> exts)
|
||||
, shutdown_announce_in_ms(this, "shutdown_announce_in_ms", value_status::Used, 2 * 1000, "Time a node waits after sending gossip shutdown message in milliseconds. Same as -Dcassandra.shutdown_announce_in_ms in cassandra.")
|
||||
, developer_mode(this, "developer_mode", value_status::Used, false, "Relax environment checks. Setting to true can reduce performance and reliability significantly.")
|
||||
, skip_wait_for_gossip_to_settle(this, "skip_wait_for_gossip_to_settle", value_status::Used, -1, "An integer to configure the wait for gossip to settle. -1: wait normally, 0: do not wait at all, n: wait for at most n polls. Same as -Dcassandra.skip_wait_for_gossip_to_settle in cassandra.")
|
||||
, force_gossip_generation(this, "force_gossip_generation", liveness::LiveUpdate, value_status::Used, -1 , "Force gossip to use the generation number provided by user")
|
||||
, experimental(this, "experimental", value_status::Used, false, "Set to true to unlock all experimental features.")
|
||||
, experimental_features(this, "experimental_features", value_status::Used, {}, "Unlock experimental features provided as the option arguments (possible values: 'lwt', 'cdc', 'udf'). Can be repeated.")
|
||||
, lsa_reclamation_step(this, "lsa_reclamation_step", value_status::Used, 1, "Minimum number of segments to reclaim in a single step")
|
||||
@@ -859,7 +860,7 @@ db::fs::path db::config::get_conf_sub(db::fs::path sub) {
|
||||
}
|
||||
|
||||
bool db::config::check_experimental(experimental_features_t::feature f) const {
|
||||
if (experimental()) {
|
||||
if (experimental() && f != experimental_features_t::UNUSED) {
|
||||
return true;
|
||||
}
|
||||
const auto& optval = experimental_features();
|
||||
@@ -911,11 +912,13 @@ const db::extensions& db::config::extensions() const {
|
||||
std::unordered_map<sstring, db::experimental_features_t::feature> db::experimental_features_t::map() {
|
||||
// We decided against using the construct-on-first-use idiom here:
|
||||
// https://github.com/scylladb/scylla/pull/5369#discussion_r353614807
|
||||
return {{"lwt", LWT}, {"udf", UDF}, {"cdc", CDC}};
|
||||
// Lightweight transactions are no longer experimental. Map them
|
||||
// to UNUSED switch for a while, then remove altogether.
|
||||
return {{"lwt", UNUSED}, {"udf", UDF}, {"cdc", CDC}};
|
||||
}
|
||||
|
||||
std::vector<enum_option<db::experimental_features_t>> db::experimental_features_t::all() {
|
||||
return {LWT, UDF, CDC};
|
||||
return {UDF, CDC};
|
||||
}
|
||||
|
||||
template struct utils::config_file::named_value<seastar::log_level>;
|
||||
|
||||
@@ -81,7 +81,7 @@ namespace db {
|
||||
|
||||
/// Enumeration of all valid values for the `experimental` config entry.
|
||||
struct experimental_features_t {
|
||||
enum feature { LWT, UDF, CDC };
|
||||
enum feature { UNUSED, UDF, CDC };
|
||||
static std::unordered_map<sstring, feature> map(); // See enum_option.
|
||||
static std::vector<enum_option<experimental_features_t>> all();
|
||||
};
|
||||
@@ -278,6 +278,7 @@ public:
|
||||
named_value<uint32_t> shutdown_announce_in_ms;
|
||||
named_value<bool> developer_mode;
|
||||
named_value<int32_t> skip_wait_for_gossip_to_settle;
|
||||
named_value<int32_t> force_gossip_generation;
|
||||
named_value<bool> experimental;
|
||||
named_value<std::vector<enum_option<experimental_features_t>>> experimental_features;
|
||||
named_value<size_t> lsa_reclamation_step;
|
||||
|
||||
@@ -224,7 +224,9 @@ future<> manager::end_point_hints_manager::stop(drain should_drain) noexcept {
|
||||
with_lock(file_update_mutex(), [this] {
|
||||
if (_hints_store_anchor) {
|
||||
hints_store_ptr tmp = std::exchange(_hints_store_anchor, nullptr);
|
||||
return tmp->shutdown().finally([tmp] {});
|
||||
return tmp->shutdown().finally([tmp] {
|
||||
return tmp->release();
|
||||
}).finally([tmp] {});
|
||||
}
|
||||
return make_ready_future<>();
|
||||
}).handle_exception([&eptr] (auto e) { eptr = std::move(e); }).get();
|
||||
@@ -326,6 +328,10 @@ future<db::commitlog> manager::end_point_hints_manager::add_store() noexcept {
|
||||
// HH doesn't utilize the flow that benefits from reusing segments.
|
||||
// Therefore let's simply disable it to avoid any possible confusion.
|
||||
cfg.reuse_segments = false;
|
||||
// HH leaves segments on disk after commitlog shutdown, and later reads
|
||||
// them when commitlog is re-created. This is expected to happen regularly
|
||||
// during standard HH workload, so no need to print a warning about it.
|
||||
cfg.warn_about_segments_left_on_disk_after_shutdown = false;
|
||||
|
||||
return commitlog::create_commitlog(std::move(cfg)).then([this] (commitlog l) {
|
||||
// add_store() is triggered every time hint files are forcefully flushed to I/O (every hints_flush_period).
|
||||
@@ -352,7 +358,9 @@ future<> manager::end_point_hints_manager::flush_current_hints() noexcept {
|
||||
return futurize_apply([this] {
|
||||
return with_lock(file_update_mutex(), [this]() -> future<> {
|
||||
return get_or_load().then([] (hints_store_ptr cptr) {
|
||||
return cptr->shutdown();
|
||||
return cptr->shutdown().finally([cptr] {
|
||||
return cptr->release();
|
||||
}).finally([cptr] {});
|
||||
}).then([this] {
|
||||
// Un-hold the commitlog object. Since we are under the exclusive _file_update_mutex lock there are no
|
||||
// other hints_store_ptr copies and this would destroy the commitlog shared value.
|
||||
@@ -703,6 +711,7 @@ future<> manager::end_point_hints_manager::sender::send_one_hint(lw_shared_ptr<s
|
||||
// Files are aggregated for at most manager::hints_timer_period therefore the oldest hint there is
|
||||
// (last_modification - manager::hints_timer_period) old.
|
||||
if (gc_clock::now().time_since_epoch() - secs_since_file_mod > gc_grace_sec - manager::hints_flush_period) {
|
||||
ctx_ptr->rps_set.erase(rp);
|
||||
return make_ready_future<>();
|
||||
}
|
||||
|
||||
@@ -725,6 +734,7 @@ future<> manager::end_point_hints_manager::sender::send_one_hint(lw_shared_ptr<s
|
||||
manager_logger.debug("send_hints(): {} at {}: {}", fname, rp, e.what());
|
||||
++this->shard_stats().discarded;
|
||||
}
|
||||
ctx_ptr->rps_set.erase(rp);
|
||||
return make_ready_future<>();
|
||||
}).finally([units = std::move(units), ctx_ptr] {});
|
||||
}).handle_exception([this, ctx_ptr] (auto eptr) {
|
||||
|
||||
@@ -822,6 +822,14 @@ future<> merge_schema(distributed<service::storage_proxy>& proxy, gms::feature_s
|
||||
});
|
||||
}
|
||||
|
||||
future<> recalculate_schema_version(distributed<service::storage_proxy>& proxy, gms::feature_service& feat) {
|
||||
return merge_lock().then([&proxy, &feat] {
|
||||
return update_schema_version_and_announce(proxy, feat.cluster_schema_features());
|
||||
}).finally([] {
|
||||
return merge_unlock();
|
||||
});
|
||||
}
|
||||
|
||||
future<> merge_schema(distributed<service::storage_proxy>& proxy, std::vector<mutation> mutations, bool do_flush)
|
||||
{
|
||||
return merge_lock().then([&proxy, mutations = std::move(mutations), do_flush] () mutable {
|
||||
|
||||
@@ -170,6 +170,13 @@ future<> merge_schema(distributed<service::storage_proxy>& proxy, gms::feature_s
|
||||
|
||||
future<> merge_schema(distributed<service::storage_proxy>& proxy, std::vector<mutation> mutations, bool do_flush);
|
||||
|
||||
// Recalculates the local schema version and publishes it in gossip.
|
||||
//
|
||||
// It is safe to call concurrently with recalculate_schema_version() and merge_schema() in which case it
|
||||
// is guaranteed that the schema version we end up with after all calls will reflect the most recent state
|
||||
// of feature_service and schema tables.
|
||||
future<> recalculate_schema_version(distributed<service::storage_proxy>& proxy, gms::feature_service& feat);
|
||||
|
||||
future<std::set<sstring>> merge_keyspaces(distributed<service::storage_proxy>& proxy, schema_result&& before, schema_result&& after);
|
||||
|
||||
std::vector<mutation> make_create_keyspace_mutations(lw_shared_ptr<keyspace_metadata> keyspace, api::timestamp_type timestamp, bool with_tables_and_types_and_functions = true);
|
||||
|
||||
@@ -187,7 +187,7 @@ schema_ptr batchlog() {
|
||||
{{"cf_id", uuid_type}},
|
||||
// regular columns
|
||||
{
|
||||
{"in_progress_ballot", timeuuid_type},
|
||||
{"promise", timeuuid_type},
|
||||
{"most_recent_commit", bytes_type}, // serialization format is defined by frozen_mutation idl
|
||||
{"most_recent_commit_at", timeuuid_type},
|
||||
{"proposal", bytes_type}, // serialization format is defined by frozen_mutation idl
|
||||
@@ -203,6 +203,7 @@ schema_ptr batchlog() {
|
||||
// operations on resulting CFMetaData:
|
||||
// .compactionStrategyClass(LeveledCompactionStrategy.class);
|
||||
)));
|
||||
builder.set_gc_grace_seconds(0);
|
||||
builder.with_version(generate_schema_version(builder.uuid()));
|
||||
builder.set_wait_for_sync_to_commitlog(true);
|
||||
return builder.build(schema_builder::compact_storage::no);
|
||||
@@ -226,6 +227,7 @@ schema_ptr built_indexes() {
|
||||
// comment
|
||||
"built column indexes"
|
||||
)));
|
||||
builder.set_gc_grace_seconds(0);
|
||||
builder.with_version(generate_schema_version(builder.uuid()));
|
||||
return builder.build(schema_builder::compact_storage::yes);
|
||||
}();
|
||||
@@ -272,6 +274,7 @@ schema_ptr built_indexes() {
|
||||
// comment
|
||||
"information about the local node"
|
||||
)));
|
||||
builder.set_gc_grace_seconds(0);
|
||||
builder.with_version(generate_schema_version(builder.uuid()));
|
||||
builder.remove_column("scylla_cpu_sharding_algorithm");
|
||||
builder.remove_column("scylla_nr_shards");
|
||||
@@ -307,6 +310,7 @@ schema_ptr built_indexes() {
|
||||
// comment
|
||||
"information about known peers in the cluster"
|
||||
)));
|
||||
builder.set_gc_grace_seconds(0);
|
||||
builder.with_version(generate_schema_version(builder.uuid()));
|
||||
return builder.build(schema_builder::compact_storage::no);
|
||||
}();
|
||||
@@ -331,6 +335,7 @@ schema_ptr built_indexes() {
|
||||
// comment
|
||||
"events related to peers"
|
||||
)));
|
||||
builder.set_gc_grace_seconds(0);
|
||||
builder.with_version(generate_schema_version(builder.uuid()));
|
||||
return builder.build(schema_builder::compact_storage::no);
|
||||
}();
|
||||
@@ -353,6 +358,7 @@ schema_ptr built_indexes() {
|
||||
// comment
|
||||
"ranges requested for transfer"
|
||||
)));
|
||||
builder.set_gc_grace_seconds(0);
|
||||
builder.with_version(generate_schema_version(builder.uuid()));
|
||||
return builder.build(schema_builder::compact_storage::no);
|
||||
}();
|
||||
@@ -490,6 +496,7 @@ schema_ptr size_estimates() {
|
||||
// comment
|
||||
"partitions larger than specified threshold"
|
||||
)));
|
||||
builder.set_gc_grace_seconds(0);
|
||||
builder.with_version(generate_schema_version(builder.uuid()));
|
||||
return builder.build(schema_builder::compact_storage::no);
|
||||
}();
|
||||
@@ -510,6 +517,7 @@ static schema_ptr large_rows() {
|
||||
.with_column("compaction_time", timestamp_type)
|
||||
.set_comment("rows larger than specified threshold")
|
||||
.with_version(generate_schema_version(id))
|
||||
.set_gc_grace_seconds(0)
|
||||
.build();
|
||||
}();
|
||||
return large_rows;
|
||||
@@ -530,6 +538,7 @@ static schema_ptr large_cells() {
|
||||
.with_column("compaction_time", timestamp_type)
|
||||
.set_comment("cells larger than specified threshold")
|
||||
.with_version(generate_schema_version(id))
|
||||
.set_gc_grace_seconds(0)
|
||||
.build();
|
||||
}();
|
||||
return large_cells;
|
||||
@@ -553,6 +562,7 @@ static schema_ptr large_cells() {
|
||||
// comment
|
||||
"Scylla specific information about the local node"
|
||||
)));
|
||||
builder.set_gc_grace_seconds(0);
|
||||
builder.with_version(generate_schema_version(builder.uuid()));
|
||||
return builder.build(schema_builder::compact_storage::no);
|
||||
}();
|
||||
@@ -666,6 +676,7 @@ schema_ptr local() {
|
||||
// comment
|
||||
"information about the local node"
|
||||
)));
|
||||
builder.set_gc_grace_seconds(0);
|
||||
builder.with_version(generate_schema_version(builder.uuid()));
|
||||
return builder.build(schema_builder::compact_storage::no);
|
||||
}();
|
||||
@@ -693,6 +704,7 @@ schema_ptr truncated() {
|
||||
// comment
|
||||
"information about table truncation"
|
||||
)));
|
||||
builder.set_gc_grace_seconds(0);
|
||||
builder.with_version(generate_schema_version(builder.uuid()));
|
||||
return builder.build(schema_builder::compact_storage::no);
|
||||
}();
|
||||
@@ -755,6 +767,7 @@ schema_ptr available_ranges() {
|
||||
// comment
|
||||
"available keyspace/ranges during bootstrap/replace that are ready to be served"
|
||||
)));
|
||||
builder.set_gc_grace_seconds(0);
|
||||
builder.with_version(generate_schema_version(builder.uuid()));
|
||||
return builder.build();
|
||||
}();
|
||||
@@ -777,6 +790,7 @@ schema_ptr views_builds_in_progress() {
|
||||
// comment
|
||||
"views builds current progress"
|
||||
)));
|
||||
builder.set_gc_grace_seconds(0);
|
||||
builder.with_version(generate_schema_version(builder.uuid()));
|
||||
return builder.build();
|
||||
}();
|
||||
@@ -799,6 +813,7 @@ schema_ptr built_views() {
|
||||
// comment
|
||||
"built views"
|
||||
)));
|
||||
builder.set_gc_grace_seconds(0);
|
||||
builder.with_version(generate_schema_version(builder.uuid()));
|
||||
return builder.build();
|
||||
}();
|
||||
@@ -842,6 +857,7 @@ schema_ptr scylla_views_builds_in_progress() {
|
||||
// comment
|
||||
"CDC-specific information that the local node stores"
|
||||
)));
|
||||
builder.set_gc_grace_seconds(0);
|
||||
builder.with_version(generate_schema_version(builder.uuid()));
|
||||
return builder.build(schema_builder::compact_storage::no);
|
||||
}();
|
||||
@@ -2196,13 +2212,13 @@ future<service::paxos::paxos_state> load_paxos_state(const partition_key& key, s
|
||||
// FIXME: we need execute_cql_with_now()
|
||||
(void)now;
|
||||
auto f = execute_cql_with_timeout(cql, timeout, to_legacy(*key.get_compound_type(*s), key.representation()), s->id());
|
||||
return f.then([s] (shared_ptr<cql3::untyped_result_set> results) mutable {
|
||||
return f.then([s, key] (shared_ptr<cql3::untyped_result_set> results) mutable {
|
||||
if (results->empty()) {
|
||||
return service::paxos::paxos_state();
|
||||
}
|
||||
auto& row = results->one();
|
||||
auto promised = row.has("in_progress_ballot")
|
||||
? row.get_as<utils::UUID>("in_progress_ballot") : utils::UUID_gen::min_time_UUID(0);
|
||||
auto promised = row.has("promise")
|
||||
? row.get_as<utils::UUID>("promise") : utils::UUID_gen::min_time_UUID(0);
|
||||
|
||||
std::optional<service::paxos::proposal> accepted;
|
||||
if (row.has("proposal")) {
|
||||
@@ -2211,9 +2227,14 @@ future<service::paxos::paxos_state> load_paxos_state(const partition_key& key, s
|
||||
}
|
||||
|
||||
std::optional<service::paxos::proposal> most_recent;
|
||||
if (row.has("most_recent_commit")) {
|
||||
if (row.has("most_recent_commit_at")) {
|
||||
// the value can be missing if it was pruned, suply empty one since
|
||||
// it will not going to be used anyway
|
||||
auto fm = row.has("most_recent_commit") ?
|
||||
ser::deserialize_from_buffer<>(row.get_blob("most_recent_commit"), boost::type<frozen_mutation>(), 0) :
|
||||
freeze(mutation(s, key));
|
||||
most_recent = service::paxos::proposal(row.get_as<utils::UUID>("most_recent_commit_at"),
|
||||
ser::deserialize_from_buffer<>(row.get_blob("most_recent_commit"), boost::type<frozen_mutation>(), 0));
|
||||
std::move(fm));
|
||||
}
|
||||
|
||||
return service::paxos::paxos_state(promised, std::move(accepted), std::move(most_recent));
|
||||
@@ -2228,7 +2249,7 @@ static int32_t paxos_ttl_sec(const schema& s) {
|
||||
}
|
||||
|
||||
future<> save_paxos_promise(const schema& s, const partition_key& key, const utils::UUID& ballot, db::timeout_clock::time_point timeout) {
|
||||
static auto cql = format("UPDATE system.{} USING TIMESTAMP ? AND TTL ? SET in_progress_ballot = ? WHERE row_key = ? AND cf_id = ?", PAXOS);
|
||||
static auto cql = format("UPDATE system.{} USING TIMESTAMP ? AND TTL ? SET promise = ? WHERE row_key = ? AND cf_id = ?", PAXOS);
|
||||
return execute_cql_with_timeout(cql,
|
||||
timeout,
|
||||
utils::UUID_gen::micros_timestamp(ballot),
|
||||
@@ -2240,13 +2261,14 @@ future<> save_paxos_promise(const schema& s, const partition_key& key, const uti
|
||||
}
|
||||
|
||||
future<> save_paxos_proposal(const schema& s, const service::paxos::proposal& proposal, db::timeout_clock::time_point timeout) {
|
||||
static auto cql = format("UPDATE system.{} USING TIMESTAMP ? AND TTL ? SET proposal_ballot = ?, proposal = ? WHERE row_key = ? AND cf_id = ?", PAXOS);
|
||||
static auto cql = format("UPDATE system.{} USING TIMESTAMP ? AND TTL ? SET promise = ?, proposal_ballot = ?, proposal = ? WHERE row_key = ? AND cf_id = ?", PAXOS);
|
||||
partition_key_view key = proposal.update.key(s);
|
||||
return execute_cql_with_timeout(cql,
|
||||
timeout,
|
||||
utils::UUID_gen::micros_timestamp(proposal.ballot),
|
||||
paxos_ttl_sec(s),
|
||||
proposal.ballot,
|
||||
proposal.ballot,
|
||||
ser::serialize_to_buffer<bytes>(proposal.update),
|
||||
to_legacy(*key.get_compound_type(s), key.representation()),
|
||||
s.id()
|
||||
@@ -2274,6 +2296,20 @@ future<> save_paxos_decision(const schema& s, const service::paxos::proposal& de
|
||||
).discard_result();
|
||||
}
|
||||
|
||||
future<> delete_paxos_decision(const schema& s, const partition_key& key, const utils::UUID& ballot, db::timeout_clock::time_point timeout) {
|
||||
// This should be called only if a learn stage succeeded on all replicas.
|
||||
// In this case we can remove learned paxos value using ballot's timestamp which
|
||||
// guarantees that if there is more recent round it will not be affected.
|
||||
static auto cql = format("DELETE most_recent_commit FROM system.{} USING TIMESTAMP ? WHERE row_key = ? AND cf_id = ?", PAXOS);
|
||||
|
||||
return execute_cql_with_timeout(cql,
|
||||
timeout,
|
||||
utils::UUID_gen::micros_timestamp(ballot),
|
||||
to_legacy(*key.get_compound_type(s), key.representation()),
|
||||
s.id()
|
||||
).discard_result();
|
||||
}
|
||||
|
||||
} // namespace system_keyspace
|
||||
|
||||
sstring system_keyspace_name() {
|
||||
|
||||
@@ -647,6 +647,7 @@ future<service::paxos::paxos_state> load_paxos_state(const partition_key& key, s
|
||||
future<> save_paxos_promise(const schema& s, const partition_key& key, const utils::UUID& ballot, db::timeout_clock::time_point timeout);
|
||||
future<> save_paxos_proposal(const schema& s, const service::paxos::proposal& proposal, db::timeout_clock::time_point timeout);
|
||||
future<> save_paxos_decision(const schema& s, const service::paxos::proposal& decision, db::timeout_clock::time_point timeout);
|
||||
future<> delete_paxos_decision(const schema& s, const partition_key& key, const utils::UUID& ballot, db::timeout_clock::time_point timeout);
|
||||
|
||||
} // namespace system_keyspace
|
||||
} // namespace db
|
||||
|
||||
@@ -130,17 +130,26 @@ const column_definition* view_info::view_column(const column_definition& base_de
|
||||
return _schema.get_column_definition(base_def.name());
|
||||
}
|
||||
|
||||
const std::vector<column_id>& view_info::base_non_pk_columns_in_view_pk() const {
|
||||
return _base_non_pk_columns_in_view_pk;
|
||||
void view_info::set_base_info(db::view::base_info_ptr base_info) {
|
||||
_base_info = std::move(base_info);
|
||||
}
|
||||
|
||||
void view_info::initialize_base_dependent_fields(const schema& base) {
|
||||
db::view::base_info_ptr view_info::make_base_dependent_view_info(const schema& base) const {
|
||||
std::vector<column_id> base_non_pk_columns_in_view_pk;
|
||||
for (auto&& view_col : boost::range::join(_schema.partition_key_columns(), _schema.clustering_key_columns())) {
|
||||
auto* base_col = base.get_column_definition(view_col.name());
|
||||
if (base_col && !base_col->is_primary_key()) {
|
||||
_base_non_pk_columns_in_view_pk.push_back(base_col->id);
|
||||
base_non_pk_columns_in_view_pk.push_back(base_col->id);
|
||||
}
|
||||
}
|
||||
return make_lw_shared<db::view::base_dependent_view_info>({
|
||||
.base_schema = base.shared_from_this(),
|
||||
.base_non_pk_columns_in_view_pk = std::move(base_non_pk_columns_in_view_pk)
|
||||
});
|
||||
}
|
||||
|
||||
bool view_info::has_base_non_pk_columns_in_view_pk() const {
|
||||
return !_base_info->base_non_pk_columns_in_view_pk.empty();
|
||||
}
|
||||
|
||||
namespace db {
|
||||
@@ -188,11 +197,11 @@ bool may_be_affected_by(const schema& base, const view_info& view, const dht::de
|
||||
}
|
||||
|
||||
static bool update_requires_read_before_write(const schema& base,
|
||||
const std::vector<view_ptr>& views,
|
||||
const std::vector<view_and_base>& views,
|
||||
const dht::decorated_key& key,
|
||||
const rows_entry& update) {
|
||||
for (auto&& v : views) {
|
||||
view_info& vf = *v->view_info();
|
||||
view_info& vf = *v.view->view_info();
|
||||
if (may_be_affected_by(base, vf, key, update)) {
|
||||
return true;
|
||||
}
|
||||
@@ -239,12 +248,14 @@ class view_updates final {
|
||||
view_ptr _view;
|
||||
const view_info& _view_info;
|
||||
schema_ptr _base;
|
||||
base_info_ptr _base_info;
|
||||
std::unordered_map<partition_key, mutation_partition, partition_key::hashing, partition_key::equality> _updates;
|
||||
public:
|
||||
explicit view_updates(view_ptr view, schema_ptr base)
|
||||
: _view(std::move(view))
|
||||
explicit view_updates(view_and_base vab)
|
||||
: _view(std::move(vab.view))
|
||||
, _view_info(*_view->view_info())
|
||||
, _base(std::move(base))
|
||||
, _base(vab.base->base_schema)
|
||||
, _base_info(vab.base)
|
||||
, _updates(8, partition_key::hashing(*_view), partition_key::equality(*_view)) {
|
||||
}
|
||||
|
||||
@@ -306,7 +317,7 @@ row_marker view_updates::compute_row_marker(const clustering_row& base_row) cons
|
||||
// they share liveness information. It's true especially in the only case currently allowed by CQL,
|
||||
// which assumes there's up to one non-pk column in the view key. It's also true in alternator,
|
||||
// which does not carry TTL information.
|
||||
const auto& col_ids = _view_info.base_non_pk_columns_in_view_pk();
|
||||
const auto& col_ids = _base_info->base_non_pk_columns_in_view_pk;
|
||||
if (!col_ids.empty()) {
|
||||
auto& def = _base->regular_column_at(col_ids[0]);
|
||||
// Note: multi-cell columns can't be part of the primary key.
|
||||
@@ -537,7 +548,7 @@ void view_updates::delete_old_entry(const partition_key& base_key, const cluster
|
||||
|
||||
void view_updates::do_delete_old_entry(const partition_key& base_key, const clustering_row& existing, const clustering_row& update, gc_clock::time_point now) {
|
||||
auto& r = get_view_row(base_key, existing);
|
||||
const auto& col_ids = _view_info.base_non_pk_columns_in_view_pk();
|
||||
const auto& col_ids = _base_info->base_non_pk_columns_in_view_pk;
|
||||
if (!col_ids.empty()) {
|
||||
// We delete the old row using a shadowable row tombstone, making sure that
|
||||
// the tombstone deletes everything in the row (or it might still show up).
|
||||
@@ -678,7 +689,7 @@ void view_updates::generate_update(
|
||||
return;
|
||||
}
|
||||
|
||||
const auto& col_ids = _view_info.base_non_pk_columns_in_view_pk();
|
||||
const auto& col_ids = _base_info->base_non_pk_columns_in_view_pk;
|
||||
if (col_ids.empty()) {
|
||||
// The view key is necessarily the same pre and post update.
|
||||
if (existing && existing->is_live(*_base)) {
|
||||
@@ -932,11 +943,16 @@ future<stop_iteration> view_update_builder::on_results() {
|
||||
|
||||
future<std::vector<frozen_mutation_and_schema>> generate_view_updates(
|
||||
const schema_ptr& base,
|
||||
std::vector<view_ptr>&& views_to_update,
|
||||
std::vector<view_and_base>&& views_to_update,
|
||||
flat_mutation_reader&& updates,
|
||||
flat_mutation_reader_opt&& existings) {
|
||||
auto vs = boost::copy_range<std::vector<view_updates>>(views_to_update | boost::adaptors::transformed([&] (auto&& v) {
|
||||
return view_updates(std::move(v), base);
|
||||
auto vs = boost::copy_range<std::vector<view_updates>>(views_to_update | boost::adaptors::transformed([&] (view_and_base v) {
|
||||
if (base->version() != v.base->base_schema->version()) {
|
||||
on_internal_error(vlogger, format("Schema version used for view updates ({}) does not match the current"
|
||||
" base schema version of the view ({}) for view {}.{} of {}.{}",
|
||||
base->version(), v.base->base_schema->version(), v.view->ks_name(), v.view->cf_name(), base->ks_name(), base->cf_name()));
|
||||
}
|
||||
return view_updates(std::move(v));
|
||||
}));
|
||||
auto builder = std::make_unique<view_update_builder>(base, std::move(vs), std::move(updates), std::move(existings));
|
||||
auto f = builder->build();
|
||||
@@ -946,18 +962,18 @@ future<std::vector<frozen_mutation_and_schema>> generate_view_updates(
|
||||
query::clustering_row_ranges calculate_affected_clustering_ranges(const schema& base,
|
||||
const dht::decorated_key& key,
|
||||
const mutation_partition& mp,
|
||||
const std::vector<view_ptr>& views) {
|
||||
const std::vector<view_and_base>& views) {
|
||||
std::vector<nonwrapping_range<clustering_key_prefix_view>> row_ranges;
|
||||
std::vector<nonwrapping_range<clustering_key_prefix_view>> view_row_ranges;
|
||||
clustering_key_prefix_view::tri_compare cmp(base);
|
||||
if (mp.partition_tombstone() || !mp.row_tombstones().empty()) {
|
||||
for (auto&& v : views) {
|
||||
// FIXME: #2371
|
||||
if (v->view_info()->select_statement().get_restrictions()->has_unrestricted_clustering_columns()) {
|
||||
if (v.view->view_info()->select_statement().get_restrictions()->has_unrestricted_clustering_columns()) {
|
||||
view_row_ranges.push_back(nonwrapping_range<clustering_key_prefix_view>::make_open_ended_both_sides());
|
||||
break;
|
||||
}
|
||||
for (auto&& r : v->view_info()->partition_slice().default_row_ranges()) {
|
||||
for (auto&& r : v.view->view_info()->partition_slice().default_row_ranges()) {
|
||||
view_row_ranges.push_back(r.transform(std::mem_fn(&clustering_key_prefix::view)));
|
||||
}
|
||||
}
|
||||
@@ -1101,6 +1117,8 @@ future<> mutate_MV(
|
||||
}
|
||||
};
|
||||
if (paired_endpoint) {
|
||||
// If paired endpoint is present, remove it from the list of pending endpoints to avoid duplicates
|
||||
pending_endpoints.erase(std::remove(pending_endpoints.begin(), pending_endpoints.end(), *paired_endpoint), pending_endpoints.end());
|
||||
// When paired endpoint is the local node, we can just apply
|
||||
// the mutation locally, unless there are pending endpoints, in
|
||||
// which case we want to do an ordinary write so the view mutation
|
||||
@@ -1715,7 +1733,7 @@ public:
|
||||
return stop_iteration::yes;
|
||||
}
|
||||
|
||||
_fragments_memory_usage += cr.memory_usage(*_step.base->schema());
|
||||
_fragments_memory_usage += cr.memory_usage(*_step.reader.schema());
|
||||
_fragments.push_back(std::move(cr));
|
||||
if (_fragments_memory_usage > batch_memory_max) {
|
||||
// Although we have not yet completed the batch of base rows that
|
||||
@@ -1735,10 +1753,14 @@ public:
|
||||
_builder._as.check();
|
||||
if (!_fragments.empty()) {
|
||||
_fragments.push_front(partition_start(_step.current_key, tombstone()));
|
||||
auto base_schema = _step.base->schema();
|
||||
auto views = with_base_info_snapshot(_views_to_build);
|
||||
auto reader = make_flat_mutation_reader_from_fragments(_step.reader.schema(), std::move(_fragments));
|
||||
reader.upgrade_schema(base_schema);
|
||||
_step.base->populate_views(
|
||||
_views_to_build,
|
||||
std::move(views),
|
||||
_step.current_token(),
|
||||
make_flat_mutation_reader_from_fragments(_step.base->schema(), std::move(_fragments))).get();
|
||||
std::move(reader)).get();
|
||||
_fragments.clear();
|
||||
_fragments_memory_usage = 0;
|
||||
}
|
||||
@@ -1885,5 +1907,11 @@ future<bool> check_needs_view_update_path(db::system_distributed_keyspace& sys_d
|
||||
});
|
||||
}
|
||||
|
||||
// Builds a vector of view_and_base from a vector of view schemas: each
// element pairs the view_ptr with the value returned by its
// view_info()->base_info() at the time of the call.
std::vector<db::view::view_and_base> with_base_info_snapshot(std::vector<view_ptr> vs) {
|
||||
return boost::copy_range<std::vector<db::view::view_and_base>>(vs | boost::adaptors::transformed([] (const view_ptr& v) {
|
||||
// Copy the view pointer and capture its current base-dependent info alongside it.
return db::view::view_and_base{v, v->view_info()->base_info()};
|
||||
}));
|
||||
}
|
||||
|
||||
} // namespace view
|
||||
} // namespace db
|
||||
|
||||
@@ -43,6 +43,27 @@ namespace db {
|
||||
|
||||
namespace view {
|
||||
|
||||
// Part of the view description which depends on the base schema version.
|
||||
//
|
||||
// This structure may change even though the view schema doesn't change, so
|
||||
// it needs to live outside view_ptr.
|
||||
struct base_dependent_view_info {
|
||||
schema_ptr base_schema;
|
||||
|
||||
// Id of a regular base table column included in the view's PK, if any.
|
||||
// Scylla views only allow one such column, alternator can have up to two.
|
||||
std::vector<column_id> base_non_pk_columns_in_view_pk;
|
||||
};
|
||||
|
||||
// Immutable snapshot of view's base-schema-dependent part.
|
||||
using base_info_ptr = lw_shared_ptr<const base_dependent_view_info>;
|
||||
|
||||
// Snapshot of the view schema and its base-schema-dependent part.
|
||||
struct view_and_base {
|
||||
view_ptr view;
|
||||
base_info_ptr base;
|
||||
};
|
||||
|
||||
/**
|
||||
* Whether the view filter considers the specified partition key.
|
||||
*
|
||||
@@ -92,7 +113,7 @@ bool clustering_prefix_matches(const schema& base, const partition_key& key, con
|
||||
|
||||
future<std::vector<frozen_mutation_and_schema>> generate_view_updates(
|
||||
const schema_ptr& base,
|
||||
std::vector<view_ptr>&& views_to_update,
|
||||
std::vector<view_and_base>&& views_to_update,
|
||||
flat_mutation_reader&& updates,
|
||||
flat_mutation_reader_opt&& existings);
|
||||
|
||||
@@ -100,7 +121,7 @@ query::clustering_row_ranges calculate_affected_clustering_ranges(
|
||||
const schema& base,
|
||||
const dht::decorated_key& key,
|
||||
const mutation_partition& mp,
|
||||
const std::vector<view_ptr>& views);
|
||||
const std::vector<view_and_base>& views);
|
||||
|
||||
struct wait_for_all_updates_tag {};
|
||||
using wait_for_all_updates = bool_class<wait_for_all_updates_tag>;
|
||||
@@ -128,6 +149,13 @@ future<> mutate_MV(
|
||||
*/
|
||||
void create_virtual_column(schema_builder& builder, const bytes& name, const data_type& type);
|
||||
|
||||
/**
|
||||
* Converts a collection of view schema snapshots into a collection of
|
||||
* view_and_base objects, which are snapshots of both the view schema
|
||||
* and the base-schema-dependent part of view description.
|
||||
*/
|
||||
std::vector<view_and_base> with_base_info_snapshot(std::vector<view_ptr>);
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -118,7 +118,7 @@ token token::midpoint(const token& t1, const token& t2) {
|
||||
}
|
||||
|
||||
token token::get_random_token() {
|
||||
return {kind::key, dht::get_random_number<int64_t>()};
|
||||
return token(kind::key, dht::get_random_number<uint64_t>());
|
||||
}
|
||||
|
||||
token token::from_sstring(const sstring& t) {
|
||||
|
||||
24
dht/token.hh
24
dht/token.hh
@@ -58,19 +58,27 @@ public:
|
||||
, _data(normalize(d)) { }
|
||||
|
||||
token(kind k, const bytes& b) : _kind(std::move(k)) {
|
||||
if (b.size() != sizeof(_data)) {
|
||||
throw std::runtime_error(fmt::format("Wrong token bytes size: expected {} but got {}", sizeof(_data), b.size()));
|
||||
if (_kind != kind::key) {
|
||||
_data = 0;
|
||||
} else {
|
||||
if (b.size() != sizeof(_data)) {
|
||||
throw std::runtime_error(fmt::format("Wrong token bytes size: expected {} but got {}", sizeof(_data), b.size()));
|
||||
}
|
||||
std::copy_n(b.begin(), sizeof(_data), reinterpret_cast<int8_t *>(&_data));
|
||||
_data = net::ntoh(_data);
|
||||
}
|
||||
std::copy_n(b.begin(), sizeof(_data), reinterpret_cast<int8_t *>(&_data));
|
||||
_data = net::ntoh(_data);
|
||||
}
|
||||
|
||||
token(kind k, bytes_view b) : _kind(std::move(k)) {
|
||||
if (b.size() != sizeof(_data)) {
|
||||
throw std::runtime_error(fmt::format("Wrong token bytes size: expected {} but got {}", sizeof(_data), b.size()));
|
||||
if (_kind != kind::key) {
|
||||
_data = 0;
|
||||
} else {
|
||||
if (b.size() != sizeof(_data)) {
|
||||
throw std::runtime_error(fmt::format("Wrong token bytes size: expected {} but got {}", sizeof(_data), b.size()));
|
||||
}
|
||||
std::copy_n(b.begin(), sizeof(_data), reinterpret_cast<int8_t *>(&_data));
|
||||
_data = net::ntoh(_data);
|
||||
}
|
||||
std::copy_n(b.begin(), sizeof(_data), reinterpret_cast<int8_t *>(&_data));
|
||||
_data = net::ntoh(_data);
|
||||
}
|
||||
|
||||
bool is_minimum() const {
|
||||
|
||||
15
dist/common/scripts/scylla-housekeeping
vendored
15
dist/common/scripts/scylla-housekeeping
vendored
@@ -61,7 +61,15 @@ def sh_command(*args):
|
||||
return out
|
||||
|
||||
def get_url(path):
|
||||
return urllib.request.urlopen(path).read().decode('utf-8')
|
||||
# If server returns any error, like 403, or 500 urllib.request throws exception, which is not serializable.
|
||||
# When multiprocessing routines fail to serialize it, it throws ambiguous serialization exception
|
||||
# from get_json_from_url.
|
||||
# In order to see legit error we catch it from the inside of process, convert to string and
|
||||
# pass it as part of return value
|
||||
try:
|
||||
return 0, urllib.request.urlopen(path).read().decode('utf-8')
|
||||
except Exception as exc:
|
||||
return 1, str(exc)
|
||||
|
||||
def get_json_from_url(path):
|
||||
pool = mp.Pool(processes=1)
|
||||
@@ -71,13 +79,16 @@ def get_json_from_url(path):
|
||||
# to enforce a wallclock timeout.
|
||||
result = pool.apply_async(get_url, args=(path,))
|
||||
try:
|
||||
retval = result.get(timeout=5)
|
||||
status, retval = result.get(timeout=5)
|
||||
except mp.TimeoutError as err:
|
||||
pool.terminate()
|
||||
pool.join()
|
||||
raise
|
||||
if status == 1:
|
||||
raise RuntimeError(f'Failed to get "{path}" due to the following error: {retval}')
|
||||
return json.loads(retval)
|
||||
|
||||
|
||||
def get_api(path):
|
||||
return get_json_from_url("http://" + api_address + path)
|
||||
|
||||
|
||||
1
dist/common/scripts/scylla_fstrim_setup
vendored
1
dist/common/scripts/scylla_fstrim_setup
vendored
@@ -31,5 +31,6 @@ if __name__ == '__main__':
|
||||
sys.exit(1)
|
||||
if is_systemd():
|
||||
systemd_unit('scylla-fstrim.timer').unmask()
|
||||
systemd_unit('scylla-fstrim.timer').enable()
|
||||
if is_redhat_variant():
|
||||
systemd_unit('fstrim.timer').disable()
|
||||
|
||||
3
dist/common/scripts/scylla_setup
vendored
3
dist/common/scripts/scylla_setup
vendored
@@ -371,6 +371,9 @@ if __name__ == '__main__':
|
||||
if not stat.S_ISBLK(os.stat(dsk).st_mode):
|
||||
print('{} is not block device'.format(dsk))
|
||||
continue
|
||||
if dsk in selected:
|
||||
print(f'{dsk} is already added')
|
||||
continue
|
||||
selected.append(dsk)
|
||||
devices.remove(dsk)
|
||||
disks = ','.join(selected)
|
||||
|
||||
24
dist/common/scripts/scylla_util.py
vendored
24
dist/common/scripts/scylla_util.py
vendored
@@ -182,7 +182,7 @@ class aws_instance:
|
||||
instance_size = self.instance_size()
|
||||
if instance_class in ['c3', 'c4', 'd2', 'i2', 'r3']:
|
||||
return 'ixgbevf'
|
||||
if instance_class in ['c5', 'c5d', 'f1', 'g3', 'h1', 'i3', 'i3en', 'm5', 'm5d', 'p2', 'p3', 'r4', 'x1']:
|
||||
if instance_class in ['a1', 'c5', 'c5a', 'c5d', 'c5n', 'c6g', 'c6gd', 'f1', 'g3', 'g4', 'h1', 'i3', 'i3en', 'inf1', 'm5', 'm5a', 'm5ad', 'm5d', 'm5dn', 'm5n', 'm6g', 'm6gd', 'p2', 'p3', 'r4', 'r5', 'r5a', 'r5ad', 'r5d', 'r5dn', 'r5n', 't3', 't3a', 'u-6tb1', 'u-9tb1', 'u-12tb1', 'u-18tn1', 'u-24tb1', 'x1', 'x1e', 'z1d']:
|
||||
return 'ena'
|
||||
if instance_class == 'm4':
|
||||
if instance_size == '16xlarge':
|
||||
@@ -329,7 +329,7 @@ class scylla_cpuinfo:
|
||||
|
||||
# When a CLI tool is not installed, use relocatable CLI tool provided by Scylla
|
||||
scylla_env = os.environ.copy()
|
||||
scylla_env['PATH'] = '{}:{}'.format(scylla_env['PATH'], scyllabindir())
|
||||
scylla_env['PATH'] = '{}:{}'.format(scyllabindir(), scylla_env['PATH'])
|
||||
|
||||
def run(cmd, shell=False, silent=False, exception=True):
|
||||
stdout = subprocess.DEVNULL if silent else None
|
||||
@@ -441,6 +441,19 @@ def dist_ver():
|
||||
return platform.dist()[1]
|
||||
|
||||
|
||||
SYSTEM_PARTITION_UUIDS = [
|
||||
'21686148-6449-6e6f-744e-656564454649', # BIOS boot partition
|
||||
'c12a7328-f81f-11d2-ba4b-00a0c93ec93b', # EFI system partition
|
||||
'024dee41-33e7-11d3-9d69-0008c781f39f' # MBR partition scheme
|
||||
]
|
||||
|
||||
# Returns the output of `lsblk -n -oPARTTYPE <dev>` as produced by the out() helper.
def get_partition_uuid(dev):
|
||||
return out(f'lsblk -n -oPARTTYPE {dev}')
|
||||
|
||||
# True when the partition type reported for dev is one of SYSTEM_PARTITION_UUIDS.
# NOTE(review): the membership test is an exact string match, so this presumably
# relies on out() returning the value without surrounding whitespace - confirm.
def is_system_partition(dev):
|
||||
uuid = get_partition_uuid(dev)
|
||||
return (uuid in SYSTEM_PARTITION_UUIDS)
|
||||
|
||||
def is_unused_disk(dev):
|
||||
# dev is not in /sys/class/block/, like /dev/nvme[0-9]+
|
||||
if not os.path.isdir('/sys/class/block/{dev}'.format(dev=dev.replace('/dev/', ''))):
|
||||
@@ -448,7 +461,8 @@ def is_unused_disk(dev):
|
||||
try:
|
||||
fd = os.open(dev, os.O_EXCL)
|
||||
os.close(fd)
|
||||
return True
|
||||
# dev is not reserved for system
|
||||
return not is_system_partition(dev)
|
||||
except OSError:
|
||||
return False
|
||||
|
||||
@@ -481,8 +495,8 @@ def parse_scylla_dirs_with_default(conf='/etc/scylla/scylla.yaml'):
|
||||
y['data_file_directories'] = [os.path.join(y['workdir'], 'data')]
|
||||
for t in [ "commitlog", "hints", "view_hints", "saved_caches" ]:
|
||||
key = "%s_directory" % t
|
||||
if key not in y or not y[k]:
|
||||
y[k] = os.path.join(y['workdir'], t)
|
||||
if key not in y or not y[key]:
|
||||
y[key] = os.path.join(y['workdir'], t)
|
||||
return y
|
||||
|
||||
|
||||
|
||||
1
dist/debian/control.mustache
vendored
1
dist/debian/control.mustache
vendored
@@ -5,6 +5,7 @@ Section: database
|
||||
Priority: optional
|
||||
X-Python3-Version: >= 3.4
|
||||
Standards-Version: 3.9.5
|
||||
Rules-Requires-Root: no
|
||||
|
||||
Package: {{product}}-conf
|
||||
Architecture: any
|
||||
|
||||
1
dist/debian/python3/control.mustache
vendored
1
dist/debian/python3/control.mustache
vendored
@@ -5,6 +5,7 @@ Section: python
|
||||
Priority: optional
|
||||
X-Python3-Version: >= 3.4
|
||||
Standards-Version: 3.9.5
|
||||
Rules-Requires-Root: no
|
||||
|
||||
Package: {{product}}-python3
|
||||
Architecture: amd64
|
||||
|
||||
1
dist/debian/rules.mustache
vendored
1
dist/debian/rules.mustache
vendored
@@ -37,6 +37,7 @@ override_dh_strip:
|
||||
# The binaries (ethtool...patchelf) don't pass dh_strip after going through patchelf. Since they are
|
||||
# already stripped, nothing is lost if we exclude them, so that's what we do.
|
||||
dh_strip -Xlibprotobuf.so.15 -Xld.so -Xethtool -Xgawk -Xgzip -Xhwloc-calc -Xhwloc-distrib -Xifconfig -Xlscpu -Xnetstat -Xpatchelf --dbg-package={{product}}-server-dbg
|
||||
find $(CURDIR)/debian/{{product}}-server-dbg/usr/lib/debug/.build-id/ -name "*.debug" -exec objcopy --decompress-debug-sections {} \;
|
||||
|
||||
override_dh_makeshlibs:
|
||||
|
||||
|
||||
4
dist/docker/redhat/Dockerfile
vendored
4
dist/docker/redhat/Dockerfile
vendored
@@ -5,8 +5,8 @@ MAINTAINER Avi Kivity <avi@cloudius-systems.com>
|
||||
ENV container docker
|
||||
|
||||
# The SCYLLA_REPO_URL argument specifies the URL to the RPM repository this Docker image uses to install Scylla. The default value is the Scylla's unstable RPM repository, which contains the daily build.
|
||||
ARG SCYLLA_REPO_URL=http://downloads.scylladb.com/rpm/unstable/centos/master/latest/scylla.repo
|
||||
ARG VERSION=666.development
|
||||
ARG SCYLLA_REPO_URL=http://downloads.scylladb.com/rpm/unstable/centos/branch-4.0/latest/scylla.repo
|
||||
ARG VERSION=4.0.*
|
||||
|
||||
ADD scylla_bashrc /scylla_bashrc
|
||||
|
||||
|
||||
@@ -21,10 +21,6 @@ DynamoDB API requests.
|
||||
For example, "`--alternator-port=8000`" on the command line will run
|
||||
Alternator on port 8000 - the traditional port used by DynamoDB.
|
||||
|
||||
Alternator uses Scylla's LWT feature, which is currently considered
|
||||
experimental and needs to be seperately enabled as well, e.g. with the
|
||||
"`--experimental=on`" option.
|
||||
|
||||
By default, Scylla listens on this port on all network interfaces.
|
||||
To listen only on a specific interface, pass also an "`alternator-address`"
|
||||
option.
|
||||
@@ -55,9 +51,8 @@ Alternator's compatibility with DynamoDB, and will be updated as the work
|
||||
progresses and compatibility continues to improve.
|
||||
|
||||
### API Server
|
||||
* Transport: HTTP mostly supported, but small features like CRC header and
|
||||
compression are still missing. HTTPS supported on top of HTTP, so small
|
||||
features may still be missing.
|
||||
* Transport: HTTP and HTTPS are mostly supported, but small features like CRC
|
||||
header and compression are still missing.
|
||||
* Authorization (verifying the originator of the request): implemented
|
||||
on top of system\_auth.roles table. The secret key used for authorization
|
||||
is the salted\_hash column from the roles table, selected with:
|
||||
@@ -65,20 +60,19 @@ progresses and compatibility continues to improve.
|
||||
By default, authorization is not enforced at all. It can be turned on
|
||||
by providing an entry in Scylla configuration:
|
||||
alternator\_enforce\_authorization: true
|
||||
* DNS server for load balancing: Not yet supported. Client needs to pick
|
||||
one of the live Scylla nodes and send a request to it.
|
||||
* Load balancing: Not a part of Alternator. One should use an external load
|
||||
balancer or DNS server to balance the requests between the live Scylla
|
||||
nodes. We plan to publish a reference example soon.
|
||||
### Table Operations
|
||||
* CreateTable: Supported. Note our implementation is synchronous.
|
||||
* CreateTable and DeleteTable: Supported. Note our implementation is synchronous.
|
||||
* DescribeTable: Partial implementation. Missing creation date and size estimate.
|
||||
* UpdateTable: Not supported.
|
||||
* DescribeTable: Partial implementation. Missing creation date and size esitmate.
|
||||
* DeleteTable: Supported. Note our implementation is synchronous.
|
||||
* ListTables: Supported.
|
||||
### Item Operations
|
||||
* GetItem: Support almost complete except that projection expressions can
|
||||
only ask for top-level attributes.
|
||||
* PutItem: Support almost complete except that condition expressions can
|
||||
only refer to top-level attributes.
|
||||
pre-put content) not yet supported.
|
||||
* UpdateItem: Nested documents are supported but updates to nested attributes
|
||||
are not (e.g., `SET a.b[3].c=val`), and neither are nested attributes in
|
||||
condition expressions.
|
||||
@@ -90,15 +84,14 @@ progresses and compatibility continues to improve.
|
||||
* BatchWriteItem: Supported. Doesn't limit the number of items (DynamoDB
|
||||
limits to 25) or size of items (400 KB) or total request size (16 MB).
|
||||
### Scans
|
||||
* Scan: As usual, projection expressions only support top-level attributes.
|
||||
Filter expressions (to filter some of the items) partially supported:
|
||||
The ScanFilter syntax is supported but FilterExpression is not yet, and
|
||||
only equality operator is supported so far.
|
||||
The "Select" options which allows to count items instead of returning them
|
||||
is not yet supported. Parallel scan is not yet supported.
|
||||
* Query: Same issues as Scan above. Additionally, missing support for
|
||||
KeyConditionExpression (an alternative syntax replacing the older
|
||||
KeyConditions parameter which we do support).
|
||||
Scan and Query are mostly supported, with the following limitations:
|
||||
* As above, projection expressions only support top-level attributes.
|
||||
* Filter expressions (to filter some of the items) are only partially
|
||||
supported: The ScanFilter syntax currently only supports the equality
|
||||
operator, and the FilterExpression syntax is not yet supported at all.
|
||||
* The "Select" option, which allows counting items instead of returning them,
|
||||
is not yet supported.
|
||||
* Parallel scan is not yet supported.
|
||||
### Secondary Indexes
|
||||
Global Secondary Indexes (GSI) and Local Secondary Indexes (LSI) are
|
||||
implemented, with the following limitations:
|
||||
@@ -116,24 +109,28 @@ implemented, with the following limitations:
|
||||
Writes are done in LOCAL_QUORUM and reads in LOCAL_ONE (eventual consistency)
|
||||
or LOCAL_QUORUM (strong consistency).
|
||||
### Global Tables
|
||||
* Not yet supported: CreateGlobalTable, UpdateGlobalTable,
|
||||
DescribeGlobalTable, ListGlobalTables, UpdateGlobalTableSettings,
|
||||
DescribeGlobalTableSettings. Implementation will use Scylla's multi-DC
|
||||
features.
|
||||
* Currently, *all* Alternator tables are created as "Global Tables", i.e., can
|
||||
be accessed from all of Scylla's DCs.
|
||||
* We do not yet support the DynamoDB API calls to make some of the tables
|
||||
global and others local to a particular DC: CreateGlobalTable,
|
||||
UpdateGlobalTable, DescribeGlobalTable, ListGlobalTables,
|
||||
UpdateGlobalTableSettings, DescribeGlobalTableSettings, and UpdateTable.
|
||||
### Backup and Restore
|
||||
* On-demand backup: Not yet supported: CreateBackup, DescribeBackup,
|
||||
DeleteBackup, ListBackups, RestoreTableFromBackup. Implementation will
|
||||
use Scylla's snapshots
|
||||
* On-demand backup: the DynamoDB APIs are not yet supported: CreateBackup,
|
||||
DescribeBackup, DeleteBackup, ListBackups, RestoreTableFromBackup.
|
||||
Users can use Scylla's [snapshots](https://docs.scylladb.com/operating-scylla/procedures/backup-restore/)
|
||||
or [Scylla Manager](https://docs.scylladb.com/operating-scylla/manager/2.0/backup/).
|
||||
* Continuous backup: Not yet supported: UpdateContinuousBackups,
|
||||
DescribeContinuousBackups, RestoreTableToPointInTime.
|
||||
### Transations
|
||||
### Transactions
|
||||
* Not yet supported: TransactWriteItems, TransactGetItems.
|
||||
Note that this is a new DynamoDB feature - these are more powerful than
|
||||
the old conditional updates which were "lightweight transactions".
|
||||
### Streams (CDC)
|
||||
* Not yet supported
|
||||
### Streams
|
||||
* Scylla has experimental support for [CDC](https://docs.scylladb.com/using-scylla/cdc/)
|
||||
(change data capture), but the "DynamoDB Streams" API is not yet supported.
|
||||
### Encryption at rest
|
||||
* Supported natively by Scylla, but needs to be enabled by default.
|
||||
* Supported by Scylla Enterprise (not in open-source). Needs to be enabled.
|
||||
### ARNs and tags
|
||||
* ARN is generated for every alternator table
|
||||
* Tagging can be used with the help of the following requests:
|
||||
@@ -166,7 +163,9 @@ implemented, with the following limitations:
|
||||
* Not required. Scylla cache is rather advanced and there is no need to place
|
||||
a cache in front of the database: https://www.scylladb.com/2017/07/31/database-caches-not-good/
|
||||
### Metrics
|
||||
* Several metrics are available through the Grafana/Promethues stack: https://docs.scylladb.com/operating-scylla/monitoring/ It is different than the expectations of the current DynamoDB implementation. However, our
|
||||
* Several metrics are available through the Grafana/Prometheus stack:
|
||||
https://docs.scylladb.com/operating-scylla/monitoring/
|
||||
Those are different from the current DynamoDB metrics, but Scylla's
|
||||
monitoring is rather advanced and provides more insight into the internals.
|
||||
|
||||
## Alternator design and implementation
|
||||
@@ -229,8 +228,3 @@ one DynamoDB feature which we cannot support safely: we cannot modify
|
||||
a non-top-level attribute (e.g., a.b[3].c) directly without RMW. We plan
|
||||
to fix this in a future version by rethinking the data model we use for
|
||||
attributes, or rethinking our implementation of RMW (as explained above).
|
||||
|
||||
For reasons explained above, the data model used by Alternator to store
|
||||
data on disk is still in a state of flux, and may change in future versions.
|
||||
Therefore, in this early stage it is not recommended to store important
|
||||
production data using Alternator.
|
||||
|
||||
@@ -10,12 +10,10 @@ This section will guide you through the steps for setting up the cluster:
|
||||
nightly image by running: `docker pull scylladb/scylla-nightly:latest`
|
||||
2. Follow the steps in the [Scylla official download web page](https://www.scylladb.com/download/open-source/#docker)
|
||||
add to every "docker run" command: `-p 8000:8000` before the image name
|
||||
and `--alternator-port=8000 --experimental 1` at the end. The
|
||||
"alternator-port" option specifies on which port Scylla will listen for
|
||||
the (unencrypted) DynamoDB API, and "--experimental 1" is required to
|
||||
enable the experimental LWT feature which Alternator uses.
|
||||
and `--alternator-port=8000` at the end. The "alternator-port" option
|
||||
specifies on which port Scylla will listen for the (unencrypted) DynamoDB API.
|
||||
For example,
|
||||
`docker run --name scylla -d -p 8000:8000 scylladb/scylla-nightly:latest --alternator-port=8000 --experimental 1`
|
||||
`docker run --name scylla -d -p 8000:8000 scylladb/scylla-nightly:latest --alternator-port=8000`
|
||||
|
||||
## Testing Scylla's DynamoDB API support:
|
||||
### Running AWS Tic Tac Toe demo app to test the cluster:
|
||||
|
||||
@@ -76,6 +76,9 @@ Scylla with issue #4139 fixed)
|
||||
bit 4: CorrectEmptyCounters (if set, indicates the sstable was generated by
|
||||
Scylla with issue #4363 fixed)
|
||||
|
||||
bit 5: CorrectUDTsInCollections (if set, indicates that the sstable was generated
|
||||
by Scylla with issue #6130 fixed)
|
||||
|
||||
## extension_attributes subcomponent
|
||||
|
||||
extension_attributes = extension_attribute_count extension_attribute*
|
||||
|
||||
@@ -56,22 +56,22 @@ public:
|
||||
/**
|
||||
* The unrecognized entity.
|
||||
*/
|
||||
::shared_ptr<cql3::column_identifier> entity;
|
||||
cql3::column_identifier entity;
|
||||
|
||||
/**
|
||||
* The entity relation.
|
||||
* The entity relation in a stringified form.
|
||||
*/
|
||||
cql3::relation_ptr relation;
|
||||
sstring relation_str;
|
||||
|
||||
/**
|
||||
* Creates a new <code>UnrecognizedEntityException</code>.
|
||||
* @param entity the unrecognized entity
|
||||
* @param relation the entity relation
|
||||
* @param relation_str the entity relation string
|
||||
*/
|
||||
unrecognized_entity_exception(::shared_ptr<cql3::column_identifier> entity, cql3::relation_ptr relation)
|
||||
: invalid_request_exception(format("Undefined name {} in where clause ('{}')", *entity, relation->to_string()))
|
||||
, entity(entity)
|
||||
, relation(relation)
|
||||
unrecognized_entity_exception(cql3::column_identifier entity, sstring relation_str)
|
||||
: invalid_request_exception(format("Undefined name {} in where clause ('{}')", entity, relation_str))
|
||||
, entity(std::move(entity))
|
||||
, relation_str(std::move(relation_str))
|
||||
{ }
|
||||
};
|
||||
|
||||
|
||||
@@ -487,6 +487,9 @@ public:
|
||||
size_t buffer_size() const {
|
||||
return _impl->buffer_size();
|
||||
}
|
||||
const circular_buffer<mutation_fragment>& buffer() const {
|
||||
return _impl->buffer();
|
||||
}
|
||||
// Detach the internal buffer of the reader.
|
||||
// Roughly equivalent to depleting it by calling pop_mutation_fragment()
|
||||
// until is_buffer_empty() returns true.
|
||||
|
||||
@@ -110,10 +110,6 @@ feature_config feature_config_from_db_config(db::config& cfg) {
|
||||
fcfg.enable_cdc = true;
|
||||
}
|
||||
|
||||
if (cfg.check_experimental(db::experimental_features_t::LWT)) {
|
||||
fcfg.enable_lwt = true;
|
||||
}
|
||||
|
||||
return fcfg;
|
||||
}
|
||||
|
||||
@@ -178,9 +174,7 @@ std::set<std::string_view> feature_service::known_feature_set() {
|
||||
if (_config.enable_cdc) {
|
||||
features.insert(gms::features::CDC);
|
||||
}
|
||||
if (_config.enable_lwt) {
|
||||
features.insert(gms::features::LWT);
|
||||
}
|
||||
features.insert(gms::features::LWT);
|
||||
|
||||
for (const sstring& s : _config.disabled_features) {
|
||||
features.erase(s);
|
||||
|
||||
@@ -41,7 +41,6 @@ struct feature_config {
|
||||
bool enable_sstables_mc_format = false;
|
||||
bool enable_user_defined_functions = false;
|
||||
bool enable_cdc = false;
|
||||
bool enable_lwt = false;
|
||||
std::set<sstring> disabled_features;
|
||||
feature_config();
|
||||
};
|
||||
|
||||
@@ -428,6 +428,7 @@ future<> gossiper::handle_shutdown_msg(inet_address from) {
|
||||
return make_ready_future<>();
|
||||
}
|
||||
return seastar::async([this, from] {
|
||||
auto permit = this->lock_endpoint(from).get0();
|
||||
this->mark_as_shutdown(from);
|
||||
});
|
||||
}
|
||||
@@ -632,7 +633,7 @@ void gossiper::remove_endpoint(inet_address endpoint) {
|
||||
// We can not run on_remove callbacks here because on_remove in
|
||||
// storage_service might take the gossiper::timer_callback_lock
|
||||
(void)seastar::async([this, endpoint] {
|
||||
_subscribers.for_each([endpoint] (auto& subscriber) {
|
||||
_subscribers.for_each([endpoint] (shared_ptr<i_endpoint_state_change_subscriber> subscriber) {
|
||||
subscriber->on_remove(endpoint);
|
||||
});
|
||||
}).handle_exception([] (auto ep) {
|
||||
@@ -1464,7 +1465,7 @@ void gossiper::real_mark_alive(inet_address addr, endpoint_state& local_state) {
|
||||
logger.info("InetAddress {} is now UP, status = {}", addr, status);
|
||||
}
|
||||
|
||||
_subscribers.for_each([addr, local_state] (auto& subscriber) {
|
||||
_subscribers.for_each([addr, local_state] (shared_ptr<i_endpoint_state_change_subscriber> subscriber) {
|
||||
subscriber->on_alive(addr, local_state);
|
||||
logger.trace("Notified {}", subscriber.get());
|
||||
});
|
||||
@@ -1478,7 +1479,7 @@ void gossiper::mark_dead(inet_address addr, endpoint_state& local_state) {
|
||||
_live_endpoints_just_added.remove(addr);
|
||||
_unreachable_endpoints[addr] = now();
|
||||
logger.info("InetAddress {} is now DOWN, status = {}", addr, get_gossip_status(local_state));
|
||||
_subscribers.for_each([addr, local_state] (auto& subscriber) {
|
||||
_subscribers.for_each([addr, local_state] (shared_ptr<i_endpoint_state_change_subscriber> subscriber) {
|
||||
subscriber->on_dead(addr, local_state);
|
||||
logger.trace("Notified {}", subscriber.get());
|
||||
});
|
||||
@@ -1510,7 +1511,7 @@ void gossiper::handle_major_state_change(inet_address ep, const endpoint_state&
|
||||
|
||||
if (eps_old) {
|
||||
// the node restarted: it is up to the subscriber to take whatever action is necessary
|
||||
_subscribers.for_each([ep, eps_old] (auto& subscriber) {
|
||||
_subscribers.for_each([ep, eps_old] (shared_ptr<i_endpoint_state_change_subscriber> subscriber) {
|
||||
subscriber->on_restart(ep, *eps_old);
|
||||
});
|
||||
}
|
||||
@@ -1525,7 +1526,7 @@ void gossiper::handle_major_state_change(inet_address ep, const endpoint_state&
|
||||
|
||||
auto* eps_new = get_endpoint_state_for_endpoint_ptr(ep);
|
||||
if (eps_new) {
|
||||
_subscribers.for_each([ep, eps_new] (auto& subscriber) {
|
||||
_subscribers.for_each([ep, eps_new] (shared_ptr<i_endpoint_state_change_subscriber> subscriber) {
|
||||
subscriber->on_join(ep, *eps_new);
|
||||
});
|
||||
}
|
||||
@@ -1618,14 +1619,14 @@ void gossiper::apply_new_states(inet_address addr, endpoint_state& local_state,
|
||||
|
||||
// Runs inside seastar::async context
|
||||
void gossiper::do_before_change_notifications(inet_address addr, const endpoint_state& ep_state, const application_state& ap_state, const versioned_value& new_value) {
|
||||
_subscribers.for_each([addr, ep_state, ap_state, new_value] (auto& subscriber) {
|
||||
_subscribers.for_each([addr, ep_state, ap_state, new_value] (shared_ptr<i_endpoint_state_change_subscriber> subscriber) {
|
||||
subscriber->before_change(addr, ep_state, ap_state, new_value);
|
||||
});
|
||||
}
|
||||
|
||||
// Runs inside seastar::async context
|
||||
void gossiper::do_on_change_notifications(inet_address addr, const application_state& state, const versioned_value& value) {
|
||||
_subscribers.for_each([addr, state, value] (auto& subscriber) {
|
||||
_subscribers.for_each([addr, state, value] (shared_ptr<i_endpoint_state_change_subscriber> subscriber) {
|
||||
subscriber->on_change(addr, state, value);
|
||||
});
|
||||
}
|
||||
@@ -1725,8 +1726,12 @@ future<> gossiper::start_gossiping(int generation_nbr, std::map<application_stat
|
||||
// message on all cpus and forward them to cpu0 to process.
|
||||
return get_gossiper().invoke_on_all([do_bind] (gossiper& g) {
|
||||
g.init_messaging_service_handler(do_bind);
|
||||
}).then([this, generation_nbr, preload_local_states] {
|
||||
}).then([this, generation_nbr, preload_local_states] () mutable {
|
||||
build_seeds_list();
|
||||
if (_cfg.force_gossip_generation() > 0) {
|
||||
generation_nbr = _cfg.force_gossip_generation();
|
||||
logger.warn("Use the generation number provided by user: generation = {}", generation_nbr);
|
||||
}
|
||||
endpoint_state& local_state = endpoint_state_map[get_broadcast_address()];
|
||||
local_state.set_heart_beat_state_and_update_timestamp(heart_beat_state(generation_nbr));
|
||||
local_state.mark_alive();
|
||||
|
||||
@@ -591,6 +591,7 @@ public:
|
||||
std::map<sstring, sstring> get_simple_states();
|
||||
int get_down_endpoint_count();
|
||||
int get_up_endpoint_count();
|
||||
int get_all_endpoint_count();
|
||||
sstring get_endpoint_state(sstring address);
|
||||
failure_detector& fd() { return _fd; }
|
||||
};
|
||||
@@ -637,6 +638,12 @@ inline future<int> get_up_endpoint_count() {
|
||||
});
|
||||
}
|
||||
|
||||
inline future<int> get_all_endpoint_count() {
|
||||
return smp::submit_to(0, [] {
|
||||
return static_cast<int>(get_local_gossiper().get_endpoint_states().size());
|
||||
});
|
||||
}
|
||||
|
||||
inline future<> set_phi_convict_threshold(double phi) {
|
||||
return smp::submit_to(0, [phi] {
|
||||
get_local_gossiper().fd().set_phi_convict_threshold(phi);
|
||||
|
||||
@@ -69,7 +69,8 @@ std::ostream& gms::operator<<(std::ostream& os, const inet_address& x) {
|
||||
auto&& bytes = x.bytes();
|
||||
auto i = 0u;
|
||||
auto acc = 0u;
|
||||
for (auto b : bytes) {
|
||||
// extra paranoid sign extension evasion - #5808
|
||||
for (uint8_t b : bytes) {
|
||||
acc <<= 8;
|
||||
acc |= b;
|
||||
if ((++i & 1) == 0) {
|
||||
|
||||
@@ -76,6 +76,8 @@ fedora_packages=(
|
||||
python3-psutil
|
||||
python3-cassandra-driver
|
||||
python3-colorama
|
||||
python3-boto3
|
||||
python3-pytest
|
||||
dnf-utils
|
||||
pigz
|
||||
net-tools
|
||||
|
||||
@@ -126,6 +126,7 @@ relocate_python3() {
|
||||
cp "$script" "$relocateddir"
|
||||
cat > "$install"<<EOF
|
||||
#!/usr/bin/env bash
|
||||
export LC_ALL=en_US.UTF-8
|
||||
x="\$(readlink -f "\$0")"
|
||||
b="\$(basename "\$x")"
|
||||
d="\$(dirname "\$x")"
|
||||
|
||||
203
licenses/abseil-license.txt
Normal file
203
licenses/abseil-license.txt
Normal file
@@ -0,0 +1,203 @@
|
||||
|
||||
Apache License
|
||||
Version 2.0, January 2004
|
||||
https://www.apache.org/licenses/
|
||||
|
||||
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
||||
|
||||
1. Definitions.
|
||||
|
||||
"License" shall mean the terms and conditions for use, reproduction,
|
||||
and distribution as defined by Sections 1 through 9 of this document.
|
||||
|
||||
"Licensor" shall mean the copyright owner or entity authorized by
|
||||
the copyright owner that is granting the License.
|
||||
|
||||
"Legal Entity" shall mean the union of the acting entity and all
|
||||
other entities that control, are controlled by, or are under common
|
||||
control with that entity. For the purposes of this definition,
|
||||
"control" means (i) the power, direct or indirect, to cause the
|
||||
direction or management of such entity, whether by contract or
|
||||
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
||||
outstanding shares, or (iii) beneficial ownership of such entity.
|
||||
|
||||
"You" (or "Your") shall mean an individual or Legal Entity
|
||||
exercising permissions granted by this License.
|
||||
|
||||
"Source" form shall mean the preferred form for making modifications,
|
||||
including but not limited to software source code, documentation
|
||||
source, and configuration files.
|
||||
|
||||
"Object" form shall mean any form resulting from mechanical
|
||||
transformation or translation of a Source form, including but
|
||||
not limited to compiled object code, generated documentation,
|
||||
and conversions to other media types.
|
||||
|
||||
"Work" shall mean the work of authorship, whether in Source or
|
||||
Object form, made available under the License, as indicated by a
|
||||
copyright notice that is included in or attached to the work
|
||||
(an example is provided in the Appendix below).
|
||||
|
||||
"Derivative Works" shall mean any work, whether in Source or Object
|
||||
form, that is based on (or derived from) the Work and for which the
|
||||
editorial revisions, annotations, elaborations, or other modifications
|
||||
represent, as a whole, an original work of authorship. For the purposes
|
||||
of this License, Derivative Works shall not include works that remain
|
||||
separable from, or merely link (or bind by name) to the interfaces of,
|
||||
the Work and Derivative Works thereof.
|
||||
|
||||
"Contribution" shall mean any work of authorship, including
|
||||
the original version of the Work and any modifications or additions
|
||||
to that Work or Derivative Works thereof, that is intentionally
|
||||
submitted to Licensor for inclusion in the Work by the copyright owner
|
||||
or by an individual or Legal Entity authorized to submit on behalf of
|
||||
the copyright owner. For the purposes of this definition, "submitted"
|
||||
means any form of electronic, verbal, or written communication sent
|
||||
to the Licensor or its representatives, including but not limited to
|
||||
communication on electronic mailing lists, source code control systems,
|
||||
and issue tracking systems that are managed by, or on behalf of, the
|
||||
Licensor for the purpose of discussing and improving the Work, but
|
||||
excluding communication that is conspicuously marked or otherwise
|
||||
designated in writing by the copyright owner as "Not a Contribution."
|
||||
|
||||
"Contributor" shall mean Licensor and any individual or Legal Entity
|
||||
on behalf of whom a Contribution has been received by Licensor and
|
||||
subsequently incorporated within the Work.
|
||||
|
||||
2. Grant of Copyright License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
copyright license to reproduce, prepare Derivative Works of,
|
||||
publicly display, publicly perform, sublicense, and distribute the
|
||||
Work and such Derivative Works in Source or Object form.
|
||||
|
||||
3. Grant of Patent License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
(except as stated in this section) patent license to make, have made,
|
||||
use, offer to sell, sell, import, and otherwise transfer the Work,
|
||||
where such license applies only to those patent claims licensable
|
||||
by such Contributor that are necessarily infringed by their
|
||||
Contribution(s) alone or by combination of their Contribution(s)
|
||||
with the Work to which such Contribution(s) was submitted. If You
|
||||
institute patent litigation against any entity (including a
|
||||
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
||||
or a Contribution incorporated within the Work constitutes direct
|
||||
or contributory patent infringement, then any patent licenses
|
||||
granted to You under this License for that Work shall terminate
|
||||
as of the date such litigation is filed.
|
||||
|
||||
4. Redistribution. You may reproduce and distribute copies of the
|
||||
Work or Derivative Works thereof in any medium, with or without
|
||||
modifications, and in Source or Object form, provided that You
|
||||
meet the following conditions:
|
||||
|
||||
(a) You must give any other recipients of the Work or
|
||||
Derivative Works a copy of this License; and
|
||||
|
||||
(b) You must cause any modified files to carry prominent notices
|
||||
stating that You changed the files; and
|
||||
|
||||
(c) You must retain, in the Source form of any Derivative Works
|
||||
that You distribute, all copyright, patent, trademark, and
|
||||
attribution notices from the Source form of the Work,
|
||||
excluding those notices that do not pertain to any part of
|
||||
the Derivative Works; and
|
||||
|
||||
(d) If the Work includes a "NOTICE" text file as part of its
|
||||
distribution, then any Derivative Works that You distribute must
|
||||
include a readable copy of the attribution notices contained
|
||||
within such NOTICE file, excluding those notices that do not
|
||||
pertain to any part of the Derivative Works, in at least one
|
||||
of the following places: within a NOTICE text file distributed
|
||||
as part of the Derivative Works; within the Source form or
|
||||
documentation, if provided along with the Derivative Works; or,
|
||||
within a display generated by the Derivative Works, if and
|
||||
wherever such third-party notices normally appear. The contents
|
||||
of the NOTICE file are for informational purposes only and
|
||||
do not modify the License. You may add Your own attribution
|
||||
notices within Derivative Works that You distribute, alongside
|
||||
or as an addendum to the NOTICE text from the Work, provided
|
||||
that such additional attribution notices cannot be construed
|
||||
as modifying the License.
|
||||
|
||||
You may add Your own copyright statement to Your modifications and
|
||||
may provide additional or different license terms and conditions
|
||||
for use, reproduction, or distribution of Your modifications, or
|
||||
for any such Derivative Works as a whole, provided Your use,
|
||||
reproduction, and distribution of the Work otherwise complies with
|
||||
the conditions stated in this License.
|
||||
|
||||
5. Submission of Contributions. Unless You explicitly state otherwise,
|
||||
any Contribution intentionally submitted for inclusion in the Work
|
||||
by You to the Licensor shall be under the terms and conditions of
|
||||
this License, without any additional terms or conditions.
|
||||
Notwithstanding the above, nothing herein shall supersede or modify
|
||||
the terms of any separate license agreement you may have executed
|
||||
with Licensor regarding such Contributions.
|
||||
|
||||
6. Trademarks. This License does not grant permission to use the trade
|
||||
names, trademarks, service marks, or product names of the Licensor,
|
||||
except as required for reasonable and customary use in describing the
|
||||
origin of the Work and reproducing the content of the NOTICE file.
|
||||
|
||||
7. Disclaimer of Warranty. Unless required by applicable law or
|
||||
agreed to in writing, Licensor provides the Work (and each
|
||||
Contributor provides its Contributions) on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
||||
implied, including, without limitation, any warranties or conditions
|
||||
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
||||
PARTICULAR PURPOSE. You are solely responsible for determining the
|
||||
appropriateness of using or redistributing the Work and assume any
|
||||
risks associated with Your exercise of permissions under this License.
|
||||
|
||||
8. Limitation of Liability. In no event and under no legal theory,
|
||||
whether in tort (including negligence), contract, or otherwise,
|
||||
unless required by applicable law (such as deliberate and grossly
|
||||
negligent acts) or agreed to in writing, shall any Contributor be
|
||||
liable to You for damages, including any direct, indirect, special,
|
||||
incidental, or consequential damages of any character arising as a
|
||||
result of this License or out of the use or inability to use the
|
||||
Work (including but not limited to damages for loss of goodwill,
|
||||
work stoppage, computer failure or malfunction, or any and all
|
||||
other commercial damages or losses), even if such Contributor
|
||||
has been advised of the possibility of such damages.
|
||||
|
||||
9. Accepting Warranty or Additional Liability. While redistributing
|
||||
the Work or Derivative Works thereof, You may choose to offer,
|
||||
and charge a fee for, acceptance of support, warranty, indemnity,
|
||||
or other liability obligations and/or rights consistent with this
|
||||
License. However, in accepting such obligations, You may act only
|
||||
on Your own behalf and on Your sole responsibility, not on behalf
|
||||
of any other Contributor, and only if You agree to indemnify,
|
||||
defend, and hold each Contributor harmless for any liability
|
||||
incurred by, or claims asserted against, such Contributor by reason
|
||||
of your accepting any such warranty or additional liability.
|
||||
|
||||
END OF TERMS AND CONDITIONS
|
||||
|
||||
APPENDIX: How to apply the Apache License to your work.
|
||||
|
||||
To apply the Apache License to your work, attach the following
|
||||
boilerplate notice, with the fields enclosed by brackets "[]"
|
||||
replaced with your own identifying information. (Don't include
|
||||
the brackets!) The text should be enclosed in the appropriate
|
||||
comment syntax for the file format. We also recommend that a
|
||||
file or class name and description of purpose be included on the
|
||||
same "printed page" as the copyright notice for easier
|
||||
identification within third-party archives.
|
||||
|
||||
Copyright [yyyy] [name of copyright owner]
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
https://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
|
||||
@@ -144,10 +144,33 @@ insert_token_range_to_sorted_container_while_unwrapping(
|
||||
|
||||
dht::token_range_vector
|
||||
abstract_replication_strategy::get_ranges(inet_address ep) const {
|
||||
return do_get_ranges(ep, _token_metadata, false);
|
||||
}
|
||||
|
||||
dht::token_range_vector
|
||||
abstract_replication_strategy::get_ranges_in_thread(inet_address ep) const {
|
||||
return do_get_ranges(ep, _token_metadata, true);
|
||||
}
|
||||
|
||||
dht::token_range_vector
|
||||
abstract_replication_strategy::get_ranges(inet_address ep, token_metadata& tm) const {
|
||||
return do_get_ranges(ep, tm, false);
|
||||
}
|
||||
|
||||
dht::token_range_vector
|
||||
abstract_replication_strategy::get_ranges_in_thread(inet_address ep, token_metadata& tm) const {
|
||||
return do_get_ranges(ep, tm, true);
|
||||
}
|
||||
|
||||
dht::token_range_vector
|
||||
abstract_replication_strategy::do_get_ranges(inet_address ep, token_metadata& tm, bool can_yield) const {
|
||||
dht::token_range_vector ret;
|
||||
auto prev_tok = _token_metadata.sorted_tokens().back();
|
||||
for (auto tok : _token_metadata.sorted_tokens()) {
|
||||
for (inet_address a : calculate_natural_endpoints(tok, _token_metadata)) {
|
||||
auto prev_tok = tm.sorted_tokens().back();
|
||||
for (auto tok : tm.sorted_tokens()) {
|
||||
for (inet_address a : calculate_natural_endpoints(tok, tm)) {
|
||||
if (can_yield) {
|
||||
seastar::thread::maybe_yield();
|
||||
}
|
||||
if (a == ep) {
|
||||
insert_token_range_to_sorted_container_while_unwrapping(prev_tok, tok, ret);
|
||||
break;
|
||||
|
||||
@@ -106,6 +106,15 @@ public:
|
||||
// It is the analogue of Origin's getAddressRanges().get(endpoint).
|
||||
// This function is not efficient, and not meant for the fast path.
|
||||
dht::token_range_vector get_ranges(inet_address ep) const;
|
||||
dht::token_range_vector get_ranges_in_thread(inet_address ep) const;
|
||||
|
||||
// Use the token_metadata provided by the caller instead of _token_metadata
|
||||
dht::token_range_vector get_ranges(inet_address ep, token_metadata& tm) const;
|
||||
dht::token_range_vector get_ranges_in_thread(inet_address ep, token_metadata& tm) const;
|
||||
private:
|
||||
dht::token_range_vector do_get_ranges(inet_address ep, token_metadata& tm, bool can_yield) const;
|
||||
|
||||
public:
|
||||
// get_primary_ranges() returns the list of "primary ranges" for the given
|
||||
// endpoint. "Primary ranges" are the ranges that the node is responsible
|
||||
// for storing replica primarily, which means this is the first node
|
||||
|
||||
10
lua.cc
10
lua.cc
@@ -264,14 +264,12 @@ static auto visit_lua_raw_value(lua_State* l, int index, Func&& f) {
|
||||
|
||||
template <typename Func>
|
||||
static auto visit_decimal(const big_decimal &v, Func&& f) {
|
||||
boost::multiprecision::cpp_int ten(10);
|
||||
const auto& dividend = v.unscaled_value();
|
||||
auto divisor = boost::multiprecision::pow(ten, v.scale());
|
||||
boost::multiprecision::cpp_rational r = v.as_rational();
|
||||
const boost::multiprecision::cpp_int& dividend = numerator(r);
|
||||
const boost::multiprecision::cpp_int& divisor = denominator(r);
|
||||
if (dividend % divisor == 0) {
|
||||
return f(utils::multiprecision_int(boost::multiprecision::cpp_int(dividend/divisor)));
|
||||
return f(utils::multiprecision_int(dividend/divisor));
|
||||
}
|
||||
boost::multiprecision::cpp_rational r = dividend;
|
||||
r /= divisor;
|
||||
return f(r.convert_to<double>());
|
||||
}
|
||||
|
||||
|
||||
40
main.cc
40
main.cc
@@ -546,9 +546,13 @@ int main(int ac, char** av) {
|
||||
gms::feature_config fcfg = gms::feature_config_from_db_config(*cfg);
|
||||
|
||||
feature_service.start(fcfg).get();
|
||||
auto stop_feature_service = defer_verbose_shutdown("feature service", [&feature_service] {
|
||||
feature_service.stop().get();
|
||||
});
|
||||
// FIXME storage_proxy holds a reference on it and is not yet stopped.
|
||||
// also the proxy leaves range_slice_read_executor-s hanging around
|
||||
// and willing to find out if the cluster_supports_digest_multipartition_reads
|
||||
//
|
||||
//auto stop_feature_service = defer_verbose_shutdown("feature service", [&feature_service] {
|
||||
// feature_service.stop().get();
|
||||
//});
|
||||
|
||||
schema::set_default_partitioner(cfg->partitioner(), cfg->murmur3_partitioner_ignore_msb_bits());
|
||||
auto make_sched_group = [&] (sstring name, unsigned shares) {
|
||||
@@ -662,9 +666,17 @@ int main(int ac, char** av) {
|
||||
|
||||
supervisor::notify("starting tokens manager");
|
||||
token_metadata.start().get();
|
||||
auto stop_token_metadata = defer_verbose_shutdown("token metadata", [ &token_metadata ] {
|
||||
token_metadata.stop().get();
|
||||
});
|
||||
// storage_proxy holds a reference on it and is not yet stopped.
|
||||
// what's worse is that the calltrace
|
||||
// storage_proxy::do_query
|
||||
// ::query_partition_key_range
|
||||
// ::query_partition_key_range_concurrent
|
||||
// leaves unwaited futures on the reactor and once it gets there
|
||||
// the token_metadata instance is accessed and ...
|
||||
//
|
||||
//auto stop_token_metadata = defer_verbose_shutdown("token metadata", [ &token_metadata ] {
|
||||
// token_metadata.stop().get();
|
||||
//});
|
||||
|
||||
supervisor::notify("starting migration manager notifier");
|
||||
mm_notifier.start().get();
|
||||
@@ -935,12 +947,16 @@ int main(int ac, char** av) {
|
||||
mm.init_messaging_service();
|
||||
}).get();
|
||||
supervisor::notify("initializing storage proxy RPC verbs");
|
||||
proxy.invoke_on_all([] (service::storage_proxy& p) {
|
||||
p.init_messaging_service();
|
||||
}).get();
|
||||
proxy.invoke_on_all(&service::storage_proxy::init_messaging_service).get();
|
||||
auto stop_proxy_handlers = defer_verbose_shutdown("storage proxy RPC verbs", [&proxy] {
|
||||
proxy.invoke_on_all(&service::storage_proxy::uninit_messaging_service).get();
|
||||
});
|
||||
|
||||
supervisor::notify("starting streaming service");
|
||||
streaming::stream_session::init_streaming_service(db, sys_dist_ks, view_update_generator).get();
|
||||
auto stop_streaming_service = defer_verbose_shutdown("streaming service", [] {
|
||||
streaming::stream_session::uninit_streaming_service().get();
|
||||
});
|
||||
api::set_server_stream_manager(ctx).get();
|
||||
|
||||
supervisor::notify("starting hinted handoff manager");
|
||||
@@ -973,6 +989,9 @@ int main(int ac, char** av) {
|
||||
rs.stop().get();
|
||||
});
|
||||
repair_init_messaging_service_handler(rs, sys_dist_ks, view_update_generator).get();
|
||||
auto stop_repair_messages = defer_verbose_shutdown("repair message handlers", [] {
|
||||
repair_uninit_messaging_service_handler().get();
|
||||
});
|
||||
supervisor::notify("starting storage service", true);
|
||||
auto& ss = service::get_local_storage_service();
|
||||
ss.init_messaging_service_part().get();
|
||||
@@ -1071,9 +1090,6 @@ int main(int ac, char** av) {
|
||||
static sharded<alternator::executor> alternator_executor;
|
||||
static sharded<alternator::server> alternator_server;
|
||||
|
||||
if (!cfg->check_experimental(db::experimental_features_t::LWT)) {
|
||||
throw std::runtime_error("Alternator enabled, but needs experimental LWT feature which wasn't enabled");
|
||||
}
|
||||
net::inet_address addr;
|
||||
try {
|
||||
addr = net::dns::get_host_by_name(cfg->alternator_address(), family).get0().addr_list.front();
|
||||
|
||||
@@ -452,6 +452,7 @@ static constexpr unsigned do_get_rpc_client_idx(messaging_verb verb) {
|
||||
case messaging_verb::PAXOS_PREPARE:
|
||||
case messaging_verb::PAXOS_ACCEPT:
|
||||
case messaging_verb::PAXOS_LEARN:
|
||||
case messaging_verb::PAXOS_PRUNE:
|
||||
return 0;
|
||||
// GET_SCHEMA_VERSION is sent from read/mutate verbs so should be
|
||||
// sent on a different connection to avoid potential deadlocks
|
||||
@@ -717,6 +718,10 @@ void messaging_service::register_stream_mutation_fragments(std::function<future<
|
||||
register_handler(this, messaging_verb::STREAM_MUTATION_FRAGMENTS, std::move(func));
|
||||
}
|
||||
|
||||
future<> messaging_service::unregister_stream_mutation_fragments() {
|
||||
return unregister_handler(messaging_verb::STREAM_MUTATION_FRAGMENTS);
|
||||
}
|
||||
|
||||
template<class SinkType, class SourceType>
|
||||
future<rpc::sink<SinkType>, rpc::source<SourceType>>
|
||||
do_make_sink_source(messaging_verb verb, uint32_t repair_meta_id, shared_ptr<messaging_service::rpc_protocol_client_wrapper> rpc_client, std::unique_ptr<messaging_service::rpc_protocol_wrapper>& rpc) {
|
||||
@@ -748,6 +753,9 @@ rpc::sink<repair_row_on_wire_with_cmd> messaging_service::make_sink_for_repair_g
|
||||
void messaging_service::register_repair_get_row_diff_with_rpc_stream(std::function<future<rpc::sink<repair_row_on_wire_with_cmd>> (const rpc::client_info& cinfo, uint32_t repair_meta_id, rpc::source<repair_hash_with_cmd> source)>&& func) {
|
||||
register_handler(this, messaging_verb::REPAIR_GET_ROW_DIFF_WITH_RPC_STREAM, std::move(func));
|
||||
}
|
||||
future<> messaging_service::unregister_repair_get_row_diff_with_rpc_stream() {
|
||||
return unregister_handler(messaging_verb::REPAIR_GET_ROW_DIFF_WITH_RPC_STREAM);
|
||||
}
|
||||
|
||||
// Wrapper for REPAIR_PUT_ROW_DIFF_WITH_RPC_STREAM
|
||||
future<rpc::sink<repair_row_on_wire_with_cmd>, rpc::source<repair_stream_cmd>>
|
||||
@@ -767,6 +775,9 @@ rpc::sink<repair_stream_cmd> messaging_service::make_sink_for_repair_put_row_dif
|
||||
void messaging_service::register_repair_put_row_diff_with_rpc_stream(std::function<future<rpc::sink<repair_stream_cmd>> (const rpc::client_info& cinfo, uint32_t repair_meta_id, rpc::source<repair_row_on_wire_with_cmd> source)>&& func) {
|
||||
register_handler(this, messaging_verb::REPAIR_PUT_ROW_DIFF_WITH_RPC_STREAM, std::move(func));
|
||||
}
|
||||
future<> messaging_service::unregister_repair_put_row_diff_with_rpc_stream() {
|
||||
return unregister_handler(messaging_verb::REPAIR_PUT_ROW_DIFF_WITH_RPC_STREAM);
|
||||
}
|
||||
|
||||
// Wrapper for REPAIR_GET_FULL_ROW_HASHES_WITH_RPC_STREAM
|
||||
future<rpc::sink<repair_stream_cmd>, rpc::source<repair_hash_with_cmd>>
|
||||
@@ -786,6 +797,9 @@ rpc::sink<repair_hash_with_cmd> messaging_service::make_sink_for_repair_get_full
|
||||
void messaging_service::register_repair_get_full_row_hashes_with_rpc_stream(std::function<future<rpc::sink<repair_hash_with_cmd>> (const rpc::client_info& cinfo, uint32_t repair_meta_id, rpc::source<repair_stream_cmd> source)>&& func) {
|
||||
register_handler(this, messaging_verb::REPAIR_GET_FULL_ROW_HASHES_WITH_RPC_STREAM, std::move(func));
|
||||
}
|
||||
future<> messaging_service::unregister_repair_get_full_row_hashes_with_rpc_stream() {
|
||||
return unregister_handler(messaging_verb::REPAIR_GET_FULL_ROW_HASHES_WITH_RPC_STREAM);
|
||||
}
|
||||
|
||||
// Send a message for verb
|
||||
template <typename MsgIn, typename... MsgOut>
|
||||
@@ -869,6 +883,9 @@ future<streaming::prepare_message> messaging_service::send_prepare_message(msg_a
|
||||
return send_message<streaming::prepare_message>(this, messaging_verb::PREPARE_MESSAGE, id,
|
||||
std::move(msg), plan_id, std::move(description), reason);
|
||||
}
|
||||
future<> messaging_service::unregister_prepare_message() {
|
||||
return unregister_handler(messaging_verb::PREPARE_MESSAGE);
|
||||
}
|
||||
|
||||
// PREPARE_DONE_MESSAGE
|
||||
void messaging_service::register_prepare_done_message(std::function<future<> (const rpc::client_info& cinfo, UUID plan_id, unsigned dst_cpu_id)>&& func) {
|
||||
@@ -878,6 +895,9 @@ future<> messaging_service::send_prepare_done_message(msg_addr id, UUID plan_id,
|
||||
return send_message<void>(this, messaging_verb::PREPARE_DONE_MESSAGE, id,
|
||||
plan_id, dst_cpu_id);
|
||||
}
|
||||
future<> messaging_service::unregister_prepare_done_message() {
|
||||
return unregister_handler(messaging_verb::PREPARE_DONE_MESSAGE);
|
||||
}
|
||||
|
||||
// STREAM_MUTATION
|
||||
void messaging_service::register_stream_mutation(std::function<future<> (const rpc::client_info& cinfo, UUID plan_id, frozen_mutation fm, unsigned dst_cpu_id, rpc::optional<bool> fragmented, rpc::optional<streaming::stream_reason> reason)>&& func) {
|
||||
@@ -902,6 +922,9 @@ future<> messaging_service::send_stream_mutation_done(msg_addr id, UUID plan_id,
|
||||
return send_message<void>(this, messaging_verb::STREAM_MUTATION_DONE, id,
|
||||
plan_id, std::move(ranges), cf_id, dst_cpu_id);
|
||||
}
|
||||
future<> messaging_service::unregister_stream_mutation_done() {
|
||||
return unregister_handler(messaging_verb::STREAM_MUTATION_DONE);
|
||||
}
|
||||
|
||||
// COMPLETE_MESSAGE
|
||||
void messaging_service::register_complete_message(std::function<future<> (const rpc::client_info& cinfo, UUID plan_id, unsigned dst_cpu_id, rpc::optional<bool> failed)>&& func) {
|
||||
@@ -911,6 +934,9 @@ future<> messaging_service::send_complete_message(msg_addr id, UUID plan_id, uns
|
||||
return send_message<void>(this, messaging_verb::COMPLETE_MESSAGE, id,
|
||||
plan_id, dst_cpu_id, failed);
|
||||
}
|
||||
future<> messaging_service::unregister_complete_message() {
|
||||
return unregister_handler(messaging_verb::COMPLETE_MESSAGE);
|
||||
}
|
||||
|
||||
void messaging_service::register_gossip_echo(std::function<future<> ()>&& func) {
|
||||
register_handler(this, messaging_verb::GOSSIP_ECHO, std::move(func));
|
||||
@@ -1125,14 +1151,14 @@ future<partition_checksum> messaging_service::send_repair_checksum_range(
|
||||
}
|
||||
|
||||
// Wrapper for REPAIR_GET_FULL_ROW_HASHES
|
||||
void messaging_service::register_repair_get_full_row_hashes(std::function<future<std::unordered_set<repair_hash>> (const rpc::client_info& cinfo, uint32_t repair_meta_id)>&& func) {
|
||||
void messaging_service::register_repair_get_full_row_hashes(std::function<future<repair_hash_set> (const rpc::client_info& cinfo, uint32_t repair_meta_id)>&& func) {
|
||||
register_handler(this, messaging_verb::REPAIR_GET_FULL_ROW_HASHES, std::move(func));
|
||||
}
|
||||
future<> messaging_service::unregister_repair_get_full_row_hashes() {
|
||||
return unregister_handler(messaging_verb::REPAIR_GET_FULL_ROW_HASHES);
|
||||
}
|
||||
future<std::unordered_set<repair_hash>> messaging_service::send_repair_get_full_row_hashes(msg_addr id, uint32_t repair_meta_id) {
|
||||
return send_message<future<std::unordered_set<repair_hash>>>(this, messaging_verb::REPAIR_GET_FULL_ROW_HASHES, std::move(id), repair_meta_id);
|
||||
future<repair_hash_set> messaging_service::send_repair_get_full_row_hashes(msg_addr id, uint32_t repair_meta_id) {
|
||||
return send_message<future<repair_hash_set>>(this, messaging_verb::REPAIR_GET_FULL_ROW_HASHES, std::move(id), repair_meta_id);
|
||||
}
|
||||
|
||||
// Wrapper for REPAIR_GET_COMBINED_ROW_HASH
|
||||
@@ -1157,13 +1183,13 @@ future<get_sync_boundary_response> messaging_service::send_repair_get_sync_bound
|
||||
}
|
||||
|
||||
// Wrapper for REPAIR_GET_ROW_DIFF
|
||||
void messaging_service::register_repair_get_row_diff(std::function<future<repair_rows_on_wire> (const rpc::client_info& cinfo, uint32_t repair_meta_id, std::unordered_set<repair_hash> set_diff, bool needs_all_rows)>&& func) {
|
||||
void messaging_service::register_repair_get_row_diff(std::function<future<repair_rows_on_wire> (const rpc::client_info& cinfo, uint32_t repair_meta_id, repair_hash_set set_diff, bool needs_all_rows)>&& func) {
|
||||
register_handler(this, messaging_verb::REPAIR_GET_ROW_DIFF, std::move(func));
|
||||
}
|
||||
future<> messaging_service::unregister_repair_get_row_diff() {
|
||||
return unregister_handler(messaging_verb::REPAIR_GET_ROW_DIFF);
|
||||
}
|
||||
future<repair_rows_on_wire> messaging_service::send_repair_get_row_diff(msg_addr id, uint32_t repair_meta_id, std::unordered_set<repair_hash> set_diff, bool needs_all_rows) {
|
||||
future<repair_rows_on_wire> messaging_service::send_repair_get_row_diff(msg_addr id, uint32_t repair_meta_id, repair_hash_set set_diff, bool needs_all_rows) {
|
||||
return send_message<future<repair_rows_on_wire>>(this, messaging_verb::REPAIR_GET_ROW_DIFF, std::move(id), repair_meta_id, std::move(set_diff), needs_all_rows);
|
||||
}
|
||||
|
||||
@@ -1179,14 +1205,14 @@ future<> messaging_service::send_repair_put_row_diff(msg_addr id, uint32_t repai
|
||||
}
|
||||
|
||||
// Wrapper for REPAIR_ROW_LEVEL_START
|
||||
void messaging_service::register_repair_row_level_start(std::function<future<> (const rpc::client_info& cinfo, uint32_t repair_meta_id, sstring keyspace_name, sstring cf_name, dht::token_range range, row_level_diff_detect_algorithm algo, uint64_t max_row_buf_size, uint64_t seed, unsigned remote_shard, unsigned remote_shard_count, unsigned remote_ignore_msb, sstring remote_partitioner_name, table_schema_version schema_version)>&& func) {
|
||||
void messaging_service::register_repair_row_level_start(std::function<future<> (const rpc::client_info& cinfo, uint32_t repair_meta_id, sstring keyspace_name, sstring cf_name, dht::token_range range, row_level_diff_detect_algorithm algo, uint64_t max_row_buf_size, uint64_t seed, unsigned remote_shard, unsigned remote_shard_count, unsigned remote_ignore_msb, sstring remote_partitioner_name, table_schema_version schema_version, rpc::optional<streaming::stream_reason> reason)>&& func) {
|
||||
register_handler(this, messaging_verb::REPAIR_ROW_LEVEL_START, std::move(func));
|
||||
}
|
||||
future<> messaging_service::unregister_repair_row_level_start() {
|
||||
return unregister_handler(messaging_verb::REPAIR_ROW_LEVEL_START);
|
||||
}
|
||||
future<> messaging_service::send_repair_row_level_start(msg_addr id, uint32_t repair_meta_id, sstring keyspace_name, sstring cf_name, dht::token_range range, row_level_diff_detect_algorithm algo, uint64_t max_row_buf_size, uint64_t seed, unsigned remote_shard, unsigned remote_shard_count, unsigned remote_ignore_msb, sstring remote_partitioner_name, table_schema_version schema_version) {
|
||||
return send_message<void>(this, messaging_verb::REPAIR_ROW_LEVEL_START, std::move(id), repair_meta_id, std::move(keyspace_name), std::move(cf_name), std::move(range), algo, max_row_buf_size, seed, remote_shard, remote_shard_count, remote_ignore_msb, std::move(remote_partitioner_name), std::move(schema_version));
|
||||
future<> messaging_service::send_repair_row_level_start(msg_addr id, uint32_t repair_meta_id, sstring keyspace_name, sstring cf_name, dht::token_range range, row_level_diff_detect_algorithm algo, uint64_t max_row_buf_size, uint64_t seed, unsigned remote_shard, unsigned remote_shard_count, unsigned remote_ignore_msb, sstring remote_partitioner_name, table_schema_version schema_version, streaming::stream_reason reason) {
|
||||
return send_message<void>(this, messaging_verb::REPAIR_ROW_LEVEL_START, std::move(id), repair_meta_id, std::move(keyspace_name), std::move(cf_name), std::move(range), algo, max_row_buf_size, seed, remote_shard, remote_shard_count, remote_ignore_msb, std::move(remote_partitioner_name), std::move(schema_version), reason);
|
||||
}
|
||||
|
||||
// Wrapper for REPAIR_ROW_LEVEL_STOP
|
||||
@@ -1281,6 +1307,19 @@ future<> messaging_service::send_paxos_learn(msg_addr id, clock_type::time_point
|
||||
std::move(reply_to), shard, std::move(response_id), std::move(trace_info));
|
||||
}
|
||||
|
||||
void messaging_service::register_paxos_prune(std::function<future<rpc::no_wait_type>(
|
||||
const rpc::client_info&, rpc::opt_time_point, UUID schema_id, partition_key key, utils::UUID ballot, std::optional<tracing::trace_info>)>&& func) {
|
||||
register_handler(this, messaging_verb::PAXOS_PRUNE, std::move(func));
|
||||
}
|
||||
future<> messaging_service::unregister_paxos_prune() {
|
||||
return unregister_handler(netw::messaging_verb::PAXOS_PRUNE);
|
||||
}
|
||||
future<>
|
||||
messaging_service::send_paxos_prune(gms::inet_address peer, clock_type::time_point timeout, UUID schema_id,
|
||||
const partition_key& key, utils::UUID ballot, std::optional<tracing::trace_info> trace_info) {
|
||||
return send_message_oneway_timeout(this, timeout, messaging_verb::PAXOS_PRUNE, netw::msg_addr(peer), schema_id, key, ballot, std::move(trace_info));
|
||||
}
|
||||
|
||||
void messaging_service::register_hint_mutation(std::function<future<rpc::no_wait_type> (const rpc::client_info&, rpc::opt_time_point, frozen_mutation fm, std::vector<inet_address> forward,
|
||||
inet_address reply_to, unsigned shard, response_id_type response_id, rpc::optional<std::optional<tracing::trace_info>> trace_info)>&& func) {
|
||||
register_handler(this, netw::messaging_verb::HINT_MUTATION, std::move(func));
|
||||
|
||||
@@ -139,7 +139,8 @@ enum class messaging_verb : int32_t {
|
||||
PAXOS_ACCEPT = 40,
|
||||
PAXOS_LEARN = 41,
|
||||
HINT_MUTATION = 42,
|
||||
LAST = 43,
|
||||
PAXOS_PRUNE = 43,
|
||||
LAST = 44,
|
||||
};
|
||||
|
||||
} // namespace netw
|
||||
@@ -274,10 +275,12 @@ public:
|
||||
streaming::prepare_message msg, UUID plan_id, sstring description, rpc::optional<streaming::stream_reason> reason)>&& func);
|
||||
future<streaming::prepare_message> send_prepare_message(msg_addr id, streaming::prepare_message msg, UUID plan_id,
|
||||
sstring description, streaming::stream_reason);
|
||||
future<> unregister_prepare_message();
|
||||
|
||||
// Wrapper for PREPARE_DONE_MESSAGE verb
|
||||
void register_prepare_done_message(std::function<future<> (const rpc::client_info& cinfo, UUID plan_id, unsigned dst_cpu_id)>&& func);
|
||||
future<> send_prepare_done_message(msg_addr id, UUID plan_id, unsigned dst_cpu_id);
|
||||
future<> unregister_prepare_done_message();
|
||||
|
||||
// Wrapper for STREAM_MUTATION verb
|
||||
void register_stream_mutation(std::function<future<> (const rpc::client_info& cinfo, UUID plan_id, frozen_mutation fm, unsigned dst_cpu_id, rpc::optional<bool>, rpc::optional<streaming::stream_reason>)>&& func);
|
||||
@@ -286,6 +289,7 @@ public:
|
||||
// Wrapper for STREAM_MUTATION_FRAGMENTS
|
||||
// The receiver of STREAM_MUTATION_FRAGMENTS sends status code to the sender to notify any error on the receiver side. The status code is of type int32_t. 0 means successful, -1 means error, other status code value are reserved for future use.
|
||||
void register_stream_mutation_fragments(std::function<future<rpc::sink<int32_t>> (const rpc::client_info& cinfo, UUID plan_id, UUID schema_id, UUID cf_id, uint64_t estimated_partitions, rpc::optional<streaming::stream_reason> reason_opt, rpc::source<frozen_mutation_fragment, rpc::optional<streaming::stream_mutation_fragments_cmd>> source)>&& func);
|
||||
future<> unregister_stream_mutation_fragments();
|
||||
rpc::sink<int32_t> make_sink_for_stream_mutation_fragments(rpc::source<frozen_mutation_fragment, rpc::optional<streaming::stream_mutation_fragments_cmd>>& source);
|
||||
future<rpc::sink<frozen_mutation_fragment, streaming::stream_mutation_fragments_cmd>, rpc::source<int32_t>> make_sink_and_source_for_stream_mutation_fragments(utils::UUID schema_id, utils::UUID plan_id, utils::UUID cf_id, uint64_t estimated_partitions, streaming::stream_reason reason, msg_addr id);
|
||||
|
||||
@@ -293,22 +297,27 @@ public:
|
||||
future<rpc::sink<repair_hash_with_cmd>, rpc::source<repair_row_on_wire_with_cmd>> make_sink_and_source_for_repair_get_row_diff_with_rpc_stream(uint32_t repair_meta_id, msg_addr id);
|
||||
rpc::sink<repair_row_on_wire_with_cmd> make_sink_for_repair_get_row_diff_with_rpc_stream(rpc::source<repair_hash_with_cmd>& source);
|
||||
void register_repair_get_row_diff_with_rpc_stream(std::function<future<rpc::sink<repair_row_on_wire_with_cmd>> (const rpc::client_info& cinfo, uint32_t repair_meta_id, rpc::source<repair_hash_with_cmd> source)>&& func);
|
||||
future<> unregister_repair_get_row_diff_with_rpc_stream();
|
||||
|
||||
// Wrapper for REPAIR_PUT_ROW_DIFF_WITH_RPC_STREAM
|
||||
future<rpc::sink<repair_row_on_wire_with_cmd>, rpc::source<repair_stream_cmd>> make_sink_and_source_for_repair_put_row_diff_with_rpc_stream(uint32_t repair_meta_id, msg_addr id);
|
||||
rpc::sink<repair_stream_cmd> make_sink_for_repair_put_row_diff_with_rpc_stream(rpc::source<repair_row_on_wire_with_cmd>& source);
|
||||
void register_repair_put_row_diff_with_rpc_stream(std::function<future<rpc::sink<repair_stream_cmd>> (const rpc::client_info& cinfo, uint32_t repair_meta_id, rpc::source<repair_row_on_wire_with_cmd> source)>&& func);
|
||||
future<> unregister_repair_put_row_diff_with_rpc_stream();
|
||||
|
||||
// Wrapper for REPAIR_GET_FULL_ROW_HASHES_WITH_RPC_STREAM
|
||||
future<rpc::sink<repair_stream_cmd>, rpc::source<repair_hash_with_cmd>> make_sink_and_source_for_repair_get_full_row_hashes_with_rpc_stream(uint32_t repair_meta_id, msg_addr id);
|
||||
rpc::sink<repair_hash_with_cmd> make_sink_for_repair_get_full_row_hashes_with_rpc_stream(rpc::source<repair_stream_cmd>& source);
|
||||
void register_repair_get_full_row_hashes_with_rpc_stream(std::function<future<rpc::sink<repair_hash_with_cmd>> (const rpc::client_info& cinfo, uint32_t repair_meta_id, rpc::source<repair_stream_cmd> source)>&& func);
|
||||
future<> unregister_repair_get_full_row_hashes_with_rpc_stream();
|
||||
|
||||
void register_stream_mutation_done(std::function<future<> (const rpc::client_info& cinfo, UUID plan_id, dht::token_range_vector ranges, UUID cf_id, unsigned dst_cpu_id)>&& func);
|
||||
future<> send_stream_mutation_done(msg_addr id, UUID plan_id, dht::token_range_vector ranges, UUID cf_id, unsigned dst_cpu_id);
|
||||
future<> unregister_stream_mutation_done();
|
||||
|
||||
void register_complete_message(std::function<future<> (const rpc::client_info& cinfo, UUID plan_id, unsigned dst_cpu_id, rpc::optional<bool> failed)>&& func);
|
||||
future<> send_complete_message(msg_addr id, UUID plan_id, unsigned dst_cpu_id, bool failed = false);
|
||||
future<> unregister_complete_message();
|
||||
|
||||
// Wrapper for REPAIR_CHECKSUM_RANGE verb
|
||||
void register_repair_checksum_range(std::function<future<partition_checksum> (sstring keyspace, sstring cf, dht::token_range range, rpc::optional<repair_checksum> hash_version)>&& func);
|
||||
@@ -316,9 +325,9 @@ public:
|
||||
future<partition_checksum> send_repair_checksum_range(msg_addr id, sstring keyspace, sstring cf, dht::token_range range, repair_checksum hash_version);
|
||||
|
||||
// Wrapper for REPAIR_GET_FULL_ROW_HASHES
|
||||
void register_repair_get_full_row_hashes(std::function<future<std::unordered_set<repair_hash>> (const rpc::client_info& cinfo, uint32_t repair_meta_id)>&& func);
|
||||
void register_repair_get_full_row_hashes(std::function<future<repair_hash_set> (const rpc::client_info& cinfo, uint32_t repair_meta_id)>&& func);
|
||||
future<> unregister_repair_get_full_row_hashes();
|
||||
future<std::unordered_set<repair_hash>> send_repair_get_full_row_hashes(msg_addr id, uint32_t repair_meta_id);
|
||||
future<repair_hash_set> send_repair_get_full_row_hashes(msg_addr id, uint32_t repair_meta_id);
|
||||
|
||||
// Wrapper for REPAIR_GET_COMBINED_ROW_HASH
|
||||
void register_repair_get_combined_row_hash(std::function<future<get_combined_row_hash_response> (const rpc::client_info& cinfo, uint32_t repair_meta_id, std::optional<repair_sync_boundary> common_sync_boundary)>&& func);
|
||||
@@ -331,9 +340,9 @@ public:
|
||||
future<get_sync_boundary_response> send_repair_get_sync_boundary(msg_addr id, uint32_t repair_meta_id, std::optional<repair_sync_boundary> skipped_sync_boundary);
|
||||
|
||||
// Wrapper for REPAIR_GET_ROW_DIFF
|
||||
void register_repair_get_row_diff(std::function<future<repair_rows_on_wire> (const rpc::client_info& cinfo, uint32_t repair_meta_id, std::unordered_set<repair_hash> set_diff, bool needs_all_rows)>&& func);
|
||||
void register_repair_get_row_diff(std::function<future<repair_rows_on_wire> (const rpc::client_info& cinfo, uint32_t repair_meta_id, repair_hash_set set_diff, bool needs_all_rows)>&& func);
|
||||
future<> unregister_repair_get_row_diff();
|
||||
future<repair_rows_on_wire> send_repair_get_row_diff(msg_addr id, uint32_t repair_meta_id, std::unordered_set<repair_hash> set_diff, bool needs_all_rows);
|
||||
future<repair_rows_on_wire> send_repair_get_row_diff(msg_addr id, uint32_t repair_meta_id, repair_hash_set set_diff, bool needs_all_rows);
|
||||
|
||||
// Wrapper for REPAIR_PUT_ROW_DIFF
|
||||
void register_repair_put_row_diff(std::function<future<> (const rpc::client_info& cinfo, uint32_t repair_meta_id, repair_rows_on_wire row_diff)>&& func);
|
||||
@@ -341,9 +350,9 @@ public:
|
||||
future<> send_repair_put_row_diff(msg_addr id, uint32_t repair_meta_id, repair_rows_on_wire row_diff);
|
||||
|
||||
// Wrapper for REPAIR_ROW_LEVEL_START
|
||||
void register_repair_row_level_start(std::function<future<> (const rpc::client_info& cinfo, uint32_t repair_meta_id, sstring keyspace_name, sstring cf_name, dht::token_range range, row_level_diff_detect_algorithm algo, uint64_t max_row_buf_size, uint64_t seed, unsigned remote_shard, unsigned remote_shard_count, unsigned remote_ignore_msb, sstring remote_partitioner_name, table_schema_version schema_version)>&& func);
|
||||
void register_repair_row_level_start(std::function<future<> (const rpc::client_info& cinfo, uint32_t repair_meta_id, sstring keyspace_name, sstring cf_name, dht::token_range range, row_level_diff_detect_algorithm algo, uint64_t max_row_buf_size, uint64_t seed, unsigned remote_shard, unsigned remote_shard_count, unsigned remote_ignore_msb, sstring remote_partitioner_name, table_schema_version schema_version, rpc::optional<streaming::stream_reason> reason)>&& func);
|
||||
future<> unregister_repair_row_level_start();
|
||||
future<> send_repair_row_level_start(msg_addr id, uint32_t repair_meta_id, sstring keyspace_name, sstring cf_name, dht::token_range range, row_level_diff_detect_algorithm algo, uint64_t max_row_buf_size, uint64_t seed, unsigned remote_shard, unsigned remote_shard_count, unsigned remote_ignore_msb, sstring remote_partitioner_name, table_schema_version schema_version);
|
||||
future<> send_repair_row_level_start(msg_addr id, uint32_t repair_meta_id, sstring keyspace_name, sstring cf_name, dht::token_range range, row_level_diff_detect_algorithm algo, uint64_t max_row_buf_size, uint64_t seed, unsigned remote_shard, unsigned remote_shard_count, unsigned remote_ignore_msb, sstring remote_partitioner_name, table_schema_version schema_version, streaming::stream_reason reason);
|
||||
|
||||
// Wrapper for REPAIR_ROW_LEVEL_STOP
|
||||
void register_repair_row_level_stop(std::function<future<> (const rpc::client_info& cinfo, uint32_t repair_meta_id, sstring keyspace_name, sstring cf_name, dht::token_range range)>&& func);
|
||||
@@ -493,6 +502,14 @@ public:
|
||||
std::vector<inet_address> forward, inet_address reply_to, unsigned shard, response_id_type response_id,
|
||||
std::optional<tracing::trace_info> trace_info = std::nullopt);
|
||||
|
||||
void register_paxos_prune(std::function<future<rpc::no_wait_type>(const rpc::client_info&, rpc::opt_time_point, UUID schema_id, partition_key key,
|
||||
utils::UUID ballot, std::optional<tracing::trace_info>)>&& func);
|
||||
|
||||
future<> unregister_paxos_prune();
|
||||
|
||||
future<> send_paxos_prune(gms::inet_address peer, clock_type::time_point timeout, UUID schema_id, const partition_key& key,
|
||||
utils::UUID ballot, std::optional<tracing::trace_info> trace_info);
|
||||
|
||||
void register_hint_mutation(std::function<future<rpc::no_wait_type> (const rpc::client_info&, rpc::opt_time_point, frozen_mutation fm, std::vector<inet_address> forward,
|
||||
inet_address reply_to, unsigned shard, response_id_type response_id, rpc::optional<std::optional<tracing::trace_info>> trace_info)>&& func);
|
||||
future<> unregister_hint_mutation();
|
||||
|
||||
@@ -195,6 +195,7 @@ class read_context : public reader_lifecycle_policy {
|
||||
|
||||
// One for each shard. Index is shard id.
|
||||
std::vector<reader_meta> _readers;
|
||||
std::vector<reader_concurrency_semaphore*> _semaphores;
|
||||
|
||||
gate _dismantling_gate;
|
||||
|
||||
@@ -211,7 +212,8 @@ public:
|
||||
, _schema(std::move(s))
|
||||
, _cmd(cmd)
|
||||
, _ranges(ranges)
|
||||
, _trace_state(std::move(trace_state)) {
|
||||
, _trace_state(std::move(trace_state))
|
||||
, _semaphores(smp::count, nullptr) {
|
||||
_readers.resize(smp::count);
|
||||
}
|
||||
|
||||
@@ -236,7 +238,12 @@ public:
|
||||
virtual void destroy_reader(shard_id shard, future<stopped_reader> reader_fut) noexcept override;
|
||||
|
||||
virtual reader_concurrency_semaphore& semaphore() override {
|
||||
return _readers[engine().cpu_id()].rparts->semaphore;
|
||||
const auto shard = engine().cpu_id();
|
||||
if (!_semaphores[shard]) {
|
||||
auto& table = _db.local().find_column_family(_schema);
|
||||
_semaphores[shard] = &table.read_concurrency_semaphore();
|
||||
}
|
||||
return *_semaphores[shard];
|
||||
}
|
||||
|
||||
future<> lookup_readers();
|
||||
|
||||
@@ -1721,7 +1721,7 @@ void row::apply_monotonically(const schema& s, column_kind kind, row&& other) {
|
||||
// we erase the live cells according to the shadowable_tombstone rules.
|
||||
static bool dead_marker_shadows_row(const schema& s, column_kind kind, const row_marker& marker) {
|
||||
return s.is_view()
|
||||
&& !s.view_info()->base_non_pk_columns_in_view_pk().empty()
|
||||
&& s.view_info()->has_base_non_pk_columns_in_view_pk()
|
||||
&& !marker.is_live()
|
||||
&& kind == column_kind::regular_column; // not applicable to static rows
|
||||
}
|
||||
@@ -2505,7 +2505,8 @@ mutation_partition::fully_discontinuous(const schema& s, const position_range& r
|
||||
future<mutation_opt> counter_write_query(schema_ptr s, const mutation_source& source,
|
||||
const dht::decorated_key& dk,
|
||||
const query::partition_slice& slice,
|
||||
tracing::trace_state_ptr trace_ptr)
|
||||
tracing::trace_state_ptr trace_ptr,
|
||||
db::timeout_clock::time_point timeout)
|
||||
{
|
||||
struct range_and_reader {
|
||||
dht::partition_range range;
|
||||
@@ -2530,7 +2531,7 @@ future<mutation_opt> counter_write_query(schema_ptr s, const mutation_source& so
|
||||
auto cwqrb = counter_write_query_result_builder(*s);
|
||||
auto cfq = make_stable_flattened_mutations_consumer<compact_for_query<emit_only_live_rows::yes, counter_write_query_result_builder>>(
|
||||
*s, gc_clock::now(), slice, query::max_rows, query::max_rows, std::move(cwqrb));
|
||||
auto f = r_a_r->reader.consume(std::move(cfq), db::no_timeout);
|
||||
auto f = r_a_r->reader.consume(std::move(cfq), timeout);
|
||||
return f.finally([r_a_r = std::move(r_a_r)] { });
|
||||
}
|
||||
|
||||
@@ -2605,7 +2606,7 @@ void mutation_cleaner_impl::start_worker() {
|
||||
stop_iteration mutation_cleaner_impl::merge_some(partition_snapshot& snp) noexcept {
|
||||
auto&& region = snp.region();
|
||||
return with_allocator(region.allocator(), [&] {
|
||||
return with_linearized_managed_bytes([&] {
|
||||
{
|
||||
// Allocating sections require the region to be reclaimable
|
||||
// which means that they cannot be nested.
|
||||
// It is, however, possible, that if the snapshot is taken
|
||||
@@ -2617,13 +2618,15 @@ stop_iteration mutation_cleaner_impl::merge_some(partition_snapshot& snp) noexce
|
||||
}
|
||||
try {
|
||||
return _worker_state->alloc_section(region, [&] {
|
||||
return with_linearized_managed_bytes([&] {
|
||||
return snp.merge_partition_versions(_app_stats);
|
||||
});
|
||||
});
|
||||
} catch (...) {
|
||||
// Merging failed, give up as there is no guarantee of forward progress.
|
||||
return stop_iteration::yes;
|
||||
}
|
||||
});
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
@@ -113,9 +113,6 @@ class reconcilable_result_builder {
|
||||
const schema& _schema;
|
||||
const query::partition_slice& _slice;
|
||||
|
||||
utils::chunked_vector<partition> _result;
|
||||
uint32_t _live_rows{};
|
||||
|
||||
bool _return_static_content_on_partition_with_no_rows{};
|
||||
bool _static_row_is_alive{};
|
||||
uint32_t _total_live_rows = 0;
|
||||
@@ -123,6 +120,10 @@ class reconcilable_result_builder {
|
||||
stop_iteration _stop;
|
||||
bool _short_read_allowed;
|
||||
std::optional<streamed_mutation_freezer> _mutation_consumer;
|
||||
|
||||
uint32_t _live_rows{};
|
||||
// make this the last member so it is destroyed first. #7240
|
||||
utils::chunked_vector<partition> _result;
|
||||
public:
|
||||
reconcilable_result_builder(const schema& s, const query::partition_slice& slice,
|
||||
query::result_memory_accounter&& accounter)
|
||||
@@ -206,5 +207,6 @@ public:
|
||||
future<mutation_opt> counter_write_query(schema_ptr, const mutation_source&,
|
||||
const dht::decorated_key& dk,
|
||||
const query::partition_slice& slice,
|
||||
tracing::trace_state_ptr trace_ptr);
|
||||
tracing::trace_state_ptr trace_ptr,
|
||||
db::timeout_clock::time_point timeout);
|
||||
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -372,6 +372,64 @@ flat_mutation_reader make_foreign_reader(schema_ptr schema,
|
||||
foreign_ptr<std::unique_ptr<flat_mutation_reader>> reader,
|
||||
streamed_mutation::forwarding fwd_sm = streamed_mutation::forwarding::no);
|
||||
|
||||
/// Make an auto-paused evictable reader.
|
||||
///
|
||||
/// The reader is paused after each use, that is after each call to any of its
|
||||
/// members that cause actual reading to be done (`fill_buffer()` and
|
||||
/// `fast_forward_to()`). When paused, the reader is made evictable, that it is
|
||||
/// it is registered with reader concurrency semaphore as an inactive read.
|
||||
/// The reader is resumed automatically on the next use. If it was evicted, it
|
||||
/// will be recreated at the position it left off reading. This is all
|
||||
/// transparent to its user.
|
||||
/// Parameters passed by reference have to be kept alive while the reader is
|
||||
/// alive.
|
||||
flat_mutation_reader make_auto_paused_evictable_reader(
|
||||
mutation_source ms,
|
||||
schema_ptr schema,
|
||||
reader_concurrency_semaphore& semaphore,
|
||||
const dht::partition_range& pr,
|
||||
const query::partition_slice& ps,
|
||||
const io_priority_class& pc,
|
||||
tracing::trace_state_ptr trace_state,
|
||||
mutation_reader::forwarding fwd_mr);
|
||||
|
||||
class evictable_reader;
|
||||
|
||||
class evictable_reader_handle {
|
||||
friend std::pair<flat_mutation_reader, evictable_reader_handle> make_manually_paused_evictable_reader(mutation_source, schema_ptr, reader_concurrency_semaphore&,
|
||||
const dht::partition_range&, const query::partition_slice&, const io_priority_class&, tracing::trace_state_ptr, mutation_reader::forwarding);
|
||||
|
||||
private:
|
||||
evictable_reader* _r;
|
||||
|
||||
private:
|
||||
explicit evictable_reader_handle(evictable_reader& r);
|
||||
|
||||
public:
|
||||
void pause();
|
||||
};
|
||||
|
||||
/// Make a manually-paused evictable reader.
|
||||
///
|
||||
/// The reader can be paused via the evictable reader handle when desired. The
|
||||
/// intended usage is subsequent reads done in bursts, after which the reader is
|
||||
/// not used for some time. When paused, the reader is made evictable, that is,
|
||||
/// it is registered with reader concurrency semaphore as an inactive read.
|
||||
/// The reader is resumed automatically on the next use. If it was evicted, it
|
||||
/// will be recreated at the position it left off reading. This is all
|
||||
/// transparent to its user.
|
||||
/// Parameters passed by reference have to be kept alive while the reader is
|
||||
/// alive.
|
||||
std::pair<flat_mutation_reader, evictable_reader_handle> make_manually_paused_evictable_reader(
|
||||
mutation_source ms,
|
||||
schema_ptr schema,
|
||||
reader_concurrency_semaphore& semaphore,
|
||||
const dht::partition_range& pr,
|
||||
const query::partition_slice& ps,
|
||||
const io_priority_class& pc,
|
||||
tracing::trace_state_ptr trace_state,
|
||||
mutation_reader::forwarding fwd_mr);
|
||||
|
||||
/// Reader lifecycle policy for the mulitshard combining reader.
|
||||
///
|
||||
/// This policy is expected to make sure any additional resource the readers
|
||||
|
||||
@@ -173,6 +173,13 @@ future<> multishard_writer::distribute_mutation_fragments() {
|
||||
return handle_end_of_stream();
|
||||
}
|
||||
});
|
||||
}).handle_exception([this] (std::exception_ptr ep) {
|
||||
for (auto& q : _queue_reader_handles) {
|
||||
if (q) {
|
||||
q->abort(ep);
|
||||
}
|
||||
}
|
||||
return make_exception_future<>(std::move(ep));
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
@@ -163,6 +163,11 @@ public:
|
||||
return {partition_region::clustered, bound_weight::before_all_prefixed, &ck};
|
||||
}
|
||||
|
||||
// Returns a view to before_key(pos._ck) if pos.is_clustering_row() else returns pos as-is.
|
||||
static position_in_partition_view before_key(position_in_partition_view pos) {
|
||||
return {partition_region::clustered, pos._bound_weight == bound_weight::equal ? bound_weight::before_all_prefixed : pos._bound_weight, pos._ck};
|
||||
}
|
||||
|
||||
partition_region region() const { return _type; }
|
||||
bound_weight get_bound_weight() const { return _bound_weight; }
|
||||
bool is_partition_start() const { return _type == partition_region::partition_start; }
|
||||
|
||||
@@ -27,6 +27,7 @@
|
||||
|
||||
|
||||
reader_permit::impl::impl(reader_concurrency_semaphore& semaphore, reader_resources base_cost) : semaphore(semaphore), base_cost(base_cost) {
|
||||
semaphore.consume(base_cost);
|
||||
}
|
||||
|
||||
reader_permit::impl::~impl() {
|
||||
@@ -88,7 +89,6 @@ void reader_concurrency_semaphore::signal(const resources& r) noexcept {
|
||||
_resources += r;
|
||||
while (!_wait_list.empty() && has_available_units(_wait_list.front().res)) {
|
||||
auto& x = _wait_list.front();
|
||||
_resources -= x.res;
|
||||
try {
|
||||
x.pr.set_value(reader_permit(*this, x.res));
|
||||
} catch (...) {
|
||||
@@ -160,7 +160,6 @@ future<reader_permit> reader_concurrency_semaphore::wait_admission(size_t memory
|
||||
--_inactive_read_stats.population;
|
||||
}
|
||||
if (may_proceed(r)) {
|
||||
_resources -= r;
|
||||
return make_ready_future<reader_permit>(reader_permit(*this, r));
|
||||
}
|
||||
promise<reader_permit> pr;
|
||||
@@ -170,7 +169,6 @@ future<reader_permit> reader_concurrency_semaphore::wait_admission(size_t memory
|
||||
}
|
||||
|
||||
reader_permit reader_concurrency_semaphore::consume_resources(resources r) {
|
||||
_resources -= r;
|
||||
return reader_permit(*this, r);
|
||||
}
|
||||
|
||||
|
||||
@@ -128,6 +128,10 @@ private:
|
||||
return has_available_units(r) && _wait_list.empty();
|
||||
}
|
||||
|
||||
void consume(resources r) {
|
||||
_resources -= r;
|
||||
}
|
||||
|
||||
void consume_memory(size_t memory) {
|
||||
_resources.memory -= memory;
|
||||
}
|
||||
|
||||
@@ -12,7 +12,11 @@
|
||||
# At the end of the build we check that the build-id is indeed in the
|
||||
# first page. At install time we check that patchelf doesn't modify
|
||||
# the program headers.
|
||||
|
||||
# gdb has a SO_NAME_MAX_PATH_SIZE of 512, so limit the path size to
|
||||
# that. The 512 includes the null at the end, hence the 511 bellow.
|
||||
|
||||
ORIGINAL_DYNAMIC_LINKER=$(gcc -### /dev/null -o t 2>&1 | perl -n -e '/-dynamic-linker ([^ ]*) / && print $1')
|
||||
DYNAMIC_LINKER=$(printf "%2000s$ORIGINAL_DYNAMIC_LINKER" | sed 's| |/|g')
|
||||
DYNAMIC_LINKER=$(printf "%511s$ORIGINAL_DYNAMIC_LINKER" | sed 's| |/|g')
|
||||
|
||||
echo $DYNAMIC_LINKER
|
||||
|
||||
@@ -672,7 +672,8 @@ repair_info::repair_info(seastar::sharded<database>& db_,
|
||||
const std::vector<sstring>& cfs_,
|
||||
int id_,
|
||||
const std::vector<sstring>& data_centers_,
|
||||
const std::vector<sstring>& hosts_)
|
||||
const std::vector<sstring>& hosts_,
|
||||
streaming::stream_reason reason_)
|
||||
: db(db_)
|
||||
, partitioner(get_partitioner_for_tables(db_, keyspace_, cfs_))
|
||||
, keyspace(keyspace_)
|
||||
@@ -682,6 +683,7 @@ repair_info::repair_info(seastar::sharded<database>& db_,
|
||||
, shard(engine().cpu_id())
|
||||
, data_centers(data_centers_)
|
||||
, hosts(hosts_)
|
||||
, reason(reason_)
|
||||
, _row_level_repair(db.local().features().cluster_supports_row_level_repair()) {
|
||||
}
|
||||
|
||||
@@ -1462,7 +1464,7 @@ static int do_repair_start(seastar::sharded<database>& db, sstring keyspace,
|
||||
data_centers = options.data_centers, hosts = options.hosts] (database& localdb) mutable {
|
||||
auto ri = make_lw_shared<repair_info>(db,
|
||||
std::move(keyspace), std::move(ranges), std::move(cfs),
|
||||
id, std::move(data_centers), std::move(hosts));
|
||||
id, std::move(data_centers), std::move(hosts), streaming::stream_reason::repair);
|
||||
return repair_ranges(ri);
|
||||
});
|
||||
repair_results.push_back(std::move(f));
|
||||
@@ -1524,14 +1526,15 @@ future<> repair_abort_all(seastar::sharded<database>& db) {
|
||||
future<> sync_data_using_repair(seastar::sharded<database>& db,
|
||||
sstring keyspace,
|
||||
dht::token_range_vector ranges,
|
||||
std::unordered_map<dht::token_range, repair_neighbors> neighbors) {
|
||||
std::unordered_map<dht::token_range, repair_neighbors> neighbors,
|
||||
streaming::stream_reason reason) {
|
||||
if (ranges.empty()) {
|
||||
return make_ready_future<>();
|
||||
}
|
||||
return smp::submit_to(0, [&db, keyspace = std::move(keyspace), ranges = std::move(ranges), neighbors = std::move(neighbors)] () mutable {
|
||||
return smp::submit_to(0, [&db, keyspace = std::move(keyspace), ranges = std::move(ranges), neighbors = std::move(neighbors), reason] () mutable {
|
||||
int id = repair_tracker().next_repair_command();
|
||||
rlogger.info("repair id {} to sync data for keyspace={}, status=started", id, keyspace);
|
||||
return repair_tracker().run(id, [id, &db, keyspace, ranges = std::move(ranges), neighbors = std::move(neighbors)] () mutable {
|
||||
return repair_tracker().run(id, [id, &db, keyspace, ranges = std::move(ranges), neighbors = std::move(neighbors), reason] () mutable {
|
||||
auto cfs = list_column_families(db.local(), keyspace);
|
||||
if (cfs.empty()) {
|
||||
rlogger.warn("repair id {} to sync data for keyspace={}, no table in this keyspace", id, keyspace);
|
||||
@@ -1540,12 +1543,12 @@ future<> sync_data_using_repair(seastar::sharded<database>& db,
|
||||
std::vector<future<>> repair_results;
|
||||
repair_results.reserve(smp::count);
|
||||
for (auto shard : boost::irange(unsigned(0), smp::count)) {
|
||||
auto f = db.invoke_on(shard, [keyspace, cfs, id, ranges, neighbors] (database& localdb) mutable {
|
||||
auto f = db.invoke_on(shard, [keyspace, cfs, id, ranges, neighbors, reason] (database& localdb) mutable {
|
||||
auto data_centers = std::vector<sstring>();
|
||||
auto hosts = std::vector<sstring>();
|
||||
auto ri = make_lw_shared<repair_info>(service::get_local_storage_service().db(),
|
||||
std::move(keyspace), std::move(ranges), std::move(cfs),
|
||||
id, std::move(data_centers), std::move(hosts));
|
||||
id, std::move(data_centers), std::move(hosts), reason);
|
||||
ri->neighbors = std::move(neighbors);
|
||||
return repair_ranges(ri);
|
||||
});
|
||||
@@ -1584,6 +1587,7 @@ future<> bootstrap_with_repair(seastar::sharded<database>& db, locator::token_me
|
||||
auto keyspaces = db.local().get_non_system_keyspaces();
|
||||
rlogger.info("bootstrap_with_repair: started with keyspaces={}", keyspaces);
|
||||
auto myip = utils::fb_utilities::get_broadcast_address();
|
||||
auto reason = streaming::stream_reason::bootstrap;
|
||||
for (auto& keyspace_name : keyspaces) {
|
||||
if (!db.local().has_keyspace(keyspace_name)) {
|
||||
rlogger.info("bootstrap_with_repair: keyspace={} does not exist any more, ignoring it", keyspace_name);
|
||||
@@ -1716,7 +1720,7 @@ future<> bootstrap_with_repair(seastar::sharded<database>& db, locator::token_me
|
||||
}
|
||||
}
|
||||
auto nr_ranges = desired_ranges.size();
|
||||
sync_data_using_repair(db, keyspace_name, std::move(desired_ranges), std::move(range_sources)).get();
|
||||
sync_data_using_repair(db, keyspace_name, std::move(desired_ranges), std::move(range_sources), reason).get();
|
||||
rlogger.info("bootstrap_with_repair: finished with keyspace={}, nr_ranges={}", keyspace_name, nr_ranges);
|
||||
}
|
||||
rlogger.info("bootstrap_with_repair: finished with keyspaces={}", keyspaces);
|
||||
@@ -1730,6 +1734,7 @@ future<> do_decommission_removenode_with_repair(seastar::sharded<database>& db,
|
||||
auto keyspaces = db.local().get_non_system_keyspaces();
|
||||
bool is_removenode = myip != leaving_node;
|
||||
auto op = is_removenode ? "removenode_with_repair" : "decommission_with_repair";
|
||||
streaming::stream_reason reason = is_removenode ? streaming::stream_reason::removenode : streaming::stream_reason::decommission;
|
||||
rlogger.info("{}: started with keyspaces={}, leaving_node={}", op, keyspaces, leaving_node);
|
||||
for (auto& keyspace_name : keyspaces) {
|
||||
if (!db.local().has_keyspace(keyspace_name)) {
|
||||
@@ -1867,7 +1872,7 @@ future<> do_decommission_removenode_with_repair(seastar::sharded<database>& db,
|
||||
ranges.swap(ranges_for_removenode);
|
||||
}
|
||||
auto nr_ranges_synced = ranges.size();
|
||||
sync_data_using_repair(db, keyspace_name, std::move(ranges), std::move(range_sources)).get();
|
||||
sync_data_using_repair(db, keyspace_name, std::move(ranges), std::move(range_sources), reason).get();
|
||||
rlogger.info("{}: finished with keyspace={}, leaving_node={}, nr_ranges={}, nr_ranges_synced={}, nr_ranges_skipped={}",
|
||||
op, keyspace_name, leaving_node, nr_ranges_total, nr_ranges_synced, nr_ranges_skipped);
|
||||
}
|
||||
@@ -1883,8 +1888,8 @@ future<> removenode_with_repair(seastar::sharded<database>& db, locator::token_m
|
||||
return do_decommission_removenode_with_repair(db, std::move(tm), std::move(leaving_node));
|
||||
}
|
||||
|
||||
future<> do_rebuild_replace_with_repair(seastar::sharded<database>& db, locator::token_metadata tm, sstring op, sstring source_dc) {
|
||||
return seastar::async([&db, tm = std::move(tm), source_dc = std::move(source_dc), op = std::move(op)] () mutable {
|
||||
future<> do_rebuild_replace_with_repair(seastar::sharded<database>& db, locator::token_metadata tm, sstring op, sstring source_dc, streaming::stream_reason reason) {
|
||||
return seastar::async([&db, tm = std::move(tm), source_dc = std::move(source_dc), op = std::move(op), reason] () mutable {
|
||||
auto keyspaces = db.local().get_non_system_keyspaces();
|
||||
rlogger.info("{}: started with keyspaces={}, source_dc={}", op, keyspaces, source_dc);
|
||||
auto myip = utils::fb_utilities::get_broadcast_address();
|
||||
@@ -1921,7 +1926,7 @@ future<> do_rebuild_replace_with_repair(seastar::sharded<database>& db, locator:
|
||||
}
|
||||
}
|
||||
auto nr_ranges = ranges.size();
|
||||
sync_data_using_repair(db, keyspace_name, std::move(ranges), std::move(range_sources)).get();
|
||||
sync_data_using_repair(db, keyspace_name, std::move(ranges), std::move(range_sources), reason).get();
|
||||
rlogger.info("{}: finished with keyspace={}, source_dc={}, nr_ranges={}", op, keyspace_name, source_dc, nr_ranges);
|
||||
}
|
||||
rlogger.info("{}: finished with keyspaces={}, source_dc={}", op, keyspaces, source_dc);
|
||||
@@ -1933,11 +1938,13 @@ future<> rebuild_with_repair(seastar::sharded<database>& db, locator::token_meta
|
||||
if (source_dc.empty()) {
|
||||
source_dc = get_local_dc();
|
||||
}
|
||||
return do_rebuild_replace_with_repair(db, std::move(tm), std::move(op), std::move(source_dc));
|
||||
auto reason = streaming::stream_reason::rebuild;
|
||||
return do_rebuild_replace_with_repair(db, std::move(tm), std::move(op), std::move(source_dc), reason);
|
||||
}
|
||||
|
||||
future<> replace_with_repair(seastar::sharded<database>& db, locator::token_metadata tm) {
|
||||
auto op = sstring("replace_with_repair");
|
||||
auto source_dc = get_local_dc();
|
||||
return do_rebuild_replace_with_repair(db, std::move(tm), std::move(op), std::move(source_dc));
|
||||
auto reason = streaming::stream_reason::bootstrap;
|
||||
return do_rebuild_replace_with_repair(db, std::move(tm), std::move(op), std::move(source_dc), reason);
|
||||
}
|
||||
|
||||
@@ -23,6 +23,7 @@
|
||||
|
||||
#include <unordered_map>
|
||||
#include <exception>
|
||||
#include <absl/container/btree_set.h>
|
||||
|
||||
#include <seastar/core/sstring.hh>
|
||||
#include <seastar/core/sharded.hh>
|
||||
@@ -181,6 +182,7 @@ public:
|
||||
shard_id shard;
|
||||
std::vector<sstring> data_centers;
|
||||
std::vector<sstring> hosts;
|
||||
streaming::stream_reason reason;
|
||||
std::unordered_map<dht::token_range, repair_neighbors> neighbors;
|
||||
size_t nr_failed_ranges = 0;
|
||||
bool aborted = false;
|
||||
@@ -211,7 +213,8 @@ public:
|
||||
const std::vector<sstring>& cfs_,
|
||||
int id_,
|
||||
const std::vector<sstring>& data_centers_,
|
||||
const std::vector<sstring>& hosts_);
|
||||
const std::vector<sstring>& hosts_,
|
||||
streaming::stream_reason reason_);
|
||||
future<> do_streaming();
|
||||
void check_failed_ranges();
|
||||
future<> request_transfer_ranges(const sstring& cf,
|
||||
@@ -332,6 +335,8 @@ public:
|
||||
}
|
||||
};
|
||||
|
||||
// Set of row hashes used throughout row-level repair, stored in an absl::btree_set.
using repair_hash_set = absl::btree_set<repair_hash>;
|
||||
|
||||
// Return value of the REPAIR_GET_SYNC_BOUNDARY RPC verb
|
||||
struct get_sync_boundary_response {
|
||||
std::optional<repair_sync_boundary> boundary;
|
||||
|
||||
@@ -47,6 +47,7 @@
|
||||
#include "gms/gossiper.hh"
|
||||
#include "repair/row_level.hh"
|
||||
#include "mutation_source_metadata.hh"
|
||||
#include "utils/stall_free.hh"
|
||||
|
||||
extern logging::logger rlogger;
|
||||
|
||||
@@ -373,6 +374,7 @@ private:
|
||||
std::optional<utils::phased_barrier::operation> _local_read_op;
|
||||
// Local reader or multishard reader to read the range
|
||||
flat_mutation_reader _reader;
|
||||
std::optional<evictable_reader_handle> _reader_handle;
|
||||
// Current partition read from disk
|
||||
lw_shared_ptr<const decorated_key_with_hash> _current_dk;
|
||||
|
||||
@@ -392,32 +394,49 @@ public:
|
||||
, _sharder(remote_partitioner, range, remote_shard)
|
||||
, _seed(seed)
|
||||
, _local_read_op(local_reader ? std::optional(cf.read_in_progress()) : std::nullopt)
|
||||
, _reader(make_reader(db, cf, local_reader)) {
|
||||
}
|
||||
|
||||
private:
|
||||
flat_mutation_reader
|
||||
make_reader(seastar::sharded<database>& db,
|
||||
column_family& cf,
|
||||
is_local_reader local_reader) {
|
||||
, _reader(nullptr) {
|
||||
if (local_reader) {
|
||||
return cf.make_streaming_reader(_schema, _range);
|
||||
auto ms = mutation_source([&cf] (
|
||||
schema_ptr s,
|
||||
reader_permit,
|
||||
const dht::partition_range& pr,
|
||||
const query::partition_slice& ps,
|
||||
const io_priority_class& pc,
|
||||
tracing::trace_state_ptr,
|
||||
streamed_mutation::forwarding,
|
||||
mutation_reader::forwarding fwd_mr) {
|
||||
return cf.make_streaming_reader(std::move(s), pr, ps, fwd_mr);
|
||||
});
|
||||
std::tie(_reader, _reader_handle) = make_manually_paused_evictable_reader(
|
||||
std::move(ms),
|
||||
_schema,
|
||||
cf.streaming_read_concurrency_semaphore(),
|
||||
_range,
|
||||
_schema->full_slice(),
|
||||
service::get_local_streaming_read_priority(),
|
||||
{},
|
||||
mutation_reader::forwarding::no);
|
||||
} else {
|
||||
_reader = make_multishard_streaming_reader(db, _schema, [this] {
|
||||
auto shard_range = _sharder.next();
|
||||
if (shard_range) {
|
||||
return std::optional<dht::partition_range>(dht::to_partition_range(*shard_range));
|
||||
}
|
||||
return std::optional<dht::partition_range>();
|
||||
});
|
||||
}
|
||||
return make_multishard_streaming_reader(db, _schema, [this] {
|
||||
auto shard_range = _sharder.next();
|
||||
if (shard_range) {
|
||||
return std::optional<dht::partition_range>(dht::to_partition_range(*shard_range));
|
||||
}
|
||||
return std::optional<dht::partition_range>();
|
||||
});
|
||||
}
|
||||
|
||||
public:
|
||||
// Reads the next mutation fragment from _reader, with no timeout (db::no_timeout).
// Returns whatever future _reader produces.
future<mutation_fragment_opt>
|
||||
read_mutation_fragment() {
|
||||
return _reader(db::no_timeout);
|
||||
}
|
||||
|
||||
// Releases the current reader: replaces _reader with an empty reader over _schema
// and resets _reader_handle so the handle no longer refers to the old reader.
void on_end_of_stream() {
|
||||
_reader = make_empty_flat_reader(_schema);
|
||||
_reader_handle.reset();
|
||||
}
|
||||
|
||||
// Returns a mutable reference to _current_dk (the current partition read from disk).
lw_shared_ptr<const decorated_key_with_hash>& get_current_dk() {
|
||||
return _current_dk;
|
||||
}
|
||||
@@ -436,6 +455,11 @@ public:
|
||||
}
|
||||
}
|
||||
|
||||
// Pauses the reader through _reader_handle.
// Does nothing when _reader_handle is empty, i.e. when no evictable-reader handle was stored.
void pause() {
|
||||
if (_reader_handle) {
|
||||
_reader_handle->pause();
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
class repair_writer {
|
||||
@@ -443,7 +467,7 @@ class repair_writer {
|
||||
uint64_t _estimated_partitions;
|
||||
size_t _nr_peer_nodes;
|
||||
// Needs more than one for repair master
|
||||
std::vector<std::optional<future<uint64_t>>> _writer_done;
|
||||
std::vector<std::optional<future<>>> _writer_done;
|
||||
std::vector<std::optional<seastar::queue<mutation_fragment_opt>>> _mq;
|
||||
// Current partition written to disk
|
||||
std::vector<lw_shared_ptr<const decorated_key_with_hash>> _current_dk_written_to_sstable;
|
||||
@@ -451,14 +475,18 @@ class repair_writer {
|
||||
// partition_start is written and is closed when a partition_end is
|
||||
// written.
|
||||
std::vector<bool> _partition_opened;
|
||||
streaming::stream_reason _reason;
|
||||
named_semaphore _sem{1, named_semaphore_exception_factory{"repair_writer"}};
|
||||
public:
|
||||
repair_writer(
|
||||
schema_ptr schema,
|
||||
uint64_t estimated_partitions,
|
||||
size_t nr_peer_nodes)
|
||||
size_t nr_peer_nodes,
|
||||
streaming::stream_reason reason)
|
||||
: _schema(std::move(schema))
|
||||
, _estimated_partitions(estimated_partitions)
|
||||
, _nr_peer_nodes(nr_peer_nodes) {
|
||||
, _nr_peer_nodes(nr_peer_nodes)
|
||||
, _reason(reason) {
|
||||
init_writer();
|
||||
}
|
||||
|
||||
@@ -495,9 +523,9 @@ public:
|
||||
table& t = db.local().find_column_family(_schema->id());
|
||||
_writer_done[node_idx] = mutation_writer::distribute_reader_and_consume_on_shards(_schema,
|
||||
make_generating_reader(_schema, std::move(get_next_mutation_fragment)),
|
||||
[&db, estimated_partitions = this->_estimated_partitions] (flat_mutation_reader reader) {
|
||||
[&db, reason = this->_reason, estimated_partitions = this->_estimated_partitions] (flat_mutation_reader reader) {
|
||||
auto& t = db.local().find_column_family(reader.schema());
|
||||
return db::view::check_needs_view_update_path(_sys_dist_ks->local(), t, streaming::stream_reason::repair).then([t = t.shared_from_this(), estimated_partitions, reader = std::move(reader)] (bool use_view_update_path) mutable {
|
||||
return db::view::check_needs_view_update_path(_sys_dist_ks->local(), t, reason).then([t = t.shared_from_this(), estimated_partitions, reader = std::move(reader)] (bool use_view_update_path) mutable {
|
||||
//FIXME: for better estimations this should be transmitted from remote
|
||||
auto metadata = mutation_source_metadata{};
|
||||
auto& cs = t->get_compaction_strategy();
|
||||
@@ -523,7 +551,15 @@ public:
|
||||
return consumer(std::move(reader));
|
||||
});
|
||||
},
|
||||
t.stream_in_progress());
|
||||
t.stream_in_progress()).then([this, node_idx] (uint64_t partitions) {
|
||||
rlogger.debug("repair_writer: keyspace={}, table={}, managed to write partitions={} to sstable",
|
||||
_schema->ks_name(), _schema->cf_name(), partitions);
|
||||
}).handle_exception([this, node_idx] (std::exception_ptr ep) {
|
||||
rlogger.warn("repair_writer: keyspace={}, table={}, multishard_writer failed: {}",
|
||||
_schema->ks_name(), _schema->cf_name(), ep);
|
||||
_mq[node_idx]->abort(ep);
|
||||
return make_exception_future<>(std::move(ep));
|
||||
});
|
||||
}
|
||||
|
||||
future<> write_partition_end(unsigned node_idx) {
|
||||
@@ -550,23 +586,41 @@ public:
|
||||
}
|
||||
}
|
||||
|
||||
// Tells the writer for node_idx that no more data will arrive.
// If a queue exists for node_idx, this takes one unit of _sem, calls
// write_partition_end(node_idx) and then pushes an empty mutation_fragment_opt
// into the queue. Returns a ready future when there is no queue for node_idx.
future<> write_end_of_stream(unsigned node_idx) {
|
||||
if (_mq[node_idx]) {
|
||||
return with_semaphore(_sem, 1, [this, node_idx] {
|
||||
// Partition_end is never sent on wire, so we have to write one ourselves.
|
||||
return write_partition_end(node_idx).then([this, node_idx] () mutable {
|
||||
// Empty mutation_fragment_opt means no more data, so the writer can seal the sstables.
|
||||
return _mq[node_idx]->push_eventually(mutation_fragment_opt());
|
||||
});
|
||||
});
|
||||
} else {
|
||||
return make_ready_future<>();
|
||||
}
|
||||
}
|
||||
|
||||
// Returns the completion future stored in _writer_done[node_idx], moving it out of the optional.
// Returns a ready future when no future was stored for node_idx.
// NOTE(review): the optional stays engaged after the move, so a second call for the same
// node_idx would return a moved-from future — presumably this is called once per node; confirm.
future<> do_wait_for_writer_done(unsigned node_idx) {
|
||||
if (_writer_done[node_idx]) {
|
||||
return std::move(*(_writer_done[node_idx]));
|
||||
} else {
|
||||
return make_ready_future<>();
|
||||
}
|
||||
}
|
||||
|
||||
future<> wait_for_writer_done() {
|
||||
return parallel_for_each(boost::irange(unsigned(0), unsigned(_nr_peer_nodes)), [this] (unsigned node_idx) {
|
||||
if (_writer_done[node_idx] && _mq[node_idx]) {
|
||||
// Partition_end is never sent on wire, so we have to write one ourselves.
|
||||
return write_partition_end(node_idx).then([this, node_idx] () mutable {
|
||||
// Empty mutation_fragment_opt means no more data, so the writer can seal the sstables.
|
||||
return _mq[node_idx]->push_eventually(mutation_fragment_opt()).then([this, node_idx] () mutable {
|
||||
return (*_writer_done[node_idx]).then([] (uint64_t partitions) {
|
||||
rlogger.debug("Managed to write partitions={} to sstable", partitions);
|
||||
return make_ready_future<>();
|
||||
});
|
||||
});
|
||||
});
|
||||
}
|
||||
return make_ready_future<>();
|
||||
return when_all_succeed(write_end_of_stream(node_idx), do_wait_for_writer_done(node_idx));
|
||||
}).handle_exception([this] (std::exception_ptr ep) {
|
||||
rlogger.warn("repair_writer: keyspace={}, table={}, wait_for_writer_done failed: {}",
|
||||
_schema->ks_name(), _schema->cf_name(), ep);
|
||||
return make_exception_future<>(std::move(ep));
|
||||
});
|
||||
}
|
||||
|
||||
// Gives callers access to the writer's semaphore (_sem) by reference.
named_semaphore& sem() {
|
||||
return _sem;
|
||||
}
|
||||
};
|
||||
|
||||
class repair_meta {
|
||||
@@ -590,6 +644,7 @@ private:
|
||||
repair_master _repair_master;
|
||||
gms::inet_address _myip;
|
||||
uint32_t _repair_meta_id;
|
||||
streaming::stream_reason _reason;
|
||||
// Repair master's sharding configuration
|
||||
shard_config _master_node_shard_config;
|
||||
// Partitioner of repair master
|
||||
@@ -613,7 +668,7 @@ private:
|
||||
// Tracks current sync boundary
|
||||
std::optional<repair_sync_boundary> _current_sync_boundary;
|
||||
// Contains the hashes of rows in the _working_row_buf for all peer nodes
|
||||
std::vector<std::unordered_set<repair_hash>> _peer_row_hash_sets;
|
||||
std::vector<repair_hash_set> _peer_row_hash_sets;
|
||||
// Gate used to make sure pending operation of meta data is done
|
||||
seastar::gate _gate;
|
||||
sink_source_for_get_full_row_hashes _sink_source_for_get_full_row_hashes;
|
||||
@@ -653,6 +708,7 @@ public:
|
||||
uint64_t seed,
|
||||
repair_master master,
|
||||
uint32_t repair_meta_id,
|
||||
streaming::stream_reason reason,
|
||||
shard_config master_node_shard_config,
|
||||
size_t nr_peer_nodes = 1)
|
||||
: _db(db)
|
||||
@@ -666,6 +722,7 @@ public:
|
||||
, _repair_master(master)
|
||||
, _myip(utils::fb_utilities::get_broadcast_address())
|
||||
, _repair_meta_id(repair_meta_id)
|
||||
, _reason(reason)
|
||||
, _master_node_shard_config(std::move(master_node_shard_config))
|
||||
, _remote_partitioner(make_remote_partitioner())
|
||||
, _same_sharding_config(is_same_sharding_config())
|
||||
@@ -681,7 +738,7 @@ public:
|
||||
_seed,
|
||||
repair_reader::is_local_reader(_repair_master || _same_sharding_config)
|
||||
)
|
||||
, _repair_writer(_schema, _estimated_partitions, _nr_peer_nodes)
|
||||
, _repair_writer(_schema, _estimated_partitions, _nr_peer_nodes, _reason)
|
||||
, _sink_source_for_get_full_row_hashes(_repair_meta_id, _nr_peer_nodes,
|
||||
[] (uint32_t repair_meta_id, netw::messaging_service::msg_addr addr) {
|
||||
return netw::get_local_messaging_service().make_sink_and_source_for_repair_get_full_row_hashes_with_rpc_stream(repair_meta_id, addr);
|
||||
@@ -700,11 +757,12 @@ public:
|
||||
public:
|
||||
future<> stop() {
|
||||
auto gate_future = _gate.close();
|
||||
auto writer_future = _repair_writer.wait_for_writer_done();
|
||||
auto f1 = _sink_source_for_get_full_row_hashes.close();
|
||||
auto f2 = _sink_source_for_get_row_diff.close();
|
||||
auto f3 = _sink_source_for_put_row_diff.close();
|
||||
return when_all_succeed(std::move(gate_future), std::move(writer_future), std::move(f1), std::move(f2), std::move(f3));
|
||||
return when_all_succeed(std::move(gate_future), std::move(f1), std::move(f2), std::move(f3)).finally([this] {
|
||||
return _repair_writer.wait_for_writer_done();
|
||||
});
|
||||
}
|
||||
|
||||
static std::unordered_map<node_repair_meta_id, lw_shared_ptr<repair_meta>>& repair_meta_map() {
|
||||
@@ -731,7 +789,8 @@ public:
|
||||
uint64_t max_row_buf_size,
|
||||
uint64_t seed,
|
||||
shard_config master_node_shard_config,
|
||||
table_schema_version schema_version) {
|
||||
table_schema_version schema_version,
|
||||
streaming::stream_reason reason) {
|
||||
return service::get_schema_for_write(schema_version, {from, src_cpu_id}).then([from,
|
||||
repair_meta_id,
|
||||
range,
|
||||
@@ -739,7 +798,8 @@ public:
|
||||
max_row_buf_size,
|
||||
seed,
|
||||
master_node_shard_config,
|
||||
schema_version] (schema_ptr s) {
|
||||
schema_version,
|
||||
reason] (schema_ptr s) {
|
||||
auto& db = service::get_local_storage_proxy().get_db();
|
||||
auto& cf = db.local().find_column_family(s->id());
|
||||
node_repair_meta_id id{from, repair_meta_id};
|
||||
@@ -752,6 +812,7 @@ public:
|
||||
seed,
|
||||
repair_meta::repair_master::no,
|
||||
repair_meta_id,
|
||||
reason,
|
||||
std::move(master_node_shard_config));
|
||||
bool insertion = repair_meta_map().emplace(id, rm).second;
|
||||
if (!insertion) {
|
||||
@@ -829,9 +890,9 @@ public:
|
||||
}
|
||||
|
||||
// Must run inside a seastar thread
|
||||
static std::unordered_set<repair_hash>
|
||||
get_set_diff(const std::unordered_set<repair_hash>& x, const std::unordered_set<repair_hash>& y) {
|
||||
std::unordered_set<repair_hash> set_diff;
|
||||
static repair_hash_set
|
||||
get_set_diff(const repair_hash_set& x, const repair_hash_set& y) {
|
||||
repair_hash_set set_diff;
|
||||
// Note: std::set_difference requires x and y to be sorted.
|
||||
std::copy_if(x.begin(), x.end(), std::inserter(set_diff, set_diff.end()),
|
||||
[&y] (auto& item) { thread::maybe_yield(); return y.find(item) == y.end(); });
|
||||
@@ -849,14 +910,14 @@ public:
|
||||
|
||||
}
|
||||
|
||||
std::unordered_set<repair_hash>& peer_row_hash_sets(unsigned node_idx) {
|
||||
repair_hash_set& peer_row_hash_sets(unsigned node_idx) {
|
||||
return _peer_row_hash_sets[node_idx];
|
||||
}
|
||||
|
||||
// Get a list of row hashes in _working_row_buf
|
||||
future<std::unordered_set<repair_hash>>
|
||||
future<repair_hash_set>
|
||||
working_row_hashes() {
|
||||
return do_with(std::unordered_set<repair_hash>(), [this] (std::unordered_set<repair_hash>& hashes) {
|
||||
return do_with(repair_hash_set(), [this] (repair_hash_set& hashes) {
|
||||
return do_for_each(_working_row_buf, [&hashes] (repair_row& r) {
|
||||
hashes.emplace(r.hash());
|
||||
}).then([&hashes] {
|
||||
@@ -982,11 +1043,7 @@ private:
|
||||
return repair_hash(h.finalize_uint64());
|
||||
}
|
||||
|
||||
stop_iteration handle_mutation_fragment(mutation_fragment_opt mfopt, size_t& cur_size, size_t& new_rows_size, std::list<repair_row>& cur_rows) {
|
||||
if (!mfopt) {
|
||||
return stop_iteration::yes;
|
||||
}
|
||||
mutation_fragment& mf = *mfopt;
|
||||
stop_iteration handle_mutation_fragment(mutation_fragment& mf, size_t& cur_size, size_t& new_rows_size, std::list<repair_row>& cur_rows) {
|
||||
if (mf.is_partition_start()) {
|
||||
auto& start = mf.as_partition_start();
|
||||
_repair_reader.set_current_dk(start.key());
|
||||
@@ -1021,32 +1078,49 @@ private:
|
||||
}
|
||||
_gate.check();
|
||||
return _repair_reader.read_mutation_fragment().then([this, &cur_size, &new_rows_size, &cur_rows] (mutation_fragment_opt mfopt) mutable {
|
||||
return handle_mutation_fragment(std::move(mfopt), cur_size, new_rows_size, cur_rows);
|
||||
if (!mfopt) {
|
||||
_repair_reader.on_end_of_stream();
|
||||
return stop_iteration::yes;
|
||||
}
|
||||
return handle_mutation_fragment(*mfopt, cur_size, new_rows_size, cur_rows);
|
||||
});
|
||||
}).then([&cur_rows, &new_rows_size] () mutable {
|
||||
}).then_wrapped([this, &cur_rows, &new_rows_size] (future<> fut) mutable {
|
||||
if (fut.failed()) {
|
||||
_repair_reader.on_end_of_stream();
|
||||
return make_exception_future<std::list<repair_row>, size_t>(fut.get_exception());
|
||||
}
|
||||
_repair_reader.pause();
|
||||
return make_ready_future<std::list<repair_row>, size_t>(std::move(cur_rows), new_rows_size);
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
// Clears _row_buf using utils::clear_gently and returns the future from that call.
future<> clear_row_buf() {
|
||||
return utils::clear_gently(_row_buf);
|
||||
}
|
||||
|
||||
// Clears _working_row_buf using utils::clear_gently and, once that has completed,
// clears _working_row_buf_combined_hash as well.
future<> clear_working_row_buf() {
|
||||
return utils::clear_gently(_working_row_buf).then([this] {
|
||||
_working_row_buf_combined_hash.clear();
|
||||
});
|
||||
}
|
||||
|
||||
// Read rows from disk until _max_row_buf_size of rows are filled into _row_buf.
|
||||
// Calculate the combined checksum of the rows
|
||||
// Calculate the total size of the rows in _row_buf
|
||||
future<get_sync_boundary_response>
|
||||
get_sync_boundary(std::optional<repair_sync_boundary> skipped_sync_boundary) {
|
||||
auto f = make_ready_future<>();
|
||||
if (skipped_sync_boundary) {
|
||||
_current_sync_boundary = skipped_sync_boundary;
|
||||
_row_buf.clear();
|
||||
_working_row_buf.clear();
|
||||
_working_row_buf_combined_hash.clear();
|
||||
} else {
|
||||
_working_row_buf.clear();
|
||||
_working_row_buf_combined_hash.clear();
|
||||
f = clear_row_buf();
|
||||
}
|
||||
// Here is the place we update _last_sync_boundary
|
||||
rlogger.trace("SET _last_sync_boundary from {} to {}", _last_sync_boundary, _current_sync_boundary);
|
||||
_last_sync_boundary = _current_sync_boundary;
|
||||
return row_buf_size().then([this, sb = std::move(skipped_sync_boundary)] (size_t cur_size) {
|
||||
return f.then([this, sb = std::move(skipped_sync_boundary)] () mutable {
|
||||
return clear_working_row_buf().then([this, sb = sb] () mutable {
|
||||
return row_buf_size().then([this, sb = std::move(sb)] (size_t cur_size) {
|
||||
return read_rows_from_disk(cur_size).then([this, sb = std::move(sb)] (std::list<repair_row> new_rows, size_t new_rows_size) mutable {
|
||||
size_t new_rows_nr = new_rows.size();
|
||||
_row_buf.splice(_row_buf.end(), new_rows);
|
||||
@@ -1063,6 +1137,8 @@ private:
|
||||
});
|
||||
});
|
||||
});
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
future<> move_row_buf_to_working_row_buf() {
|
||||
@@ -1138,9 +1214,9 @@ private:
|
||||
}
|
||||
|
||||
future<std::list<repair_row>>
|
||||
copy_rows_from_working_row_buf_within_set_diff(std::unordered_set<repair_hash> set_diff) {
|
||||
copy_rows_from_working_row_buf_within_set_diff(repair_hash_set set_diff) {
|
||||
return do_with(std::list<repair_row>(), std::move(set_diff),
|
||||
[this] (std::list<repair_row>& rows, std::unordered_set<repair_hash>& set_diff) {
|
||||
[this] (std::list<repair_row>& rows, repair_hash_set& set_diff) {
|
||||
return do_for_each(_working_row_buf, [this, &set_diff, &rows] (const repair_row& r) {
|
||||
if (set_diff.count(r.hash()) > 0) {
|
||||
rows.push_back(r);
|
||||
@@ -1155,7 +1231,7 @@ private:
|
||||
// Given a set of row hashes, return the corresponding rows
|
||||
// If needs_all_rows is set, return all the rows in _working_row_buf, ignore the set_diff
|
||||
future<std::list<repair_row>>
|
||||
get_row_diff(std::unordered_set<repair_hash> set_diff, needs_all_rows_t needs_all_rows = needs_all_rows_t::no) {
|
||||
get_row_diff(repair_hash_set set_diff, needs_all_rows_t needs_all_rows = needs_all_rows_t::no) {
|
||||
if (needs_all_rows) {
|
||||
if (!_repair_master || _nr_peer_nodes == 1) {
|
||||
return make_ready_future<std::list<repair_row>>(std::move(_working_row_buf));
|
||||
@@ -1166,6 +1242,32 @@ private:
|
||||
}
|
||||
}
|
||||
|
||||
// Writes every row in row_diff through _repair_writer for the peer at node_idx.
// row_diff is kept alive by do_with for the duration of the operation, and the whole
// write loop runs while holding one unit of the repair writer's semaphore.
// Rows are consumed from the front of the list one at a time: each row is popped only
// after its do_write() future resolves, and the loop stops when the list is empty.
// When update_buf is set, each row's hash is also added to _working_row_buf_combined_hash.
future<> do_apply_rows(std::list<repair_row>&& row_diff, unsigned node_idx, update_working_row_buf update_buf) {
|
||||
return do_with(std::move(row_diff), [this, node_idx, update_buf] (std::list<repair_row>& row_diff) {
|
||||
return with_semaphore(_repair_writer.sem(), 1, [this, node_idx, update_buf, &row_diff] {
|
||||
_repair_writer.create_writer(_db, node_idx);
|
||||
return repeat([this, node_idx, update_buf, &row_diff] () mutable {
|
||||
if (row_diff.empty()) {
|
||||
return make_ready_future<stop_iteration>(stop_iteration::yes);
|
||||
}
|
||||
repair_row& r = row_diff.front();
|
||||
if (update_buf) {
|
||||
_working_row_buf_combined_hash.add(r.hash());
|
||||
}
|
||||
// The repair_row here is supposed to have
|
||||
// mutation_fragment attached because we have stored it in
|
||||
// to_repair_rows_list above where the repair_row is created.
|
||||
mutation_fragment mf = std::move(r.get_mutation_fragment());
|
||||
auto dk_with_hash = r.get_dk_with_hash();
|
||||
return _repair_writer.do_write(node_idx, std::move(dk_with_hash), std::move(mf)).then([&row_diff] {
|
||||
row_diff.pop_front();
|
||||
return make_ready_future<stop_iteration>(stop_iteration::no);
|
||||
});
|
||||
});
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
// Given a list of rows, apply the rows to disk and update the _working_row_buf and _peer_row_hash_sets if requested
|
||||
// Must run inside a seastar thread
|
||||
void apply_rows_on_master_in_thread(repair_rows_on_wire rows, gms::inet_address from, update_working_row_buf update_buf,
|
||||
@@ -1179,30 +1281,17 @@ private:
|
||||
stats().rx_row_nr += row_diff.size();
|
||||
stats().rx_row_nr_peer[from] += row_diff.size();
|
||||
if (update_buf) {
|
||||
std::list<repair_row> tmp;
|
||||
tmp.swap(_working_row_buf);
|
||||
// Both row_diff and _working_row_buf are ordered, merging
|
||||
// two sorted lists to make sure the combination of row_diff
|
||||
// and _working_row_buf are ordered.
|
||||
std::merge(tmp.begin(), tmp.end(), row_diff.begin(), row_diff.end(), std::back_inserter(_working_row_buf),
|
||||
[this] (const repair_row& x, const repair_row& y) { thread::maybe_yield(); return _cmp(x.boundary(), y.boundary()) < 0; });
|
||||
utils::merge_to_gently(_working_row_buf, row_diff,
|
||||
[this] (const repair_row& x, const repair_row& y) { return _cmp(x.boundary(), y.boundary()) < 0; });
|
||||
}
|
||||
if (update_hash_set) {
|
||||
_peer_row_hash_sets[node_idx] = boost::copy_range<std::unordered_set<repair_hash>>(row_diff |
|
||||
_peer_row_hash_sets[node_idx] = boost::copy_range<repair_hash_set>(row_diff |
|
||||
boost::adaptors::transformed([] (repair_row& r) { thread::maybe_yield(); return r.hash(); }));
|
||||
}
|
||||
_repair_writer.create_writer(_db, node_idx);
|
||||
for (auto& r : row_diff) {
|
||||
if (update_buf) {
|
||||
_working_row_buf_combined_hash.add(r.hash());
|
||||
}
|
||||
// The repair_row here is supposed to have
|
||||
// mutation_fragment attached because we have stored it in
|
||||
// to_repair_rows_list above where the repair_row is created.
|
||||
mutation_fragment mf = std::move(r.get_mutation_fragment());
|
||||
auto dk_with_hash = r.get_dk_with_hash();
|
||||
_repair_writer.do_write(node_idx, std::move(dk_with_hash), std::move(mf)).get();
|
||||
}
|
||||
do_apply_rows(std::move(row_diff), node_idx, update_buf).get();
|
||||
}
|
||||
|
||||
future<>
|
||||
@@ -1210,19 +1299,9 @@ private:
|
||||
if (rows.empty()) {
|
||||
return make_ready_future<>();
|
||||
}
|
||||
return to_repair_rows_list(rows).then([this] (std::list<repair_row> row_diff) {
|
||||
return do_with(std::move(row_diff), [this] (std::list<repair_row>& row_diff) {
|
||||
unsigned node_idx = 0;
|
||||
_repair_writer.create_writer(_db, node_idx);
|
||||
return do_for_each(row_diff, [this, node_idx] (repair_row& r) {
|
||||
// The repair_row here is supposed to have
|
||||
// mutation_fragment attached because we have stored it in
|
||||
// to_repair_rows_list above where the repair_row is created.
|
||||
mutation_fragment mf = std::move(r.get_mutation_fragment());
|
||||
auto dk_with_hash = r.get_dk_with_hash();
|
||||
return _repair_writer.do_write(node_idx, std::move(dk_with_hash), std::move(mf));
|
||||
});
|
||||
});
|
||||
return to_repair_rows_list(std::move(rows)).then([this] (std::list<repair_row> row_diff) {
|
||||
unsigned node_idx = 0;
|
||||
return do_apply_rows(std::move(row_diff), node_idx, update_working_row_buf::no);
|
||||
});
|
||||
}
|
||||
|
||||
@@ -1301,13 +1380,13 @@ private:
|
||||
public:
|
||||
// RPC API
|
||||
// Return the hashes of the rows in _working_row_buf
|
||||
future<std::unordered_set<repair_hash>>
|
||||
future<repair_hash_set>
|
||||
get_full_row_hashes(gms::inet_address remote_node) {
|
||||
if (remote_node == _myip) {
|
||||
return get_full_row_hashes_handler();
|
||||
}
|
||||
return netw::get_local_messaging_service().send_repair_get_full_row_hashes(msg_addr(remote_node),
|
||||
_repair_meta_id).then([this, remote_node] (std::unordered_set<repair_hash> hashes) {
|
||||
_repair_meta_id).then([this, remote_node] (repair_hash_set hashes) {
|
||||
rlogger.debug("Got full hashes from peer={}, nr_hashes={}", remote_node, hashes.size());
|
||||
_metrics.rx_hashes_nr += hashes.size();
|
||||
stats().rx_hashes_nr += hashes.size();
|
||||
@@ -1318,7 +1397,7 @@ public:
|
||||
|
||||
private:
|
||||
future<> get_full_row_hashes_source_op(
|
||||
lw_shared_ptr<std::unordered_set<repair_hash>> current_hashes,
|
||||
lw_shared_ptr<repair_hash_set> current_hashes,
|
||||
gms::inet_address remote_node,
|
||||
unsigned node_idx,
|
||||
rpc::source<repair_hash_with_cmd>& source) {
|
||||
@@ -1356,12 +1435,12 @@ private:
|
||||
}
|
||||
|
||||
public:
|
||||
future<std::unordered_set<repair_hash>>
|
||||
future<repair_hash_set>
|
||||
get_full_row_hashes_with_rpc_stream(gms::inet_address remote_node, unsigned node_idx) {
|
||||
if (remote_node == _myip) {
|
||||
return get_full_row_hashes_handler();
|
||||
}
|
||||
auto current_hashes = make_lw_shared<std::unordered_set<repair_hash>>();
|
||||
auto current_hashes = make_lw_shared<repair_hash_set>();
|
||||
return _sink_source_for_get_full_row_hashes.get_sink_source(remote_node, node_idx).then(
|
||||
[this, current_hashes, remote_node, node_idx]
|
||||
(rpc::sink<repair_stream_cmd>& sink, rpc::source<repair_hash_with_cmd>& source) mutable {
|
||||
@@ -1376,7 +1455,7 @@ public:
|
||||
}
|
||||
|
||||
// RPC handler
|
||||
future<std::unordered_set<repair_hash>>
|
||||
future<repair_hash_set>
|
||||
get_full_row_hashes_handler() {
|
||||
return with_gate(_gate, [this] {
|
||||
return working_row_hashes();
|
||||
@@ -1412,28 +1491,28 @@ public:
|
||||
|
||||
// RPC API
|
||||
future<>
|
||||
repair_row_level_start(gms::inet_address remote_node, sstring ks_name, sstring cf_name, dht::token_range range, table_schema_version schema_version) {
|
||||
repair_row_level_start(gms::inet_address remote_node, sstring ks_name, sstring cf_name, dht::token_range range, table_schema_version schema_version, streaming::stream_reason reason) {
|
||||
if (remote_node == _myip) {
|
||||
return make_ready_future<>();
|
||||
}
|
||||
stats().rpc_call_nr++;
|
||||
return netw::get_local_messaging_service().send_repair_row_level_start(msg_addr(remote_node),
|
||||
_repair_meta_id, std::move(ks_name), std::move(cf_name), std::move(range), _algo, _max_row_buf_size, _seed,
|
||||
_master_node_shard_config.shard, _master_node_shard_config.shard_count, _master_node_shard_config.ignore_msb, _master_node_shard_config.partitioner_name, std::move(schema_version));
|
||||
_master_node_shard_config.shard, _master_node_shard_config.shard_count, _master_node_shard_config.ignore_msb, _master_node_shard_config.partitioner_name, std::move(schema_version), reason);
|
||||
}
|
||||
|
||||
// RPC handler
|
||||
static future<>
|
||||
repair_row_level_start_handler(gms::inet_address from, uint32_t src_cpu_id, uint32_t repair_meta_id, sstring ks_name, sstring cf_name,
|
||||
dht::token_range range, row_level_diff_detect_algorithm algo, uint64_t max_row_buf_size,
|
||||
uint64_t seed, shard_config master_node_shard_config, table_schema_version schema_version) {
|
||||
uint64_t seed, shard_config master_node_shard_config, table_schema_version schema_version, streaming::stream_reason reason) {
|
||||
if (!_sys_dist_ks->local_is_initialized() || !_view_update_generator->local_is_initialized()) {
|
||||
return make_exception_future<>(std::runtime_error(format("Node {} is not fully initialized for repair, try again later",
|
||||
utils::fb_utilities::get_broadcast_address())));
|
||||
}
|
||||
rlogger.debug(">>> Started Row Level Repair (Follower): local={}, peers={}, repair_meta_id={}, keyspace={}, cf={}, schema_version={}, range={}, seed={}, max_row_buf_siz={}",
|
||||
utils::fb_utilities::get_broadcast_address(), from, repair_meta_id, ks_name, cf_name, schema_version, range, seed, max_row_buf_size);
|
||||
return insert_repair_meta(from, src_cpu_id, repair_meta_id, std::move(range), algo, max_row_buf_size, seed, std::move(master_node_shard_config), std::move(schema_version));
|
||||
return insert_repair_meta(from, src_cpu_id, repair_meta_id, std::move(range), algo, max_row_buf_size, seed, std::move(master_node_shard_config), std::move(schema_version), reason);
|
||||
}
|
||||
|
||||
// RPC API
|
||||
@@ -1509,7 +1588,7 @@ public:
|
||||
// RPC API
|
||||
// Return rows in the _working_row_buf with hash within the given set_diff
|
||||
// Must run inside a seastar thread
|
||||
void get_row_diff(std::unordered_set<repair_hash> set_diff, needs_all_rows_t needs_all_rows, gms::inet_address remote_node, unsigned node_idx) {
|
||||
void get_row_diff(repair_hash_set set_diff, needs_all_rows_t needs_all_rows, gms::inet_address remote_node, unsigned node_idx) {
|
||||
if (needs_all_rows || !set_diff.empty()) {
|
||||
if (remote_node == _myip) {
|
||||
return;
|
||||
@@ -1578,11 +1657,11 @@ private:
|
||||
}
|
||||
|
||||
future<> get_row_diff_sink_op(
|
||||
std::unordered_set<repair_hash> set_diff,
|
||||
repair_hash_set set_diff,
|
||||
needs_all_rows_t needs_all_rows,
|
||||
rpc::sink<repair_hash_with_cmd>& sink,
|
||||
gms::inet_address remote_node) {
|
||||
return do_with(std::move(set_diff), [needs_all_rows, remote_node, &sink] (std::unordered_set<repair_hash>& set_diff) mutable {
|
||||
return do_with(std::move(set_diff), [needs_all_rows, remote_node, &sink] (repair_hash_set& set_diff) mutable {
|
||||
if (inject_rpc_stream_error) {
|
||||
return make_exception_future<>(std::runtime_error("get_row_diff: Inject sender error in sink loop"));
|
||||
}
|
||||
@@ -1609,7 +1688,7 @@ private:
|
||||
public:
|
||||
// Must run inside a seastar thread
|
||||
void get_row_diff_with_rpc_stream(
|
||||
std::unordered_set<repair_hash> set_diff,
|
||||
repair_hash_set set_diff,
|
||||
needs_all_rows_t needs_all_rows,
|
||||
update_peer_row_hash_sets update_hash_set,
|
||||
gms::inet_address remote_node,
|
||||
@@ -1635,7 +1714,7 @@ public:
|
||||
}
|
||||
|
||||
// RPC handler
|
||||
future<repair_rows_on_wire> get_row_diff_handler(std::unordered_set<repair_hash> set_diff, needs_all_rows_t needs_all_rows) {
|
||||
future<repair_rows_on_wire> get_row_diff_handler(repair_hash_set set_diff, needs_all_rows_t needs_all_rows) {
|
||||
return with_gate(_gate, [this, set_diff = std::move(set_diff), needs_all_rows] () mutable {
|
||||
return get_row_diff(std::move(set_diff), needs_all_rows).then([this] (std::list<repair_row> row_diff) {
|
||||
return to_repair_rows_on_wire(std::move(row_diff));
|
||||
@@ -1645,15 +1724,16 @@ public:
|
||||
|
||||
// RPC API
|
||||
// Send rows in the _working_row_buf with hash within the given set_diff
|
||||
future<> put_row_diff(std::unordered_set<repair_hash> set_diff, needs_all_rows_t needs_all_rows, gms::inet_address remote_node) {
|
||||
future<> put_row_diff(repair_hash_set set_diff, needs_all_rows_t needs_all_rows, gms::inet_address remote_node) {
|
||||
if (!set_diff.empty()) {
|
||||
if (remote_node == _myip) {
|
||||
return make_ready_future<>();
|
||||
}
|
||||
auto sz = set_diff.size();
|
||||
size_t sz = set_diff.size();
|
||||
return get_row_diff(std::move(set_diff), needs_all_rows).then([this, remote_node, sz] (std::list<repair_row> row_diff) {
|
||||
if (row_diff.size() != sz) {
|
||||
throw std::runtime_error("row_diff.size() != set_diff.size()");
|
||||
rlogger.warn("Hash conflict detected, keyspace={}, table={}, range={}, row_diff.size={}, set_diff.size={}. It is recommended to compact the table and rerun repair for the range.",
|
||||
_schema->ks_name(), _schema->cf_name(), _range, row_diff.size(), sz);
|
||||
}
|
||||
return do_with(std::move(row_diff), [this, remote_node] (std::list<repair_row>& row_diff) {
|
||||
return get_repair_rows_size(row_diff).then([this, remote_node, &row_diff] (size_t row_bytes) mutable {
|
||||
@@ -1720,17 +1800,18 @@ private:
|
||||
|
||||
public:
|
||||
future<> put_row_diff_with_rpc_stream(
|
||||
std::unordered_set<repair_hash> set_diff,
|
||||
repair_hash_set set_diff,
|
||||
needs_all_rows_t needs_all_rows,
|
||||
gms::inet_address remote_node, unsigned node_idx) {
|
||||
if (!set_diff.empty()) {
|
||||
if (remote_node == _myip) {
|
||||
return make_ready_future<>();
|
||||
}
|
||||
auto sz = set_diff.size();
|
||||
size_t sz = set_diff.size();
|
||||
return get_row_diff(std::move(set_diff), needs_all_rows).then([this, remote_node, node_idx, sz] (std::list<repair_row> row_diff) {
|
||||
if (row_diff.size() != sz) {
|
||||
throw std::runtime_error("row_diff.size() != set_diff.size()");
|
||||
rlogger.warn("Hash conflict detected, keyspace={}, table={}, range={}, row_diff.size={}, set_diff.size={}. It is recommended to compact the table and rerun repair for the range.",
|
||||
_schema->ks_name(), _schema->cf_name(), _range, row_diff.size(), sz);
|
||||
}
|
||||
return do_with(std::move(row_diff), [this, remote_node, node_idx] (std::list<repair_row>& row_diff) {
|
||||
return get_repair_rows_size(row_diff).then([this, remote_node, node_idx, &row_diff] (size_t row_bytes) mutable {
|
||||
@@ -1769,7 +1850,7 @@ static future<stop_iteration> repair_get_row_diff_with_rpc_stream_process_op(
|
||||
rpc::sink<repair_row_on_wire_with_cmd> sink,
|
||||
rpc::source<repair_hash_with_cmd> source,
|
||||
bool &error,
|
||||
std::unordered_set<repair_hash>& current_set_diff,
|
||||
repair_hash_set& current_set_diff,
|
||||
std::optional<std::tuple<repair_hash_with_cmd>> hash_cmd_opt) {
|
||||
repair_hash_with_cmd hash_cmd = std::get<0>(hash_cmd_opt.value());
|
||||
rlogger.trace("Got repair_hash_with_cmd from peer={}, hash={}, cmd={}", from, hash_cmd.hash, int(hash_cmd.cmd));
|
||||
@@ -1782,7 +1863,7 @@ static future<stop_iteration> repair_get_row_diff_with_rpc_stream_process_op(
|
||||
}
|
||||
bool needs_all_rows = hash_cmd.cmd == repair_stream_cmd::needs_all_rows;
|
||||
_metrics.rx_hashes_nr += current_set_diff.size();
|
||||
auto fp = make_foreign(std::make_unique<std::unordered_set<repair_hash>>(std::move(current_set_diff)));
|
||||
auto fp = make_foreign(std::make_unique<repair_hash_set>(std::move(current_set_diff)));
|
||||
return smp::submit_to(src_cpu_id % smp::count, [from, repair_meta_id, needs_all_rows, fp = std::move(fp)] {
|
||||
auto rm = repair_meta::get_repair_meta(from, repair_meta_id);
|
||||
if (fp.get_owner_shard() == engine().cpu_id()) {
|
||||
@@ -1860,12 +1941,12 @@ static future<stop_iteration> repair_get_full_row_hashes_with_rpc_stream_process
|
||||
if (status == repair_stream_cmd::get_full_row_hashes) {
|
||||
return smp::submit_to(src_cpu_id % smp::count, [from, repair_meta_id] {
|
||||
auto rm = repair_meta::get_repair_meta(from, repair_meta_id);
|
||||
return rm->get_full_row_hashes_handler().then([] (std::unordered_set<repair_hash> hashes) {
|
||||
return rm->get_full_row_hashes_handler().then([] (repair_hash_set hashes) {
|
||||
_metrics.tx_hashes_nr += hashes.size();
|
||||
return hashes;
|
||||
});
|
||||
}).then([sink] (std::unordered_set<repair_hash> hashes) mutable {
|
||||
return do_with(std::move(hashes), [sink] (std::unordered_set<repair_hash>& hashes) mutable {
|
||||
}).then([sink] (repair_hash_set hashes) mutable {
|
||||
return do_with(std::move(hashes), [sink] (repair_hash_set& hashes) mutable {
|
||||
return do_for_each(hashes, [sink] (const repair_hash& hash) mutable {
|
||||
return sink(repair_hash_with_cmd{repair_stream_cmd::hash_data, hash});
|
||||
}).then([sink] () mutable {
|
||||
@@ -1888,7 +1969,7 @@ static future<> repair_get_row_diff_with_rpc_stream_handler(
|
||||
uint32_t repair_meta_id,
|
||||
rpc::sink<repair_row_on_wire_with_cmd> sink,
|
||||
rpc::source<repair_hash_with_cmd> source) {
|
||||
return do_with(false, std::unordered_set<repair_hash>(), [from, src_cpu_id, repair_meta_id, sink, source] (bool& error, std::unordered_set<repair_hash>& current_set_diff) mutable {
|
||||
return do_with(false, repair_hash_set(), [from, src_cpu_id, repair_meta_id, sink, source] (bool& error, repair_hash_set& current_set_diff) mutable {
|
||||
return repeat([from, src_cpu_id, repair_meta_id, sink, source, &error, ¤t_set_diff] () mutable {
|
||||
return source().then([from, src_cpu_id, repair_meta_id, sink, source, &error, ¤t_set_diff] (std::optional<std::tuple<repair_hash_with_cmd>> hash_cmd_opt) mutable {
|
||||
if (hash_cmd_opt) {
|
||||
@@ -1904,22 +1985,17 @@ static future<> repair_get_row_diff_with_rpc_stream_handler(
|
||||
current_set_diff,
|
||||
std::move(hash_cmd_opt)).handle_exception([sink, &error] (std::exception_ptr ep) mutable {
|
||||
error = true;
|
||||
return sink(repair_row_on_wire_with_cmd{repair_stream_cmd::error, repair_row_on_wire()}).then([sink] () mutable {
|
||||
return sink.close();
|
||||
}).then([sink] {
|
||||
return sink(repair_row_on_wire_with_cmd{repair_stream_cmd::error, repair_row_on_wire()}).then([] {
|
||||
return make_ready_future<stop_iteration>(stop_iteration::no);
|
||||
});
|
||||
});
|
||||
} else {
|
||||
if (error) {
|
||||
return make_ready_future<stop_iteration>(stop_iteration::yes);
|
||||
}
|
||||
return sink.close().then([sink] {
|
||||
return make_ready_future<stop_iteration>(stop_iteration::yes);
|
||||
});
|
||||
return make_ready_future<stop_iteration>(stop_iteration::yes);
|
||||
}
|
||||
});
|
||||
});
|
||||
}).finally([sink] () mutable {
|
||||
return sink.close().finally([sink] { });
|
||||
});
|
||||
}
|
||||
|
||||
@@ -1945,22 +2021,17 @@ static future<> repair_put_row_diff_with_rpc_stream_handler(
|
||||
current_rows,
|
||||
std::move(row_opt)).handle_exception([sink, &error] (std::exception_ptr ep) mutable {
|
||||
error = true;
|
||||
return sink(repair_stream_cmd::error).then([sink] () mutable {
|
||||
return sink.close();
|
||||
}).then([sink] {
|
||||
return sink(repair_stream_cmd::error).then([] {
|
||||
return make_ready_future<stop_iteration>(stop_iteration::no);
|
||||
});
|
||||
});
|
||||
} else {
|
||||
if (error) {
|
||||
return make_ready_future<stop_iteration>(stop_iteration::yes);
|
||||
}
|
||||
return sink.close().then([sink] {
|
||||
return make_ready_future<stop_iteration>(stop_iteration::yes);
|
||||
});
|
||||
return make_ready_future<stop_iteration>(stop_iteration::yes);
|
||||
}
|
||||
});
|
||||
});
|
||||
}).finally([sink] () mutable {
|
||||
return sink.close().finally([sink] { });
|
||||
});
|
||||
}
|
||||
|
||||
@@ -1985,22 +2056,17 @@ static future<> repair_get_full_row_hashes_with_rpc_stream_handler(
|
||||
error,
|
||||
std::move(status_opt)).handle_exception([sink, &error] (std::exception_ptr ep) mutable {
|
||||
error = true;
|
||||
return sink(repair_hash_with_cmd{repair_stream_cmd::error, repair_hash()}).then([sink] () mutable {
|
||||
return sink.close();
|
||||
}).then([sink] {
|
||||
return sink(repair_hash_with_cmd{repair_stream_cmd::error, repair_hash()}).then([] () {
|
||||
return make_ready_future<stop_iteration>(stop_iteration::no);
|
||||
});
|
||||
});
|
||||
} else {
|
||||
if (error) {
|
||||
return make_ready_future<stop_iteration>(stop_iteration::yes);
|
||||
}
|
||||
return sink.close().then([sink] {
|
||||
return make_ready_future<stop_iteration>(stop_iteration::yes);
|
||||
});
|
||||
return make_ready_future<stop_iteration>(stop_iteration::yes);
|
||||
}
|
||||
});
|
||||
});
|
||||
}).finally([sink] () mutable {
|
||||
return sink.close().finally([sink] { });
|
||||
});
|
||||
}
|
||||
|
||||
@@ -2046,7 +2112,7 @@ future<> repair_init_messaging_service_handler(repair_service& rs, distributed<d
|
||||
auto from = cinfo.retrieve_auxiliary<gms::inet_address>("baddr");
|
||||
return smp::submit_to(src_cpu_id % smp::count, [from, repair_meta_id] {
|
||||
auto rm = repair_meta::get_repair_meta(from, repair_meta_id);
|
||||
return rm->get_full_row_hashes_handler().then([] (std::unordered_set<repair_hash> hashes) {
|
||||
return rm->get_full_row_hashes_handler().then([] (repair_hash_set hashes) {
|
||||
_metrics.tx_hashes_nr += hashes.size();
|
||||
return hashes;
|
||||
});
|
||||
@@ -2074,11 +2140,11 @@ future<> repair_init_messaging_service_handler(repair_service& rs, distributed<d
|
||||
});
|
||||
});
|
||||
ms.register_repair_get_row_diff([] (const rpc::client_info& cinfo, uint32_t repair_meta_id,
|
||||
std::unordered_set<repair_hash> set_diff, bool needs_all_rows) {
|
||||
repair_hash_set set_diff, bool needs_all_rows) {
|
||||
auto src_cpu_id = cinfo.retrieve_auxiliary<uint32_t>("src_cpu_id");
|
||||
auto from = cinfo.retrieve_auxiliary<gms::inet_address>("baddr");
|
||||
_metrics.rx_hashes_nr += set_diff.size();
|
||||
auto fp = make_foreign(std::make_unique<std::unordered_set<repair_hash>>(std::move(set_diff)));
|
||||
auto fp = make_foreign(std::make_unique<repair_hash_set>(std::move(set_diff)));
|
||||
return smp::submit_to(src_cpu_id % smp::count, [from, repair_meta_id, fp = std::move(fp), needs_all_rows] () mutable {
|
||||
auto rm = repair_meta::get_repair_meta(from, repair_meta_id);
|
||||
if (fp.get_owner_shard() == engine().cpu_id()) {
|
||||
@@ -2104,15 +2170,16 @@ future<> repair_init_messaging_service_handler(repair_service& rs, distributed<d
|
||||
});
|
||||
ms.register_repair_row_level_start([] (const rpc::client_info& cinfo, uint32_t repair_meta_id, sstring ks_name,
|
||||
sstring cf_name, dht::token_range range, row_level_diff_detect_algorithm algo, uint64_t max_row_buf_size, uint64_t seed,
|
||||
unsigned remote_shard, unsigned remote_shard_count, unsigned remote_ignore_msb, sstring remote_partitioner_name, table_schema_version schema_version) {
|
||||
unsigned remote_shard, unsigned remote_shard_count, unsigned remote_ignore_msb, sstring remote_partitioner_name, table_schema_version schema_version, rpc::optional<streaming::stream_reason> reason) {
|
||||
auto src_cpu_id = cinfo.retrieve_auxiliary<uint32_t>("src_cpu_id");
|
||||
auto from = cinfo.retrieve_auxiliary<gms::inet_address>("baddr");
|
||||
return smp::submit_to(src_cpu_id % smp::count, [from, src_cpu_id, repair_meta_id, ks_name, cf_name,
|
||||
range, algo, max_row_buf_size, seed, remote_shard, remote_shard_count, remote_ignore_msb, remote_partitioner_name, schema_version] () mutable {
|
||||
range, algo, max_row_buf_size, seed, remote_shard, remote_shard_count, remote_ignore_msb, remote_partitioner_name, schema_version, reason] () mutable {
|
||||
streaming::stream_reason r = reason ? *reason : streaming::stream_reason::repair;
|
||||
return repair_meta::repair_row_level_start_handler(from, src_cpu_id, repair_meta_id, std::move(ks_name),
|
||||
std::move(cf_name), std::move(range), algo, max_row_buf_size, seed,
|
||||
shard_config{remote_shard, remote_shard_count, remote_ignore_msb, std::move(remote_partitioner_name)},
|
||||
schema_version);
|
||||
schema_version, r);
|
||||
});
|
||||
});
|
||||
ms.register_repair_row_level_stop([] (const rpc::client_info& cinfo, uint32_t repair_meta_id,
|
||||
@@ -2145,6 +2212,25 @@ future<> repair_init_messaging_service_handler(repair_service& rs, distributed<d
|
||||
});
|
||||
}
|
||||
|
||||
// Unregisters the repair RPC verbs listed below from the messaging service.
// The unregistration runs through invoke_on_all() on the object returned by
// netw::get_messaging_service(), i.e. once per messaging_service instance.
// The returned future resolves once all of the unregister calls have
// succeeded (when_all_succeed); their individual results are discarded.
// NOTE(review): presumably the teardown counterpart of
// repair_init_messaging_service_handler() - confirm that every verb registered
// there is unregistered here.
future<> repair_uninit_messaging_service_handler() {
|
||||
return netw::get_messaging_service().invoke_on_all([] (auto& ms) {
|
||||
return when_all_succeed(
|
||||
ms.unregister_repair_get_row_diff_with_rpc_stream(),
|
||||
ms.unregister_repair_put_row_diff_with_rpc_stream(),
|
||||
ms.unregister_repair_get_full_row_hashes_with_rpc_stream(),
|
||||
ms.unregister_repair_get_full_row_hashes(),
|
||||
ms.unregister_repair_get_combined_row_hash(),
|
||||
ms.unregister_repair_get_sync_boundary(),
|
||||
ms.unregister_repair_get_row_diff(),
|
||||
ms.unregister_repair_put_row_diff(),
|
||||
ms.unregister_repair_row_level_start(),
|
||||
ms.unregister_repair_row_level_stop(),
|
||||
ms.unregister_repair_get_estimated_partitions(),
|
||||
ms.unregister_repair_set_estimated_partitions(),
|
||||
ms.unregister_repair_get_diff_algorithms()).discard_result();
|
||||
});
|
||||
}
|
||||
|
||||
class row_level_repair {
|
||||
repair_info& _ri;
|
||||
sstring _cf_name;
|
||||
@@ -2374,7 +2460,7 @@ private:
|
||||
// sequentially because the rows from repair follower 1 to
|
||||
// repair master might reduce the amount of missing data
|
||||
// between repair master and repair follower 2.
|
||||
std::unordered_set<repair_hash> set_diff = repair_meta::get_set_diff(master.peer_row_hash_sets(node_idx), master.working_row_hashes().get0());
|
||||
repair_hash_set set_diff = repair_meta::get_set_diff(master.peer_row_hash_sets(node_idx), master.working_row_hashes().get0());
|
||||
// Request missing sets from peer node
|
||||
rlogger.debug("Before get_row_diff to node {}, local={}, peer={}, set_diff={}",
|
||||
node, master.working_row_hashes().get0().size(), master.peer_row_hash_sets(node_idx).size(), set_diff.size());
|
||||
@@ -2397,9 +2483,9 @@ private:
|
||||
// So we can figure out which rows peer node are missing and send the missing rows to them
|
||||
check_in_shutdown();
|
||||
_ri.check_in_abort();
|
||||
std::unordered_set<repair_hash> local_row_hash_sets = master.working_row_hashes().get0();
|
||||
repair_hash_set local_row_hash_sets = master.working_row_hashes().get0();
|
||||
auto sz = _all_live_peer_nodes.size();
|
||||
std::vector<std::unordered_set<repair_hash>> set_diffs(sz);
|
||||
std::vector<repair_hash_set> set_diffs(sz);
|
||||
for (size_t idx : boost::irange(size_t(0), sz)) {
|
||||
set_diffs[idx] = repair_meta::get_set_diff(local_row_hash_sets, master.peer_row_hash_sets(idx));
|
||||
}
|
||||
@@ -2442,6 +2528,7 @@ public:
|
||||
_seed,
|
||||
repair_meta::repair_master::yes,
|
||||
repair_meta_id,
|
||||
_ri.reason,
|
||||
std::move(master_node_shard_config),
|
||||
_all_live_peer_nodes.size());
|
||||
|
||||
@@ -2456,7 +2543,7 @@ public:
|
||||
nodes_to_stop.reserve(_all_nodes.size());
|
||||
try {
|
||||
parallel_for_each(_all_nodes, [&, this] (const gms::inet_address& node) {
|
||||
return master.repair_row_level_start(node, _ri.keyspace, _cf_name, _range, schema_version).then([&] () {
|
||||
return master.repair_row_level_start(node, _ri.keyspace, _cf_name, _range, schema_version, _ri.reason).then([&] () {
|
||||
nodes_to_stop.push_back(node);
|
||||
return master.repair_get_estimated_partitions(node).then([this, node] (uint64_t partitions) {
|
||||
rlogger.trace("Get repair_get_estimated_partitions for node={}, estimated_partitions={}", node, partitions);
|
||||
|
||||
@@ -45,6 +45,7 @@ private:
|
||||
};
|
||||
|
||||
future<> repair_init_messaging_service_handler(repair_service& rs, distributed<db::system_distributed_keyspace>& sys_dist_ks, distributed<db::view::view_update_generator>& view_update_generator);
|
||||
future<> repair_uninit_messaging_service_handler();
|
||||
|
||||
class repair_info;
|
||||
|
||||
|
||||
20
row_cache.cc
20
row_cache.cc
@@ -528,8 +528,12 @@ public:
|
||||
return _reader.move_to_next_partition(timeout).then([this] (auto&& mfopt) mutable {
|
||||
{
|
||||
if (!mfopt) {
|
||||
this->handle_end_of_stream();
|
||||
return make_ready_future<flat_mutation_reader_opt, mutation_fragment_opt>(std::nullopt, std::nullopt);
|
||||
return _cache._read_section(_cache._tracker.region(), [&] {
|
||||
return with_linearized_managed_bytes([&] {
|
||||
this->handle_end_of_stream();
|
||||
return make_ready_future<flat_mutation_reader_opt, mutation_fragment_opt>(std::nullopt, std::nullopt);
|
||||
});
|
||||
});
|
||||
}
|
||||
_cache.on_partition_miss();
|
||||
const partition_start& ps = mfopt->as_partition_start();
|
||||
@@ -952,13 +956,15 @@ future<> row_cache::do_update(external_updater eu, memtable& m, Updater updater)
|
||||
// expensive and we need to amortize it somehow.
|
||||
do {
|
||||
STAP_PROBE(scylla, row_cache_update_partition_start);
|
||||
with_linearized_managed_bytes([&] {
|
||||
{
|
||||
if (!update) {
|
||||
_update_section(_tracker.region(), [&] {
|
||||
with_linearized_managed_bytes([&] {
|
||||
memtable_entry& mem_e = *m.partitions.begin();
|
||||
size_entry = mem_e.size_in_allocator_without_rows(_tracker.allocator());
|
||||
auto cache_i = _partitions.lower_bound(mem_e.key(), cmp);
|
||||
update = updater(_update_section, cache_i, mem_e, is_present, real_dirty_acc);
|
||||
});
|
||||
});
|
||||
}
|
||||
// We use cooperative deferring instead of futures so that
|
||||
@@ -970,14 +976,16 @@ future<> row_cache::do_update(external_updater eu, memtable& m, Updater updater)
|
||||
update = {};
|
||||
real_dirty_acc.unpin_memory(size_entry);
|
||||
_update_section(_tracker.region(), [&] {
|
||||
with_linearized_managed_bytes([&] {
|
||||
auto i = m.partitions.begin();
|
||||
memtable_entry& mem_e = *i;
|
||||
m.partitions.erase(i);
|
||||
mem_e.partition().evict(_tracker.memtable_cleaner());
|
||||
current_allocator().destroy(&mem_e);
|
||||
});
|
||||
});
|
||||
++partition_count;
|
||||
});
|
||||
}
|
||||
STAP_PROBE(scylla, row_cache_update_partition_end);
|
||||
} while (!m.partitions.empty() && !need_preempt());
|
||||
with_allocator(standard_allocator(), [&] {
|
||||
@@ -1124,8 +1132,8 @@ future<> row_cache::invalidate(external_updater eu, dht::partition_range_vector&
|
||||
seastar::thread::maybe_yield();
|
||||
|
||||
while (true) {
|
||||
auto done = with_linearized_managed_bytes([&] {
|
||||
return _update_section(_tracker.region(), [&] {
|
||||
auto done = _update_section(_tracker.region(), [&] {
|
||||
return with_linearized_managed_bytes([&] {
|
||||
auto cmp = cache_entry::compare(_schema);
|
||||
auto it = _partitions.lower_bound(*_prev_snapshot_pos, cmp);
|
||||
auto end = _partitions.lower_bound(dht::ring_position_view::for_range_end(range), cmp);
|
||||
|
||||
12
schema.cc
12
schema.cc
@@ -42,6 +42,8 @@
|
||||
|
||||
constexpr int32_t schema::NAME_LENGTH;
|
||||
|
||||
extern logging::logger dblog;
|
||||
|
||||
sstring to_sstring(column_kind k) {
|
||||
switch (k) {
|
||||
case column_kind::partition_key: return "PARTITION_KEY";
|
||||
@@ -319,10 +321,10 @@ schema::schema(const raw_schema& raw, std::optional<raw_view_info> raw_view_info
|
||||
+ column_offset(column_kind::regular_column),
|
||||
_raw._columns.end(), column_definition::name_comparator(regular_column_name_type()));
|
||||
|
||||
std::sort(_raw._columns.begin(),
|
||||
std::stable_sort(_raw._columns.begin(),
|
||||
_raw._columns.begin() + column_offset(column_kind::clustering_key),
|
||||
[] (auto x, auto y) { return x.id < y.id; });
|
||||
std::sort(_raw._columns.begin() + column_offset(column_kind::clustering_key),
|
||||
std::stable_sort(_raw._columns.begin() + column_offset(column_kind::clustering_key),
|
||||
_raw._columns.begin() + column_offset(column_kind::static_column),
|
||||
[] (auto x, auto y) { return x.id < y.id; });
|
||||
|
||||
@@ -575,11 +577,15 @@ schema::get_column_definition(const bytes& name) const {
|
||||
|
||||
const column_definition&
|
||||
schema::column_at(column_kind kind, column_id id) const {
|
||||
return _raw._columns.at(column_offset(kind) + id);
|
||||
return column_at(static_cast<ordinal_column_id>(column_offset(kind) + id));
|
||||
}
|
||||
|
||||
// Returns the column definition stored at position ordinal_id in
// _raw._columns.
// An ordinal id that is >= the number of columns is reported through
// on_internal_error() on dblog, with the keyspace name, table name and schema
// version included so the offending schema can be identified.
// NOTE(review): whether on_internal_error() returns is not visible here; if it
// does, the .at() call below still performs its own bounds check.
const column_definition&
|
||||
schema::column_at(ordinal_column_id ordinal_id) const {
|
||||
if (size_t(ordinal_id) >= _raw._columns.size()) {
|
||||
on_internal_error(dblog, format("{}.{}@{}: column id {:d} >= {:d}",
|
||||
ks_name(), cf_name(), version(), size_t(ordinal_id), _raw._columns.size()));
|
||||
}
|
||||
return _raw._columns.at(static_cast<column_count_type>(ordinal_id));
|
||||
}
|
||||
|
||||
|
||||
@@ -79,7 +79,8 @@ executables = ['build/{}/scylla'.format(args.mode),
|
||||
'/usr/sbin/ethtool',
|
||||
'/usr/bin/netstat',
|
||||
'/usr/bin/hwloc-distrib',
|
||||
'/usr/bin/hwloc-calc']
|
||||
'/usr/bin/hwloc-calc',
|
||||
'/usr/bin/lsblk']
|
||||
|
||||
output = args.dest
|
||||
|
||||
|
||||
@@ -33,9 +33,10 @@ import os
|
||||
procs = os.sysconf('SC_NPROCESSORS_ONLN')
|
||||
mem = os.sysconf('SC_PHYS_PAGES') * os.sysconf('SC_PAGESIZE')
|
||||
|
||||
mem_reserve = 1000000000
|
||||
job_mem = 4000000000
|
||||
|
||||
jobs = min(procs, mem // job_mem)
|
||||
jobs = min(procs, (mem-mem_reserve) // job_mem)
|
||||
jobs = max(jobs, 1)
|
||||
|
||||
print(jobs)
|
||||
|
||||
@@ -596,7 +596,7 @@ def current_shard():
|
||||
|
||||
|
||||
def find_db(shard=None):
|
||||
if not shard:
|
||||
if shard is None:
|
||||
shard = current_shard()
|
||||
return gdb.parse_and_eval('::debug::db')['_instances']['_M_impl']['_M_start'][shard]['service']['_p']
|
||||
|
||||
|
||||
@@ -63,6 +63,17 @@ MemoryHigh=1200M
|
||||
MemoryMax=1400M
|
||||
MemoryLimit=1400M
|
||||
EOS
|
||||
|
||||
# On CentOS7, systemd does not support percentage-based parameter.
|
||||
# To apply memory parameter on CentOS7, we need to override the parameter
|
||||
# in bytes, instead of percentage.
|
||||
elif [ "$RHEL" -a "$VERSION_ID" = "7" ]; then
|
||||
MEMORY_LIMIT=$((MEMTOTAL_BYTES / 100 * 5))
|
||||
mkdir -p /etc/systemd/system/scylla-helper.slice.d/
|
||||
cat << EOS > /etc/systemd/system/scylla-helper.slice.d/memory.conf
|
||||
[Slice]
|
||||
MemoryLimit=$MEMORY_LIMIT
|
||||
EOS
|
||||
fi
|
||||
|
||||
systemctl --system daemon-reload >/dev/null || true
|
||||
|
||||
2
seastar
2
seastar
Submodule seastar updated: 92c488706c...748428930a
@@ -25,6 +25,7 @@
|
||||
#include <seastar/util/bool_class.hh>
|
||||
#include <boost/range/algorithm/for_each.hpp>
|
||||
#include "utils/small_vector.hh"
|
||||
#include <absl/container/btree_set.h>
|
||||
|
||||
namespace ser {
|
||||
|
||||
@@ -81,6 +82,17 @@ static inline void serialize_array(Output& out, const Container& v) {
|
||||
template<typename Container>
|
||||
struct container_traits;
|
||||
|
||||
// container_traits specialization for absl::btree_set<T>.
// Provides a back_emplacer functor that inserts values into the set it was
// constructed with.
template<typename T>
|
||||
struct container_traits<absl::btree_set<T>> {
|
||||
// Holds a reference to the target set; the set must outlive the emplacer.
struct back_emplacer {
|
||||
absl::btree_set<T>& c;
|
||||
back_emplacer(absl::btree_set<T>& c_) : c(c_) {}
|
||||
// Moves v into the set via emplace().
void operator()(T&& v) {
|
||||
c.emplace(std::move(v));
|
||||
}
|
||||
};
|
||||
};
|
||||
|
||||
template<typename T>
|
||||
struct container_traits<std::unordered_set<T>> {
|
||||
struct back_emplacer {
|
||||
@@ -253,6 +265,27 @@ struct serializer<std::list<T>> {
|
||||
}
|
||||
};
|
||||
|
||||
// serializer specialization for absl::btree_set<T>.
// The encoding handled here is a uint32_t element count followed by the
// elements themselves.
// NOTE(review): the meaning of the `false` template argument passed to the
// (de)serialize_array_helper templates is not visible in this view - confirm
// against their definitions.
template<typename T>
|
||||
struct serializer<absl::btree_set<T>> {
|
||||
// Reads the element count, then deserializes that many elements into a new set.
template<typename Input>
|
||||
static absl::btree_set<T> read(Input& in) {
|
||||
auto sz = deserialize(in, boost::type<uint32_t>());
|
||||
absl::btree_set<T> v;
|
||||
deserialize_array_helper<false, T>::doit(in, v, sz);
|
||||
return v;
|
||||
}
|
||||
// Writes v.size() through safe_serialize_as_uint32(), then the elements of v.
template<typename Output>
|
||||
static void write(Output& out, const absl::btree_set<T>& v) {
|
||||
safe_serialize_as_uint32(out, v.size());
|
||||
serialize_array_helper<false, T>::doit(out, v);
|
||||
}
|
||||
// Reads the element count and skips over that many serialized elements.
template<typename Input>
|
||||
static void skip(Input& in) {
|
||||
auto sz = deserialize(in, boost::type<uint32_t>());
|
||||
skip_array<T>(in, sz);
|
||||
}
|
||||
};
|
||||
|
||||
template<typename T>
|
||||
struct serializer<std::unordered_set<T>> {
|
||||
template<typename Input>
|
||||
|
||||
@@ -92,7 +92,7 @@ void migration_manager::init_messaging_service()
|
||||
//FIXME: future discarded.
|
||||
(void)with_gate(_background_tasks, [this] {
|
||||
mlogger.debug("features changed, recalculating schema version");
|
||||
return update_schema_version_and_announce(get_storage_proxy(), _feat.cluster_schema_features());
|
||||
return db::schema_tables::recalculate_schema_version(get_storage_proxy(), _feat);
|
||||
});
|
||||
};
|
||||
|
||||
@@ -277,9 +277,9 @@ future<> migration_manager::maybe_schedule_schema_pull(const utils::UUID& their_
|
||||
}).finally([me = shared_from_this()] {});
|
||||
}
|
||||
|
||||
future<> migration_manager::submit_migration_task(const gms::inet_address& endpoint)
|
||||
future<> migration_manager::submit_migration_task(const gms::inet_address& endpoint, bool can_ignore_down_node)
|
||||
{
|
||||
return service::migration_task::run_may_throw(endpoint);
|
||||
return service::migration_task::run_may_throw(endpoint, can_ignore_down_node);
|
||||
}
|
||||
|
||||
future<> migration_manager::do_merge_schema_from(netw::messaging_service::msg_addr id)
|
||||
@@ -1132,7 +1132,8 @@ future<> migration_manager::sync_schema(const database& db, const std::vector<gm
|
||||
}).then([this, &schema_map] {
|
||||
return parallel_for_each(schema_map, [this] (auto& x) {
|
||||
mlogger.debug("Pulling schema {} from {}", x.first, x.second.front());
|
||||
return submit_migration_task(x.second.front());
|
||||
bool can_ignore_down_node = false;
|
||||
return submit_migration_task(x.second.front(), can_ignore_down_node);
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
@@ -82,7 +82,7 @@ public:
|
||||
|
||||
future<> maybe_schedule_schema_pull(const utils::UUID& their_version, const gms::inet_address& endpoint);
|
||||
|
||||
future<> submit_migration_task(const gms::inet_address& endpoint);
|
||||
future<> submit_migration_task(const gms::inet_address& endpoint, bool can_ignore_down_node = true);
|
||||
|
||||
// Makes sure that this node knows about all schema changes known by "nodes" that were made prior to this call.
|
||||
future<> sync_schema(const database& db, const std::vector<gms::inet_address>& nodes);
|
||||
|
||||
@@ -51,11 +51,12 @@ namespace service {
|
||||
|
||||
static logging::logger mlogger("migration_task");
|
||||
|
||||
future<> migration_task::run_may_throw(const gms::inet_address& endpoint)
|
||||
future<> migration_task::run_may_throw(const gms::inet_address& endpoint, bool can_ignore_down_node)
|
||||
{
|
||||
if (!gms::get_local_gossiper().is_alive(endpoint)) {
|
||||
mlogger.warn("Can't send migration request: node {} is down.", endpoint);
|
||||
return make_ready_future<>();
|
||||
auto msg = format("Can't send migration request: node {} is down.", endpoint);
|
||||
mlogger.warn("{}", msg);
|
||||
return can_ignore_down_node ? make_ready_future<>() : make_exception_future<>(std::runtime_error(msg));
|
||||
}
|
||||
netw::messaging_service::msg_addr id{endpoint, 0};
|
||||
return service::get_local_migration_manager().merge_schema_from(id).handle_exception([](std::exception_ptr e) {
|
||||
|
||||
@@ -47,7 +47,7 @@ namespace service {
|
||||
|
||||
class migration_task {
|
||||
public:
|
||||
static future<> run_may_throw(const gms::inet_address& endpoint);
|
||||
static future<> run_may_throw(const gms::inet_address& endpoint, bool can_ignore_down_node);
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
@@ -190,4 +190,11 @@ future<> paxos_state::learn(schema_ptr schema, proposal decision, clock_type::ti
|
||||
});
|
||||
}
|
||||
|
||||
// Deletes the stored Paxos decision for the given partition key and ballot.
// Logs the ballot at debug level and adds the same message to the trace
// state, then delegates to db::system_keyspace::delete_paxos_decision(),
// whose future is returned to the caller.
//   schema   - schema of the table the key belongs to (dereferenced below)
//   key      - partition key whose Paxos state is pruned
//   ballot   - ballot identifying the decision to delete
//   timeout  - passed through to delete_paxos_decision()
//   tr_state - tracing state used for the trace message
future<> paxos_state::prune(schema_ptr schema, const partition_key& key, utils::UUID ballot, clock_type::time_point timeout,
|
||||
tracing::trace_state_ptr tr_state) {
|
||||
logger.debug("Delete paxos state for ballot {}", ballot);
|
||||
tracing::trace(tr_state, "Delete paxos state for ballot {}", ballot);
|
||||
return db::system_keyspace::delete_paxos_decision(*schema, key, ballot, timeout);
|
||||
}
|
||||
|
||||
} // end of namespace "service::paxos"
|
||||
|
||||
@@ -124,6 +124,9 @@ public:
|
||||
clock_type::time_point timeout);
|
||||
// Replica RPC endpoint for Paxos "learn".
|
||||
static future<> learn(schema_ptr schema, proposal decision, clock_type::time_point timeout, tracing::trace_state_ptr tr_state);
|
||||
// Replica RPC endpoint for pruning Paxos table
|
||||
static future<> prune(schema_ptr schema, const partition_key& key, utils::UUID ballot, clock_type::time_point timeout,
|
||||
tracing::trace_state_ptr tr_state);
|
||||
};
|
||||
|
||||
} // end of namespace "service::paxos"
|
||||
|
||||
@@ -171,6 +171,7 @@ public:
|
||||
const schema_ptr& schema() {
|
||||
return _schema;
|
||||
}
|
||||
// called only when all replicas replied
|
||||
virtual void release_mutation() = 0;
|
||||
};
|
||||
|
||||
@@ -300,9 +301,10 @@ public:
|
||||
|
||||
class cas_mutation : public mutation_holder {
|
||||
lw_shared_ptr<paxos::proposal> _proposal;
|
||||
shared_ptr<paxos_response_handler> _handler;
|
||||
public:
|
||||
explicit cas_mutation(paxos::proposal proposal , schema_ptr s)
|
||||
: _proposal(make_lw_shared<paxos::proposal>(std::move(proposal))) {
|
||||
explicit cas_mutation(paxos::proposal proposal, schema_ptr s, shared_ptr<paxos_response_handler> handler)
|
||||
: _proposal(make_lw_shared<paxos::proposal>(std::move(proposal))), _handler(std::move(handler)) {
|
||||
_size = _proposal->update.representation().size();
|
||||
_schema = std::move(s);
|
||||
}
|
||||
@@ -327,6 +329,11 @@ public:
|
||||
return true;
|
||||
}
|
||||
virtual void release_mutation() override {
|
||||
// The handler will be set for "learn", but not for PAXOS repair
|
||||
// since repair may not include all replicas
|
||||
if (_handler) {
|
||||
_handler->prune(_proposal->ballot);
|
||||
}
|
||||
_proposal.release();
|
||||
}
|
||||
};
|
||||
@@ -1184,6 +1191,12 @@ future<bool> paxos_response_handler::accept_proposal(const paxos::proposal& prop
|
||||
return f;
|
||||
}
|
||||
|
||||
// debug output in mutate_internal needs this
|
||||
std::ostream& operator<<(std::ostream& os, const paxos_response_handler& h) {
|
||||
os << "paxos_response_handler{" << h.id() << "}";
|
||||
return os;
|
||||
}
|
||||
|
||||
// This function implements learning stage of Paxos protocol
|
||||
future<> paxos_response_handler::learn_decision(paxos::proposal decision, bool allow_hints) {
|
||||
tracing::trace(tr_state, "learn_decision: committing {} with cl={}", decision, _cl_for_learn);
|
||||
@@ -1219,12 +1232,41 @@ future<> paxos_response_handler::learn_decision(paxos::proposal decision, bool a
|
||||
}
|
||||
|
||||
// Path for the "base" mutations
|
||||
std::array<std::tuple<paxos::proposal, schema_ptr, dht::token>, 1> m{std::make_tuple(std::move(decision), _schema, _key.token())};
|
||||
std::array<std::tuple<paxos::proposal, schema_ptr, shared_ptr<paxos_response_handler>, dht::token>, 1> m{std::make_tuple(std::move(decision), _schema, shared_from_this(), _key.token())};
|
||||
future<> f_lwt = _proxy->mutate_internal(std::move(m), _cl_for_learn, false, tr_state, _permit, _timeout);
|
||||
|
||||
return when_all_succeed(std::move(f_cdc), std::move(f_lwt));
|
||||
}
|
||||
|
||||
void paxos_response_handler::prune(utils::UUID ballot) {
|
||||
if (_has_dead_endpoints) {
|
||||
return;
|
||||
}
|
||||
if ( _proxy->get_stats().cas_now_pruning >= pruning_limit) {
|
||||
_proxy->get_stats().cas_coordinator_dropped_prune++;
|
||||
return;
|
||||
}
|
||||
_proxy->get_stats().cas_now_pruning++;
|
||||
_proxy->get_stats().cas_prune++;
|
||||
// running in the background, but the amount of the bg job is limited by pruning_limit
|
||||
// it is waited by holding shared pointer to storage_proxy which guaranties
|
||||
// that storage_proxy::stop() will wait for this to complete
|
||||
(void)parallel_for_each(_live_endpoints, [this, ballot] (gms::inet_address peer) mutable {
|
||||
return futurize_apply([&] {
|
||||
if (fbu::is_me(peer)) {
|
||||
tracing::trace(tr_state, "prune: prune {} locally", ballot);
|
||||
return paxos::paxos_state::prune(_schema, _key.key(), ballot, _timeout, tr_state);
|
||||
} else {
|
||||
tracing::trace(tr_state, "prune: send prune of {} to {}", ballot, peer);
|
||||
netw::messaging_service& ms = netw::get_local_messaging_service();
|
||||
return ms.send_paxos_prune(peer, _timeout, _schema->version(), _key.key(), ballot, tracing::make_trace_info(tr_state));
|
||||
}
|
||||
});
|
||||
}).finally([h = shared_from_this()] {
|
||||
h->_proxy->get_stats().cas_now_pruning--;
|
||||
});
|
||||
}
|
||||
|
||||
static std::vector<gms::inet_address>
|
||||
replica_ids_to_endpoints(locator::token_metadata& tm, const std::vector<utils::UUID>& replica_ids) {
|
||||
std::vector<gms::inet_address> endpoints;
|
||||
@@ -1571,6 +1613,14 @@ void storage_proxy_stats::stats::register_stats() {
|
||||
sm::make_histogram("cas_write_contention", sm::description("how many contended writes were encountered"),
|
||||
{storage_proxy_stats::current_scheduling_group_label()},
|
||||
[this]{ return cas_write_contention.get_histogram(1, 8);}),
|
||||
|
||||
sm::make_total_operations("cas_prune", cas_prune,
|
||||
sm::description("how many times paxos prune was done after successful cas operation"),
|
||||
{storage_proxy_stats::current_scheduling_group_label()}),
|
||||
|
||||
sm::make_total_operations("cas_dropped_prune", cas_coordinator_dropped_prune,
|
||||
sm::description("how many times a coordinator did not perfom prune after cas"),
|
||||
{storage_proxy_stats::current_scheduling_group_label()}),
|
||||
});
|
||||
|
||||
_metrics.add_group(REPLICA_STATS_CATEGORY, {
|
||||
@@ -1606,19 +1656,28 @@ void storage_proxy_stats::stats::register_stats() {
|
||||
sm::description("number of operations that crossed a shard boundary"),
|
||||
{storage_proxy_stats::current_scheduling_group_label()}),
|
||||
|
||||
sm::make_total_operations("cas_dropped_prune", cas_replica_dropped_prune,
|
||||
sm::description("how many times a coordinator did not perfom prune after cas"),
|
||||
{storage_proxy_stats::current_scheduling_group_label()}),
|
||||
});
|
||||
}
|
||||
|
||||
inline uint64_t& storage_proxy_stats::split_stats::get_ep_stat(gms::inet_address ep) {
|
||||
inline uint64_t& storage_proxy_stats::split_stats::get_ep_stat(gms::inet_address ep) noexcept {
|
||||
if (fbu::is_me(ep)) {
|
||||
return _local.val;
|
||||
}
|
||||
|
||||
sstring dc = get_dc(ep);
|
||||
if (_auto_register_metrics) {
|
||||
register_metrics_for(ep);
|
||||
try {
|
||||
sstring dc = get_dc(ep);
|
||||
if (_auto_register_metrics) {
|
||||
register_metrics_for(ep);
|
||||
}
|
||||
return _dc_stats[dc].val;
|
||||
} catch (...) {
|
||||
static thread_local uint64_t dummy_stat;
|
||||
slogger.error("Failed to obtain stats ({}), fall-back to dummy", std::current_exception());
|
||||
return dummy_stat;
|
||||
}
|
||||
return _dc_stats[dc].val;
|
||||
}
|
||||
|
||||
void storage_proxy_stats::split_stats::register_metrics_local() {
|
||||
@@ -1879,11 +1938,11 @@ storage_proxy::create_write_response_handler(const std::unordered_map<gms::inet_
|
||||
}
|
||||
|
||||
storage_proxy::response_id_type
|
||||
storage_proxy::create_write_response_handler(const std::tuple<paxos::proposal, schema_ptr, dht::token>& meta,
|
||||
storage_proxy::create_write_response_handler(const std::tuple<paxos::proposal, schema_ptr, shared_ptr<paxos_response_handler>, dht::token>& meta,
|
||||
db::consistency_level cl, db::write_type type, tracing::trace_state_ptr tr_state, service_permit permit) {
|
||||
auto& [commit, s, t] = meta;
|
||||
auto& [commit, s, h, t] = meta;
|
||||
|
||||
return create_write_response_handler_helper(s, t, std::make_unique<cas_mutation>(std::move(commit), s), cl,
|
||||
return create_write_response_handler_helper(s, t, std::make_unique<cas_mutation>(std::move(commit), s, std::move(h)), cl,
|
||||
db::write_type::CAS, tr_state, std::move(permit));
|
||||
}
|
||||
|
||||
@@ -1898,7 +1957,7 @@ storage_proxy::create_write_response_handler(const std::tuple<paxos::proposal, s
|
||||
auto keyspace_name = s->ks_name();
|
||||
keyspace& ks = _db.local().find_keyspace(keyspace_name);
|
||||
|
||||
return create_write_response_handler(ks, cl, db::write_type::CAS, std::make_unique<cas_mutation>(std::move(commit), s), std::move(endpoints),
|
||||
return create_write_response_handler(ks, cl, db::write_type::CAS, std::make_unique<cas_mutation>(std::move(commit), s, nullptr), std::move(endpoints),
|
||||
std::vector<gms::inet_address>(), std::vector<gms::inet_address>(), std::move(tr_state), get_stats(), std::move(permit));
|
||||
}
|
||||
|
||||
@@ -2146,6 +2205,8 @@ storage_proxy::get_paxos_participants(const sstring& ks_name, const dht::token &
|
||||
cl_for_paxos, participants + 1, live_endpoints.size());
|
||||
}
|
||||
|
||||
bool dead = participants != live_endpoints.size();
|
||||
|
||||
// Apart from the ballot, paxos_state::prepare() also sends the current value of the requested key.
|
||||
// If the values received from different replicas match, we skip a separate query stage thus saving
|
||||
// one network round trip. To generate less traffic, only closest replicas send data, others send
|
||||
@@ -2153,7 +2214,7 @@ storage_proxy::get_paxos_participants(const sstring& ks_name, const dht::token &
|
||||
// list of participants by proximity to this instance.
|
||||
sort_endpoints_by_proximity(live_endpoints);
|
||||
|
||||
return paxos_participants{std::move(live_endpoints), required_participants};
|
||||
return paxos_participants{std::move(live_endpoints), required_participants, dead};
|
||||
}
|
||||
|
||||
|
||||
@@ -3412,7 +3473,9 @@ protected:
|
||||
uint32_t original_partition_limit() const {
|
||||
return _cmd->partition_limit;
|
||||
}
|
||||
virtual void adjust_targets_for_reconciliation() {}
|
||||
void reconcile(db::consistency_level cl, storage_proxy::clock_type::time_point timeout, lw_shared_ptr<query::read_command> cmd) {
|
||||
adjust_targets_for_reconciliation();
|
||||
data_resolver_ptr data_resolver = ::make_shared<data_read_resolver>(_schema, cl, _targets.size(), timeout);
|
||||
auto exec = shared_from_this();
|
||||
|
||||
@@ -3639,6 +3702,9 @@ public:
|
||||
virtual void got_cl() override {
|
||||
_speculate_timer.cancel();
|
||||
}
|
||||
virtual void adjust_targets_for_reconciliation() override {
|
||||
_targets = used_targets();
|
||||
}
|
||||
};
|
||||
|
||||
class range_slice_read_executor : public never_speculating_read_executor {
|
||||
@@ -4942,22 +5008,64 @@ void storage_proxy::init_messaging_service() {
|
||||
|
||||
return f;
|
||||
});
|
||||
ms.register_paxos_prune([this] (const rpc::client_info& cinfo, rpc::opt_time_point timeout,
|
||||
utils::UUID schema_id, partition_key key, utils::UUID ballot, std::optional<tracing::trace_info> trace_info) {
|
||||
static thread_local uint16_t pruning = 0;
|
||||
static constexpr uint16_t pruning_limit = 1000; // since PRUNE verb is one way replica side has its own queue limit
|
||||
auto src_addr = netw::messaging_service::get_source(cinfo);
|
||||
auto src_ip = src_addr.addr;
|
||||
tracing::trace_state_ptr tr_state;
|
||||
if (trace_info) {
|
||||
tr_state = tracing::tracing::get_local_tracing_instance().create_session(*trace_info);
|
||||
tracing::begin(tr_state);
|
||||
tracing::trace(tr_state, "paxos_prune: message received from /{} ballot {}", src_ip, ballot);
|
||||
}
|
||||
|
||||
if (pruning >= pruning_limit) {
|
||||
get_stats().cas_replica_dropped_prune++;
|
||||
tracing::trace(tr_state, "paxos_prune: do not prune due to overload", src_ip);
|
||||
return make_ready_future<seastar::rpc::no_wait_type>(netw::messaging_service::no_wait());
|
||||
}
|
||||
|
||||
pruning++;
|
||||
auto d = defer([] { pruning--; });
|
||||
return get_schema_for_read(schema_id, src_addr).then([this, key = std::move(key), ballot,
|
||||
timeout, tr_state = std::move(tr_state), src_ip, d = std::move(d)] (schema_ptr schema) mutable {
|
||||
dht::token token = dht::get_token(*schema, key);
|
||||
unsigned shard = dht::shard_of(*schema, token);
|
||||
bool local = shard == engine().cpu_id();
|
||||
get_stats().replica_cross_shard_ops += !local;
|
||||
return smp::submit_to(shard, _write_smp_service_group, [gs = global_schema_ptr(schema), gt = tracing::global_trace_state_ptr(std::move(tr_state)),
|
||||
local, key = std::move(key), ballot, timeout, src_ip, d = std::move(d)] () {
|
||||
tracing::trace_state_ptr tr_state = gt;
|
||||
return paxos::paxos_state::prune(gs, key, ballot, *timeout, tr_state).then([src_ip, tr_state] () {
|
||||
tracing::trace(tr_state, "paxos_prune: handling is done, sending a response to /{}", src_ip);
|
||||
return netw::messaging_service::no_wait();
|
||||
});
|
||||
});
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
future<> storage_proxy::uninit_messaging_service() {
|
||||
auto& ms = netw::get_local_messaging_service();
|
||||
return when_all_succeed(
|
||||
ms.unregister_counter_mutation(),
|
||||
ms.unregister_mutation(),
|
||||
ms.unregister_hint_mutation(),
|
||||
ms.unregister_mutation_done(),
|
||||
ms.unregister_mutation_failed(),
|
||||
ms.unregister_read_data(),
|
||||
ms.unregister_read_mutation_data(),
|
||||
ms.unregister_read_digest(),
|
||||
ms.unregister_truncate(),
|
||||
ms.unregister_get_schema_version(),
|
||||
ms.unregister_paxos_prepare(),
|
||||
ms.unregister_paxos_accept(),
|
||||
ms.unregister_paxos_learn()
|
||||
ms.unregister_paxos_learn(),
|
||||
ms.unregister_paxos_prune()
|
||||
);
|
||||
|
||||
}
|
||||
|
||||
future<rpc::tuple<foreign_ptr<lw_shared_ptr<reconcilable_result>>, cache_temperature>>
|
||||
@@ -5050,8 +5158,7 @@ future<> storage_proxy::drain_on_shutdown() {
|
||||
|
||||
future<>
|
||||
storage_proxy::stop() {
|
||||
// FIXME: hints manager should be stopped here but it seems like this function is never called
|
||||
return uninit_messaging_service();
|
||||
return make_ready_future<>();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -242,6 +242,7 @@ public:
|
||||
std::vector<gms::inet_address> endpoints;
|
||||
// How many participants are required for a quorum (i.e. is it SERIAL or LOCAL_SERIAL).
|
||||
size_t required_participants;
|
||||
bool has_dead_endpoints;
|
||||
};
|
||||
|
||||
const gms::feature_service& features() const { return _features; }
|
||||
@@ -297,7 +298,6 @@ private:
|
||||
cdc::cdc_service* _cdc = nullptr;
|
||||
cdc_stats _cdc_stats;
|
||||
private:
|
||||
future<> uninit_messaging_service();
|
||||
future<coordinator_query_result> query_singular(lw_shared_ptr<query::read_command> cmd,
|
||||
dht::partition_range_vector&& partition_ranges,
|
||||
db::consistency_level cl,
|
||||
@@ -317,7 +317,7 @@ private:
|
||||
response_id_type create_write_response_handler(const mutation&, db::consistency_level cl, db::write_type type, tracing::trace_state_ptr tr_state, service_permit permit);
|
||||
response_id_type create_write_response_handler(const hint_wrapper&, db::consistency_level cl, db::write_type type, tracing::trace_state_ptr tr_state, service_permit permit);
|
||||
response_id_type create_write_response_handler(const std::unordered_map<gms::inet_address, std::optional<mutation>>&, db::consistency_level cl, db::write_type type, tracing::trace_state_ptr tr_state, service_permit permit);
|
||||
response_id_type create_write_response_handler(const std::tuple<paxos::proposal, schema_ptr, dht::token>& proposal,
|
||||
response_id_type create_write_response_handler(const std::tuple<paxos::proposal, schema_ptr, shared_ptr<paxos_response_handler>, dht::token>& proposal,
|
||||
db::consistency_level cl, db::write_type type, tracing::trace_state_ptr tr_state, service_permit permit);
|
||||
response_id_type create_write_response_handler(const std::tuple<paxos::proposal, schema_ptr, dht::token, std::unordered_set<gms::inet_address>>& meta,
|
||||
db::consistency_level cl, db::write_type type, tracing::trace_state_ptr tr_state, service_permit permit);
|
||||
@@ -451,6 +451,7 @@ public:
|
||||
return next;
|
||||
}
|
||||
void init_messaging_service();
|
||||
future<> uninit_messaging_service();
|
||||
|
||||
// Applies mutation on this node.
|
||||
// Resolves with timed_out_error when timeout is reached.
|
||||
@@ -634,6 +635,11 @@ private:
|
||||
db::consistency_level _cl_for_learn;
|
||||
// Live endpoints, as per get_paxos_participants()
|
||||
std::vector<gms::inet_address> _live_endpoints;
|
||||
// True if there are dead endpoints
|
||||
// We don't include endpoints known to be unavailable in pending
|
||||
// endpoints list, but need to be aware of them to avoid pruning
|
||||
// system.paxos data if some endpoint is missing a Paxos write.
|
||||
bool _has_dead_endpoints;
|
||||
// How many endpoints need to respond favourably for the protocol to progress to the next step.
|
||||
size_t _required_participants;
|
||||
// A deadline when the entire CAS operation timeout expires, derived from write_request_timeout_in_ms
|
||||
@@ -651,6 +657,9 @@ private:
|
||||
// Unique request id for logging purposes.
|
||||
const uint64_t _id = next_id++;
|
||||
|
||||
// max pruning operations to run in parralel
|
||||
static constexpr uint16_t pruning_limit = 1000;
|
||||
|
||||
public:
|
||||
tracing::trace_state_ptr tr_state;
|
||||
|
||||
@@ -674,6 +683,7 @@ public:
|
||||
storage_proxy::paxos_participants pp = _proxy->get_paxos_participants(_schema->ks_name(), _key.token(), _cl_for_paxos);
|
||||
_live_endpoints = std::move(pp.endpoints);
|
||||
_required_participants = pp.required_participants;
|
||||
_has_dead_endpoints = pp.has_dead_endpoints;
|
||||
tracing::trace(tr_state, "Create paxos_response_handler for token {} with live: {} and required participants: {}",
|
||||
_key.token(), _live_endpoints, _required_participants);
|
||||
}
|
||||
@@ -691,6 +701,7 @@ public:
|
||||
future<paxos::prepare_summary> prepare_ballot(utils::UUID ballot);
|
||||
future<bool> accept_proposal(const paxos::proposal& proposal, bool timeout_if_partially_accepted = true);
|
||||
future<> learn_decision(paxos::proposal decision, bool allow_hints = false);
|
||||
void prune(utils::UUID ballot);
|
||||
uint64_t id() const {
|
||||
return _id;
|
||||
}
|
||||
|
||||
@@ -74,7 +74,7 @@ public:
|
||||
*
|
||||
* @return a reference to the requested counter
|
||||
*/
|
||||
uint64_t& get_ep_stat(gms::inet_address ep);
|
||||
uint64_t& get_ep_stat(gms::inet_address ep) noexcept;
|
||||
};
|
||||
|
||||
struct write_stats {
|
||||
@@ -116,6 +116,11 @@ struct write_stats {
|
||||
uint64_t cas_write_condition_not_met = 0;
|
||||
uint64_t cas_write_timeout_due_to_uncertainty = 0;
|
||||
uint64_t cas_failed_read_round_optimization = 0;
|
||||
uint16_t cas_now_pruning = 0;
|
||||
uint64_t cas_prune = 0;
|
||||
uint64_t cas_coordinator_dropped_prune = 0;
|
||||
uint64_t cas_replica_dropped_prune = 0;
|
||||
|
||||
|
||||
std::chrono::microseconds last_mv_flow_control_delay; // delay added for MV flow control in the last request
|
||||
public:
|
||||
|
||||
@@ -1007,12 +1007,16 @@ storage_service::is_local_dc(const inet_address& targetHost) const {
|
||||
std::unordered_map<dht::token_range, std::vector<inet_address>>
|
||||
storage_service::get_range_to_address_map(const sstring& keyspace,
|
||||
const std::vector<token>& sorted_tokens) const {
|
||||
sstring ks = keyspace;
|
||||
// some people just want to get a visual representation of things. Allow null and set it to the first
|
||||
// non-system keyspace.
|
||||
if (keyspace == "" && _db.local().get_non_system_keyspaces().empty()) {
|
||||
throw std::runtime_error("No keyspace provided and no non system kespace exist");
|
||||
if (keyspace == "") {
|
||||
auto keyspaces = _db.local().get_non_system_keyspaces();
|
||||
if (keyspaces.empty()) {
|
||||
throw std::runtime_error("No keyspace provided and no non system kespace exist");
|
||||
}
|
||||
ks = keyspaces[0];
|
||||
}
|
||||
const sstring& ks = (keyspace == "") ? _db.local().get_non_system_keyspaces()[0] : keyspace;
|
||||
return construct_range_to_endpoint_map(ks, get_all_ranges(sorted_tokens));
|
||||
}
|
||||
|
||||
@@ -2171,7 +2175,8 @@ storage_service::get_snapshot_details() {
|
||||
}
|
||||
|
||||
future<int64_t> storage_service::true_snapshots_size() {
|
||||
return _db.map_reduce(adder<int64_t>(), [] (database& db) {
|
||||
return run_snapshot_list_operation([] {
|
||||
return get_local_storage_service()._db.map_reduce(adder<int64_t>(), [] (database& db) {
|
||||
return do_with(int64_t(0), [&db] (auto& local_total) {
|
||||
return parallel_for_each(db.get_column_families(), [&local_total] (auto& cf_pair) {
|
||||
return cf_pair.second->get_snapshot_details().then([&local_total] (auto map) {
|
||||
@@ -2185,6 +2190,7 @@ future<int64_t> storage_service::true_snapshots_size() {
|
||||
});
|
||||
});
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
static std::atomic<bool> isolated = { false };
|
||||
@@ -3409,10 +3415,13 @@ void feature_enabled_listener::on_enabled() {
|
||||
|
||||
future<> read_sstables_format(distributed<storage_service>& ss) {
|
||||
return db::system_keyspace::get_scylla_local_param(SSTABLE_FORMAT_PARAM_NAME).then([&ss] (std::optional<sstring> format_opt) {
|
||||
sstables::sstable_version_types format = sstables::from_string(format_opt.value_or("ka"));
|
||||
return ss.invoke_on_all([format] (storage_service& s) {
|
||||
s._sstables_format = format;
|
||||
});
|
||||
if (format_opt) {
|
||||
sstables::sstable_version_types format = sstables::from_string(*format_opt);
|
||||
return ss.invoke_on_all([format] (storage_service& s) {
|
||||
s._sstables_format = format;
|
||||
});
|
||||
}
|
||||
return make_ready_future<>();
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
@@ -312,7 +312,13 @@ private:
|
||||
*/
|
||||
std::optional<db_clock::time_point> _cdc_streams_ts;
|
||||
|
||||
sstables::sstable_version_types _sstables_format = sstables::sstable_version_types::ka;
|
||||
// _sstables_format is the format used for writing new sstables.
|
||||
// Here we set its default value, but if we discover that all the nodes
|
||||
// in the cluster support a newer format, _sstables_format will be set to
|
||||
// that format. read_sstables_format() also overwrites _sstables_format
|
||||
// if an sstable format was chosen earlier (and this choice was persisted
|
||||
// in the system table).
|
||||
sstables::sstable_version_types _sstables_format = sstables::sstable_version_types::la;
|
||||
seastar::named_semaphore _feature_listeners_sem = {1, named_semaphore_exception_factory{"feature listeners"}};
|
||||
feature_enabled_listener _la_feature_listener;
|
||||
feature_enabled_listener _mc_feature_listener;
|
||||
|
||||
@@ -72,47 +72,8 @@ private:
|
||||
static std::vector<column_info> build(
|
||||
const schema& s,
|
||||
const utils::chunked_vector<serialization_header::column_desc>& src,
|
||||
bool is_static) {
|
||||
std::vector<column_info> cols;
|
||||
if (s.is_dense()) {
|
||||
const column_definition& col = is_static ? *s.static_begin() : *s.regular_begin();
|
||||
cols.push_back(column_info{
|
||||
&col.name(),
|
||||
col.type,
|
||||
col.id,
|
||||
col.type->value_length_if_fixed(),
|
||||
col.is_multi_cell(),
|
||||
col.is_counter(),
|
||||
false
|
||||
});
|
||||
} else {
|
||||
cols.reserve(src.size());
|
||||
for (auto&& desc : src) {
|
||||
const bytes& type_name = desc.type_name.value;
|
||||
data_type type = db::marshal::type_parser::parse(to_sstring_view(type_name));
|
||||
const column_definition* def = s.get_column_definition(desc.name.value);
|
||||
std::optional<column_id> id;
|
||||
bool schema_mismatch = false;
|
||||
if (def) {
|
||||
id = def->id;
|
||||
schema_mismatch = def->is_multi_cell() != type->is_multi_cell() ||
|
||||
def->is_counter() != type->is_counter() ||
|
||||
!def->type->is_value_compatible_with(*type);
|
||||
}
|
||||
cols.push_back(column_info{
|
||||
&desc.name.value,
|
||||
type,
|
||||
id,
|
||||
type->value_length_if_fixed(),
|
||||
type->is_multi_cell(),
|
||||
type->is_counter(),
|
||||
schema_mismatch
|
||||
});
|
||||
}
|
||||
boost::range::stable_partition(cols, [](const column_info& column) { return !column.is_collection; });
|
||||
}
|
||||
return cols;
|
||||
}
|
||||
const sstable_enabled_features& features,
|
||||
bool is_static);
|
||||
|
||||
utils::UUID schema_uuid;
|
||||
std::vector<column_info> regular_schema_columns_from_sstable;
|
||||
@@ -125,10 +86,10 @@ private:
|
||||
state(state&&) = default;
|
||||
state& operator=(state&&) = default;
|
||||
|
||||
state(const schema& s, const serialization_header& header)
|
||||
state(const schema& s, const serialization_header& header, const sstable_enabled_features& features)
|
||||
: schema_uuid(s.version())
|
||||
, regular_schema_columns_from_sstable(build(s, header.regular_columns.elements, false))
|
||||
, static_schema_columns_from_sstable(build(s, header.static_columns.elements, true))
|
||||
, regular_schema_columns_from_sstable(build(s, header.regular_columns.elements, features, false))
|
||||
, static_schema_columns_from_sstable(build(s, header.static_columns.elements, features, true))
|
||||
, clustering_column_value_fix_lengths (get_clustering_values_fixed_lengths(header))
|
||||
{}
|
||||
};
|
||||
@@ -136,9 +97,10 @@ private:
|
||||
lw_shared_ptr<const state> _state = make_lw_shared<const state>();
|
||||
|
||||
public:
|
||||
column_translation get_for_schema(const schema& s, const serialization_header& header) {
|
||||
column_translation get_for_schema(
|
||||
const schema& s, const serialization_header& header, const sstable_enabled_features& features) {
|
||||
if (s.version() != _state->schema_uuid) {
|
||||
_state = make_lw_shared(state(s, header));
|
||||
_state = make_lw_shared(state(s, header, features));
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
|
||||
@@ -66,7 +66,6 @@
|
||||
#include "db_clock.hh"
|
||||
#include "mutation_compactor.hh"
|
||||
#include "leveled_manifest.hh"
|
||||
#include "utils/observable.hh"
|
||||
#include "dht/token.hh"
|
||||
|
||||
namespace sstables {
|
||||
@@ -284,68 +283,73 @@ public:
|
||||
// When compaction finishes, all the temporary sstables generated here will be deleted and removed
|
||||
// from table's sstable set.
|
||||
class garbage_collected_sstable_writer {
|
||||
compaction* _c = nullptr;
|
||||
std::vector<shared_sstable> _temp_sealed_gc_sstables;
|
||||
std::deque<compaction_write_monitor> _active_write_monitors = {};
|
||||
shared_sstable _sst;
|
||||
std::optional<sstable_writer> _writer;
|
||||
std::optional<utils::observer<>> _on_new_sstable_sealed_observer;
|
||||
utils::UUID _run_identifier = utils::make_random_uuid();
|
||||
bool _consuming_new_partition {};
|
||||
private:
|
||||
void setup_on_new_sstable_sealed_handler();
|
||||
void maybe_create_new_sstable_writer();
|
||||
void finish_sstable_writer();
|
||||
void on_end_of_stream();
|
||||
public:
|
||||
garbage_collected_sstable_writer() = default;
|
||||
explicit garbage_collected_sstable_writer(compaction& c) : _c(&c) {
|
||||
setup_on_new_sstable_sealed_handler();
|
||||
}
|
||||
// Data for GC writer is stored separately to allow compaction class to communicate directly
|
||||
// with garbage_collected_sstable_writer which is moved into mutation_compaction, making it
|
||||
// unreachable after the compaction process has started.
|
||||
class data {
|
||||
compaction* _c = nullptr;
|
||||
// Garbage collected sstables that are sealed but were not added to SSTable set yet.
|
||||
std::vector<shared_sstable> _unused_garbage_collected_sstables;
|
||||
// Garbage collected sstables that were added to SSTable set and should be eventually removed from it.
|
||||
std::vector<shared_sstable> _used_garbage_collected_sstables;
|
||||
std::deque<compaction_write_monitor> _active_write_monitors = {};
|
||||
shared_sstable _sst;
|
||||
std::optional<sstable_writer> _writer;
|
||||
utils::UUID _run_identifier = utils::make_random_uuid();
|
||||
public:
|
||||
explicit data(compaction& c) : _c(&c) {
|
||||
}
|
||||
|
||||
data& operator=(const data&) = delete;
|
||||
data(const data&) = delete;
|
||||
|
||||
void maybe_create_new_sstable_writer();
|
||||
void finish_sstable_writer();
|
||||
|
||||
// Retrieves all unused garbage collected sstables that will be subsequently added
|
||||
// to the SSTable set, and mark them as used.
|
||||
std::vector<shared_sstable> consume_unused_garbage_collected_sstables() {
|
||||
auto unused = std::exchange(_unused_garbage_collected_sstables, {});
|
||||
_used_garbage_collected_sstables.insert(_used_garbage_collected_sstables.end(), unused.begin(), unused.end());
|
||||
return unused;
|
||||
}
|
||||
|
||||
const std::vector<shared_sstable>& used_garbage_collected_sstables() const {
|
||||
return _used_garbage_collected_sstables;
|
||||
}
|
||||
|
||||
friend class garbage_collected_sstable_writer;
|
||||
};
|
||||
private:
|
||||
garbage_collected_sstable_writer::data* _data = nullptr;
|
||||
public:
|
||||
explicit garbage_collected_sstable_writer() = default;
|
||||
explicit garbage_collected_sstable_writer(garbage_collected_sstable_writer::data& data) : _data(&data) {}
|
||||
|
||||
garbage_collected_sstable_writer& operator=(const garbage_collected_sstable_writer&) = delete;
|
||||
garbage_collected_sstable_writer(const garbage_collected_sstable_writer&) = delete;
|
||||
|
||||
garbage_collected_sstable_writer(garbage_collected_sstable_writer&& other)
|
||||
: _c(other._c)
|
||||
, _temp_sealed_gc_sstables(std::move(other._temp_sealed_gc_sstables))
|
||||
, _active_write_monitors(std::move(other._active_write_monitors))
|
||||
, _sst(std::move(other._sst))
|
||||
, _writer(std::move(other._writer))
|
||||
, _run_identifier(other._run_identifier)
|
||||
, _consuming_new_partition(other._consuming_new_partition) {
|
||||
other._on_new_sstable_sealed_observer->disconnect();
|
||||
setup_on_new_sstable_sealed_handler();
|
||||
}
|
||||
|
||||
garbage_collected_sstable_writer& operator=(garbage_collected_sstable_writer&& other) {
|
||||
if (this != &other) {
|
||||
this->~garbage_collected_sstable_writer();
|
||||
new (this) garbage_collected_sstable_writer(std::move(other));
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
garbage_collected_sstable_writer(garbage_collected_sstable_writer&& other) = default;
|
||||
garbage_collected_sstable_writer& operator=(garbage_collected_sstable_writer&& other) = default;
|
||||
|
||||
void consume_new_partition(const dht::decorated_key& dk) {
|
||||
maybe_create_new_sstable_writer();
|
||||
_writer->consume_new_partition(dk);
|
||||
_consuming_new_partition = true;
|
||||
_data->maybe_create_new_sstable_writer();
|
||||
_data->_writer->consume_new_partition(dk);
|
||||
}
|
||||
|
||||
void consume(tombstone t) { _writer->consume(t); }
|
||||
stop_iteration consume(static_row&& sr, tombstone, bool) { return _writer->consume(std::move(sr)); }
|
||||
stop_iteration consume(clustering_row&& cr, row_tombstone, bool) { return _writer->consume(std::move(cr)); }
|
||||
stop_iteration consume(range_tombstone&& rt) { return _writer->consume(std::move(rt)); }
|
||||
void consume(tombstone t) { _data->_writer->consume(t); }
|
||||
stop_iteration consume(static_row&& sr, tombstone, bool) { return _data->_writer->consume(std::move(sr)); }
|
||||
stop_iteration consume(clustering_row&& cr, row_tombstone, bool) { return _data->_writer->consume(std::move(cr)); }
|
||||
stop_iteration consume(range_tombstone&& rt) { return _data->_writer->consume(std::move(rt)); }
|
||||
|
||||
stop_iteration consume_end_of_partition() {
|
||||
_writer->consume_end_of_partition();
|
||||
_consuming_new_partition = false;
|
||||
_data->_writer->consume_end_of_partition();
|
||||
return stop_iteration::no;
|
||||
}
|
||||
|
||||
void consume_end_of_stream() {
|
||||
finish_sstable_writer();
|
||||
on_end_of_stream();
|
||||
_data->finish_sstable_writer();
|
||||
}
|
||||
};
|
||||
|
||||
@@ -380,6 +384,7 @@ public:
|
||||
class compaction {
|
||||
protected:
|
||||
column_family& _cf;
|
||||
creator_fn _sstable_creator;
|
||||
schema_ptr _schema;
|
||||
std::vector<shared_sstable> _sstables;
|
||||
// Unused sstables are tracked because if compaction is interrupted we can only delete them.
|
||||
@@ -393,15 +398,17 @@ protected:
|
||||
std::vector<unsigned long> _ancestors;
|
||||
db::replay_position _rp;
|
||||
encoding_stats_collector _stats_collector;
|
||||
utils::observable<> _on_new_sstable_sealed;
|
||||
bool _contains_multi_fragment_runs = false;
|
||||
garbage_collected_sstable_writer::data _gc_sstable_writer_data;
|
||||
protected:
|
||||
compaction(column_family& cf, std::vector<shared_sstable> sstables, uint64_t max_sstable_size, uint32_t sstable_level)
|
||||
compaction(column_family& cf, creator_fn creator, std::vector<shared_sstable> sstables, uint64_t max_sstable_size, uint32_t sstable_level)
|
||||
: _cf(cf)
|
||||
, _sstable_creator(std::move(creator))
|
||||
, _schema(cf.schema())
|
||||
, _sstables(std::move(sstables))
|
||||
, _max_sstable_size(max_sstable_size)
|
||||
, _sstable_level(sstable_level)
|
||||
, _gc_sstable_writer_data(*this)
|
||||
{
|
||||
_info->cf = &cf;
|
||||
for (auto& sst : _sstables) {
|
||||
@@ -434,10 +441,6 @@ protected:
|
||||
writer = std::nullopt;
|
||||
sst->open_data().get0();
|
||||
_info->end_size += sst->bytes_on_disk();
|
||||
// Notify GC'ed-data sstable writer's handler that an output sstable has just been sealed.
|
||||
// The handler is responsible for making sure that deleting an input sstable will not
|
||||
// result in resurrection on failure.
|
||||
_on_new_sstable_sealed();
|
||||
}
|
||||
|
||||
api::timestamp_type maximum_timestamp() const {
|
||||
@@ -447,10 +450,6 @@ protected:
|
||||
return (*m)->get_stats_metadata().max_timestamp;
|
||||
}
|
||||
|
||||
utils::observer<> add_on_new_sstable_sealed_handler(std::function<void (void)> handler) noexcept {
|
||||
return _on_new_sstable_sealed.observe(std::move(handler));
|
||||
}
|
||||
|
||||
encoding_stats get_encoding_stats() const {
|
||||
return _stats_collector.get();
|
||||
}
|
||||
@@ -562,10 +561,9 @@ private:
|
||||
};
|
||||
}
|
||||
|
||||
virtual shared_sstable create_new_sstable() const = 0;
|
||||
|
||||
// select a sstable writer based on decorated key.
|
||||
virtual sstable_writer* select_sstable_writer(const dht::decorated_key& dk) = 0;
|
||||
|
||||
// stop current writer
|
||||
virtual void stop_sstable_writer() = 0;
|
||||
// finish all writers.
|
||||
@@ -588,20 +586,9 @@ private:
|
||||
sst->mark_for_deletion();
|
||||
}
|
||||
}
|
||||
|
||||
void setup_garbage_collected_sstable(shared_sstable sst) {
|
||||
// Add new sstable to table's set because expired tombstone should be available if compaction is abruptly stopped.
|
||||
_cf.add_sstable_and_update_cache(std::move(sst)).get();
|
||||
}
|
||||
|
||||
void eventually_delete_garbage_collected_sstable(shared_sstable sst) {
|
||||
// Add sstable to compaction's input list for it to be eventually removed from table's set.
|
||||
sst->mark_for_deletion();
|
||||
_sstables.push_back(std::move(sst));
|
||||
}
|
||||
public:
|
||||
garbage_collected_sstable_writer make_garbage_collected_sstable_writer() {
|
||||
return garbage_collected_sstable_writer(*this);
|
||||
return garbage_collected_sstable_writer(_gc_sstable_writer_data);
|
||||
}
|
||||
|
||||
bool contains_multi_fragment_runs() const {
|
||||
@@ -616,6 +603,7 @@ public:
|
||||
|
||||
friend class compacting_sstable_writer;
|
||||
friend class garbage_collected_sstable_writer;
|
||||
friend class garbage_collected_sstable_writer::data;
|
||||
};
|
||||
|
||||
void compacting_sstable_writer::consume_new_partition(const dht::decorated_key& dk) {
|
||||
@@ -642,22 +630,9 @@ void compacting_sstable_writer::consume_end_of_stream() {
|
||||
_c.finish_sstable_writer();
|
||||
}
|
||||
|
||||
void garbage_collected_sstable_writer::setup_on_new_sstable_sealed_handler() {
|
||||
_on_new_sstable_sealed_observer = _c->add_on_new_sstable_sealed_handler([this] {
|
||||
// NOTE: This handler is called, BEFORE an input sstable is possibly deleted
|
||||
// *AND* AFTER a new output sstable is sealed, to flush a garbage collected
|
||||
// sstable being currently written.
|
||||
// That way, data resurrection is prevented by making sure that the
|
||||
// GC'able data is still reachable in a temporary sstable.
|
||||
assert(!_consuming_new_partition);
|
||||
// Wait for current gc'ed-only-sstable to be flushed and added to table's set.
|
||||
this->finish_sstable_writer();
|
||||
});
|
||||
}
|
||||
|
||||
void garbage_collected_sstable_writer::maybe_create_new_sstable_writer() {
|
||||
void garbage_collected_sstable_writer::data::maybe_create_new_sstable_writer() {
|
||||
if (!_writer) {
|
||||
_sst = _c->create_new_sstable();
|
||||
_sst = _c->_sstable_creator(this_shard_id());
|
||||
|
||||
auto&& priority = service::get_local_compaction_priority();
|
||||
_active_write_monitors.emplace_back(_sst, _c->_cf, _c->maximum_timestamp(), _c->_sstable_level);
|
||||
@@ -668,25 +643,16 @@ void garbage_collected_sstable_writer::maybe_create_new_sstable_writer() {
|
||||
}
|
||||
}
|
||||
|
||||
void garbage_collected_sstable_writer::finish_sstable_writer() {
|
||||
void garbage_collected_sstable_writer::data::finish_sstable_writer() {
|
||||
if (_writer) {
|
||||
_writer->consume_end_of_stream();
|
||||
_writer = std::nullopt;
|
||||
_sst->open_data().get0();
|
||||
_c->setup_garbage_collected_sstable(_sst);
|
||||
_temp_sealed_gc_sstables.push_back(std::move(_sst));
|
||||
}
|
||||
}
|
||||
|
||||
void garbage_collected_sstable_writer::on_end_of_stream() {
|
||||
for (auto&& sst : _temp_sealed_gc_sstables) {
|
||||
clogger.debug("Asking for deletion of temporary tombstone-only sstable {}", sst->get_filename());
|
||||
_c->eventually_delete_garbage_collected_sstable(std::move(sst));
|
||||
_unused_garbage_collected_sstables.push_back(std::move(_sst));
|
||||
}
|
||||
}
|
||||
|
||||
class regular_compaction : public compaction {
|
||||
std::function<shared_sstable()> _creator;
|
||||
replacer_fn _replacer;
|
||||
std::unordered_set<shared_sstable> _compacting_for_max_purgeable_func;
|
||||
// store a clone of sstable set for column family, which needs to be alive for incremental selector.
|
||||
@@ -701,10 +667,9 @@ class regular_compaction : public compaction {
|
||||
std::deque<compaction_write_monitor> _active_write_monitors = {};
|
||||
utils::UUID _run_identifier;
|
||||
public:
|
||||
regular_compaction(column_family& cf, compaction_descriptor descriptor, std::function<shared_sstable()> creator, replacer_fn replacer)
|
||||
: compaction(cf, std::move(descriptor.sstables), descriptor.max_sstable_bytes, descriptor.level)
|
||||
, _creator(std::move(creator))
|
||||
, _replacer(std::move(replacer))
|
||||
regular_compaction(column_family& cf, compaction_descriptor descriptor)
|
||||
: compaction(cf, std::move(descriptor.creator), std::move(descriptor.sstables), descriptor.max_sstable_bytes, descriptor.level)
|
||||
, _replacer(std::move(descriptor.replacer))
|
||||
, _compacting_for_max_purgeable_func(std::unordered_set<shared_sstable>(_sstables.begin(), _sstables.end()))
|
||||
, _set(cf.get_sstable_set())
|
||||
, _selector(_set.make_incremental_selector())
|
||||
@@ -755,13 +720,9 @@ public:
|
||||
};
|
||||
}
|
||||
|
||||
virtual shared_sstable create_new_sstable() const override {
|
||||
return _creator();
|
||||
}
|
||||
|
||||
virtual sstable_writer* select_sstable_writer(const dht::decorated_key& dk) override {
|
||||
if (!_writer) {
|
||||
_sst = _creator();
|
||||
_sst = _sstable_creator(0);
|
||||
setup_new_sstable(_sst);
|
||||
|
||||
_active_write_monitors.emplace_back(_sst, _cf, maximum_timestamp(), _sstable_level);
|
||||
@@ -834,6 +795,15 @@ private:
|
||||
// Fully expired sstable is not actually compacted, therefore it's not present in the compacting set.
|
||||
_compacting->erase(sst);
|
||||
});
|
||||
// Make sure SSTable created by garbage collected writer is made available
|
||||
// before exhausted SSTable is released, so as to prevent data resurrection.
|
||||
_gc_sstable_writer_data.finish_sstable_writer();
|
||||
// Add garbage collected SSTables to the list of unused SSTables that will be added
|
||||
// to SSTable set. GC SSTables should be added before compaction completes because
|
||||
// a failure could result in data resurrection if data is not made available.
|
||||
auto unused_gc_sstables = _gc_sstable_writer_data.consume_unused_garbage_collected_sstables();
|
||||
_new_unused_sstables.insert(_new_unused_sstables.end(), unused_gc_sstables.begin(), unused_gc_sstables.end());
|
||||
|
||||
auto exhausted_ssts = std::vector<shared_sstable>(exhausted, _sstables.end());
|
||||
_replacer(get_compaction_completion_desc(exhausted_ssts, std::move(_new_unused_sstables)));
|
||||
_sstables.erase(exhausted, _sstables.end());
|
||||
@@ -842,11 +812,16 @@ private:
|
||||
}
|
||||
|
||||
void replace_remaining_exhausted_sstables() {
|
||||
if (!_sstables.empty()) {
|
||||
std::vector<shared_sstable> sstables_compacted;
|
||||
std::move(_sstables.begin(), _sstables.end(), std::back_inserter(sstables_compacted));
|
||||
_replacer(get_compaction_completion_desc(std::move(sstables_compacted), std::move(_new_unused_sstables)));
|
||||
}
|
||||
if (!_sstables.empty() || !_gc_sstable_writer_data.used_garbage_collected_sstables().empty()) {
|
||||
std::vector<shared_sstable> old_sstables;
|
||||
std::move(_sstables.begin(), _sstables.end(), std::back_inserter(old_sstables));
|
||||
|
||||
// Remove Garbage Collected SSTables from the SSTable set if any was previously added.
|
||||
auto& used_garbage_collected_sstables = _gc_sstable_writer_data.used_garbage_collected_sstables();
|
||||
old_sstables.insert(old_sstables.end(), used_garbage_collected_sstables.begin(), used_garbage_collected_sstables.end());
|
||||
|
||||
_replacer(get_compaction_completion_desc(std::move(old_sstables), std::move(_new_unused_sstables)));
|
||||
}
|
||||
}
|
||||
|
||||
void do_pending_replacements() {
|
||||
@@ -909,8 +884,8 @@ protected:
|
||||
return compaction_completion_desc{std::move(input_sstables), std::move(output_sstables), std::move(ranges_for_for_invalidation)};
|
||||
}
|
||||
public:
|
||||
cleanup_compaction(column_family& cf, compaction_descriptor descriptor, std::function<shared_sstable()> creator, replacer_fn replacer)
|
||||
: regular_compaction(cf, std::move(descriptor), std::move(creator), std::move(replacer))
|
||||
cleanup_compaction(column_family& cf, compaction_descriptor descriptor)
|
||||
: regular_compaction(cf, std::move(descriptor))
|
||||
, _owned_ranges(service::get_local_storage_service().get_local_ranges(_schema->ks_name()))
|
||||
{
|
||||
_info->type = compaction_type::Cleanup;
|
||||
@@ -1114,9 +1089,8 @@ private:
|
||||
compaction_options::scrub _options;
|
||||
|
||||
public:
|
||||
scrub_compaction(column_family& cf, compaction_descriptor descriptor, compaction_options::scrub options, std::function<shared_sstable()> creator,
|
||||
replacer_fn replacer)
|
||||
: regular_compaction(cf, std::move(descriptor), std::move(creator), std::move(replacer))
|
||||
scrub_compaction(column_family& cf, compaction_descriptor descriptor, compaction_options::scrub options)
|
||||
: regular_compaction(cf, std::move(descriptor))
|
||||
, _options(options) {
|
||||
_info->type = compaction_type::Scrub;
|
||||
}
|
||||
@@ -1143,7 +1117,6 @@ flat_mutation_reader make_scrubbing_reader(flat_mutation_reader rd, bool skip_co
|
||||
class resharding_compaction final : public compaction {
|
||||
std::vector<std::pair<shared_sstable, std::optional<sstable_writer>>> _output_sstables;
|
||||
shard_id _shard; // shard of current sstable writer
|
||||
std::function<shared_sstable(shard_id)> _sstable_creator;
|
||||
compaction_backlog_tracker _resharding_backlog_tracker;
|
||||
|
||||
// Partition count estimation for a shard S:
|
||||
@@ -1168,11 +1141,9 @@ private:
|
||||
return ceil(double(_estimation_per_shard[s].estimated_partitions) / estimated_sstables);
|
||||
}
|
||||
public:
|
||||
resharding_compaction(std::vector<shared_sstable> sstables, column_family& cf, std::function<shared_sstable(shard_id)> creator,
|
||||
uint64_t max_sstable_size, uint32_t sstable_level)
|
||||
: compaction(cf, std::move(sstables), max_sstable_size, sstable_level)
|
||||
resharding_compaction(column_family& cf, sstables::compaction_descriptor descriptor)
|
||||
: compaction(cf, std::move(descriptor.creator), std::move(descriptor.sstables), descriptor.max_sstable_bytes, descriptor.level)
|
||||
, _output_sstables(smp::count)
|
||||
, _sstable_creator(std::move(creator))
|
||||
, _resharding_backlog_tracker(std::make_unique<resharding_backlog_tracker>())
|
||||
, _estimation_per_shard(smp::count)
|
||||
, _run_identifiers(smp::count)
|
||||
@@ -1224,10 +1195,6 @@ public:
|
||||
|
||||
void backlog_tracker_adjust_charges() override { }
|
||||
|
||||
shared_sstable create_new_sstable() const override {
|
||||
return _sstable_creator(_shard);
|
||||
}
|
||||
|
||||
sstable_writer* select_sstable_writer(const dht::decorated_key& dk) override {
|
||||
_shard = dht::shard_of(*_schema, dk.token());
|
||||
auto& sst = _output_sstables[_shard].first;
|
||||
@@ -1298,38 +1265,35 @@ compaction_type compaction_options::type() const {
|
||||
return index_to_type[_options.index()];
|
||||
}
|
||||
|
||||
static std::unique_ptr<compaction> make_compaction(column_family& cf, sstables::compaction_descriptor descriptor,
|
||||
std::function<shared_sstable()> creator, replacer_fn replacer) {
|
||||
static std::unique_ptr<compaction> make_compaction(column_family& cf, sstables::compaction_descriptor descriptor) {
|
||||
struct {
|
||||
column_family& cf;
|
||||
sstables::compaction_descriptor&& descriptor;
|
||||
std::function<shared_sstable()>&& creator;
|
||||
replacer_fn&& replacer;
|
||||
|
||||
std::unique_ptr<compaction> operator()(compaction_options::regular) {
|
||||
return std::make_unique<regular_compaction>(cf, std::move(descriptor), std::move(creator), std::move(replacer));
|
||||
return std::make_unique<regular_compaction>(cf, std::move(descriptor));
|
||||
}
|
||||
std::unique_ptr<compaction> operator()(compaction_options::cleanup) {
|
||||
return std::make_unique<cleanup_compaction>(cf, std::move(descriptor), std::move(creator), std::move(replacer));
|
||||
return std::make_unique<cleanup_compaction>(cf, std::move(descriptor));
|
||||
}
|
||||
std::unique_ptr<compaction> operator()(compaction_options::upgrade) {
|
||||
return std::make_unique<cleanup_compaction>(cf, std::move(descriptor), std::move(creator), std::move(replacer));
|
||||
return std::make_unique<cleanup_compaction>(cf, std::move(descriptor));
|
||||
}
|
||||
std::unique_ptr<compaction> operator()(compaction_options::scrub scrub_options) {
|
||||
return std::make_unique<scrub_compaction>(cf, std::move(descriptor), scrub_options, std::move(creator), std::move(replacer));
|
||||
return std::make_unique<scrub_compaction>(cf, std::move(descriptor), scrub_options);
|
||||
}
|
||||
} visitor_factory{cf, std::move(descriptor), std::move(creator), std::move(replacer)};
|
||||
} visitor_factory{cf, std::move(descriptor)};
|
||||
|
||||
return descriptor.options.visit(visitor_factory);
|
||||
}
|
||||
|
||||
future<compaction_info>
|
||||
compact_sstables(sstables::compaction_descriptor descriptor, column_family& cf, std::function<shared_sstable()> creator, replacer_fn replacer) {
|
||||
compact_sstables(sstables::compaction_descriptor descriptor, column_family& cf) {
|
||||
if (descriptor.sstables.empty()) {
|
||||
throw std::runtime_error(format("Called {} compaction with empty set on behalf of {}.{}", compaction_name(descriptor.options.type()),
|
||||
cf.schema()->ks_name(), cf.schema()->cf_name()));
|
||||
}
|
||||
auto c = make_compaction(cf, std::move(descriptor), std::move(creator), std::move(replacer));
|
||||
auto c = make_compaction(cf, std::move(descriptor));
|
||||
if (c->contains_multi_fragment_runs()) {
|
||||
auto gc_writer = c->make_garbage_collected_sstable_writer();
|
||||
return compaction::run(std::move(c), std::move(gc_writer));
|
||||
@@ -1343,7 +1307,10 @@ reshard_sstables(std::vector<shared_sstable> sstables, column_family& cf, std::f
|
||||
if (sstables.empty()) {
|
||||
throw std::runtime_error(format("Called resharding with empty set on behalf of {}.{}", cf.schema()->ks_name(), cf.schema()->cf_name()));
|
||||
}
|
||||
auto c = std::make_unique<resharding_compaction>(std::move(sstables), cf, std::move(creator), max_sstable_size, sstable_level);
|
||||
sstables::compaction_descriptor descriptor(std::move(sstables), sstable_level, max_sstable_size);
|
||||
descriptor.creator = std::move(creator);
|
||||
|
||||
auto c = std::make_unique<resharding_compaction>(cf, std::move(descriptor));
|
||||
return compaction::run(std::move(c)).then([] (auto ret) {
|
||||
return std::move(ret.new_sstables);
|
||||
});
|
||||
|
||||
@@ -84,6 +84,20 @@ namespace sstables {
|
||||
compaction_type type() const;
|
||||
};
|
||||
|
||||
struct compaction_completion_desc {
|
||||
// Old, existing SSTables that should be deleted and removed from the SSTable set.
|
||||
std::vector<shared_sstable> old_sstables;
|
||||
// New, fresh SSTables that should be added to SSTable set, replacing the old ones.
|
||||
std::vector<shared_sstable> new_sstables;
|
||||
// Set of compacted partition ranges that should be invalidated in the cache.
|
||||
dht::partition_range_vector ranges_for_cache_invalidation;
|
||||
};
|
||||
|
||||
// creates a new SSTable for a given shard
|
||||
using creator_fn = std::function<shared_sstable(shard_id shard)>;
|
||||
// Replaces old sstable(s) by new one(s) which contain all non-expired data.
|
||||
using replacer_fn = std::function<void(compaction_completion_desc)>;
|
||||
|
||||
struct compaction_descriptor {
|
||||
// List of sstables to be compacted.
|
||||
std::vector<sstables::shared_sstable> sstables;
|
||||
@@ -101,6 +115,9 @@ namespace sstables {
|
||||
// This also selects the kind of compaction to do.
|
||||
compaction_options options = compaction_options::make_regular();
|
||||
|
||||
creator_fn creator;
|
||||
replacer_fn replacer;
|
||||
|
||||
compaction_descriptor() = default;
|
||||
|
||||
static constexpr int default_level = 0;
|
||||
@@ -190,16 +207,6 @@ namespace sstables {
|
||||
}
|
||||
};
|
||||
|
||||
struct compaction_completion_desc {
|
||||
std::vector<shared_sstable> input_sstables;
|
||||
std::vector<shared_sstable> output_sstables;
|
||||
// Set of compacted partition ranges that should be invalidated in the cache.
|
||||
dht::partition_range_vector ranges_for_cache_invalidation;
|
||||
};
|
||||
|
||||
// Replaces old sstable(s) by new one(s) which contain all non-expired data.
|
||||
using replacer_fn = std::function<void(compaction_completion_desc)>;
|
||||
|
||||
// Compact a list of N sstables into M sstables.
|
||||
// Returns info about the finished compaction, which includes vector to new sstables.
|
||||
//
|
||||
@@ -212,8 +219,7 @@ namespace sstables {
|
||||
// If descriptor.cleanup is true, mutation that doesn't belong to current node will be
|
||||
// cleaned up, log messages will inform the user that compact_sstables runs for
|
||||
// cleaning operation, and compaction history will not be updated.
|
||||
future<compaction_info> compact_sstables(sstables::compaction_descriptor descriptor, column_family& cf,
|
||||
std::function<shared_sstable()> creator, replacer_fn replacer);
|
||||
future<compaction_info> compact_sstables(sstables::compaction_descriptor descriptor, column_family& cf);
|
||||
|
||||
// Compacts a set of N shared sstables into M sstables. For every shard involved,
|
||||
// i.e. which owns any of the sstables, a new unshared sstable is created.
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user