Compare commits
134 Commits
next
...
scylla-4.5
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
53b0aaa4e8 | ||
|
|
ebfa2279a4 | ||
|
|
5e38a69f6d | ||
|
|
6a54033a63 | ||
|
|
ec44412cd9 | ||
|
|
8f45f65b09 | ||
|
|
5a7324c423 | ||
|
|
2eb0ad7b4f | ||
|
|
5d7064e00e | ||
|
|
b56b9f5ed5 | ||
|
|
c8f14886dc | ||
|
|
5c8057749b | ||
|
|
a35646b874 | ||
|
|
da8708932d | ||
|
|
78a545716a | ||
|
|
1488278fc1 | ||
|
|
d9455a910f | ||
|
|
406b4bce8d | ||
|
|
417e853b9b | ||
|
|
44c784cb79 | ||
|
|
ab425a11a8 | ||
|
|
2228a1a92a | ||
|
|
36b190a65e | ||
|
|
f7e5339c14 | ||
|
|
4c4972cb33 | ||
|
|
50ce5bef2c | ||
|
|
f864eea844 | ||
|
|
b9735ab079 | ||
|
|
766e16f19e | ||
|
|
e6520df41c | ||
|
|
26aca7b9f7 | ||
|
|
103c85a23f | ||
|
|
db66b62e80 | ||
|
|
098fcf900f | ||
|
|
84025f6ce0 | ||
|
|
9898a114a6 | ||
|
|
4c0eac0491 | ||
|
|
c1d8ce7328 | ||
|
|
5c5a71d2d7 | ||
|
|
454ff04ff6 | ||
|
|
f5f5b9a307 | ||
|
|
a433c5fe06 | ||
|
|
7f96ee6689 | ||
|
|
ebe196e32d | ||
|
|
38aa455e83 | ||
|
|
a99382a076 | ||
|
|
6e2d055be3 | ||
|
|
152f710dec | ||
|
|
14620444a2 | ||
|
|
47be33a104 | ||
|
|
18b8388958 | ||
|
|
56b24818ec | ||
|
|
5ed149b7e1 | ||
|
|
9265dbd5f7 | ||
|
|
443fda8fb1 | ||
|
|
02da29fd05 | ||
|
|
edead1caf9 | ||
|
|
8dbd4edbb5 | ||
|
|
02bb2e1f4c | ||
|
|
4bae31523d | ||
|
|
55348131f9 | ||
|
|
c1b9de3d5e | ||
|
|
9956bce436 | ||
|
|
95f32428e4 | ||
|
|
5b3319816a | ||
|
|
8f63a9de31 | ||
|
|
88314fedfa | ||
|
|
b0edfa6d70 | ||
|
|
9338f6b6b8 | ||
|
|
28940ef505 | ||
|
|
4c03bcce4c | ||
|
|
860e2190a9 | ||
|
|
6cf88812f6 | ||
|
|
89fbcf9c81 | ||
|
|
6dc7ef512d | ||
|
|
ae39b30ed3 | ||
|
|
d54372d699 | ||
|
|
7f96719c55 | ||
|
|
0c33983c71 | ||
|
|
3c51b4066b | ||
|
|
efa4d24deb | ||
|
|
3e1d608111 | ||
|
|
a87bb38c29 | ||
|
|
ab3e284e04 | ||
|
|
b0e833d9e5 | ||
|
|
9e55d9bd04 | ||
|
|
f89f4e69a0 | ||
|
|
7146646bf4 | ||
|
|
b3aba49ab0 | ||
|
|
706de00ef2 | ||
|
|
cd5b915460 | ||
|
|
247d30f075 | ||
|
|
13da17e6fe | ||
|
|
6e29d74ab8 | ||
|
|
e820e7f3c5 | ||
|
|
0cebafd104 | ||
|
|
9abd4677b1 | ||
|
|
9a07d7ca76 | ||
|
|
d92a26636a | ||
|
|
7445bfec86 | ||
|
|
b077b198bf | ||
|
|
44f85d2ba0 | ||
|
|
ccfe1d12ea | ||
|
|
b0399a7c3b | ||
|
|
b81919dbe2 | ||
|
|
5651a20ba1 | ||
|
|
8c30b83ea4 | ||
|
|
fce7eab9ac | ||
|
|
ab8eefade7 | ||
|
|
e2704554b5 | ||
|
|
e36e490469 | ||
|
|
c97005fbb8 | ||
|
|
d881d539f3 | ||
|
|
b8a502fab0 | ||
|
|
f7f2bb482f | ||
|
|
b16db6512c | ||
|
|
1a7c8223fe | ||
|
|
ac6aa66a7b | ||
|
|
98a39884c3 | ||
|
|
88192811e7 | ||
|
|
32f21f7281 | ||
|
|
c9eaf95750 | ||
|
|
44c6d0fcf9 | ||
|
|
4bcc0badb2 | ||
|
|
97664e63fe | ||
|
|
204964637a | ||
|
|
c402abe8e9 | ||
|
|
4a78d6403e | ||
|
|
2f20d52ac7 | ||
|
|
540439ee46 | ||
|
|
a0622e85ab | ||
|
|
90741dc62c | ||
|
|
83cfa6a63c | ||
|
|
1816c6df8c |
2
.gitmodules
vendored
2
.gitmodules
vendored
@@ -1,6 +1,6 @@
|
||||
[submodule "seastar"]
|
||||
path = seastar
|
||||
url = ../seastar
|
||||
url = ../scylla-seastar
|
||||
ignore = dirty
|
||||
[submodule "swagger-ui"]
|
||||
path = swagger-ui
|
||||
|
||||
1
DEDICATION.txt
Normal file
1
DEDICATION.txt
Normal file
@@ -0,0 +1 @@
|
||||
Dedicated to the memory of Alberto José Araújo, a coworker and a friend.
|
||||
@@ -1,7 +1,7 @@
|
||||
#!/bin/sh
|
||||
|
||||
PRODUCT=scylla
|
||||
VERSION=4.5.dev
|
||||
VERSION=4.5.4
|
||||
|
||||
if test -f version
|
||||
then
|
||||
|
||||
@@ -123,7 +123,7 @@ struct rjson_engaged_ptr_comp {
|
||||
// as internally they're stored in an array, and the order of elements is
|
||||
// not important in set equality. See issue #5021
|
||||
static bool check_EQ_for_sets(const rjson::value& set1, const rjson::value& set2) {
|
||||
if (set1.Size() != set2.Size()) {
|
||||
if (!set1.IsArray() || !set2.IsArray() || set1.Size() != set2.Size()) {
|
||||
return false;
|
||||
}
|
||||
std::set<const rjson::value*, rjson_engaged_ptr_comp> set1_raw;
|
||||
@@ -137,25 +137,70 @@ static bool check_EQ_for_sets(const rjson::value& set1, const rjson::value& set2
|
||||
}
|
||||
return true;
|
||||
}
|
||||
// Moreover, the JSON being compared can be a nested document with outer
|
||||
// layers of lists and maps and some inner set - and we need to get to that
|
||||
// inner set to compare it correctly with check_EQ_for_sets() (issue #8514).
|
||||
static bool check_EQ(const rjson::value* v1, const rjson::value& v2);
|
||||
static bool check_EQ_for_lists(const rjson::value& list1, const rjson::value& list2) {
|
||||
if (!list1.IsArray() || !list2.IsArray() || list1.Size() != list2.Size()) {
|
||||
return false;
|
||||
}
|
||||
auto it1 = list1.Begin();
|
||||
auto it2 = list2.Begin();
|
||||
while (it1 != list1.End()) {
|
||||
// Note: Alternator limits an item's depth (rjson::parse() limits
|
||||
// it to around 37 levels), so this recursion is safe.
|
||||
if (!check_EQ(&*it1, *it2)) {
|
||||
return false;
|
||||
}
|
||||
++it1;
|
||||
++it2;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
static bool check_EQ_for_maps(const rjson::value& list1, const rjson::value& list2) {
|
||||
if (!list1.IsObject() || !list2.IsObject() || list1.MemberCount() != list2.MemberCount()) {
|
||||
return false;
|
||||
}
|
||||
for (auto it1 = list1.MemberBegin(); it1 != list1.MemberEnd(); ++it1) {
|
||||
auto it2 = list2.FindMember(it1->name);
|
||||
if (it2 == list2.MemberEnd() || !check_EQ(&it1->value, it2->value)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// Check if two JSON-encoded values match with the EQ relation
|
||||
static bool check_EQ(const rjson::value* v1, const rjson::value& v2) {
|
||||
if (!v1) {
|
||||
return false;
|
||||
}
|
||||
if (v1->IsObject() && v1->MemberCount() == 1 && v2.IsObject() && v2.MemberCount() == 1) {
|
||||
if (v1 && v1->IsObject() && v1->MemberCount() == 1 && v2.IsObject() && v2.MemberCount() == 1) {
|
||||
auto it1 = v1->MemberBegin();
|
||||
auto it2 = v2.MemberBegin();
|
||||
if ((it1->name == "SS" && it2->name == "SS") || (it1->name == "NS" && it2->name == "NS") || (it1->name == "BS" && it2->name == "BS")) {
|
||||
return check_EQ_for_sets(it1->value, it2->value);
|
||||
if (it1->name != it2->name) {
|
||||
return false;
|
||||
}
|
||||
if (it1->name == "SS" || it1->name == "NS" || it1->name == "BS") {
|
||||
return check_EQ_for_sets(it1->value, it2->value);
|
||||
} else if(it1->name == "L") {
|
||||
return check_EQ_for_lists(it1->value, it2->value);
|
||||
} else if(it1->name == "M") {
|
||||
return check_EQ_for_maps(it1->value, it2->value);
|
||||
} else {
|
||||
// Other, non-nested types (number, string, etc.) can be compared
|
||||
// literally, comparing their JSON representation.
|
||||
return it1->value == it2->value;
|
||||
}
|
||||
} else {
|
||||
// If v1 and/or v2 are missing (IsNull()) the result should be false.
|
||||
// In the unlikely case that the object is malformed (issue #8070),
|
||||
// let's also return false.
|
||||
return false;
|
||||
}
|
||||
return *v1 == v2;
|
||||
}
|
||||
|
||||
// Check if two JSON-encoded values match with the NE relation
|
||||
static bool check_NE(const rjson::value* v1, const rjson::value& v2) {
|
||||
return !v1 || *v1 != v2; // null is unequal to anything.
|
||||
return !check_EQ(v1, v2);
|
||||
}
|
||||
|
||||
// Check if two JSON-encoded values match with the BEGINS_WITH relation
|
||||
@@ -298,6 +343,8 @@ static bool check_NOT_NULL(const rjson::value* val) {
|
||||
|
||||
// Only types S, N or B (string, number or bytes) may be compared by the
|
||||
// various comparion operators - lt, le, gt, ge, and between.
|
||||
// Note that in particular, if the value is missing (v->IsNull()), this
|
||||
// check returns false.
|
||||
static bool check_comparable_type(const rjson::value& v) {
|
||||
if (!v.IsObject() || v.MemberCount() != 1) {
|
||||
return false;
|
||||
|
||||
@@ -2442,8 +2442,8 @@ static bool hierarchy_actions(
|
||||
if (newv) {
|
||||
rjson::set_with_string_name(v, attr, std::move(*newv));
|
||||
} else {
|
||||
throw api_error::validation(format("Can't remove document path {} - not present in item",
|
||||
subh.get_value()._path));
|
||||
// Removing a.b when a is a map but a.b doesn't exist
|
||||
// is silently ignored. It's not considered an error.
|
||||
}
|
||||
} else {
|
||||
throw api_error::validation(format("UpdateExpression: document paths not valid for this item:{}", h));
|
||||
@@ -2509,7 +2509,7 @@ update_item_operation::apply(std::unique_ptr<rjson::value> previous_item, api::t
|
||||
const attribute_path_map_node<parsed::update_expression::action>* h = nullptr) {
|
||||
any_updates = true;
|
||||
if (_returnvalues == returnvalues::ALL_NEW) {
|
||||
rjson::set_with_string_name(_return_attributes,
|
||||
rjson::replace_with_string_name(_return_attributes,
|
||||
to_sstring_view(column_name), rjson::copy(json_value));
|
||||
} else if (_returnvalues == returnvalues::UPDATED_NEW) {
|
||||
rjson::value&& v = rjson::copy(json_value);
|
||||
|
||||
@@ -129,6 +129,10 @@ public:
|
||||
[&] (const json::json_return_type& json_return_value) {
|
||||
slogger.trace("api_handler success case");
|
||||
if (json_return_value._body_writer) {
|
||||
// Unfortunately, write_body() forces us to choose
|
||||
// from a fixed and irrelevant list of "mime-types"
|
||||
// at this point. But we'll override it with the
|
||||
// one (application/x-amz-json-1.0) below.
|
||||
rep->write_body("json", std::move(json_return_value._body_writer));
|
||||
} else {
|
||||
rep->_content += json_return_value._res;
|
||||
@@ -141,7 +145,7 @@ public:
|
||||
|
||||
return make_ready_future<std::unique_ptr<reply>>(std::move(rep));
|
||||
});
|
||||
}), _type("json") { }
|
||||
}) { }
|
||||
|
||||
api_handler(const api_handler&) = default;
|
||||
future<std::unique_ptr<reply>> handle(const sstring& path,
|
||||
@@ -149,7 +153,8 @@ public:
|
||||
handle_CORS(*req, *rep, false);
|
||||
return _f_handle(std::move(req), std::move(rep)).then(
|
||||
[this](std::unique_ptr<reply> rep) {
|
||||
rep->done(_type);
|
||||
rep->set_mime_type("application/x-amz-json-1.0");
|
||||
rep->done();
|
||||
return make_ready_future<std::unique_ptr<reply>>(std::move(rep));
|
||||
});
|
||||
}
|
||||
@@ -163,7 +168,6 @@ protected:
|
||||
}
|
||||
|
||||
future_handler_function _f_handle;
|
||||
sstring _type;
|
||||
};
|
||||
|
||||
class gated_handler : public handler_base {
|
||||
@@ -246,24 +250,31 @@ future<> server::verify_signature(const request& req, const chunked_content& con
|
||||
throw api_error::missing_authentication_token("Authorization header is mandatory for signature verification");
|
||||
}
|
||||
std::string host = host_it->second;
|
||||
std::vector<std::string_view> credentials_raw = split(authorization_it->second, ' ');
|
||||
std::string_view authorization_header = authorization_it->second;
|
||||
auto pos = authorization_header.find_first_of(' ');
|
||||
if (pos == std::string_view::npos || authorization_header.substr(0, pos) != "AWS4-HMAC-SHA256") {
|
||||
throw api_error::invalid_signature(format("Authorization header must use AWS4-HMAC-SHA256 algorithm: {}", authorization_header));
|
||||
}
|
||||
authorization_header.remove_prefix(pos+1);
|
||||
std::string credential;
|
||||
std::string user_signature;
|
||||
std::string signed_headers_str;
|
||||
std::vector<std::string_view> signed_headers;
|
||||
for (std::string_view entry : credentials_raw) {
|
||||
do {
|
||||
// Either one of a comma or space can mark the end of an entry
|
||||
pos = authorization_header.find_first_of(" ,");
|
||||
std::string_view entry = authorization_header.substr(0, pos);
|
||||
if (pos != std::string_view::npos) {
|
||||
authorization_header.remove_prefix(pos + 1);
|
||||
}
|
||||
if (entry.empty()) {
|
||||
continue;
|
||||
}
|
||||
std::vector<std::string_view> entry_split = split(entry, '=');
|
||||
if (entry_split.size() != 2) {
|
||||
if (entry != "AWS4-HMAC-SHA256") {
|
||||
throw api_error::invalid_signature(format("Only AWS4-HMAC-SHA256 algorithm is supported. Found: {}", entry));
|
||||
}
|
||||
continue;
|
||||
}
|
||||
std::string_view auth_value = entry_split[1];
|
||||
// Commas appear as an additional (quite redundant) delimiter
|
||||
if (auth_value.back() == ',') {
|
||||
auth_value.remove_suffix(1);
|
||||
}
|
||||
if (entry_split[0] == "Credential") {
|
||||
credential = std::string(auth_value);
|
||||
} else if (entry_split[0] == "Signature") {
|
||||
@@ -273,7 +284,8 @@ future<> server::verify_signature(const request& req, const chunked_content& con
|
||||
signed_headers = split(auth_value, ';');
|
||||
std::sort(signed_headers.begin(), signed_headers.end());
|
||||
}
|
||||
}
|
||||
} while (pos != std::string_view::npos);
|
||||
|
||||
std::vector<std::string_view> credential_split = split(credential, '/');
|
||||
if (credential_split.size() != 5) {
|
||||
throw api_error::validation(format("Incorrect credential information format: {}", credential));
|
||||
|
||||
@@ -38,6 +38,7 @@ stats::stats() : api_operations{} {
|
||||
#define OPERATION_LATENCY(name, CamelCaseName) \
|
||||
seastar::metrics::make_histogram("op_latency", \
|
||||
seastar::metrics::description("Latency histogram of an operation via Alternator API"), {op(CamelCaseName)}, [this]{return to_metrics_histogram(api_operations.name);}),
|
||||
OPERATION(batch_get_item, "BatchGetItem")
|
||||
OPERATION(batch_write_item, "BatchWriteItem")
|
||||
OPERATION(create_backup, "CreateBackup")
|
||||
OPERATION(create_global_table, "CreateGlobalTable")
|
||||
|
||||
@@ -332,15 +332,15 @@ void set_column_family(http_context& ctx, routes& r) {
|
||||
});
|
||||
|
||||
cf::get_memtable_columns_count.set(r, [&ctx] (std::unique_ptr<request> req) {
|
||||
return map_reduce_cf(ctx, req->param["name"], 0, [](column_family& cf) {
|
||||
return map_reduce_cf(ctx, req->param["name"], uint64_t{0}, [](column_family& cf) {
|
||||
return cf.active_memtable().partition_count();
|
||||
}, std::plus<int>());
|
||||
}, std::plus<>());
|
||||
});
|
||||
|
||||
cf::get_all_memtable_columns_count.set(r, [&ctx] (std::unique_ptr<request> req) {
|
||||
return map_reduce_cf(ctx, 0, [](column_family& cf) {
|
||||
return map_reduce_cf(ctx, uint64_t{0}, [](column_family& cf) {
|
||||
return cf.active_memtable().partition_count();
|
||||
}, std::plus<int>());
|
||||
}, std::plus<>());
|
||||
});
|
||||
|
||||
cf::get_memtable_on_heap_size.set(r, [] (const_req req) {
|
||||
|
||||
@@ -262,7 +262,7 @@ void set_repair(http_context& ctx, routes& r, sharded<netw::messaging_service>&
|
||||
try {
|
||||
res = fut.get0();
|
||||
} catch (std::exception& e) {
|
||||
return make_exception_future<json::json_return_type>(httpd::server_error_exception(e.what()));
|
||||
return make_exception_future<json::json_return_type>(httpd::bad_param_exception(e.what()));
|
||||
}
|
||||
return make_ready_future<json::json_return_type>(json::json_return_type(res));
|
||||
});
|
||||
|
||||
@@ -267,6 +267,9 @@ future<> cache_flat_mutation_reader::do_fill_buffer(db::timeout_clock::time_poin
|
||||
}
|
||||
_state = state::reading_from_underlying;
|
||||
_population_range_starts_before_all_rows = _lower_bound.is_before_all_clustered_rows(*_schema);
|
||||
if (!_read_context->partition_exists()) {
|
||||
return read_from_underlying(timeout);
|
||||
}
|
||||
auto end = _next_row_in_range ? position_in_partition(_next_row.position())
|
||||
: position_in_partition(_upper_bound);
|
||||
return _underlying->fast_forward_to(position_range{_lower_bound, std::move(end)}, timeout).then([this, timeout] {
|
||||
@@ -573,8 +576,8 @@ void cache_flat_mutation_reader::move_to_range(query::clustering_row_ranges::con
|
||||
clogger.trace("csm {}: insert dummy at {}", fmt::ptr(this), _lower_bound);
|
||||
auto it = with_allocator(_lsa_manager.region().allocator(), [&] {
|
||||
auto& rows = _snp->version()->partition().clustered_rows();
|
||||
auto new_entry = current_allocator().construct<rows_entry>(*_schema, _lower_bound, is_dummy::yes, is_continuous::no);
|
||||
return rows.insert_before(_next_row.get_iterator_in_latest_version(), *new_entry);
|
||||
auto new_entry = alloc_strategy_unique_ptr<rows_entry>(current_allocator().construct<rows_entry>(*_schema, _lower_bound, is_dummy::yes, is_continuous::no));
|
||||
return rows.insert_before(_next_row.get_iterator_in_latest_version(), std::move(new_entry));
|
||||
});
|
||||
_snp->tracker()->insert(*it);
|
||||
_last_row = partition_snapshot_row_weakref(*_snp, it, true);
|
||||
|
||||
@@ -716,16 +716,16 @@ private:
|
||||
}
|
||||
return false;
|
||||
}
|
||||
bool compare(const T&, const value_type& v);
|
||||
int32_t compare(const T&, const value_type& v);
|
||||
};
|
||||
|
||||
template<>
|
||||
bool maybe_back_insert_iterator<std::vector<std::pair<bytes_view, bytes_view>>, bytes_view>::compare(const bytes_view& t, const value_type& v) {
|
||||
int32_t maybe_back_insert_iterator<std::vector<std::pair<bytes_view, bytes_view>>, bytes_view>::compare(const bytes_view& t, const value_type& v) {
|
||||
return _type.compare(t, v.first);
|
||||
}
|
||||
|
||||
template<>
|
||||
bool maybe_back_insert_iterator<std::vector<bytes_view>, bytes_view>::compare(const bytes_view& t, const value_type& v) {
|
||||
int32_t maybe_back_insert_iterator<std::vector<bytes_view>, bytes_view>::compare(const bytes_view& t, const value_type& v) {
|
||||
return _type.compare(t, v);
|
||||
}
|
||||
|
||||
|
||||
@@ -302,7 +302,7 @@ scylla_tests = set([
|
||||
'test/boost/cdc_generation_test',
|
||||
'test/boost/aggregate_fcts_test',
|
||||
'test/boost/allocation_strategy_test',
|
||||
'test/boost/alternator_base64_test',
|
||||
'test/boost/alternator_unit_test',
|
||||
'test/boost/anchorless_list_test',
|
||||
'test/boost/auth_passwords_test',
|
||||
'test/boost/auth_resource_test',
|
||||
@@ -1076,7 +1076,7 @@ pure_boost_tests = set([
|
||||
])
|
||||
|
||||
tests_not_using_seastar_test_framework = set([
|
||||
'test/boost/alternator_base64_test',
|
||||
'test/boost/alternator_unit_test',
|
||||
'test/boost/small_vector_test',
|
||||
'test/manual/gossip',
|
||||
'test/manual/message',
|
||||
@@ -1152,7 +1152,7 @@ deps['test/boost/linearizing_input_stream_test'] = [
|
||||
]
|
||||
|
||||
deps['test/boost/duration_test'] += ['test/lib/exception_utils.cc']
|
||||
deps['test/boost/alternator_base64_test'] += ['alternator/base64.cc']
|
||||
deps['test/boost/alternator_unit_test'] += ['alternator/base64.cc']
|
||||
|
||||
deps['test/raft/replication_test'] = ['test/raft/replication_test.cc'] + scylla_raft_dependencies
|
||||
deps['test/raft/fsm_test'] = ['test/raft/fsm_test.cc', 'test/lib/log.cc'] + scylla_raft_dependencies
|
||||
@@ -2018,7 +2018,7 @@ with open(buildfile_tmp, 'w') as f:
|
||||
command = ./dist/debian/debian_files_gen.py
|
||||
build $builddir/debian/debian: debian_files_gen | always
|
||||
rule extract_node_exporter
|
||||
command = tar -C build -xvpf {node_exporter_filename} && rm -rfv build/node_exporter && mv -v build/{node_exporter_dirname} build/node_exporter
|
||||
command = tar -C build -xvpf {node_exporter_filename} --no-same-owner && rm -rfv build/node_exporter && mv -v build/{node_exporter_dirname} build/node_exporter
|
||||
build $builddir/node_exporter: extract_node_exporter | always
|
||||
''').format(**globals()))
|
||||
|
||||
|
||||
@@ -1103,16 +1103,27 @@ bool statement_restrictions::need_filtering() const {
|
||||
// clustering restrictions. Therefore, a continuous clustering range is guaranteed.
|
||||
return false;
|
||||
}
|
||||
if (!_clustering_columns_restrictions->needs_filtering(*_schema)) { // Guaranteed continuous clustering range.
|
||||
return false;
|
||||
}
|
||||
// Now we know there are some clustering-column restrictions that are out-of-order or not EQ. A naive base-table
|
||||
// query must be filtered. What about an index-table query? That can only avoid filtering if there is exactly one
|
||||
// EQ supported by an index.
|
||||
return !(_clustering_columns_restrictions->size() == 1 && _has_queriable_ck_index);
|
||||
|
||||
// TODO: it is also possible to avoid filtering here if a non-empty CK prefix is specified and token_known, plus
|
||||
// there's exactly one out-of-order-but-index-supported clustering-column restriction.
|
||||
if (_has_queriable_ck_index && _uses_secondary_indexing) {
|
||||
// In cases where we use an index, clustering column restrictions might cause the need for filtering.
|
||||
// TODO: This is overly conservative, there are some cases when this returns true but filtering
|
||||
// is not needed. Because of that the database will sometimes perform filtering when it's not actually needed.
|
||||
// Query performance shouldn't be affected much, at most we will filter rows that are all correct.
|
||||
// Here are some cases to consider:
|
||||
// On a table with primary key (p, c1, c2, c3) with an index on c3
|
||||
// WHERE c3 = ? - doesn't require filtering
|
||||
// WHERE c1 = ? AND c2 = ? AND c3 = ? - requires filtering
|
||||
// WHERE p = ? AND c1 = ? AND c3 = ? - doesn't require filtering, but we conservatively report it does
|
||||
// WHERE p = ? AND c1 LIKE ? AND c3 = ? - requires filtering
|
||||
// WHERE p = ? AND c1 = ? AND c2 LIKE ? AND c3 = ? - requires filtering
|
||||
// WHERE p = ? AND c1 = ? AND c2 = ? AND c3 = ? - doesn't use an index
|
||||
// WHERE p = ? AND c1 = ? AND c2 < ? AND c3 = ? - doesn't require filtering, but we report it does
|
||||
return _clustering_columns_restrictions->size() > 1;
|
||||
}
|
||||
// Now we know that the query doesn't use an index.
|
||||
|
||||
// The only thing that can cause filtering now are the clustering columns.
|
||||
return _clustering_columns_restrictions->needs_filtering(*_schema);
|
||||
}
|
||||
|
||||
void statement_restrictions::validate_secondary_index_selections(bool selects_only_static_columns) {
|
||||
|
||||
@@ -307,6 +307,13 @@ create_index_statement::announce_migration(query_processor& qp) const {
|
||||
format("Index {} is a duplicate of existing index {}", index.name(), existing_index.value().name()));
|
||||
}
|
||||
}
|
||||
auto index_table_name = secondary_index::index_table_name(accepted_name);
|
||||
if (db.has_schema(keyspace(), index_table_name)) {
|
||||
return make_exception_future<::shared_ptr<cql_transport::event::schema_change>>(
|
||||
exceptions::invalid_request_exception(format("Index {} cannot be created, because table {} already exists",
|
||||
accepted_name, index_table_name))
|
||||
);
|
||||
}
|
||||
++_cql_stats->secondary_index_creates;
|
||||
schema_builder builder{schema};
|
||||
builder.with_index(index);
|
||||
|
||||
@@ -461,7 +461,7 @@ generate_base_key_from_index_pk(const partition_key& index_pk, const std::option
|
||||
if (!view_col) {
|
||||
throw std::runtime_error(format("Base key column not found in the view: {}", base_col.name_as_text()));
|
||||
}
|
||||
if (base_col.type != view_col->type) {
|
||||
if (base_col.type->without_reversed() != *view_col->type) {
|
||||
throw std::runtime_error(format("Mismatched types for base and view columns {}: {} and {}",
|
||||
base_col.name_as_text(), base_col.type->cql3_type_name(), view_col->type->cql3_type_name()));
|
||||
}
|
||||
@@ -965,6 +965,7 @@ lw_shared_ptr<const service::pager::paging_state> indexed_table_select_statement
|
||||
}
|
||||
|
||||
auto paging_state_copy = make_lw_shared<service::pager::paging_state>(service::pager::paging_state(*paging_state));
|
||||
paging_state_copy->set_remaining(internal_paging_size);
|
||||
paging_state_copy->set_partition_key(std::move(index_pk));
|
||||
paging_state_copy->set_clustering_key(std::move(index_ck));
|
||||
return std::move(paging_state_copy);
|
||||
|
||||
10
database.cc
10
database.cc
@@ -747,10 +747,8 @@ void database::set_format(sstables::sstable_version_types format) {
|
||||
void database::set_format_by_config() {
|
||||
if (_cfg.enable_sstables_md_format()) {
|
||||
set_format(sstables::sstable_version_types::md);
|
||||
} else if (_cfg.enable_sstables_mc_format()) {
|
||||
set_format(sstables::sstable_version_types::mc);
|
||||
} else {
|
||||
set_format(sstables::sstable_version_types::la);
|
||||
set_format(sstables::sstable_version_types::mc);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1957,7 +1955,11 @@ sstring database::get_available_index_name(const sstring &ks_name, const sstring
|
||||
auto base_name = index_metadata::get_default_index_name(cf_name, index_name_root);
|
||||
sstring accepted_name = base_name;
|
||||
int i = 0;
|
||||
while (existing_names.contains(accepted_name)) {
|
||||
auto name_accepted = [&] {
|
||||
auto index_table_name = secondary_index::index_table_name(accepted_name);
|
||||
return !has_schema(ks_name, index_table_name) && !existing_names.contains(accepted_name);
|
||||
};
|
||||
while (!name_accepted()) {
|
||||
accepted_name = base_name + "_" + std::to_string(++i);
|
||||
}
|
||||
return accepted_name;
|
||||
|
||||
18
database.hh
18
database.hh
@@ -239,9 +239,13 @@ public:
|
||||
return _memtables.back();
|
||||
}
|
||||
|
||||
// The caller has to make sure the element exist before calling this.
|
||||
// # 8904 - this method is akin to std::set::erase(key_type), not
|
||||
// erase(iterator). Should be tolerant against non-existing.
|
||||
void erase(const shared_memtable& element) {
|
||||
_memtables.erase(boost::range::find(_memtables, element));
|
||||
auto i = boost::range::find(_memtables, element);
|
||||
if (i != _memtables.end()) {
|
||||
_memtables.erase(i);
|
||||
}
|
||||
}
|
||||
void clear() {
|
||||
_memtables.clear();
|
||||
@@ -924,7 +928,7 @@ public:
|
||||
return _pending_writes_phaser.start();
|
||||
}
|
||||
|
||||
future<> await_pending_writes() {
|
||||
future<> await_pending_writes() noexcept {
|
||||
return _pending_writes_phaser.advance_and_await();
|
||||
}
|
||||
|
||||
@@ -936,7 +940,7 @@ public:
|
||||
return _pending_reads_phaser.start();
|
||||
}
|
||||
|
||||
future<> await_pending_reads() {
|
||||
future<> await_pending_reads() noexcept {
|
||||
return _pending_reads_phaser.advance_and_await();
|
||||
}
|
||||
|
||||
@@ -948,7 +952,7 @@ public:
|
||||
return _pending_streams_phaser.start();
|
||||
}
|
||||
|
||||
future<> await_pending_streams() {
|
||||
future<> await_pending_streams() noexcept {
|
||||
return _pending_streams_phaser.advance_and_await();
|
||||
}
|
||||
|
||||
@@ -956,11 +960,11 @@ public:
|
||||
return _pending_streams_phaser.operations_in_progress();
|
||||
}
|
||||
|
||||
future<> await_pending_flushes() {
|
||||
future<> await_pending_flushes() noexcept {
|
||||
return _pending_flushes_phaser.advance_and_await();
|
||||
}
|
||||
|
||||
future<> await_pending_ops() {
|
||||
future<> await_pending_ops() noexcept {
|
||||
return when_all(await_pending_reads(), await_pending_writes(), await_pending_streams(), await_pending_flushes()).discard_result();
|
||||
}
|
||||
|
||||
|
||||
@@ -136,6 +136,7 @@ db::commitlog::config db::commitlog::config::from_db_config(const db::config& cf
|
||||
c.extensions = &cfg.extensions();
|
||||
c.reuse_segments = cfg.commitlog_reuse_segments();
|
||||
c.use_o_dsync = cfg.commitlog_use_o_dsync();
|
||||
c.allow_going_over_size_limit = !cfg.commitlog_use_hard_size_limit();
|
||||
|
||||
return c;
|
||||
}
|
||||
@@ -316,6 +317,7 @@ public:
|
||||
uint64_t buffer_list_bytes = 0;
|
||||
// size on disk, actually used - i.e. containing data (allocate+cycle)
|
||||
uint64_t active_size_on_disk = 0;
|
||||
uint64_t wasted_size_on_disk = 0;
|
||||
// size allocated on disk - i.e. files created (new, reserve, recycled)
|
||||
uint64_t total_size_on_disk = 0;
|
||||
uint64_t requests_blocked_memory = 0;
|
||||
@@ -419,7 +421,11 @@ public:
|
||||
void flush_segments(uint64_t size_to_remove);
|
||||
|
||||
private:
|
||||
class shutdown_marker{};
|
||||
|
||||
future<> clear_reserve_segments();
|
||||
void abort_recycled_list(std::exception_ptr);
|
||||
void abort_deletion_promise(std::exception_ptr);
|
||||
|
||||
future<> rename_file(sstring, sstring) const;
|
||||
size_t max_request_controller_units() const;
|
||||
@@ -433,6 +439,7 @@ private:
|
||||
timer<clock_type> _timer;
|
||||
future<> replenish_reserve();
|
||||
future<> _reserve_replenisher;
|
||||
future<> _background_sync;
|
||||
seastar::gate _gate;
|
||||
uint64_t _new_counter = 0;
|
||||
};
|
||||
@@ -541,6 +548,9 @@ class db::commitlog::segment : public enable_shared_from_this<segment>, public c
|
||||
|
||||
void end_flush() {
|
||||
_segment_manager->end_flush();
|
||||
if (can_delete()) {
|
||||
_segment_manager->discard_unused_segments();
|
||||
}
|
||||
}
|
||||
|
||||
public:
|
||||
@@ -584,6 +594,7 @@ public:
|
||||
clogger.debug("Segment {} is no longer active and will submitted for delete now", *this);
|
||||
++_segment_manager->totals.segments_destroyed;
|
||||
_segment_manager->totals.active_size_on_disk -= file_position();
|
||||
_segment_manager->totals.wasted_size_on_disk -= (_size_on_disk - file_position());
|
||||
_segment_manager->add_file_to_delete(_file_name, _desc);
|
||||
} else if (_segment_manager->cfg.warn_about_segments_left_on_disk_after_shutdown) {
|
||||
clogger.warn("Segment {} is dirty and is left on disk.", *this);
|
||||
@@ -695,7 +706,14 @@ public:
|
||||
}
|
||||
future<sseg_ptr> close() {
|
||||
_closed = true;
|
||||
return sync().then([] (sseg_ptr s) { return s->flush(); }).then([] (sseg_ptr s) { return s->terminate(); });
|
||||
return sync().then([] (sseg_ptr s) {
|
||||
return s->flush();
|
||||
}).then([](sseg_ptr s) {
|
||||
return s->terminate();
|
||||
}).then([](sseg_ptr s) {
|
||||
s->_segment_manager->totals.wasted_size_on_disk += (s->_size_on_disk - s->file_position());
|
||||
return s;
|
||||
});
|
||||
}
|
||||
future<sseg_ptr> do_flush(uint64_t pos) {
|
||||
auto me = shared_from_this();
|
||||
@@ -1137,13 +1155,15 @@ db::commitlog::segment_manager::segment_manager(config c)
|
||||
|
||||
return cfg;
|
||||
}())
|
||||
, max_size(std::min<size_t>(std::numeric_limits<position_type>::max(), std::max<size_t>(cfg.commitlog_segment_size_in_mb, 1) * 1024 * 1024))
|
||||
, max_size(std::min<size_t>(std::numeric_limits<position_type>::max() / (1024 * 1024), std::max<size_t>(cfg.commitlog_segment_size_in_mb, 1)) * 1024 * 1024)
|
||||
, max_mutation_size(max_size >> 1)
|
||||
, max_disk_size(size_t(std::ceil(cfg.commitlog_total_space_in_mb / double(smp::count))) * 1024 * 1024)
|
||||
// our threshold for trying to force a flush. needs heristics, for now max - segment_size/2.
|
||||
, disk_usage_threshold(cfg.commitlog_flush_threshold_in_mb.has_value()
|
||||
? size_t(std::ceil(*cfg.commitlog_flush_threshold_in_mb / double(smp::count))) * 1024 * 1024
|
||||
: (max_disk_size - (max_disk_size > (max_size/2) ? (max_size/2) : 0)))
|
||||
: (max_disk_size -
|
||||
(max_disk_size >= (max_size*2) ? max_size
|
||||
: (max_disk_size > (max_size/2) ? (max_size/2) : max_disk_size/3))))
|
||||
, _flush_semaphore(cfg.max_active_flushes)
|
||||
// That is enough concurrency to allow for our largest mutation (max_mutation_size), plus
|
||||
// an existing in-flight buffer. Since we'll force the cycling() of any buffer that is bigger
|
||||
@@ -1153,6 +1173,7 @@ db::commitlog::segment_manager::segment_manager(config c)
|
||||
, _reserve_segments(1)
|
||||
, _recycled_segments(std::numeric_limits<size_t>::max())
|
||||
, _reserve_replenisher(make_ready_future<>())
|
||||
, _background_sync(make_ready_future<>())
|
||||
{
|
||||
assert(max_size > 0);
|
||||
assert(max_mutation_size < segment::multi_entry_size_magic);
|
||||
@@ -1190,6 +1211,12 @@ future<> db::commitlog::segment_manager::replenish_reserve() {
|
||||
return make_ready_future<>();
|
||||
});
|
||||
}).handle_exception([](std::exception_ptr ep) {
|
||||
try {
|
||||
std::rethrow_exception(ep);
|
||||
} catch (shutdown_marker&) {
|
||||
return make_ready_future<>();
|
||||
} catch (...) {
|
||||
}
|
||||
clogger.warn("Exception in segment reservation: {}", ep);
|
||||
return sleep(100ms);
|
||||
});
|
||||
@@ -1334,6 +1361,10 @@ void db::commitlog::segment_manager::create_counters(const sstring& metrics_cate
|
||||
sm::description("Holds a size of disk space in bytes used for data so far. "
|
||||
"A too high value indicates that we have some bottleneck in the writing to sstables path.")),
|
||||
|
||||
sm::make_gauge("disk_slack_end_bytes", totals.wasted_size_on_disk,
|
||||
sm::description("Holds a size of disk space in bytes unused because of segment switching (end slack). "
|
||||
"A too high value indicates that we do not write enough data to each segment.")),
|
||||
|
||||
sm::make_gauge("memory_buffer_bytes", totals.buffer_list_bytes,
|
||||
sm::description("Holds the total number of bytes in internal memory buffers.")),
|
||||
});
|
||||
@@ -1370,7 +1401,8 @@ void db::commitlog::segment_manager::flush_segments(uint64_t size_to_remove) {
|
||||
|
||||
// Now get a set of used CF ids:
|
||||
std::unordered_set<cf_id_type> ids;
|
||||
std::for_each(_segments.begin(), _segments.end() - 1, [&ids](sseg_ptr& s) {
|
||||
auto e = std::find_if(_segments.begin(), _segments.end(), std::mem_fn(&segment::is_still_allocating));
|
||||
std::for_each(_segments.begin(), e, [&ids](sseg_ptr& s) {
|
||||
for (auto& id : s->_cf_dirty | boost::adaptors::map_keys) {
|
||||
ids.insert(id);
|
||||
}
|
||||
@@ -1446,6 +1478,9 @@ future<db::commitlog::segment_manager::sseg_ptr> db::commitlog::segment_manager:
|
||||
m += s;
|
||||
}
|
||||
auto s = co_await f.dma_write(max_size - rem, std::move(v), service::get_local_commitlog_priority());
|
||||
if (!s) [[unlikely]] {
|
||||
on_internal_error(clogger, format("dma_write returned 0: max_size={} rem={} iovec.n={}", max_size, rem, n));
|
||||
}
|
||||
rem -= s;
|
||||
}
|
||||
|
||||
@@ -1466,6 +1501,8 @@ future<db::commitlog::segment_manager::sseg_ptr> db::commitlog::segment_manager:
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
f = make_checked_file(commit_error_handler, std::move(f));
|
||||
} catch (...) {
|
||||
ep = std::current_exception();
|
||||
commit_error_handler(ep);
|
||||
@@ -1511,7 +1548,19 @@ future<db::commitlog::segment_manager::sseg_ptr> db::commitlog::segment_manager:
|
||||
if (!cfg.allow_going_over_size_limit && max_disk_size != 0 && totals.total_size_on_disk >= max_disk_size) {
|
||||
clogger.debug("Disk usage ({} MB) exceeds maximum ({} MB) - allocation will wait...", totals.total_size_on_disk/(1024*1024), max_disk_size/(1024*1024));
|
||||
auto f = cfg.reuse_segments ? _recycled_segments.not_empty() : _disk_deletions.get_shared_future();
|
||||
return f.then([this] {
|
||||
if (!f.available()) {
|
||||
_new_counter = 0; // zero this so timer task does not duplicate the below flush
|
||||
flush_segments(0); // force memtable flush already
|
||||
}
|
||||
return f.handle_exception([this](auto ep) {
|
||||
try {
|
||||
std::rethrow_exception(ep);
|
||||
} catch (shutdown_marker&) {
|
||||
throw;
|
||||
} catch (...) {
|
||||
}
|
||||
clogger.warn("Exception while waiting for segments {}. Will retry allocation...", ep);
|
||||
}).then([this] {
|
||||
return allocate_segment();
|
||||
});
|
||||
}
|
||||
@@ -1533,7 +1582,8 @@ future<db::commitlog::segment_manager::sseg_ptr> db::commitlog::segment_manager:
|
||||
clogger.debug("Increased segment reserve count to {}", _reserve_segments.max_size());
|
||||
}
|
||||
// if we have no reserve and we're above/at limits, make background task a little more eager.
|
||||
if (!_shutdown && totals.total_size_on_disk >= disk_usage_threshold) {
|
||||
auto cur = totals.active_size_on_disk + totals.wasted_size_on_disk;
|
||||
if (!_shutdown && cur >= disk_usage_threshold) {
|
||||
_timer.cancel();
|
||||
_timer.arm(std::chrono::milliseconds(0));
|
||||
}
|
||||
@@ -1670,7 +1720,10 @@ future<> db::commitlog::segment_manager::clear_reserve_segments() {
|
||||
|
||||
future<> db::commitlog::segment_manager::sync_all_segments() {
|
||||
clogger.debug("Issuing sync for all segments");
|
||||
return parallel_for_each(_segments, [] (sseg_ptr s) {
|
||||
// #8952 - calls that do sync/cycle can end up altering
|
||||
// _segments (end_flush()->discard_unused())
|
||||
auto def_copy = _segments;
|
||||
return parallel_for_each(def_copy, [] (sseg_ptr s) {
|
||||
return s->sync().then([](sseg_ptr s) {
|
||||
clogger.debug("Synced segment {}", *s);
|
||||
});
|
||||
@@ -1679,7 +1732,10 @@ future<> db::commitlog::segment_manager::sync_all_segments() {
|
||||
|
||||
future<> db::commitlog::segment_manager::shutdown_all_segments() {
|
||||
clogger.debug("Issuing shutdown for all segments");
|
||||
return parallel_for_each(_segments, [] (sseg_ptr s) {
|
||||
// #8952 - calls that do sync/cycle can end up altering
|
||||
// _segments (end_flush()->discard_unused())
|
||||
auto def_copy = _segments;
|
||||
return parallel_for_each(def_copy, [] (sseg_ptr s) {
|
||||
return s->shutdown().then([](sseg_ptr s) {
|
||||
clogger.debug("Shutdown segment {}", *s);
|
||||
});
|
||||
@@ -1693,13 +1749,36 @@ future<> db::commitlog::segment_manager::shutdown() {
|
||||
// Wait for all pending requests to finish. Need to sync first because segments that are
|
||||
// alive may be holding semaphore permits.
|
||||
auto block_new_requests = get_units(_request_controller, max_request_controller_units());
|
||||
return sync_all_segments().then([this, block_new_requests = std::move(block_new_requests)] () mutable {
|
||||
return sync_all_segments().then_wrapped([this, block_new_requests = std::move(block_new_requests)] (future<> f) mutable {
|
||||
if (f.failed()) {
|
||||
clogger.error("Syncing all segments failed during shutdown: {}. Aborting.", f.get_exception());
|
||||
abort();
|
||||
}
|
||||
return std::move(block_new_requests).then([this] (auto permits) {
|
||||
_timer.cancel(); // no more timer calls
|
||||
_shutdown = true; // no re-arm, no create new segments.
|
||||
|
||||
// do a discard + delete sweep to force
|
||||
// gate holder (i.e. replenish) to wake up
|
||||
discard_unused_segments();
|
||||
auto f = do_pending_deletes().then([this] {
|
||||
auto ep = std::make_exception_ptr(shutdown_marker{});
|
||||
if (_recycled_segments.empty()) {
|
||||
abort_recycled_list(ep);
|
||||
}
|
||||
abort_deletion_promise(ep);
|
||||
return std::exchange(_background_sync, make_ready_future<>());
|
||||
});
|
||||
|
||||
|
||||
// Now first wait for periodic task to finish, then sync and close all
|
||||
// segments, flushing out any remaining data.
|
||||
return _gate.close().then(std::bind(&segment_manager::shutdown_all_segments, this)).finally([permits = std::move(permits)] { });
|
||||
return _gate.close().then([this, f = std::move(f)]() mutable {
|
||||
return std::move(f).then(std::bind(&segment_manager::shutdown_all_segments, this)).handle_exception([](std::exception_ptr ex) {
|
||||
clogger.error("Shutting down all segments failed during shutdown: {}. Aborting.", ex);
|
||||
abort();
|
||||
});
|
||||
}).finally([permits = std::move(permits)] { });
|
||||
});
|
||||
}).finally([this] {
|
||||
discard_unused_segments();
|
||||
@@ -1741,41 +1820,89 @@ future<> db::commitlog::segment_manager::delete_file(const sstring& filename) {
|
||||
}
|
||||
|
||||
future<> db::commitlog::segment_manager::delete_segments(std::vector<sstring> files) {
|
||||
auto i = files.begin();
|
||||
auto e = files.end();
|
||||
if (files.empty()) {
|
||||
co_return;
|
||||
}
|
||||
|
||||
return parallel_for_each(i, e, [this](auto& filename) {
|
||||
auto f = make_ready_future();
|
||||
auto exts = cfg.extensions;
|
||||
if (exts && !exts->commitlog_file_extensions().empty()) {
|
||||
f = parallel_for_each(exts->commitlog_file_extensions(), [&](auto& ext) {
|
||||
return ext->before_delete(filename);
|
||||
});
|
||||
}
|
||||
return f.finally([&] {
|
||||
// We allow reuse of the segment if the current disk size is less than shard max.
|
||||
auto usage = totals.total_size_on_disk;
|
||||
if (!_shutdown && cfg.reuse_segments && usage <= max_disk_size) {
|
||||
descriptor d(next_id(), "Recycled-" + cfg.fname_prefix);
|
||||
auto dst = this->filename(d);
|
||||
clogger.debug("Delete segments {}", files);
|
||||
|
||||
clogger.debug("Recycling segment file {}", filename);
|
||||
// must rename the file since we must ensure the
|
||||
// data is not replayed. Changing the name will
|
||||
// cause header ID to be invalid in the file -> ignored
|
||||
return rename_file(filename, dst).then([this, dst]() mutable {
|
||||
auto b = _recycled_segments.push(std::move(dst));
|
||||
assert(b); // we set this to max_size_t so...
|
||||
return make_ready_future<>();
|
||||
}).handle_exception([this, filename](auto&&) {
|
||||
return delete_file(filename);
|
||||
});
|
||||
std::exception_ptr recycle_error;
|
||||
|
||||
while (!files.empty()) {
|
||||
auto filename = std::move(files.back());
|
||||
files.pop_back();
|
||||
|
||||
try {
|
||||
auto exts = cfg.extensions;
|
||||
if (exts && !exts->commitlog_file_extensions().empty()) {
|
||||
for (auto& ext : exts->commitlog_file_extensions()) {
|
||||
co_await ext->before_delete(filename);
|
||||
}
|
||||
}
|
||||
return delete_file(filename);
|
||||
}).handle_exception([&filename](auto ep) {
|
||||
clogger.error("Could not delete segment {}: {}", filename, ep);
|
||||
});
|
||||
}).finally([files = std::move(files)] {});
|
||||
|
||||
// We allow reuse of the segment if the current disk size is less than shard max.
|
||||
if (cfg.reuse_segments) {
|
||||
auto usage = totals.total_size_on_disk;
|
||||
auto recycle = usage <= max_disk_size;
|
||||
|
||||
// if total size is not a multiple of segment size, we need
|
||||
// to check if we are the overlap segment, and noone else
|
||||
// can be recycled. If so, let this one live so allocation
|
||||
// can proceed. We assume/hope a future delete will kill
|
||||
// files down to under the threshold, but we should expect
|
||||
// to stomp around nearest multiple of segment size, not
|
||||
// the actual limit.
|
||||
if (!recycle && _recycled_segments.empty() && files.empty()) {
|
||||
auto size = co_await seastar::file_size(filename);
|
||||
recycle = (usage - size) <= max_disk_size;
|
||||
}
|
||||
|
||||
if (recycle) {
|
||||
descriptor d(next_id(), "Recycled-" + cfg.fname_prefix);
|
||||
auto dst = this->filename(d);
|
||||
|
||||
clogger.debug("Recycling segment file {}", filename);
|
||||
// must rename the file since we must ensure the
|
||||
// data is not replayed. Changing the name will
|
||||
// cause header ID to be invalid in the file -> ignored
|
||||
try {
|
||||
co_await rename_file(filename, dst);
|
||||
auto b = _recycled_segments.push(std::move(dst));
|
||||
assert(b); // we set this to max_size_t so...
|
||||
continue;
|
||||
} catch (...) {
|
||||
recycle_error = std::current_exception();
|
||||
// fallthrough
|
||||
}
|
||||
}
|
||||
}
|
||||
co_await delete_file(filename);
|
||||
} catch (...) {
|
||||
clogger.error("Could not delete segment {}: {}", filename, std::current_exception());
|
||||
}
|
||||
}
|
||||
|
||||
// #8376 - if we had an error in recycling (disk rename?), and no elements
|
||||
// are available, we could have waiters hoping they will get segements.
|
||||
// abort the queue (wakes up any existing waiters - futures), and let them
|
||||
// retry. Since we did deletions instead, disk footprint should allow
|
||||
// for new allocs at least. Or more likely, everything is broken, but
|
||||
// we will at least make more noise.
|
||||
if (recycle_error && _recycled_segments.empty()) {
|
||||
abort_recycled_list(recycle_error);
|
||||
}
|
||||
}
|
||||
|
||||
void db::commitlog::segment_manager::abort_recycled_list(std::exception_ptr ep) {
|
||||
// may not call here with elements in list. that would leak files.
|
||||
assert(_recycled_segments.empty());
|
||||
_recycled_segments.abort(ep);
|
||||
// and ensure next lap(s) still has a queue
|
||||
_recycled_segments = queue<sstring>(std::numeric_limits<size_t>::max());
|
||||
}
|
||||
|
||||
void db::commitlog::segment_manager::abort_deletion_promise(std::exception_ptr ep) {
|
||||
std::exchange(_disk_deletions, {}).set_exception(ep);
|
||||
}
|
||||
|
||||
future<> db::commitlog::segment_manager::do_pending_deletes() {
|
||||
@@ -1814,9 +1941,15 @@ future<> db::commitlog::segment_manager::clear() {
|
||||
* Called by timer in periodic mode.
|
||||
*/
|
||||
void db::commitlog::segment_manager::sync() {
|
||||
for (auto s : _segments) {
|
||||
(void)s->sync(); // we do not care about waiting...
|
||||
}
|
||||
auto f = std::exchange(_background_sync, make_ready_future<>());
|
||||
// #8952 - calls that do sync/cycle can end up altering
|
||||
// _segments (end_flush()->discard_unused())
|
||||
auto def_copy = _segments;
|
||||
_background_sync = parallel_for_each(def_copy, [](sseg_ptr s) {
|
||||
return s->sync().discard_result();
|
||||
}).then([f = std::move(f)]() mutable {
|
||||
return std::move(f);
|
||||
});
|
||||
}
|
||||
|
||||
void db::commitlog::segment_manager::on_timer() {
|
||||
@@ -1831,10 +1964,11 @@ void db::commitlog::segment_manager::on_timer() {
|
||||
// above threshold, request flush.
|
||||
if (_new_counter > 0) {
|
||||
auto max = disk_usage_threshold;
|
||||
auto cur = totals.active_size_on_disk;
|
||||
auto cur = totals.active_size_on_disk + totals.wasted_size_on_disk;
|
||||
|
||||
if (max != 0 && cur >= max) {
|
||||
_new_counter = 0;
|
||||
clogger.debug("Used size on disk {} MB exceeds local threshold {} MB", cur / (1024 * 1024), max / (1024 * 1024));
|
||||
_new_counter = 0;
|
||||
flush_segments(cur - max);
|
||||
}
|
||||
}
|
||||
@@ -2449,8 +2583,19 @@ std::vector<sstring> db::commitlog::get_active_segment_names() const {
|
||||
return _segment_manager->get_active_names();
|
||||
}
|
||||
|
||||
uint64_t db::commitlog::disk_limit() const {
|
||||
return _segment_manager->max_disk_size;
|
||||
}
|
||||
|
||||
uint64_t db::commitlog::disk_footprint() const {
|
||||
return _segment_manager->totals.total_size_on_disk;
|
||||
}
|
||||
|
||||
uint64_t db::commitlog::get_total_size() const {
|
||||
return _segment_manager->totals.active_size_on_disk + _segment_manager->totals.buffer_list_bytes;
|
||||
return _segment_manager->totals.active_size_on_disk
|
||||
+ _segment_manager->totals.wasted_size_on_disk
|
||||
+ _segment_manager->totals.buffer_list_bytes
|
||||
;
|
||||
}
|
||||
|
||||
uint64_t db::commitlog::get_completed_tasks() const {
|
||||
|
||||
@@ -140,7 +140,7 @@ public:
|
||||
bool reuse_segments = true;
|
||||
bool use_o_dsync = false;
|
||||
bool warn_about_segments_left_on_disk_after_shutdown = true;
|
||||
bool allow_going_over_size_limit = false;
|
||||
bool allow_going_over_size_limit = true;
|
||||
|
||||
const db::extensions * extensions = nullptr;
|
||||
};
|
||||
@@ -336,6 +336,16 @@ public:
|
||||
*/
|
||||
uint64_t max_active_flushes() const;
|
||||
|
||||
/**
|
||||
* Return disk footprint
|
||||
*/
|
||||
uint64_t disk_footprint() const;
|
||||
|
||||
/**
|
||||
* Return configured disk footprint limit
|
||||
*/
|
||||
uint64_t disk_limit() const;
|
||||
|
||||
future<> clear();
|
||||
|
||||
const config& active_config() const;
|
||||
|
||||
23
db/config.cc
23
db/config.cc
@@ -371,6 +371,8 @@ db::config::config(std::shared_ptr<db::extensions> exts)
|
||||
"Whether or not to re-use commitlog segments when finished instead of deleting them. Can improve commitlog latency on some file systems.\n")
|
||||
, commitlog_use_o_dsync(this, "commitlog_use_o_dsync", value_status::Used, true,
|
||||
"Whether or not to use O_DSYNC mode for commitlog segments IO. Can improve commitlog latency on some file systems.\n")
|
||||
, commitlog_use_hard_size_limit(this, "commitlog_use_hard_size_limit", value_status::Used, false,
|
||||
"Whether or not to use a hard size limit for commitlog disk usage. Default is false. Enabling this can cause latency spikes, whereas the default can lead to occasional disk usage peaks.\n")
|
||||
/* Compaction settings */
|
||||
/* Related information: Configuring compaction */
|
||||
, compaction_preheat_key_cache(this, "compaction_preheat_key_cache", value_status::Unused, true,
|
||||
@@ -747,8 +749,8 @@ db::config::config(std::shared_ptr<db::extensions> exts)
|
||||
" Performance is affected to some extent as a result. Useful to help debugging problems that may arise at another layers.")
|
||||
, cpu_scheduler(this, "cpu_scheduler", value_status::Used, true, "Enable cpu scheduling")
|
||||
, view_building(this, "view_building", value_status::Used, true, "Enable view building; should only be set to false when the node is experience issues due to view building")
|
||||
, enable_sstables_mc_format(this, "enable_sstables_mc_format", value_status::Used, true, "Enable SSTables 'mc' format to be used as the default file format")
|
||||
, enable_sstables_md_format(this, "enable_sstables_md_format", value_status::Used, true, "Enable SSTables 'md' format to be used as the default file format (requires enable_sstables_mc_format)")
|
||||
, enable_sstables_mc_format(this, "enable_sstables_mc_format", value_status::Unused, true, "Enable SSTables 'mc' format to be used as the default file format")
|
||||
, enable_sstables_md_format(this, "enable_sstables_md_format", value_status::Used, true, "Enable SSTables 'md' format to be used as the default file format")
|
||||
, enable_dangerous_direct_import_of_cassandra_counters(this, "enable_dangerous_direct_import_of_cassandra_counters", value_status::Used, false, "Only turn this option on if you want to import tables from Cassandra containing counters, and you are SURE that no counters in that table were created in a version earlier than Cassandra 2.1."
|
||||
" It is not enough to have ever since upgraded to newer versions of Cassandra. If you EVER used a version earlier than 2.1 in the cluster where these SSTables come from, DO NOT TURN ON THIS OPTION! You will corrupt your data. You have been warned.")
|
||||
, enable_shard_aware_drivers(this, "enable_shard_aware_drivers", value_status::Used, true, "Enable native transport drivers to use connection-per-shard for better performance")
|
||||
@@ -906,8 +908,11 @@ db::fs::path db::config::get_conf_sub(db::fs::path sub) {
|
||||
}
|
||||
|
||||
bool db::config::check_experimental(experimental_features_t::feature f) const {
|
||||
if (experimental() && f != experimental_features_t::UNUSED && f != experimental_features_t::UNUSED_CDC) {
|
||||
return true;
|
||||
if (experimental()
|
||||
&& f != experimental_features_t::UNUSED
|
||||
&& f != experimental_features_t::UNUSED_CDC
|
||||
&& f != experimental_features_t::RAFT) {
|
||||
return true;
|
||||
}
|
||||
const auto& optval = experimental_features();
|
||||
return find(begin(optval), end(optval), enum_option<experimental_features_t>{f}) != end(optval);
|
||||
@@ -962,11 +967,17 @@ std::unordered_map<sstring, db::experimental_features_t::feature> db::experiment
|
||||
// to UNUSED switch for a while, then remove altogether.
|
||||
// Change Data Capture is no longer experimental. Map it
|
||||
// to UNUSED_CDC switch for a while, then remove altogether.
|
||||
return {{"lwt", UNUSED}, {"udf", UDF}, {"cdc", UNUSED_CDC}, {"alternator-streams", ALTERNATOR_STREAMS}};
|
||||
return {
|
||||
{"lwt", UNUSED},
|
||||
{"udf", UDF},
|
||||
{"cdc", UNUSED_CDC},
|
||||
{"alternator-streams", ALTERNATOR_STREAMS},
|
||||
{"raft", RAFT}
|
||||
};
|
||||
}
|
||||
|
||||
std::vector<enum_option<db::experimental_features_t>> db::experimental_features_t::all() {
|
||||
return {UDF, ALTERNATOR_STREAMS};
|
||||
return {UDF, ALTERNATOR_STREAMS, RAFT};
|
||||
}
|
||||
|
||||
template struct utils::config_file::named_value<seastar::log_level>;
|
||||
|
||||
@@ -82,7 +82,9 @@ namespace db {
|
||||
|
||||
/// Enumeration of all valid values for the `experimental` config entry.
|
||||
struct experimental_features_t {
|
||||
enum feature { UNUSED, UDF, UNUSED_CDC, ALTERNATOR_STREAMS };
|
||||
// NOTE: RAFT feature is not enabled via `experimental` umbrella flag.
|
||||
// This option should be enabled explicitly.
|
||||
enum feature { UNUSED, UDF, UNUSED_CDC, ALTERNATOR_STREAMS, RAFT };
|
||||
static std::unordered_map<sstring, feature> map(); // See enum_option.
|
||||
static std::vector<enum_option<experimental_features_t>> all();
|
||||
};
|
||||
@@ -163,6 +165,7 @@ public:
|
||||
named_value<int64_t> commitlog_total_space_in_mb;
|
||||
named_value<bool> commitlog_reuse_segments;
|
||||
named_value<bool> commitlog_use_o_dsync;
|
||||
named_value<bool> commitlog_use_hard_size_limit;
|
||||
named_value<bool> compaction_preheat_key_cache;
|
||||
named_value<uint32_t> concurrent_compactors;
|
||||
named_value<uint32_t> in_memory_compaction_limit_in_mb;
|
||||
|
||||
@@ -441,30 +441,15 @@ bool manager::end_point_hints_manager::sender::can_send() noexcept {
|
||||
|
||||
try {
|
||||
auto ep_state_ptr = _gossiper. get_endpoint_state_for_endpoint_ptr(end_point_key());
|
||||
if (!ep_state_ptr || !ep_state_ptr->is_alive()) {
|
||||
if (ep_state_ptr && ep_state_ptr->is_alive()) {
|
||||
_state.remove(state::ep_state_left_the_ring);
|
||||
return true;
|
||||
} else {
|
||||
if (!_state.contains(state::ep_state_left_the_ring)) {
|
||||
auto ep_gossip_state_val = _gossiper.get_gossip_status(end_point_key());
|
||||
// If node has been removed from the ring it's going to be removed from the gossiper::endpoint_state
|
||||
// map as well.
|
||||
//
|
||||
// However if it is still in the map then there are 3 possible STATE values for the node when it's in a DN/UN
|
||||
// state:
|
||||
// - NORMAL
|
||||
// - SHUTDOWN
|
||||
// - "" - when node is in a DN state but was DOWN since the local node started up. In this case
|
||||
// gossiper::endpoint_state[node][STATUS] value is going to be not set at all.
|
||||
_state.set_if<state::ep_state_left_the_ring>(
|
||||
!ep_state_ptr ||
|
||||
(ep_gossip_state_val != gms::versioned_value::STATUS_NORMAL &&
|
||||
ep_gossip_state_val != gms::versioned_value::SHUTDOWN &&
|
||||
ep_gossip_state_val != gms::versioned_value::STATUS_UNKNOWN)
|
||||
);
|
||||
_state.set_if<state::ep_state_left_the_ring>(!_shard_manager.local_db().get_token_metadata().is_member(end_point_key()));
|
||||
}
|
||||
// send the hints out if the destination Node is part of the ring - we will send to all new replicas in this case
|
||||
return _state.contains(state::ep_state_left_the_ring);
|
||||
} else {
|
||||
_state.remove(state::ep_state_left_the_ring);
|
||||
return true;
|
||||
}
|
||||
} catch (...) {
|
||||
return false;
|
||||
|
||||
@@ -235,7 +235,7 @@ public:
|
||||
bool send_one_file(const sstring& fname);
|
||||
|
||||
/// \brief Checks if we can still send hints.
|
||||
/// \return TRUE if the destination Node is either ALIVE or has left the NORMAL state (e.g. has been decommissioned).
|
||||
/// \return TRUE if the destination Node is either ALIVE or has left the ring (e.g. after decommission or removenode).
|
||||
bool can_send() noexcept;
|
||||
|
||||
/// \brief Restore a mutation object from the hints file entry.
|
||||
|
||||
@@ -124,7 +124,7 @@ static future<> try_record(std::string_view large_table, const sstables::sstable
|
||||
const auto sstable_name = sst.get_filename();
|
||||
std::string pk_str = key_to_str(partition_key.to_partition_key(s), s);
|
||||
auto timestamp = db_clock::now();
|
||||
large_data_logger.warn("Writing large {} {}/{}: {}{} ({} bytes)", desc, ks_name, cf_name, pk_str, extra_path, size);
|
||||
large_data_logger.warn("Writing large {} {}/{}: {}{} ({} bytes) to {}", desc, ks_name, cf_name, pk_str, extra_path, size, sstable_name);
|
||||
return db::qctx->execute_cql(req, ks_name, cf_name, sstable_name, size, pk_str, timestamp, args...)
|
||||
.discard_result()
|
||||
.handle_exception([ks_name, cf_name, large_table, sstable_name] (std::exception_ptr ep) {
|
||||
@@ -140,9 +140,10 @@ future<> cql_table_large_data_handler::record_large_partitions(const sstables::s
|
||||
void cql_table_large_data_handler::log_too_many_rows(const sstables::sstable& sst, const sstables::key& partition_key,
|
||||
uint64_t rows_count) const {
|
||||
const schema& s = *sst.get_schema();
|
||||
large_data_logger.warn("Writing a partition with too many rows [{}/{}:{}] ({} rows)",
|
||||
const auto sstable_name = sst.get_filename();
|
||||
large_data_logger.warn("Writing a partition with too many rows [{}/{}:{}] ({} rows) to {}",
|
||||
s.ks_name(), s.cf_name(), partition_key.to_partition_key(s).with_schema(s),
|
||||
rows_count);
|
||||
rows_count, sstable_name);
|
||||
}
|
||||
|
||||
future<> cql_table_large_data_handler::record_large_cells(const sstables::sstable& sst, const sstables::key& partition_key,
|
||||
|
||||
@@ -1751,7 +1751,7 @@ future<> set_bootstrap_state(bootstrap_state state) {
|
||||
});
|
||||
}
|
||||
|
||||
std::vector<schema_ptr> all_tables() {
|
||||
std::vector<schema_ptr> all_tables(const db::config& cfg) {
|
||||
std::vector<schema_ptr> r;
|
||||
auto schema_tables = db::schema_tables::all_tables(schema_features::full());
|
||||
std::copy(schema_tables.begin(), schema_tables.end(), std::back_inserter(r));
|
||||
@@ -1760,12 +1760,14 @@ std::vector<schema_ptr> all_tables() {
|
||||
compactions_in_progress(), compaction_history(),
|
||||
sstable_activity(), clients(), size_estimates(), large_partitions(), large_rows(), large_cells(),
|
||||
scylla_local(), db::schema_tables::scylla_table_schema_history(),
|
||||
raft(), raft_snapshots(),
|
||||
v3::views_builds_in_progress(), v3::built_views(),
|
||||
v3::scylla_views_builds_in_progress(),
|
||||
v3::truncated(),
|
||||
v3::cdc_local(),
|
||||
});
|
||||
if (cfg.check_experimental(db::experimental_features_t::RAFT)) {
|
||||
r.insert(r.end(), {raft(), raft_snapshots()});
|
||||
}
|
||||
// legacy schema
|
||||
r.insert(r.end(), {
|
||||
// TODO: once we migrate hints/batchlog and add convertor
|
||||
@@ -1797,7 +1799,7 @@ static bool maybe_write_in_user_memory(schema_ptr s, database& db) {
|
||||
future<> make(database& db) {
|
||||
auto enable_cache = db.get_config().enable_cache();
|
||||
bool durable = db.get_config().data_file_directories().size() > 0;
|
||||
for (auto&& table : all_tables()) {
|
||||
for (auto&& table : all_tables(db.get_config())) {
|
||||
auto ks_name = table->ks_name();
|
||||
if (!db.has_keyspace(ks_name)) {
|
||||
auto ksm = make_lw_shared<keyspace_metadata>(ks_name,
|
||||
|
||||
@@ -82,6 +82,8 @@ namespace db {
|
||||
|
||||
sstring system_keyspace_name();
|
||||
|
||||
class config;
|
||||
|
||||
namespace system_keyspace {
|
||||
|
||||
static constexpr auto NAME = "system";
|
||||
@@ -210,7 +212,7 @@ future<> remove_endpoint(gms::inet_address ep);
|
||||
future<> set_scylla_local_param(const sstring& key, const sstring& value);
|
||||
future<std::optional<sstring>> get_scylla_local_param(const sstring& key);
|
||||
|
||||
std::vector<schema_ptr> all_tables();
|
||||
std::vector<schema_ptr> all_tables(const db::config& cfg);
|
||||
future<> make(database& db);
|
||||
|
||||
/// overloads
|
||||
|
||||
@@ -1162,7 +1162,7 @@ get_view_natural_endpoint(const sstring& keyspace_name,
|
||||
}
|
||||
|
||||
static future<> apply_to_remote_endpoints(gms::inet_address target, std::vector<gms::inet_address>&& pending_endpoints,
|
||||
frozen_mutation_and_schema& mut, const dht::token& base_token, const dht::token& view_token,
|
||||
frozen_mutation_and_schema&& mut, const dht::token& base_token, const dht::token& view_token,
|
||||
service::allow_hints allow_hints, tracing::trace_state_ptr tr_state) {
|
||||
|
||||
tracing::trace(tr_state, "Sending view update for {}.{} to {}, with pending endpoints = {}; base token = {}; view token = {}",
|
||||
@@ -1181,7 +1181,7 @@ static future<> apply_to_remote_endpoints(gms::inet_address target, std::vector<
|
||||
// appropriate paired replicas. This is done asynchronously - we do not wait
|
||||
// for the writes to complete.
|
||||
future<> mutate_MV(
|
||||
const dht::token& base_token,
|
||||
dht::token base_token,
|
||||
std::vector<frozen_mutation_and_schema> view_updates,
|
||||
db::view::stats& stats,
|
||||
cf_stats& cf_stats,
|
||||
@@ -1197,28 +1197,7 @@ future<> mutate_MV(
|
||||
auto& keyspace_name = mut.s->ks_name();
|
||||
auto target_endpoint = get_view_natural_endpoint(keyspace_name, base_token, view_token);
|
||||
auto remote_endpoints = service::get_local_storage_service().get_token_metadata().pending_endpoints_for(view_token, keyspace_name);
|
||||
auto maybe_account_failure = [s = mut.s, tr_state, &stats, &cf_stats, base_token, view_token, units = pending_view_updates.split(mut.fm.representation().size())] (
|
||||
future<>&& f,
|
||||
gms::inet_address target,
|
||||
bool is_local,
|
||||
size_t remotes) {
|
||||
if (f.failed()) {
|
||||
stats.view_updates_failed_local += is_local;
|
||||
stats.view_updates_failed_remote += remotes;
|
||||
cf_stats.total_view_updates_failed_local += is_local;
|
||||
cf_stats.total_view_updates_failed_remote += remotes;
|
||||
auto ep = f.get_exception();
|
||||
tracing::trace(tr_state, "Failed to apply {}view update for {} and {} remote endpoints",
|
||||
seastar::value_of([is_local]{return is_local ? "local " : "";}), target, remotes);
|
||||
vlogger.error("Error applying view update to {} (view: {}.{}, base token: {}, view token: {}): {}",
|
||||
target, s->ks_name(), s->cf_name(), base_token, view_token, ep);
|
||||
return make_exception_future<>(std::move(ep));
|
||||
} else {
|
||||
tracing::trace(tr_state, "Successfully applied {}view update for {} and {} remote endpoints",
|
||||
seastar::value_of([is_local]{return is_local ? "local " : "";}), target, remotes);
|
||||
return make_ready_future<>();
|
||||
}
|
||||
};
|
||||
auto sem_units = pending_view_updates.split(mut.fm.representation().size());
|
||||
|
||||
// First, find the local endpoint and ensure that if it exists,
|
||||
// it will be the target endpoint. That way, all endpoints in the
|
||||
@@ -1255,11 +1234,20 @@ future<> mutate_MV(
|
||||
tracing::trace(tr_state, "Locally applying view update for {}.{}; base token = {}; view token = {}",
|
||||
mut.s->ks_name(), mut.s->cf_name(), base_token, view_token);
|
||||
future<> local_view_update = service::get_local_storage_proxy().mutate_locally(mut.s, *mut_ptr, std::move(tr_state), db::commitlog::force_sync::no).then_wrapped(
|
||||
[&stats,
|
||||
maybe_account_failure = std::move(maybe_account_failure),
|
||||
mut_ptr = std::move(mut_ptr)] (future<>&& f) {
|
||||
[s = mut.s, &stats, &cf_stats, tr_state, base_token, view_token, my_address, mut_ptr = std::move(mut_ptr),
|
||||
units = sem_units.split(sem_units.count())] (future<>&& f) {
|
||||
--stats.writes;
|
||||
return maybe_account_failure(std::move(f), utils::fb_utilities::get_broadcast_address(), true, 0);
|
||||
if (f.failed()) {
|
||||
++stats.view_updates_failed_local;
|
||||
++cf_stats.total_view_updates_failed_local;
|
||||
auto ep = f.get_exception();
|
||||
tracing::trace(tr_state, "Failed to apply local view update for {}", my_address);
|
||||
vlogger.error("Error applying view update to {} (view: {}.{}, base token: {}, view token: {}): {}",
|
||||
my_address, s->ks_name(), s->cf_name(), base_token, view_token, ep);
|
||||
return make_exception_future<>(std::move(ep));
|
||||
}
|
||||
tracing::trace(tr_state, "Successfully applied local view update for {}", my_address);
|
||||
return make_ready_future<>();
|
||||
});
|
||||
fs->push_back(std::move(local_view_update));
|
||||
// We just applied a local update to the target endpoint, so it should now be removed
|
||||
@@ -1281,11 +1269,23 @@ future<> mutate_MV(
|
||||
size_t updates_pushed_remote = remote_endpoints.size() + 1;
|
||||
stats.view_updates_pushed_remote += updates_pushed_remote;
|
||||
cf_stats.total_view_updates_pushed_remote += updates_pushed_remote;
|
||||
future<> view_update = apply_to_remote_endpoints(*target_endpoint, std::move(remote_endpoints), mut, base_token, view_token, allow_hints, tr_state).then_wrapped(
|
||||
[target_endpoint,
|
||||
updates_pushed_remote,
|
||||
maybe_account_failure = std::move(maybe_account_failure)] (future<>&& f) mutable {
|
||||
return maybe_account_failure(std::move(f), std::move(*target_endpoint), false, updates_pushed_remote);
|
||||
schema_ptr s = mut.s;
|
||||
future<> view_update = apply_to_remote_endpoints(*target_endpoint, std::move(remote_endpoints), std::move(mut), base_token, view_token, allow_hints, tr_state).then_wrapped(
|
||||
[s = std::move(s), &stats, &cf_stats, tr_state, base_token, view_token, target_endpoint, updates_pushed_remote,
|
||||
units = sem_units.split(sem_units.count())] (future<>&& f) mutable {
|
||||
if (f.failed()) {
|
||||
stats.view_updates_failed_remote += updates_pushed_remote;
|
||||
cf_stats.total_view_updates_failed_remote += updates_pushed_remote;
|
||||
auto ep = f.get_exception();
|
||||
tracing::trace(tr_state, "Failed to apply view update for {} and {} remote endpoints",
|
||||
*target_endpoint, updates_pushed_remote);
|
||||
vlogger.error("Error applying view update to {} (view: {}.{}, base token: {}, view token: {}): {}",
|
||||
*target_endpoint, s->ks_name(), s->cf_name(), base_token, view_token, ep);
|
||||
return make_exception_future<>(std::move(ep));
|
||||
}
|
||||
tracing::trace(tr_state, "Successfully applied view update for {} and {} remote endpoints",
|
||||
*target_endpoint, updates_pushed_remote);
|
||||
return make_ready_future<>();
|
||||
});
|
||||
if (wait_for_all) {
|
||||
fs->push_back(std::move(view_update));
|
||||
|
||||
@@ -153,7 +153,7 @@ query::clustering_row_ranges calculate_affected_clustering_ranges(
|
||||
struct wait_for_all_updates_tag {};
|
||||
using wait_for_all_updates = bool_class<wait_for_all_updates_tag>;
|
||||
future<> mutate_MV(
|
||||
const dht::token& base_token,
|
||||
dht::token base_token,
|
||||
std::vector<frozen_mutation_and_schema> view_updates,
|
||||
db::view::stats& stats,
|
||||
cf_stats& cf_stats,
|
||||
|
||||
@@ -58,7 +58,8 @@ public:
|
||||
|
||||
template<typename T, typename... Args>
|
||||
void feed_hash(const T& value, Args&&... args) {
|
||||
std::visit([&] (auto& hasher) noexcept -> void {
|
||||
// FIXME uncomment the noexcept marking once clang bug 50994 is fixed or gcc compilation is turned on
|
||||
std::visit([&] (auto& hasher) /* noexcept(noexcept(::feed_hash(hasher, value, args...))) */ -> void {
|
||||
::feed_hash(hasher, value, std::forward<Args>(args)...);
|
||||
}, _impl);
|
||||
};
|
||||
|
||||
1
dist/common/scripts/scylla_coredump_setup
vendored
1
dist/common/scripts/scylla_coredump_setup
vendored
@@ -67,6 +67,7 @@ Description=Save coredump to scylla data directory
|
||||
Conflicts=umount.target
|
||||
Before=scylla-server.service
|
||||
After=local-fs.target
|
||||
DefaultDependencies=no
|
||||
|
||||
[Mount]
|
||||
What=/var/lib/scylla/coredump
|
||||
|
||||
13
dist/common/scripts/scylla_cpuscaling_setup
vendored
13
dist/common/scripts/scylla_cpuscaling_setup
vendored
@@ -22,6 +22,7 @@
|
||||
|
||||
import os
|
||||
import sys
|
||||
import argparse
|
||||
import shlex
|
||||
import distro
|
||||
from scylla_util import *
|
||||
@@ -46,7 +47,12 @@ if __name__ == '__main__':
|
||||
if os.getuid() > 0:
|
||||
print('Requires root permission.')
|
||||
sys.exit(1)
|
||||
if not os.path.exists('/sys/devices/system/cpu/cpufreq/policy0/scaling_governor'):
|
||||
parser = argparse.ArgumentParser(description='CPU scaling setup script for Scylla.')
|
||||
parser.add_argument('--force', dest='force', action='store_true',
|
||||
help='force running setup even CPU scaling unsupported')
|
||||
args = parser.parse_args()
|
||||
|
||||
if not args.force and not os.path.exists('/sys/devices/system/cpu/cpu0/cpufreq/scaling_governor'):
|
||||
print('This computer doesn\'t supported CPU scaling configuration.')
|
||||
sys.exit(0)
|
||||
if not is_debian_variant():
|
||||
@@ -56,6 +62,11 @@ if __name__ == '__main__':
|
||||
if not shutil.which('cpufreq-set'):
|
||||
pkg_install('cpufrequtils')
|
||||
if is_debian_variant():
|
||||
try:
|
||||
ondemand = systemd_unit('ondemand')
|
||||
ondemand.disable()
|
||||
except:
|
||||
pass
|
||||
cfg = sysconfig_parser('/etc/default/cpufrequtils')
|
||||
cfg.set('GOVERNOR', 'performance')
|
||||
cfg.commit()
|
||||
|
||||
50
dist/common/scripts/scylla_io_setup
vendored
50
dist/common/scripts/scylla_io_setup
vendored
@@ -229,6 +229,52 @@ if __name__ == "__main__":
|
||||
disk_properties["read_bandwidth"] = 507338935 * nr_disks
|
||||
disk_properties["write_iops"] = 57100 * nr_disks
|
||||
disk_properties["write_bandwidth"] = 483141731 * nr_disks
|
||||
elif idata.instance_class() in ("c6gd", "m6gd", "r6gd", "x2gd"):
|
||||
if idata.instance_size() == "medium":
|
||||
disk_properties["read_iops"] = 14808
|
||||
disk_properties["read_bandwidth"] = 77869147
|
||||
disk_properties["write_iops"] = 5972
|
||||
disk_properties["write_bandwidth"] = 32820302
|
||||
elif idata.instance_size() == "large":
|
||||
disk_properties["read_iops"] = 29690
|
||||
disk_properties["read_bandwidth"] = 157712240
|
||||
disk_properties["write_iops"] = 12148
|
||||
disk_properties["write_bandwidth"] = 65978069
|
||||
elif idata.instance_size() == "xlarge":
|
||||
disk_properties["read_iops"] = 59688
|
||||
disk_properties["read_bandwidth"] = 318762880
|
||||
disk_properties["write_iops"] = 24449
|
||||
disk_properties["write_bandwidth"] = 133311808
|
||||
elif idata.instance_size() == "2xlarge":
|
||||
disk_properties["read_iops"] = 119353
|
||||
disk_properties["read_bandwidth"] = 634795733
|
||||
disk_properties["write_iops"] = 49069
|
||||
disk_properties["write_bandwidth"] = 266841680
|
||||
elif idata.instance_size() == "4xlarge":
|
||||
disk_properties["read_iops"] = 237196
|
||||
disk_properties["read_bandwidth"] = 1262309504
|
||||
disk_properties["write_iops"] = 98884
|
||||
disk_properties["write_bandwidth"] = 533938080
|
||||
elif idata.instance_size() == "8xlarge":
|
||||
disk_properties["read_iops"] = 442945
|
||||
disk_properties["read_bandwidth"] = 2522688939
|
||||
disk_properties["write_iops"] = 166021
|
||||
disk_properties["write_bandwidth"] = 1063041152
|
||||
elif idata.instance_size() == "12xlarge":
|
||||
disk_properties["read_iops"] = 353691 * nr_disks
|
||||
disk_properties["read_bandwidth"] = 1908192256 * nr_disks
|
||||
disk_properties["write_iops"] = 146732 * nr_disks
|
||||
disk_properties["write_bandwidth"] = 806399360 * nr_disks
|
||||
elif idata.instance_size() == "16xlarge":
|
||||
disk_properties["read_iops"] = 426893 * nr_disks
|
||||
disk_properties["read_bandwidth"] = 2525781589 * nr_disks
|
||||
disk_properties["write_iops"] = 161740 * nr_disks
|
||||
disk_properties["write_bandwidth"] = 1063389952 * nr_disks
|
||||
elif idata.instance_size() == "metal":
|
||||
disk_properties["read_iops"] = 416257 * nr_disks
|
||||
disk_properties["read_bandwidth"] = 2527296683 * nr_disks
|
||||
disk_properties["write_iops"] = 156326 * nr_disks
|
||||
disk_properties["write_bandwidth"] = 1063657088 * nr_disks
|
||||
properties_file = open(etcdir() + "/scylla.d/io_properties.yaml", "w")
|
||||
yaml.dump({ "disks": [ disk_properties ] }, properties_file, default_flow_style=False)
|
||||
ioconf = open(etcdir() + "/scylla.d/io.conf", "w")
|
||||
@@ -254,7 +300,7 @@ if __name__ == "__main__":
|
||||
disk_properties["read_bandwidth"] = 2650 * mbs
|
||||
disk_properties["write_iops"] = 360000
|
||||
disk_properties["write_bandwidth"] = 1400 * mbs
|
||||
elif nr_disks == "16":
|
||||
elif nr_disks == 16:
|
||||
disk_properties["read_iops"] = 1600000
|
||||
disk_properties["read_bandwidth"] = 4521251328
|
||||
#below is google, above is our measured
|
||||
@@ -263,7 +309,7 @@ if __name__ == "__main__":
|
||||
disk_properties["write_bandwidth"] = 2759452672
|
||||
#below is google, above is our measured
|
||||
#disk_properties["write_bandwidth"] = 3120 * mbs
|
||||
elif nr_disks == "24":
|
||||
elif nr_disks == 24:
|
||||
disk_properties["read_iops"] = 2400000
|
||||
disk_properties["read_bandwidth"] = 5921532416
|
||||
#below is google, above is our measured
|
||||
|
||||
14
dist/common/scripts/scylla_prepare
vendored
14
dist/common/scripts/scylla_prepare
vendored
@@ -28,7 +28,6 @@ import distro
|
||||
|
||||
from scylla_util import *
|
||||
from subprocess import run
|
||||
from multiprocessing import cpu_count
|
||||
|
||||
def get_mode_cpuset(nic, mode):
|
||||
mode_cpu_mask = run('/opt/scylladb/scripts/perftune.py --tune net --nic {} --mode {} --get-cpu-mask-quiet'.format(nic, mode), shell=True, check=True, capture_output=True, encoding='utf-8').stdout.strip()
|
||||
@@ -100,16 +99,6 @@ def verify_cpu():
|
||||
print('\nIf this is a virtual machine, please update its CPU feature configuration or upgrade to a newer hypervisor.')
|
||||
sys.exit(1)
|
||||
|
||||
def configure_aio_slots():
|
||||
with open('/proc/sys/fs/aio-max-nr') as f:
|
||||
aio_max_nr = int(f.read())
|
||||
# (10000 + 1024 + 2) * ncpus for scylla,
|
||||
# 65536 for other apps
|
||||
required_aio_slots = cpu_count() * 11026 + 65536
|
||||
if aio_max_nr < required_aio_slots:
|
||||
with open('/proc/sys/fs/aio-max-nr', 'w') as f:
|
||||
f.write(str(required_aio_slots))
|
||||
|
||||
if __name__ == '__main__':
|
||||
verify_cpu()
|
||||
|
||||
@@ -124,8 +113,6 @@ if __name__ == '__main__':
|
||||
os.remove('/etc/scylla/ami_disabled')
|
||||
sys.exit(1)
|
||||
|
||||
configure_aio_slots()
|
||||
|
||||
if mode == 'virtio':
|
||||
tap = cfg.get('TAP')
|
||||
user = cfg.get('USER')
|
||||
@@ -155,4 +142,3 @@ if __name__ == '__main__':
|
||||
print(f'Exception occurred while creating perftune.yaml: {e}')
|
||||
print('To fix the error, please re-run scylla_setup.')
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
14
dist/common/scripts/scylla_raid_setup
vendored
14
dist/common/scripts/scylla_raid_setup
vendored
@@ -115,10 +115,6 @@ if __name__ == '__main__':
|
||||
pkg_install('xfsprogs')
|
||||
if not shutil.which('mdadm'):
|
||||
pkg_install('mdadm')
|
||||
try:
|
||||
md_service = systemd_unit('mdmonitor.service')
|
||||
except SystemdException:
|
||||
md_service = systemd_unit('mdadm.service')
|
||||
|
||||
print('Creating {type} for scylla using {nr_disk} disk(s): {disks}'.format(type='RAID0' if raid else 'XFS volume', nr_disk=len(disks), disks=args.disks))
|
||||
procs=[]
|
||||
@@ -153,17 +149,15 @@ if __name__ == '__main__':
|
||||
os.makedirs(mount_at, exist_ok=True)
|
||||
|
||||
uuid = run(f'blkid -s UUID -o value {fsdev}', shell=True, check=True, capture_output=True, encoding='utf-8').stdout.strip()
|
||||
after = 'local-fs.target'
|
||||
if raid:
|
||||
after += f' {md_service}'
|
||||
unit_data = f'''
|
||||
[Unit]
|
||||
Description=Scylla data directory
|
||||
Before=scylla-server.service
|
||||
After={after}
|
||||
After=local-fs.target
|
||||
DefaultDependencies=no
|
||||
|
||||
[Mount]
|
||||
What=UUID={uuid}
|
||||
What=/dev/disk/by-uuid/{uuid}
|
||||
Where={mount_at}
|
||||
Type=xfs
|
||||
Options=noatime
|
||||
@@ -183,8 +177,6 @@ WantedBy=multi-user.target
|
||||
f.write(f'RequiresMountsFor={mount_at}\n')
|
||||
|
||||
systemd_unit.reload()
|
||||
md_service.enable()
|
||||
md_service.start()
|
||||
mount = systemd_unit(mntunit_bn)
|
||||
mount.start()
|
||||
if args.enable_on_nextboot:
|
||||
|
||||
6
dist/common/scripts/scylla_util.py
vendored
6
dist/common/scripts/scylla_util.py
vendored
@@ -36,6 +36,7 @@ from subprocess import run, DEVNULL
|
||||
import distro
|
||||
from scylla_sysconfdir import SYSCONFDIR
|
||||
|
||||
from multiprocessing import cpu_count
|
||||
|
||||
def scriptsdir_p():
|
||||
p = Path(sys.argv[0]).resolve()
|
||||
@@ -146,6 +147,11 @@ class gcp_instance:
|
||||
if af == socket.AF_INET:
|
||||
addr, port = sa
|
||||
if addr == "169.254.169.254":
|
||||
# Make sure it is not on GKE
|
||||
try:
|
||||
gcp_instance().__instance_metadata("machine-type")
|
||||
except urllib.error.HTTPError:
|
||||
return False
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
2
dist/common/sysctl.d/99-scylla-aio.conf
vendored
Normal file
2
dist/common/sysctl.d/99-scylla-aio.conf
vendored
Normal file
@@ -0,0 +1,2 @@
|
||||
# Raise max AIO events
|
||||
fs.aio-max-nr = 5578536
|
||||
2
dist/common/systemd/scylla-fstrim.timer
vendored
2
dist/common/systemd/scylla-fstrim.timer
vendored
@@ -1,7 +1,5 @@
|
||||
[Unit]
|
||||
Description=Run Scylla fstrim daily
|
||||
After=scylla-server.service
|
||||
BindsTo=scylla-server.service
|
||||
|
||||
[Timer]
|
||||
OnCalendar=Sat *-*-* 00:00:00
|
||||
|
||||
2
dist/debian/control.template
vendored
2
dist/debian/control.template
vendored
@@ -12,6 +12,8 @@ Architecture: any
|
||||
Description: Scylla database main configuration file
|
||||
Scylla is a highly scalable, eventually consistent, distributed,
|
||||
partitioned row DB.
|
||||
.
|
||||
Dedicated to the memory of Alberto José Araújo, a coworker and a friend.
|
||||
Replaces: %{product}-server (<< 1.1)
|
||||
Conflicts: %{product}-server (<< 1.1)
|
||||
|
||||
|
||||
@@ -11,6 +11,7 @@ else
|
||||
sysctl -p/usr/lib/sysctl.d/99-scylla-sched.conf || :
|
||||
sysctl -p/usr/lib/sysctl.d/99-scylla-vm.conf || :
|
||||
sysctl -p/usr/lib/sysctl.d/99-scylla-inotify.conf || :
|
||||
sysctl -p/usr/lib/sysctl.d/99-scylla-aio.conf || :
|
||||
fi
|
||||
|
||||
#DEBHELPER#
|
||||
|
||||
2
dist/debian/debian/scylla-server.postrm
vendored
2
dist/debian/debian/scylla-server.postrm
vendored
@@ -12,8 +12,6 @@ case "$1" in
|
||||
if [ "$1" = "purge" ]; then
|
||||
rm -rf /etc/systemd/system/scylla-server.service.d/
|
||||
fi
|
||||
rm -f /etc/systemd/system/var-lib-systemd-coredump.mount
|
||||
rm -f /etc/systemd/system/var-lib-scylla.mount
|
||||
;;
|
||||
esac
|
||||
|
||||
|
||||
4
dist/docker/redhat/Dockerfile
vendored
4
dist/docker/redhat/Dockerfile
vendored
@@ -5,8 +5,8 @@ MAINTAINER Avi Kivity <avi@cloudius-systems.com>
|
||||
ENV container docker
|
||||
|
||||
# The SCYLLA_REPO_URL argument specifies the URL to the RPM repository this Docker image uses to install Scylla. The default value is the Scylla's unstable RPM repository, which contains the daily build.
|
||||
ARG SCYLLA_REPO_URL=http://downloads.scylladb.com/rpm/unstable/centos/master/latest/scylla.repo
|
||||
ARG VERSION=4.5.dev
|
||||
ARG SCYLLA_REPO_URL=downloads.scylladb.com/unstable/scylla/branch-4.5/rpm/centos/latest/
|
||||
ARG VERSION=4.5.4
|
||||
|
||||
ADD scylla_bashrc /scylla_bashrc
|
||||
|
||||
|
||||
@@ -4,3 +4,4 @@ stdout_logfile=/dev/stdout
|
||||
stdout_logfile_maxbytes=0
|
||||
stderr_logfile=/dev/stderr
|
||||
stderr_logfile_maxbytes=0
|
||||
stopwaitsecs=900
|
||||
|
||||
5
dist/docker/redhat/scyllasetup.py
vendored
5
dist/docker/redhat/scyllasetup.py
vendored
@@ -121,12 +121,13 @@ class ScyllaSetup:
|
||||
if self._apiAddress is not None:
|
||||
args += ["--api-address %s" % self._apiAddress]
|
||||
|
||||
if self._alternatorPort is not None:
|
||||
if self._alternatorAddress is not None:
|
||||
args += ["--alternator-address %s" % self._alternatorAddress]
|
||||
|
||||
if self._alternatorPort is not None:
|
||||
args += ["--alternator-port %s" % self._alternatorPort]
|
||||
|
||||
if self._alternatorHttpsPort is not None:
|
||||
args += ["--alternator-address %s" % self._alternatorAddress]
|
||||
args += ["--alternator-https-port %s" % self._alternatorHttpsPort]
|
||||
|
||||
if self._alternatorWriteIsolation is not None:
|
||||
|
||||
17
dist/redhat/scylla.spec
vendored
17
dist/redhat/scylla.spec
vendored
@@ -7,7 +7,7 @@ Group: Applications/Databases
|
||||
License: AGPLv3
|
||||
URL: http://www.scylladb.com/
|
||||
Source0: %{reloc_pkg}
|
||||
Requires: %{product}-server = %{version} %{product}-conf = %{version} %{product}-kernel-conf = %{version} %{product}-jmx = %{version} %{product}-tools = %{version} %{product}-tools-core = %{version} %{product}-node-exporter = %{version}
|
||||
Requires: %{product}-server = %{version} %{product}-conf = %{version} %{product}-python3 = %{version} %{product}-kernel-conf = %{version} %{product}-jmx = %{version} %{product}-tools = %{version} %{product}-tools-core = %{version} %{product}-node-exporter = %{version}
|
||||
Obsoletes: scylla-server < 1.1
|
||||
|
||||
%global _debugsource_template %{nil}
|
||||
@@ -30,6 +30,8 @@ partitioned row DB.
|
||||
This package installs all required packages for ScyllaDB, including
|
||||
%{product}-server, %{product}-jmx, %{product}-tools, %{product}-tools-core %{product}-node-exporter.
|
||||
|
||||
Dedicated to the memory of Alberto José Araújo, a coworker and a friend.
|
||||
|
||||
# this is needed to prevent python compilation error on CentOS (#2235)
|
||||
%if 0%{?rhel}
|
||||
%global __os_install_post \
|
||||
@@ -54,7 +56,7 @@ Group: Applications/Databases
|
||||
Summary: The Scylla database server
|
||||
License: AGPLv3
|
||||
URL: http://www.scylladb.com/
|
||||
Requires: %{product}-conf %{product}-python3
|
||||
Requires: %{product}-conf = %{version} %{product}-python3 = %{version}
|
||||
Conflicts: abrt
|
||||
AutoReqProv: no
|
||||
|
||||
@@ -141,9 +143,9 @@ rm -rf $RPM_BUILD_ROOT
|
||||
%ghost /etc/systemd/system/scylla-server.service.d/capabilities.conf
|
||||
%ghost /etc/systemd/system/scylla-server.service.d/mounts.conf
|
||||
/etc/systemd/system/scylla-server.service.d/dependencies.conf
|
||||
%ghost /etc/systemd/system/var-lib-systemd-coredump.mount
|
||||
%ghost %config /etc/systemd/system/var-lib-systemd-coredump.mount
|
||||
%ghost /etc/systemd/system/scylla-cpupower.service
|
||||
%ghost /etc/systemd/system/var-lib-scylla.mount
|
||||
%ghost %config /etc/systemd/system/var-lib-scylla.mount
|
||||
|
||||
%package conf
|
||||
Group: Applications/Databases
|
||||
@@ -211,6 +213,7 @@ if Scylla is the main application on your server and you wish to optimize its la
|
||||
/usr/lib/systemd/systemd-sysctl 99-scylla-sched.conf >/dev/null 2>&1 || :
|
||||
/usr/lib/systemd/systemd-sysctl 99-scylla-vm.conf >/dev/null 2>&1 || :
|
||||
/usr/lib/systemd/systemd-sysctl 99-scylla-inotify.conf >/dev/null 2>&1 || :
|
||||
/usr/lib/systemd/systemd-sysctl 99-scylla-aio.conf >/dev/null 2>&1 || :
|
||||
|
||||
%files kernel-conf
|
||||
%defattr(-,root,root)
|
||||
@@ -228,13 +231,13 @@ Prometheus exporter for machine metrics, written in Go with pluggable metric col
|
||||
|
||||
%post node-exporter
|
||||
if [ $1 -eq 1 ] ; then
|
||||
/usr/bin/systemctl preset node-exporter.service ||:
|
||||
/usr/bin/systemctl preset scylla-node-exporter.service ||:
|
||||
fi
|
||||
|
||||
%preun node-exporter
|
||||
if [ $1 -eq 0 ] ; then
|
||||
/usr/bin/systemctl --no-reload disable node-exporter.service ||:
|
||||
/usr/bin/systemctl stop node-exporter.service ||:
|
||||
/usr/bin/systemctl --no-reload disable scylla-node-exporter.service ||:
|
||||
/usr/bin/systemctl stop scylla-node-exporter.service ||:
|
||||
fi
|
||||
|
||||
%postun node-exporter
|
||||
|
||||
@@ -478,7 +478,7 @@ distributed_loader::get_sstables_from_upload_dir(distributed<database>& db, sstr
|
||||
sstables_on_shards[this_shard_id()] = d.get_unsorted_sstables();
|
||||
}).get();
|
||||
|
||||
return std::make_tuple(table_id, sstables_on_shards);
|
||||
return std::make_tuple(table_id, std::move(sstables_on_shards));
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
@@ -98,14 +98,6 @@ feature_config feature_config_from_db_config(db::config& cfg, std::set<sstring>
|
||||
|
||||
fcfg._disabled_features = std::move(disabled);
|
||||
|
||||
if (!cfg.enable_sstables_mc_format()) {
|
||||
if (cfg.enable_sstables_md_format()) {
|
||||
throw std::runtime_error(
|
||||
"You must use both enable_sstables_mc_format and enable_sstables_md_format "
|
||||
"to enable SSTables md format support");
|
||||
}
|
||||
fcfg._disabled_features.insert(sstring(gms::features::MC_SSTABLE));
|
||||
}
|
||||
if (!cfg.enable_sstables_md_format()) {
|
||||
fcfg._disabled_features.insert(sstring(gms::features::MD_SSTABLE));
|
||||
}
|
||||
|
||||
@@ -1448,7 +1448,7 @@ void gossiper::real_mark_alive(inet_address addr, endpoint_state& local_state) {
|
||||
logger.trace("marking as alive {}", addr);
|
||||
|
||||
// Do not mark a node with status shutdown as UP.
|
||||
auto status = get_gossip_status(local_state);
|
||||
auto status = sstring(get_gossip_status(local_state));
|
||||
if (status == sstring(versioned_value::SHUTDOWN)) {
|
||||
logger.warn("Skip marking node {} with status = {} as UP", addr, status);
|
||||
return;
|
||||
@@ -1467,6 +1467,8 @@ void gossiper::real_mark_alive(inet_address addr, endpoint_state& local_state) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Make a copy for endpoint_state because the code below can yield
|
||||
endpoint_state state = local_state;
|
||||
_live_endpoints.push_back(addr);
|
||||
if (_endpoints_to_talk_with.empty()) {
|
||||
_endpoints_to_talk_with.push_back({addr});
|
||||
@@ -1478,8 +1480,8 @@ void gossiper::real_mark_alive(inet_address addr, endpoint_state& local_state) {
|
||||
logger.info("InetAddress {} is now UP, status = {}", addr, status);
|
||||
}
|
||||
|
||||
_subscribers.for_each([addr, local_state] (shared_ptr<i_endpoint_state_change_subscriber> subscriber) {
|
||||
subscriber->on_alive(addr, local_state);
|
||||
_subscribers.for_each([addr, state] (shared_ptr<i_endpoint_state_change_subscriber> subscriber) {
|
||||
subscriber->on_alive(addr, state);
|
||||
logger.trace("Notified {}", fmt::ptr(subscriber.get()));
|
||||
});
|
||||
}
|
||||
@@ -1488,11 +1490,12 @@ void gossiper::real_mark_alive(inet_address addr, endpoint_state& local_state) {
|
||||
void gossiper::mark_dead(inet_address addr, endpoint_state& local_state) {
|
||||
logger.trace("marking as down {}", addr);
|
||||
local_state.mark_dead();
|
||||
endpoint_state state = local_state;
|
||||
_live_endpoints.resize(std::distance(_live_endpoints.begin(), std::remove(_live_endpoints.begin(), _live_endpoints.end(), addr)));
|
||||
_unreachable_endpoints[addr] = now();
|
||||
logger.info("InetAddress {} is now DOWN, status = {}", addr, get_gossip_status(local_state));
|
||||
_subscribers.for_each([addr, local_state] (shared_ptr<i_endpoint_state_change_subscriber> subscriber) {
|
||||
subscriber->on_dead(addr, local_state);
|
||||
logger.info("InetAddress {} is now DOWN, status = {}", addr, get_gossip_status(state));
|
||||
_subscribers.for_each([addr, state] (shared_ptr<i_endpoint_state_change_subscriber> subscriber) {
|
||||
subscriber->on_dead(addr, state);
|
||||
logger.trace("Notified {}", fmt::ptr(subscriber.get()));
|
||||
});
|
||||
}
|
||||
@@ -2131,6 +2134,32 @@ bool gossiper::is_alive(inet_address ep) const {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Runs inside seastar::async context
|
||||
void gossiper::wait_alive(std::vector<gms::inet_address> nodes, std::chrono::milliseconds timeout) {
|
||||
auto start_time = std::chrono::steady_clock::now();
|
||||
for (;;) {
|
||||
std::vector<gms::inet_address> live_nodes;
|
||||
for (const auto& node: nodes) {
|
||||
size_t nr_alive = container().map_reduce0([node] (gossiper& g) -> size_t {
|
||||
return g.is_alive(node) ? 1 : 0;
|
||||
}, 0, std::plus<size_t>()).get0();
|
||||
logger.debug("Marked node={} as alive on {} out of {} shards", node, nr_alive, smp::count);
|
||||
if (nr_alive == smp::count) {
|
||||
live_nodes.push_back(node);
|
||||
}
|
||||
}
|
||||
logger.debug("Waited for marking node as up, replace_nodes={}, live_nodes={}", nodes, live_nodes);
|
||||
if (live_nodes.size() == nodes.size()) {
|
||||
break;
|
||||
}
|
||||
if (std::chrono::steady_clock::now() > timeout + start_time) {
|
||||
throw std::runtime_error(format("Failed to mark node as alive in {} ms, nodes={}, live_nodes={}",
|
||||
timeout.count(), nodes, live_nodes));
|
||||
}
|
||||
sleep_abortable(std::chrono::milliseconds(100), _abort_source).get();
|
||||
}
|
||||
}
|
||||
|
||||
const versioned_value* gossiper::get_application_state_ptr(inet_address endpoint, application_state appstate) const noexcept {
|
||||
auto* eps = get_endpoint_state_for_endpoint_ptr(std::move(endpoint));
|
||||
if (!eps) {
|
||||
|
||||
@@ -442,6 +442,8 @@ private:
|
||||
public:
|
||||
bool is_alive(inet_address ep) const;
|
||||
bool is_dead_state(const endpoint_state& eps) const;
|
||||
// Wait for nodes to be alive on all shards
|
||||
void wait_alive(std::vector<gms::inet_address> nodes, std::chrono::milliseconds timeout);
|
||||
|
||||
future<> apply_state_locally(std::map<inet_address, endpoint_state> map);
|
||||
|
||||
|
||||
@@ -62,7 +62,7 @@ struct appending_hash;
|
||||
template<typename H, typename T, typename... Args>
|
||||
requires Hasher<H>
|
||||
inline
|
||||
void feed_hash(H& h, const T& value, Args&&... args) noexcept {
|
||||
void feed_hash(H& h, const T& value, Args&&... args) noexcept(noexcept(std::declval<appending_hash<T>>()(h, value, args...))) {
|
||||
appending_hash<T>()(h, value, std::forward<Args>(args)...);
|
||||
};
|
||||
|
||||
|
||||
20
install.sh
20
install.sh
@@ -150,6 +150,10 @@ EOF
|
||||
chmod +x "$install"
|
||||
}
|
||||
|
||||
install() {
|
||||
command install -Z "$@"
|
||||
}
|
||||
|
||||
installconfig() {
|
||||
local perm="$1"
|
||||
local src="$2"
|
||||
@@ -210,13 +214,13 @@ if [ -z "$python3" ]; then
|
||||
fi
|
||||
rpython3=$(realpath -m "$root/$python3")
|
||||
if ! $nonroot; then
|
||||
retc="$root/etc"
|
||||
rsysconfdir="$root/$sysconfdir"
|
||||
rusr="$root/usr"
|
||||
rsystemd="$rusr/lib/systemd/system"
|
||||
retc=$(realpath -m "$root/etc")
|
||||
rsysconfdir=$(realpath -m "$root/$sysconfdir")
|
||||
rusr=$(realpath -m "$root/usr")
|
||||
rsystemd=$(realpath -m "$rusr/lib/systemd/system")
|
||||
rdoc="$rprefix/share/doc"
|
||||
rdata="$root/var/lib/scylla"
|
||||
rhkdata="$root/var/lib/scylla-housekeeping"
|
||||
rdata=$(realpath -m "$root/var/lib/scylla")
|
||||
rhkdata=$(realpath -m "$root/var/lib/scylla-housekeeping")
|
||||
else
|
||||
retc="$rprefix/etc"
|
||||
rsysconfdir="$rprefix/$sysconfdir"
|
||||
@@ -245,6 +249,7 @@ if ! $nonroot; then
|
||||
done
|
||||
fi
|
||||
# scylla-node-exporter
|
||||
install -d -m755 "$rsysconfdir" "$rsystemd"
|
||||
install -d -m755 "$rprefix"/node_exporter
|
||||
install -d -m755 "$rprefix"/node_exporter/licenses
|
||||
install -m755 node_exporter/node_exporter "$rprefix"/node_exporter
|
||||
@@ -278,7 +283,6 @@ fi
|
||||
|
||||
# scylla-server
|
||||
install -m755 -d "$rprefix"
|
||||
install -m755 -d "$rsysconfdir"
|
||||
install -m755 -d "$retc/scylla.d"
|
||||
installconfig 644 dist/common/sysconfig/scylla-housekeeping "$rsysconfdir"
|
||||
installconfig 644 dist/common/sysconfig/scylla-server "$rsysconfdir"
|
||||
@@ -286,7 +290,7 @@ for file in dist/common/scylla.d/*.conf; do
|
||||
installconfig 644 "$file" "$retc"/scylla.d
|
||||
done
|
||||
|
||||
install -d -m755 "$retc"/scylla "$rsystemd" "$rprefix/bin" "$rprefix/libexec" "$rprefix/libreloc" "$rprefix/scripts" "$rprefix/bin"
|
||||
install -d -m755 "$retc"/scylla "$rprefix/bin" "$rprefix/libexec" "$rprefix/libreloc" "$rprefix/scripts" "$rprefix/bin"
|
||||
install -m644 dist/common/systemd/scylla-fstrim.service -Dt "$rsystemd"
|
||||
install -m644 dist/common/systemd/scylla-housekeeping-daily.service -Dt "$rsystemd"
|
||||
install -m644 dist/common/systemd/scylla-housekeeping-restart.service -Dt "$rsystemd"
|
||||
|
||||
64
main.cc
64
main.cc
@@ -591,6 +591,22 @@ int main(int ac, char** av) {
|
||||
};
|
||||
auto background_reclaim_scheduling_group = make_sched_group("background_reclaim", 50);
|
||||
auto maintenance_scheduling_group = make_sched_group("streaming", 200);
|
||||
|
||||
smp::invoke_on_all([&cfg, background_reclaim_scheduling_group] {
|
||||
logalloc::tracker::config st_cfg;
|
||||
st_cfg.defragment_on_idle = cfg->defragment_memory_on_idle();
|
||||
st_cfg.abort_on_lsa_bad_alloc = cfg->abort_on_lsa_bad_alloc();
|
||||
st_cfg.lsa_reclamation_step = cfg->lsa_reclamation_step();
|
||||
st_cfg.background_reclaim_sched_group = background_reclaim_scheduling_group;
|
||||
logalloc::shard_tracker().configure(st_cfg);
|
||||
}).get();
|
||||
|
||||
auto stop_lsa_background_reclaim = defer([&] {
|
||||
smp::invoke_on_all([&] {
|
||||
return logalloc::shard_tracker().stop();
|
||||
}).get();
|
||||
});
|
||||
|
||||
uint16_t api_port = cfg->api_port();
|
||||
ctx.api_dir = cfg->api_ui_dir();
|
||||
ctx.api_doc = cfg->api_doc_dir();
|
||||
@@ -716,7 +732,7 @@ int main(int ac, char** av) {
|
||||
tracing::backend_registry tracing_backend_registry;
|
||||
tracing::register_tracing_keyspace_backend(tracing_backend_registry);
|
||||
tracing::tracing::create_tracing(tracing_backend_registry, "trace_keyspace_helper").get();
|
||||
auto stop_tracing = defer_verbose_shutdown("tracing", [] {
|
||||
auto destroy_tracing = defer_verbose_shutdown("tracing instance", [] {
|
||||
tracing::tracing::tracing_instance().stop().get();
|
||||
});
|
||||
supervisor::notify("creating snitch");
|
||||
@@ -777,13 +793,6 @@ int main(int ac, char** av) {
|
||||
mscfg.encrypt = netw::messaging_service::encrypt_what::rack;
|
||||
}
|
||||
|
||||
if (clauth && (mscfg.encrypt == netw::messaging_service::encrypt_what::dc || mscfg.encrypt == netw::messaging_service::encrypt_what::dc)) {
|
||||
startlog.warn("Setting require_client_auth is incompatible with 'rack' and 'dc' internode_encryption values."
|
||||
" To ensure that mutual TLS authentication is enforced, please set internode_encryption to 'all'. Continuing with"
|
||||
" potentially insecure configuration."
|
||||
);
|
||||
}
|
||||
|
||||
sstring compress_what = cfg->internode_compression();
|
||||
if (compress_what == "all") {
|
||||
mscfg.compress = netw::messaging_service::compress_what::all;
|
||||
@@ -1046,12 +1055,20 @@ int main(int ac, char** av) {
|
||||
auto stop_proxy_handlers = defer_verbose_shutdown("storage proxy RPC verbs", [&proxy] {
|
||||
proxy.invoke_on_all(&service::storage_proxy::uninit_messaging_service).get();
|
||||
});
|
||||
supervisor::notify("initializing Raft services");
|
||||
raft_srvs.start(std::ref(messaging), std::ref(gossiper), std::ref(qp)).get();
|
||||
raft_srvs.invoke_on_all(&raft_services::init).get();
|
||||
|
||||
const bool raft_enabled = cfg->check_experimental(db::experimental_features_t::RAFT);
|
||||
if (raft_enabled) {
|
||||
supervisor::notify("initializing Raft services");
|
||||
raft_srvs.start(std::ref(messaging), std::ref(gossiper), std::ref(qp)).get();
|
||||
raft_srvs.invoke_on_all(&raft_services::init).get();
|
||||
}
|
||||
auto stop_raft_sc_handlers = defer_verbose_shutdown("Raft services", [&raft_srvs] {
|
||||
raft_srvs.invoke_on_all(&raft_services::uninit).get();
|
||||
});
|
||||
if (!raft_enabled) {
|
||||
stop_raft_sc_handlers->cancel();
|
||||
}
|
||||
|
||||
supervisor::notify("starting streaming service");
|
||||
streaming::stream_session::init_streaming_service(db, sys_dist_ks, view_update_generator, messaging).get();
|
||||
auto stop_streaming_service = defer_verbose_shutdown("streaming service", [] {
|
||||
@@ -1172,13 +1189,9 @@ int main(int ac, char** av) {
|
||||
|
||||
supervisor::notify("starting tracing");
|
||||
tracing::tracing::start_tracing(qp).get();
|
||||
/*
|
||||
* FIXME -- tracing is stopped inside drain_on_shutdown, which
|
||||
* is deferred later on. If the start aborts before it, the
|
||||
* tracing will remain started and will continue referencing
|
||||
* the query processor. Nowadays the latter is not stopped
|
||||
* either, but when it will, this place shold be fixed too.
|
||||
*/
|
||||
auto stop_tracing = defer_verbose_shutdown("tracing", [] {
|
||||
tracing::tracing::stop_tracing().get();
|
||||
});
|
||||
|
||||
startlog.info("SSTable data integrity checker is {}.",
|
||||
cfg->enable_sstable_data_integrity_check() ? "enabled" : "disabled");
|
||||
@@ -1409,21 +1422,6 @@ int main(int ac, char** av) {
|
||||
}).get();
|
||||
}
|
||||
|
||||
smp::invoke_on_all([&cfg, background_reclaim_scheduling_group] {
|
||||
logalloc::tracker::config st_cfg;
|
||||
st_cfg.defragment_on_idle = cfg->defragment_memory_on_idle();
|
||||
st_cfg.abort_on_lsa_bad_alloc = cfg->abort_on_lsa_bad_alloc();
|
||||
st_cfg.lsa_reclamation_step = cfg->lsa_reclamation_step();
|
||||
st_cfg.background_reclaim_sched_group = background_reclaim_scheduling_group;
|
||||
logalloc::shard_tracker().configure(st_cfg);
|
||||
}).get();
|
||||
|
||||
auto stop_lsa_background_reclaim = defer([&] {
|
||||
smp::invoke_on_all([&] {
|
||||
return logalloc::shard_tracker().stop();
|
||||
}).get();
|
||||
});
|
||||
|
||||
seastar::set_abort_on_ebadf(cfg->abort_on_ebadf());
|
||||
api::set_server_done(ctx).get();
|
||||
supervisor::notify("serving");
|
||||
|
||||
@@ -357,29 +357,9 @@ void messaging_service::do_start_listen() {
|
||||
cfg.sched_group = scheduling_group_for_isolation_cookie(isolation_cookie);
|
||||
return cfg;
|
||||
};
|
||||
if (!_server[0] && _cfg.encrypt != encrypt_what::all) {
|
||||
if (!_server[0]) {
|
||||
auto listen = [&] (const gms::inet_address& a, rpc::streaming_domain_type sdomain) {
|
||||
so.streaming_domain = sdomain;
|
||||
so.filter_connection = {};
|
||||
switch (_cfg.encrypt) {
|
||||
default:
|
||||
case encrypt_what::none:
|
||||
break;
|
||||
case encrypt_what::dc:
|
||||
so.filter_connection = [](const seastar::socket_address& addr) {
|
||||
auto& snitch = locator::i_endpoint_snitch::get_local_snitch_ptr();
|
||||
return snitch->get_datacenter(addr) == snitch->get_datacenter(utils::fb_utilities::get_broadcast_address());
|
||||
};
|
||||
break;
|
||||
case encrypt_what::rack:
|
||||
so.filter_connection = [](const seastar::socket_address& addr) {
|
||||
auto& snitch = locator::i_endpoint_snitch::get_local_snitch_ptr();
|
||||
return snitch->get_datacenter(addr) == snitch->get_datacenter(utils::fb_utilities::get_broadcast_address())
|
||||
&& snitch->get_rack(addr) == snitch->get_rack(utils::fb_utilities::get_broadcast_address())
|
||||
;
|
||||
};
|
||||
break;
|
||||
}
|
||||
auto addr = socket_address{a, _cfg.port};
|
||||
return std::unique_ptr<rpc_protocol_server_wrapper>(new rpc_protocol_server_wrapper(_rpc->protocol(),
|
||||
so, addr, limits));
|
||||
@@ -389,10 +369,9 @@ void messaging_service::do_start_listen() {
|
||||
_server[1] = listen(utils::fb_utilities::get_broadcast_address(), rpc::streaming_domain_type(0x66BB));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
if (!_server_tls[0]) {
|
||||
auto listen = [&] (const gms::inet_address& a, rpc::streaming_domain_type sdomain) {
|
||||
so.filter_connection = {};
|
||||
so.streaming_domain = sdomain;
|
||||
return std::unique_ptr<rpc_protocol_server_wrapper>(
|
||||
[this, &so, &a, limits] () -> std::unique_ptr<rpc_protocol_server_wrapper>{
|
||||
@@ -713,7 +692,7 @@ shared_ptr<messaging_service::rpc_protocol_client_wrapper> messaging_service::ge
|
||||
remove_error_rpc_client(verb, id);
|
||||
}
|
||||
|
||||
auto must_encrypt = [&id, &verb, this] {
|
||||
auto must_encrypt = [&id, this] {
|
||||
if (_cfg.encrypt == encrypt_what::none) {
|
||||
return false;
|
||||
}
|
||||
@@ -721,23 +700,14 @@ shared_ptr<messaging_service::rpc_protocol_client_wrapper> messaging_service::ge
|
||||
return true;
|
||||
}
|
||||
|
||||
// if we have dc/rack encryption but this is gossip, we should
|
||||
// use tls anyway, to avoid having mismatched ideas on which
|
||||
// group we/client are in.
|
||||
if (verb >= messaging_verb::GOSSIP_DIGEST_SYN && verb <= messaging_verb::GOSSIP_SHUTDOWN) {
|
||||
return true;
|
||||
}
|
||||
|
||||
auto& snitch_ptr = locator::i_endpoint_snitch::get_local_snitch_ptr();
|
||||
|
||||
// either rack/dc need to be in same dc to use non-tls
|
||||
if (snitch_ptr->get_datacenter(id.addr) != snitch_ptr->get_datacenter(utils::fb_utilities::get_broadcast_address())) {
|
||||
return true;
|
||||
if (_cfg.encrypt == encrypt_what::dc) {
|
||||
return snitch_ptr->get_datacenter(id.addr)
|
||||
!= snitch_ptr->get_datacenter(utils::fb_utilities::get_broadcast_address());
|
||||
}
|
||||
// if cross-rack tls, check rack.
|
||||
return _cfg.encrypt == encrypt_what::rack &&
|
||||
snitch_ptr->get_rack(id.addr) != snitch_ptr->get_rack(utils::fb_utilities::get_broadcast_address())
|
||||
;
|
||||
return snitch_ptr->get_rack(id.addr)
|
||||
!= snitch_ptr->get_rack(utils::fb_utilities::get_broadcast_address());
|
||||
}();
|
||||
|
||||
auto must_compress = [&id, this] {
|
||||
@@ -781,12 +751,11 @@ shared_ptr<messaging_service::rpc_protocol_client_wrapper> messaging_service::ge
|
||||
opts.isolation_cookie = _scheduling_info_for_connection_index[idx].isolation_cookie;
|
||||
}
|
||||
|
||||
auto baddr = socket_address(utils::fb_utilities::get_broadcast_address(), 0);
|
||||
auto client = must_encrypt ?
|
||||
::make_shared<rpc_protocol_client_wrapper>(_rpc->protocol(), std::move(opts),
|
||||
remote_addr, baddr, _credentials) :
|
||||
remote_addr, socket_address(), _credentials) :
|
||||
::make_shared<rpc_protocol_client_wrapper>(_rpc->protocol(), std::move(opts),
|
||||
remote_addr, baddr);
|
||||
remote_addr);
|
||||
|
||||
auto res = _clients[idx].emplace(id, shard_info(std::move(client)));
|
||||
assert(res.second);
|
||||
|
||||
@@ -188,7 +188,7 @@ stop_iteration consume_clustering_fragments(const schema& s, mutation_partition&
|
||||
emit_rt = rts_it != rts_end;
|
||||
}
|
||||
if (emit_rt) {
|
||||
stop = consumer.consume(std::move(*rts_it));
|
||||
stop = consumer.consume(range_tombstone(std::move(*rts_it), range_tombstone::without_link{}));
|
||||
++rts_it;
|
||||
} else {
|
||||
stop = consumer.consume(clustering_row(std::move(*crs_it)));
|
||||
|
||||
@@ -1150,6 +1150,9 @@ flat_mutation_reader evictable_reader::recreate_reader() {
|
||||
_range_override.reset();
|
||||
_slice_override.reset();
|
||||
|
||||
_drop_partition_start = false;
|
||||
_drop_static_row = false;
|
||||
|
||||
if (_last_pkey) {
|
||||
bool partition_range_is_inclusive = true;
|
||||
|
||||
@@ -1235,13 +1238,25 @@ void evictable_reader::maybe_validate_partition_start(const flat_mutation_reader
|
||||
// is in range.
|
||||
if (_last_pkey) {
|
||||
const auto cmp_res = tri_cmp(*_last_pkey, ps.key());
|
||||
if (_drop_partition_start) { // should be the same partition
|
||||
if (_drop_partition_start) { // we expect to continue from the same partition
|
||||
// We cannot assume the partition we stopped the read at is still alive
|
||||
// when we recreate the reader. It might have been compacted away in the
|
||||
// meanwhile, so allow for a larger partition too.
|
||||
require(
|
||||
cmp_res == 0,
|
||||
"{}(): validation failed, expected partition with key equal to _last_pkey {} due to _drop_partition_start being set, but got {}",
|
||||
cmp_res <= 0,
|
||||
"{}(): validation failed, expected partition with key larger or equal to _last_pkey {} due to _drop_partition_start being set, but got {}",
|
||||
__FUNCTION__,
|
||||
*_last_pkey,
|
||||
ps.key());
|
||||
// Reset drop flags and next pos if we are not continuing from the same partition
|
||||
if (cmp_res < 0) {
|
||||
// Close previous partition, we are not going to continue it.
|
||||
push_mutation_fragment(*_schema, _permit, partition_end{});
|
||||
_drop_partition_start = false;
|
||||
_drop_static_row = false;
|
||||
_next_position_in_partition = position_in_partition::for_partition_start();
|
||||
_trim_range_tombstones = false;
|
||||
}
|
||||
} else { // should be a larger partition
|
||||
require(
|
||||
cmp_res < 0,
|
||||
@@ -1292,9 +1307,14 @@ bool evictable_reader::should_drop_fragment(const mutation_fragment& mf) {
|
||||
_drop_partition_start = false;
|
||||
return true;
|
||||
}
|
||||
if (_drop_static_row && mf.is_static_row()) {
|
||||
_drop_static_row = false;
|
||||
return true;
|
||||
// Unlike partition-start above, a partition is not guaranteed to have a
|
||||
// static row fragment. So reset the flag regardless of whether we could
|
||||
// drop one or not.
|
||||
// We are guaranteed to get here only right after dropping a partition-start,
|
||||
// so if we are not seeing a static row here, the partition doesn't have one.
|
||||
if (_drop_static_row) {
|
||||
_drop_static_row = false;
|
||||
return mf.is_static_row();
|
||||
}
|
||||
return false;
|
||||
}
|
||||
@@ -1537,8 +1557,8 @@ class shard_reader : public enable_lw_shared_from_this<shard_reader>, public fla
|
||||
private:
|
||||
shared_ptr<reader_lifecycle_policy> _lifecycle_policy;
|
||||
const unsigned _shard;
|
||||
const dht::partition_range* _pr;
|
||||
const query::partition_slice& _ps;
|
||||
dht::partition_range _pr;
|
||||
query::partition_slice _ps;
|
||||
const io_priority_class& _pc;
|
||||
tracing::global_trace_state_ptr _trace_state;
|
||||
const mutation_reader::forwarding _fwd_mr;
|
||||
@@ -1563,7 +1583,7 @@ public:
|
||||
: impl(std::move(schema), std::move(permit))
|
||||
, _lifecycle_policy(std::move(lifecycle_policy))
|
||||
, _shard(shard)
|
||||
, _pr(&pr)
|
||||
, _pr(pr)
|
||||
, _ps(ps)
|
||||
, _pc(pc)
|
||||
, _trace_state(std::move(trace_state))
|
||||
@@ -1647,7 +1667,7 @@ future<> shard_reader::do_fill_buffer(db::timeout_clock::time_point timeout) {
|
||||
});
|
||||
auto s = gs.get();
|
||||
auto rreader = make_foreign(std::make_unique<evictable_reader>(evictable_reader::auto_pause::yes, std::move(ms),
|
||||
s, _lifecycle_policy->semaphore().make_permit(s.get(), "shard-reader"), *_pr, _ps, _pc, _trace_state, _fwd_mr));
|
||||
s, _lifecycle_policy->semaphore().make_permit(s.get(), "shard-reader"), _pr, _ps, _pc, _trace_state, _fwd_mr));
|
||||
tracing::trace(_trace_state, "Creating shard reader on shard: {}", this_shard_id());
|
||||
auto f = rreader->fill_buffer(timeout);
|
||||
return f.then([rreader = std::move(rreader)] () mutable {
|
||||
@@ -1701,7 +1721,7 @@ future<> shard_reader::next_partition() {
|
||||
}
|
||||
|
||||
future<> shard_reader::fast_forward_to(const dht::partition_range& pr, db::timeout_clock::time_point timeout) {
|
||||
_pr = ≺
|
||||
_pr = pr;
|
||||
|
||||
if (!_reader && !_read_ahead) {
|
||||
// No need to fast-forward uncreated readers, they will be passed the new
|
||||
@@ -1710,12 +1730,12 @@ future<> shard_reader::fast_forward_to(const dht::partition_range& pr, db::timeo
|
||||
}
|
||||
|
||||
auto f = _read_ahead ? *std::exchange(_read_ahead, std::nullopt) : make_ready_future<>();
|
||||
return f.then([this, &pr, timeout] {
|
||||
return f.then([this, timeout] {
|
||||
_end_of_stream = false;
|
||||
clear_buffer();
|
||||
|
||||
return smp::submit_to(_shard, [this, &pr, timeout] {
|
||||
return _reader->fast_forward_to(pr, timeout);
|
||||
return smp::submit_to(_shard, [this, timeout] {
|
||||
return _reader->fast_forward_to(_pr, timeout);
|
||||
});
|
||||
});
|
||||
}
|
||||
@@ -2275,9 +2295,9 @@ position_reader_queue::~position_reader_queue() {}
|
||||
// are not implemented and throw an error; the reader is only used for single partition queries.
|
||||
//
|
||||
// Assumes that:
|
||||
// - the queue contains at least one reader,
|
||||
// - there are no static rows,
|
||||
// - the returned fragments do not contain partition tombstones.
|
||||
// - the returned fragments do not contain partition tombstones,
|
||||
// - the merged readers return fragments from the same partition (but some or even all of them may be empty).
|
||||
class clustering_order_reader_merger {
|
||||
const schema_ptr _schema;
|
||||
const reader_permit _permit;
|
||||
@@ -2389,12 +2409,17 @@ class clustering_order_reader_merger {
|
||||
if (!mf) {
|
||||
// The reader returned end-of-stream before returning end-of-partition
|
||||
// (otherwise we would have removed it in a previous peek). This means that
|
||||
// we are in forwarding mode and the reader won't return any more fragments in the current range.
|
||||
// either the reader was empty from the beginning (not even returning a `partition_start`)
|
||||
// or we are in forwarding mode and the reader won't return any more fragments in the current range.
|
||||
// If the reader's upper bound is smaller then the end of the current range then it won't
|
||||
// return any more fragments in later ranges as well (subsequent fast-forward-to ranges
|
||||
// are non-overlapping and strictly increasing), so we can remove it now.
|
||||
// Otherwise it may start returning fragments later, so we save it for the moment
|
||||
// in _halted_readers and will bring it back when we get fast-forwarded.
|
||||
// Otherwise, if it previously returned a `partition_start`, it may start returning more fragments
|
||||
// later (after we fast-forward) so we save it for the moment in _halted_readers and will bring it
|
||||
// back when we get fast-forwarded.
|
||||
// We also save the reader if it was empty from the beginning (no `partition_start`) since
|
||||
// it makes the code simpler (to check for this here we would need additional state); it is a bit wasteful
|
||||
// but completely empty readers should be rare.
|
||||
if (_cmp(it->upper_bound, _pr_end) < 0) {
|
||||
_all_readers.erase(it);
|
||||
} else {
|
||||
@@ -2524,19 +2549,6 @@ public:
|
||||
: position_in_partition_view::after_all_clustered_rows())
|
||||
, _should_emit_partition_end(fwd_sm == streamed_mutation::forwarding::no)
|
||||
{
|
||||
// The first call to `_reader_queue::pop` uses `after_all_clustered_rows`
|
||||
// so we obtain at least one reader; we will return this reader's `partition_start`
|
||||
// as the first fragment.
|
||||
auto rs = _reader_queue->pop(position_in_partition_view::after_all_clustered_rows());
|
||||
for (auto& r: rs) {
|
||||
_all_readers.push_front(std::move(r));
|
||||
_unpeeked_readers.push_back(_all_readers.begin());
|
||||
}
|
||||
|
||||
if (rs.empty()) {
|
||||
// No readers, no partition.
|
||||
_should_emit_partition_end = false;
|
||||
}
|
||||
}
|
||||
|
||||
// We assume that operator() is called sequentially and that the caller doesn't use the batch
|
||||
@@ -2553,8 +2565,22 @@ public:
|
||||
return peek_readers(timeout).then([this, timeout] { return (*this)(timeout); });
|
||||
}
|
||||
|
||||
auto next_peeked_pos = _peeked_readers.empty() ? _pr_end : _peeked_readers.front()->reader.peek_buffer().position();
|
||||
// There might be queued readers containing fragments with positions <= next_peeked_pos:
|
||||
// Before we return a batch of fragments using currently opened readers we must check the queue
|
||||
// for potential new readers that must be opened. There are three cases which determine how ``far''
|
||||
// should we look:
|
||||
// - If there are some peeked readers in the heap, we must check for new readers
|
||||
// whose `min_position`s are <= the position of the first peeked reader; there is no need
|
||||
// to check for ``later'' readers (yet).
|
||||
// - Otherwise, if we already fetched a partition start fragment, we need to look no further
|
||||
// than the end of the current position range (_pr_end).
|
||||
// - Otherwise we need to look for any reader (by calling the queue with `after_all_clustered_rows`),
|
||||
// even for readers whose `min_position`s may be outside the current position range since they
|
||||
// may be the only readers which have a `partition_start` fragment which we need to return
|
||||
// before end-of-stream.
|
||||
auto next_peeked_pos =
|
||||
_peeked_readers.empty()
|
||||
? (_partition_start_fetched ? _pr_end : position_in_partition_view::after_all_clustered_rows())
|
||||
: _peeked_readers.front()->reader.peek_buffer().position();
|
||||
if (!_reader_queue->empty(next_peeked_pos)) {
|
||||
auto rs = _reader_queue->pop(next_peeked_pos);
|
||||
for (auto& r: rs) {
|
||||
@@ -2568,8 +2594,11 @@ public:
|
||||
// We are either in forwarding mode and waiting for a fast-forward,
|
||||
// or we've exhausted all the readers.
|
||||
if (_should_emit_partition_end) {
|
||||
// Not forwarding, so all readers must be exhausted. Return the last fragment.
|
||||
_current_batch.push_back(mutation_fragment(*_schema, _permit, partition_end()));
|
||||
// Not forwarding, so all readers must be exhausted.
|
||||
// Return a partition end fragment unless all readers have been empty from the beginning.
|
||||
if (_partition_start_fetched) {
|
||||
_current_batch.push_back(mutation_fragment(*_schema, _permit, partition_end()));
|
||||
}
|
||||
_should_emit_partition_end = false;
|
||||
}
|
||||
return make_ready_future<mutation_fragment_batch>(_current_batch);
|
||||
|
||||
@@ -417,11 +417,11 @@ public:
|
||||
} else {
|
||||
// Copy row from older version because rows in evictable versions must
|
||||
// hold values which are independently complete to be consistent on eviction.
|
||||
auto e = current_allocator().construct<rows_entry>(_schema, *_current_row[0].it);
|
||||
auto e = alloc_strategy_unique_ptr<rows_entry>(current_allocator().construct<rows_entry>(_schema, *_current_row[0].it));
|
||||
e->set_continuous(latest_i != rows.end() && latest_i->continuous());
|
||||
_snp.tracker()->insert(*e);
|
||||
rows.insert_before(latest_i, *e);
|
||||
return {*e, true};
|
||||
auto e_i = rows.insert_before(latest_i, std::move(e));
|
||||
return ensure_result{*e_i, true};
|
||||
}
|
||||
}
|
||||
|
||||
@@ -453,11 +453,11 @@ public:
|
||||
}
|
||||
auto&& rows = _snp.version()->partition().clustered_rows();
|
||||
auto latest_i = get_iterator_in_latest_version();
|
||||
auto e = current_allocator().construct<rows_entry>(_schema, pos, is_dummy(!pos.is_clustering_row()),
|
||||
is_continuous(latest_i != rows.end() && latest_i->continuous()));
|
||||
auto e = alloc_strategy_unique_ptr<rows_entry>(current_allocator().construct<rows_entry>(_schema, pos, is_dummy(!pos.is_clustering_row()),
|
||||
is_continuous(latest_i != rows.end() && latest_i->continuous())));
|
||||
_snp.tracker()->insert(*e);
|
||||
rows.insert_before(latest_i, *e);
|
||||
return ensure_result{*e, true};
|
||||
auto e_i = rows.insert_before(latest_i, std::move(e));
|
||||
return ensure_result{*e_i, true};
|
||||
}
|
||||
|
||||
// Brings the entry pointed to by the cursor to the front of the LRU
|
||||
|
||||
@@ -267,9 +267,14 @@ public:
|
||||
return _current_tombstone;
|
||||
}
|
||||
|
||||
const std::deque<range_tombstone>& range_tombstones_for_row(const clustering_key_prefix& ck) {
|
||||
std::vector<range_tombstone> range_tombstones_for_row(const clustering_key_prefix& ck) {
|
||||
drop_unneeded_tombstones(ck);
|
||||
return _range_tombstones;
|
||||
std::vector<range_tombstone> result(_range_tombstones.begin(), _range_tombstones.end());
|
||||
auto cmp = [&] (const range_tombstone& rt1, const range_tombstone& rt2) {
|
||||
return _cmp(rt1.start_bound(), rt2.start_bound());
|
||||
};
|
||||
std::sort(result.begin(), result.end(), cmp);
|
||||
return result;
|
||||
}
|
||||
|
||||
std::deque<range_tombstone> range_tombstones() && {
|
||||
|
||||
@@ -142,6 +142,7 @@ class read_context final : public enable_lw_shared_from_this<read_context> {
|
||||
mutation_source_opt _underlying_snapshot;
|
||||
dht::partition_range _sm_range;
|
||||
std::optional<dht::decorated_key> _key;
|
||||
bool _partition_exists;
|
||||
row_cache::phase_type _phase;
|
||||
public:
|
||||
read_context(row_cache& cache,
|
||||
@@ -190,22 +191,34 @@ public:
|
||||
autoupdating_underlying_reader& underlying() { return _underlying; }
|
||||
row_cache::phase_type phase() const { return _phase; }
|
||||
const dht::decorated_key& key() const { return *_key; }
|
||||
bool partition_exists() const { return _partition_exists; }
|
||||
void on_underlying_created() { ++_underlying_created; }
|
||||
bool digest_requested() const { return _slice.options.contains<query::partition_slice::option::with_digest>(); }
|
||||
public:
|
||||
future<> ensure_underlying(db::timeout_clock::time_point timeout) {
|
||||
if (_underlying_snapshot) {
|
||||
return create_underlying(true, timeout);
|
||||
return create_underlying(timeout).then([this, timeout] {
|
||||
return _underlying.underlying()(timeout).then([this] (mutation_fragment_opt&& mfopt) {
|
||||
_partition_exists = bool(mfopt);
|
||||
});
|
||||
});
|
||||
}
|
||||
// We know that partition exists because all the callers of
|
||||
// enter_partition(const dht::decorated_key&, row_cache::phase_type)
|
||||
// check that and there's no other way of setting _underlying_snapshot
|
||||
// to empty. Except for calling create_underlying.
|
||||
_partition_exists = true;
|
||||
return make_ready_future<>();
|
||||
}
|
||||
public:
|
||||
future<> create_underlying(bool skip_first_fragment, db::timeout_clock::time_point timeout);
|
||||
future<> create_underlying(db::timeout_clock::time_point timeout);
|
||||
void enter_partition(const dht::decorated_key& dk, mutation_source& snapshot, row_cache::phase_type phase) {
|
||||
_phase = phase;
|
||||
_underlying_snapshot = snapshot;
|
||||
_key = dk;
|
||||
}
|
||||
// Precondition: each caller needs to make sure that partition with |dk| key
|
||||
// exists in underlying before calling this function.
|
||||
void enter_partition(const dht::decorated_key& dk, row_cache::phase_type phase) {
|
||||
_phase = phase;
|
||||
_underlying_snapshot = {};
|
||||
|
||||
@@ -77,7 +77,7 @@ class reader_permit::impl : public boost::intrusive::list_base_hook<boost::intru
|
||||
sstring _op_name;
|
||||
std::string_view _op_name_view;
|
||||
reader_resources _resources;
|
||||
reader_permit::state _state = reader_permit::state::registered;
|
||||
reader_permit::state _state = reader_permit::state::active;
|
||||
|
||||
public:
|
||||
struct value_tag {};
|
||||
@@ -126,40 +126,25 @@ public:
|
||||
}
|
||||
|
||||
void on_admission() {
|
||||
_state = reader_permit::state::admitted;
|
||||
_semaphore.consume(_resources);
|
||||
_state = reader_permit::state::active;
|
||||
}
|
||||
|
||||
void on_register_as_inactive() {
|
||||
if (_state != reader_permit::state::admitted) {
|
||||
_state = reader_permit::state::inactive;
|
||||
_semaphore.consume(_resources);
|
||||
}
|
||||
_state = reader_permit::state::inactive;
|
||||
}
|
||||
|
||||
void on_unregister_as_inactive() {
|
||||
if (_state == reader_permit::state::inactive) {
|
||||
_state = reader_permit::state::registered;
|
||||
_semaphore.signal(_resources);
|
||||
}
|
||||
}
|
||||
|
||||
bool should_forward_cost() const {
|
||||
return _state == reader_permit::state::admitted || _state == reader_permit::state::inactive;
|
||||
_state = reader_permit::state::active;
|
||||
}
|
||||
|
||||
void consume(reader_resources res) {
|
||||
_resources += res;
|
||||
if (should_forward_cost()) {
|
||||
_semaphore.consume(res);
|
||||
}
|
||||
_semaphore.consume(res);
|
||||
}
|
||||
|
||||
void signal(reader_resources res) {
|
||||
_resources -= res;
|
||||
if (should_forward_cost()) {
|
||||
_semaphore.signal(res);
|
||||
}
|
||||
_semaphore.signal(res);
|
||||
}
|
||||
|
||||
reader_resources resources() const {
|
||||
@@ -226,14 +211,11 @@ reader_resources reader_permit::consumed_resources() const {
|
||||
|
||||
std::ostream& operator<<(std::ostream& os, reader_permit::state s) {
|
||||
switch (s) {
|
||||
case reader_permit::state::registered:
|
||||
os << "registered";
|
||||
break;
|
||||
case reader_permit::state::waiting:
|
||||
os << "waiting";
|
||||
break;
|
||||
case reader_permit::state::admitted:
|
||||
os << "admitted";
|
||||
case reader_permit::state::active:
|
||||
os << "active";
|
||||
break;
|
||||
case reader_permit::state::inactive:
|
||||
os << "inactive";
|
||||
@@ -273,7 +255,7 @@ struct permit_group_key_hash {
|
||||
|
||||
using permit_groups = std::unordered_map<permit_group_key, permit_stats, permit_group_key_hash>;
|
||||
|
||||
static permit_stats do_dump_reader_permit_diagnostics(std::ostream& os, const permit_groups& permits, reader_permit::state state, bool sort_by_memory) {
|
||||
static permit_stats do_dump_reader_permit_diagnostics(std::ostream& os, const permit_groups& permits, reader_permit::state state) {
|
||||
struct permit_summary {
|
||||
const schema* s;
|
||||
std::string_view op_name;
|
||||
@@ -289,25 +271,17 @@ static permit_stats do_dump_reader_permit_diagnostics(std::ostream& os, const pe
|
||||
}
|
||||
}
|
||||
|
||||
std::ranges::sort(permit_summaries, [sort_by_memory] (const permit_summary& a, const permit_summary& b) {
|
||||
if (sort_by_memory) {
|
||||
return a.memory < b.memory;
|
||||
} else {
|
||||
return a.count < b.count;
|
||||
}
|
||||
std::ranges::sort(permit_summaries, [] (const permit_summary& a, const permit_summary& b) {
|
||||
return a.memory < b.memory;
|
||||
});
|
||||
|
||||
permit_stats total;
|
||||
|
||||
auto print_line = [&os, sort_by_memory] (auto col1, auto col2, auto col3) {
|
||||
if (sort_by_memory) {
|
||||
fmt::print(os, "{}\t{}\t{}\n", col2, col1, col3);
|
||||
} else {
|
||||
fmt::print(os, "{}\t{}\t{}\n", col1, col2, col3);
|
||||
}
|
||||
auto print_line = [&os] (auto col1, auto col2, auto col3) {
|
||||
fmt::print(os, "{}\t{}\t{}\n", col2, col1, col3);
|
||||
};
|
||||
|
||||
fmt::print(os, "Permits with state {}, sorted by {}\n", state, sort_by_memory ? "memory" : "count");
|
||||
fmt::print(os, "Permits with state {}\n", state);
|
||||
print_line("count", "memory", "name");
|
||||
for (const auto& summary : permit_summaries) {
|
||||
total.count += summary.count;
|
||||
@@ -333,13 +307,11 @@ static void do_dump_reader_permit_diagnostics(std::ostream& os, const reader_con
|
||||
permit_stats total;
|
||||
|
||||
fmt::print(os, "Semaphore {}: {}, dumping permit diagnostics:\n", semaphore.name(), problem);
|
||||
total += do_dump_reader_permit_diagnostics(os, permits, reader_permit::state::admitted, true);
|
||||
total += do_dump_reader_permit_diagnostics(os, permits, reader_permit::state::active);
|
||||
fmt::print(os, "\n");
|
||||
total += do_dump_reader_permit_diagnostics(os, permits, reader_permit::state::inactive, false);
|
||||
total += do_dump_reader_permit_diagnostics(os, permits, reader_permit::state::inactive);
|
||||
fmt::print(os, "\n");
|
||||
total += do_dump_reader_permit_diagnostics(os, permits, reader_permit::state::waiting, false);
|
||||
fmt::print(os, "\n");
|
||||
total += do_dump_reader_permit_diagnostics(os, permits, reader_permit::state::registered, false);
|
||||
total += do_dump_reader_permit_diagnostics(os, permits, reader_permit::state::waiting);
|
||||
fmt::print(os, "\n");
|
||||
fmt::print(os, "Total: permits: {}, memory: {}\n", total.count, utils::to_hr_size(total.memory));
|
||||
}
|
||||
@@ -417,11 +389,9 @@ reader_concurrency_semaphore::inactive_read_handle reader_concurrency_semaphore:
|
||||
auto& permit_impl = *reader.permit()._impl;
|
||||
// Implies _inactive_reads.empty(), we don't queue new readers before
|
||||
// evicting all inactive reads.
|
||||
// FIXME: #4758, workaround for keeping tabs on un-admitted reads that are
|
||||
// still registered as inactive. Without the below check, these can
|
||||
// accumulate without limit. The real fix is #4758 -- that is to make all
|
||||
// reads pass admission before getting started.
|
||||
if (_wait_list.empty() && (permit_impl.get_state() == reader_permit::state::admitted || _resources >= permit_impl.resources())) {
|
||||
// Checking the _wait_list covers the count resources only, so check memory
|
||||
// separately.
|
||||
if (_wait_list.empty() && _resources.memory > 0) {
|
||||
try {
|
||||
auto irp = std::make_unique<inactive_read>(std::move(reader));
|
||||
auto& ir = *irp;
|
||||
@@ -514,13 +484,13 @@ void reader_concurrency_semaphore::evict(inactive_read& ir, evict_reason reason)
|
||||
}
|
||||
|
||||
bool reader_concurrency_semaphore::has_available_units(const resources& r) const {
|
||||
return bool(_resources) && _resources >= r;
|
||||
// Special case: when there is no active reader (based on count) admit one
|
||||
// regardless of availability of memory.
|
||||
return (bool(_resources) && _resources >= r) || _resources.count == _initial_resources.count;
|
||||
}
|
||||
|
||||
bool reader_concurrency_semaphore::may_proceed(const resources& r) const {
|
||||
// Special case: when there is no active reader (based on count) admit one
|
||||
// regardless of availability of memory.
|
||||
return _wait_list.empty() && (has_available_units(r) || _resources.count == _initial_resources.count);
|
||||
return _wait_list.empty() && has_available_units(r);
|
||||
}
|
||||
|
||||
future<reader_permit::resource_units> reader_concurrency_semaphore::do_wait_admission(reader_permit permit, size_t memory,
|
||||
@@ -567,6 +537,12 @@ void reader_concurrency_semaphore::broken(std::exception_ptr ex) {
|
||||
}
|
||||
}
|
||||
|
||||
std::string reader_concurrency_semaphore::dump_diagnostics() const {
|
||||
std::ostringstream os;
|
||||
do_dump_reader_permit_diagnostics(os, *this, *_permit_list, "user request");
|
||||
return os.str();
|
||||
}
|
||||
|
||||
// A file that tracks the memory usage of buffers resulting from read
|
||||
// operations.
|
||||
class tracking_file_impl : public file_impl {
|
||||
|
||||
@@ -293,4 +293,6 @@ public:
|
||||
}
|
||||
|
||||
void broken(std::exception_ptr ex);
|
||||
|
||||
std::string dump_diagnostics() const;
|
||||
};
|
||||
|
||||
@@ -91,10 +91,9 @@ public:
|
||||
class resource_units;
|
||||
|
||||
enum class state {
|
||||
registered, // read is registered, but didn't attempt admission yet
|
||||
waiting, // waiting for admission
|
||||
admitted,
|
||||
inactive, // un-admitted reads that are registered as inactive
|
||||
active,
|
||||
inactive,
|
||||
};
|
||||
|
||||
class impl;
|
||||
|
||||
@@ -326,7 +326,7 @@ float node_ops_metrics::repair_finished_percentage() {
|
||||
tracker::tracker(size_t nr_shards, size_t max_repair_memory)
|
||||
: _shutdown(false)
|
||||
, _repairs(nr_shards) {
|
||||
auto nr = std::max(size_t(1), size_t(max_repair_memory / max_repair_memory_per_range()));
|
||||
auto nr = std::max(size_t(1), size_t(max_repair_memory / max_repair_memory_per_range() / 4));
|
||||
rlogger.info("Setting max_repair_memory={}, max_repair_memory_per_range={}, max_repair_ranges_in_parallel={}",
|
||||
max_repair_memory, max_repair_memory_per_range(), nr);
|
||||
_range_parallelism_semaphores.reserve(nr_shards);
|
||||
@@ -1578,6 +1578,7 @@ future<> bootstrap_with_repair(seastar::sharded<database>& db, seastar::sharded<
|
||||
auto& strat = ks.get_replication_strategy();
|
||||
dht::token_range_vector desired_ranges = strat.get_pending_address_ranges(tmptr, tokens, myip, utils::can_yield::yes);
|
||||
bool find_node_in_local_dc_only = strat.get_type() == locator::replication_strategy_type::network_topology;
|
||||
bool everywhere_topology = strat.get_type() == locator::replication_strategy_type::everywhere_topology;
|
||||
|
||||
//Active ranges
|
||||
auto metadata_clone = tmptr->clone_only_token_map().get0();
|
||||
@@ -1655,7 +1656,9 @@ future<> bootstrap_with_repair(seastar::sharded<database>& db, seastar::sharded<
|
||||
};
|
||||
auto old_endpoints_in_local_dc = get_old_endpoints_in_local_dc();
|
||||
auto rf_in_local_dc = get_rf_in_local_dc();
|
||||
if (old_endpoints.size() == strat.get_replication_factor()) {
|
||||
if (everywhere_topology) {
|
||||
neighbors = old_endpoints_in_local_dc;
|
||||
} else if (old_endpoints.size() == strat.get_replication_factor()) {
|
||||
// For example, with RF = 3 and 3 nodes n1, n2, n3
|
||||
// in the cluster, n4 is bootstrapped, old_replicas
|
||||
// = {n1, n2, n3}, new_replicas = {n1, n2, n4}, n3
|
||||
|
||||
13
row_cache.cc
13
row_cache.cc
@@ -332,7 +332,7 @@ public:
|
||||
}
|
||||
};
|
||||
|
||||
future<> read_context::create_underlying(bool skip_first_fragment, db::timeout_clock::time_point timeout) {
|
||||
future<> read_context::create_underlying(db::timeout_clock::time_point timeout) {
|
||||
if (_range_query) {
|
||||
// FIXME: Singular-range mutation readers don't support fast_forward_to(), so need to use a wide range
|
||||
// here in case the same reader will need to be fast forwarded later.
|
||||
@@ -340,13 +340,8 @@ future<> read_context::create_underlying(bool skip_first_fragment, db::timeout_c
|
||||
} else {
|
||||
_sm_range = dht::partition_range::make_singular({dht::ring_position(*_key)});
|
||||
}
|
||||
return _underlying.fast_forward_to(std::move(_sm_range), *_underlying_snapshot, _phase, timeout).then([this, skip_first_fragment, timeout] {
|
||||
return _underlying.fast_forward_to(std::move(_sm_range), *_underlying_snapshot, _phase, timeout).then([this] {
|
||||
_underlying_snapshot = {};
|
||||
if (skip_first_fragment) {
|
||||
return _underlying.underlying()(timeout).then([](auto &&mf) {});
|
||||
} else {
|
||||
return make_ready_future<>();
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
@@ -366,7 +361,7 @@ private:
|
||||
auto src_and_phase = _cache.snapshot_of(_read_context->range().start()->value());
|
||||
auto phase = src_and_phase.phase;
|
||||
_read_context->enter_partition(_read_context->range().start()->value().as_decorated_key(), src_and_phase.snapshot, phase);
|
||||
return _read_context->create_underlying(false, timeout).then([this, phase, timeout] {
|
||||
return _read_context->create_underlying(timeout).then([this, phase, timeout] {
|
||||
return _read_context->underlying().underlying()(timeout).then([this, phase] (auto&& mfopt) {
|
||||
if (!mfopt) {
|
||||
if (phase == _cache.phase_of(_read_context->range().start()->value())) {
|
||||
@@ -728,7 +723,7 @@ row_cache::make_reader(schema_ptr s,
|
||||
auto&& pos = ctx->range().start()->value();
|
||||
partitions_type::bound_hint hint;
|
||||
auto i = _partitions.lower_bound(pos, cmp, hint);
|
||||
if (i != _partitions.end() && hint.match) {
|
||||
if (hint.match) {
|
||||
cache_entry& e = *i;
|
||||
upgrade_entry(e);
|
||||
on_partition_hit();
|
||||
|
||||
2
seastar
2
seastar
Submodule seastar updated: 72e3baed9c...770167e835
@@ -89,7 +89,7 @@ template<typename Input>
|
||||
size_type read_frame_size(Input& in) {
|
||||
auto sz = deserialize(in, boost::type<size_type>());
|
||||
if (sz < sizeof(size_type)) {
|
||||
throw std::runtime_error("Truncated frame");
|
||||
throw std::runtime_error(fmt::format("IDL frame truncated: expected to have at least {} bytes, got {}", sizeof(size_type), sz));
|
||||
}
|
||||
return sz - sizeof(size_type);
|
||||
}
|
||||
|
||||
@@ -53,6 +53,7 @@
|
||||
#include "database.hh"
|
||||
#include "db/schema_tables.hh"
|
||||
#include "types/user.hh"
|
||||
#include "db/schema_tables.hh"
|
||||
|
||||
namespace service {
|
||||
|
||||
@@ -1075,8 +1076,19 @@ future<schema_ptr> get_schema_definition(table_schema_version v, netw::messaging
|
||||
// referenced by the incoming request.
|
||||
// That means the column mapping for the schema should always be inserted
|
||||
// with TTL (refresh TTL in case column mapping already existed prior to that).
|
||||
return db::schema_tables::store_column_mapping(proxy, s.unfreeze(db::schema_ctxt(proxy)), true).then([s] {
|
||||
return s;
|
||||
auto us = s.unfreeze(db::schema_ctxt(proxy));
|
||||
// if this is a view - we might need to fix it's schema before registering it.
|
||||
if (us->is_view()) {
|
||||
auto& db = proxy.local().local_db();
|
||||
schema_ptr base_schema = db.find_schema(us->view_info()->base_id());
|
||||
auto fixed_view = db::schema_tables::maybe_fix_legacy_secondary_index_mv_schema(db, view_ptr(us), base_schema,
|
||||
db::schema_tables::preserve_version::yes);
|
||||
if (fixed_view) {
|
||||
us = fixed_view;
|
||||
}
|
||||
}
|
||||
return db::schema_tables::store_column_mapping(proxy, us, true).then([us] {
|
||||
return frozen_schema{us};
|
||||
});
|
||||
});
|
||||
}).then([] (schema_ptr s) {
|
||||
@@ -1084,7 +1096,7 @@ future<schema_ptr> get_schema_definition(table_schema_version v, netw::messaging
|
||||
// table.
|
||||
if (s->is_view()) {
|
||||
if (!s->view_info()->base_info()) {
|
||||
auto& db = service::get_local_storage_proxy().get_db().local();
|
||||
auto& db = service::get_local_storage_proxy().local_db();
|
||||
// This line might throw a no_such_column_family
|
||||
// It should be fine since if we tried to register a view for which
|
||||
// we don't know the base table, our registry is broken.
|
||||
|
||||
@@ -3643,7 +3643,12 @@ protected:
|
||||
}
|
||||
|
||||
public:
|
||||
virtual future<foreign_ptr<lw_shared_ptr<query::result>>> execute(storage_proxy::clock_type::time_point timeout) {
|
||||
future<foreign_ptr<lw_shared_ptr<query::result>>> execute(storage_proxy::clock_type::time_point timeout) {
|
||||
if (_targets.empty()) {
|
||||
// We may have no targets to read from if a DC with zero replication is queried with LOCACL_QUORUM.
|
||||
// Return an empty result in this case
|
||||
return make_ready_future<foreign_ptr<lw_shared_ptr<query::result>>>(make_foreign(make_lw_shared(query::result())));
|
||||
}
|
||||
digest_resolver_ptr digest_resolver = ::make_shared<digest_read_resolver>(_schema, _cl, _block_for,
|
||||
db::is_datacenter_local(_cl) ? db::count_local_endpoints(_targets): _targets.size(), timeout);
|
||||
auto exec = shared_from_this();
|
||||
|
||||
@@ -797,6 +797,19 @@ storage_service::get_range_to_address_map(const sstring& keyspace,
|
||||
return construct_range_to_endpoint_map(ks, get_all_ranges(sorted_tokens));
|
||||
}
|
||||
|
||||
void storage_service::handle_state_replacing_update_pending_ranges(mutable_token_metadata_ptr tmptr, inet_address replacing_node) {
|
||||
try {
|
||||
slogger.info("handle_state_replacing: Waiting for replacing node {} to be alive on all shards", replacing_node);
|
||||
_gossiper.wait_alive({replacing_node}, std::chrono::milliseconds(5 * 1000));
|
||||
slogger.info("handle_state_replacing: Replacing node {} is now alive on all shards", replacing_node);
|
||||
} catch (...) {
|
||||
slogger.warn("handle_state_replacing: Failed to wait for replacing node {} to be alive on all shards: {}",
|
||||
replacing_node, std::current_exception());
|
||||
}
|
||||
slogger.info("handle_state_replacing: Update pending ranges for replacing node {}", replacing_node);
|
||||
update_pending_ranges(tmptr, format("handle_state_replacing {}", replacing_node)).get();
|
||||
}
|
||||
|
||||
void storage_service::handle_state_replacing(inet_address replacing_node) {
|
||||
slogger.debug("endpoint={} handle_state_replacing", replacing_node);
|
||||
auto host_id = _gossiper.get_host_id(replacing_node);
|
||||
@@ -817,7 +830,13 @@ void storage_service::handle_state_replacing(inet_address replacing_node) {
|
||||
slogger.info("Node {} is replacing existing node {} with host_id={}, existing_tokens={}, replacing_tokens={}",
|
||||
replacing_node, existing_node, host_id, existing_tokens, replacing_tokens);
|
||||
tmptr->add_replacing_endpoint(existing_node, replacing_node);
|
||||
update_pending_ranges(tmptr, format("handle_state_replacing {}", replacing_node)).get();
|
||||
if (_gossiper.is_alive(replacing_node)) {
|
||||
slogger.info("handle_state_replacing: Replacing node {} is already alive, update pending ranges", replacing_node);
|
||||
handle_state_replacing_update_pending_ranges(tmptr, replacing_node);
|
||||
} else {
|
||||
slogger.info("handle_state_replacing: Replacing node {} is not alive yet, delay update pending ranges", replacing_node);
|
||||
_replacing_nodes_pending_ranges_updater.insert(replacing_node);
|
||||
}
|
||||
replicate_to_all_cores(std::move(tmptr)).get();
|
||||
}
|
||||
|
||||
@@ -1127,6 +1146,14 @@ void storage_service::on_alive(gms::inet_address endpoint, gms::endpoint_state s
|
||||
if (get_token_metadata().is_member(endpoint)) {
|
||||
notify_up(endpoint);
|
||||
}
|
||||
if (_replacing_nodes_pending_ranges_updater.contains(endpoint)) {
|
||||
_replacing_nodes_pending_ranges_updater.erase(endpoint);
|
||||
slogger.info("Trigger pending ranges updater for replacing node {}", endpoint);
|
||||
auto tmlock = get_token_metadata_lock().get0();
|
||||
auto tmptr = get_mutable_token_metadata_ptr().get0();
|
||||
handle_state_replacing_update_pending_ranges(tmptr, endpoint);
|
||||
replicate_to_all_cores(std::move(tmptr)).get();
|
||||
}
|
||||
}
|
||||
|
||||
void storage_service::before_change(gms::inet_address endpoint, gms::endpoint_state current_state, gms::application_state new_state_key, const gms::versioned_value& new_value) {
|
||||
@@ -2301,7 +2328,13 @@ future<> storage_service::restore_replica_count(inet_address endpoint, inet_addr
|
||||
}
|
||||
return seastar::async([this, endpoint, notify_endpoint] {
|
||||
auto tmptr = get_token_metadata_ptr();
|
||||
auto streamer = make_lw_shared<dht::range_streamer>(_db, tmptr, _abort_source, get_broadcast_address(), "Restore_replica_count", streaming::stream_reason::removenode);
|
||||
abort_source as;
|
||||
auto sub = _abort_source.subscribe([&as] () noexcept {
|
||||
if (!as.abort_requested()) {
|
||||
as.request_abort();
|
||||
}
|
||||
});
|
||||
auto streamer = make_lw_shared<dht::range_streamer>(_db, tmptr, as, get_broadcast_address(), "Restore_replica_count", streaming::stream_reason::removenode);
|
||||
auto my_address = get_broadcast_address();
|
||||
auto non_system_keyspaces = _db.local().get_non_system_keyspaces();
|
||||
for (const auto& keyspace_name : non_system_keyspaces) {
|
||||
@@ -2319,6 +2352,42 @@ future<> storage_service::restore_replica_count(inet_address endpoint, inet_addr
|
||||
}
|
||||
streamer->add_rx_ranges(keyspace_name, std::move(ranges_per_endpoint));
|
||||
}
|
||||
auto status_checker = seastar::async([this, endpoint, &as] {
|
||||
slogger.info("restore_replica_count: Started status checker for removing node {}", endpoint);
|
||||
while (!as.abort_requested()) {
|
||||
auto status = _gossiper.get_gossip_status(endpoint);
|
||||
// If the node to be removed is already in removed status, it has
|
||||
// probably been removed forcely with `nodetool removenode force`.
|
||||
// Abort the restore_replica_count in such case to avoid streaming
|
||||
// attempt since the user has removed the node forcely.
|
||||
if (status == sstring(versioned_value::REMOVED_TOKEN)) {
|
||||
slogger.info("restore_replica_count: Detected node {} has left the cluster, status={}, abort restore_replica_count for removing node {}",
|
||||
endpoint, status, endpoint);
|
||||
if (!as.abort_requested()) {
|
||||
as.request_abort();
|
||||
}
|
||||
return;
|
||||
}
|
||||
slogger.debug("restore_replica_count: Sleep and detect removing node {}, status={}", endpoint, status);
|
||||
sleep_abortable(std::chrono::seconds(10), as).get();
|
||||
}
|
||||
});
|
||||
auto stop_status_checker = defer([endpoint, &status_checker, &as] () mutable {
|
||||
try {
|
||||
slogger.info("restore_replica_count: Started to stop status checker for removing node {}", endpoint);
|
||||
if (!as.abort_requested()) {
|
||||
as.request_abort();
|
||||
}
|
||||
status_checker.get();
|
||||
} catch (const seastar::sleep_aborted& ignored) {
|
||||
slogger.debug("restore_replica_count: Got sleep_abort to stop status checker for removing node {}: {}", endpoint, ignored);
|
||||
} catch (...) {
|
||||
slogger.warn("restore_replica_count: Found error in status checker for removing node {}: {}",
|
||||
endpoint, std::current_exception());
|
||||
}
|
||||
slogger.info("restore_replica_count: Finished to stop status checker for removing node {}", endpoint);
|
||||
});
|
||||
|
||||
streamer->stream_async().then_wrapped([this, streamer, notify_endpoint] (auto&& f) {
|
||||
try {
|
||||
f.get();
|
||||
@@ -2338,15 +2407,16 @@ void storage_service::excise(std::unordered_set<token> tokens, inet_address endp
|
||||
slogger.info("Removing tokens {} for {}", tokens, endpoint);
|
||||
// FIXME: HintedHandOffManager.instance.deleteHintsForEndpoint(endpoint);
|
||||
remove_endpoint(endpoint);
|
||||
auto tmlock = get_token_metadata_lock().get0();
|
||||
auto tmlock = std::make_optional(get_token_metadata_lock().get0());
|
||||
auto tmptr = get_mutable_token_metadata_ptr().get0();
|
||||
tmptr->remove_endpoint(endpoint);
|
||||
tmptr->remove_bootstrap_tokens(tokens);
|
||||
|
||||
notify_left(endpoint);
|
||||
|
||||
update_pending_ranges(tmptr, format("excise {}", endpoint)).get();
|
||||
replicate_to_all_cores(std::move(tmptr)).get();
|
||||
tmlock.reset();
|
||||
|
||||
notify_left(endpoint);
|
||||
}
|
||||
|
||||
void storage_service::excise(std::unordered_set<token> tokens, inet_address endpoint, int64_t expire_time) {
|
||||
@@ -2473,7 +2543,7 @@ private:
|
||||
int32_t status = 0;
|
||||
while (auto status_opt = co_await _source()) {
|
||||
status = std::get<0>(*status_opt);
|
||||
slogger.debug("send_meta_data: got error code={}, from node={}, status={}", status, _node);
|
||||
slogger.debug("send_meta_data: got error code={}, from node={}", status, _node);
|
||||
if (status == -1) {
|
||||
_error_from_peer = true;
|
||||
}
|
||||
@@ -3278,7 +3348,7 @@ shared_ptr<node_ops_info> node_ops_meta_data::get_ops_info() {
|
||||
|
||||
void storage_service::node_ops_update_heartbeat(utils::UUID ops_uuid) {
|
||||
slogger.debug("node_ops_update_heartbeat: ops_uuid={}", ops_uuid);
|
||||
auto permit = seastar::get_units(_node_ops_abort_sem, 1);
|
||||
auto permit = seastar::get_units(_node_ops_abort_sem, 1).get0();
|
||||
auto it = _node_ops.find(ops_uuid);
|
||||
if (it != _node_ops.end()) {
|
||||
node_ops_meta_data& meta = it->second;
|
||||
@@ -3288,7 +3358,7 @@ void storage_service::node_ops_update_heartbeat(utils::UUID ops_uuid) {
|
||||
|
||||
void storage_service::node_ops_done(utils::UUID ops_uuid) {
|
||||
slogger.debug("node_ops_done: ops_uuid={}", ops_uuid);
|
||||
auto permit = seastar::get_units(_node_ops_abort_sem, 1);
|
||||
auto permit = seastar::get_units(_node_ops_abort_sem, 1).get0();
|
||||
auto it = _node_ops.find(ops_uuid);
|
||||
if (it != _node_ops.end()) {
|
||||
node_ops_meta_data& meta = it->second;
|
||||
@@ -3299,7 +3369,7 @@ void storage_service::node_ops_done(utils::UUID ops_uuid) {
|
||||
|
||||
void storage_service::node_ops_abort(utils::UUID ops_uuid) {
|
||||
slogger.debug("node_ops_abort: ops_uuid={}", ops_uuid);
|
||||
auto permit = seastar::get_units(_node_ops_abort_sem, 1);
|
||||
auto permit = seastar::get_units(_node_ops_abort_sem, 1).get0();
|
||||
auto it = _node_ops.find(ops_uuid);
|
||||
if (it != _node_ops.end()) {
|
||||
node_ops_meta_data& meta = it->second;
|
||||
|
||||
@@ -587,6 +587,7 @@ private:
|
||||
sharded<db::view::view_update_generator>& _view_update_generator;
|
||||
locator::snitch_signal_slot_t _snitch_reconfigure;
|
||||
serialized_action _schema_version_publisher;
|
||||
std::unordered_set<gms::inet_address> _replacing_nodes_pending_ranges_updater;
|
||||
private:
|
||||
/**
|
||||
* Handle node bootstrap
|
||||
@@ -641,6 +642,8 @@ private:
|
||||
*/
|
||||
void handle_state_replacing(inet_address endpoint);
|
||||
|
||||
void handle_state_replacing_update_pending_ranges(mutable_token_metadata_ptr tmptr, inet_address replacing_node);
|
||||
|
||||
private:
|
||||
void excise(std::unordered_set<token> tokens, inet_address endpoint);
|
||||
void excise(std::unordered_set<token> tokens, inet_address endpoint, long expire_time);
|
||||
|
||||
@@ -468,7 +468,6 @@ protected:
|
||||
mutation_source_metadata _ms_metadata = {};
|
||||
garbage_collected_sstable_writer::data _gc_sstable_writer_data;
|
||||
compaction_sstable_replacer_fn _replacer;
|
||||
std::optional<compaction_weight_registration> _weight_registration;
|
||||
utils::UUID _run_identifier;
|
||||
::io_priority_class _io_priority;
|
||||
// optional clone of sstable set to be used for expiration purposes, so it will be set if expiration is enabled.
|
||||
@@ -487,7 +486,6 @@ protected:
|
||||
, _sstable_level(descriptor.level)
|
||||
, _gc_sstable_writer_data(*this)
|
||||
, _replacer(std::move(descriptor.replacer))
|
||||
, _weight_registration(std::move(descriptor.weight_registration))
|
||||
, _run_identifier(descriptor.run_identifier)
|
||||
, _io_priority(descriptor.io_priority)
|
||||
, _sstable_set(std::move(descriptor.all_sstables_snapshot))
|
||||
@@ -951,9 +949,6 @@ public:
|
||||
}
|
||||
|
||||
virtual void on_end_of_compaction() override {
|
||||
if (_weight_registration) {
|
||||
_cf.get_compaction_manager().on_compaction_complete(*_weight_registration);
|
||||
}
|
||||
replace_remaining_exhausted_sstables();
|
||||
}
|
||||
|
||||
|
||||
@@ -134,8 +134,6 @@ struct compaction_descriptor {
|
||||
uint64_t max_sstable_bytes;
|
||||
// Run identifier of output sstables.
|
||||
utils::UUID run_identifier;
|
||||
// Holds ownership of a weight assigned to this compaction iff it's a regular one.
|
||||
std::optional<compaction_weight_registration> weight_registration;
|
||||
// Calls compaction manager's task for this compaction to release reference to exhausted sstables.
|
||||
std::function<void(const std::vector<shared_sstable>& exhausted_sstables)> release_exhausted;
|
||||
// The options passed down to the compaction code.
|
||||
|
||||
@@ -314,6 +314,7 @@ future<> compaction_manager::run_custom_job(column_family* cf, sstring name, non
|
||||
cmlog.info("{} was abruptly stopped, reason: {}", name, e.what());
|
||||
} catch (...) {
|
||||
cmlog.error("{} failed: {}", name, std::current_exception());
|
||||
throw;
|
||||
}
|
||||
});
|
||||
return task->compaction_done.get_future().then([task] {});
|
||||
@@ -438,7 +439,7 @@ void compaction_manager::reevaluate_postponed_compactions() {
|
||||
}
|
||||
|
||||
void compaction_manager::postpone_compaction_for_column_family(column_family* cf) {
|
||||
_postponed.push_back(cf);
|
||||
_postponed.insert(cf);
|
||||
}
|
||||
|
||||
future<> compaction_manager::stop_ongoing_compactions(sstring reason) {
|
||||
@@ -578,7 +579,7 @@ void compaction_manager::submit(column_family* cf) {
|
||||
return make_ready_future<stop_iteration>(stop_iteration::yes);
|
||||
}
|
||||
auto compacting = make_lw_shared<compacting_sstable_registration>(this, descriptor.sstables);
|
||||
descriptor.weight_registration = compaction_weight_registration(this, weight);
|
||||
auto weight_r = compaction_weight_registration(this, weight);
|
||||
descriptor.release_exhausted = [compacting] (const std::vector<sstables::shared_sstable>& exhausted_sstables) {
|
||||
compacting->release_compacting(exhausted_sstables);
|
||||
};
|
||||
@@ -588,7 +589,7 @@ void compaction_manager::submit(column_family* cf) {
|
||||
_stats.pending_tasks--;
|
||||
_stats.active_tasks++;
|
||||
task->compaction_running = true;
|
||||
return cf.run_compaction(std::move(descriptor)).then_wrapped([this, task, compacting = std::move(compacting)] (future<> f) mutable {
|
||||
return cf.run_compaction(std::move(descriptor)).then_wrapped([this, task, compacting = std::move(compacting), weight_r = std::move(weight_r)] (future<> f) mutable {
|
||||
_stats.active_tasks--;
|
||||
task->compaction_running = false;
|
||||
|
||||
@@ -853,12 +854,15 @@ future<> compaction_manager::remove(column_family* cf) {
|
||||
task->stopping = true;
|
||||
}
|
||||
}
|
||||
_postponed.erase(boost::remove(_postponed, cf), _postponed.end());
|
||||
_postponed.erase(cf);
|
||||
|
||||
// Wait for the termination of an ongoing compaction on cf, if any.
|
||||
return do_for_each(*tasks_to_stop, [this, cf] (auto& task) {
|
||||
return this->task_stop(task);
|
||||
}).then([this, cf, tasks_to_stop] {
|
||||
#ifdef DEBUG
|
||||
assert(std::find_if(_tasks.begin(), _tasks.end(), [cf] (auto& task) { return task->compacting_cf == cf; }) == _tasks.end());
|
||||
#endif
|
||||
_compaction_locks.erase(cf);
|
||||
});
|
||||
}
|
||||
@@ -885,11 +889,6 @@ void compaction_manager::stop_compaction(sstring type) {
|
||||
}
|
||||
}
|
||||
|
||||
void compaction_manager::on_compaction_complete(compaction_weight_registration& weight_registration) {
|
||||
weight_registration.deregister();
|
||||
reevaluate_postponed_compactions();
|
||||
}
|
||||
|
||||
void compaction_manager::propagate_replacement(column_family* cf,
|
||||
const std::vector<sstables::shared_sstable>& removed, const std::vector<sstables::shared_sstable>& added) {
|
||||
for (auto& info : _compactions) {
|
||||
|
||||
@@ -100,7 +100,7 @@ private:
|
||||
future<> _waiting_reevalution = make_ready_future<>();
|
||||
condition_variable _postponed_reevaluation;
|
||||
// column families that wait for compaction but had its submission postponed due to ongoing compaction.
|
||||
std::vector<column_family*> _postponed;
|
||||
std::unordered_set<column_family*> _postponed;
|
||||
// tracks taken weights of ongoing compactions, only one compaction per weight is allowed.
|
||||
// weight is value assigned to a compaction job that is log base N of total size of all input sstables.
|
||||
std::unordered_set<int> _weight_tracker;
|
||||
@@ -257,11 +257,6 @@ public:
|
||||
// Stops ongoing compaction of a given type.
|
||||
void stop_compaction(sstring type);
|
||||
|
||||
// Called by compaction procedure to release the weight lock assigned to it, such that
|
||||
// another compaction waiting on same weight can start as soon as possible. That's usually
|
||||
// called before compaction seals sstable and such and after all compaction work is done.
|
||||
void on_compaction_complete(compaction_weight_registration& weight_registration);
|
||||
|
||||
double backlog() {
|
||||
return _backlog_manager.backlog();
|
||||
}
|
||||
|
||||
@@ -503,7 +503,8 @@ date_tiered_manifest::get_compaction_candidates(column_family& cf, std::vector<s
|
||||
|
||||
int64_t date_tiered_manifest::get_now(column_family& cf) {
|
||||
int64_t max_timestamp = 0;
|
||||
for (auto& sst : *cf.get_sstables()) {
|
||||
auto shared_set = cf.get_sstables();
|
||||
for (auto& sst : *shared_set) {
|
||||
int64_t candidate = sst->get_stats_metadata().max_timestamp;
|
||||
max_timestamp = candidate > max_timestamp ? candidate : max_timestamp;
|
||||
}
|
||||
|
||||
@@ -124,7 +124,7 @@ void sstable_writer_k_l::maybe_flush_pi_block(file_writer& out,
|
||||
// block includes them), but we set block_next_start_offset after - so
|
||||
// even if we wrote a lot of open tombstones, we still get a full
|
||||
// block size of new data.
|
||||
auto& rts = _pi_write.tombstone_accumulator->range_tombstones_for_row(
|
||||
auto rts = _pi_write.tombstone_accumulator->range_tombstones_for_row(
|
||||
clustering_key_prefix::from_range(clustering_key.values()));
|
||||
for (const auto& rt : rts) {
|
||||
auto start = composite::from_clustering_element(*_pi_write.schemap, rt.start);
|
||||
|
||||
@@ -147,7 +147,7 @@ leveled_compaction_strategy::get_reshaping_job(std::vector<shared_sstable> input
|
||||
unsigned overlapping_sstables = 0;
|
||||
auto prev_last = dht::ring_position::min();
|
||||
for (auto& sst : sstables) {
|
||||
if (dht::ring_position(sst->get_first_decorated_key()).less_compare(*schema, prev_last)) {
|
||||
if (dht::ring_position(sst->get_first_decorated_key()).tri_compare(*schema, prev_last) <= 0) {
|
||||
overlapping_sstables++;
|
||||
}
|
||||
prev_last = dht::ring_position(sst->get_last_decorated_key());
|
||||
@@ -193,7 +193,7 @@ leveled_compaction_strategy::get_reshaping_job(std::vector<shared_sstable> input
|
||||
// If there's only disjoint L0 sstables like on bootstrap, let's compact them all into a level L which has capacity to store the output.
|
||||
// The best possible level can be calculated with the formula: log (base fan_out) of (L0_total_bytes / max_sstable_size)
|
||||
auto [l0_disjoint, _] = is_disjoint(level_info[0], 0);
|
||||
if (mode == reshape_mode::strict && level_info[0].size() == input.size() && l0_disjoint) {
|
||||
if (mode == reshape_mode::strict && level_info[0].size() >= offstrategy_threshold && level_info[0].size() == input.size() && l0_disjoint) {
|
||||
auto log_fanout = [fanout = leveled_manifest::leveled_fan_out] (double x) {
|
||||
double inv_log_fanout = 1.0f / std::log(fanout);
|
||||
return log(x) * inv_log_fanout;
|
||||
|
||||
@@ -424,6 +424,11 @@ std::unique_ptr<incremental_selector_impl> time_series_sstable_set::make_increme
|
||||
// exactly once after all sstables are iterated over.
|
||||
//
|
||||
// The readers are created lazily on-demand using the supplied factory function.
|
||||
//
|
||||
// Additionally to the sstable readers, the queue always returns one ``dummy reader''
|
||||
// that contains only the partition_start/end markers. This dummy reader is always
|
||||
// returned as the first on the first `pop(b)` call for any `b`. Its upper bound
|
||||
// is `before_all_clustered_rows`.
|
||||
class min_position_reader_queue : public position_reader_queue {
|
||||
using container_t = time_series_sstable_set::container_t;
|
||||
using value_t = container_t::value_type;
|
||||
@@ -441,6 +446,11 @@ class min_position_reader_queue : public position_reader_queue {
|
||||
std::function<flat_mutation_reader(sstable&)> _create_reader;
|
||||
std::function<bool(const sstable&)> _filter;
|
||||
|
||||
// After construction contains a reader which returns only the partition
|
||||
// start (and end, if not in forwarding mode) markers. This is the first
|
||||
// returned reader.
|
||||
std::optional<flat_mutation_reader> _dummy_reader;
|
||||
|
||||
flat_mutation_reader create_reader(sstable& sst) {
|
||||
return _create_reader(sst);
|
||||
}
|
||||
@@ -450,10 +460,14 @@ class min_position_reader_queue : public position_reader_queue {
|
||||
}
|
||||
|
||||
public:
|
||||
// Assumes that `create_reader` returns readers that emit only fragments from partition `pk`.
|
||||
min_position_reader_queue(schema_ptr schema,
|
||||
lw_shared_ptr<const time_series_sstable_set::container_t> sstables,
|
||||
std::function<flat_mutation_reader(sstable&)> create_reader,
|
||||
std::function<bool(const sstable&)> filter)
|
||||
std::function<bool(const sstable&)> filter,
|
||||
partition_key pk,
|
||||
reader_permit permit,
|
||||
streamed_mutation::forwarding fwd_sm)
|
||||
: _schema(std::move(schema))
|
||||
, _sstables(std::move(sstables))
|
||||
, _it(_sstables->begin())
|
||||
@@ -461,6 +475,8 @@ public:
|
||||
, _cmp(*_schema)
|
||||
, _create_reader(std::move(create_reader))
|
||||
, _filter(std::move(filter))
|
||||
, _dummy_reader(flat_mutation_reader_from_mutations(
|
||||
std::move(permit), {mutation(_schema, std::move(pk))}, _schema->full_slice(), fwd_sm))
|
||||
{
|
||||
while (_it != _end && !this->filter(*_it->second)) {
|
||||
++_it;
|
||||
@@ -469,7 +485,8 @@ public:
|
||||
|
||||
virtual ~min_position_reader_queue() override = default;
|
||||
|
||||
// Open sstable readers to all sstables with smallest min_position() from the set
|
||||
// If the dummy reader was not yet returned, return the dummy reader.
|
||||
// Otherwise, open sstable readers to all sstables with smallest min_position() from the set
|
||||
// {S: filter(S) and prev_min_pos < S.min_position() <= bound}, where `prev_min_pos` is the min_position()
|
||||
// of the sstables returned from last non-empty pop() or -infinity if no sstables were previously returned,
|
||||
// and `filter` is the filtering function provided when creating the queue.
|
||||
@@ -483,6 +500,12 @@ public:
|
||||
return {};
|
||||
}
|
||||
|
||||
if (_dummy_reader) {
|
||||
std::vector<reader_and_upper_bound> ret;
|
||||
ret.emplace_back(*std::exchange(_dummy_reader, std::nullopt), position_in_partition::before_all_clustered_rows());
|
||||
return ret;
|
||||
}
|
||||
|
||||
// by !empty(bound) and `_it` invariant:
|
||||
// _it != _end, _it->first <= bound, and filter(*_it->second) == true
|
||||
assert(_cmp(_it->first, bound) <= 0);
|
||||
@@ -511,17 +534,22 @@ public:
|
||||
return ret;
|
||||
}
|
||||
|
||||
// Is the set of sstables {S: filter(S) and prev_min_pos < S.min_position() <= bound} empty?
|
||||
// (see pop() for definition of `prev_min_pos`)
|
||||
// If the dummy reader was not returned yet, returns false.
|
||||
// Otherwise checks if the set of sstables {S: filter(S) and prev_min_pos < S.min_position() <= bound}
|
||||
// is empty (see pop() for definition of `prev_min_pos`).
|
||||
virtual bool empty(position_in_partition_view bound) const override {
|
||||
return _it == _end || _cmp(_it->first, bound) > 0;
|
||||
return !_dummy_reader && (_it == _end || _cmp(_it->first, bound) > 0);
|
||||
}
|
||||
};
|
||||
|
||||
std::unique_ptr<position_reader_queue> time_series_sstable_set::make_min_position_reader_queue(
|
||||
std::function<flat_mutation_reader(sstable&)> create_reader,
|
||||
std::function<bool(const sstable&)> filter) const {
|
||||
return std::make_unique<min_position_reader_queue>(_schema, _sstables, std::move(create_reader), std::move(filter));
|
||||
std::function<bool(const sstable&)> filter,
|
||||
partition_key pk, schema_ptr schema, reader_permit permit,
|
||||
streamed_mutation::forwarding fwd_sm) const {
|
||||
return std::make_unique<min_position_reader_queue>(
|
||||
std::move(schema), _sstables, std::move(create_reader), std::move(filter),
|
||||
std::move(pk), std::move(permit), fwd_sm);
|
||||
}
|
||||
|
||||
std::unique_ptr<incremental_selector_impl> partitioned_sstable_set::make_incremental_selector() const {
|
||||
@@ -787,30 +815,19 @@ time_series_sstable_set::create_single_key_sstable_reader(
|
||||
auto& stats = *cf->cf_stats();
|
||||
stats.clustering_filter_count++;
|
||||
|
||||
auto ck_filter = [ranges = slice.get_all_ranges()] (const sstable& sst) { return sst.may_contain_rows(ranges); };
|
||||
{
|
||||
auto next = std::find_if(it, _sstables->end(), [&] (const sst_entry& e) { return ck_filter(*e.second); });
|
||||
stats.sstables_checked_by_clustering_filter += std::distance(it, next);
|
||||
it = next;
|
||||
}
|
||||
if (it == _sstables->end()) {
|
||||
// Some sstables passed the partition key filter, but none passed the clustering key filter.
|
||||
// However, we still have to emit a partition (even though it will be empty) so we don't fool the cache
|
||||
// into thinking this partition doesn't exist in any sstable (#3552).
|
||||
return flat_mutation_reader_from_mutations(std::move(permit), {mutation(schema, *pos.key())}, slice, fwd_sm);
|
||||
}
|
||||
|
||||
auto create_reader = [schema, permit, &pr, &slice, &pc, trace_state, fwd_sm] (sstable& sst) {
|
||||
return sst.make_reader(schema, permit, pr, slice, pc, trace_state, fwd_sm);
|
||||
};
|
||||
|
||||
auto ck_filter = [ranges = slice.get_all_ranges()] (const sstable& sst) { return sst.may_contain_rows(ranges); };
|
||||
|
||||
// We're going to pass this filter into min_position_reader_queue. The queue guarantees that
|
||||
// the filter is going to be called at most once for each sstable and exactly once after
|
||||
// the queue is exhausted. We use that fact to gather statistics.
|
||||
auto filter = [pk_filter = std::move(pk_filter), ck_filter = std::move(ck_filter), &stats]
|
||||
(const sstable& sst) {
|
||||
if (pk_filter(sst)) {
|
||||
return true;
|
||||
if (!pk_filter(sst)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
++stats.sstables_checked_by_clustering_filter;
|
||||
@@ -822,9 +839,12 @@ time_series_sstable_set::create_single_key_sstable_reader(
|
||||
return false;
|
||||
};
|
||||
|
||||
// Note that `min_position_reader_queue` always includes a reader which emits a `partition_start` fragment,
|
||||
// guaranteeing that the reader we return emits it as well; this helps us avoid the problem from #3552.
|
||||
return make_clustering_combined_reader(
|
||||
std::move(schema), std::move(permit), fwd_sm,
|
||||
make_min_position_reader_queue(std::move(create_reader), std::move(filter)));
|
||||
schema, permit, fwd_sm,
|
||||
make_min_position_reader_queue(
|
||||
std::move(create_reader), std::move(filter), *pos.key(), schema, permit, fwd_sm));
|
||||
}
|
||||
|
||||
compound_sstable_set::compound_sstable_set(schema_ptr schema, std::vector<lw_shared_ptr<sstable_set>> sets)
|
||||
@@ -954,6 +974,40 @@ sstable_set make_compound_sstable_set(schema_ptr schema, std::vector<lw_shared_p
|
||||
return sstable_set(std::make_unique<compound_sstable_set>(schema, std::move(sets)), schema);
|
||||
}
|
||||
|
||||
flat_mutation_reader
|
||||
compound_sstable_set::create_single_key_sstable_reader(
|
||||
column_family* cf,
|
||||
schema_ptr schema,
|
||||
reader_permit permit,
|
||||
utils::estimated_histogram& sstable_histogram,
|
||||
const dht::partition_range& pr,
|
||||
const query::partition_slice& slice,
|
||||
const io_priority_class& pc,
|
||||
tracing::trace_state_ptr trace_state,
|
||||
streamed_mutation::forwarding fwd,
|
||||
mutation_reader::forwarding fwd_mr) const {
|
||||
auto sets = _sets;
|
||||
auto it = std::partition(sets.begin(), sets.end(), [] (const auto& set) { return set->all()->size() > 0; });
|
||||
auto non_empty_set_count = std::distance(sets.begin(), it);
|
||||
|
||||
if (!non_empty_set_count) {
|
||||
return make_empty_flat_reader(schema, permit);
|
||||
}
|
||||
// optimize for common case where only 1 set is populated, avoiding the expensive combined reader
|
||||
if (non_empty_set_count == 1) {
|
||||
const auto& non_empty_set = *std::begin(sets);
|
||||
return non_empty_set->create_single_key_sstable_reader(cf, std::move(schema), std::move(permit), sstable_histogram, pr, slice, pc, trace_state, fwd, fwd_mr);
|
||||
}
|
||||
|
||||
auto readers = boost::copy_range<std::vector<flat_mutation_reader>>(
|
||||
boost::make_iterator_range(sets.begin(), it)
|
||||
| boost::adaptors::transformed([&] (const lw_shared_ptr<sstable_set>& non_empty_set) {
|
||||
return non_empty_set->create_single_key_sstable_reader(cf, schema, permit, sstable_histogram, pr, slice, pc, trace_state, fwd, fwd_mr);
|
||||
})
|
||||
);
|
||||
return make_combined_reader(std::move(schema), std::move(permit), std::move(readers), fwd, fwd_mr);
|
||||
}
|
||||
|
||||
flat_mutation_reader
|
||||
sstable_set::create_single_key_sstable_reader(
|
||||
column_family* cf,
|
||||
|
||||
@@ -136,7 +136,9 @@ public:
|
||||
|
||||
std::unique_ptr<position_reader_queue> make_min_position_reader_queue(
|
||||
std::function<flat_mutation_reader(sstable&)> create_reader,
|
||||
std::function<bool(const sstable&)> filter) const;
|
||||
std::function<bool(const sstable&)> filter,
|
||||
partition_key pk, schema_ptr schema, reader_permit permit,
|
||||
streamed_mutation::forwarding fwd_sm) const;
|
||||
|
||||
virtual flat_mutation_reader create_single_key_sstable_reader(
|
||||
column_family*,
|
||||
@@ -167,6 +169,19 @@ public:
|
||||
virtual void insert(shared_sstable sst) override;
|
||||
virtual void erase(shared_sstable sst) override;
|
||||
virtual std::unique_ptr<incremental_selector_impl> make_incremental_selector() const override;
|
||||
|
||||
virtual flat_mutation_reader create_single_key_sstable_reader(
|
||||
column_family*,
|
||||
schema_ptr,
|
||||
reader_permit,
|
||||
utils::estimated_histogram&,
|
||||
const dht::partition_range&,
|
||||
const query::partition_slice&,
|
||||
const io_priority_class&,
|
||||
tracing::trace_state_ptr,
|
||||
streamed_mutation::forwarding,
|
||||
mutation_reader::forwarding) const override;
|
||||
|
||||
class incremental_selector;
|
||||
};
|
||||
|
||||
|
||||
@@ -101,7 +101,8 @@ class time_window_compaction_strategy : public compaction_strategy_impl {
|
||||
time_window_compaction_strategy_options _options;
|
||||
int64_t _estimated_remaining_tasks = 0;
|
||||
db_clock::time_point _last_expired_check;
|
||||
timestamp_type _highest_window_seen;
|
||||
// As timestamp_type is an int64_t, a primitive type, it must be initialized here.
|
||||
timestamp_type _highest_window_seen = 0;
|
||||
// Keep track of all recent active windows that still need to be compacted into a single SSTable
|
||||
std::unordered_set<timestamp_type> _recent_active_windows;
|
||||
size_tiered_compaction_strategy_options _stcs_options;
|
||||
|
||||
@@ -380,7 +380,7 @@ future<prepare_message> stream_session::prepare(std::vector<stream_request> requ
|
||||
try {
|
||||
db.find_column_family(ks, cf);
|
||||
} catch (no_such_column_family&) {
|
||||
auto err = format("[Stream #{{}}] prepare requested ks={{}} cf={{}} does not exist", plan_id, ks, cf);
|
||||
auto err = format("[Stream #{}] prepare requested ks={} cf={} does not exist", plan_id, ks, cf);
|
||||
sslog.warn(err.c_str());
|
||||
throw std::runtime_error(err);
|
||||
}
|
||||
@@ -394,7 +394,7 @@ future<prepare_message> stream_session::prepare(std::vector<stream_request> requ
|
||||
try {
|
||||
db.find_column_family(cf_id);
|
||||
} catch (no_such_column_family&) {
|
||||
auto err = format("[Stream #{{}}] prepare cf_id={} does not exist", plan_id, cf_id);
|
||||
auto err = format("[Stream #{}] prepare cf_id={} does not exist", plan_id, cf_id);
|
||||
sslog.warn(err.c_str());
|
||||
throw std::runtime_error(err);
|
||||
}
|
||||
|
||||
7
table.cc
7
table.cc
@@ -273,7 +273,8 @@ flat_mutation_reader table::make_streaming_reader(schema_ptr schema, const dht::
|
||||
auto trace_state = tracing::trace_state_ptr();
|
||||
const auto fwd = streamed_mutation::forwarding::no;
|
||||
const auto fwd_mr = mutation_reader::forwarding::no;
|
||||
return make_sstable_reader(schema, permit, sstables, range, slice, pc, std::move(trace_state), fwd, fwd_mr);
|
||||
return sstables->make_range_sstable_reader(std::move(schema), std::move(permit), range, slice, pc,
|
||||
std::move(trace_state), fwd, fwd_mr);
|
||||
}
|
||||
|
||||
future<std::vector<locked_cell>> table::lock_counter_cells(const mutation& m, db::timeout_clock::time_point timeout) {
|
||||
@@ -952,8 +953,8 @@ void table::try_trigger_compaction() noexcept {
|
||||
}
|
||||
|
||||
void table::do_trigger_compaction() {
|
||||
// But only submit if we're not locked out
|
||||
if (!_compaction_disabled) {
|
||||
// But not if we're locked out or stopping
|
||||
if (!_compaction_disabled && !_async_gate.is_closed()) {
|
||||
_compaction_manager.submit(this);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -85,3 +85,20 @@ def test_signature_too_futuristic(dynamodb, test_table):
|
||||
response = requests.post(url, headers=headers, verify=False)
|
||||
assert not response.ok
|
||||
assert "InvalidSignatureException" in response.text and "Signature not yet current" in response.text
|
||||
|
||||
# A test that commas can be uses instead of whitespace to separate components
|
||||
# of the Authorization headers - reproducing issue #9568.
|
||||
def test_authorization_no_whitespace(dynamodb, test_table):
|
||||
# Unlike the above tests which checked error cases so didn't need to
|
||||
# calculate a real signature, in this test we really a correct signature,
|
||||
# so we use a function we already have in test_manual_requests.py.
|
||||
from test_manual_requests import get_signed_request
|
||||
payload = '{"TableName": "' + test_table.name + '", "Item": {"p": {"S": "x"}, "c": {"S": "x"}}}'
|
||||
req = get_signed_request(dynamodb, 'PutItem', payload)
|
||||
# Boto3 separates the components of the Authorization header by spaces.
|
||||
# Let's remove all of them except the first one (which separates the
|
||||
# signature algorithm name from the rest) and check the result still works:
|
||||
a = req.headers['Authorization'].split()
|
||||
req.headers['Authorization'] = a[0] + ' ' + ''.join(a[1:])
|
||||
response = requests.post(req.url, headers=req.headers, data=req.body, verify=False)
|
||||
assert response.ok
|
||||
|
||||
@@ -154,6 +154,27 @@ def test_update_condition_eq_unequal(test_table_s):
|
||||
ConditionExpression='q = :oldval',
|
||||
ExpressionAttributeValues={':val1': 3, ':oldval': 2})
|
||||
|
||||
# In test_update_condition_eq_unequal() above we saw that a non-existent
|
||||
# attribute is not "=" to a value. Here we check what happens when two
|
||||
# non-existent attributes are checked for equality. It turns out, they should
|
||||
# *not* be considered equal. In short, an unset attribute is never equal to
|
||||
# anything - not even to another unset attribute.
|
||||
# Reproduces issue #8511.
|
||||
def test_update_condition_eq_two_unset(test_table_s):
|
||||
p = random_string()
|
||||
with pytest.raises(ClientError, match='ConditionalCheckFailedException'):
|
||||
test_table_s.update_item(Key={'p': p},
|
||||
UpdateExpression='SET a = :val1',
|
||||
ConditionExpression='q = z',
|
||||
ExpressionAttributeValues={':val1': 2})
|
||||
test_table_s.update_item(Key={'p': p},
|
||||
AttributeUpdates={'a': {'Value': 1, 'Action': 'PUT'}})
|
||||
with pytest.raises(ClientError, match='ConditionalCheckFailedException'):
|
||||
test_table_s.update_item(Key={'p': p},
|
||||
UpdateExpression='SET a = :val1',
|
||||
ConditionExpression='q = z',
|
||||
ExpressionAttributeValues={':val1': 3})
|
||||
|
||||
# Check that set equality is checked correctly. Unlike string equality (for
|
||||
# example), it cannot be done with just naive string comparison of the JSON
|
||||
# representation, and we need to allow for any order. (see issue #5021)
|
||||
@@ -175,6 +196,38 @@ def test_update_condition_eq_set(test_table_s):
|
||||
ExpressionAttributeValues={':val1': 3, ':oldval': set(['chinchilla', 'cat', 'dog', 'mouse'])})
|
||||
assert 'b' in test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item']
|
||||
|
||||
# The above test (test_update_condition_eq_set()) checked equality of simple
|
||||
# set attributes. But an attributes can contain a nested document, where the
|
||||
# set sits in a deep level (the set itself is a leaf in this heirarchy because
|
||||
# it can only contain numbers, strings or bytes). We need to correctly support
|
||||
# equality check in that case too.
|
||||
# Reproduces issue #8514.
|
||||
def test_update_condition_eq_nested_set(test_table_s):
|
||||
p = random_string()
|
||||
# Because boto3 sorts the set values we give it, in order to generate a
|
||||
# set with a different order, we need to build it incrementally.
|
||||
test_table_s.update_item(Key={'p': p},
|
||||
AttributeUpdates={'a': {'Value': {'b': 'c', 'd': ['e', 'f', set(['g', 'h'])], 'i': set(['j', 'k'])}, 'Action': 'PUT'}})
|
||||
test_table_s.update_item(Key={'p': p},
|
||||
UpdateExpression='ADD a.d[2] :val1, a.i :val2',
|
||||
ExpressionAttributeValues={':val1': set(['l', 'm']), ':val2': set(['n', 'o'])})
|
||||
# Sanity check - the attribute contains the set we think it does
|
||||
expected = {'b': 'c', 'd': ['e', 'f', set(['g', 'h', 'l', 'm'])], 'i': set(['j', 'k', 'n', 'o'])}
|
||||
assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item']['a'] == expected
|
||||
# Now finally check that condition expression check knows the equality too.
|
||||
test_table_s.update_item(Key={'p': p},
|
||||
UpdateExpression='SET b = :val1',
|
||||
ConditionExpression='a = :oldval',
|
||||
ExpressionAttributeValues={':val1': 3, ':oldval': expected})
|
||||
assert 'b' in test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item']
|
||||
# Check that equality can also fail, if the inner set differs
|
||||
wrong = {'b': 'c', 'd': ['e', 'f', set(['g', 'h', 'l', 'bad'])], 'i': set(['j', 'k', 'n', 'o'])}
|
||||
with pytest.raises(ClientError, match='ConditionalCheckFailedException'):
|
||||
test_table_s.update_item(Key={'p': p},
|
||||
UpdateExpression='SET b = :val1',
|
||||
ConditionExpression='a = :oldval',
|
||||
ExpressionAttributeValues={':val1': 4, ':oldval': wrong})
|
||||
|
||||
# Test for ConditionExpression with operator "<>" (non-equality),
|
||||
def test_update_condition_ne(test_table_s):
|
||||
p = random_string()
|
||||
@@ -215,6 +268,54 @@ def test_update_condition_ne(test_table_s):
|
||||
ExpressionAttributeValues={':newval': 3, ':oldval': 1})
|
||||
assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item']['c'] == 3
|
||||
|
||||
# Check that set inequality is checked correctly. This reproduces the same
|
||||
# bug #5021 that we reproduced above in test_update_condition_eq_set(), just
|
||||
# that here we check the inequality operator instead of equality.
|
||||
# Reproduces issue #8513.
|
||||
def test_update_condition_ne_set(test_table_s):
|
||||
p = random_string()
|
||||
# Because boto3 sorts the set values we give it, in order to generate a
|
||||
# set with a different order, we need to build it incrementally.
|
||||
test_table_s.update_item(Key={'p': p},
|
||||
AttributeUpdates={'a': {'Value': set(['dog', 'chinchilla']), 'Action': 'PUT'}})
|
||||
test_table_s.update_item(Key={'p': p},
|
||||
UpdateExpression='ADD a :val1',
|
||||
ExpressionAttributeValues={':val1': set(['cat', 'mouse'])})
|
||||
# Sanity check - the attribute contains the set we think it does
|
||||
assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item']['a'] == set(['chinchilla', 'cat', 'dog', 'mouse'])
|
||||
# Now check that condition expression check knows there is no inequality
|
||||
# here.
|
||||
with pytest.raises(ClientError, match='ConditionalCheckFailedException'):
|
||||
test_table_s.update_item(Key={'p': p},
|
||||
UpdateExpression='SET b = :val1',
|
||||
ConditionExpression='a <> :oldval',
|
||||
ExpressionAttributeValues={':val1': 2, ':oldval': set(['chinchilla', 'cat', 'dog', 'mouse'])})
|
||||
# As a sanity check, also check something which should be unequal:
|
||||
test_table_s.update_item(Key={'p': p},
|
||||
UpdateExpression='SET b = :val1',
|
||||
ConditionExpression='a <> :oldval',
|
||||
ExpressionAttributeValues={':val1': 3, ':oldval': set(['chinchilla', 'cat', 'dog', 'horse'])})
|
||||
assert 'b' in test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item']
|
||||
|
||||
# In test_update_condition_ne() above we saw that a non-existent attribute is
|
||||
# "not equal" to any value. Here we check what happens when two non-existent
|
||||
# attributes are checked for non-equality. It turns out, they are also
|
||||
# considered "not equal". In short, an unset attribute is always "not equal" to
|
||||
# anything - even to another unset attribute.
|
||||
# Reproduces issue #8511.
|
||||
def test_update_condition_ne_two_unset(test_table_s):
|
||||
p = random_string()
|
||||
test_table_s.update_item(Key={'p': p},
|
||||
UpdateExpression='SET a = :val1',
|
||||
ConditionExpression='q <> z',
|
||||
ExpressionAttributeValues={':val1': 2})
|
||||
assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item']['a'] == 2
|
||||
test_table_s.update_item(Key={'p': p},
|
||||
UpdateExpression='SET a = :val1',
|
||||
ConditionExpression='q <> z',
|
||||
ExpressionAttributeValues={':val1': 3})
|
||||
assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item']['a'] == 3
|
||||
|
||||
# Test for ConditionExpression with operator "<"
|
||||
def test_update_condition_lt(test_table_s):
|
||||
p = random_string()
|
||||
@@ -316,6 +417,45 @@ def test_update_condition_lt(test_table_s):
|
||||
ExpressionAttributeValues={':newval': 2, ':oldval': 1})
|
||||
assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item']['z'] == 4
|
||||
|
||||
# In test_update_condition_lt() above we saw that a non-existent attribute is
|
||||
# not "<" any value. Here we check what happens when two non-existent
|
||||
# attributes are compared with "<". It turns out that the result of such
|
||||
# comparison is also false.
|
||||
# The same is true for other order operators - any order comparison involving
|
||||
# one unset attribute should be false - even if the second operand is an
|
||||
# unset attribute as well. Note that the <> operator is different - it is
|
||||
# always results in true if one of the operands is an unset attribute (see
|
||||
# test_update_condition_ne_two_unset() above).
|
||||
# This test is related to issue #8511 (although it passed even before fixing
|
||||
# that issue).
|
||||
def test_update_condition_comparison_two_unset(test_table_s):
|
||||
p = random_string()
|
||||
ops = ['<', '<=', '>', '>=']
|
||||
for op in ops:
|
||||
with pytest.raises(ClientError, match='ConditionalCheckFailedException'):
|
||||
test_table_s.update_item(Key={'p': p},
|
||||
UpdateExpression='SET a = :val1',
|
||||
ConditionExpression='q ' + op + ' z',
|
||||
ExpressionAttributeValues={':val1': 2})
|
||||
with pytest.raises(ClientError, match='ConditionalCheckFailedException'):
|
||||
test_table_s.update_item(Key={'p': p},
|
||||
UpdateExpression='SET a = :val1',
|
||||
ConditionExpression='q between z and x',
|
||||
ExpressionAttributeValues={':val1': 2})
|
||||
test_table_s.update_item(Key={'p': p},
|
||||
AttributeUpdates={'a': {'Value': 1, 'Action': 'PUT'}})
|
||||
for op in ops:
|
||||
with pytest.raises(ClientError, match='ConditionalCheckFailedException'):
|
||||
test_table_s.update_item(Key={'p': p},
|
||||
UpdateExpression='SET a = :val1',
|
||||
ConditionExpression='q ' + op + ' z',
|
||||
ExpressionAttributeValues={':val1': 3})
|
||||
with pytest.raises(ClientError, match='ConditionalCheckFailedException'):
|
||||
test_table_s.update_item(Key={'p': p},
|
||||
UpdateExpression='SET a = :val1',
|
||||
ConditionExpression='q between z and x',
|
||||
ExpressionAttributeValues={':val1': 2})
|
||||
|
||||
# Test for ConditionExpression with operator "<="
|
||||
def test_update_condition_le(test_table_s):
|
||||
p = random_string()
|
||||
|
||||
@@ -186,3 +186,25 @@ def test_incorrect_numbers(dynamodb, test_table):
|
||||
req = get_signed_request(dynamodb, 'PutItem', payload)
|
||||
response = requests.post(req.url, headers=req.headers, data=req.body, verify=False)
|
||||
assert "ValidationException" in response.text and "numeric" in response.text
|
||||
|
||||
# Although the DynamoDB API responses are JSON, additional conventions apply
|
||||
# to these responses - such as how error codes are encoded in JSON. For this
|
||||
# reason, DynamoDB uses the content type 'application/x-amz-json-1.0' instead
|
||||
# of the standard 'application/json'. This test verifies that we return the
|
||||
# correct content type header.
|
||||
# While most DynamoDB libraries we tried do not care about an unexpected
|
||||
# content-type, it turns out that one (aiodynamo) does. Moreover, AWS already
|
||||
# defined x-amz-json-1.1 - see
|
||||
# https://awslabs.github.io/smithy/1.0/spec/aws/aws-json-1_1-protocol.html
|
||||
# which differs (only) in how it encodes error replies.
|
||||
# So in the future it may become even more important that Scylla return the
|
||||
# correct content type.
|
||||
def test_content_type(dynamodb, test_table):
|
||||
payload = '{"TableName": "' + test_table.name + '", "Item": {"p": {"S": "x"}, "c": {"S": "x"}}}'
|
||||
# Note that get_signed_request() uses x-amz-json-1.0 to encode the
|
||||
# *request*. In the future this may or may not effect the content type
|
||||
# in the response (today, DynamoDB doesn't allow any other content type
|
||||
# in the request anyway).
|
||||
req = get_signed_request(dynamodb, 'PutItem', payload)
|
||||
response = requests.post(req.url, headers=req.headers, data=req.body, verify=False)
|
||||
assert response.headers['Content-Type'] == 'application/x-amz-json-1.0'
|
||||
|
||||
113
test/alternator/test_metrics.py
Normal file
113
test/alternator/test_metrics.py
Normal file
@@ -0,0 +1,113 @@
|
||||
# Copyright 2021-present ScyllaDB
|
||||
#
|
||||
# This file is part of Scylla.
|
||||
#
|
||||
# Scylla is free software: you can redistribute it and/or modify
|
||||
# it under the terms of the GNU Affero General Public License as published by
|
||||
# the Free Software Foundation, either version 3 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# Scylla is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU Affero General Public License
|
||||
# along with Scylla. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
##############################################################################
|
||||
# Tests for Scylla's metrics (see docs/design-notes/metrics.md) for Alternator
|
||||
# queries. Reproduces issue #9406, where although metrics was implemented for
|
||||
# Alternator requests, they were missing for some operations (BatchGetItem).
|
||||
# In the tests here we attempt to ensure that the metrics continue to work
|
||||
# for the relevant operations as the code evolves.
|
||||
#
|
||||
# Note that all tests in this file test Scylla-specific features, and are
|
||||
# "skipped" when not running against Scylla, or when unable to retrieve
|
||||
# metrics through out-of-band HTTP requests to Scylla's Prometheus port (9180).
|
||||
#
|
||||
# IMPORTANT: we do not want these tests to assume that are not running in
|
||||
# parallel with any other tests or workload - because such an assumption
|
||||
# would limit our test deployment options in the future. NOT making this
|
||||
# assumption means that these tests can't check that a certain operation
|
||||
# increases a certain counter by exactly 1 - because other concurrent
|
||||
# operations might increase it further! So our test can only check that the
|
||||
# counter increases.
|
||||
##############################################################################
|
||||
|
||||
import pytest
|
||||
import requests
|
||||
import re
|
||||
|
||||
from util import random_string
|
||||
|
||||
# Fixture for checking if we are able to test Scylla metrics. Scylla metrics
|
||||
# are not available on AWS (of course), but may also not be available for
|
||||
# Scylla if for some reason we have only access to the Alternator protocol
|
||||
# port but no access to the metrics port (9180).
|
||||
# If metrics are *not* available, tests using this fixture will be skipped.
|
||||
# Tests using this fixture may call get_metrics(metrics).
|
||||
@pytest.fixture(scope="module")
|
||||
def metrics(dynamodb):
|
||||
if dynamodb.meta.client._endpoint.host.endswith('.amazonaws.com'):
|
||||
pytest.skip('Scylla-only feature not supported by AWS')
|
||||
url = dynamodb.meta.client._endpoint.host
|
||||
# The Prometheus API is on port 9180, and always http, not https.
|
||||
url = re.sub(r':[0-9]+(/|$)', ':9180', url)
|
||||
url = re.sub(r'^https:', 'http:', url)
|
||||
url = url + '/metrics'
|
||||
resp = requests.get(url)
|
||||
if resp.status_code != 200:
|
||||
pytest.skip('Metrics port 9180 is not available')
|
||||
yield url
|
||||
|
||||
# Utility function for fetching all metrics from Scylla, using an HTTP request
|
||||
# to port 9180. The response format is defined by the Prometheus protocol.
|
||||
# Only use get_metrics() in a test using the metrics_available fixture.
|
||||
def get_metrics(metrics):
|
||||
response = requests.get(metrics)
|
||||
assert response.status_code == 200
|
||||
return response.text
|
||||
|
||||
# Utility function for fetching a metric with a given name and optionally a
|
||||
# given sub-metric label (which should be a name-value map). If multiple
|
||||
# matches are found, they are summed - this is useful for summing up the
|
||||
# counts from multiple shards.
|
||||
def get_metric(metrics, name, requested_labels=None):
|
||||
total = 0.0
|
||||
lines = re.compile('^'+name+'{.*$', re.MULTILINE)
|
||||
for match in re.findall(lines, get_metrics(metrics)):
|
||||
a = match.split()
|
||||
metric = a[0]
|
||||
val = float(a[1])
|
||||
# Check if match also matches the requested labels
|
||||
if requested_labels:
|
||||
# we know metric begins with name{ and ends with } - the labels
|
||||
# are what we have between those
|
||||
got_labels = metric[len(name)+1:-1].split(',')
|
||||
# Check that every one of the requested labels is in got_labels:
|
||||
for k, v in requested_labels.items():
|
||||
if not f'{k}="{v}"' in got_labels:
|
||||
# No match for requested label, skip this metric (python
|
||||
# doesn't have "continue 2" so let's just set val to 0...
|
||||
val = 0
|
||||
break
|
||||
total += float(val)
|
||||
return total
|
||||
|
||||
def test_batch_write_item(test_table_s, metrics):
|
||||
n1 = get_metric(metrics, 'scylla_alternator_operation', {'op': 'BatchWriteItem'})
|
||||
test_table_s.meta.client.batch_write_item(RequestItems = {
|
||||
test_table_s.name: [{'PutRequest': {'Item': {'p': random_string(), 'a': 'hi'}}}]})
|
||||
n2 = get_metric(metrics, 'scylla_alternator_operation', {'op': 'BatchWriteItem'})
|
||||
assert n2 > n1
|
||||
|
||||
# Reproduces issue #9406:
|
||||
def test_batch_get_item(test_table_s, metrics):
|
||||
n1 = get_metric(metrics, 'scylla_alternator_operation', {'op': 'BatchGetItem'})
|
||||
test_table_s.meta.client.batch_get_item(RequestItems = {
|
||||
test_table_s.name: {'Keys': [{'p': random_string()}], 'ConsistentRead': True}})
|
||||
n2 = get_metric(metrics, 'scylla_alternator_operation', {'op': 'BatchGetItem'})
|
||||
assert n2 > n1
|
||||
|
||||
# TODO: check the rest of the operations
|
||||
@@ -431,3 +431,14 @@ def test_update_item_returnvalues_nested(test_table_s):
|
||||
ret=test_table_s.update_item(Key={'p': p}, ReturnValues='UPDATED_NEW',
|
||||
UpdateExpression='REMOVE a.c[1]')
|
||||
assert ret['Attributes'] == {'a': {'c': [70]}}
|
||||
|
||||
# A reproducer for issue #9542 - when UpdateExpression's REMOVE operation
|
||||
# actually deletes an existing attribute, it breaks the ALL_NEW ReturnValues
|
||||
# for other attributes set in the same command.
|
||||
def test_update_item_returnvalues_all_new_remove_etc(test_table_s):
|
||||
p = random_string()
|
||||
test_table_s.put_item(Item={'p': p, 's': 'dog', 'd': 'foo'})
|
||||
ret=test_table_s.update_item(Key={'p': p}, ReturnValues='ALL_NEW',
|
||||
UpdateExpression='REMOVE d SET s = :v',
|
||||
ExpressionAttributeValues={':v': 'cat'})
|
||||
assert ret['Attributes']['s'] == 'cat'
|
||||
|
||||
@@ -1014,6 +1014,20 @@ def test_nested_attribute_remove_from_missing_item(test_table_s):
|
||||
test_table_s.update_item(Key={'p': p}, UpdateExpression='REMOVE x.y')
|
||||
test_table_s.update_item(Key={'p': p}, UpdateExpression='REMOVE x[0]')
|
||||
|
||||
# Though in an above test (test_nested_attribute_update_bad_path_dot) we
|
||||
# showed that DynamoDB does not allow REMOVE x.y if attribute x doesn't
|
||||
# exist - and generates a ValidationException, if x *does* exist but y
|
||||
# doesn't, it's fine and the removal should just be silently ignored.
|
||||
def test_nested_attribute_remove_missing_leaf(test_table_s):
|
||||
p = random_string()
|
||||
item = {'p': p, 'a': {'x': 3}, 'b': ['hi']}
|
||||
test_table_s.put_item(Item=item)
|
||||
test_table_s.update_item(Key={'p': p}, UpdateExpression='REMOVE a.y')
|
||||
test_table_s.update_item(Key={'p': p}, UpdateExpression='REMOVE b[7]')
|
||||
test_table_s.update_item(Key={'p': p}, UpdateExpression='REMOVE c')
|
||||
# The above UpdateItem calls didn't change anything...
|
||||
assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item'] == item
|
||||
|
||||
# Similarly for other types of bad paths - using [0] on something which
|
||||
# doesn't exist or isn't an array.
|
||||
def test_nested_attribute_update_bad_path_array(test_table_s):
|
||||
|
||||
@@ -22,6 +22,7 @@
|
||||
#define BOOST_TEST_MODULE alternator
|
||||
#include <boost/test/included/unit_test.hpp>
|
||||
|
||||
#include <seastar/util/defer.hh>
|
||||
#include "alternator/base64.hh"
|
||||
|
||||
static bytes_view to_bytes_view(const std::string& s) {
|
||||
@@ -78,3 +79,22 @@ BOOST_AUTO_TEST_CASE(test_base64_begins_with) {
|
||||
BOOST_REQUIRE(!base64_begins_with(encoded_str3, encoded_non_prefix));
|
||||
}
|
||||
}
|
||||
|
||||
BOOST_AUTO_TEST_CASE(test_allocator_fail_gracefully) {
|
||||
// Unfortunately the address sanitizer fails if the allocator is not able
|
||||
// to allocate the requested memory. The test is therefore skipped for debug mode
|
||||
#ifndef DEBUG
|
||||
static constexpr size_t too_large_alloc_size = 0xffffffffff;
|
||||
rjson::allocator allocator;
|
||||
// Impossible allocation should throw
|
||||
BOOST_REQUIRE_THROW(allocator.Malloc(too_large_alloc_size), rjson::error);
|
||||
// So should impossible reallocation
|
||||
void* memory = allocator.Malloc(1);
|
||||
auto release = defer([memory] { rjson::allocator::Free(memory); });
|
||||
BOOST_REQUIRE_THROW(allocator.Realloc(memory, 1, too_large_alloc_size), rjson::error);
|
||||
// Internal rapidjson stack should also throw
|
||||
// and also be destroyed gracefully later
|
||||
rapidjson::internal::Stack stack(&allocator, 0);
|
||||
BOOST_REQUIRE_THROW(stack.Push<char>(too_large_alloc_size), rjson::error);
|
||||
#endif
|
||||
}
|
||||
@@ -28,6 +28,7 @@
|
||||
#include <unordered_map>
|
||||
#include <unordered_set>
|
||||
#include <set>
|
||||
#include <deque>
|
||||
|
||||
#include <seastar/testing/test_case.hh>
|
||||
#include <seastar/core/coroutine.hh>
|
||||
@@ -249,7 +250,7 @@ SEASTAR_TEST_CASE(test_commitlog_discard_completed_segments){
|
||||
}).then([&log] {
|
||||
return log.shutdown().then([&log] {
|
||||
return log.list_existing_segments().then([] (auto descs) {
|
||||
BOOST_REQUIRE(descs.empty());
|
||||
BOOST_CHECK_EQUAL(descs, decltype(descs){});
|
||||
});
|
||||
});
|
||||
});
|
||||
@@ -748,3 +749,129 @@ SEASTAR_TEST_CASE(test_commitlog_new_segment_odsync){
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
// Test for #8363
|
||||
// try to provoke edge case where we race segment deletion
|
||||
// and waiting for recycled to be replenished.
|
||||
SEASTAR_TEST_CASE(test_commitlog_deadlock_in_recycle) {
|
||||
commitlog::config cfg;
|
||||
|
||||
constexpr auto max_size_mb = 2;
|
||||
cfg.commitlog_segment_size_in_mb = max_size_mb;
|
||||
// ensure total size per shard is not multiple of segment size.
|
||||
cfg.commitlog_total_space_in_mb = 5 * smp::count;
|
||||
cfg.commitlog_sync_period_in_ms = 10;
|
||||
cfg.reuse_segments = true;
|
||||
cfg.allow_going_over_size_limit = false;
|
||||
cfg.use_o_dsync = true; // make sure we pre-allocate.
|
||||
|
||||
// not using cl_test, because we need to be able to abandon
|
||||
// the log.
|
||||
|
||||
tmpdir tmp;
|
||||
cfg.commit_log_location = tmp.path().string();
|
||||
auto log = co_await commitlog::create_commitlog(cfg);
|
||||
|
||||
rp_set rps;
|
||||
std::deque<rp_set> queue;
|
||||
size_t n = 0;
|
||||
|
||||
// uncomment for verbosity
|
||||
// logging::logger_registry().set_logger_level("commitlog", logging::log_level::debug);
|
||||
|
||||
auto uuid = utils::UUID_gen::get_time_UUID();
|
||||
auto size = log.max_record_size() / 2;
|
||||
|
||||
timer<> t;
|
||||
t.set_callback([&] {
|
||||
while (!queue.empty()) {
|
||||
auto flush = std::move(queue.front());
|
||||
queue.pop_front();
|
||||
log.discard_completed_segments(uuid, flush);
|
||||
++n;
|
||||
};
|
||||
});
|
||||
|
||||
// add a flush handler that delays releasing things until disk threshold is reached.
|
||||
auto r = log.add_flush_handler([&](cf_id_type, replay_position pos) {
|
||||
auto old = std::exchange(rps, rp_set{});
|
||||
queue.emplace_back(std::move(old));
|
||||
if (log.disk_footprint() >= log.disk_limit() && !t.armed()) {
|
||||
t.arm(5s);
|
||||
}
|
||||
});
|
||||
|
||||
bool release = true;
|
||||
|
||||
try {
|
||||
while (n < 10) {
|
||||
auto now = timeout_clock::now();
|
||||
rp_handle h = co_await with_timeout(now + 30s, log.add_mutation(uuid, size, db::commitlog::force_sync::no, [&](db::commitlog::output& dst) {
|
||||
dst.fill('1', size);
|
||||
}));
|
||||
rps.put(std::move(h));
|
||||
}
|
||||
} catch (timed_out_error&) {
|
||||
BOOST_FAIL("log write timed out. maybe it is deadlocked... Will not free log. ASAN errors and leaks will follow...");
|
||||
release = false;
|
||||
}
|
||||
|
||||
if (release) {
|
||||
co_await log.shutdown();
|
||||
co_await log.clear();
|
||||
}
|
||||
}
|
||||
|
||||
SEASTAR_TEST_CASE(test_commitlog_deadlock_with_flush_threshold) {
|
||||
commitlog::config cfg;
|
||||
|
||||
constexpr auto max_size_mb = 1;
|
||||
|
||||
cfg.commitlog_segment_size_in_mb = max_size_mb;
|
||||
cfg.commitlog_total_space_in_mb = 2 * max_size_mb * smp::count;
|
||||
cfg.commitlog_sync_period_in_ms = 10;
|
||||
cfg.reuse_segments = true;
|
||||
cfg.allow_going_over_size_limit = false;
|
||||
cfg.use_o_dsync = true; // make sure we pre-allocate.
|
||||
|
||||
// not using cl_test, because we need to be able to abandon
|
||||
// the log.
|
||||
|
||||
tmpdir tmp;
|
||||
cfg.commit_log_location = tmp.path().string();
|
||||
auto log = co_await commitlog::create_commitlog(cfg);
|
||||
|
||||
rp_set rps;
|
||||
// uncomment for verbosity
|
||||
// logging::logger_registry().set_logger_level("commitlog", logging::log_level::debug);
|
||||
|
||||
auto uuid = utils::UUID_gen::get_time_UUID();
|
||||
auto size = log.max_record_size();
|
||||
|
||||
bool done = false;
|
||||
|
||||
auto r = log.add_flush_handler([&](cf_id_type id, replay_position pos) {
|
||||
log.discard_completed_segments(id, rps);
|
||||
done = true;
|
||||
});
|
||||
|
||||
bool release = true;
|
||||
|
||||
try {
|
||||
while (!done) {
|
||||
auto now = timeout_clock::now();
|
||||
rp_handle h = co_await with_timeout(now + 30s, log.add_mutation(uuid, size, db::commitlog::force_sync::no, [&](db::commitlog::output& dst) {
|
||||
dst.fill('1', size);
|
||||
}));
|
||||
rps.put(std::move(h));
|
||||
}
|
||||
} catch (timed_out_error&) {
|
||||
BOOST_FAIL("log write timed out. maybe it is deadlocked... Will not free log. ASAN errors and leaks will follow...");
|
||||
release = false;
|
||||
}
|
||||
|
||||
if (release) {
|
||||
co_await log.shutdown();
|
||||
co_await log.clear();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -936,6 +936,7 @@ SEASTAR_TEST_CASE(test_parse_experimental_features_cdc) {
|
||||
BOOST_CHECK(!cfg.check_experimental(ef::UNUSED));
|
||||
BOOST_CHECK(!cfg.check_experimental(ef::UDF));
|
||||
BOOST_CHECK(!cfg.check_experimental(ef::ALTERNATOR_STREAMS));
|
||||
BOOST_CHECK(!cfg.check_experimental(ef::RAFT));
|
||||
return make_ready_future();
|
||||
}
|
||||
|
||||
@@ -948,6 +949,7 @@ SEASTAR_TEST_CASE(test_parse_experimental_features_unused) {
|
||||
BOOST_CHECK(cfg.check_experimental(ef::UNUSED));
|
||||
BOOST_CHECK(!cfg.check_experimental(ef::UDF));
|
||||
BOOST_CHECK(!cfg.check_experimental(ef::ALTERNATOR_STREAMS));
|
||||
BOOST_CHECK(!cfg.check_experimental(ef::RAFT));
|
||||
return make_ready_future();
|
||||
}
|
||||
|
||||
@@ -960,6 +962,7 @@ SEASTAR_TEST_CASE(test_parse_experimental_features_udf) {
|
||||
BOOST_CHECK(!cfg.check_experimental(ef::UNUSED));
|
||||
BOOST_CHECK(cfg.check_experimental(ef::UDF));
|
||||
BOOST_CHECK(!cfg.check_experimental(ef::ALTERNATOR_STREAMS));
|
||||
BOOST_CHECK(!cfg.check_experimental(ef::RAFT));
|
||||
return make_ready_future();
|
||||
}
|
||||
|
||||
@@ -972,6 +975,20 @@ SEASTAR_TEST_CASE(test_parse_experimental_features_alternator_streams) {
|
||||
BOOST_CHECK(!cfg.check_experimental(ef::UNUSED));
|
||||
BOOST_CHECK(!cfg.check_experimental(ef::UDF));
|
||||
BOOST_CHECK(cfg.check_experimental(ef::ALTERNATOR_STREAMS));
|
||||
BOOST_CHECK(!cfg.check_experimental(ef::RAFT));
|
||||
return make_ready_future();
|
||||
}
|
||||
|
||||
SEASTAR_TEST_CASE(test_parse_experimental_features_raft) {
|
||||
auto cfg_ptr = std::make_unique<config>();
|
||||
config& cfg = *cfg_ptr;
|
||||
cfg.read_from_yaml("experimental_features:\n - raft\n", throw_on_error);
|
||||
BOOST_CHECK_EQUAL(cfg.experimental_features(), features{ef::RAFT});
|
||||
BOOST_CHECK(!cfg.check_experimental(ef::UNUSED_CDC));
|
||||
BOOST_CHECK(!cfg.check_experimental(ef::UNUSED));
|
||||
BOOST_CHECK(!cfg.check_experimental(ef::UDF));
|
||||
BOOST_CHECK(!cfg.check_experimental(ef::ALTERNATOR_STREAMS));
|
||||
BOOST_CHECK(cfg.check_experimental(ef::RAFT));
|
||||
return make_ready_future();
|
||||
}
|
||||
|
||||
@@ -1011,6 +1028,7 @@ SEASTAR_TEST_CASE(test_parse_experimental_true) {
|
||||
BOOST_CHECK(!cfg.check_experimental(ef::UNUSED));
|
||||
BOOST_CHECK(cfg.check_experimental(ef::UDF));
|
||||
BOOST_CHECK(cfg.check_experimental(ef::ALTERNATOR_STREAMS));
|
||||
BOOST_CHECK(!cfg.check_experimental(ef::RAFT));
|
||||
return make_ready_future();
|
||||
}
|
||||
|
||||
@@ -1022,5 +1040,6 @@ SEASTAR_TEST_CASE(test_parse_experimental_false) {
|
||||
BOOST_CHECK(!cfg.check_experimental(ef::UNUSED));
|
||||
BOOST_CHECK(!cfg.check_experimental(ef::UDF));
|
||||
BOOST_CHECK(!cfg.check_experimental(ef::ALTERNATOR_STREAMS));
|
||||
BOOST_CHECK(!cfg.check_experimental(ef::RAFT));
|
||||
return make_ready_future();
|
||||
}
|
||||
|
||||
@@ -22,6 +22,8 @@
|
||||
#include <seastar/testing/test_case.hh>
|
||||
#include "test/lib/cql_test_env.hh"
|
||||
#include "test/lib/cql_assertions.hh"
|
||||
#include "cql3/untyped_result_set.hh"
|
||||
#include "cql3/query_processor.hh"
|
||||
#include "transport/messages/result_message.hh"
|
||||
|
||||
SEASTAR_TEST_CASE(test_index_with_paging) {
|
||||
@@ -48,3 +50,51 @@ SEASTAR_TEST_CASE(test_index_with_paging) {
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
SEASTAR_TEST_CASE(test_index_with_paging_with_base_short_read) {
|
||||
return do_with_cql_env_thread([] (auto& e) {
|
||||
e.execute_cql("CREATE TABLE tab (pk int, ck text, v int, v2 int, v3 text, PRIMARY KEY (pk, ck))").get();
|
||||
e.execute_cql("CREATE INDEX ON tab (v)").get();
|
||||
|
||||
// Enough to trigger a short read on the base table during scan
|
||||
sstring big_string(2 * query::result_memory_limiter::maximum_result_size, 'j');
|
||||
|
||||
const int row_count = 67;
|
||||
for (int i = 0; i < row_count; ++i) {
|
||||
e.execute_cql(format("INSERT INTO tab (pk, ck, v, v2, v3) VALUES ({}, 'hello{}', 1, {}, '{}')", i % 3, i, i, big_string)).get();
|
||||
}
|
||||
|
||||
eventually([&] {
|
||||
uint64_t count = 0;
|
||||
e.qp().local().query_internal("SELECT * FROM ks.tab WHERE v = 1", [&] (const cql3::untyped_result_set_row&) {
|
||||
++count;
|
||||
return make_ready_future<stop_iteration>(stop_iteration::no);
|
||||
}).get();
|
||||
BOOST_REQUIRE_EQUAL(count, row_count);
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
SEASTAR_TEST_CASE(test_index_with_paging_with_base_short_read_no_ck) {
|
||||
return do_with_cql_env_thread([] (auto& e) {
|
||||
e.execute_cql("CREATE TABLE tab (pk int, v int, v2 int, v3 text, PRIMARY KEY (pk))").get();
|
||||
e.execute_cql("CREATE INDEX ON tab (v)").get();
|
||||
|
||||
// Enough to trigger a short read on the base table during scan
|
||||
sstring big_string(2 * query::result_memory_limiter::maximum_result_size, 'j');
|
||||
|
||||
const int row_count = 67;
|
||||
for (int i = 0; i < row_count; ++i) {
|
||||
e.execute_cql(format("INSERT INTO tab (pk, v, v2, v3) VALUES ({}, 1, {}, '{}')", i, i, big_string)).get();
|
||||
}
|
||||
|
||||
eventually([&] {
|
||||
uint64_t count = 0;
|
||||
e.qp().local().query_internal("SELECT * FROM ks.tab WHERE v = 1", [&] (const cql3::untyped_result_set_row&) {
|
||||
++count;
|
||||
return make_ready_future<stop_iteration>(stop_iteration::no);
|
||||
}).get();
|
||||
BOOST_REQUIRE_EQUAL(count, row_count);
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
@@ -974,14 +974,7 @@ SEASTAR_THREAD_TEST_CASE(fuzzy_test) {
|
||||
|
||||
const auto& partitions = pop_desc.partitions;
|
||||
smp::invoke_on_all([cfg, db = &env.db(), gs = global_schema_ptr(pop_desc.schema), &partitions] {
|
||||
auto s = gs.get();
|
||||
auto& sem = db->local().get_reader_concurrency_semaphore();
|
||||
|
||||
auto resources = sem.available_resources();
|
||||
resources -= reader_concurrency_semaphore::resources{1, 0};
|
||||
auto permit = sem.make_permit(s.get(), "fuzzy-test");
|
||||
|
||||
return run_fuzzy_test_workload(cfg, *db, std::move(s), partitions).finally([units = permit.consume_resources(resources)] {});
|
||||
return run_fuzzy_test_workload(cfg, *db, gs.get(), partitions);
|
||||
}).handle_exception([seed] (std::exception_ptr e) {
|
||||
testlog.error("Test workload failed with exception {}."
|
||||
" To repeat this particular run, replace the random seed of the test, with that of this run ({})."
|
||||
|
||||
@@ -970,6 +970,192 @@ SEASTAR_THREAD_TEST_CASE(test_reader_concurrency_semaphore_destroyed_permit_rele
|
||||
BOOST_REQUIRE(semaphore.available_resources() == initial_resources);
|
||||
}
|
||||
|
||||
// This unit test passes a read through admission again-and-again, just
|
||||
// like an evictable reader would be during its lifetime. When readmitted
|
||||
// the read sometimes has to wait and sometimes not. This is to check that
|
||||
// the readmitting a previously admitted reader doesn't leak any units.
|
||||
SEASTAR_THREAD_TEST_CASE(test_reader_concurrency_semaphore_readmission_preserves_units) {
|
||||
simple_schema s;
|
||||
const auto initial_resources = reader_concurrency_semaphore::resources{10, 1024 * 1024};
|
||||
reader_concurrency_semaphore semaphore(initial_resources.count, initial_resources.memory, get_name());
|
||||
|
||||
auto permit = semaphore.make_permit(s.schema().get(), get_name());
|
||||
|
||||
std::optional<reader_permit::resource_units> residue_units;
|
||||
|
||||
for (int i = 0; i < 10; ++i) {
|
||||
const auto have_residue_units = bool(residue_units);
|
||||
|
||||
auto current_resources = initial_resources;
|
||||
if (have_residue_units) {
|
||||
current_resources -= residue_units->resources();
|
||||
}
|
||||
BOOST_REQUIRE(semaphore.available_resources() == current_resources);
|
||||
|
||||
std::optional<reader_permit::resource_units> admitted_units;
|
||||
if (i % 2) {
|
||||
const auto consumed_resources = semaphore.available_resources();
|
||||
semaphore.consume(consumed_resources);
|
||||
|
||||
auto units_fut = permit.wait_admission(1024, db::no_timeout);
|
||||
BOOST_REQUIRE(!units_fut.available());
|
||||
|
||||
semaphore.signal(consumed_resources);
|
||||
admitted_units = units_fut.get();
|
||||
} else {
|
||||
admitted_units = permit.wait_admission(1024, db::no_timeout).get();
|
||||
}
|
||||
|
||||
current_resources -= admitted_units->resources();
|
||||
BOOST_REQUIRE(semaphore.available_resources() == current_resources);
|
||||
|
||||
residue_units.emplace(permit.consume_resources(reader_resources(0, 100)));
|
||||
if (!have_residue_units) {
|
||||
current_resources -= residue_units->resources();
|
||||
}
|
||||
BOOST_REQUIRE(semaphore.available_resources() == current_resources);
|
||||
|
||||
auto handle = semaphore.register_inactive_read(make_empty_flat_reader(s.schema(), permit));
|
||||
BOOST_REQUIRE(semaphore.try_evict_one_inactive_read());
|
||||
}
|
||||
|
||||
BOOST_REQUIRE(semaphore.available_resources() == initial_resources - residue_units->resources());
|
||||
|
||||
residue_units.reset();
|
||||
|
||||
BOOST_REQUIRE(semaphore.available_resources() == initial_resources);
|
||||
}
|
||||
|
||||
// This unit test checks that the semaphore doesn't get into a deadlock
|
||||
// when contended, in the presence of many memory-only reads (that don't
|
||||
// wait for admission). This is tested by simulating the 3 kind of reads we
|
||||
// currently have in the system:
|
||||
// * memory-only: reads that don't pass admission and only own memory.
|
||||
// * admitted: reads that pass admission.
|
||||
// * evictable: admitted reads that are furthermore evictable.
|
||||
//
|
||||
// The test creates and runs a large number of these reads in parallel,
|
||||
// read kinds being selected randomly, then creates a watchdog which
|
||||
// kills the test if no progress is being made.
|
||||
SEASTAR_THREAD_TEST_CASE(test_reader_concurrency_semaphore_forward_progress) {
|
||||
class reader {
|
||||
class skeleton_reader : public flat_mutation_reader::impl {
|
||||
reader_permit::resource_units _base_resources;
|
||||
std::optional<reader_permit::resource_units> _resources;
|
||||
public:
|
||||
skeleton_reader(schema_ptr s, reader_permit permit, reader_permit::resource_units res)
|
||||
: impl(std::move(s), std::move(permit)), _base_resources(std::move(res)) { }
|
||||
virtual future<> fill_buffer(db::timeout_clock::time_point timeout) override {
|
||||
_resources.emplace(_permit.consume_resources(reader_resources(0, tests::random::get_int(1024, 2048))));
|
||||
return make_ready_future<>();
|
||||
}
|
||||
virtual future<> next_partition() override { return make_ready_future<>(); }
|
||||
virtual future<> fast_forward_to(const dht::partition_range& pr, db::timeout_clock::time_point timeout) override { return make_ready_future<>(); }
|
||||
virtual future<> fast_forward_to(position_range, db::timeout_clock::time_point timeout) override { return make_ready_future<>(); }
|
||||
};
|
||||
struct reader_visitor {
|
||||
reader& r;
|
||||
future<> operator()(std::monostate& ms) { return r.tick(ms); }
|
||||
future<> operator()(flat_mutation_reader& reader) { return r.tick(reader); }
|
||||
future<> operator()(reader_concurrency_semaphore::inactive_read_handle& handle) { return r.tick(handle); }
|
||||
};
|
||||
|
||||
private:
|
||||
schema_ptr _schema;
|
||||
reader_permit _permit;
|
||||
bool _memory_only = true;
|
||||
bool _evictable = false;
|
||||
std::optional<reader_permit::resource_units> _units;
|
||||
std::variant<std::monostate, flat_mutation_reader, reader_concurrency_semaphore::inactive_read_handle> _reader;
|
||||
|
||||
private:
|
||||
future<> make_reader() {
|
||||
auto res = _permit.consume_memory();
|
||||
if (!_memory_only) {
|
||||
res = co_await _permit.wait_admission(1024, db::no_timeout);
|
||||
}
|
||||
_reader = make_flat_mutation_reader<skeleton_reader>(_schema, _permit, std::move(res));
|
||||
}
|
||||
future<> tick(std::monostate&) {
|
||||
co_await make_reader();
|
||||
co_await tick(std::get<flat_mutation_reader>(_reader));
|
||||
}
|
||||
future<> tick(flat_mutation_reader& reader) {
|
||||
co_await reader.fill_buffer(db::no_timeout);
|
||||
if (_evictable) {
|
||||
_reader = _permit.semaphore().register_inactive_read(std::move(reader));
|
||||
}
|
||||
}
|
||||
future<> tick(reader_concurrency_semaphore::inactive_read_handle& handle) {
|
||||
if (auto reader = _permit.semaphore().unregister_inactive_read(std::move(handle)); reader) {
|
||||
_reader = std::move(*reader);
|
||||
} else {
|
||||
co_await make_reader();
|
||||
}
|
||||
co_await tick(std::get<flat_mutation_reader>(_reader));
|
||||
}
|
||||
|
||||
public:
|
||||
reader(schema_ptr s, reader_permit permit, bool memory_only, bool evictable)
|
||||
: _schema(std::move(s))
|
||||
, _permit(std::move(permit))
|
||||
, _memory_only(memory_only)
|
||||
, _evictable(evictable)
|
||||
, _units(_permit.consume_memory(tests::random::get_int(128, 1024)))
|
||||
{
|
||||
}
|
||||
future<> tick() {
|
||||
return std::visit(reader_visitor{*this}, _reader);
|
||||
}
|
||||
};
|
||||
|
||||
const auto count = 10;
|
||||
const auto num_readers = 512;
|
||||
const auto ticks = 1000;
|
||||
|
||||
simple_schema s;
|
||||
reader_concurrency_semaphore semaphore(count, count * 1024, get_name());
|
||||
|
||||
std::list<std::optional<reader>> readers;
|
||||
unsigned nr_memory_only = 0;
|
||||
unsigned nr_admitted = 0;
|
||||
unsigned nr_evictable = 0;
|
||||
|
||||
for (auto i = 0; i < num_readers; ++i) {
|
||||
const auto memory_only = tests::random::get_bool();
|
||||
const auto evictable = !memory_only && tests::random::get_bool();
|
||||
if (memory_only) {
|
||||
++nr_memory_only;
|
||||
} else if (evictable) {
|
||||
++nr_evictable;
|
||||
} else {
|
||||
++nr_admitted;
|
||||
}
|
||||
readers.emplace_back(reader(s.schema(), semaphore.make_permit(s.schema().get(), fmt::format("reader{}", i)), memory_only, evictable));
|
||||
}
|
||||
|
||||
testlog.info("Created {} readers, memory_only={}, admitted={}, evictable={}", readers.size(), nr_memory_only, nr_admitted, nr_evictable);
|
||||
|
||||
bool watchdog_touched = false;
|
||||
auto watchdog = timer<db::timeout_clock>([&semaphore, &watchdog_touched] {
|
||||
if (!watchdog_touched) {
|
||||
testlog.error("Watchdog detected a deadlock, dumping diagnostics before killing the test: {}", semaphore.dump_diagnostics());
|
||||
semaphore.broken(std::make_exception_ptr(std::runtime_error("test killed by watchdog")));
|
||||
}
|
||||
watchdog_touched = false;
|
||||
});
|
||||
watchdog.arm_periodic(std::chrono::seconds(30));
|
||||
|
||||
parallel_for_each(readers, [&] (std::optional<reader>& r) -> future<> {
|
||||
for (auto i = 0; i < ticks; ++i) {
|
||||
watchdog_touched = true;
|
||||
co_await r->tick();
|
||||
}
|
||||
r.reset();
|
||||
watchdog_touched = true;
|
||||
}).get();
|
||||
}
|
||||
|
||||
static
|
||||
sstables::shared_sstable create_sstable(sstables::test_env& env, schema_ptr s, std::vector<mutation> mutations) {
|
||||
static thread_local auto tmp = tmpdir();
|
||||
@@ -3240,39 +3426,30 @@ flat_mutation_reader create_evictable_reader_and_evict_after_first_buffer(
|
||||
reader_permit permit,
|
||||
const dht::partition_range& prange,
|
||||
const query::partition_slice& slice,
|
||||
std::deque<mutation_fragment> first_buffer,
|
||||
position_in_partition_view last_fragment_position,
|
||||
std::deque<mutation_fragment> second_buffer,
|
||||
size_t max_buffer_size) {
|
||||
std::list<std::deque<mutation_fragment>> buffers,
|
||||
position_in_partition_view first_buf_last_fragment_position,
|
||||
size_t max_buffer_size,
|
||||
bool detach_buffer = true) {
|
||||
class factory {
|
||||
schema_ptr _schema;
|
||||
reader_permit _permit;
|
||||
std::optional<std::deque<mutation_fragment>> _first_buffer;
|
||||
std::optional<std::deque<mutation_fragment>> _second_buffer;
|
||||
std::list<std::deque<mutation_fragment>> _buffers;
|
||||
size_t _max_buffer_size;
|
||||
|
||||
private:
|
||||
std::optional<std::deque<mutation_fragment>> copy_buffer(const std::optional<std::deque<mutation_fragment>>& o) {
|
||||
if (!o) {
|
||||
return {};
|
||||
}
|
||||
return copy_fragments(*_schema, _permit, *o);
|
||||
}
|
||||
|
||||
public:
|
||||
factory(schema_ptr schema, reader_permit permit, std::deque<mutation_fragment> first_buffer, std::deque<mutation_fragment> second_buffer, size_t max_buffer_size)
|
||||
factory(schema_ptr schema, reader_permit permit, std::list<std::deque<mutation_fragment>> buffers, size_t max_buffer_size)
|
||||
: _schema(std::move(schema))
|
||||
, _permit(std::move(permit))
|
||||
, _first_buffer(std::move(first_buffer))
|
||||
, _second_buffer(std::move(second_buffer))
|
||||
, _buffers(std::move(buffers))
|
||||
, _max_buffer_size(max_buffer_size) {
|
||||
}
|
||||
|
||||
factory(const factory& o)
|
||||
: _schema(o._schema)
|
||||
, _permit(o._permit)
|
||||
, _first_buffer(copy_buffer(o._first_buffer))
|
||||
, _second_buffer(copy_buffer(o._second_buffer)) {
|
||||
, _permit(o._permit) {
|
||||
for (const auto& buf : o._buffers) {
|
||||
_buffers.emplace_back(copy_fragments(*_schema, _permit, buf));
|
||||
}
|
||||
}
|
||||
factory(factory&& o) = default;
|
||||
|
||||
@@ -3286,14 +3463,9 @@ flat_mutation_reader create_evictable_reader_and_evict_after_first_buffer(
|
||||
streamed_mutation::forwarding fwd_sm,
|
||||
mutation_reader::forwarding fwd_mr) {
|
||||
BOOST_REQUIRE(s == _schema);
|
||||
if (_first_buffer) {
|
||||
auto buf = *std::exchange(_first_buffer, {});
|
||||
auto rd = make_flat_mutation_reader_from_fragments(_schema, std::move(permit), std::move(buf));
|
||||
rd.set_max_buffer_size(_max_buffer_size);
|
||||
return rd;
|
||||
}
|
||||
if (_second_buffer) {
|
||||
auto buf = *std::exchange(_second_buffer, {});
|
||||
if (!_buffers.empty()) {
|
||||
auto buf = std::move(_buffers.front());
|
||||
_buffers.pop_front();
|
||||
auto rd = make_flat_mutation_reader_from_fragments(_schema, std::move(permit), std::move(buf));
|
||||
rd.set_max_buffer_size(_max_buffer_size);
|
||||
return rd;
|
||||
@@ -3301,9 +3473,9 @@ flat_mutation_reader create_evictable_reader_and_evict_after_first_buffer(
|
||||
return make_empty_flat_reader(_schema, std::move(permit));
|
||||
}
|
||||
};
|
||||
auto ms = mutation_source(factory(schema, permit, std::move(first_buffer), std::move(second_buffer), max_buffer_size));
|
||||
auto ms = mutation_source(factory(schema, permit, std::move(buffers), max_buffer_size));
|
||||
|
||||
auto [rd, handle] = make_manually_paused_evictable_reader(
|
||||
auto rd = make_auto_paused_evictable_reader(
|
||||
std::move(ms),
|
||||
schema,
|
||||
permit,
|
||||
@@ -3319,18 +3491,42 @@ flat_mutation_reader create_evictable_reader_and_evict_after_first_buffer(
|
||||
|
||||
const auto eq_cmp = position_in_partition::equal_compare(*schema);
|
||||
BOOST_REQUIRE(rd.is_buffer_full());
|
||||
BOOST_REQUIRE(eq_cmp(rd.buffer().back().position(), last_fragment_position));
|
||||
BOOST_REQUIRE(eq_cmp(rd.buffer().back().position(), first_buf_last_fragment_position));
|
||||
BOOST_REQUIRE(!rd.is_end_of_stream());
|
||||
|
||||
rd.detach_buffer();
|
||||
|
||||
handle.pause();
|
||||
if (detach_buffer) {
|
||||
rd.detach_buffer();
|
||||
}
|
||||
|
||||
while(permit.semaphore().try_evict_one_inactive_read());
|
||||
|
||||
return std::move(rd);
|
||||
}
|
||||
|
||||
flat_mutation_reader create_evictable_reader_and_evict_after_first_buffer(
|
||||
schema_ptr schema,
|
||||
reader_permit permit,
|
||||
const dht::partition_range& prange,
|
||||
const query::partition_slice& slice,
|
||||
std::deque<mutation_fragment> first_buffer,
|
||||
position_in_partition_view last_fragment_position,
|
||||
std::deque<mutation_fragment> last_buffer,
|
||||
size_t max_buffer_size,
|
||||
bool detach_buffer = true) {
|
||||
std::list<std::deque<mutation_fragment>> list;
|
||||
list.emplace_back(std::move(first_buffer));
|
||||
list.emplace_back(std::move(last_buffer));
|
||||
return create_evictable_reader_and_evict_after_first_buffer(
|
||||
std::move(schema),
|
||||
std::move(permit),
|
||||
prange,
|
||||
slice,
|
||||
std::move(list),
|
||||
last_fragment_position,
|
||||
max_buffer_size,
|
||||
detach_buffer);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
SEASTAR_THREAD_TEST_CASE(test_evictable_reader_trim_range_tombstones) {
|
||||
@@ -3632,7 +3828,7 @@ SEASTAR_THREAD_TEST_CASE(test_evictable_reader_self_validation) {
|
||||
|
||||
check_evictable_reader_validation_is_triggered(
|
||||
"pkey > _last_pkey; pkey ∈ pkrange",
|
||||
partition_error_prefix,
|
||||
"",
|
||||
s.schema(),
|
||||
permit,
|
||||
prange,
|
||||
@@ -3828,12 +4024,232 @@ SEASTAR_THREAD_TEST_CASE(test_evictable_reader_recreate_before_fast_forward_to)
|
||||
reader_assert.produces(pkeys[4]);
|
||||
}
|
||||
|
||||
SEASTAR_THREAD_TEST_CASE(test_evictable_reader_drop_flags) {
|
||||
reader_concurrency_semaphore semaphore(1, 0, get_name());
|
||||
simple_schema s;
|
||||
auto permit = semaphore.make_permit(s.schema().get(), get_name());
|
||||
|
||||
auto pkeys = s.make_pkeys(2);
|
||||
std::sort(pkeys.begin(), pkeys.end(), [&s] (const auto& pk1, const auto& pk2) {
|
||||
return pk1.less_compare(*s.schema(), pk2);
|
||||
});
|
||||
const auto& pkey1 = pkeys[0];
|
||||
const auto& pkey2 = pkeys[1];
|
||||
const int second_buffer_ck = 10;
|
||||
|
||||
struct buffer {
|
||||
simple_schema& s;
|
||||
reader_permit permit;
|
||||
std::deque<mutation_fragment> frags;
|
||||
std::vector<mutation> muts;
|
||||
size_t size = 0;
|
||||
std::optional<position_in_partition_view> last_pos;
|
||||
|
||||
buffer(simple_schema& s_, reader_permit permit_, dht::decorated_key key)
|
||||
: s(s_), permit(std::move(permit_)) {
|
||||
add_partition(key);
|
||||
}
|
||||
size_t add_partition(dht::decorated_key key) {
|
||||
size += frags.emplace_back(*s.schema(), permit, partition_start{key, {}}).memory_usage();
|
||||
muts.emplace_back(s.schema(), key);
|
||||
return size;
|
||||
}
|
||||
size_t add_mutation_fragment(mutation_fragment&& mf, bool only_to_frags = false) {
|
||||
if (!only_to_frags) {
|
||||
muts.back().apply(mf);
|
||||
}
|
||||
size += frags.emplace_back(*s.schema(), permit, std::move(mf)).memory_usage();
|
||||
return size;
|
||||
}
|
||||
size_t add_static_row(std::optional<mutation_fragment> sr = {}) {
|
||||
auto srow = sr ? std::move(*sr) : s.make_static_row("s");
|
||||
return add_mutation_fragment(std::move(srow));
|
||||
}
|
||||
size_t add_clustering_row(int i, bool only_to_frags = false) {
|
||||
return add_mutation_fragment(mutation_fragment(*s.schema(), permit, s.make_row(s.make_ckey(i), "v")), only_to_frags);
|
||||
}
|
||||
size_t add_clustering_rows(int start, int end) {
|
||||
for (int i = start; i < end; ++i) {
|
||||
add_clustering_row(i);
|
||||
}
|
||||
return size;
|
||||
}
|
||||
size_t add_partition_end() {
|
||||
size += frags.emplace_back(*s.schema(), permit, partition_end{}).memory_usage();
|
||||
return size;
|
||||
}
|
||||
void save_position() { last_pos = frags.back().position(); }
|
||||
void find_position(size_t buf_size) {
|
||||
size_t s = 0;
|
||||
for (const auto& frag : frags) {
|
||||
s += frag.memory_usage();
|
||||
if (s >= buf_size) {
|
||||
last_pos = frag.position();
|
||||
break;
|
||||
}
|
||||
}
|
||||
BOOST_REQUIRE(last_pos);
|
||||
}
|
||||
};
|
||||
|
||||
auto make_reader = [&] (const buffer& first_buffer, const buffer& second_buffer, const buffer* const third_buffer, size_t max_buffer_size) {
|
||||
std::list<std::deque<mutation_fragment>> buffers;
|
||||
buffers.emplace_back(copy_fragments(*s.schema(), permit, first_buffer.frags));
|
||||
buffers.emplace_back(copy_fragments(*s.schema(), permit, second_buffer.frags));
|
||||
if (third_buffer) {
|
||||
buffers.emplace_back(copy_fragments(*s.schema(), permit, third_buffer->frags));
|
||||
}
|
||||
return create_evictable_reader_and_evict_after_first_buffer(
|
||||
s.schema(),
|
||||
permit,
|
||||
query::full_partition_range,
|
||||
s.schema()->full_slice(),
|
||||
std::move(buffers),
|
||||
*first_buffer.last_pos,
|
||||
max_buffer_size,
|
||||
false);
|
||||
};
|
||||
|
||||
testlog.info("Same partition, with static row");
|
||||
{
|
||||
buffer first_buffer(s, permit, pkey1);
|
||||
first_buffer.add_static_row();
|
||||
auto srow = mutation_fragment(*s.schema(), permit, first_buffer.frags.back());
|
||||
const auto buf_size = first_buffer.add_clustering_rows(0, second_buffer_ck);
|
||||
first_buffer.save_position();
|
||||
first_buffer.add_clustering_row(second_buffer_ck);
|
||||
|
||||
buffer second_buffer(s, permit, pkey1);
|
||||
second_buffer.add_static_row(std::move(srow));
|
||||
second_buffer.add_clustering_row(second_buffer_ck);
|
||||
second_buffer.add_clustering_row(second_buffer_ck + 1);
|
||||
second_buffer.add_partition_end();
|
||||
|
||||
assert_that(make_reader(first_buffer, second_buffer, nullptr, buf_size))
|
||||
.has_monotonic_positions();
|
||||
|
||||
assert_that(make_reader(first_buffer, second_buffer, nullptr, buf_size))
|
||||
.produces(first_buffer.muts[0] + second_buffer.muts[0])
|
||||
.produces_end_of_stream();
|
||||
}
|
||||
|
||||
testlog.info("Same partition, no static row");
|
||||
{
|
||||
buffer first_buffer(s, permit, pkey1);
|
||||
const auto buf_size = first_buffer.add_clustering_rows(0, second_buffer_ck);
|
||||
first_buffer.save_position();
|
||||
first_buffer.add_clustering_row(second_buffer_ck);
|
||||
|
||||
buffer second_buffer(s, permit, pkey1);
|
||||
second_buffer.add_clustering_row(second_buffer_ck);
|
||||
second_buffer.add_clustering_row(second_buffer_ck + 1);
|
||||
second_buffer.add_partition_end();
|
||||
|
||||
assert_that(make_reader(first_buffer, second_buffer, nullptr, buf_size))
|
||||
.has_monotonic_positions();
|
||||
|
||||
assert_that(make_reader(first_buffer, second_buffer, nullptr, buf_size))
|
||||
.produces(first_buffer.muts[0] + second_buffer.muts[0])
|
||||
.produces_end_of_stream();
|
||||
}
|
||||
|
||||
testlog.info("Same partition as expected, no static row, next partition has static row (#8923)");
|
||||
{
|
||||
buffer second_buffer(s, permit, pkey1);
|
||||
second_buffer.add_clustering_rows(second_buffer_ck, second_buffer_ck + second_buffer_ck / 2);
|
||||
// We want to end the buffer on the partition-start below, but since a
|
||||
// partition start will be dropped from it, we have to use the size
|
||||
// without it.
|
||||
const auto buf_size = second_buffer.add_partition_end();
|
||||
second_buffer.add_partition(pkey2);
|
||||
second_buffer.add_static_row();
|
||||
auto srow = mutation_fragment(*s.schema(), permit, second_buffer.frags.back());
|
||||
second_buffer.add_clustering_rows(0, 2);
|
||||
|
||||
buffer first_buffer(s, permit, pkey1);
|
||||
for (int i = 0; first_buffer.add_clustering_row(i) < buf_size; ++i);
|
||||
first_buffer.save_position();
|
||||
first_buffer.add_mutation_fragment(mutation_fragment(*s.schema(), permit, second_buffer.frags[1]));
|
||||
|
||||
buffer third_buffer(s, permit, pkey2);
|
||||
third_buffer.add_static_row(std::move(srow));
|
||||
third_buffer.add_clustering_rows(0, 2);
|
||||
third_buffer.add_partition_end();
|
||||
|
||||
first_buffer.find_position(buf_size);
|
||||
|
||||
assert_that(make_reader(first_buffer, second_buffer, &third_buffer, buf_size))
|
||||
.has_monotonic_positions();
|
||||
|
||||
assert_that(make_reader(first_buffer, second_buffer, &third_buffer, buf_size))
|
||||
.produces(first_buffer.muts[0] + second_buffer.muts[0])
|
||||
.produces(second_buffer.muts[1] + third_buffer.muts[0])
|
||||
.produces_end_of_stream();
|
||||
}
|
||||
|
||||
testlog.info("Next partition, with no static row");
|
||||
{
|
||||
buffer first_buffer(s, permit, pkey1);
|
||||
const auto buf_size = first_buffer.add_clustering_rows(0, second_buffer_ck);
|
||||
first_buffer.save_position();
|
||||
first_buffer.add_clustering_row(second_buffer_ck + 1, true);
|
||||
|
||||
buffer second_buffer(s, permit, pkey2);
|
||||
second_buffer.add_clustering_rows(0, second_buffer_ck / 2);
|
||||
second_buffer.add_partition_end();
|
||||
|
||||
assert_that(make_reader(first_buffer, second_buffer, nullptr, buf_size))
|
||||
.has_monotonic_positions();
|
||||
|
||||
assert_that(make_reader(first_buffer, second_buffer, nullptr, buf_size))
|
||||
.produces(first_buffer.muts[0])
|
||||
.produces(second_buffer.muts[0])
|
||||
.produces_end_of_stream();
|
||||
}
|
||||
|
||||
testlog.info("Next partition, with static row");
|
||||
{
|
||||
buffer first_buffer(s, permit, pkey1);
|
||||
const auto buf_size = first_buffer.add_clustering_rows(0, second_buffer_ck);
|
||||
first_buffer.save_position();
|
||||
first_buffer.add_clustering_row(second_buffer_ck + 1, true);
|
||||
|
||||
buffer second_buffer(s, permit, pkey2);
|
||||
second_buffer.add_static_row();
|
||||
second_buffer.add_clustering_rows(0, second_buffer_ck / 2);
|
||||
second_buffer.add_partition_end();
|
||||
|
||||
assert_that(make_reader(first_buffer, second_buffer, nullptr, buf_size))
|
||||
.has_monotonic_positions();
|
||||
|
||||
assert_that(make_reader(first_buffer, second_buffer, nullptr, buf_size))
|
||||
.produces(first_buffer.muts[0])
|
||||
.produces(second_buffer.muts[0])
|
||||
.produces_end_of_stream();
|
||||
}
|
||||
}
|
||||
|
||||
struct mutation_bounds {
|
||||
mutation m;
|
||||
std::optional<mutation> m;
|
||||
position_in_partition lower;
|
||||
position_in_partition upper;
|
||||
};
|
||||
|
||||
static reader_bounds make_reader_bounds(
|
||||
schema_ptr s, reader_permit permit, mutation_bounds mb, streamed_mutation::forwarding fwd,
|
||||
const query::partition_slice* slice = nullptr) {
|
||||
if (!slice) {
|
||||
slice = &s->full_slice();
|
||||
}
|
||||
|
||||
return reader_bounds {
|
||||
.r = mb.m ? flat_mutation_reader_from_mutations(permit, {std::move(*mb.m)}, *slice, fwd)
|
||||
: make_empty_flat_reader(s, permit),
|
||||
.lower = std::move(mb.lower),
|
||||
.upper = std::move(mb.upper)
|
||||
};
|
||||
}
|
||||
|
||||
struct clustering_order_merger_test_generator {
|
||||
struct scenario {
|
||||
std::vector<mutation_bounds> readers_data;
|
||||
@@ -3843,13 +4259,13 @@ struct clustering_order_merger_test_generator {
|
||||
schema_ptr _s;
|
||||
partition_key _pk;
|
||||
|
||||
clustering_order_merger_test_generator()
|
||||
: _s(make_schema()), _pk(partition_key::from_single_value(*_s, int32_type->decompose(0)))
|
||||
clustering_order_merger_test_generator(std::optional<sstring> pk = std::nullopt)
|
||||
: _s(make_schema()), _pk(partition_key::from_single_value(*_s, utf8_type->decompose(pk ? *pk : make_local_key(make_schema()))))
|
||||
{}
|
||||
|
||||
static schema_ptr make_schema() {
|
||||
return schema_builder("ks", "t")
|
||||
.with_column("pk", int32_type, column_kind::partition_key)
|
||||
.with_column("pk", utf8_type, column_kind::partition_key)
|
||||
.with_column("ck", int32_type, column_kind::clustering_key)
|
||||
.with_column("v", int32_type, column_kind::regular_column)
|
||||
.build();
|
||||
@@ -3873,15 +4289,18 @@ struct clustering_order_merger_test_generator {
|
||||
return m;
|
||||
}
|
||||
|
||||
dht::decorated_key decorated_pk() const {
|
||||
return dht::decorate_key(*_s, _pk);
|
||||
}
|
||||
|
||||
scenario generate_scenario(std::mt19937& engine) const {
|
||||
std::set<int> all_ks;
|
||||
std::vector<mutation_bounds> readers_data;
|
||||
|
||||
auto num_readers = tests::random::get_int(1, 10, engine);
|
||||
auto num_empty_readers = tests::random::get_int(1, num_readers, engine);
|
||||
while (num_empty_readers--) {
|
||||
auto lower = -tests::random::get_int(0, 5, engine);
|
||||
auto upper = tests::random::get_int(0, 5, engine);
|
||||
readers_data.push_back(mutation_bounds{std::nullopt, mk_pos_for(lower), mk_pos_for(upper)});
|
||||
num_readers--;
|
||||
}
|
||||
while (num_readers--) {
|
||||
auto len = tests::random::get_int(0, 15, engine);
|
||||
auto ks = tests::random::random_subset<int>(100, len, engine);
|
||||
@@ -3929,16 +4348,17 @@ struct clustering_order_merger_test_generator {
|
||||
SEASTAR_THREAD_TEST_CASE(test_clustering_order_merger_in_memory) {
|
||||
clustering_order_merger_test_generator g;
|
||||
|
||||
auto make_authority = [] (mutation mut, streamed_mutation::forwarding fwd) {
|
||||
return flat_mutation_reader_from_mutations(tests::make_permit(), {std::move(mut)}, fwd);
|
||||
auto make_authority = [s = g._s] (std::optional<mutation> mut, streamed_mutation::forwarding fwd) {
|
||||
if (mut) {
|
||||
return flat_mutation_reader_from_mutations(tests::make_permit(), {std::move(*mut)}, fwd);
|
||||
}
|
||||
return make_empty_flat_reader(s, tests::make_permit());
|
||||
};
|
||||
|
||||
auto make_tested = [s = g._s] (std::vector<mutation_bounds> ms, streamed_mutation::forwarding fwd) {
|
||||
auto rs = boost::copy_range<std::vector<reader_bounds>>(std::move(ms)
|
||||
| boost::adaptors::transformed([fwd] (auto&& mb) {
|
||||
return reader_bounds{
|
||||
flat_mutation_reader_from_mutations(tests::make_permit(), {std::move(mb.m)}, fwd),
|
||||
std::move(mb.lower), std::move(mb.upper)};
|
||||
| boost::adaptors::transformed([s, fwd] (auto&& mb) {
|
||||
return make_reader_bounds(s, tests::make_permit(), std::move(mb), fwd);
|
||||
}));
|
||||
auto q = std::make_unique<simple_position_reader_queue>(*s, std::move(rs));
|
||||
return make_clustering_combined_reader(s, tests::make_permit(), fwd, std::move(q));
|
||||
@@ -3951,7 +4371,15 @@ SEASTAR_THREAD_TEST_CASE(test_clustering_order_merger_in_memory) {
|
||||
for (int run = 0; run < 1000; ++run) {
|
||||
auto scenario = g.generate_scenario(engine);
|
||||
auto merged = std::accumulate(scenario.readers_data.begin(), scenario.readers_data.end(),
|
||||
mutation(g._s, g._pk), [] (mutation curr, const mutation_bounds& mb) { return std::move(curr) + mb.m; });
|
||||
std::optional<mutation>{}, [&g] (std::optional<mutation> curr, const mutation_bounds& mb) {
|
||||
if (mb.m) {
|
||||
if (!curr) {
|
||||
curr = mutation(g._s, g._pk);
|
||||
}
|
||||
*curr += *mb.m;
|
||||
}
|
||||
return curr;
|
||||
});
|
||||
|
||||
{
|
||||
auto fwd = streamed_mutation::forwarding::no;
|
||||
@@ -3974,13 +4402,16 @@ SEASTAR_THREAD_TEST_CASE(test_clustering_order_merger_in_memory) {
|
||||
SEASTAR_THREAD_TEST_CASE(test_clustering_order_merger_sstable_set) {
|
||||
sstables::test_env::do_with_async([] (sstables::test_env& env) {
|
||||
storage_service_for_tests ssft;
|
||||
clustering_order_merger_test_generator g;
|
||||
|
||||
auto make_authority = [] (mutation mut, streamed_mutation::forwarding fwd) {
|
||||
auto pkeys = make_local_keys(2, clustering_order_merger_test_generator::make_schema());
|
||||
clustering_order_merger_test_generator g(pkeys[0]);
|
||||
|
||||
auto make_authority = [s = g._s] (mutation mut, streamed_mutation::forwarding fwd) {
|
||||
return flat_mutation_reader_from_mutations(tests::make_permit(), {std::move(mut)}, fwd);
|
||||
};
|
||||
|
||||
auto make_tested = [s = g._s, pr = dht::partition_range::make_singular(dht::ring_position(g.decorated_pk()))]
|
||||
auto pr = dht::partition_range::make_singular(dht::ring_position(dht::decorate_key(*g._s, g._pk)));
|
||||
auto make_tested = [s = g._s, pk = g._pk, &pr]
|
||||
(const time_series_sstable_set& sst_set,
|
||||
const std::unordered_set<int64_t>& included_gens, streamed_mutation::forwarding fwd) {
|
||||
auto q = sst_set.make_min_position_reader_queue(
|
||||
@@ -3988,7 +4419,8 @@ SEASTAR_THREAD_TEST_CASE(test_clustering_order_merger_sstable_set) {
|
||||
return sst.make_reader(s, tests::make_permit(), pr,
|
||||
s->full_slice(), seastar::default_priority_class(), nullptr, fwd);
|
||||
},
|
||||
[included_gens] (const sstable& sst) { return included_gens.contains(sst.generation()); });
|
||||
[included_gens] (const sstable& sst) { return included_gens.contains(sst.generation()); },
|
||||
pk, s, tests::make_permit(), fwd);
|
||||
return make_clustering_combined_reader(s, tests::make_permit(), fwd, std::move(q));
|
||||
};
|
||||
|
||||
@@ -4006,32 +4438,39 @@ SEASTAR_THREAD_TEST_CASE(test_clustering_order_merger_sstable_set) {
|
||||
std::unordered_set<int64_t> included_gens;
|
||||
int64_t gen = 0;
|
||||
for (auto& mb: scenario.readers_data) {
|
||||
sst_set.insert(make_sstable_containing([s = g._s, &env, &tmp, gen = ++gen] () {
|
||||
auto sst_factory = [s = g._s, &env, &tmp, gen = ++gen] () {
|
||||
return env.make_sstable(std::move(s), tmp.path().string(), gen,
|
||||
sstables::sstable::version_types::md, sstables::sstable::format_types::big);
|
||||
}, {mb.m}));
|
||||
};
|
||||
|
||||
if (mb.m) {
|
||||
sst_set.insert(make_sstable_containing(std::move(sst_factory), {*mb.m}));
|
||||
} else {
|
||||
// We want to have an sstable that won't return any fragments when we query it
|
||||
// for our partition (not even `partition_start`). For that we create an sstable
|
||||
// with a different partition.
|
||||
auto pk = partition_key::from_single_value(*g._s, utf8_type->decompose(pkeys[1]));
|
||||
assert(pk != g._pk);
|
||||
|
||||
sst_set.insert(make_sstable_containing(std::move(sst_factory), {mutation(g._s, pk)}));
|
||||
}
|
||||
|
||||
if (dist(engine)) {
|
||||
included_gens.insert(gen);
|
||||
merged += mb.m;
|
||||
if (mb.m) {
|
||||
merged += *mb.m;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (included_gens.empty()) {
|
||||
for (auto fwd: {streamed_mutation::forwarding::no, streamed_mutation::forwarding::yes}) {
|
||||
assert_that(make_tested(sst_set, included_gens, fwd)).produces_end_of_stream();
|
||||
}
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
{
|
||||
auto fwd = streamed_mutation::forwarding::no;
|
||||
compare_readers(*g._s, make_authority(merged, fwd), make_tested(sst_set, included_gens, fwd));
|
||||
}
|
||||
|
||||
auto fwd = streamed_mutation::forwarding::yes;
|
||||
compare_readers(*g._s, make_authority(std::move(merged), fwd), make_tested(sst_set, included_gens, fwd), scenario.fwd_ranges);
|
||||
compare_readers(*g._s, make_authority(std::move(merged), fwd),
|
||||
make_tested(sst_set, included_gens, fwd), scenario.fwd_ranges);
|
||||
}
|
||||
|
||||
}).get();
|
||||
@@ -4220,9 +4659,7 @@ SEASTAR_THREAD_TEST_CASE(clustering_combined_reader_mutation_source_test) {
|
||||
for (auto& [k, ms]: good) {
|
||||
auto rs = boost::copy_range<std::vector<reader_bounds>>(std::move(ms)
|
||||
| boost::adaptors::transformed([&] (auto&& mb) {
|
||||
return reader_bounds{
|
||||
flat_mutation_reader_from_mutations(permit, {std::move(mb.m)}, slice, fwd_sm),
|
||||
std::move(mb.lower), std::move(mb.upper)};
|
||||
return make_reader_bounds(s, permit, std::move(mb), fwd_sm, &slice);
|
||||
}));
|
||||
std::sort(rs.begin(), rs.end(), [less = position_in_partition::less_compare(*s)]
|
||||
(const reader_bounds& a, const reader_bounds& b) { return less(a.lower, b.lower); });
|
||||
@@ -4242,3 +4679,23 @@ SEASTAR_THREAD_TEST_CASE(clustering_combined_reader_mutation_source_test) {
|
||||
|
||||
run_mutation_source_tests(std::move(populate));
|
||||
}
|
||||
|
||||
// Regression test for #8445.
|
||||
SEASTAR_THREAD_TEST_CASE(test_clustering_combining_of_empty_readers) {
|
||||
auto s = clustering_order_merger_test_generator::make_schema();
|
||||
|
||||
std::vector<reader_bounds> rs;
|
||||
rs.push_back({
|
||||
.r = make_empty_flat_reader(s, tests::make_permit()),
|
||||
.lower = position_in_partition::before_all_clustered_rows(),
|
||||
.upper = position_in_partition::after_all_clustered_rows()
|
||||
});
|
||||
auto r = make_clustering_combined_reader(
|
||||
s, tests::make_permit(), streamed_mutation::forwarding::no,
|
||||
std::make_unique<simple_position_reader_queue>(*s, std::move(rs)));
|
||||
|
||||
auto mf = r(db::no_timeout).get0();
|
||||
if (mf) {
|
||||
BOOST_FAIL(format("reader combined of empty readers returned fragment {}", mutation_fragment::printer(*s, *mf)));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -702,7 +702,10 @@ SEASTAR_THREAD_TEST_CASE(test_resources_based_cache_eviction) {
|
||||
nullptr,
|
||||
db::no_timeout).get();
|
||||
|
||||
BOOST_CHECK_EQUAL(db.get_querier_cache_stats().resource_based_evictions, 1);
|
||||
// The second read might be evicted too if it consumes more
|
||||
// memory than the first and hence triggers memory control when
|
||||
// saved in the querier cache.
|
||||
BOOST_CHECK_GE(db.get_querier_cache_stats().resource_based_evictions, 1);
|
||||
|
||||
// We want to read the entire partition so that the querier
|
||||
// is not saved at the end and thus ensure it is destroyed.
|
||||
|
||||
@@ -96,7 +96,7 @@ SEASTAR_TEST_CASE(test_store_load_term_and_vote) {
|
||||
|
||||
BOOST_CHECK_EQUAL(vote_term, persisted.first);
|
||||
BOOST_CHECK_EQUAL(vote_id, persisted.second);
|
||||
});
|
||||
}, raft_cql_test_config());
|
||||
}
|
||||
|
||||
SEASTAR_TEST_CASE(test_store_load_snapshot) {
|
||||
@@ -122,7 +122,7 @@ SEASTAR_TEST_CASE(test_store_load_snapshot) {
|
||||
raft::snapshot loaded_snp = co_await storage.load_snapshot();
|
||||
|
||||
BOOST_CHECK(snp == loaded_snp);
|
||||
});
|
||||
}, raft_cql_test_config());
|
||||
}
|
||||
|
||||
SEASTAR_TEST_CASE(test_store_load_log_entries) {
|
||||
@@ -138,7 +138,7 @@ SEASTAR_TEST_CASE(test_store_load_log_entries) {
|
||||
for (size_t i = 0, end = entries.size(); i != end; ++i) {
|
||||
BOOST_CHECK(*entries[i] == *loaded_entries[i]);
|
||||
}
|
||||
});
|
||||
}, raft_cql_test_config());
|
||||
}
|
||||
|
||||
SEASTAR_TEST_CASE(test_truncate_log) {
|
||||
@@ -156,7 +156,7 @@ SEASTAR_TEST_CASE(test_truncate_log) {
|
||||
for (size_t i = 0, end = loaded_entries.size(); i != end; ++i) {
|
||||
BOOST_CHECK(*entries[i] == *loaded_entries[i]);
|
||||
}
|
||||
});
|
||||
}, raft_cql_test_config());
|
||||
}
|
||||
|
||||
SEASTAR_TEST_CASE(test_store_snapshot_truncate_log_tail) {
|
||||
@@ -187,5 +187,5 @@ SEASTAR_TEST_CASE(test_store_snapshot_truncate_log_tail) {
|
||||
for (size_t i = 0, end = loaded_entries.size(); i != end; ++i) {
|
||||
BOOST_CHECK(*entries[i + 1] == *loaded_entries[i]);
|
||||
}
|
||||
});
|
||||
}, raft_cql_test_config());
|
||||
}
|
||||
|
||||
@@ -804,5 +804,5 @@ SEASTAR_TEST_CASE(test_schema_tables_use_null_sharder) {
|
||||
BOOST_REQUIRE_EQUAL(s->get_sharder().shard_count(), 1);
|
||||
}
|
||||
}).get();
|
||||
});
|
||||
}, raft_cql_test_config());
|
||||
}
|
||||
|
||||
@@ -28,6 +28,8 @@
|
||||
#include "sstables/sstables.hh"
|
||||
#include "test/lib/mutation_source_test.hh"
|
||||
#include "test/lib/sstable_utils.hh"
|
||||
#include "test/lib/mutation_assertions.hh"
|
||||
#include "partition_slice_builder.hh"
|
||||
|
||||
using namespace sstables;
|
||||
using namespace std::chrono_literals;
|
||||
@@ -62,3 +64,69 @@ SEASTAR_TEST_CASE(test_sstable_conforms_to_mutation_source) {
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
// Regression test for scylladb/scylla-enterprise#2016
|
||||
SEASTAR_THREAD_TEST_CASE(test_produces_range_tombstone) {
|
||||
auto s = schema_builder("ks", "cf")
|
||||
.with_column("pk", int32_type, column_kind::partition_key)
|
||||
.with_column("ck", int32_type, column_kind::clustering_key)
|
||||
.with_column("v", int32_type, column_kind::regular_column)
|
||||
.build();
|
||||
|
||||
mutation m(s, partition_key::from_single_value(*s, int32_type->decompose(0)));
|
||||
m.partition().apply_row_tombstone(*s, range_tombstone{
|
||||
clustering_key::from_exploded(*s, {int32_type->decompose(6)}), bound_kind::excl_start,
|
||||
clustering_key::from_exploded(*s, {int32_type->decompose(10)}), bound_kind::incl_end,
|
||||
tombstone(0, gc_clock::time_point())
|
||||
});
|
||||
|
||||
{
|
||||
auto ckey = clustering_key::from_exploded(*s, {int32_type->decompose(6)});
|
||||
deletable_row& row = m.partition().clustered_row(*s, ckey, is_dummy::no, is_continuous(false));
|
||||
row.marker() = row_marker(4);
|
||||
}
|
||||
{
|
||||
auto ckey = clustering_key::from_exploded(*s, {int32_type->decompose(8)});
|
||||
deletable_row& row = m.partition().clustered_row(*s, ckey, is_dummy::no, is_continuous(false));
|
||||
row.apply(tombstone(2, gc_clock::time_point()));
|
||||
row.marker() = row_marker(5);
|
||||
}
|
||||
|
||||
testlog.info("m: {}", m);
|
||||
|
||||
auto slice = partition_slice_builder(*s)
|
||||
.with_range(query::clustering_range::make(
|
||||
{clustering_key::from_exploded(*s, {int32_type->decompose(8)}), false},
|
||||
{clustering_key::from_exploded(*s, {int32_type->decompose(10)}), true}
|
||||
))
|
||||
.build();
|
||||
|
||||
auto pr = dht::partition_range::make_singular(m.decorated_key());
|
||||
|
||||
std::vector<tmpdir> dirs;
|
||||
dirs.emplace_back();
|
||||
sstables::test_env::do_with_async([&] (sstables::test_env& env) {
|
||||
storage_service_for_tests ssft;
|
||||
auto version = sstable_version_types::la;
|
||||
auto index_block_size = 1;
|
||||
sstable_writer_config cfg = env.manager().configure_writer();
|
||||
cfg.promoted_index_block_size = index_block_size;
|
||||
|
||||
auto source = make_sstable_mutation_source(env, s, dirs.back().path().string(), {m}, cfg, version, gc_clock::now());
|
||||
|
||||
{
|
||||
auto rd = source.make_reader(s, tests::make_permit(), pr, slice);
|
||||
while (auto mf = rd(db::no_timeout).get0()) {
|
||||
testlog.info("produced {}", mutation_fragment::printer(*s, *mf));
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
auto rd = source.make_reader(s, tests::make_permit(), pr, slice);
|
||||
mutation_opt sliced_m = read_mutation_from_flat_mutation_reader(rd, db::no_timeout).get0();
|
||||
BOOST_REQUIRE(bool(sliced_m));
|
||||
|
||||
assert_that(*sliced_m).is_equal_to(m, slice.row_ranges(*m.schema(), m.key()));
|
||||
}
|
||||
}).get();
|
||||
}
|
||||
|
||||
@@ -6750,6 +6750,49 @@ SEASTAR_TEST_CASE(stcs_reshape_test) {
|
||||
});
|
||||
}
|
||||
|
||||
SEASTAR_TEST_CASE(lcs_reshape_test) {
|
||||
return test_env::do_with_async([] (test_env& env) {
|
||||
simple_schema ss;
|
||||
auto s = ss.schema();
|
||||
auto keys = token_generation_for_current_shard(256);
|
||||
auto cs = sstables::make_compaction_strategy(sstables::compaction_strategy_type::leveled,
|
||||
s->compaction_strategy_options());
|
||||
|
||||
// non overlapping
|
||||
{
|
||||
std::vector <shared_sstable> sstables;
|
||||
for (auto i = 0; i < 256; i++) {
|
||||
auto sst = env.make_sstable(s, "", i + 1);
|
||||
auto key = keys[i].first;
|
||||
sstables::test(sst).set_values_for_leveled_strategy(1 /* size */, 0 /* level */, 0 /* max ts */, key, key);
|
||||
sstables.push_back(std::move(sst));
|
||||
}
|
||||
|
||||
BOOST_REQUIRE(cs.get_reshaping_job(sstables, s, default_priority_class(), reshape_mode::strict).sstables.size() == 256);
|
||||
}
|
||||
// all overlapping
|
||||
{
|
||||
std::vector <shared_sstable> sstables;
|
||||
for (auto i = 0; i < 256; i++) {
|
||||
auto sst = env.make_sstable(s, "", i + 1);
|
||||
auto key = keys[0].first;
|
||||
sstables::test(sst).set_values_for_leveled_strategy(1 /* size */, 0 /* level */, 0 /* max ts */, key, key);
|
||||
sstables.push_back(std::move(sst));
|
||||
}
|
||||
|
||||
BOOST_REQUIRE(cs.get_reshaping_job(sstables, s, default_priority_class(), reshape_mode::strict).sstables.size() == s->max_compaction_threshold());
|
||||
}
|
||||
// single sstable
|
||||
{
|
||||
auto sst = env.make_sstable(s, "", 1);
|
||||
auto key = keys[0].first;
|
||||
sstables::test(sst).set_values_for_leveled_strategy(1 /* size */, 0 /* level */, 0 /* max ts */, key, key);
|
||||
|
||||
BOOST_REQUIRE(cs.get_reshaping_job({ sst }, s, default_priority_class(), reshape_mode::strict).sstables.size() == 0);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
SEASTAR_TEST_CASE(test_twcs_interposer_on_memtable_flush) {
|
||||
return test_env::do_with_async([] (test_env& env) {
|
||||
storage_service_for_tests ssft;
|
||||
@@ -7041,3 +7084,266 @@ SEASTAR_TEST_CASE(test_offstrategy_sstable_compaction) {
|
||||
cf->stop().get();
|
||||
});
|
||||
}
|
||||
|
||||
SEASTAR_TEST_CASE(single_key_reader_through_compound_set_test) {
|
||||
return test_env::do_with_async([] (test_env& env) {
|
||||
auto builder = schema_builder("tests", "single_key_reader_through_compound_set_test")
|
||||
.with_column("id", utf8_type, column_kind::partition_key)
|
||||
.with_column("cl", ::timestamp_type, column_kind::clustering_key)
|
||||
.with_column("value", int32_type);
|
||||
builder.set_compaction_strategy(sstables::compaction_strategy_type::time_window);
|
||||
std::map <sstring, sstring> opts = {
|
||||
{time_window_compaction_strategy_options::COMPACTION_WINDOW_UNIT_KEY, "HOURS"},
|
||||
{time_window_compaction_strategy_options::COMPACTION_WINDOW_SIZE_KEY, "1"},
|
||||
};
|
||||
builder.set_compaction_strategy_options(std::move(opts));
|
||||
auto s = builder.build();
|
||||
auto cs = sstables::make_compaction_strategy(sstables::compaction_strategy_type::time_window, std::move(opts));
|
||||
|
||||
auto next_timestamp = [](auto step) {
|
||||
using namespace std::chrono;
|
||||
return (gc_clock::now().time_since_epoch() + duration_cast<microseconds>(step)).count();
|
||||
};
|
||||
auto tokens = token_generation_for_shard(1, this_shard_id(), test_db_config.murmur3_partitioner_ignore_msb_bits(), smp::count);
|
||||
|
||||
auto make_row = [&](std::chrono::hours step) {
|
||||
static thread_local int32_t value = 1;
|
||||
auto key_str = tokens[0].first;
|
||||
auto key = partition_key::from_exploded(*s, {to_bytes(key_str)});
|
||||
|
||||
mutation m(s, key);
|
||||
auto next_ts = next_timestamp(step);
|
||||
auto c_key = clustering_key::from_exploded(*s, {::timestamp_type->decompose(next_ts)});
|
||||
m.set_clustered_cell(c_key, bytes("value"), data_value(int32_t(value++)), next_ts);
|
||||
return m;
|
||||
};
|
||||
|
||||
auto tmp = tmpdir();
|
||||
auto cm = make_lw_shared<compaction_manager>();
|
||||
column_family::config cfg = column_family_test_config(env.manager());
|
||||
::cf_stats cf_stats{0};
|
||||
cfg.cf_stats = &cf_stats;
|
||||
cfg.datadir = tmp.path().string();
|
||||
cfg.enable_disk_writes = true;
|
||||
cfg.enable_cache = false;
|
||||
auto tracker = make_lw_shared<cache_tracker>();
|
||||
cell_locker_stats cl_stats;
|
||||
auto cf = make_lw_shared<column_family>(s, cfg, column_family::no_commitlog(), *cm, cl_stats, *tracker);
|
||||
cf->mark_ready_for_writes();
|
||||
cf->start();
|
||||
|
||||
auto set1 = make_lw_shared<sstable_set>(cs.make_sstable_set(s));
|
||||
auto set2 = make_lw_shared<sstable_set>(cs.make_sstable_set(s));
|
||||
|
||||
auto sst_gen = [&env, s, &tmp, gen = make_lw_shared<unsigned>(1)]() {
|
||||
return env.make_sstable(s, tmp.path().string(), (*gen)++, sstables::sstable::version_types::md, big);
|
||||
};
|
||||
|
||||
// sstables with same key but belonging to different windows
|
||||
auto sst1 = make_sstable_containing(sst_gen, {make_row(std::chrono::hours(1))});
|
||||
auto sst2 = make_sstable_containing(sst_gen, {make_row(std::chrono::hours(5))});
|
||||
BOOST_REQUIRE(sst1->get_first_decorated_key().token() == sst2->get_last_decorated_key().token());
|
||||
auto dkey = sst1->get_first_decorated_key();
|
||||
|
||||
set1->insert(std::move(sst1));
|
||||
set2->insert(std::move(sst2));
|
||||
sstable_set compound = sstables::make_compound_sstable_set(s, {set1, set2});
|
||||
|
||||
reader_permit permit = tests::make_permit();
|
||||
utils::estimated_histogram eh;
|
||||
auto pr = dht::partition_range::make_singular(dkey);
|
||||
|
||||
auto reader = compound.create_single_key_sstable_reader(&*cf, s, permit, eh, pr, s->full_slice(), default_priority_class(),
|
||||
tracing::trace_state_ptr(), ::streamed_mutation::forwarding::no,
|
||||
::mutation_reader::forwarding::no);
|
||||
auto mfopt = read_mutation_from_flat_mutation_reader(reader, db::no_timeout).get0();
|
||||
BOOST_REQUIRE(mfopt);
|
||||
mfopt = read_mutation_from_flat_mutation_reader(reader, db::no_timeout).get0();
|
||||
BOOST_REQUIRE(!mfopt);
|
||||
BOOST_REQUIRE(cf_stats.clustering_filter_count > 0);
|
||||
});
|
||||
}
|
||||
|
||||
// Regression test for #8432
|
||||
SEASTAR_TEST_CASE(test_twcs_single_key_reader_filtering) {
|
||||
return test_env::do_with_async([] (test_env& env) {
|
||||
auto builder = schema_builder("tests", "twcs_single_key_reader_filtering")
|
||||
.with_column("pk", int32_type, column_kind::partition_key)
|
||||
.with_column("ck", int32_type, column_kind::clustering_key)
|
||||
.with_column("v", int32_type);
|
||||
builder.set_compaction_strategy(sstables::compaction_strategy_type::time_window);
|
||||
auto s = builder.build();
|
||||
|
||||
auto tmp = tmpdir();
|
||||
auto sst_gen = [&env, s, &tmp, gen = make_lw_shared<unsigned>(1)]() {
|
||||
return env.make_sstable(s, tmp.path().string(), (*gen)++, sstables::sstable::version_types::md, big);
|
||||
};
|
||||
|
||||
auto make_row = [&] (int32_t pk, int32_t ck) {
|
||||
mutation m(s, partition_key::from_single_value(*s, int32_type->decompose(pk)));
|
||||
m.set_clustered_cell(clustering_key::from_single_value(*s, int32_type->decompose(ck)), to_bytes("v"), int32_t(0), api::new_timestamp());
|
||||
return m;
|
||||
};
|
||||
|
||||
auto sst1 = make_sstable_containing(sst_gen, {make_row(0, 0)});
|
||||
auto sst2 = make_sstable_containing(sst_gen, {make_row(0, 1)});
|
||||
auto dkey = sst1->get_first_decorated_key();
|
||||
|
||||
auto cm = make_lw_shared<compaction_manager>();
|
||||
column_family::config cfg = column_family_test_config(env.manager());
|
||||
::cf_stats cf_stats{0};
|
||||
cfg.cf_stats = &cf_stats;
|
||||
cfg.datadir = tmp.path().string();
|
||||
auto tracker = make_lw_shared<cache_tracker>();
|
||||
cell_locker_stats cl_stats;
|
||||
column_family cf(s, cfg, column_family::no_commitlog(), *cm, cl_stats, *tracker);
|
||||
cf.mark_ready_for_writes();
|
||||
cf.start();
|
||||
|
||||
auto cs = sstables::make_compaction_strategy(sstables::compaction_strategy_type::time_window, {});
|
||||
|
||||
auto set = cs.make_sstable_set(s);
|
||||
set.insert(std::move(sst1));
|
||||
set.insert(std::move(sst2));
|
||||
|
||||
reader_permit permit = tests::make_permit();
|
||||
utils::estimated_histogram eh;
|
||||
auto pr = dht::partition_range::make_singular(dkey);
|
||||
|
||||
auto slice = partition_slice_builder(*s)
|
||||
.with_range(query::clustering_range {
|
||||
query::clustering_range::bound { clustering_key_prefix::from_single_value(*s, int32_type->decompose(0)) },
|
||||
query::clustering_range::bound { clustering_key_prefix::from_single_value(*s, int32_type->decompose(1)) },
|
||||
}).build();
|
||||
|
||||
auto reader = set.create_single_key_sstable_reader(
|
||||
&cf, s, permit, eh, pr, slice, default_priority_class(),
|
||||
tracing::trace_state_ptr(), ::streamed_mutation::forwarding::no,
|
||||
::mutation_reader::forwarding::no);
|
||||
|
||||
auto checked_by_ck = cf_stats.sstables_checked_by_clustering_filter;
|
||||
auto surviving_after_ck = cf_stats.surviving_sstables_after_clustering_filter;
|
||||
|
||||
// consume all fragments
|
||||
while (reader(db::no_timeout).get());
|
||||
|
||||
// At least sst2 should be checked by the CK filter during fragment consumption and should pass.
|
||||
// With the bug in #8432, sst2 wouldn't even be checked by the CK filter since it would pass right after checking the PK filter.
|
||||
BOOST_REQUIRE_GE(cf_stats.sstables_checked_by_clustering_filter - checked_by_ck, 1);
|
||||
BOOST_REQUIRE_EQUAL(
|
||||
cf_stats.surviving_sstables_after_clustering_filter - surviving_after_ck,
|
||||
cf_stats.sstables_checked_by_clustering_filter - checked_by_ck);
|
||||
});
|
||||
}
|
||||
|
||||
SEASTAR_TEST_CASE(max_ongoing_compaction_test) {
|
||||
return test_env::do_with_async([] (test_env& env) {
|
||||
BOOST_REQUIRE(smp::count == 1);
|
||||
|
||||
auto make_schema = [] (auto idx) {
|
||||
auto builder = schema_builder("tests", std::to_string(idx))
|
||||
.with_column("id", utf8_type, column_kind::partition_key)
|
||||
.with_column("cl", int32_type, column_kind::clustering_key)
|
||||
.with_column("value", int32_type);
|
||||
builder.set_compaction_strategy(sstables::compaction_strategy_type::time_window);
|
||||
std::map <sstring, sstring> opts = {
|
||||
{time_window_compaction_strategy_options::COMPACTION_WINDOW_UNIT_KEY, "HOURS"},
|
||||
{time_window_compaction_strategy_options::COMPACTION_WINDOW_SIZE_KEY, "1"},
|
||||
{time_window_compaction_strategy_options::EXPIRED_SSTABLE_CHECK_FREQUENCY_SECONDS_KEY, "0"},
|
||||
};
|
||||
builder.set_compaction_strategy_options(std::move(opts));
|
||||
builder.set_gc_grace_seconds(0);
|
||||
return builder.build();
|
||||
};
|
||||
|
||||
auto cm = make_lw_shared<compaction_manager>();
|
||||
cm->enable();
|
||||
auto stop_cm = defer([&cm] {
|
||||
cm->stop().get();
|
||||
});
|
||||
|
||||
auto tmp = tmpdir();
|
||||
auto cl_stats = make_lw_shared<cell_locker_stats>();
|
||||
auto tracker = make_lw_shared<cache_tracker>();
|
||||
auto tokens = token_generation_for_shard(1, this_shard_id(), test_db_config.murmur3_partitioner_ignore_msb_bits(), smp::count);
|
||||
|
||||
auto next_timestamp = [] (auto step) {
|
||||
using namespace std::chrono;
|
||||
return (gc_clock::now().time_since_epoch() - duration_cast<microseconds>(step)).count();
|
||||
};
|
||||
auto make_expiring_cell = [&] (schema_ptr s, std::chrono::hours step) {
|
||||
static thread_local int32_t value = 1;
|
||||
|
||||
auto key_str = tokens[0].first;
|
||||
auto key = partition_key::from_exploded(*s, {to_bytes(key_str)});
|
||||
|
||||
mutation m(s, key);
|
||||
auto c_key = clustering_key::from_exploded(*s, {int32_type->decompose(value++)});
|
||||
m.set_clustered_cell(c_key, bytes("value"), data_value(int32_t(value)), next_timestamp(step), gc_clock::duration(step + 5s));
|
||||
return m;
|
||||
};
|
||||
|
||||
auto make_table_with_single_fully_expired_sstable = [&] (auto idx) {
|
||||
auto s = make_schema(idx);
|
||||
column_family::config cfg = column_family_test_config(env.manager());
|
||||
cfg.datadir = tmp.path().string() + "/" + std::to_string(idx);
|
||||
touch_directory(cfg.datadir).get();
|
||||
cfg.enable_commitlog = false;
|
||||
cfg.enable_incremental_backups = false;
|
||||
|
||||
auto sst_gen = [&env, s, dir = cfg.datadir, gen = make_lw_shared<unsigned>(1)] () mutable {
|
||||
return env.make_sstable(s, dir, (*gen)++, sstables::sstable::version_types::md, big);
|
||||
};
|
||||
|
||||
auto cf = make_lw_shared<column_family>(s, cfg, column_family::no_commitlog(), *cm, *cl_stats, *tracker);
|
||||
cf->start();
|
||||
cf->mark_ready_for_writes();
|
||||
|
||||
auto muts = { make_expiring_cell(s, std::chrono::hours(1)) };
|
||||
auto sst = make_sstable_containing(sst_gen, muts);
|
||||
column_family_test(cf).add_sstable(sst);
|
||||
return cf;
|
||||
};
|
||||
|
||||
std::vector<lw_shared_ptr<column_family>> tables;
|
||||
auto stop_tables = defer([&tables] {
|
||||
for (auto& t : tables) {
|
||||
t->stop().get();
|
||||
}
|
||||
});
|
||||
for (auto i = 0; i < 100; i++) {
|
||||
tables.push_back(make_table_with_single_fully_expired_sstable(i));
|
||||
}
|
||||
|
||||
// Make sure everything is expired
|
||||
forward_jump_clocks(std::chrono::hours(100));
|
||||
|
||||
for (auto& t : tables) {
|
||||
BOOST_REQUIRE(t->sstables_count() == 1);
|
||||
t->trigger_compaction();
|
||||
}
|
||||
|
||||
BOOST_REQUIRE(cm->get_stats().pending_tasks >= 1 || cm->get_stats().active_tasks >= 1);
|
||||
|
||||
size_t max_ongoing_compaction = 0;
|
||||
|
||||
// wait for submitted jobs to finish.
|
||||
auto end = [cm, &tables] {
|
||||
return cm->get_stats().pending_tasks == 0 && cm->get_stats().active_tasks == 0
|
||||
&& boost::algorithm::all_of(tables, [] (auto& t) { return t->sstables_count() == 0; });
|
||||
};
|
||||
while (!end()) {
|
||||
if (!cm->get_stats().pending_tasks && !cm->get_stats().active_tasks) {
|
||||
for (auto& t : tables) {
|
||||
if (t->sstables_count()) {
|
||||
t->trigger_compaction();
|
||||
}
|
||||
}
|
||||
}
|
||||
max_ongoing_compaction = std::max(cm->get_stats().active_tasks, max_ongoing_compaction);
|
||||
later().get();
|
||||
}
|
||||
BOOST_REQUIRE(cm->get_stats().errors == 0);
|
||||
BOOST_REQUIRE(max_ongoing_compaction == 1);
|
||||
});
|
||||
}
|
||||
|
||||
@@ -175,9 +175,12 @@ def wait_for_index(cql, table, column, everything):
|
||||
results = []
|
||||
for v in column_values:
|
||||
results.extend(list(cql.execute(f'SELECT * FROM {table} WHERE {column}={v}')))
|
||||
if set(results) == set(everything):
|
||||
|
||||
if sorted(results) == sorted(everything):
|
||||
return
|
||||
|
||||
time.sleep(0.1)
|
||||
|
||||
pytest.fail('Timeout waiting for index to become up to date.')
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
@@ -291,3 +294,46 @@ def test_contains_frozen_collection_ck(cql, test_keyspace):
|
||||
"SELECT * FROM " + table + " WHERE a=0 AND c=0 AND b CONTAINS 0 ALLOW FILTERING")))
|
||||
assert 1 == len(list(cql.execute(
|
||||
"SELECT * FROM " + table + " WHERE a=0 AND c=0 AND b CONTAINS KEY 0 ALLOW FILTERING")))
|
||||
|
||||
# table5 contains an indexed table with 3 clustering columns.
|
||||
# used to test correct filtering of rows fetched from an index table.
|
||||
@pytest.fixture(scope="module")
|
||||
def table5(cql, test_keyspace):
|
||||
table = test_keyspace + "." + unique_name()
|
||||
cql.execute(f"CREATE TABLE {table} (p int, c1 frozen<list<int>>, c2 frozen<list<int>>, c3 int, PRIMARY KEY (p,c1,c2,c3))")
|
||||
cql.execute(f"CREATE INDEX ON {table} (c3)")
|
||||
cql.execute(f"INSERT INTO {table} (p, c1, c2, c3) VALUES (0, [1], [2], 0)")
|
||||
cql.execute(f"INSERT INTO {table} (p, c1, c2, c3) VALUES (0, [2], [2], 0)")
|
||||
cql.execute(f"INSERT INTO {table} (p, c1, c2, c3) VALUES (0, [1], [3], 0)")
|
||||
cql.execute(f"INSERT INTO {table} (p, c1, c2, c3) VALUES (0, [1], [2], 1)")
|
||||
|
||||
everything = list(cql.execute(f"SELECT * FROM {table}"))
|
||||
wait_for_index(cql, table, 'c3', everything)
|
||||
yield (table, everything)
|
||||
cql.execute(f"DROP TABLE {table}")
|
||||
|
||||
# Test that implementation of filtering for indexes works ok.
|
||||
# Current implementation is a bit conservative - it might sometimes state
|
||||
# that filtering is needed when it isn't actually required, but at least it's safe.
|
||||
def test_select_indexed_cluster_three_keys(cql, table5):
|
||||
def check_good_row(row):
|
||||
return row.p == 0 and row.c1 == [1] and row.c2 == [2] and row.c3 == 0
|
||||
|
||||
check_af_optional(cql, table5, "c3 = 0", lambda r : r.c3 == 0)
|
||||
check_af_mandatory(cql, table5, "c1 = [1] AND c2 = [2] AND c3 = 0", check_good_row)
|
||||
check_af_mandatory(cql, table5, "p = 0 AND c1 CONTAINS 1 AND c3 = 0", lambda r : r.p == 0 and r.c1 == [1] and r.c3 == 0)
|
||||
check_af_mandatory(cql, table5, "p = 0 AND c1 = [1] AND c2 CONTAINS 2 AND c3 = 0", check_good_row)
|
||||
|
||||
# Doesn't use an index - shouldn't be affected
|
||||
check_af_optional(cql, table5, "p = 0 AND c1 = [1] AND c2 = [2] AND c3 = 0", check_good_row)
|
||||
|
||||
# Here are the cases where current implementation of need_filtering() fails
# By coincidence they also fail on cassandra, it looks like cassandra is buggy
@pytest.mark.xfail(reason="Too conservative need_filtering() implementation")
def test_select_indexed_cluster_three_keys_conservative(cql, table5, cassandra_bug):
    # Predicate for the rows these restrictions are meant to match.
    def matches(row):
        return row.p == 0 and row.c1 == [1] and row.c3 == 0

    # Don't require filtering, but for now we report they do
    check_af_optional(cql, table5, "p = 0 AND c1 = [1] AND c3 = 0", matches)
    check_af_optional(cql, table5, "p = 0 AND c1 = [1] AND c2 < [3] AND c3 = 0", lambda row: matches(row) and row.c2 < [3])
|
||||
|
||||
@@ -20,8 +20,9 @@
|
||||
import time
|
||||
import pytest
|
||||
from cassandra.protocol import SyntaxException, AlreadyExists, InvalidRequest, ConfigurationException, ReadFailure
|
||||
from cassandra.query import SimpleStatement
|
||||
|
||||
from util import new_test_table
|
||||
from util import new_test_table, unique_name
|
||||
|
||||
# A reproducer for issue #7443: Normally, when the entire table is SELECTed,
|
||||
# the partitions are returned sorted by the partitions' token. When there
|
||||
@@ -64,3 +65,123 @@ def test_partition_order_with_si(cql, test_keyspace):
|
||||
pass
|
||||
time.sleep(0.1)
|
||||
assert tokens2 == tokens
|
||||
|
||||
# Test that the paging state works properly for indexes on tables
# with descending clustering order. There was a problem with indexes
# created on clustering keys with DESC clustering order - they are represented
# as "reverse" types internally and Scylla assertions failed that the base type
# is different from the underlying view type, even though, from the perspective
# of deserialization, they're equal. Issue #8666
def test_paging_with_desc_clustering_order(cql, test_keyspace):
    with new_test_table(cql, test_keyspace,
                        'p int, c int, primary key (p,c)',
                        'with clustering order by (c desc)') as table:
        cql.execute(f"CREATE INDEX ON {table}(c)")
        # Three rows share the indexed value, so a fetch_size of 1 forces
        # the query below to page through all of them.
        for partition in range(3):
            cql.execute(f"INSERT INTO {table}(p,c) VALUES ({partition}, 42)")
        paged_query = SimpleStatement(f"SELECT * FROM {table} WHERE c = 42", fetch_size=1)
        assert sum(1 for _ in cql.execute(paged_query)) == 3
|
||||
|
||||
# Test which ensures that indexes for a query are picked by the order in which
# they appear in restrictions. That way, users can deterministically pick
# which indexes are used for which queries.
# Note that the order of picking indexing is not set in stone and may be
# subject to change - in which case this test case should be amended as well.
# The order tested in this case was decided as a good first step in issue
# #7969, but it's possible that it will eventually be implemented another
# way, e.g. dynamically based on estimated query selectivity statistics.
# Ref: #7969
@pytest.mark.xfail(reason="The order of picking indexes is currently arbitrary. Issue #7969")
def test_order_of_indexes(scylla_only, cql, test_keyspace):
    schema = 'p int primary key, v1 int, v2 int, v3 int'
    with new_test_table(cql, test_keyspace, schema) as table:
        cql.execute(f"CREATE INDEX my_v3_idx ON {table}(v3)")
        cql.execute(f"CREATE INDEX my_v1_idx ON {table}(v1)")
        cql.execute(f"CREATE INDEX my_v2_idx ON {table}((p),v2)")

        # All queries below should use the first index they find in the list
        # of restrictions. Tracing information will be consulted to ensure
        # it's true. Currently some of the cases below succeed, because the
        # order is not well defined (and may, for instance, change upon
        # server restart), but some of them fail. Once a proper ordering
        # is implemented, all cases below should succeed.
        def index_used(query, index_name):
            trace_events = cql.execute(query, trace=True).get_query_trace().events
            assert any(index_name in event.description for event in trace_events)

        index_used(f"SELECT * FROM {table} WHERE v3 = 1", "my_v3_idx")
        index_used(f"SELECT * FROM {table} WHERE v3 = 1 and v1 = 2 allow filtering", "my_v3_idx")
        index_used(f"SELECT * FROM {table} WHERE p = 1 and v1 = 1 and v3 = 2 allow filtering", "my_v1_idx")
        index_used(f"SELECT * FROM {table} WHERE p = 1 and v3 = 1 and v1 = 2 allow filtering", "my_v3_idx")
        # Local indexes are still skipped if they cannot be used
        index_used(f"SELECT * FROM {table} WHERE v2 = 1 and v1 = 2 allow filtering", "my_v1_idx")
        index_used(f"SELECT * FROM {table} WHERE v2 = 1 and v3 = 2 and v1 = 3 allow filtering", "my_v3_idx")
        index_used(f"SELECT * FROM {table} WHERE v1 = 1 and v2 = 2 and v3 = 3 allow filtering", "my_v1_idx")
        # Local indexes are still preferred over global ones, if they can be used
        index_used(f"SELECT * FROM {table} WHERE p = 1 and v1 = 1 and v3 = 2 and v2 = 2 allow filtering", "my_v2_idx")
        index_used(f"SELECT * FROM {table} WHERE p = 1 and v2 = 1 and v1 = 2 allow filtering", "my_v2_idx")
|
||||
|
||||
# Indexes can be created without an explicit name, in which case a default name is chosen.
# However, due to #8620 it was possible to break the index creation mechanism by creating
# a properly named regular table, which conflicts with the generated index name.
def test_create_unnamed_index_when_its_name_is_taken(cql, test_keyspace):
    with new_test_table(cql, test_keyspace, 'p int primary key, v int') as table:
        try:
            # Occupy the name that the default index view would get.
            cql.execute(f"CREATE TABLE {table}_v_idx_index (i_do_not_exist_in_the_base_table int primary key)")
            # Creating an index should succeed, even though its default name is taken
            # by the table above
            cql.execute(f"CREATE INDEX ON {table}(v)")
        finally:
            cql.execute(f"DROP TABLE {table}_v_idx_index")
|
||||
|
||||
# Indexed created with an explicit name cause a materialized view to be created,
# and this view has a specific name - <index-name>_index. If there happens to be
# a regular table (or another view) named just like that, index creation should fail.
def test_create_named_index_when_its_name_is_taken(scylla_only, cql, test_keyspace):
    with new_test_table(cql, test_keyspace, 'p int primary key, v int') as table:
        taken_name = unique_name()
        try:
            # Occupy the name of the view that backing the index would need.
            cql.execute(f"CREATE TABLE {test_keyspace}.{taken_name}_index (i_do_not_exist_in_the_base_table int primary key)")
            # Creating an index should fail, because it's impossible to create
            # its underlying materialized view, because its name is taken by a regular table
            with pytest.raises(InvalidRequest, match="already exists"):
                cql.execute(f"CREATE INDEX {taken_name} ON {table}(v)")
        finally:
            cql.execute(f"DROP TABLE {test_keyspace}.{taken_name}_index")
|
||||
|
||||
# Tests for CREATE INDEX IF NOT EXISTS
# Reproduces issue #8717.
def test_create_index_if_not_exists(cql, test_keyspace):
    with new_test_table(cql, test_keyspace, 'p int primary key, v int') as table:
        cql.execute(f"CREATE INDEX ON {table}(v)")
        # Can't create the same index again without "IF NOT EXISTS", but can
        # do it with "IF NOT EXISTS":
        with pytest.raises(InvalidRequest, match="duplicate"):
            cql.execute(f"CREATE INDEX ON {table}(v)")
        cql.execute(f"CREATE INDEX IF NOT EXISTS ON {table}(v)")
        cql.execute(f"DROP INDEX {test_keyspace}.{table.split('.')[1]}_v_idx")

        # Now test the same thing for named indexes - this is what broke in
        # #8717 - first with a lower-case name, then with a quoted
        # non-lower-case name:
        for name in ['xyz', '"CamelCase"']:
            cql.execute(f"CREATE INDEX {name} ON {table}(v)")
            with pytest.raises(InvalidRequest, match="already exists"):
                cql.execute(f"CREATE INDEX {name} ON {table}(v)")
            cql.execute(f"CREATE INDEX IF NOT EXISTS {name} ON {table}(v)")
            cql.execute(f"DROP INDEX {test_keyspace}.{name}")

        # Trying to create an index for an attribute that's already indexed,
        # but with a different name. The "IF NOT EXISTS" appears to succeed
        # in this case, but does not actually create the new index name -
        # only the old one remains.
        cql.execute(f"CREATE INDEX xyz ON {table}(v)")
        with pytest.raises(InvalidRequest, match="duplicate"):
            cql.execute(f"CREATE INDEX abc ON {table}(v)")
        cql.execute(f"CREATE INDEX IF NOT EXISTS abc ON {table}(v)")
        # "abc" was never actually created, so dropping it must fail:
        with pytest.raises(InvalidRequest):
            cql.execute(f"DROP INDEX {test_keyspace}.abc")
        cql.execute(f"DROP INDEX {test_keyspace}.xyz")
|
||||
|
||||
@@ -693,6 +693,12 @@ future<> do_with_cql_env_thread(std::function<void(cql_test_env&)> func, cql_tes
|
||||
}, std::move(cfg_in));
|
||||
}
|
||||
|
||||
// Build a cql_test_config with the RAFT experimental feature enabled,
// for tests that exercise Raft-backed functionality.
cql_test_config raft_cql_test_config() {
    cql_test_config cfg;
    cfg.db_config->experimental_features({db::experimental_features_t::RAFT});
    return cfg;
}
|
||||
|
||||
namespace debug {
|
||||
|
||||
seastar::sharded<database>* db;
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user