Compare commits
84 Commits
scylla-4.3
...
next-4.3
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
67a62b3e8d | ||
|
|
92effccf52 | ||
|
|
7357529834 | ||
|
|
3dd7874f08 | ||
|
|
1bf218c29e | ||
|
|
89c47a44dc | ||
|
|
dd93f297c1 | ||
|
|
b0b2606a8c | ||
|
|
6de458e915 | ||
|
|
b6aa5ab2d4 | ||
|
|
08cbd180ff | ||
|
|
693c7b300a | ||
|
|
2e7f618632 | ||
|
|
5cd698c89d | ||
|
|
482fa83a0e | ||
|
|
cabb7fbd3b | ||
|
|
4d1c83a4e8 | ||
|
|
7da9884d09 | ||
|
|
b4242f01a8 | ||
|
|
27cd231f61 | ||
|
|
030419d5ed | ||
|
|
0d1362fc31 | ||
|
|
0888aa1717 | ||
|
|
690a96ff54 | ||
|
|
38cdf30a35 | ||
|
|
61b71e4da0 | ||
|
|
e4b42e622e | ||
|
|
e625144d6e | ||
|
|
76ec7513f1 | ||
|
|
f36f7035c8 | ||
|
|
709e934164 | ||
|
|
13428d56f6 | ||
|
|
2c1f5e5225 | ||
|
|
9ae3edb102 | ||
|
|
11851fa4d9 | ||
|
|
1a56e41f44 | ||
|
|
d737d56a08 | ||
|
|
e1c993fc13 | ||
|
|
0a6e38bf18 | ||
|
|
55bca74e90 | ||
|
|
162d466034 | ||
|
|
3e6d8c3fa7 | ||
|
|
5d3ff1e8a1 | ||
|
|
5358eaf1d6 | ||
|
|
e78b96ee49 | ||
|
|
add245a27e | ||
|
|
108f56c6ed | ||
|
|
d01ce491c0 | ||
|
|
7b2f65191c | ||
|
|
add5ffa787 | ||
|
|
32a1f2dcd9 | ||
|
|
f2072665d1 | ||
|
|
beb2bcb8bd | ||
|
|
8255b7984d | ||
|
|
28f5e0bd20 | ||
|
|
09f3bb93a3 | ||
|
|
76642eb00d | ||
|
|
a60f394d9a | ||
|
|
f2af68850c | ||
|
|
c7781f8c9e | ||
|
|
8f37924694 | ||
|
|
8588eef807 | ||
|
|
c50a2898cf | ||
|
|
44f7251809 | ||
|
|
fc070d3dc6 | ||
|
|
901784e122 | ||
|
|
2ccda04d57 | ||
|
|
e8facb1932 | ||
|
|
6f338e7656 | ||
|
|
7bb9230cfa | ||
|
|
2898e98733 | ||
|
|
2796b0050d | ||
|
|
6bc005643e | ||
|
|
d591ff5422 | ||
|
|
acb1c3eebf | ||
|
|
a04242ea62 | ||
|
|
7131c7c523 | ||
|
|
6af7cf8a39 | ||
|
|
e2d4940b6d | ||
|
|
09f9ff3f96 | ||
|
|
d671185828 | ||
|
|
8d1784805a | ||
|
|
1d4ce229eb | ||
|
|
ba9897a34e |
@@ -1,7 +1,7 @@
|
||||
#!/bin/sh
|
||||
|
||||
PRODUCT=scylla
|
||||
VERSION=4.3.2
|
||||
VERSION=4.3.7
|
||||
|
||||
if test -f version
|
||||
then
|
||||
|
||||
@@ -123,7 +123,7 @@ struct rjson_engaged_ptr_comp {
|
||||
// as internally they're stored in an array, and the order of elements is
|
||||
// not important in set equality. See issue #5021
|
||||
static bool check_EQ_for_sets(const rjson::value& set1, const rjson::value& set2) {
|
||||
if (set1.Size() != set2.Size()) {
|
||||
if (!set1.IsArray() || !set2.IsArray() || set1.Size() != set2.Size()) {
|
||||
return false;
|
||||
}
|
||||
std::set<const rjson::value*, rjson_engaged_ptr_comp> set1_raw;
|
||||
@@ -137,25 +137,70 @@ static bool check_EQ_for_sets(const rjson::value& set1, const rjson::value& set2
|
||||
}
|
||||
return true;
|
||||
}
|
||||
// Moreover, the JSON being compared can be a nested document with outer
|
||||
// layers of lists and maps and some inner set - and we need to get to that
|
||||
// inner set to compare it correctly with check_EQ_for_sets() (issue #8514).
|
||||
static bool check_EQ(const rjson::value* v1, const rjson::value& v2);
|
||||
static bool check_EQ_for_lists(const rjson::value& list1, const rjson::value& list2) {
|
||||
if (!list1.IsArray() || !list2.IsArray() || list1.Size() != list2.Size()) {
|
||||
return false;
|
||||
}
|
||||
auto it1 = list1.Begin();
|
||||
auto it2 = list2.Begin();
|
||||
while (it1 != list1.End()) {
|
||||
// Note: Alternator limits an item's depth (rjson::parse() limits
|
||||
// it to around 37 levels), so this recursion is safe.
|
||||
if (!check_EQ(&*it1, *it2)) {
|
||||
return false;
|
||||
}
|
||||
++it1;
|
||||
++it2;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
static bool check_EQ_for_maps(const rjson::value& list1, const rjson::value& list2) {
|
||||
if (!list1.IsObject() || !list2.IsObject() || list1.MemberCount() != list2.MemberCount()) {
|
||||
return false;
|
||||
}
|
||||
for (auto it1 = list1.MemberBegin(); it1 != list1.MemberEnd(); ++it1) {
|
||||
auto it2 = list2.FindMember(it1->name);
|
||||
if (it2 == list2.MemberEnd() || !check_EQ(&it1->value, it2->value)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// Check if two JSON-encoded values match with the EQ relation
|
||||
static bool check_EQ(const rjson::value* v1, const rjson::value& v2) {
|
||||
if (!v1) {
|
||||
return false;
|
||||
}
|
||||
if (v1->IsObject() && v1->MemberCount() == 1 && v2.IsObject() && v2.MemberCount() == 1) {
|
||||
if (v1 && v1->IsObject() && v1->MemberCount() == 1 && v2.IsObject() && v2.MemberCount() == 1) {
|
||||
auto it1 = v1->MemberBegin();
|
||||
auto it2 = v2.MemberBegin();
|
||||
if ((it1->name == "SS" && it2->name == "SS") || (it1->name == "NS" && it2->name == "NS") || (it1->name == "BS" && it2->name == "BS")) {
|
||||
return check_EQ_for_sets(it1->value, it2->value);
|
||||
if (it1->name != it2->name) {
|
||||
return false;
|
||||
}
|
||||
if (it1->name == "SS" || it1->name == "NS" || it1->name == "BS") {
|
||||
return check_EQ_for_sets(it1->value, it2->value);
|
||||
} else if(it1->name == "L") {
|
||||
return check_EQ_for_lists(it1->value, it2->value);
|
||||
} else if(it1->name == "M") {
|
||||
return check_EQ_for_maps(it1->value, it2->value);
|
||||
} else {
|
||||
// Other, non-nested types (number, string, etc.) can be compared
|
||||
// literally, comparing their JSON representation.
|
||||
return it1->value == it2->value;
|
||||
}
|
||||
} else {
|
||||
// If v1 and/or v2 are missing (IsNull()) the result should be false.
|
||||
// In the unlikely case that the object is malformed (issue #8070),
|
||||
// let's also return false.
|
||||
return false;
|
||||
}
|
||||
return *v1 == v2;
|
||||
}
|
||||
|
||||
// Check if two JSON-encoded values match with the NE relation
|
||||
static bool check_NE(const rjson::value* v1, const rjson::value& v2) {
|
||||
return !v1 || *v1 != v2; // null is unequal to anything.
|
||||
return !check_EQ(v1, v2);
|
||||
}
|
||||
|
||||
// Check if two JSON-encoded values match with the BEGINS_WITH relation
|
||||
@@ -298,6 +343,8 @@ static bool check_NOT_NULL(const rjson::value* val) {
|
||||
|
||||
// Only types S, N or B (string, number or bytes) may be compared by the
|
||||
// various comparison operators - lt, le, gt, ge, and between.
|
||||
// Note that in particular, if the value is missing (v->IsNull()), this
|
||||
// check returns false.
|
||||
static bool check_comparable_type(const rjson::value& v) {
|
||||
if (!v.IsObject() || v.MemberCount() != 1) {
|
||||
return false;
|
||||
|
||||
@@ -331,15 +331,15 @@ void set_column_family(http_context& ctx, routes& r) {
|
||||
});
|
||||
|
||||
cf::get_memtable_columns_count.set(r, [&ctx] (std::unique_ptr<request> req) {
|
||||
return map_reduce_cf(ctx, req->param["name"], 0, [](column_family& cf) {
|
||||
return map_reduce_cf(ctx, req->param["name"], uint64_t{0}, [](column_family& cf) {
|
||||
return cf.active_memtable().partition_count();
|
||||
}, std::plus<int>());
|
||||
}, std::plus<>());
|
||||
});
|
||||
|
||||
cf::get_all_memtable_columns_count.set(r, [&ctx] (std::unique_ptr<request> req) {
|
||||
return map_reduce_cf(ctx, 0, [](column_family& cf) {
|
||||
return map_reduce_cf(ctx, uint64_t{0}, [](column_family& cf) {
|
||||
return cf.active_memtable().partition_count();
|
||||
}, std::plus<int>());
|
||||
}, std::plus<>());
|
||||
});
|
||||
|
||||
cf::get_memtable_on_heap_size.set(r, [] (const_req req) {
|
||||
|
||||
16
cdc/log.cc
16
cdc/log.cc
@@ -980,9 +980,9 @@ static bytes get_bytes(const atomic_cell_view& acv) {
|
||||
return acv.value().linearize();
|
||||
}
|
||||
|
||||
static bytes_view get_bytes_view(const atomic_cell_view& acv, std::vector<bytes>& buf) {
|
||||
static bytes_view get_bytes_view(const atomic_cell_view& acv, std::forward_list<bytes>& buf) {
|
||||
return acv.value().is_fragmented()
|
||||
? bytes_view{buf.emplace_back(acv.value().linearize())}
|
||||
? bytes_view{buf.emplace_front(acv.value().linearize())}
|
||||
: acv.value().first_fragment();
|
||||
}
|
||||
|
||||
@@ -1137,9 +1137,9 @@ struct process_row_visitor {
|
||||
|
||||
struct udt_visitor : public collection_visitor {
|
||||
std::vector<bytes_opt> _added_cells;
|
||||
std::vector<bytes>& _buf;
|
||||
std::forward_list<bytes>& _buf;
|
||||
|
||||
udt_visitor(ttl_opt& ttl_column, size_t num_keys, std::vector<bytes>& buf)
|
||||
udt_visitor(ttl_opt& ttl_column, size_t num_keys, std::forward_list<bytes>& buf)
|
||||
: collection_visitor(ttl_column), _added_cells(num_keys), _buf(buf) {}
|
||||
|
||||
void live_collection_cell(bytes_view key, const atomic_cell_view& cell) {
|
||||
@@ -1148,7 +1148,7 @@ struct process_row_visitor {
|
||||
}
|
||||
};
|
||||
|
||||
std::vector<bytes> buf;
|
||||
std::forward_list<bytes> buf;
|
||||
udt_visitor v(_ttl_column, type.size(), buf);
|
||||
|
||||
visit_collection(v);
|
||||
@@ -1167,9 +1167,9 @@ struct process_row_visitor {
|
||||
|
||||
struct map_or_list_visitor : public collection_visitor {
|
||||
std::vector<std::pair<bytes_view, bytes_view>> _added_cells;
|
||||
std::vector<bytes>& _buf;
|
||||
std::forward_list<bytes>& _buf;
|
||||
|
||||
map_or_list_visitor(ttl_opt& ttl_column, std::vector<bytes>& buf)
|
||||
map_or_list_visitor(ttl_opt& ttl_column, std::forward_list<bytes>& buf)
|
||||
: collection_visitor(ttl_column), _buf(buf) {}
|
||||
|
||||
void live_collection_cell(bytes_view key, const atomic_cell_view& cell) {
|
||||
@@ -1178,7 +1178,7 @@ struct process_row_visitor {
|
||||
}
|
||||
};
|
||||
|
||||
std::vector<bytes> buf;
|
||||
std::forward_list<bytes> buf;
|
||||
map_or_list_visitor v(_ttl_column, buf);
|
||||
|
||||
visit_collection(v);
|
||||
|
||||
@@ -855,6 +855,7 @@ scylla_core = (['database.cc',
|
||||
'utils/error_injection.cc',
|
||||
'mutation_writer/timestamp_based_splitting_writer.cc',
|
||||
'mutation_writer/shard_based_splitting_writer.cc',
|
||||
'mutation_writer/feed_writers.cc',
|
||||
'lua.cc',
|
||||
] + [Antlr3Grammar('cql3/Cql.g')] + [Thrift('interface/cassandra.thrift', 'Cassandra')]
|
||||
)
|
||||
|
||||
@@ -192,9 +192,12 @@ public:
|
||||
|
||||
virtual ::shared_ptr<terminal> bind(const query_options& options) override {
|
||||
auto bytes = bind_and_get(options);
|
||||
if (!bytes) {
|
||||
if (bytes.is_null()) {
|
||||
return ::shared_ptr<terminal>{};
|
||||
}
|
||||
if (bytes.is_unset_value()) {
|
||||
return UNSET_VALUE;
|
||||
}
|
||||
return ::make_shared<constants::value>(std::move(cql3::raw_value::make_value(to_bytes(*bytes))));
|
||||
}
|
||||
};
|
||||
|
||||
@@ -27,7 +27,9 @@
|
||||
#include <fmt/ostream.h>
|
||||
#include <unordered_map>
|
||||
|
||||
#include "cql3/constants.hh"
|
||||
#include "cql3/lists.hh"
|
||||
#include "cql3/statements/request_validations.hh"
|
||||
#include "cql3/tuples.hh"
|
||||
#include "index/secondary_index_manager.hh"
|
||||
#include "types/list.hh"
|
||||
@@ -417,6 +419,8 @@ bool is_one_of(const column_value& col, term& rhs, const column_value_eval_bag&
|
||||
} else if (auto mkr = dynamic_cast<lists::marker*>(&rhs)) {
|
||||
// This is `a IN ?`. RHS elements are values representable as bytes_opt.
|
||||
const auto values = static_pointer_cast<lists::value>(mkr->bind(bag.options));
|
||||
statements::request_validations::check_not_null(
|
||||
values, "Invalid null value for column %s", col.col->name_as_text());
|
||||
return boost::algorithm::any_of(values->get_elements(), [&] (const bytes_opt& b) {
|
||||
return equal(b, col, bag);
|
||||
});
|
||||
@@ -568,7 +572,8 @@ const auto deref = boost::adaptors::transformed([] (const bytes_opt& b) { return
|
||||
|
||||
/// Returns possible values from t, which must be RHS of IN.
|
||||
value_list get_IN_values(
|
||||
const ::shared_ptr<term>& t, const query_options& options, const serialized_compare& comparator) {
|
||||
const ::shared_ptr<term>& t, const query_options& options, const serialized_compare& comparator,
|
||||
sstring_view column_name) {
|
||||
// RHS is prepared differently for different CQL cases. Cast it dynamically to discern which case this is.
|
||||
if (auto dv = dynamic_pointer_cast<lists::delayed_value>(t)) {
|
||||
// Case `a IN (1,2,3)`.
|
||||
@@ -578,8 +583,12 @@ value_list get_IN_values(
|
||||
return to_sorted_vector(std::move(result_range), comparator);
|
||||
} else if (auto mkr = dynamic_pointer_cast<lists::marker>(t)) {
|
||||
// Case `a IN ?`. Collect all list-element values.
|
||||
const auto val = static_pointer_cast<lists::value>(mkr->bind(options));
|
||||
return to_sorted_vector(val->get_elements() | non_null | deref, comparator);
|
||||
const auto val = mkr->bind(options);
|
||||
if (val == constants::UNSET_VALUE) {
|
||||
throw exceptions::invalid_request_exception(format("Invalid unset value for column {}", column_name));
|
||||
}
|
||||
statements::request_validations::check_not_null(val, "Invalid null value for IN tuple");
|
||||
return to_sorted_vector(static_pointer_cast<lists::value>(val)->get_elements() | non_null | deref, comparator);
|
||||
}
|
||||
throw std::logic_error(format("get_IN_values(single column) on invalid term {}", *t));
|
||||
}
|
||||
@@ -686,7 +695,7 @@ value_set possible_lhs_values(const column_definition* cdef, const expression& e
|
||||
return oper.op == oper_t::EQ ? value_set(value_list{*val})
|
||||
: to_range(oper.op, *val);
|
||||
} else if (oper.op == oper_t::IN) {
|
||||
return get_IN_values(oper.rhs, options, type->as_less_comparator());
|
||||
return get_IN_values(oper.rhs, options, type->as_less_comparator(), cdef->name_as_text());
|
||||
}
|
||||
throw std::logic_error(format("possible_lhs_values: unhandled operator {}", oper));
|
||||
},
|
||||
|
||||
@@ -305,6 +305,12 @@ maps::setter_by_key::execute(mutation& m, const clustering_key_prefix& prefix, c
|
||||
assert(column.type->is_multi_cell()); // "Attempted to set a value for a single key on a frozen map"m
|
||||
auto key = _k->bind_and_get(params._options);
|
||||
auto value = _t->bind_and_get(params._options);
|
||||
if (value.is_unset_value()) {
|
||||
return;
|
||||
}
|
||||
if (key.is_unset_value() || value.is_unset_value()) {
|
||||
throw invalid_request_exception("Invalid unset map key");
|
||||
}
|
||||
if (!key) {
|
||||
throw invalid_request_exception("Invalid null map key");
|
||||
}
|
||||
|
||||
@@ -315,7 +315,7 @@ sets::discarder::execute(mutation& m, const clustering_key_prefix& row_key, cons
|
||||
assert(column.type->is_multi_cell()); // "Attempted to remove items from a frozen set";
|
||||
|
||||
auto&& value = _t->bind(params._options);
|
||||
if (!value) {
|
||||
if (!value || value == constants::UNSET_VALUE) {
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
@@ -306,6 +306,13 @@ create_index_statement::announce_migration(service::storage_proxy& proxy, bool i
|
||||
format("Index {} is a duplicate of existing index {}", index.name(), existing_index.value().name()));
|
||||
}
|
||||
}
|
||||
auto index_table_name = secondary_index::index_table_name(accepted_name);
|
||||
if (db.has_schema(keyspace(), index_table_name)) {
|
||||
return make_exception_future<::shared_ptr<cql_transport::event::schema_change>>(
|
||||
exceptions::invalid_request_exception(format("Index {} cannot be created, because table {} already exists",
|
||||
accepted_name, index_table_name))
|
||||
);
|
||||
}
|
||||
++_cql_stats->secondary_index_creates;
|
||||
schema_builder builder{schema};
|
||||
builder.with_index(index);
|
||||
|
||||
@@ -456,7 +456,7 @@ generate_base_key_from_index_pk(const partition_key& index_pk, const std::option
|
||||
if (!view_col) {
|
||||
throw std::runtime_error(format("Base key column not found in the view: {}", base_col.name_as_text()));
|
||||
}
|
||||
if (base_col.type != view_col->type) {
|
||||
if (base_col.type->without_reversed() != *view_col->type) {
|
||||
throw std::runtime_error(format("Mismatched types for base and view columns {}: {} and {}",
|
||||
base_col.name_as_text(), base_col.type->cql3_type_name(), view_col->type->cql3_type_name()));
|
||||
}
|
||||
@@ -1120,7 +1120,11 @@ query::partition_slice indexed_table_select_statement::get_partition_slice_for_g
|
||||
if (single_ck_restrictions) {
|
||||
auto prefix_restrictions = single_ck_restrictions->get_longest_prefix_restrictions();
|
||||
auto clustering_restrictions_from_base = ::make_shared<restrictions::single_column_clustering_key_restrictions>(_view_schema, *prefix_restrictions);
|
||||
const auto indexed_column = _view_schema->get_column_definition(to_bytes(_index.target_column()));
|
||||
for (auto restriction_it : clustering_restrictions_from_base->restrictions()) {
|
||||
if (restriction_it.first == indexed_column) {
|
||||
continue; // In the index table, the indexed column is the partition (not clustering) key.
|
||||
}
|
||||
clustering_restrictions->merge_with(restriction_it.second);
|
||||
}
|
||||
}
|
||||
|
||||
51
database.cc
51
database.cc
@@ -572,9 +572,6 @@ void database::set_format_by_config() {
|
||||
}
|
||||
|
||||
database::~database() {
|
||||
_read_concurrency_sem.clear_inactive_reads();
|
||||
_streaming_concurrency_sem.clear_inactive_reads();
|
||||
_system_read_concurrency_sem.clear_inactive_reads();
|
||||
}
|
||||
|
||||
void database::update_version(const utils::UUID& version) {
|
||||
@@ -662,11 +659,22 @@ future<> database::parse_system_tables(distributed<service::storage_proxy>& prox
|
||||
});
|
||||
}).then([&proxy, &mm, this] {
|
||||
return do_parse_schema_tables(proxy, db::schema_tables::VIEWS, [this, &proxy, &mm] (schema_result_value_type &v) {
|
||||
return create_views_from_schema_partition(proxy, v.second).then([this, &mm] (std::vector<view_ptr> views) {
|
||||
return parallel_for_each(views.begin(), views.end(), [this, &mm] (auto&& v) {
|
||||
return this->add_column_family_and_make_directory(v).then([this, &mm, v] {
|
||||
return maybe_update_legacy_secondary_index_mv_schema(mm.local(), *this, v);
|
||||
});
|
||||
return create_views_from_schema_partition(proxy, v.second).then([this, &mm, &proxy] (std::vector<view_ptr> views) {
|
||||
return parallel_for_each(views.begin(), views.end(), [this, &mm, &proxy] (auto&& v) {
|
||||
// TODO: Remove once computed columns are guaranteed to be featured in the whole cluster.
|
||||
// We fix the schema in place here in order to avoid races (write commands coming from other coordinators).
|
||||
view_ptr fixed_v = maybe_fix_legacy_secondary_index_mv_schema(*this, v, nullptr, preserve_version::yes);
|
||||
view_ptr v_to_add = fixed_v ? fixed_v : v;
|
||||
future<> f = this->add_column_family_and_make_directory(v_to_add);
|
||||
if (bool(fixed_v)) {
|
||||
v_to_add = fixed_v;
|
||||
auto&& keyspace = find_keyspace(v->ks_name()).metadata();
|
||||
auto mutations = db::schema_tables::make_update_view_mutations(keyspace, view_ptr(v), fixed_v, api::new_timestamp(), true);
|
||||
f = f.then([this, &proxy, mutations = std::move(mutations)] {
|
||||
return db::schema_tables::merge_schema(proxy, _feat, std::move(mutations));
|
||||
});
|
||||
}
|
||||
return f;
|
||||
});
|
||||
});
|
||||
});
|
||||
@@ -1743,7 +1751,11 @@ sstring database::get_available_index_name(const sstring &ks_name, const sstring
|
||||
auto base_name = index_metadata::get_default_index_name(cf_name, index_name_root);
|
||||
sstring accepted_name = base_name;
|
||||
int i = 0;
|
||||
while (existing_names.contains(accepted_name)) {
|
||||
auto name_accepted = [&] {
|
||||
auto index_table_name = secondary_index::index_table_name(accepted_name);
|
||||
return !has_schema(ks_name, index_table_name) && !existing_names.contains(accepted_name);
|
||||
};
|
||||
while (!name_accepted()) {
|
||||
accepted_name = base_name + "_" + std::to_string(++i);
|
||||
}
|
||||
return accepted_name;
|
||||
@@ -1808,6 +1820,13 @@ future<>
|
||||
database::stop() {
|
||||
assert(!_large_data_handler->running());
|
||||
|
||||
// Inactive reads might hold on to sstables, blocking the
|
||||
// `sstables_manager::close()` calls below. No one will come back for these
|
||||
// reads at this point so clear them before proceeding with the shutdown.
|
||||
_read_concurrency_sem.clear_inactive_reads();
|
||||
_streaming_concurrency_sem.clear_inactive_reads();
|
||||
_system_read_concurrency_sem.clear_inactive_reads();
|
||||
|
||||
// try to ensure that CL has done disk flushing
|
||||
future<> maybe_shutdown_commitlog = _commitlog != nullptr ? _commitlog->shutdown() : make_ready_future<>();
|
||||
return maybe_shutdown_commitlog.then([this] {
|
||||
@@ -1859,26 +1878,28 @@ future<> database::truncate(const keyspace& ks, column_family& cf, timestamp_fun
|
||||
|
||||
return cf.run_with_compaction_disabled([this, &cf, should_flush, auto_snapshot, tsf = std::move(tsf), low_mark]() mutable {
|
||||
future<> f = make_ready_future<>();
|
||||
if (should_flush) {
|
||||
bool did_flush = false;
|
||||
if (should_flush && cf.can_flush()) {
|
||||
// TODO:
|
||||
// this is not really a guarantee at all that we've actually
|
||||
// gotten all things to disk. Again, need queue-ish or something.
|
||||
f = cf.flush();
|
||||
did_flush = true;
|
||||
} else {
|
||||
f = cf.clear();
|
||||
}
|
||||
return f.then([this, &cf, auto_snapshot, tsf = std::move(tsf), low_mark, should_flush] {
|
||||
return f.then([this, &cf, auto_snapshot, tsf = std::move(tsf), low_mark, should_flush, did_flush] {
|
||||
dblog.debug("Discarding sstable data for truncated CF + indexes");
|
||||
// TODO: notify truncation
|
||||
|
||||
return tsf().then([this, &cf, auto_snapshot, low_mark, should_flush](db_clock::time_point truncated_at) {
|
||||
return tsf().then([this, &cf, auto_snapshot, low_mark, should_flush, did_flush](db_clock::time_point truncated_at) {
|
||||
future<> f = make_ready_future<>();
|
||||
if (auto_snapshot) {
|
||||
auto name = format("{:d}-{}", truncated_at.time_since_epoch().count(), cf.schema()->cf_name());
|
||||
f = cf.snapshot(*this, name);
|
||||
}
|
||||
return f.then([this, &cf, truncated_at, low_mark, should_flush] {
|
||||
return cf.discard_sstables(truncated_at).then([this, &cf, truncated_at, low_mark, should_flush](db::replay_position rp) {
|
||||
return f.then([this, &cf, truncated_at, low_mark, should_flush, did_flush] {
|
||||
return cf.discard_sstables(truncated_at).then([this, &cf, truncated_at, low_mark, should_flush, did_flush](db::replay_position rp) {
|
||||
// TODO: indexes.
|
||||
// Note: since discard_sstables was changed to only count tables owned by this shard,
|
||||
// we can get zero rp back. Changed assert, and ensure we save at least low_mark.
|
||||
@@ -1886,7 +1907,7 @@ future<> database::truncate(const keyspace& ks, column_family& cf, timestamp_fun
|
||||
// We nowadays do not flush tables with sstables but autosnapshot=false. This means
|
||||
// the low_mark assertion does not hold, because we maybe/probably never got around to
|
||||
// creating the sstables that would create them.
|
||||
assert(!should_flush || low_mark <= rp || rp == db::replay_position());
|
||||
assert(!did_flush || low_mark <= rp || rp == db::replay_position());
|
||||
rp = std::max(low_mark, rp);
|
||||
return truncate_views(cf, truncated_at, should_flush).then([&cf, truncated_at, rp] {
|
||||
// save_truncation_record() may actually fail after we cached the truncation time
|
||||
|
||||
@@ -224,6 +224,10 @@ public:
|
||||
return bool(_seal_immediate_fn);
|
||||
}
|
||||
|
||||
bool can_flush() const {
|
||||
return may_flush() && !empty();
|
||||
}
|
||||
|
||||
bool empty() const {
|
||||
for (auto& m : _memtables) {
|
||||
if (!m->empty()) {
|
||||
@@ -782,6 +786,8 @@ public:
|
||||
// to them, and then pass that + 1 as "start".
|
||||
future<std::vector<sstables::entry_descriptor>> reshuffle_sstables(std::set<int64_t> all_generations, int64_t start);
|
||||
|
||||
bool can_flush() const;
|
||||
|
||||
// FIXME: this is just an example, should be changed to something more
|
||||
// general. compact_all_sstables() starts a compaction of all sstables.
|
||||
// It doesn't flush the current memtable first. It's just an ad-hoc method,
|
||||
|
||||
@@ -1208,7 +1208,42 @@ static void merge_tables_and_views(distributed<service::storage_proxy>& proxy,
|
||||
return create_table_from_mutations(proxy, std::move(sm));
|
||||
});
|
||||
auto views_diff = diff_table_or_view(proxy, std::move(views_before), std::move(views_after), [&] (schema_mutations sm) {
|
||||
return create_view_from_mutations(proxy, std::move(sm));
|
||||
// The view schema mutation should be created with reference to the base table schema because we definitely know it by now.
|
||||
// If we don't do it we are leaving a window where write commands to this schema are illegal.
|
||||
// There are 3 possibilities:
|
||||
// 1. The table was altered - in this case we want the view to correspond to this new table schema.
|
||||
// 2. The table was just created - the table is guaranteed to be published with the view in that case.
|
||||
// 3. The view itself was altered - in that case we already know the base table so we can take it from
|
||||
// the database object.
|
||||
view_ptr vp = create_view_from_mutations(proxy, std::move(sm));
|
||||
schema_ptr base_schema;
|
||||
for (auto&& s : tables_diff.altered) {
|
||||
if (s.new_schema.get()->ks_name() == vp->ks_name() && s.new_schema.get()->cf_name() == vp->view_info()->base_name() ) {
|
||||
base_schema = s.new_schema;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!base_schema) {
|
||||
for (auto&& s : tables_diff.created) {
|
||||
if (s.get()->ks_name() == vp->ks_name() && s.get()->cf_name() == vp->view_info()->base_name() ) {
|
||||
base_schema = s;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!base_schema) {
|
||||
base_schema = proxy.local().local_db().find_schema(vp->ks_name(), vp->view_info()->base_name());
|
||||
}
|
||||
|
||||
// Now when we have a referenced base - just in case we are registering an old view (this can happen in a mixed cluster)
|
||||
// let's make it write-enabled by updating its computed columns.
|
||||
view_ptr fixed_vp = maybe_fix_legacy_secondary_index_mv_schema(proxy.local().get_db().local(), vp, base_schema, preserve_version::yes);
|
||||
if(fixed_vp) {
|
||||
vp = fixed_vp;
|
||||
}
|
||||
vp->view_info()->set_base_info(vp->view_info()->make_base_dependent_view_info(*base_schema));
|
||||
return vp;
|
||||
});
|
||||
|
||||
proxy.local().get_db().invoke_on_all([&] (database& db) {
|
||||
@@ -3033,8 +3068,7 @@ std::vector<sstring> all_table_names(schema_features features) {
|
||||
boost::adaptors::transformed([] (auto schema) { return schema->cf_name(); }));
|
||||
}
|
||||
|
||||
future<> maybe_update_legacy_secondary_index_mv_schema(service::migration_manager& mm, database& db, view_ptr v) {
|
||||
// TODO(sarna): Remove once computed columns are guaranteed to be featured in the whole cluster.
|
||||
view_ptr maybe_fix_legacy_secondary_index_mv_schema(database& db, const view_ptr& v, schema_ptr base_schema, preserve_version preserve_version) {
|
||||
// Legacy format for a secondary index used a hardcoded "token" column, which ensured a proper
|
||||
// order for indexed queries. This "token" column is now implemented as a computed column,
|
||||
// but for the sake of compatibility we assume that there might be indexes created in the legacy
|
||||
@@ -3042,26 +3076,32 @@ future<> maybe_update_legacy_secondary_index_mv_schema(service::migration_manage
|
||||
// columns marked as computed (because they were either created on a node that supports computed
|
||||
// columns or were fixed by this utility function), it's safe to remove this function altogether.
|
||||
if (v->clustering_key_size() == 0) {
|
||||
return make_ready_future<>();
|
||||
return view_ptr(nullptr);
|
||||
}
|
||||
const column_definition& first_view_ck = v->clustering_key_columns().front();
|
||||
if (first_view_ck.is_computed()) {
|
||||
return make_ready_future<>();
|
||||
return view_ptr(nullptr);
|
||||
}
|
||||
|
||||
if (!base_schema) {
|
||||
base_schema = db.find_schema(v->view_info()->base_id());
|
||||
}
|
||||
|
||||
table& base = db.find_column_family(v->view_info()->base_id());
|
||||
schema_ptr base_schema = base.schema();
|
||||
// If the first clustering key part of a view is a column with name not found in base schema,
|
||||
// it implies it might be backing an index created before computed columns were introduced,
|
||||
// and as such it must be recreated properly.
|
||||
if (!base_schema->columns_by_name().contains(first_view_ck.name())) {
|
||||
schema_builder builder{schema_ptr(v)};
|
||||
builder.mark_column_computed(first_view_ck.name(), std::make_unique<token_column_computation>());
|
||||
return mm.announce_view_update(view_ptr(builder.build()), true);
|
||||
if (preserve_version) {
|
||||
builder.with_version(v->version());
|
||||
}
|
||||
return view_ptr(builder.build());
|
||||
}
|
||||
return make_ready_future<>();
|
||||
return view_ptr(nullptr);
|
||||
}
|
||||
|
||||
|
||||
namespace legacy {
|
||||
|
||||
table_schema_version schema_mutations::digest() const {
|
||||
|
||||
@@ -238,7 +238,9 @@ std::vector<mutation> make_update_view_mutations(lw_shared_ptr<keyspace_metadata
|
||||
|
||||
std::vector<mutation> make_drop_view_mutations(lw_shared_ptr<keyspace_metadata> keyspace, view_ptr view, api::timestamp_type timestamp);
|
||||
|
||||
future<> maybe_update_legacy_secondary_index_mv_schema(service::migration_manager& mm, database& db, view_ptr v);
|
||||
class preserve_version_tag {};
|
||||
using preserve_version = bool_class<preserve_version_tag>;
|
||||
view_ptr maybe_fix_legacy_secondary_index_mv_schema(database& db, const view_ptr& v, schema_ptr base_schema, preserve_version preserve_version);
|
||||
|
||||
sstring serialize_kind(column_kind kind);
|
||||
column_kind deserialize_kind(sstring kind);
|
||||
|
||||
@@ -43,9 +43,13 @@
|
||||
|
||||
namespace db {
|
||||
|
||||
future<> snapshot_ctl::check_snapshot_not_exist(sstring ks_name, sstring name) {
|
||||
future<> snapshot_ctl::check_snapshot_not_exist(sstring ks_name, sstring name, std::optional<std::vector<sstring>> filter) {
|
||||
auto& ks = _db.local().find_keyspace(ks_name);
|
||||
return parallel_for_each(ks.metadata()->cf_meta_data(), [this, ks_name = std::move(ks_name), name = std::move(name)] (auto& pair) {
|
||||
return parallel_for_each(ks.metadata()->cf_meta_data(), [this, ks_name = std::move(ks_name), name = std::move(name), filter = std::move(filter)] (auto& pair) {
|
||||
auto& cf_name = pair.first;
|
||||
if (filter && std::find(filter->begin(), filter->end(), cf_name) == filter->end()) {
|
||||
return make_ready_future<>();
|
||||
}
|
||||
auto& cf = _db.local().find_column_family(pair.second);
|
||||
return cf.snapshot_exists(name).then([ks_name = std::move(ks_name), name] (bool exists) {
|
||||
if (exists) {
|
||||
@@ -111,7 +115,7 @@ future<> snapshot_ctl::take_column_family_snapshot(sstring ks_name, std::vector<
|
||||
}
|
||||
|
||||
return run_snapshot_modify_operation([this, ks_name = std::move(ks_name), tables = std::move(tables), tag = std::move(tag)] {
|
||||
return check_snapshot_not_exist(ks_name, tag).then([this, ks_name, tables = std::move(tables), tag] {
|
||||
return check_snapshot_not_exist(ks_name, tag, tables).then([this, ks_name, tables, tag] {
|
||||
return do_with(std::vector<sstring>(std::move(tables)),[this, ks_name, tag](const std::vector<sstring>& tables) {
|
||||
return do_for_each(tables, [ks_name, tag, this] (const sstring& table_name) {
|
||||
if (table_name.find(".") != sstring::npos) {
|
||||
|
||||
@@ -40,6 +40,8 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <vector>
|
||||
|
||||
#include <seastar/core/sharded.hh>
|
||||
#include <seastar/core/future.hh>
|
||||
#include "database.hh"
|
||||
@@ -112,7 +114,7 @@ private:
|
||||
seastar::rwlock _lock;
|
||||
seastar::gate _ops;
|
||||
|
||||
future<> check_snapshot_not_exist(sstring ks_name, sstring name);
|
||||
future<> check_snapshot_not_exist(sstring ks_name, sstring name, std::optional<std::vector<sstring>> filter = {});
|
||||
|
||||
template <typename Func>
|
||||
std::result_of_t<Func()> run_snapshot_modify_operation(Func&&);
|
||||
|
||||
@@ -58,7 +58,8 @@ public:
|
||||
|
||||
template<typename T, typename... Args>
|
||||
void feed_hash(const T& value, Args&&... args) {
|
||||
std::visit([&] (auto& hasher) noexcept -> void {
|
||||
// FIXME uncomment the noexcept marking once clang bug 50994 is fixed or gcc compilation is turned on
|
||||
std::visit([&] (auto& hasher) /* noexcept(noexcept(::feed_hash(hasher, value, args...))) */ -> void {
|
||||
::feed_hash(hasher, value, std::forward<Args>(args)...);
|
||||
}, _impl);
|
||||
};
|
||||
|
||||
5
dist/common/scripts/node_exporter_install
vendored
5
dist/common/scripts/node_exporter_install
vendored
@@ -24,6 +24,8 @@ import os
|
||||
import sys
|
||||
import tempfile
|
||||
import tarfile
|
||||
import shutil
|
||||
import glob
|
||||
from scylla_util import *
|
||||
import argparse
|
||||
|
||||
@@ -61,6 +63,9 @@ if __name__ == '__main__':
|
||||
f.write(data)
|
||||
with tarfile.open('/var/tmp/node_exporter-{version}.linux-amd64.tar.gz'.format(version=VERSION)) as tf:
|
||||
tf.extractall(INSTALL_DIR)
|
||||
shutil.chown(f'{INSTALL_DIR}/node_exporter-{VERSION}.linux-amd64', 'root', 'root')
|
||||
for f in glob.glob(f'{INSTALL_DIR}/node_exporter-{VERSION}.linux-amd64/*'):
|
||||
shutil.chown(f, 'root', 'root')
|
||||
os.remove('/var/tmp/node_exporter-{version}.linux-amd64.tar.gz'.format(version=VERSION))
|
||||
if node_exporter_p.exists():
|
||||
node_exporter_p.unlink()
|
||||
|
||||
3
dist/common/scripts/scylla_coredump_setup
vendored
3
dist/common/scripts/scylla_coredump_setup
vendored
@@ -87,7 +87,8 @@ WantedBy=multi-user.target
|
||||
run('sysctl -p /etc/sysctl.d/99-scylla-coredump.conf')
|
||||
|
||||
fp = tempfile.NamedTemporaryFile()
|
||||
fp.write(b'kill -SEGV $$')
|
||||
fp.write(b'ulimit -c unlimited\n')
|
||||
fp.write(b'kill -SEGV $$\n')
|
||||
fp.flush()
|
||||
p = subprocess.Popen(['/bin/bash', fp.name], stdout=subprocess.PIPE)
|
||||
pid = p.pid
|
||||
|
||||
13
dist/common/scripts/scylla_cpuscaling_setup
vendored
13
dist/common/scripts/scylla_cpuscaling_setup
vendored
@@ -22,6 +22,7 @@
|
||||
|
||||
import os
|
||||
import sys
|
||||
import argparse
|
||||
import shlex
|
||||
import distro
|
||||
from scylla_util import *
|
||||
@@ -33,12 +34,22 @@ if __name__ == '__main__':
|
||||
if os.getuid() > 0:
|
||||
print('Requires root permission.')
|
||||
sys.exit(1)
|
||||
if not os.path.exists('/sys/devices/system/cpu/cpufreq/policy0/scaling_governor'):
|
||||
parser = argparse.ArgumentParser(description='CPU scaling setup script for Scylla.')
|
||||
parser.add_argument('--force', dest='force', action='store_true',
|
||||
help='force running setup even CPU scaling unsupported')
|
||||
args = parser.parse_args()
|
||||
|
||||
if not args.force and not os.path.exists('/sys/devices/system/cpu/cpu0/cpufreq/scaling_governor'):
|
||||
print('This computer doesn\'t supported CPU scaling configuration.')
|
||||
sys.exit(0)
|
||||
if is_debian_variant():
|
||||
if not shutil.which('cpufreq-set'):
|
||||
apt_install('cpufrequtils')
|
||||
try:
|
||||
ondemand = systemd_unit('ondemand')
|
||||
ondemand.disable()
|
||||
except:
|
||||
pass
|
||||
cfg = sysconfig_parser('/etc/default/cpufrequtils')
|
||||
cfg.set('GOVERNOR', 'performance')
|
||||
cfg.commit()
|
||||
|
||||
6
dist/common/scripts/scylla_ntp_setup
vendored
6
dist/common/scripts/scylla_ntp_setup
vendored
@@ -91,12 +91,12 @@ if __name__ == '__main__':
|
||||
with open('/etc/ntp.conf') as f:
|
||||
conf = f.read()
|
||||
if args.subdomain:
|
||||
conf2 = re.sub(r'server\s+([0-9]+)\.(\S+)\.pool\.ntp\.org', 'server \\1.{}.pool.ntp.org'.format(args.subdomain), conf, flags=re.MULTILINE)
|
||||
conf2 = re.sub(r'(server|pool)\s+([0-9]+)\.(\S+)\.pool\.ntp\.org', '\\1 \\2.{}.pool.ntp.org'.format(args.subdomain), conf, flags=re.MULTILINE)
|
||||
with open('/etc/ntp.conf', 'w') as f:
|
||||
f.write(conf2)
|
||||
conf = conf2
|
||||
match = re.search(r'^server\s+(\S*)(\s+\S+)?', conf, flags=re.MULTILINE)
|
||||
server = match.group(1)
|
||||
match = re.search(r'^(server|pool)\s+(\S*)(\s+\S+)?', conf, flags=re.MULTILINE)
|
||||
server = match.group(2)
|
||||
ntpd = systemd_unit('ntpd.service')
|
||||
ntpd.stop()
|
||||
# ignore error, ntpd may able to adjust clock later
|
||||
|
||||
1
dist/common/scripts/scylla_prepare
vendored
1
dist/common/scripts/scylla_prepare
vendored
@@ -143,4 +143,3 @@ if __name__ == '__main__':
|
||||
print(f'Exception occurred while creating perftune.yaml: {e}')
|
||||
print('To fix the error, please re-run scylla_setup.')
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
2
dist/common/scripts/scylla_raid_setup
vendored
2
dist/common/scripts/scylla_raid_setup
vendored
@@ -160,7 +160,7 @@ Before=scylla-server.service
|
||||
After={after}
|
||||
|
||||
[Mount]
|
||||
What=UUID={uuid}
|
||||
What=/dev/disk/by-uuid/{uuid}
|
||||
Where={mount_at}
|
||||
Type=xfs
|
||||
Options=noatime
|
||||
|
||||
4
dist/common/scripts/scylla_util.py
vendored
4
dist/common/scripts/scylla_util.py
vendored
@@ -34,6 +34,7 @@ from pathlib import Path
|
||||
|
||||
import distro
|
||||
|
||||
from multiprocessing import cpu_count
|
||||
|
||||
def scriptsdir_p():
|
||||
p = Path(sys.argv[0]).resolve()
|
||||
@@ -308,9 +309,10 @@ class gcp_instance:
|
||||
logging.warning(
|
||||
"This machine doesn't have enough CPUs for allocated number of NVMEs (at least 32 cpus for >=16 disks). Performance will suffer.")
|
||||
return False
|
||||
diskSize = self.firstNvmeSize
|
||||
if diskCount < 1:
|
||||
logging.warning("No ephemeral disks were found.")
|
||||
return False
|
||||
diskSize = self.firstNvmeSize
|
||||
max_disktoramratio = 105
|
||||
# 30:1 Disk/RAM ratio must be kept at least(AWS), we relax this a little bit
|
||||
# on GCP we are OK with {max_disktoramratio}:1 , n1-standard-2 can cope with 1 disk, not more
|
||||
|
||||
2
dist/common/sysctl.d/99-scylla-aio.conf
vendored
2
dist/common/sysctl.d/99-scylla-aio.conf
vendored
@@ -1,2 +1,2 @@
|
||||
# Raise max AIO events
|
||||
fs.aio-max-nr = 1048576
|
||||
fs.aio-max-nr = 5578536
|
||||
|
||||
2
dist/common/systemd/scylla-fstrim.timer
vendored
2
dist/common/systemd/scylla-fstrim.timer
vendored
@@ -1,7 +1,5 @@
|
||||
[Unit]
|
||||
Description=Run Scylla fstrim daily
|
||||
After=scylla-server.service
|
||||
BindsTo=scylla-server.service
|
||||
|
||||
[Timer]
|
||||
OnCalendar=Sat *-*-* 00:00:00
|
||||
|
||||
@@ -9,9 +9,9 @@ if [[ $KVER =~ 3\.13\.0\-([0-9]+)-generic ]]; then
|
||||
else
|
||||
# expect failures in virtualized environments
|
||||
sysctl -p/usr/lib/sysctl.d/99-scylla-sched.conf || :
|
||||
sysctl -p/usr/lib/sysctl.d/99-scylla-aio.conf || :
|
||||
sysctl -p/usr/lib/sysctl.d/99-scylla-vm.conf || :
|
||||
sysctl -p/usr/lib/sysctl.d/99-scylla-inotify.conf || :
|
||||
sysctl -p/usr/lib/sysctl.d/99-scylla-aio.conf || :
|
||||
fi
|
||||
|
||||
#DEBHELPER#
|
||||
|
||||
2
dist/debian/debian/scylla-server.postrm
vendored
2
dist/debian/debian/scylla-server.postrm
vendored
@@ -12,8 +12,6 @@ case "$1" in
|
||||
if [ "$1" = "purge" ]; then
|
||||
rm -rf /etc/systemd/system/scylla-server.service.d/
|
||||
fi
|
||||
rm -f /etc/systemd/system/var-lib-systemd-coredump.mount
|
||||
rm -f /etc/systemd/system/var-lib-scylla.mount
|
||||
;;
|
||||
esac
|
||||
|
||||
|
||||
21
dist/redhat/scylla.spec
vendored
21
dist/redhat/scylla.spec
vendored
@@ -7,7 +7,7 @@ Group: Applications/Databases
|
||||
License: AGPLv3
|
||||
URL: http://www.scylladb.com/
|
||||
Source0: %{reloc_pkg}
|
||||
Requires: %{product}-server = %{version} %{product}-conf = %{version} %{product}-kernel-conf = %{version} %{product}-jmx = %{version} %{product}-tools = %{version} %{product}-tools-core = %{version}
|
||||
Requires: %{product}-server = %{version} %{product}-conf = %{version} %{product}-python3 = %{version} %{product}-kernel-conf = %{version} %{product}-jmx = %{version} %{product}-tools = %{version} %{product}-tools-core = %{version}
|
||||
Obsoletes: scylla-server < 1.1
|
||||
|
||||
%global _debugsource_template %{nil}
|
||||
@@ -52,7 +52,7 @@ Summary: The Scylla database server
|
||||
License: AGPLv3
|
||||
URL: http://www.scylladb.com/
|
||||
Requires: kernel >= 3.10.0-514
|
||||
Requires: %{product}-conf %{product}-python3
|
||||
Requires: %{product}-conf = %{version} %{product}-python3 = %{version}
|
||||
Conflicts: abrt
|
||||
AutoReqProv: no
|
||||
|
||||
@@ -76,13 +76,18 @@ getent passwd scylla || /usr/sbin/useradd -g scylla -s /sbin/nologin -r -d %{_sh
|
||||
%post server
|
||||
/opt/scylladb/scripts/scylla_post_install.sh
|
||||
|
||||
%systemd_post scylla-server.service
|
||||
if [ $1 -eq 1 ] ; then
|
||||
/usr/bin/systemctl preset scylla-server.service ||:
|
||||
fi
|
||||
|
||||
%preun server
|
||||
%systemd_preun scylla-server.service
|
||||
if [ $1 -eq 0 ] ; then
|
||||
/usr/bin/systemctl --no-reload disable scylla-server.service ||:
|
||||
/usr/bin/systemctl stop scylla-server.service ||:
|
||||
fi
|
||||
|
||||
%postun server
|
||||
%systemd_postun scylla-server.service
|
||||
/usr/bin/systemctl daemon-reload ||:
|
||||
|
||||
%posttrans server
|
||||
if [ -d /tmp/%{name}-%{version}-%{release} ]; then
|
||||
@@ -132,9 +137,9 @@ rm -rf $RPM_BUILD_ROOT
|
||||
%ghost /etc/systemd/system/scylla-server.service.d/capabilities.conf
|
||||
%ghost /etc/systemd/system/scylla-server.service.d/mounts.conf
|
||||
/etc/systemd/system/scylla-server.service.d/dependencies.conf
|
||||
%ghost /etc/systemd/system/var-lib-systemd-coredump.mount
|
||||
%ghost %config /etc/systemd/system/var-lib-systemd-coredump.mount
|
||||
%ghost /etc/systemd/system/scylla-cpupower.service
|
||||
%ghost /etc/systemd/system/var-lib-scylla.mount
|
||||
%ghost %config /etc/systemd/system/var-lib-scylla.mount
|
||||
|
||||
%package conf
|
||||
Group: Applications/Databases
|
||||
@@ -200,9 +205,9 @@ if Scylla is the main application on your server and you wish to optimize its la
|
||||
# We cannot use the sysctl_apply rpm macro because it is not present in 7.0
|
||||
# following is a "manual" expansion
|
||||
/usr/lib/systemd/systemd-sysctl 99-scylla-sched.conf >/dev/null 2>&1 || :
|
||||
/usr/lib/systemd/systemd-sysctl 99-scylla-aio.conf >/dev/null 2>&1 || :
|
||||
/usr/lib/systemd/systemd-sysctl 99-scylla-vm.conf >/dev/null 2>&1 || :
|
||||
/usr/lib/systemd/systemd-sysctl 99-scylla-inotify.conf >/dev/null 2>&1 || :
|
||||
/usr/lib/systemd/systemd-sysctl 99-scylla-aio.conf >/dev/null 2>&1 || :
|
||||
|
||||
%files kernel-conf
|
||||
%defattr(-,root,root)
|
||||
|
||||
@@ -1774,6 +1774,8 @@ future<> gossiper::do_shadow_round(std::unordered_set<gms::inet_address> nodes,
|
||||
}).handle_exception_type([node, &fall_back_to_syn_msg] (seastar::rpc::unknown_verb_error&) {
|
||||
logger.warn("Node {} does not support get_endpoint_states verb", node);
|
||||
fall_back_to_syn_msg = true;
|
||||
}).handle_exception_type([node, &nodes_down] (seastar::rpc::timeout_error&) {
|
||||
logger.warn("The get_endpoint_states verb to node {} was timeout", node);
|
||||
}).handle_exception_type([node, &nodes_down] (seastar::rpc::closed_error&) {
|
||||
nodes_down++;
|
||||
logger.warn("Node {} is down for get_endpoint_states verb", node);
|
||||
|
||||
@@ -62,7 +62,7 @@ struct appending_hash;
|
||||
template<typename H, typename T, typename... Args>
|
||||
requires Hasher<H>
|
||||
inline
|
||||
void feed_hash(H& h, const T& value, Args&&... args) noexcept {
|
||||
void feed_hash(H& h, const T& value, Args&&... args) noexcept(noexcept(std::declval<appending_hash<T>>()(h, value, args...))) {
|
||||
appending_hash<T>()(h, value, std::forward<Args>(args)...);
|
||||
};
|
||||
|
||||
|
||||
16
install.sh
16
install.sh
@@ -147,6 +147,10 @@ EOF
|
||||
chmod +x "$install"
|
||||
}
|
||||
|
||||
install() {
|
||||
command install -Z "$@"
|
||||
}
|
||||
|
||||
installconfig() {
|
||||
local perm="$1"
|
||||
local src="$2"
|
||||
@@ -197,13 +201,13 @@ if [ -z "$python3" ]; then
|
||||
fi
|
||||
rpython3=$(realpath -m "$root/$python3")
|
||||
if ! $nonroot; then
|
||||
retc="$root/etc"
|
||||
rsysconfdir="$root/$sysconfdir"
|
||||
rusr="$root/usr"
|
||||
rsystemd="$rusr/lib/systemd/system"
|
||||
retc=$(realpath -m "$root/etc")
|
||||
rsysconfdir=$(realpath -m "$root/$sysconfdir")
|
||||
rusr=$(realpath -m "$root/usr")
|
||||
rsystemd=$(realpath -m "$rusr/lib/systemd/system")
|
||||
rdoc="$rprefix/share/doc"
|
||||
rdata="$root/var/lib/scylla"
|
||||
rhkdata="$root/var/lib/scylla-housekeeping"
|
||||
rdata=$(realpath -m "$root/var/lib/scylla")
|
||||
rhkdata=$(realpath -m "$root/var/lib/scylla-housekeeping")
|
||||
else
|
||||
retc="$rprefix/etc"
|
||||
rsysconfdir="$rprefix/$sysconfdir"
|
||||
|
||||
@@ -1151,6 +1151,9 @@ flat_mutation_reader evictable_reader::recreate_reader() {
|
||||
_range_override.reset();
|
||||
_slice_override.reset();
|
||||
|
||||
_drop_partition_start = false;
|
||||
_drop_static_row = false;
|
||||
|
||||
if (_last_pkey) {
|
||||
bool partition_range_is_inclusive = true;
|
||||
|
||||
@@ -1236,13 +1239,25 @@ void evictable_reader::maybe_validate_partition_start(const flat_mutation_reader
|
||||
// is in range.
|
||||
if (_last_pkey) {
|
||||
const auto cmp_res = tri_cmp(*_last_pkey, ps.key());
|
||||
if (_drop_partition_start) { // should be the same partition
|
||||
if (_drop_partition_start) { // we expect to continue from the same partition
|
||||
// We cannot assume the partition we stopped the read at is still alive
|
||||
// when we recreate the reader. It might have been compacted away in the
|
||||
// meanwhile, so allow for a larger partition too.
|
||||
require(
|
||||
cmp_res == 0,
|
||||
"{}(): validation failed, expected partition with key equal to _last_pkey {} due to _drop_partition_start being set, but got {}",
|
||||
cmp_res <= 0,
|
||||
"{}(): validation failed, expected partition with key larger or equal to _last_pkey {} due to _drop_partition_start being set, but got {}",
|
||||
__FUNCTION__,
|
||||
*_last_pkey,
|
||||
ps.key());
|
||||
// Reset drop flags and next pos if we are not continuing from the same partition
|
||||
if (cmp_res < 0) {
|
||||
// Close previous partition, we are not going to continue it.
|
||||
push_mutation_fragment(*_schema, _permit, partition_end{});
|
||||
_drop_partition_start = false;
|
||||
_drop_static_row = false;
|
||||
_next_position_in_partition = position_in_partition::for_partition_start();
|
||||
_trim_range_tombstones = false;
|
||||
}
|
||||
} else { // should be a larger partition
|
||||
require(
|
||||
cmp_res < 0,
|
||||
@@ -1293,9 +1308,14 @@ bool evictable_reader::should_drop_fragment(const mutation_fragment& mf) {
|
||||
_drop_partition_start = false;
|
||||
return true;
|
||||
}
|
||||
if (_drop_static_row && mf.is_static_row()) {
|
||||
_drop_static_row = false;
|
||||
return true;
|
||||
// Unlike partition-start above, a partition is not guaranteed to have a
|
||||
// static row fragment. So reset the flag regardless of whether we could
|
||||
// drop one or not.
|
||||
// We are guaranteed to get here only right after dropping a partition-start,
|
||||
// so if we are not seeing a static row here, the partition doesn't have one.
|
||||
if (_drop_static_row) {
|
||||
_drop_static_row = false;
|
||||
return mf.is_static_row();
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
52
mutation_writer/feed_writers.cc
Normal file
52
mutation_writer/feed_writers.cc
Normal file
@@ -0,0 +1,52 @@
|
||||
/*
|
||||
* Copyright (C) 2021 ScyllaDB
|
||||
*/
|
||||
|
||||
/*
|
||||
* This file is part of Scylla.
|
||||
*
|
||||
* Scylla is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Affero General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* Scylla is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#include "feed_writers.hh"
|
||||
|
||||
namespace mutation_writer {
|
||||
|
||||
bucket_writer::bucket_writer(schema_ptr schema, std::pair<flat_mutation_reader, queue_reader_handle> queue_reader, reader_consumer& consumer)
|
||||
: _schema(schema)
|
||||
, _handle(std::move(queue_reader.second))
|
||||
, _consume_fut(consumer(std::move(queue_reader.first)))
|
||||
{ }
|
||||
|
||||
bucket_writer::bucket_writer(schema_ptr schema, reader_permit permit, reader_consumer& consumer)
|
||||
: bucket_writer(schema, make_queue_reader(schema, std::move(permit)), consumer)
|
||||
{ }
|
||||
|
||||
future<> bucket_writer::consume(mutation_fragment mf) {
|
||||
return _handle.push(std::move(mf));
|
||||
}
|
||||
|
||||
void bucket_writer::consume_end_of_stream() {
|
||||
_handle.push_end_of_stream();
|
||||
}
|
||||
|
||||
void bucket_writer::abort(std::exception_ptr ep) noexcept {
|
||||
_handle.abort(std::move(ep));
|
||||
}
|
||||
|
||||
future<> bucket_writer::close() noexcept {
|
||||
return std::move(_consume_fut);
|
||||
}
|
||||
|
||||
} // mutation_writer
|
||||
@@ -22,10 +22,31 @@
|
||||
#pragma once
|
||||
|
||||
#include "flat_mutation_reader.hh"
|
||||
#include "mutation_reader.hh"
|
||||
|
||||
namespace mutation_writer {
|
||||
using reader_consumer = noncopyable_function<future<> (flat_mutation_reader)>;
|
||||
|
||||
class bucket_writer {
|
||||
schema_ptr _schema;
|
||||
queue_reader_handle _handle;
|
||||
future<> _consume_fut;
|
||||
|
||||
private:
|
||||
bucket_writer(schema_ptr schema, std::pair<flat_mutation_reader, queue_reader_handle> queue_reader, reader_consumer& consumer);
|
||||
|
||||
public:
|
||||
bucket_writer(schema_ptr schema, reader_permit permit, reader_consumer& consumer);
|
||||
|
||||
future<> consume(mutation_fragment mf);
|
||||
|
||||
void consume_end_of_stream();
|
||||
|
||||
void abort(std::exception_ptr ep) noexcept;
|
||||
|
||||
future<> close() noexcept;
|
||||
};
|
||||
|
||||
template <typename Writer>
|
||||
requires MutationFragmentConsumer<Writer, future<>>
|
||||
future<> feed_writer(flat_mutation_reader&& rd, Writer&& wr) {
|
||||
@@ -40,9 +61,17 @@ future<> feed_writer(flat_mutation_reader&& rd, Writer&& wr) {
|
||||
if (f.failed()) {
|
||||
auto ex = f.get_exception();
|
||||
wr.abort(ex);
|
||||
return make_exception_future<>(ex);
|
||||
return wr.close().then_wrapped([ex = std::move(ex)] (future<> f) mutable {
|
||||
if (f.failed()) {
|
||||
// The consumer is expected to fail when aborted,
|
||||
// so just ignore any exception.
|
||||
(void)f.get_exception();
|
||||
}
|
||||
return make_exception_future<>(std::move(ex));
|
||||
});
|
||||
} else {
|
||||
return wr.consume_end_of_stream();
|
||||
wr.consume_end_of_stream();
|
||||
return wr.close();
|
||||
}
|
||||
});
|
||||
});
|
||||
|
||||
@@ -31,36 +31,7 @@
|
||||
namespace mutation_writer {
|
||||
|
||||
class shard_based_splitting_mutation_writer {
|
||||
class shard_writer {
|
||||
queue_reader_handle _handle;
|
||||
future<> _consume_fut;
|
||||
private:
|
||||
shard_writer(schema_ptr schema, std::pair<flat_mutation_reader, queue_reader_handle> queue_reader, reader_consumer& consumer)
|
||||
: _handle(std::move(queue_reader.second))
|
||||
, _consume_fut(consumer(std::move(queue_reader.first))) {
|
||||
}
|
||||
|
||||
public:
|
||||
shard_writer(schema_ptr schema, reader_permit permit, reader_consumer& consumer)
|
||||
: shard_writer(schema, make_queue_reader(schema, std::move(permit)), consumer) {
|
||||
}
|
||||
future<> consume(mutation_fragment mf) {
|
||||
return _handle.push(std::move(mf));
|
||||
}
|
||||
future<> consume_end_of_stream() {
|
||||
// consume_end_of_stream is always called from a finally block,
|
||||
// and that's because we wait for _consume_fut to return. We
|
||||
// don't want to generate another exception here if the read was
|
||||
// aborted.
|
||||
if (!_handle.is_terminated()) {
|
||||
_handle.push_end_of_stream();
|
||||
}
|
||||
return std::move(_consume_fut);
|
||||
}
|
||||
void abort(std::exception_ptr ep) {
|
||||
_handle.abort(ep);
|
||||
}
|
||||
};
|
||||
using shard_writer = bucket_writer;
|
||||
|
||||
private:
|
||||
schema_ptr _schema;
|
||||
@@ -105,13 +76,12 @@ public:
|
||||
return write_to_shard(mutation_fragment(*_schema, _permit, std::move(pe)));
|
||||
}
|
||||
|
||||
future<> consume_end_of_stream() {
|
||||
return parallel_for_each(_shards, [] (std::optional<shard_writer>& shard) {
|
||||
if (!shard) {
|
||||
return make_ready_future<>();
|
||||
void consume_end_of_stream() {
|
||||
for (auto& shard : _shards) {
|
||||
if (shard) {
|
||||
shard->consume_end_of_stream();
|
||||
}
|
||||
return shard->consume_end_of_stream();
|
||||
});
|
||||
}
|
||||
}
|
||||
void abort(std::exception_ptr ep) {
|
||||
for (auto&& shard : _shards) {
|
||||
@@ -120,6 +90,11 @@ public:
|
||||
}
|
||||
}
|
||||
}
|
||||
future<> close() noexcept {
|
||||
return parallel_for_each(_shards, [] (std::optional<shard_writer>& shard) {
|
||||
return shard ? shard->close() : make_ready_future<>();
|
||||
});
|
||||
}
|
||||
};
|
||||
|
||||
future<> segregate_by_shard(flat_mutation_reader producer, reader_consumer consumer) {
|
||||
|
||||
@@ -109,22 +109,12 @@ small_flat_map<Key, Value, Size>::find(const key_type& k) {
|
||||
class timestamp_based_splitting_mutation_writer {
|
||||
using bucket_id = int64_t;
|
||||
|
||||
class bucket_writer {
|
||||
schema_ptr _schema;
|
||||
queue_reader_handle _handle;
|
||||
future<> _consume_fut;
|
||||
class timestamp_bucket_writer : public bucket_writer {
|
||||
bool _has_current_partition = false;
|
||||
|
||||
private:
|
||||
bucket_writer(schema_ptr schema, std::pair<flat_mutation_reader, queue_reader_handle> queue_reader, reader_consumer& consumer)
|
||||
: _schema(std::move(schema))
|
||||
, _handle(std::move(queue_reader.second))
|
||||
, _consume_fut(consumer(std::move(queue_reader.first))) {
|
||||
}
|
||||
|
||||
public:
|
||||
bucket_writer(schema_ptr schema, reader_permit permit, reader_consumer& consumer)
|
||||
: bucket_writer(schema, make_queue_reader(schema, std::move(permit)), consumer) {
|
||||
timestamp_bucket_writer(schema_ptr schema, reader_permit permit, reader_consumer& consumer)
|
||||
: bucket_writer(schema, std::move(permit), consumer) {
|
||||
}
|
||||
void set_has_current_partition() {
|
||||
_has_current_partition = true;
|
||||
@@ -135,18 +125,6 @@ class timestamp_based_splitting_mutation_writer {
|
||||
bool has_current_partition() const {
|
||||
return _has_current_partition;
|
||||
}
|
||||
future<> consume(mutation_fragment mf) {
|
||||
return _handle.push(std::move(mf));
|
||||
}
|
||||
future<> consume_end_of_stream() {
|
||||
if (!_handle.is_terminated()) {
|
||||
_handle.push_end_of_stream();
|
||||
}
|
||||
return std::move(_consume_fut);
|
||||
}
|
||||
void abort(std::exception_ptr ep) {
|
||||
_handle.abort(ep);
|
||||
}
|
||||
};
|
||||
|
||||
private:
|
||||
@@ -155,7 +133,7 @@ private:
|
||||
classify_by_timestamp _classifier;
|
||||
reader_consumer _consumer;
|
||||
partition_start _current_partition_start;
|
||||
std::unordered_map<bucket_id, bucket_writer> _buckets;
|
||||
std::unordered_map<bucket_id, timestamp_bucket_writer> _buckets;
|
||||
std::vector<bucket_id> _buckets_used_for_current_partition;
|
||||
|
||||
private:
|
||||
@@ -186,16 +164,21 @@ public:
|
||||
future<> consume(range_tombstone&& rt);
|
||||
future<> consume(partition_end&& pe);
|
||||
|
||||
future<> consume_end_of_stream() {
|
||||
return parallel_for_each(_buckets, [] (std::pair<const bucket_id, bucket_writer>& bucket) {
|
||||
return bucket.second.consume_end_of_stream();
|
||||
});
|
||||
void consume_end_of_stream() {
|
||||
for (auto& b : _buckets) {
|
||||
b.second.consume_end_of_stream();
|
||||
}
|
||||
}
|
||||
void abort(std::exception_ptr ep) {
|
||||
for (auto&& b : _buckets) {
|
||||
b.second.abort(ep);
|
||||
}
|
||||
}
|
||||
future<> close() noexcept {
|
||||
return parallel_for_each(_buckets, [] (std::pair<const bucket_id, timestamp_bucket_writer>& b) {
|
||||
return b.second.close();
|
||||
});
|
||||
}
|
||||
};
|
||||
|
||||
future<> timestamp_based_splitting_mutation_writer::write_to_bucket(bucket_id bucket, mutation_fragment&& mf) {
|
||||
|
||||
@@ -205,6 +205,10 @@ public:
|
||||
auto to_block = std::min(_used_memory - _blocked_bytes, n);
|
||||
_blocked_bytes += to_block;
|
||||
stop = (_limiter->update_and_check(to_block) && _stop_on_global_limit) || stop;
|
||||
if (stop && !_short_read_allowed) {
|
||||
// If we are here we stopped because of the global limit.
|
||||
throw std::runtime_error("Maximum amount of memory for building query results is exhausted, unpaged query cannot be finished");
|
||||
}
|
||||
}
|
||||
return stop;
|
||||
}
|
||||
|
||||
@@ -75,7 +75,7 @@ class reader_permit::impl : public boost::intrusive::list_base_hook<boost::intru
|
||||
sstring _op_name;
|
||||
std::string_view _op_name_view;
|
||||
reader_resources _resources;
|
||||
reader_permit::state _state = reader_permit::state::registered;
|
||||
reader_permit::state _state = reader_permit::state::active;
|
||||
|
||||
public:
|
||||
struct value_tag {};
|
||||
@@ -123,22 +123,17 @@ public:
|
||||
}
|
||||
|
||||
void on_admission() {
|
||||
_state = reader_permit::state::admitted;
|
||||
_semaphore.consume(_resources);
|
||||
_state = reader_permit::state::active;
|
||||
}
|
||||
|
||||
void consume(reader_resources res) {
|
||||
_resources += res;
|
||||
if (_state == reader_permit::state::admitted) {
|
||||
_semaphore.consume(res);
|
||||
}
|
||||
_semaphore.consume(res);
|
||||
}
|
||||
|
||||
void signal(reader_resources res) {
|
||||
_resources -= res;
|
||||
if (_state == reader_permit::state::admitted) {
|
||||
_semaphore.signal(res);
|
||||
}
|
||||
_semaphore.signal(res);
|
||||
}
|
||||
|
||||
reader_resources resources() const {
|
||||
@@ -205,14 +200,11 @@ reader_resources reader_permit::consumed_resources() const {
|
||||
|
||||
std::ostream& operator<<(std::ostream& os, reader_permit::state s) {
|
||||
switch (s) {
|
||||
case reader_permit::state::registered:
|
||||
os << "registered";
|
||||
break;
|
||||
case reader_permit::state::waiting:
|
||||
os << "waiting";
|
||||
break;
|
||||
case reader_permit::state::admitted:
|
||||
os << "admitted";
|
||||
case reader_permit::state::active:
|
||||
os << "active";
|
||||
break;
|
||||
}
|
||||
return os;
|
||||
@@ -249,7 +241,7 @@ struct permit_group_key_hash {
|
||||
|
||||
using permit_groups = std::unordered_map<permit_group_key, permit_stats, permit_group_key_hash>;
|
||||
|
||||
static permit_stats do_dump_reader_permit_diagnostics(std::ostream& os, const permit_groups& permits, reader_permit::state state, bool sort_by_memory) {
|
||||
static permit_stats do_dump_reader_permit_diagnostics(std::ostream& os, const permit_groups& permits, reader_permit::state state) {
|
||||
struct permit_summary {
|
||||
const schema* s;
|
||||
std::string_view op_name;
|
||||
@@ -265,25 +257,17 @@ static permit_stats do_dump_reader_permit_diagnostics(std::ostream& os, const pe
|
||||
}
|
||||
}
|
||||
|
||||
std::ranges::sort(permit_summaries, [sort_by_memory] (const permit_summary& a, const permit_summary& b) {
|
||||
if (sort_by_memory) {
|
||||
return a.memory < b.memory;
|
||||
} else {
|
||||
return a.count < b.count;
|
||||
}
|
||||
std::ranges::sort(permit_summaries, [] (const permit_summary& a, const permit_summary& b) {
|
||||
return a.memory < b.memory;
|
||||
});
|
||||
|
||||
permit_stats total;
|
||||
|
||||
auto print_line = [&os, sort_by_memory] (auto col1, auto col2, auto col3) {
|
||||
if (sort_by_memory) {
|
||||
fmt::print(os, "{}\t{}\t{}\n", col2, col1, col3);
|
||||
} else {
|
||||
fmt::print(os, "{}\t{}\t{}\n", col1, col2, col3);
|
||||
}
|
||||
auto print_line = [&os] (auto col1, auto col2, auto col3) {
|
||||
fmt::print(os, "{}\t{}\t{}\n", col2, col1, col3);
|
||||
};
|
||||
|
||||
fmt::print(os, "Permits with state {}, sorted by {}\n", state, sort_by_memory ? "memory" : "count");
|
||||
fmt::print(os, "Permits with state {}\n", state);
|
||||
print_line("count", "memory", "name");
|
||||
for (const auto& summary : permit_summaries) {
|
||||
total.count += summary.count;
|
||||
@@ -309,11 +293,9 @@ static void do_dump_reader_permit_diagnostics(std::ostream& os, const reader_con
|
||||
permit_stats total;
|
||||
|
||||
fmt::print(os, "Semaphore {}: {}, dumping permit diagnostics:\n", semaphore.name(), problem);
|
||||
total += do_dump_reader_permit_diagnostics(os, permits, reader_permit::state::admitted, true);
|
||||
total += do_dump_reader_permit_diagnostics(os, permits, reader_permit::state::active);
|
||||
fmt::print(os, "\n");
|
||||
total += do_dump_reader_permit_diagnostics(os, permits, reader_permit::state::waiting, false);
|
||||
fmt::print(os, "\n");
|
||||
total += do_dump_reader_permit_diagnostics(os, permits, reader_permit::state::registered, false);
|
||||
total += do_dump_reader_permit_diagnostics(os, permits, reader_permit::state::waiting);
|
||||
fmt::print(os, "\n");
|
||||
fmt::print(os, "Total: permits: {}, memory: {}\n", total.count, utils::to_hr_size(total.memory));
|
||||
}
|
||||
@@ -374,7 +356,7 @@ reader_concurrency_semaphore::~reader_concurrency_semaphore() {
|
||||
reader_concurrency_semaphore::inactive_read_handle reader_concurrency_semaphore::register_inactive_read(std::unique_ptr<inactive_read> ir) {
|
||||
// Implies _inactive_reads.empty(), we don't queue new readers before
|
||||
// evicting all inactive reads.
|
||||
if (_wait_list.empty()) {
|
||||
if (_wait_list.empty() && _resources.memory > 0) {
|
||||
const auto [it, _] = _inactive_reads.emplace(_next_id++, std::move(ir));
|
||||
(void)_;
|
||||
++_stats.inactive_reads;
|
||||
@@ -424,13 +406,13 @@ bool reader_concurrency_semaphore::try_evict_one_inactive_read() {
|
||||
}
|
||||
|
||||
bool reader_concurrency_semaphore::has_available_units(const resources& r) const {
|
||||
return bool(_resources) && _resources >= r;
|
||||
// Special case: when there is no active reader (based on count) admit one
|
||||
// regardless of availability of memory.
|
||||
return (bool(_resources) && _resources >= r) || _resources.count == _initial_resources.count;
|
||||
}
|
||||
|
||||
bool reader_concurrency_semaphore::may_proceed(const resources& r) const {
|
||||
// Special case: when there is no active reader (based on count) admit one
|
||||
// regardless of availability of memory.
|
||||
return _wait_list.empty() && (has_available_units(r) || _resources.count == _initial_resources.count);
|
||||
return _wait_list.empty() && has_available_units(r);
|
||||
}
|
||||
|
||||
future<reader_permit::resource_units> reader_concurrency_semaphore::do_wait_admission(reader_permit permit, size_t memory,
|
||||
@@ -480,6 +462,12 @@ void reader_concurrency_semaphore::broken(std::exception_ptr ex) {
|
||||
}
|
||||
}
|
||||
|
||||
std::string reader_concurrency_semaphore::dump_diagnostics() const {
|
||||
std::ostringstream os;
|
||||
do_dump_reader_permit_diagnostics(os, *this, *_permit_list, "user request");
|
||||
return os.str();
|
||||
}
|
||||
|
||||
// A file that tracks the memory usage of buffers resulting from read
|
||||
// operations.
|
||||
class tracking_file_impl : public file_impl {
|
||||
|
||||
@@ -231,4 +231,6 @@ public:
|
||||
}
|
||||
|
||||
void broken(std::exception_ptr ex);
|
||||
|
||||
std::string dump_diagnostics() const;
|
||||
};
|
||||
|
||||
@@ -91,9 +91,8 @@ public:
|
||||
class resource_units;
|
||||
|
||||
enum class state {
|
||||
registered, // read is registered, but didn't attempt admission yet
|
||||
waiting, // waiting for admission
|
||||
admitted,
|
||||
active,
|
||||
};
|
||||
|
||||
class impl;
|
||||
|
||||
@@ -309,7 +309,7 @@ float node_ops_metrics::repair_finished_percentage() {
|
||||
tracker::tracker(size_t nr_shards, size_t max_repair_memory)
|
||||
: _shutdown(false)
|
||||
, _repairs(nr_shards) {
|
||||
auto nr = std::max(size_t(1), size_t(max_repair_memory / max_repair_memory_per_range()));
|
||||
auto nr = std::max(size_t(1), size_t(max_repair_memory / max_repair_memory_per_range() / 4));
|
||||
rlogger.info("Setting max_repair_memory={}, max_repair_memory_per_range={}, max_repair_ranges_in_parallel={}",
|
||||
max_repair_memory, max_repair_memory_per_range(), nr);
|
||||
_range_parallelism_semaphores.reserve(nr_shards);
|
||||
|
||||
@@ -571,7 +571,7 @@ public:
|
||||
_mq[node_idx] = std::move(queue_handle);
|
||||
auto writer = shared_from_this();
|
||||
_writer_done[node_idx] = mutation_writer::distribute_reader_and_consume_on_shards(_schema, std::move(queue_reader),
|
||||
[&db, reason = this->_reason, estimated_partitions = this->_estimated_partitions, writer] (flat_mutation_reader reader) {
|
||||
[&db, reason = this->_reason, estimated_partitions = this->_estimated_partitions] (flat_mutation_reader reader) {
|
||||
auto& t = db.local().find_column_family(reader.schema());
|
||||
return db::view::check_needs_view_update_path(_sys_dist_ks->local(), t, reason).then([t = t.shared_from_this(), estimated_partitions, reader = std::move(reader)] (bool use_view_update_path) mutable {
|
||||
//FIXME: for better estimations this should be transmitted from remote
|
||||
|
||||
@@ -456,6 +456,9 @@ schema::schema(const schema& o)
|
||||
rebuild();
|
||||
if (o.is_view()) {
|
||||
_view_info = std::make_unique<::view_info>(*this, o.view_info()->raw());
|
||||
if (o.view_info()->base_info()) {
|
||||
_view_info->set_base_info(o.view_info()->base_info());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -859,7 +862,7 @@ std::ostream& schema::describe(database& db, std::ostream& os) const {
|
||||
os << "}";
|
||||
os << "\n AND comment = '" << comment()<< "'";
|
||||
os << "\n AND compaction = {'class': '" << sstables::compaction_strategy::name(compaction_strategy()) << "'";
|
||||
map_as_cql_param(os, compaction_strategy_options()) << "}";
|
||||
map_as_cql_param(os, compaction_strategy_options(), false) << "}";
|
||||
os << "\n AND compression = {";
|
||||
map_as_cql_param(os, get_compressor_params().get_options());
|
||||
os << "}";
|
||||
|
||||
@@ -24,6 +24,7 @@
|
||||
#include "schema_registry.hh"
|
||||
#include "log.hh"
|
||||
#include "db/schema_tables.hh"
|
||||
#include "view_info.hh"
|
||||
|
||||
static logging::logger slogger("schema_registry");
|
||||
|
||||
@@ -274,22 +275,43 @@ global_schema_ptr::global_schema_ptr(global_schema_ptr&& o) noexcept {
|
||||
assert(o._cpu_of_origin == current);
|
||||
_ptr = std::move(o._ptr);
|
||||
_cpu_of_origin = current;
|
||||
_base_schema = std::move(o._base_schema);
|
||||
}
|
||||
|
||||
schema_ptr global_schema_ptr::get() const {
|
||||
if (this_shard_id() == _cpu_of_origin) {
|
||||
return _ptr;
|
||||
} else {
|
||||
// 'e' points to a foreign entry, but we know it won't be evicted
|
||||
// because _ptr is preventing this.
|
||||
const schema_registry_entry& e = *_ptr->registry_entry();
|
||||
schema_ptr s = local_schema_registry().get_or_null(e.version());
|
||||
if (!s) {
|
||||
s = local_schema_registry().get_or_load(e.version(), [&e](table_schema_version) {
|
||||
return e.frozen();
|
||||
});
|
||||
auto registered_schema = [](const schema_registry_entry& e) {
|
||||
schema_ptr ret = local_schema_registry().get_or_null(e.version());
|
||||
if (!ret) {
|
||||
ret = local_schema_registry().get_or_load(e.version(), [&e](table_schema_version) {
|
||||
return e.frozen();
|
||||
});
|
||||
}
|
||||
return ret;
|
||||
};
|
||||
|
||||
schema_ptr registered_bs;
|
||||
// the following code contains registry entry dereference of a foreign shard
|
||||
// however, it is guaranteed to succeed since we made sure in the constructor
|
||||
// that _base_schema and _ptr will have a registry on the foreign shard where this
|
||||
// object originated so as long as this object lives the registry entries live too
|
||||
// and it is safe to reference them on foreign shards.
|
||||
if (_base_schema) {
|
||||
registered_bs = registered_schema(*_base_schema->registry_entry());
|
||||
if (_base_schema->registry_entry()->is_synced()) {
|
||||
registered_bs->registry_entry()->mark_synced();
|
||||
}
|
||||
}
|
||||
if (e.is_synced()) {
|
||||
schema_ptr s = registered_schema(*_ptr->registry_entry());
|
||||
if (s->is_view()) {
|
||||
if (!s->view_info()->base_info()) {
|
||||
// we know that registered_bs is valid here because we make sure of it in the constructors.
|
||||
s->view_info()->set_base_info(s->view_info()->make_base_dependent_view_info(*registered_bs));
|
||||
}
|
||||
}
|
||||
if (_ptr->registry_entry()->is_synced()) {
|
||||
s->registry_entry()->mark_synced();
|
||||
}
|
||||
return s;
|
||||
@@ -297,16 +319,33 @@ schema_ptr global_schema_ptr::get() const {
|
||||
}
|
||||
|
||||
global_schema_ptr::global_schema_ptr(const schema_ptr& ptr)
|
||||
: _ptr([&ptr]() {
|
||||
// _ptr must always have an associated registry entry,
|
||||
// if ptr doesn't, we need to load it into the registry.
|
||||
schema_registry_entry* e = ptr->registry_entry();
|
||||
: _cpu_of_origin(this_shard_id()) {
|
||||
// _ptr must always have an associated registry entry,
|
||||
// if ptr doesn't, we need to load it into the registry.
|
||||
auto ensure_registry_entry = [] (const schema_ptr& s) {
|
||||
schema_registry_entry* e = s->registry_entry();
|
||||
if (e) {
|
||||
return ptr;
|
||||
}
|
||||
return local_schema_registry().get_or_load(ptr->version(), [&ptr] (table_schema_version) {
|
||||
return frozen_schema(ptr);
|
||||
return s;
|
||||
} else {
|
||||
return local_schema_registry().get_or_load(s->version(), [&s] (table_schema_version) {
|
||||
return frozen_schema(s);
|
||||
});
|
||||
}())
|
||||
, _cpu_of_origin(this_shard_id())
|
||||
{ }
|
||||
}
|
||||
};
|
||||
|
||||
schema_ptr s = ensure_registry_entry(ptr);
|
||||
if (s->is_view()) {
|
||||
if (s->view_info()->base_info()) {
|
||||
_base_schema = ensure_registry_entry(s->view_info()->base_info()->base_schema());
|
||||
} else if (ptr->view_info()->base_info()) {
|
||||
_base_schema = ensure_registry_entry(ptr->view_info()->base_info()->base_schema());
|
||||
} else {
|
||||
on_internal_error(slogger, format("Tried to build a global schema for view {}.{} with an uninitialized base info", s->ks_name(), s->cf_name()));
|
||||
}
|
||||
|
||||
if (!s->view_info()->base_info() || !s->view_info()->base_info()->base_schema()->registry_entry()) {
|
||||
s->view_info()->set_base_info(s->view_info()->make_base_dependent_view_info(*_base_schema));
|
||||
}
|
||||
}
|
||||
_ptr = s;
|
||||
}
|
||||
|
||||
@@ -165,6 +165,7 @@ schema_registry& local_schema_registry();
|
||||
// chain will last.
|
||||
class global_schema_ptr {
|
||||
schema_ptr _ptr;
|
||||
schema_ptr _base_schema;
|
||||
unsigned _cpu_of_origin;
|
||||
public:
|
||||
// Note: the schema_ptr must come from the current shard and can't be nullptr.
|
||||
|
||||
2
seastar
2
seastar
Submodule seastar updated: b70b444924...5ef45afa4d
@@ -53,6 +53,7 @@
|
||||
#include "database.hh"
|
||||
#include "db/schema_tables.hh"
|
||||
#include "types/user.hh"
|
||||
#include "db/schema_tables.hh"
|
||||
|
||||
namespace service {
|
||||
|
||||
@@ -1096,8 +1097,19 @@ future<schema_ptr> get_schema_definition(table_schema_version v, netw::messaging
|
||||
// referenced by the incoming request.
|
||||
// That means the column mapping for the schema should always be inserted
|
||||
// with TTL (refresh TTL in case column mapping already existed prior to that).
|
||||
return db::schema_tables::store_column_mapping(proxy, s.unfreeze(db::schema_ctxt(proxy)), true).then([s] {
|
||||
return s;
|
||||
auto us = s.unfreeze(db::schema_ctxt(proxy));
|
||||
// if this is a view - we might need to fix its schema before registering it.
|
||||
if (us->is_view()) {
|
||||
auto& db = proxy.local().local_db();
|
||||
schema_ptr base_schema = db.find_schema(us->view_info()->base_id());
|
||||
auto fixed_view = db::schema_tables::maybe_fix_legacy_secondary_index_mv_schema(db, view_ptr(us), base_schema,
|
||||
db::schema_tables::preserve_version::yes);
|
||||
if (fixed_view) {
|
||||
us = fixed_view;
|
||||
}
|
||||
}
|
||||
return db::schema_tables::store_column_mapping(proxy, us, true).then([us] {
|
||||
return frozen_schema{us};
|
||||
});
|
||||
});
|
||||
}).then([] (schema_ptr s) {
|
||||
@@ -1105,7 +1117,7 @@ future<schema_ptr> get_schema_definition(table_schema_version v, netw::messaging
|
||||
// table.
|
||||
if (s->is_view()) {
|
||||
if (!s->view_info()->base_info()) {
|
||||
auto& db = service::get_local_storage_proxy().get_db().local();
|
||||
auto& db = service::get_local_storage_proxy().local_db();
|
||||
// This line might throw a no_such_column_family
|
||||
// It should be fine since if we tried to register a view for which
|
||||
// we don't know the base table, our registry is broken.
|
||||
|
||||
@@ -3624,6 +3624,11 @@ protected:
|
||||
|
||||
public:
|
||||
virtual future<foreign_ptr<lw_shared_ptr<query::result>>> execute(storage_proxy::clock_type::time_point timeout) {
|
||||
if (_targets.empty()) {
|
||||
// We may have no targets to read from if a DC with zero replication is queried with LOCAL_QUORUM.
|
||||
// Return an empty result in this case
|
||||
return make_ready_future<foreign_ptr<lw_shared_ptr<query::result>>>(make_foreign(make_lw_shared(query::result())));
|
||||
}
|
||||
digest_resolver_ptr digest_resolver = ::make_shared<digest_read_resolver>(_schema, _cl, _block_for,
|
||||
db::is_datacenter_local(_cl) ? db::count_local_endpoints(_targets): _targets.size(), timeout);
|
||||
auto exec = shared_from_this();
|
||||
|
||||
@@ -446,6 +446,12 @@ public:
|
||||
distributed<database>& get_db() {
|
||||
return _db;
|
||||
}
|
||||
const database& local_db() const noexcept {
|
||||
return _db.local();
|
||||
}
|
||||
database& local_db() noexcept {
|
||||
return _db.local();
|
||||
}
|
||||
|
||||
void set_cdc_service(cdc::cdc_service* cdc) {
|
||||
_cdc = cdc;
|
||||
|
||||
@@ -2336,7 +2336,7 @@ future<> storage_service::rebuild(sstring source_dc) {
|
||||
slogger.info("Streaming for rebuild successful");
|
||||
}).handle_exception([] (auto ep) {
|
||||
// This is used exclusively through JMX, so log the full trace but only throw a simple RTE
|
||||
slogger.warn("Error while rebuilding node: {}", std::current_exception());
|
||||
slogger.warn("Error while rebuilding node: {}", ep);
|
||||
return make_exception_future<>(std::move(ep));
|
||||
});
|
||||
});
|
||||
|
||||
@@ -438,7 +438,6 @@ protected:
|
||||
mutation_source_metadata _ms_metadata = {};
|
||||
garbage_collected_sstable_writer::data _gc_sstable_writer_data;
|
||||
compaction_sstable_replacer_fn _replacer;
|
||||
std::optional<compaction_weight_registration> _weight_registration;
|
||||
utils::UUID _run_identifier;
|
||||
::io_priority_class _io_priority;
|
||||
// optional clone of sstable set to be used for expiration purposes, so it will be set if expiration is enabled.
|
||||
@@ -457,7 +456,6 @@ protected:
|
||||
, _sstable_level(descriptor.level)
|
||||
, _gc_sstable_writer_data(*this)
|
||||
, _replacer(std::move(descriptor.replacer))
|
||||
, _weight_registration(std::move(descriptor.weight_registration))
|
||||
, _run_identifier(descriptor.run_identifier)
|
||||
, _io_priority(descriptor.io_priority)
|
||||
, _sstable_set(std::move(descriptor.all_sstables_snapshot))
|
||||
@@ -929,9 +927,6 @@ public:
|
||||
}
|
||||
|
||||
virtual void on_end_of_compaction() override {
|
||||
if (_weight_registration) {
|
||||
_cf.get_compaction_manager().on_compaction_complete(*_weight_registration);
|
||||
}
|
||||
replace_remaining_exhausted_sstables();
|
||||
}
|
||||
private:
|
||||
|
||||
@@ -134,8 +134,6 @@ struct compaction_descriptor {
|
||||
uint64_t max_sstable_bytes;
|
||||
// Run identifier of output sstables.
|
||||
utils::UUID run_identifier;
|
||||
// Holds ownership of a weight assigned to this compaction iff it's a regular one.
|
||||
std::optional<compaction_weight_registration> weight_registration;
|
||||
// Calls compaction manager's task for this compaction to release reference to exhausted sstables.
|
||||
std::function<void(const std::vector<shared_sstable>& exhausted_sstables)> release_exhausted;
|
||||
// The options passed down to the compaction code.
|
||||
|
||||
@@ -311,6 +311,7 @@ future<> compaction_manager::run_custom_job(column_family* cf, sstring name, non
|
||||
cmlog.info("{} was abruptly stopped, reason: {}", name, e.what());
|
||||
} catch (...) {
|
||||
cmlog.error("{} failed: {}", name, std::current_exception());
|
||||
throw;
|
||||
}
|
||||
});
|
||||
return task->compaction_done.get_future().then([task] {});
|
||||
@@ -435,7 +436,7 @@ void compaction_manager::reevaluate_postponed_compactions() {
|
||||
}
|
||||
|
||||
void compaction_manager::postpone_compaction_for_column_family(column_family* cf) {
|
||||
_postponed.push_back(cf);
|
||||
_postponed.insert(cf);
|
||||
}
|
||||
|
||||
future<> compaction_manager::stop_ongoing_compactions(sstring reason) {
|
||||
@@ -575,7 +576,7 @@ void compaction_manager::submit(column_family* cf) {
|
||||
return make_ready_future<stop_iteration>(stop_iteration::yes);
|
||||
}
|
||||
auto compacting = make_lw_shared<compacting_sstable_registration>(this, descriptor.sstables);
|
||||
descriptor.weight_registration = compaction_weight_registration(this, weight);
|
||||
auto weight_r = compaction_weight_registration(this, weight);
|
||||
descriptor.release_exhausted = [compacting] (const std::vector<sstables::shared_sstable>& exhausted_sstables) {
|
||||
compacting->release_compacting(exhausted_sstables);
|
||||
};
|
||||
@@ -585,7 +586,7 @@ void compaction_manager::submit(column_family* cf) {
|
||||
_stats.pending_tasks--;
|
||||
_stats.active_tasks++;
|
||||
task->compaction_running = true;
|
||||
return cf.run_compaction(std::move(descriptor)).then_wrapped([this, task, compacting = std::move(compacting)] (future<> f) mutable {
|
||||
return cf.run_compaction(std::move(descriptor)).then_wrapped([this, task, compacting = std::move(compacting), weight_r = std::move(weight_r)] (future<> f) mutable {
|
||||
_stats.active_tasks--;
|
||||
task->compaction_running = false;
|
||||
|
||||
@@ -629,10 +630,11 @@ future<> compaction_manager::rewrite_sstables(column_family* cf, sstables::compa
|
||||
_tasks.push_back(task);
|
||||
|
||||
auto sstables = std::make_unique<std::vector<sstables::shared_sstable>>(get_func(*cf));
|
||||
auto compacting = make_lw_shared<compacting_sstable_registration>(this, *sstables);
|
||||
auto sstables_ptr = sstables.get();
|
||||
_stats.pending_tasks += sstables->size();
|
||||
|
||||
task->compaction_done = do_until([sstables_ptr] { return sstables_ptr->empty(); }, [this, task, options, sstables_ptr] () mutable {
|
||||
task->compaction_done = do_until([sstables_ptr] { return sstables_ptr->empty(); }, [this, task, options, sstables_ptr, compacting] () mutable {
|
||||
|
||||
// FIXME: lock cf here
|
||||
if (!can_proceed(task)) {
|
||||
@@ -642,7 +644,7 @@ future<> compaction_manager::rewrite_sstables(column_family* cf, sstables::compa
|
||||
auto sst = sstables_ptr->back();
|
||||
sstables_ptr->pop_back();
|
||||
|
||||
return repeat([this, task, options, sst = std::move(sst)] () mutable {
|
||||
return repeat([this, task, options, sst = std::move(sst), compacting] () mutable {
|
||||
column_family& cf = *task->compacting_cf;
|
||||
auto sstable_level = sst->get_sstable_level();
|
||||
auto run_identifier = sst->run_identifier();
|
||||
@@ -650,21 +652,22 @@ future<> compaction_manager::rewrite_sstables(column_family* cf, sstables::compa
|
||||
auto descriptor = sstables::compaction_descriptor({ sst }, cf.get_sstable_set(), service::get_local_compaction_priority(),
|
||||
sstable_level, sstables::compaction_descriptor::default_max_sstable_bytes, run_identifier, options);
|
||||
|
||||
auto compacting = make_lw_shared<compacting_sstable_registration>(this, descriptor.sstables);
|
||||
// Releases reference to cleaned sstable such that respective used disk space can be freed.
|
||||
descriptor.release_exhausted = [compacting] (const std::vector<sstables::shared_sstable>& exhausted_sstables) {
|
||||
compacting->release_compacting(exhausted_sstables);
|
||||
};
|
||||
|
||||
_stats.pending_tasks--;
|
||||
_stats.active_tasks++;
|
||||
task->compaction_running = true;
|
||||
compaction_backlog_tracker user_initiated(std::make_unique<user_initiated_backlog_tracker>(_compaction_controller.backlog_of_shares(200), _available_memory));
|
||||
return do_with(std::move(user_initiated), [this, &cf, descriptor = std::move(descriptor)] (compaction_backlog_tracker& bt) mutable {
|
||||
return with_scheduling_group(_scheduling_group, [this, &cf, descriptor = std::move(descriptor)] () mutable {
|
||||
return cf.run_compaction(std::move(descriptor));
|
||||
return with_semaphore(_rewrite_sstables_sem, 1, [this, task, &cf, descriptor = std::move(descriptor)] () mutable {
|
||||
_stats.pending_tasks--;
|
||||
_stats.active_tasks++;
|
||||
task->compaction_running = true;
|
||||
compaction_backlog_tracker user_initiated(std::make_unique<user_initiated_backlog_tracker>(_compaction_controller.backlog_of_shares(200), _available_memory));
|
||||
return do_with(std::move(user_initiated), [this, &cf, descriptor = std::move(descriptor)] (compaction_backlog_tracker& bt) mutable {
|
||||
return with_scheduling_group(_scheduling_group, [this, &cf, descriptor = std::move(descriptor)]() mutable {
|
||||
return cf.run_compaction(std::move(descriptor));
|
||||
});
|
||||
});
|
||||
}).then_wrapped([this, task, compacting = std::move(compacting)] (future<> f) mutable {
|
||||
}).then_wrapped([this, task, compacting] (future<> f) mutable {
|
||||
task->compaction_running = false;
|
||||
_stats.active_tasks--;
|
||||
if (!can_proceed(task)) {
|
||||
@@ -796,7 +799,7 @@ future<> compaction_manager::remove(column_family* cf) {
|
||||
task->stopping = true;
|
||||
}
|
||||
}
|
||||
_postponed.erase(boost::remove(_postponed, cf), _postponed.end());
|
||||
_postponed.erase(cf);
|
||||
|
||||
// Wait for the termination of an ongoing compaction on cf, if any.
|
||||
return do_for_each(*tasks_to_stop, [this, cf] (auto& task) {
|
||||
@@ -832,11 +835,6 @@ void compaction_manager::stop_compaction(sstring type) {
|
||||
}
|
||||
}
|
||||
|
||||
void compaction_manager::on_compaction_complete(compaction_weight_registration& weight_registration) {
|
||||
weight_registration.deregister();
|
||||
reevaluate_postponed_compactions();
|
||||
}
|
||||
|
||||
void compaction_manager::propagate_replacement(column_family* cf,
|
||||
const std::vector<sstables::shared_sstable>& removed, const std::vector<sstables::shared_sstable>& added) {
|
||||
for (auto& info : _compactions) {
|
||||
|
||||
@@ -99,7 +99,7 @@ private:
|
||||
future<> _waiting_reevalution = make_ready_future<>();
|
||||
condition_variable _postponed_reevaluation;
|
||||
// column families that wait for compaction but had their submission postponed due to ongoing compaction.
|
||||
std::vector<column_family*> _postponed;
|
||||
std::unordered_set<column_family*> _postponed;
|
||||
// tracks taken weights of ongoing compactions, only one compaction per weight is allowed.
|
||||
// weight is value assigned to a compaction job that is log base N of total size of all input sstables.
|
||||
std::unordered_set<int> _weight_tracker;
|
||||
@@ -111,6 +111,7 @@ private:
|
||||
std::unordered_map<column_family*, rwlock> _compaction_locks;
|
||||
|
||||
semaphore _custom_job_sem{1};
|
||||
seastar::named_semaphore _rewrite_sstables_sem = {1, named_semaphore_exception_factory{"rewrite sstables"}};
|
||||
|
||||
std::function<void()> compaction_submission_callback();
|
||||
// all registered column families are submitted for compaction at a constant interval.
|
||||
@@ -255,11 +256,6 @@ public:
|
||||
// Stops ongoing compaction of a given type.
|
||||
void stop_compaction(sstring type);
|
||||
|
||||
// Called by compaction procedure to release the weight lock assigned to it, such that
|
||||
// another compaction waiting on same weight can start as soon as possible. That's usually
|
||||
// called before compaction seals sstable and such and after all compaction work is done.
|
||||
void on_compaction_complete(compaction_weight_registration& weight_registration);
|
||||
|
||||
double backlog() {
|
||||
return _backlog_manager.backlog();
|
||||
}
|
||||
|
||||
@@ -367,6 +367,7 @@ class index_reader {
|
||||
const io_priority_class& _pc;
|
||||
tracing::trace_state_ptr _trace_state;
|
||||
shared_index_lists _index_lists;
|
||||
future<> _background_closes = make_ready_future<>();
|
||||
|
||||
struct reader {
|
||||
index_consumer _consumer;
|
||||
@@ -472,6 +473,16 @@ private:
|
||||
};
|
||||
|
||||
return _index_lists.get_or_load(summary_idx, loader).then([this, &bound, summary_idx] (shared_index_lists::list_ptr ref) {
|
||||
// to make sure list is not closed when another bound is still using it, index list will only be closed when there's only one owner holding it
|
||||
if (bound.current_list && bound.current_list.use_count() == 1) {
|
||||
// a new background close will only be initiated when previous ones terminate, so as to limit the concurrency.
|
||||
_background_closes = _background_closes.then_wrapped([current_list = std::move(bound.current_list)] (future<>&& f) mutable {
|
||||
f.ignore_ready_future();
|
||||
return do_with(std::move(current_list), [] (shared_index_lists::list_ptr& current_list) mutable {
|
||||
return close_index_list(current_list);
|
||||
});
|
||||
});
|
||||
}
|
||||
bound.current_list = std::move(ref);
|
||||
bound.current_summary_idx = summary_idx;
|
||||
bound.current_index_idx = 0;
|
||||
@@ -841,6 +852,8 @@ public:
|
||||
return close_index_list(_upper_bound->current_list);
|
||||
}
|
||||
return make_ready_future<>();
|
||||
}).then([this] () mutable {
|
||||
return std::move(_background_closes);
|
||||
});
|
||||
}
|
||||
};
|
||||
|
||||
@@ -315,8 +315,8 @@ void sstable_writer_k_l::write_collection(file_writer& out, const composite& clu
|
||||
void sstable_writer_k_l::write_clustered_row(file_writer& out, const schema& schema, const clustering_row& clustered_row) {
|
||||
auto clustering_key = composite::from_clustering_element(schema, clustered_row.key());
|
||||
|
||||
maybe_write_row_marker(out, schema, clustered_row.marker(), clustering_key);
|
||||
maybe_write_row_tombstone(out, clustering_key, clustered_row);
|
||||
maybe_write_row_marker(out, schema, clustered_row.marker(), clustering_key);
|
||||
|
||||
_collector.update_min_max_components(clustered_row.key());
|
||||
|
||||
|
||||
@@ -147,7 +147,7 @@ leveled_compaction_strategy::get_reshaping_job(std::vector<shared_sstable> input
|
||||
unsigned overlapping_sstables = 0;
|
||||
auto prev_last = dht::ring_position::min();
|
||||
for (auto& sst : sstables) {
|
||||
if (dht::ring_position(sst->get_first_decorated_key()).less_compare(*schema, prev_last)) {
|
||||
if (dht::ring_position(sst->get_first_decorated_key()).tri_compare(*schema, prev_last) <= 0) {
|
||||
overlapping_sstables++;
|
||||
}
|
||||
prev_last = dht::ring_position(sst->get_last_decorated_key());
|
||||
@@ -178,7 +178,7 @@ leveled_compaction_strategy::get_reshaping_job(std::vector<shared_sstable> input
|
||||
|
||||
unsigned max_filled_level = 0;
|
||||
|
||||
size_t offstrategy_threshold = std::max(schema->min_compaction_threshold(), 4);
|
||||
size_t offstrategy_threshold = (mode == reshape_mode::strict) ? std::max(schema->min_compaction_threshold(), 4) : std::max(schema->max_compaction_threshold(), 32);
|
||||
size_t max_sstables = std::max(schema->max_compaction_threshold(), int(offstrategy_threshold));
|
||||
auto tolerance = [mode] (unsigned level) -> unsigned {
|
||||
if (mode == reshape_mode::strict) {
|
||||
@@ -189,10 +189,8 @@ leveled_compaction_strategy::get_reshaping_job(std::vector<shared_sstable> input
|
||||
};
|
||||
|
||||
if (level_info[0].size() > offstrategy_threshold) {
|
||||
level_info[0].resize(std::min(level_info[0].size(), max_sstables));
|
||||
compaction_descriptor desc(std::move(level_info[0]), std::optional<sstables::sstable_set>(), iop);
|
||||
desc.options = compaction_options::make_reshape();
|
||||
return desc;
|
||||
size_tiered_compaction_strategy stcs(_stcs_options);
|
||||
return stcs.get_reshaping_job(std::move(level_info[0]), schema, iop, mode);
|
||||
}
|
||||
|
||||
for (unsigned level = leveled_manifest::MAX_LEVELS - 1; level > 0; --level) {
|
||||
|
||||
@@ -1145,7 +1145,11 @@ public:
|
||||
setup_for_partition(pk);
|
||||
auto dk = dht::decorate_key(*_schema, pk);
|
||||
_reader->on_next_partition(std::move(dk), tombstone(deltime));
|
||||
return proceed::yes;
|
||||
// Only partition start will be consumed if processing a large run of partition tombstones,
|
||||
// so let's stop the consumer if buffer is full.
|
||||
// Otherwise, partition tombstones will keep accumulating in memory till other fragment type
|
||||
// is found which can stop the consumer (perhaps there's none if sstable is full of tombstones).
|
||||
return proceed(!_reader->is_buffer_full());
|
||||
}
|
||||
|
||||
virtual consumer_m::row_processing_result consume_row_start(const std::vector<temporary_buffer<char>>& ecp) override {
|
||||
|
||||
@@ -256,6 +256,7 @@ size_tiered_compaction_strategy::get_reshaping_job(std::vector<shared_sstable> i
|
||||
bucket.resize(std::min(max_sstables, bucket.size()));
|
||||
compaction_descriptor desc(std::move(bucket), std::optional<sstables::sstable_set>(), iop);
|
||||
desc.options = compaction_options::make_reshape();
|
||||
return desc;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -101,7 +101,8 @@ class time_window_compaction_strategy : public compaction_strategy_impl {
|
||||
time_window_compaction_strategy_options _options;
|
||||
int64_t _estimated_remaining_tasks = 0;
|
||||
db_clock::time_point _last_expired_check;
|
||||
timestamp_type _highest_window_seen;
|
||||
// As timestamp_type is an int64_t, a primitive type, it must be initialized here.
|
||||
timestamp_type _highest_window_seen = 0;
|
||||
// Keep track of all recent active windows that still need to be compacted into a single SSTable
|
||||
std::unordered_set<timestamp_type> _recent_active_windows;
|
||||
size_tiered_compaction_strategy_options _stcs_options;
|
||||
|
||||
4
table.cc
4
table.cc
@@ -1551,6 +1551,10 @@ future<> table::flush_streaming_mutations(utils::UUID plan_id, dht::partition_ra
|
||||
});
|
||||
}
|
||||
|
||||
bool table::can_flush() const {
|
||||
return _memtables->can_flush();
|
||||
}
|
||||
|
||||
future<> table::clear() {
|
||||
if (_commitlog) {
|
||||
_commitlog->discard_completed_segments(_schema->id());
|
||||
|
||||
@@ -80,7 +80,7 @@ def dynamodb(request):
|
||||
verify = not request.config.getoption('https')
|
||||
return boto3.resource('dynamodb', endpoint_url=local_url, verify=verify,
|
||||
region_name='us-east-1', aws_access_key_id='alternator', aws_secret_access_key='secret_pass',
|
||||
config=botocore.client.Config(retries={"max_attempts": 3}))
|
||||
config=botocore.client.Config(retries={"max_attempts": 0}, read_timeout=300))
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def dynamodbstreams(request):
|
||||
|
||||
@@ -154,6 +154,27 @@ def test_update_condition_eq_unequal(test_table_s):
|
||||
ConditionExpression='q = :oldval',
|
||||
ExpressionAttributeValues={':val1': 3, ':oldval': 2})
|
||||
|
||||
# In test_update_condition_eq_unequal() above we saw that a non-existent
|
||||
# attribute is not "=" to a value. Here we check what happens when two
|
||||
# non-existent attributes are checked for equality. It turns out, they should
|
||||
# *not* be considered equal. In short, an unset attribute is never equal to
|
||||
# anything - not even to another unset attribute.
|
||||
# Reproduces issue #8511.
|
||||
def test_update_condition_eq_two_unset(test_table_s):
|
||||
p = random_string()
|
||||
with pytest.raises(ClientError, match='ConditionalCheckFailedException'):
|
||||
test_table_s.update_item(Key={'p': p},
|
||||
UpdateExpression='SET a = :val1',
|
||||
ConditionExpression='q = z',
|
||||
ExpressionAttributeValues={':val1': 2})
|
||||
test_table_s.update_item(Key={'p': p},
|
||||
AttributeUpdates={'a': {'Value': 1, 'Action': 'PUT'}})
|
||||
with pytest.raises(ClientError, match='ConditionalCheckFailedException'):
|
||||
test_table_s.update_item(Key={'p': p},
|
||||
UpdateExpression='SET a = :val1',
|
||||
ConditionExpression='q = z',
|
||||
ExpressionAttributeValues={':val1': 3})
|
||||
|
||||
# Check that set equality is checked correctly. Unlike string equality (for
|
||||
# example), it cannot be done with just naive string comparison of the JSON
|
||||
# representation, and we need to allow for any order. (see issue #5021)
|
||||
@@ -175,6 +196,39 @@ def test_update_condition_eq_set(test_table_s):
|
||||
ExpressionAttributeValues={':val1': 3, ':oldval': set(['chinchilla', 'cat', 'dog', 'mouse'])})
|
||||
assert 'b' in test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item']
|
||||
|
||||
# The above test (test_update_condition_eq_set()) checked equality of simple
|
||||
# set attributes. But an attribute can contain a nested document, where the
|
||||
# set sits in a deep level (the set itself is a leaf in this hierarchy because
|
||||
# it can only contain numbers, strings or bytes). We need to correctly support
|
||||
# equality check in that case too.
|
||||
# Reproduces issue #8514.
|
||||
@pytest.mark.skip(reason="test needs nested update not yet in branch 4.3")
|
||||
def test_update_condition_eq_nested_set(test_table_s):
|
||||
p = random_string()
|
||||
# Because boto3 sorts the set values we give it, in order to generate a
|
||||
# set with a different order, we need to build it incrementally.
|
||||
test_table_s.update_item(Key={'p': p},
|
||||
AttributeUpdates={'a': {'Value': {'b': 'c', 'd': ['e', 'f', set(['g', 'h'])], 'i': set(['j', 'k'])}, 'Action': 'PUT'}})
|
||||
test_table_s.update_item(Key={'p': p},
|
||||
UpdateExpression='ADD a.d[2] :val1, a.i :val2',
|
||||
ExpressionAttributeValues={':val1': set(['l', 'm']), ':val2': set(['n', 'o'])})
|
||||
# Sanity check - the attribute contains the set we think it does
|
||||
expected = {'b': 'c', 'd': ['e', 'f', set(['g', 'h', 'l', 'm'])], 'i': set(['j', 'k', 'n', 'o'])}
|
||||
assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item']['a'] == expected
|
||||
# Now finally check that condition expression check knows the equality too.
|
||||
test_table_s.update_item(Key={'p': p},
|
||||
UpdateExpression='SET b = :val1',
|
||||
ConditionExpression='a = :oldval',
|
||||
ExpressionAttributeValues={':val1': 3, ':oldval': expected})
|
||||
assert 'b' in test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item']
|
||||
# Check that equality can also fail, if the inner set differs
|
||||
wrong = {'b': 'c', 'd': ['e', 'f', set(['g', 'h', 'l', 'bad'])], 'i': set(['j', 'k', 'n', 'o'])}
|
||||
with pytest.raises(ClientError, match='ConditionalCheckFailedException'):
|
||||
test_table_s.update_item(Key={'p': p},
|
||||
UpdateExpression='SET b = :val1',
|
||||
ConditionExpression='a = :oldval',
|
||||
ExpressionAttributeValues={':val1': 4, ':oldval': wrong})
|
||||
|
||||
# Test for ConditionExpression with operator "<>" (non-equality),
|
||||
def test_update_condition_ne(test_table_s):
|
||||
p = random_string()
|
||||
@@ -215,6 +269,54 @@ def test_update_condition_ne(test_table_s):
|
||||
ExpressionAttributeValues={':newval': 3, ':oldval': 1})
|
||||
assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item']['c'] == 3
|
||||
|
||||
# Check that set inequality is checked correctly. This reproduces the same
|
||||
# bug #5021 that we reproduced above in test_update_condition_eq_set(), just
|
||||
# that here we check the inequality operator instead of equality.
|
||||
# Reproduces issue #8513.
|
||||
def test_update_condition_ne_set(test_table_s):
|
||||
p = random_string()
|
||||
# Because boto3 sorts the set values we give it, in order to generate a
|
||||
# set with a different order, we need to build it incrementally.
|
||||
test_table_s.update_item(Key={'p': p},
|
||||
AttributeUpdates={'a': {'Value': set(['dog', 'chinchilla']), 'Action': 'PUT'}})
|
||||
test_table_s.update_item(Key={'p': p},
|
||||
UpdateExpression='ADD a :val1',
|
||||
ExpressionAttributeValues={':val1': set(['cat', 'mouse'])})
|
||||
# Sanity check - the attribute contains the set we think it does
|
||||
assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item']['a'] == set(['chinchilla', 'cat', 'dog', 'mouse'])
|
||||
# Now check that condition expression check knows there is no inequality
|
||||
# here.
|
||||
with pytest.raises(ClientError, match='ConditionalCheckFailedException'):
|
||||
test_table_s.update_item(Key={'p': p},
|
||||
UpdateExpression='SET b = :val1',
|
||||
ConditionExpression='a <> :oldval',
|
||||
ExpressionAttributeValues={':val1': 2, ':oldval': set(['chinchilla', 'cat', 'dog', 'mouse'])})
|
||||
# As a sanity check, also check something which should be unequal:
|
||||
test_table_s.update_item(Key={'p': p},
|
||||
UpdateExpression='SET b = :val1',
|
||||
ConditionExpression='a <> :oldval',
|
||||
ExpressionAttributeValues={':val1': 3, ':oldval': set(['chinchilla', 'cat', 'dog', 'horse'])})
|
||||
assert 'b' in test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item']
|
||||
|
||||
# In test_update_condition_ne() above we saw that a non-existent attribute is
|
||||
# "not equal" to any value. Here we check what happens when two non-existent
|
||||
# attributes are checked for non-equality. It turns out, they are also
|
||||
# considered "not equal". In short, an unset attribute is always "not equal" to
|
||||
# anything - even to another unset attribute.
|
||||
# Reproduces issue #8511.
|
||||
def test_update_condition_ne_two_unset(test_table_s):
|
||||
p = random_string()
|
||||
test_table_s.update_item(Key={'p': p},
|
||||
UpdateExpression='SET a = :val1',
|
||||
ConditionExpression='q <> z',
|
||||
ExpressionAttributeValues={':val1': 2})
|
||||
assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item']['a'] == 2
|
||||
test_table_s.update_item(Key={'p': p},
|
||||
UpdateExpression='SET a = :val1',
|
||||
ConditionExpression='q <> z',
|
||||
ExpressionAttributeValues={':val1': 3})
|
||||
assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item']['a'] == 3
|
||||
|
||||
# Test for ConditionExpression with operator "<"
|
||||
def test_update_condition_lt(test_table_s):
|
||||
p = random_string()
|
||||
@@ -316,6 +418,45 @@ def test_update_condition_lt(test_table_s):
|
||||
ExpressionAttributeValues={':newval': 2, ':oldval': 1})
|
||||
assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item']['z'] == 4
|
||||
|
||||
# In test_update_condition_lt() above we saw that a non-existent attribute is
|
||||
# not "<" any value. Here we check what happens when two non-existent
|
||||
# attributes are compared with "<". It turns out that the result of such
|
||||
# comparison is also false.
|
||||
# The same is true for other order operators - any order comparison involving
|
||||
# one unset attribute should be false - even if the second operand is an
|
||||
# unset attribute as well. Note that the <> operator is different - it
|
||||
# always results in true if one of the operands is an unset attribute (see
|
||||
# test_update_condition_ne_two_unset() above).
|
||||
# This test is related to issue #8511 (although it passed even before fixing
|
||||
# that issue).
|
||||
def test_update_condition_comparison_two_unset(test_table_s):
|
||||
p = random_string()
|
||||
ops = ['<', '<=', '>', '>=']
|
||||
for op in ops:
|
||||
with pytest.raises(ClientError, match='ConditionalCheckFailedException'):
|
||||
test_table_s.update_item(Key={'p': p},
|
||||
UpdateExpression='SET a = :val1',
|
||||
ConditionExpression='q ' + op + ' z',
|
||||
ExpressionAttributeValues={':val1': 2})
|
||||
with pytest.raises(ClientError, match='ConditionalCheckFailedException'):
|
||||
test_table_s.update_item(Key={'p': p},
|
||||
UpdateExpression='SET a = :val1',
|
||||
ConditionExpression='q between z and x',
|
||||
ExpressionAttributeValues={':val1': 2})
|
||||
test_table_s.update_item(Key={'p': p},
|
||||
AttributeUpdates={'a': {'Value': 1, 'Action': 'PUT'}})
|
||||
for op in ops:
|
||||
with pytest.raises(ClientError, match='ConditionalCheckFailedException'):
|
||||
test_table_s.update_item(Key={'p': p},
|
||||
UpdateExpression='SET a = :val1',
|
||||
ConditionExpression='q ' + op + ' z',
|
||||
ExpressionAttributeValues={':val1': 3})
|
||||
with pytest.raises(ClientError, match='ConditionalCheckFailedException'):
|
||||
test_table_s.update_item(Key={'p': p},
|
||||
UpdateExpression='SET a = :val1',
|
||||
ConditionExpression='q between z and x',
|
||||
ExpressionAttributeValues={':val1': 2})
|
||||
|
||||
# Test for ConditionExpression with operator "<="
|
||||
def test_update_condition_le(test_table_s):
|
||||
p = random_string()
|
||||
|
||||
@@ -578,11 +578,14 @@ SEASTAR_TEST_CASE(test_allocation_failure){
|
||||
|
||||
// Use up loads of memory so we can OOM at the appropriate place
|
||||
try {
|
||||
assert(fragmented_temporary_buffer::default_fragment_size < size);
|
||||
for (;;) {
|
||||
junk->emplace_back(new char[size]);
|
||||
junk->emplace_back(new char[fragmented_temporary_buffer::default_fragment_size]);
|
||||
}
|
||||
} catch (std::bad_alloc&) {
|
||||
}
|
||||
auto last = junk->end();
|
||||
junk->erase(--last);
|
||||
return log.add_mutation(utils::UUID_gen::get_time_UUID(), size, db::commitlog::force_sync::no, [size](db::commitlog::output& dst) {
|
||||
dst.fill(char(1), size);
|
||||
}).then_wrapped([junk, size](future<db::rp_handle> f) {
|
||||
|
||||
@@ -550,3 +550,71 @@ SEASTAR_THREAD_TEST_CASE(read_max_size) {
|
||||
}
|
||||
}).get();
|
||||
}
|
||||
|
||||
// Check that mutation queries, those that are stopped when the memory
|
||||
// consumed by their results reaches the local/global limit, are aborted
|
||||
// instead of silently terminated when this happens.
|
||||
SEASTAR_THREAD_TEST_CASE(unpaged_mutation_read_global_limit) {
|
||||
auto cfg = cql_test_config{};
|
||||
cfg.dbcfg.emplace();
|
||||
// The memory available to the result memory limiter (global limit) is
|
||||
// configured based on the available memory, so give a small amount to
|
||||
// the "node", so we don't have to work with a large amount of data.
|
||||
cfg.dbcfg->available_memory = 2 * 1024 * 1024;
|
||||
do_with_cql_env_thread([] (cql_test_env& e) {
|
||||
e.execute_cql("CREATE TABLE test (pk text, ck int, v text, PRIMARY KEY (pk, ck));").get();
|
||||
auto id = e.prepare("INSERT INTO test (pk, ck, v) VALUES (?, ?, ?);").get0();
|
||||
|
||||
auto& db = e.local_db();
|
||||
auto& tab = db.find_column_family("ks", "test");
|
||||
auto s = tab.schema();
|
||||
|
||||
auto pk = make_local_key(s);
|
||||
const auto raw_pk = utf8_type->decompose(data_value(pk));
|
||||
const auto cql3_pk = cql3::raw_value::make_value(raw_pk);
|
||||
|
||||
const auto value = sstring(1024, 'a');
|
||||
const auto raw_value = utf8_type->decompose(data_value(value));
|
||||
const auto cql3_value = cql3::raw_value::make_value(raw_value);
|
||||
|
||||
const int num_rows = 1024;
|
||||
const auto max_size = 1024u * 1024u * 1024u;
|
||||
|
||||
for (int i = 0; i != num_rows; ++i) {
|
||||
const auto cql3_ck = cql3::raw_value::make_value(int32_type->decompose(data_value(i)));
|
||||
e.execute_prepared(id, {cql3_pk, cql3_ck, cql3_value}).get();
|
||||
}
|
||||
|
||||
const auto partition_ranges = std::vector<dht::partition_range>{query::full_partition_range};
|
||||
|
||||
const std::vector<std::pair<sstring, std::function<future<size_t>(schema_ptr, const query::read_command&)>>> query_methods{
|
||||
{"query_mutations()", [&db, &partition_ranges] (schema_ptr s, const query::read_command& cmd) -> future<size_t> {
|
||||
return db.query_mutations(s, cmd, partition_ranges.front(), {}, db::no_timeout).then(
|
||||
[] (const std::tuple<reconcilable_result, cache_temperature>& res) {
|
||||
return std::get<0>(res).memory_usage();
|
||||
});
|
||||
}},
|
||||
{"query_mutations_on_all_shards()", [&e, &partition_ranges] (schema_ptr s, const query::read_command& cmd) -> future<size_t> {
|
||||
return query_mutations_on_all_shards(e.db(), s, cmd, partition_ranges, {}, db::no_timeout).then(
|
||||
[] (const std::tuple<foreign_ptr<lw_shared_ptr<reconcilable_result>>, cache_temperature>& res) {
|
||||
return std::get<0>(res)->memory_usage();
|
||||
});
|
||||
}}
|
||||
};
|
||||
|
||||
for (auto [query_method_name, query_method] : query_methods) {
|
||||
testlog.info("checking: query_method={}", query_method_name);
|
||||
auto slice = s->full_slice();
|
||||
slice.options.remove<query::partition_slice::option::allow_short_read>();
|
||||
query::read_command cmd(s->id(), s->version(), slice, query::max_result_size(max_size));
|
||||
try {
|
||||
auto size = query_method(s, cmd).get0();
|
||||
// Just to ensure we are not interpreting empty results as success.
|
||||
BOOST_REQUIRE(size != 0);
|
||||
BOOST_FAIL("Expected exception, but none was thrown.");
|
||||
} catch (std::runtime_error& e) {
|
||||
testlog.trace("Exception thrown, as expected: {}", e);
|
||||
}
|
||||
}
|
||||
}, std::move(cfg)).get();
|
||||
}
|
||||
|
||||
@@ -974,14 +974,7 @@ SEASTAR_THREAD_TEST_CASE(fuzzy_test) {
|
||||
|
||||
const auto& partitions = pop_desc.partitions;
|
||||
smp::invoke_on_all([cfg, db = &env.db(), gs = global_schema_ptr(pop_desc.schema), &partitions] {
|
||||
auto s = gs.get();
|
||||
auto& sem = db->local().get_reader_concurrency_semaphore();
|
||||
|
||||
auto resources = sem.available_resources();
|
||||
resources -= reader_concurrency_semaphore::resources{1, 0};
|
||||
auto permit = sem.make_permit(s.get(), "fuzzy-test");
|
||||
|
||||
return run_fuzzy_test_workload(cfg, *db, std::move(s), partitions).finally([units = permit.consume_resources(resources)] {});
|
||||
return run_fuzzy_test_workload(cfg, *db, gs.get(), partitions);
|
||||
}).handle_exception([seed] (std::exception_ptr e) {
|
||||
testlog.error("Test workload failed with exception {}."
|
||||
" To repeat this particular run, replace the random seed of the test, with that of this run ({})."
|
||||
|
||||
@@ -894,6 +894,232 @@ sstables::shared_sstable create_sstable(sstables::test_env& env, simple_schema&
|
||||
, mutations);
|
||||
}
|
||||
|
||||
namespace {
|
||||
|
||||
class generic_inactive_read : public reader_concurrency_semaphore::inactive_read {
|
||||
flat_mutation_reader_opt _reader;
|
||||
|
||||
private:
|
||||
explicit generic_inactive_read(flat_mutation_reader&& rd) : _reader(std::move(rd)) { }
|
||||
|
||||
virtual void evict() override {
|
||||
_reader = {};
|
||||
}
|
||||
|
||||
public:
|
||||
static std::unique_ptr<inactive_read> make(flat_mutation_reader&& rd) {
|
||||
return std::make_unique<generic_inactive_read>(generic_inactive_read(std::move(rd)));
|
||||
}
|
||||
|
||||
static flat_mutation_reader_opt get_reader(std::unique_ptr<inactive_read>&& ir) {
|
||||
if (!ir) {
|
||||
return {};
|
||||
}
|
||||
auto gir = dynamic_cast<generic_inactive_read*>(ir.get());
|
||||
BOOST_REQUIRE(gir);
|
||||
return std::move(gir->_reader);
|
||||
}
|
||||
};
|
||||
|
||||
} // anonymous namespace
|
||||
|
||||
// This unit test passes a read through admission again and again, just
|
||||
// like an evictable reader would be during its lifetime. When readmitted
|
||||
// the read sometimes has to wait and sometimes not. This is to check that
|
||||
// readmitting a previously admitted reader doesn't leak any units.
|
||||
SEASTAR_THREAD_TEST_CASE(test_reader_concurrency_semaphore_readmission_preserves_units) {
|
||||
simple_schema s;
|
||||
const auto initial_resources = reader_concurrency_semaphore::resources{10, 1024 * 1024};
|
||||
reader_concurrency_semaphore semaphore(initial_resources.count, initial_resources.memory, get_name());
|
||||
|
||||
auto permit = semaphore.make_permit(s.schema().get(), get_name());
|
||||
|
||||
std::optional<reader_permit::resource_units> residue_units;
|
||||
|
||||
for (int i = 0; i < 10; ++i) {
|
||||
const auto have_residue_units = bool(residue_units);
|
||||
|
||||
auto current_resources = initial_resources;
|
||||
if (have_residue_units) {
|
||||
current_resources -= residue_units->resources();
|
||||
}
|
||||
BOOST_REQUIRE(semaphore.available_resources() == current_resources);
|
||||
|
||||
std::optional<reader_permit::resource_units> admitted_units;
|
||||
if (i % 2) {
|
||||
const auto consumed_resources = semaphore.available_resources();
|
||||
semaphore.consume(consumed_resources);
|
||||
|
||||
auto units_fut = permit.wait_admission(1024, db::no_timeout);
|
||||
BOOST_REQUIRE(!units_fut.available());
|
||||
|
||||
semaphore.signal(consumed_resources);
|
||||
admitted_units = units_fut.get();
|
||||
} else {
|
||||
admitted_units = permit.wait_admission(1024, db::no_timeout).get();
|
||||
}
|
||||
|
||||
current_resources -= admitted_units->resources();
|
||||
BOOST_REQUIRE(semaphore.available_resources() == current_resources);
|
||||
|
||||
residue_units.emplace(permit.consume_resources(reader_resources(0, 100)));
|
||||
if (!have_residue_units) {
|
||||
current_resources -= residue_units->resources();
|
||||
}
|
||||
BOOST_REQUIRE(semaphore.available_resources() == current_resources);
|
||||
|
||||
auto handle = semaphore.register_inactive_read(generic_inactive_read::make(make_empty_flat_reader(s.schema(), permit)));
|
||||
(void)handle;
|
||||
BOOST_REQUIRE(semaphore.try_evict_one_inactive_read());
|
||||
}
|
||||
|
||||
BOOST_REQUIRE(semaphore.available_resources() == initial_resources - residue_units->resources());
|
||||
|
||||
residue_units.reset();
|
||||
|
||||
BOOST_REQUIRE(semaphore.available_resources() == initial_resources);
|
||||
}
|
||||
|
||||
// This unit test checks that the semaphore doesn't get into a deadlock
|
||||
// when contended, in the presence of many memory-only reads (that don't
|
||||
// wait for admission). This is tested by simulating the 3 kinds of reads we
|
||||
// currently have in the system:
|
||||
// * memory-only: reads that don't pass admission and only own memory.
|
||||
// * admitted: reads that pass admission.
|
||||
// * evictable: admitted reads that are furthermore evictable.
|
||||
//
|
||||
// The test creates and runs a large number of these reads in parallel,
|
||||
// read kinds being selected randomly, then creates a watchdog which
|
||||
// kills the test if no progress is being made.
|
||||
SEASTAR_THREAD_TEST_CASE(test_reader_concurrency_semaphore_forward_progress) {
|
||||
class reader {
|
||||
class skeleton_reader : public flat_mutation_reader::impl {
|
||||
reader_permit::resource_units _base_resources;
|
||||
std::optional<reader_permit::resource_units> _resources;
|
||||
public:
|
||||
skeleton_reader(schema_ptr s, reader_permit permit, reader_permit::resource_units res)
|
||||
: impl(std::move(s), std::move(permit)), _base_resources(std::move(res)) { }
|
||||
virtual future<> fill_buffer(db::timeout_clock::time_point timeout) override {
|
||||
_resources.emplace(_permit.consume_resources(reader_resources(0, tests::random::get_int(1024, 2048))));
|
||||
return make_ready_future<>();
|
||||
}
|
||||
virtual void next_partition() override { }
|
||||
virtual future<> fast_forward_to(const dht::partition_range& pr, db::timeout_clock::time_point timeout) override { return make_ready_future<>(); }
|
||||
virtual future<> fast_forward_to(position_range, db::timeout_clock::time_point timeout) override { return make_ready_future<>(); }
|
||||
};
|
||||
struct reader_visitor {
|
||||
reader& r;
|
||||
future<> operator()(std::monostate& ms) { return r.tick(ms); }
|
||||
future<> operator()(flat_mutation_reader& reader) { return r.tick(reader); }
|
||||
future<> operator()(reader_concurrency_semaphore::inactive_read_handle& handle) { return r.tick(handle); }
|
||||
};
|
||||
|
||||
private:
|
||||
schema_ptr _schema;
|
||||
reader_permit _permit;
|
||||
bool _memory_only = true;
|
||||
bool _evictable = false;
|
||||
std::optional<reader_permit::resource_units> _units;
|
||||
std::variant<std::monostate, flat_mutation_reader, reader_concurrency_semaphore::inactive_read_handle> _reader;
|
||||
|
||||
private:
|
||||
future<> make_reader() {
|
||||
return async([this] {
|
||||
auto res = _permit.consume_memory();
|
||||
if (!_memory_only) {
|
||||
res = _permit.wait_admission(1024, db::no_timeout).get0();
|
||||
}
|
||||
_reader = make_flat_mutation_reader<skeleton_reader>(_schema, _permit, std::move(res));
|
||||
});
|
||||
}
|
||||
future<> tick(std::monostate&) {
|
||||
return async([this] {
|
||||
make_reader().get();
|
||||
tick(std::get<flat_mutation_reader>(_reader)).get();
|
||||
});
|
||||
}
|
||||
future<> tick(flat_mutation_reader& reader) {
|
||||
return async([this, &reader] {
|
||||
reader.fill_buffer(db::no_timeout).get();
|
||||
if (_evictable) {
|
||||
_reader = _permit.semaphore().register_inactive_read(generic_inactive_read::make(std::move(reader)));
|
||||
}
|
||||
});
|
||||
}
|
||||
future<> tick(reader_concurrency_semaphore::inactive_read_handle& handle) {
|
||||
return async([this, &handle] () mutable {
|
||||
if (auto reader = generic_inactive_read::get_reader(_permit.semaphore().unregister_inactive_read(std::move(handle))); reader) {
|
||||
_reader = std::move(*reader);
|
||||
} else {
|
||||
make_reader().get();
|
||||
}
|
||||
tick(std::get<flat_mutation_reader>(_reader)).get();
|
||||
});
|
||||
}
|
||||
|
||||
public:
|
||||
reader(schema_ptr s, reader_permit permit, bool memory_only, bool evictable)
|
||||
: _schema(std::move(s))
|
||||
, _permit(std::move(permit))
|
||||
, _memory_only(memory_only)
|
||||
, _evictable(evictable)
|
||||
, _units(_permit.consume_memory(tests::random::get_int(128, 1024)))
|
||||
{
|
||||
}
|
||||
future<> tick() {
|
||||
return std::visit(reader_visitor{*this}, _reader);
|
||||
}
|
||||
};
|
||||
|
||||
const auto count = 10;
|
||||
const auto num_readers = 512;
|
||||
const auto ticks = 1000;
|
||||
|
||||
simple_schema s;
|
||||
reader_concurrency_semaphore semaphore(count, count * 1024, get_name());
|
||||
|
||||
std::list<std::optional<reader>> readers;
|
||||
unsigned nr_memory_only = 0;
|
||||
unsigned nr_admitted = 0;
|
||||
unsigned nr_evictable = 0;
|
||||
|
||||
for (auto i = 0; i < num_readers; ++i) {
|
||||
const auto memory_only = tests::random::get_bool();
|
||||
const auto evictable = !memory_only && tests::random::get_bool();
|
||||
if (memory_only) {
|
||||
++nr_memory_only;
|
||||
} else if (evictable) {
|
||||
++nr_evictable;
|
||||
} else {
|
||||
++nr_admitted;
|
||||
}
|
||||
readers.emplace_back(reader(s.schema(), semaphore.make_permit(s.schema().get(), fmt::format("reader{}", i)), memory_only, evictable));
|
||||
}
|
||||
|
||||
testlog.info("Created {} readers, memory_only={}, admitted={}, evictable={}", readers.size(), nr_memory_only, nr_admitted, nr_evictable);
|
||||
|
||||
bool watchdog_touched = false;
|
||||
auto watchdog = timer<db::timeout_clock>([&semaphore, &watchdog_touched] {
|
||||
if (!watchdog_touched) {
|
||||
testlog.error("Watchdog detected a deadlock, dumping diagnostics before killing the test: {}", semaphore.dump_diagnostics());
|
||||
semaphore.broken(std::make_exception_ptr(std::runtime_error("test killed by watchdog")));
|
||||
}
|
||||
watchdog_touched = false;
|
||||
});
|
||||
watchdog.arm_periodic(std::chrono::seconds(30));
|
||||
|
||||
parallel_for_each(readers, [&] (std::optional<reader>& r) -> future<> {
|
||||
return async([this, &watchdog_touched, &r] {
|
||||
for (auto i = 0; i < ticks; ++i) {
|
||||
watchdog_touched = true;
|
||||
r->tick().get();
|
||||
}
|
||||
r.reset();
|
||||
watchdog_touched = true;
|
||||
});
|
||||
}).get();
|
||||
}
|
||||
|
||||
static
|
||||
sstables::shared_sstable create_sstable(sstables::test_env& env, schema_ptr s, std::vector<mutation> mutations) {
|
||||
static thread_local auto tmp = tmpdir();
|
||||
@@ -3041,39 +3267,30 @@ flat_mutation_reader create_evictable_reader_and_evict_after_first_buffer(
|
||||
reader_permit permit,
|
||||
const dht::partition_range& prange,
|
||||
const query::partition_slice& slice,
|
||||
std::deque<mutation_fragment> first_buffer,
|
||||
position_in_partition_view last_fragment_position,
|
||||
std::deque<mutation_fragment> second_buffer,
|
||||
size_t max_buffer_size) {
|
||||
std::list<std::deque<mutation_fragment>> buffers,
|
||||
position_in_partition_view first_buf_last_fragment_position,
|
||||
size_t max_buffer_size,
|
||||
bool detach_buffer = true) {
|
||||
class factory {
|
||||
schema_ptr _schema;
|
||||
reader_permit _permit;
|
||||
std::optional<std::deque<mutation_fragment>> _first_buffer;
|
||||
std::optional<std::deque<mutation_fragment>> _second_buffer;
|
||||
std::list<std::deque<mutation_fragment>> _buffers;
|
||||
size_t _max_buffer_size;
|
||||
|
||||
private:
|
||||
std::optional<std::deque<mutation_fragment>> copy_buffer(const std::optional<std::deque<mutation_fragment>>& o) {
|
||||
if (!o) {
|
||||
return {};
|
||||
}
|
||||
return copy_fragments(*_schema, _permit, *o);
|
||||
}
|
||||
|
||||
public:
|
||||
factory(schema_ptr schema, reader_permit permit, std::deque<mutation_fragment> first_buffer, std::deque<mutation_fragment> second_buffer, size_t max_buffer_size)
|
||||
factory(schema_ptr schema, reader_permit permit, std::list<std::deque<mutation_fragment>> buffers, size_t max_buffer_size)
|
||||
: _schema(std::move(schema))
|
||||
, _permit(std::move(permit))
|
||||
, _first_buffer(std::move(first_buffer))
|
||||
, _second_buffer(std::move(second_buffer))
|
||||
, _buffers(std::move(buffers))
|
||||
, _max_buffer_size(max_buffer_size) {
|
||||
}
|
||||
|
||||
factory(const factory& o)
|
||||
: _schema(o._schema)
|
||||
, _permit(o._permit)
|
||||
, _first_buffer(copy_buffer(o._first_buffer))
|
||||
, _second_buffer(copy_buffer(o._second_buffer)) {
|
||||
, _permit(o._permit) {
|
||||
for (const auto& buf : o._buffers) {
|
||||
_buffers.emplace_back(copy_fragments(*_schema, _permit, buf));
|
||||
}
|
||||
}
|
||||
factory(factory&& o) = default;
|
||||
|
||||
@@ -3087,14 +3304,9 @@ flat_mutation_reader create_evictable_reader_and_evict_after_first_buffer(
|
||||
streamed_mutation::forwarding fwd_sm,
|
||||
mutation_reader::forwarding fwd_mr) {
|
||||
BOOST_REQUIRE(s == _schema);
|
||||
if (_first_buffer) {
|
||||
auto buf = *std::exchange(_first_buffer, {});
|
||||
auto rd = make_flat_mutation_reader_from_fragments(_schema, std::move(permit), std::move(buf));
|
||||
rd.set_max_buffer_size(_max_buffer_size);
|
||||
return rd;
|
||||
}
|
||||
if (_second_buffer) {
|
||||
auto buf = *std::exchange(_second_buffer, {});
|
||||
if (!_buffers.empty()) {
|
||||
auto buf = std::move(_buffers.front());
|
||||
_buffers.pop_front();
|
||||
auto rd = make_flat_mutation_reader_from_fragments(_schema, std::move(permit), std::move(buf));
|
||||
rd.set_max_buffer_size(_max_buffer_size);
|
||||
return rd;
|
||||
@@ -3102,9 +3314,9 @@ flat_mutation_reader create_evictable_reader_and_evict_after_first_buffer(
|
||||
return make_empty_flat_reader(_schema, std::move(permit));
|
||||
}
|
||||
};
|
||||
auto ms = mutation_source(factory(schema, permit, std::move(first_buffer), std::move(second_buffer), max_buffer_size));
|
||||
auto ms = mutation_source(factory(schema, permit, std::move(buffers), max_buffer_size));
|
||||
|
||||
auto [rd, handle] = make_manually_paused_evictable_reader(
|
||||
auto rd = make_auto_paused_evictable_reader(
|
||||
std::move(ms),
|
||||
schema,
|
||||
permit,
|
||||
@@ -3120,18 +3332,42 @@ flat_mutation_reader create_evictable_reader_and_evict_after_first_buffer(
|
||||
|
||||
const auto eq_cmp = position_in_partition::equal_compare(*schema);
|
||||
BOOST_REQUIRE(rd.is_buffer_full());
|
||||
BOOST_REQUIRE(eq_cmp(rd.buffer().back().position(), last_fragment_position));
|
||||
BOOST_REQUIRE(eq_cmp(rd.buffer().back().position(), first_buf_last_fragment_position));
|
||||
BOOST_REQUIRE(!rd.is_end_of_stream());
|
||||
|
||||
rd.detach_buffer();
|
||||
|
||||
handle.pause();
|
||||
if (detach_buffer) {
|
||||
rd.detach_buffer();
|
||||
}
|
||||
|
||||
while(permit.semaphore().try_evict_one_inactive_read());
|
||||
|
||||
return std::move(rd);
|
||||
}
|
||||
|
||||
flat_mutation_reader create_evictable_reader_and_evict_after_first_buffer(
|
||||
schema_ptr schema,
|
||||
reader_permit permit,
|
||||
const dht::partition_range& prange,
|
||||
const query::partition_slice& slice,
|
||||
std::deque<mutation_fragment> first_buffer,
|
||||
position_in_partition_view last_fragment_position,
|
||||
std::deque<mutation_fragment> last_buffer,
|
||||
size_t max_buffer_size,
|
||||
bool detach_buffer = true) {
|
||||
std::list<std::deque<mutation_fragment>> list;
|
||||
list.emplace_back(std::move(first_buffer));
|
||||
list.emplace_back(std::move(last_buffer));
|
||||
return create_evictable_reader_and_evict_after_first_buffer(
|
||||
std::move(schema),
|
||||
std::move(permit),
|
||||
prange,
|
||||
slice,
|
||||
std::move(list),
|
||||
last_fragment_position,
|
||||
max_buffer_size,
|
||||
detach_buffer);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
SEASTAR_THREAD_TEST_CASE(test_evictable_reader_trim_range_tombstones) {
|
||||
@@ -3433,7 +3669,7 @@ SEASTAR_THREAD_TEST_CASE(test_evictable_reader_self_validation) {
|
||||
|
||||
check_evictable_reader_validation_is_triggered(
|
||||
"pkey > _last_pkey; pkey ∈ pkrange",
|
||||
partition_error_prefix,
|
||||
"",
|
||||
s.schema(),
|
||||
permit,
|
||||
prange,
|
||||
@@ -3521,3 +3757,208 @@ SEASTAR_THREAD_TEST_CASE(test_evictable_reader_self_validation) {
|
||||
make_second_buffer(pkeys[3]),
|
||||
max_buffer_size);
|
||||
}
|
||||
|
||||
SEASTAR_THREAD_TEST_CASE(test_evictable_reader_drop_flags) {
|
||||
reader_concurrency_semaphore semaphore(1, 0, get_name());
|
||||
simple_schema s;
|
||||
auto permit = semaphore.make_permit(s.schema().get(), get_name());
|
||||
|
||||
auto pkeys = s.make_pkeys(2);
|
||||
std::sort(pkeys.begin(), pkeys.end(), [&s] (const auto& pk1, const auto& pk2) {
|
||||
return pk1.less_compare(*s.schema(), pk2);
|
||||
});
|
||||
const auto& pkey1 = pkeys[0];
|
||||
const auto& pkey2 = pkeys[1];
|
||||
const int second_buffer_ck = 10;
|
||||
|
||||
struct buffer {
|
||||
simple_schema& s;
|
||||
reader_permit permit;
|
||||
std::deque<mutation_fragment> frags;
|
||||
std::vector<mutation> muts;
|
||||
size_t size = 0;
|
||||
std::optional<position_in_partition_view> last_pos;
|
||||
|
||||
buffer(simple_schema& s_, reader_permit permit_, dht::decorated_key key)
|
||||
: s(s_), permit(std::move(permit_)) {
|
||||
add_partition(key);
|
||||
}
|
||||
size_t add_partition(dht::decorated_key key) {
|
||||
size += frags.emplace_back(*s.schema(), permit, partition_start{key, {}}).memory_usage();
|
||||
muts.emplace_back(s.schema(), key);
|
||||
return size;
|
||||
}
|
||||
size_t add_mutation_fragment(mutation_fragment&& mf, bool only_to_frags = false) {
|
||||
if (!only_to_frags) {
|
||||
muts.back().apply(mf);
|
||||
}
|
||||
size += frags.emplace_back(*s.schema(), permit, std::move(mf)).memory_usage();
|
||||
return size;
|
||||
}
|
||||
size_t add_static_row(std::optional<mutation_fragment> sr = {}) {
|
||||
auto srow = sr ? std::move(*sr) : s.make_static_row("s");
|
||||
return add_mutation_fragment(std::move(srow));
|
||||
}
|
||||
size_t add_clustering_row(int i, bool only_to_frags = false) {
|
||||
return add_mutation_fragment(mutation_fragment(*s.schema(), permit, s.make_row(s.make_ckey(i), "v")), only_to_frags);
|
||||
}
|
||||
size_t add_clustering_rows(int start, int end) {
|
||||
for (int i = start; i < end; ++i) {
|
||||
add_clustering_row(i);
|
||||
}
|
||||
return size;
|
||||
}
|
||||
size_t add_partition_end() {
|
||||
size += frags.emplace_back(*s.schema(), permit, partition_end{}).memory_usage();
|
||||
return size;
|
||||
}
|
||||
void save_position() { last_pos = frags.back().position(); }
|
||||
void find_position(size_t buf_size) {
|
||||
size_t s = 0;
|
||||
for (const auto& frag : frags) {
|
||||
s += frag.memory_usage();
|
||||
if (s >= buf_size) {
|
||||
last_pos = frag.position();
|
||||
break;
|
||||
}
|
||||
}
|
||||
BOOST_REQUIRE(last_pos);
|
||||
}
|
||||
};
|
||||
|
||||
auto make_reader = [&] (const buffer& first_buffer, const buffer& second_buffer, const buffer* const third_buffer, size_t max_buffer_size) {
|
||||
std::list<std::deque<mutation_fragment>> buffers;
|
||||
buffers.emplace_back(copy_fragments(*s.schema(), permit, first_buffer.frags));
|
||||
buffers.emplace_back(copy_fragments(*s.schema(), permit, second_buffer.frags));
|
||||
if (third_buffer) {
|
||||
buffers.emplace_back(copy_fragments(*s.schema(), permit, third_buffer->frags));
|
||||
}
|
||||
return create_evictable_reader_and_evict_after_first_buffer(
|
||||
s.schema(),
|
||||
permit,
|
||||
query::full_partition_range,
|
||||
s.schema()->full_slice(),
|
||||
std::move(buffers),
|
||||
*first_buffer.last_pos,
|
||||
max_buffer_size,
|
||||
false);
|
||||
};
|
||||
|
||||
testlog.info("Same partition, with static row");
|
||||
{
|
||||
buffer first_buffer(s, permit, pkey1);
|
||||
first_buffer.add_static_row();
|
||||
auto srow = mutation_fragment(*s.schema(), permit, first_buffer.frags.back());
|
||||
const auto buf_size = first_buffer.add_clustering_rows(0, second_buffer_ck);
|
||||
first_buffer.save_position();
|
||||
first_buffer.add_clustering_row(second_buffer_ck);
|
||||
|
||||
buffer second_buffer(s, permit, pkey1);
|
||||
second_buffer.add_static_row(std::move(srow));
|
||||
second_buffer.add_clustering_row(second_buffer_ck);
|
||||
second_buffer.add_clustering_row(second_buffer_ck + 1);
|
||||
second_buffer.add_partition_end();
|
||||
|
||||
assert_that(make_reader(first_buffer, second_buffer, nullptr, buf_size))
|
||||
.has_monotonic_positions();
|
||||
|
||||
assert_that(make_reader(first_buffer, second_buffer, nullptr, buf_size))
|
||||
.produces(first_buffer.muts[0] + second_buffer.muts[0])
|
||||
.produces_end_of_stream();
|
||||
}
|
||||
|
||||
testlog.info("Same partition, no static row");
|
||||
{
|
||||
buffer first_buffer(s, permit, pkey1);
|
||||
const auto buf_size = first_buffer.add_clustering_rows(0, second_buffer_ck);
|
||||
first_buffer.save_position();
|
||||
first_buffer.add_clustering_row(second_buffer_ck);
|
||||
|
||||
buffer second_buffer(s, permit, pkey1);
|
||||
second_buffer.add_clustering_row(second_buffer_ck);
|
||||
second_buffer.add_clustering_row(second_buffer_ck + 1);
|
||||
second_buffer.add_partition_end();
|
||||
|
||||
assert_that(make_reader(first_buffer, second_buffer, nullptr, buf_size))
|
||||
.has_monotonic_positions();
|
||||
|
||||
assert_that(make_reader(first_buffer, second_buffer, nullptr, buf_size))
|
||||
.produces(first_buffer.muts[0] + second_buffer.muts[0])
|
||||
.produces_end_of_stream();
|
||||
}
|
||||
|
||||
testlog.info("Same partition as expected, no static row, next partition has static row (#8923)");
|
||||
{
|
||||
buffer second_buffer(s, permit, pkey1);
|
||||
second_buffer.add_clustering_rows(second_buffer_ck, second_buffer_ck + second_buffer_ck / 2);
|
||||
// We want to end the buffer on the partition-start below, but since a
|
||||
// partition start will be dropped from it, we have to use the size
|
||||
// without it.
|
||||
const auto buf_size = second_buffer.add_partition_end();
|
||||
second_buffer.add_partition(pkey2);
|
||||
second_buffer.add_static_row();
|
||||
auto srow = mutation_fragment(*s.schema(), permit, second_buffer.frags.back());
|
||||
second_buffer.add_clustering_rows(0, 2);
|
||||
|
||||
buffer first_buffer(s, permit, pkey1);
|
||||
for (int i = 0; first_buffer.add_clustering_row(i) < buf_size; ++i);
|
||||
first_buffer.save_position();
|
||||
first_buffer.add_mutation_fragment(mutation_fragment(*s.schema(), permit, second_buffer.frags[1]));
|
||||
|
||||
buffer third_buffer(s, permit, pkey2);
|
||||
third_buffer.add_static_row(std::move(srow));
|
||||
third_buffer.add_clustering_rows(0, 2);
|
||||
third_buffer.add_partition_end();
|
||||
|
||||
first_buffer.find_position(buf_size);
|
||||
|
||||
assert_that(make_reader(first_buffer, second_buffer, &third_buffer, buf_size))
|
||||
.has_monotonic_positions();
|
||||
|
||||
assert_that(make_reader(first_buffer, second_buffer, &third_buffer, buf_size))
|
||||
.produces(first_buffer.muts[0] + second_buffer.muts[0])
|
||||
.produces(second_buffer.muts[1] + third_buffer.muts[0])
|
||||
.produces_end_of_stream();
|
||||
}
|
||||
|
||||
testlog.info("Next partition, with no static row");
|
||||
{
|
||||
buffer first_buffer(s, permit, pkey1);
|
||||
const auto buf_size = first_buffer.add_clustering_rows(0, second_buffer_ck);
|
||||
first_buffer.save_position();
|
||||
first_buffer.add_clustering_row(second_buffer_ck + 1, true);
|
||||
|
||||
buffer second_buffer(s, permit, pkey2);
|
||||
second_buffer.add_clustering_rows(0, second_buffer_ck / 2);
|
||||
second_buffer.add_partition_end();
|
||||
|
||||
assert_that(make_reader(first_buffer, second_buffer, nullptr, buf_size))
|
||||
.has_monotonic_positions();
|
||||
|
||||
assert_that(make_reader(first_buffer, second_buffer, nullptr, buf_size))
|
||||
.produces(first_buffer.muts[0])
|
||||
.produces(second_buffer.muts[0])
|
||||
.produces_end_of_stream();
|
||||
}
|
||||
|
||||
testlog.info("Next partition, with static row");
|
||||
{
|
||||
buffer first_buffer(s, permit, pkey1);
|
||||
const auto buf_size = first_buffer.add_clustering_rows(0, second_buffer_ck);
|
||||
first_buffer.save_position();
|
||||
first_buffer.add_clustering_row(second_buffer_ck + 1, true);
|
||||
|
||||
buffer second_buffer(s, permit, pkey2);
|
||||
second_buffer.add_static_row();
|
||||
second_buffer.add_clustering_rows(0, second_buffer_ck / 2);
|
||||
second_buffer.add_partition_end();
|
||||
|
||||
assert_that(make_reader(first_buffer, second_buffer, nullptr, buf_size))
|
||||
.has_monotonic_positions();
|
||||
|
||||
assert_that(make_reader(first_buffer, second_buffer, nullptr, buf_size))
|
||||
.produces(first_buffer.muts[0])
|
||||
.produces(second_buffer.muts[0])
|
||||
.produces_end_of_stream();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1805,12 +1805,16 @@ SEASTAR_TEST_CASE(test_mutation_diff_with_random_generator) {
|
||||
BOOST_FAIL(format("Partitions don't match, got: {}\n...and: {}", mutation_partition::printer(s, mp1), mutation_partition::printer(s, mp2)));
|
||||
}
|
||||
};
|
||||
for_each_mutation_pair([&] (auto&& m1, auto&& m2, are_equal eq) {
|
||||
const auto now = gc_clock::now();
|
||||
can_gc_fn never_gc = [] (tombstone) { return false; };
|
||||
for_each_mutation_pair([&] (auto m1, auto m2, are_equal eq) {
|
||||
mutation_application_stats app_stats;
|
||||
auto s = m1.schema();
|
||||
if (s != m2.schema()) {
|
||||
return;
|
||||
}
|
||||
m1.partition().compact_for_compaction(*s, never_gc, now);
|
||||
m2.partition().compact_for_compaction(*s, never_gc, now);
|
||||
auto m12 = m1;
|
||||
m12.apply(m2);
|
||||
auto m12_with_diff = m1;
|
||||
|
||||
@@ -166,7 +166,7 @@ SEASTAR_TEST_CASE(test_multishard_writer_producer_aborts) {
|
||||
|
||||
namespace {
|
||||
|
||||
class bucket_writer {
|
||||
class test_bucket_writer {
|
||||
schema_ptr _schema;
|
||||
classify_by_timestamp _classify;
|
||||
std::unordered_map<int64_t, std::vector<mutation>>& _buckets;
|
||||
@@ -175,6 +175,17 @@ class bucket_writer {
|
||||
mutation_opt _current_mutation;
|
||||
bool _is_first_mutation = true;
|
||||
|
||||
size_t _throw_after;
|
||||
size_t _mutation_consumed = 0;
|
||||
|
||||
public:
|
||||
class expected_exception : public std::exception {
|
||||
public:
|
||||
virtual const char* what() const noexcept override {
|
||||
return "expected_exception";
|
||||
}
|
||||
};
|
||||
|
||||
private:
|
||||
void check_timestamp(api::timestamp_type ts) {
|
||||
const auto bucket_id = _classify(ts);
|
||||
@@ -223,40 +234,53 @@ private:
|
||||
check_timestamp(rt.tomb.timestamp);
|
||||
}
|
||||
|
||||
void maybe_throw() {
|
||||
if (_mutation_consumed++ >= _throw_after) {
|
||||
throw(expected_exception());
|
||||
}
|
||||
}
|
||||
|
||||
public:
|
||||
bucket_writer(schema_ptr schema, classify_by_timestamp classify, std::unordered_map<int64_t, std::vector<mutation>>& buckets)
|
||||
test_bucket_writer(schema_ptr schema, classify_by_timestamp classify, std::unordered_map<int64_t, std::vector<mutation>>& buckets, size_t throw_after = std::numeric_limits<size_t>::max())
|
||||
: _schema(std::move(schema))
|
||||
, _classify(std::move(classify))
|
||||
, _buckets(buckets) {
|
||||
}
|
||||
, _buckets(buckets)
|
||||
, _throw_after(throw_after)
|
||||
{ }
|
||||
void consume_new_partition(const dht::decorated_key& dk) {
|
||||
maybe_throw();
|
||||
BOOST_REQUIRE(!_current_mutation);
|
||||
_current_mutation = mutation(_schema, dk);
|
||||
}
|
||||
void consume(tombstone partition_tombstone) {
|
||||
maybe_throw();
|
||||
BOOST_REQUIRE(_current_mutation);
|
||||
verify_partition_tombstone(partition_tombstone);
|
||||
_current_mutation->partition().apply(partition_tombstone);
|
||||
}
|
||||
stop_iteration consume(static_row&& sr) {
|
||||
maybe_throw();
|
||||
BOOST_REQUIRE(_current_mutation);
|
||||
verify_static_row(sr);
|
||||
_current_mutation->apply(mutation_fragment(*_schema, tests::make_permit(), std::move(sr)));
|
||||
return stop_iteration::no;
|
||||
}
|
||||
stop_iteration consume(clustering_row&& cr) {
|
||||
maybe_throw();
|
||||
BOOST_REQUIRE(_current_mutation);
|
||||
verify_clustering_row(cr);
|
||||
_current_mutation->apply(mutation_fragment(*_schema, tests::make_permit(), std::move(cr)));
|
||||
return stop_iteration::no;
|
||||
}
|
||||
stop_iteration consume(range_tombstone&& rt) {
|
||||
maybe_throw();
|
||||
BOOST_REQUIRE(_current_mutation);
|
||||
verify_range_tombstone(rt);
|
||||
_current_mutation->apply(mutation_fragment(*_schema, tests::make_permit(), std::move(rt)));
|
||||
return stop_iteration::no;
|
||||
}
|
||||
stop_iteration consume_end_of_partition() {
|
||||
maybe_throw();
|
||||
BOOST_REQUIRE(_current_mutation);
|
||||
BOOST_REQUIRE(_bucket_id);
|
||||
auto& bucket = _buckets[*_bucket_id];
|
||||
@@ -311,7 +335,7 @@ SEASTAR_THREAD_TEST_CASE(test_timestamp_based_splitting_mutation_writer) {
|
||||
|
||||
auto consumer = [&] (flat_mutation_reader bucket_reader) {
|
||||
return do_with(std::move(bucket_reader), [&] (flat_mutation_reader& rd) {
|
||||
return rd.consume(bucket_writer(random_schema.schema(), classify_fn, buckets), db::no_timeout);
|
||||
return rd.consume(test_bucket_writer(random_schema.schema(), classify_fn, buckets), db::no_timeout);
|
||||
});
|
||||
};
|
||||
|
||||
@@ -342,3 +366,53 @@ SEASTAR_THREAD_TEST_CASE(test_timestamp_based_splitting_mutation_writer) {
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// Exercises segregate_by_timestamp() with a consumer that is told to fail:
// each bucket is consumed by a test_bucket_writer constructed with a random
// throw_after threshold. The test tolerates exactly two exception types
// escaping the segregation: the writer's own expected_exception and
// seastar::broken_promise.
SEASTAR_THREAD_TEST_CASE(test_timestamp_based_splitting_mutation_writer_abort) {
    using size_dist = std::uniform_int_distribution<size_t>;

    auto spec = tests::make_random_schema_specification(
            get_name(),
            size_dist(1, 4),
            size_dist(2, 4),
            size_dist(2, 8),
            size_dist(2, 8));
    auto schema_gen = tests::random_schema{tests::random::get_int<uint32_t>(), *spec};

    testlog.info("Random schema:\n{}", schema_gen.cql());

    // Timestamp generator: for tombstone and row-marker destinations, return
    // api::missing_timestamp whenever get_int(0, 10, engine) is non-zero;
    // in every other case defer to the default generator.
    auto timestamp_gen = [&, fallback = tests::default_timestamp_generator()] (std::mt19937& engine,
            tests::timestamp_destination dest, api::timestamp_type min_timestamp) -> api::timestamp_type {
        switch (dest) {
        case tests::timestamp_destination::partition_tombstone:
        case tests::timestamp_destination::row_marker:
        case tests::timestamp_destination::row_tombstone:
        case tests::timestamp_destination::collection_tombstone:
            if (tests::random::get_int<int>(0, 10, engine)) {
                return api::missing_timestamp;
            }
            break;
        default:
            break;
        }
        return fallback(engine, dest, min_timestamp);
    };

    auto muts = tests::generate_random_mutations(schema_gen, timestamp_gen).get0();

    // Two buckets: timestamps are classified by ts % 2.
    auto bucket_of = [] (api::timestamp_type ts) {
        return int64_t(ts % 2);
    };

    std::unordered_map<int64_t, std::vector<mutation>> buckets;

    int throw_after = tests::random::get_int(muts.size() - 1);
    testlog.info("Will raise exception after {}/{} mutations", throw_after, muts.size());

    auto failing_consumer = [&] (flat_mutation_reader bucket_reader) {
        return do_with(std::move(bucket_reader), [&] (flat_mutation_reader& rd) {
            auto writer = test_bucket_writer(schema_gen.schema(), bucket_of, buckets, throw_after);
            return rd.consume(std::move(writer), db::no_timeout);
        });
    };

    try {
        auto source = flat_mutation_reader_from_mutations(tests::make_permit(), muts);
        segregate_by_timestamp(std::move(source), bucket_of, std::move(failing_consumer)).get();
    } catch (const test_bucket_writer::expected_exception&) {
        BOOST_TEST_PASSPOINT();
    } catch (const seastar::broken_promise&) {
        // Tolerated until we properly abort readers
        BOOST_TEST_PASSPOINT();
    }
}
|
||||
|
||||
@@ -712,7 +712,10 @@ SEASTAR_THREAD_TEST_CASE(test_resources_based_cache_eviction) {
|
||||
nullptr,
|
||||
db::no_timeout).get();
|
||||
|
||||
BOOST_CHECK_EQUAL(db.get_querier_cache_stats().resource_based_evictions, 1);
|
||||
// The second read might be evicted too if it consumes more
|
||||
// memory than the first and hence triggers memory control when
|
||||
// saved in the querier cache.
|
||||
BOOST_CHECK_GE(db.get_querier_cache_stats().resource_based_evictions, 1);
|
||||
|
||||
// We want to read the entire partition so that the querier
|
||||
// is not saved at the end and thus ensure it is destroyed.
|
||||
|
||||
@@ -674,6 +674,18 @@ SEASTAR_THREAD_TEST_CASE(scalar_in) {
|
||||
require_rows(e, stmt, {}, {LF({24.f, 25.f})}, {{F(24)}, {F(24)}, {F(25)}});
|
||||
require_rows(e, stmt, {}, {LF({25.f, data_value::make_null(float_type)})}, {{F(25)}});
|
||||
require_rows(e, stmt, {}, {LF({99.f, data_value::make_null(float_type)})}, {});
|
||||
|
||||
const auto in_null = [&] (const char* column) {
|
||||
return e.execute_prepared(
|
||||
e.prepare(format("select * from t where {} in ? allow filtering", column)).get0(),
|
||||
{cql3::raw_value::make_null()})
|
||||
.get();
|
||||
};
|
||||
using ire = exceptions::invalid_request_exception;
|
||||
using exception_predicate::message_contains;
|
||||
BOOST_REQUIRE_EXCEPTION(in_null("p"), ire, message_contains("null value"));
|
||||
BOOST_REQUIRE_EXCEPTION(in_null("c"), ire, message_contains("null value"));
|
||||
BOOST_REQUIRE_EXCEPTION(in_null("r"), ire, message_contains("null value"));
|
||||
}).get();
|
||||
}
|
||||
|
||||
@@ -778,6 +790,10 @@ SEASTAR_THREAD_TEST_CASE(multi_col_in) {
|
||||
require_rows(e, stmt, {}, {bound_tuples({{13, 13}, {12, 22}})}, {{I(12), F(22)}});
|
||||
require_rows(e, stmt, {}, {bound_tuples({{12, 21}})}, {});
|
||||
require_rows(e, stmt, {}, {bound_tuples({{12, 21}, {12, 21}, {13, 21}, {14, 21}})}, {});
|
||||
BOOST_REQUIRE_EXCEPTION(
|
||||
e.execute_prepared(stmt, {cql3::raw_value::make_null()}).get(),
|
||||
exceptions::invalid_request_exception,
|
||||
exception_predicate::message_equals("Invalid null value for IN restriction"));
|
||||
stmt = e.prepare("select ck1 from t where (ck1,ck2) in (?) allow filtering").get0();
|
||||
auto tpl = [] (int32_t e1, float e2) {
|
||||
return make_tuple({int32_type, float_type}, {e1, e2});
|
||||
|
||||
@@ -910,8 +910,20 @@ SEASTAR_TEST_CASE(test_eviction_from_invalidated) {
|
||||
|
||||
std::vector<sstring> tmp;
|
||||
auto alloc_size = logalloc::segment_size * 10;
|
||||
while (tracker.region().occupancy().total_space() > alloc_size) {
|
||||
tmp.push_back(uninitialized_string(alloc_size));
|
||||
/*
|
||||
* Now allocate huge chunks on the region until it gives up
|
||||
* with bad_alloc. At that point the region must not have more
|
||||
* memory than the chunk size, nor may it contain rows
|
||||
* or partitions (except for dummy entries)
|
||||
*/
|
||||
try {
|
||||
while (true) {
|
||||
tmp.push_back(uninitialized_string(alloc_size));
|
||||
}
|
||||
} catch (const std::bad_alloc&) {
|
||||
BOOST_REQUIRE(tracker.region().occupancy().total_space() < alloc_size);
|
||||
BOOST_REQUIRE(tracker.get_stats().partitions == 0);
|
||||
BOOST_REQUIRE(tracker.get_stats().rows == 0);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
@@ -6685,3 +6685,135 @@ SEASTAR_TEST_CASE(test_zero_estimated_partitions) {
|
||||
return make_ready_future<>();
|
||||
});
|
||||
}
|
||||
|
||||
// Builds 100 tables using the time-window compaction strategy, each holding a
// single sstable with one expiring cell, jumps the clocks forward by 100
// hours and triggers compaction on every table. While polling until all
// sstables are gone, it records the highest number of active compaction tasks
// and finally requires that this peak is exactly 1 and that no compaction
// errors were counted.
SEASTAR_TEST_CASE(max_ongoing_compaction_test) {
    return test_env::do_with_async([] (test_env& env) {
        BOOST_REQUIRE(smp::count == 1);

        // Schema factory; the table name is the stringified index.
        auto make_schema = [] (auto idx) {
            auto sb = schema_builder("tests", std::to_string(idx));
            sb.with_column("id", utf8_type, column_kind::partition_key);
            sb.with_column("cl", int32_type, column_kind::clustering_key);
            sb.with_column("value", int32_type);
            sb.set_compaction_strategy(sstables::compaction_strategy_type::time_window);

            std::map<sstring, sstring> twcs_options = {
                {time_window_compaction_strategy_options::COMPACTION_WINDOW_UNIT_KEY, "HOURS"},
                {time_window_compaction_strategy_options::COMPACTION_WINDOW_SIZE_KEY, "1"},
                {time_window_compaction_strategy_options::EXPIRED_SSTABLE_CHECK_FREQUENCY_SECONDS_KEY, "0"},
            };
            sb.set_compaction_strategy_options(std::move(twcs_options));
            sb.set_gc_grace_seconds(0);
            return sb.build();
        };

        auto compaction_mgr = make_lw_shared<compaction_manager>();
        compaction_mgr->enable();
        auto stop_compaction_mgr = defer([&compaction_mgr] {
            compaction_mgr->stop().get();
        });

        auto data_root = tmpdir();
        auto locker_stats = make_lw_shared<cell_locker_stats>();
        auto row_cache_tracker = make_lw_shared<cache_tracker>();
        auto shard_tokens = token_generation_for_shard(1, this_shard_id(), test_db_config.murmur3_partitioner_ignore_msb_bits(), smp::count);

        // Write timestamp lying `age` in the past relative to gc_clock::now().
        auto timestamp_before_now = [] (auto age) {
            const auto since_epoch = gc_clock::now().time_since_epoch();
            return (since_epoch - std::chrono::duration_cast<std::chrono::microseconds>(age)).count();
        };

        auto make_expiring_cell = [&] (schema_ptr s, std::chrono::hours step) {
            static thread_local int32_t next_value = 1;

            auto key_text = shard_tokens[0].first;
            auto pk = partition_key::from_exploded(*s, {to_bytes(key_text)});

            mutation mut(s, pk);
            // The clustering key uses the counter before the increment, the cell
            // value uses it after the increment.
            auto ck = clustering_key::from_exploded(*s, {int32_type->decompose(next_value++)});
            mut.set_clustered_cell(ck, bytes("value"), data_value(int32_t(next_value)), timestamp_before_now(step), gc_clock::duration(step + 5s));
            return mut;
        };

        auto make_table_with_single_fully_expired_sstable = [&] (auto idx) {
            auto table_schema = make_schema(idx);

            column_family::config table_cfg = column_family_test_config(env.manager());
            table_cfg.datadir = data_root.path().string() + "/" + std::to_string(idx);
            touch_directory(table_cfg.datadir).get();
            table_cfg.enable_commitlog = false;
            table_cfg.enable_incremental_backups = false;

            auto new_sstable = [&env, table_schema, dir = table_cfg.datadir, gen = make_lw_shared<unsigned>(1)] () mutable {
                return env.make_sstable(table_schema, dir, (*gen)++, sstables::sstable::version_types::md, big);
            };

            auto table = make_lw_shared<column_family>(table_schema, table_cfg, column_family::no_commitlog(), *compaction_mgr, *locker_stats, *row_cache_tracker);
            table->start();
            table->mark_ready_for_writes();

            auto muts = { make_expiring_cell(table_schema, std::chrono::hours(1)) };
            auto sst = make_sstable_containing(new_sstable, muts);
            column_family_test(table).add_sstable(sst);
            return table;
        };

        std::vector<lw_shared_ptr<column_family>> tables;
        auto stop_tables = defer([&tables] {
            for (auto& table : tables) {
                table->stop().get();
            }
        });
        for (auto idx = 0; idx < 100; idx++) {
            tables.push_back(make_table_with_single_fully_expired_sstable(idx));
        }

        // Make sure everything is expired
        forward_jump_clocks(std::chrono::hours(100));

        for (auto& table : tables) {
            BOOST_REQUIRE(table->sstables_count() == 1);
            table->trigger_compaction();
        }

        BOOST_REQUIRE(compaction_mgr->get_stats().pending_tasks >= 1 || compaction_mgr->get_stats().active_tasks >= 1);

        size_t peak_active_tasks = 0;

        // Done once the manager reports no pending/active tasks and no table
        // has any sstable left.
        auto all_compacted = [compaction_mgr, &tables] {
            if (compaction_mgr->get_stats().pending_tasks != 0) {
                return false;
            }
            if (compaction_mgr->get_stats().active_tasks != 0) {
                return false;
            }
            return boost::algorithm::all_of(tables, [] (auto& table) { return table->sstables_count() == 0; });
        };

        // wait for submitted jobs to finish.
        while (!all_compacted()) {
            const bool manager_idle = !compaction_mgr->get_stats().pending_tasks && !compaction_mgr->get_stats().active_tasks;
            if (manager_idle) {
                // Nothing queued or running, yet sstables remain: trigger again.
                for (auto& table : tables) {
                    if (table->sstables_count()) {
                        table->trigger_compaction();
                    }
                }
            }
            peak_active_tasks = std::max(compaction_mgr->get_stats().active_tasks, peak_active_tasks);
            later().get();
        }
        BOOST_REQUIRE(compaction_mgr->get_stats().errors == 0);
        BOOST_REQUIRE(peak_active_tasks == 1);
    });
}
|
||||
|
||||
// Creates max_compaction_threshold() sstables whose data file size is set to 1
// and requires that the size-tiered strategy returns a non-empty reshaping
// job for them in both strict and relaxed reshape modes.
SEASTAR_TEST_CASE(stcs_reshape_test) {
    return test_env::do_with_async([] (test_env& env) {
        simple_schema ss;
        auto s = ss.schema();
        const auto threshold = s->max_compaction_threshold();

        std::vector<shared_sstable> candidates;
        candidates.reserve(threshold);
        for (auto gen = 1; gen <= threshold; gen++) {
            auto sst = env.make_sstable(s, "", gen, la, big);
            sstables::test(sst).set_data_file_size(1);
            candidates.push_back(std::move(sst));
        }

        auto strategy = sstables::make_compaction_strategy(sstables::compaction_strategy_type::size_tiered,
                                                           s->compaction_strategy_options());

        for (const auto mode : {reshape_mode::strict, reshape_mode::relaxed}) {
            BOOST_REQUIRE(strategy.get_reshaping_job(candidates, s, default_priority_class(), mode).sstables.size());
        }
    });
}
|
||||
|
||||
186
test/cql-pytest/test_secondary_index.py
Normal file
186
test/cql-pytest/test_secondary_index.py
Normal file
@@ -0,0 +1,186 @@
|
||||
# Copyright 2020 ScyllaDB
|
||||
#
|
||||
# This file is part of Scylla.
|
||||
#
|
||||
# Scylla is free software: you can redistribute it and/or modify
|
||||
# it under the terms of the GNU Affero General Public License as published by
|
||||
# the Free Software Foundation, either version 3 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# Scylla is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU Affero General Public License
|
||||
# along with Scylla. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
# Tests for secondary indexes
|
||||
|
||||
import time
|
||||
import pytest
|
||||
from cassandra.protocol import SyntaxException, AlreadyExists, InvalidRequest, ConfigurationException, ReadFailure
|
||||
|
||||
from util import new_test_table, unique_name
|
||||
|
||||
# A reproducer for issue #7443: Normally, when the entire table is SELECTed,
|
||||
# the partitions are returned sorted by the partitions' token. When there
|
||||
# is filtering, this order is not expected to change. Furthermore, when this
|
||||
# filtering happens to use a secondary index, again the order is not expected
|
||||
# to change.
|
||||
def test_partition_order_with_si(cql, test_keyspace):
    """Check that partition order (by token) is the same for an unfiltered
    scan, a filtered scan, and a scan served through a secondary index.
    """
    schema = 'pk int, x int, PRIMARY KEY ((pk))'
    with new_test_table(cql, test_keyspace, schema) as table:
        # Insert 20 partitions, all of them with x=1 so that filtering by x=1
        # will yield the same 20 partitions:
        N = 20
        stmt = cql.prepare('INSERT INTO '+table+' (pk, x) VALUES (?, ?)')
        for i in range(N):
            cql.execute(stmt, [i, 1])
        # SELECT all the rows, and verify they are returned in increasing
        # partition token order (note that the token is a *signed* number):
        tokens = [row.system_token_pk for row in cql.execute('SELECT token(pk) FROM '+table)]
        assert len(tokens) == N
        assert sorted(tokens) == tokens
        # Now select all the partitions with filtering of x=1. Since all
        # rows have x=1, this shouldn't change the list of matching rows, and
        # also shouldn't change their order:
        tokens1 = [row.system_token_pk for row in cql.execute('SELECT token(pk) FROM '+table+' WHERE x=1 ALLOW FILTERING')]
        assert tokens1 == tokens
        # Now add an index on x, which allows implementing the "x=1"
        # restriction differently. With the index, "ALLOW FILTERING" is
        # no longer necessary. But the order of the results should
        # still not change. Issue #7443 is about the order changing here.
        cql.execute('CREATE INDEX ON '+table+'(x)')
        # "CREATE INDEX" does not wait until the index is actually available
        # for use. Reads immediately after the CREATE INDEX may fail or return
        # partial results. So let's retry until reads resume working.
        # tokens2 is pre-initialized so that, if every attempt raises
        # ReadFailure, the final assert fails as a normal test failure
        # instead of a NameError on an unbound variable.
        tokens2 = None
        for i in range(100):
            try:
                tokens2 = [row.system_token_pk for row in cql.execute('SELECT token(pk) FROM '+table+' WHERE x=1')]
                if len(tokens2) == N:
                    break
            except ReadFailure:
                pass
            time.sleep(0.1)
        assert tokens2 == tokens
|
||||
|
||||
# Test which ensures that indexes for a query are picked by the order in which
|
||||
# they appear in restrictions. That way, users can deterministically pick
|
||||
# which indexes are used for which queries.
|
||||
# Note that the order of picking indexing is not set in stone and may be
|
||||
# subject to change - in which case this test case should be amended as well.
|
||||
# The order tested in this case was decided as a good first step in issue
|
||||
# #7969, but it's possible that it will eventually be implemented another
|
||||
# way, e.g. dynamically based on estimated query selectivity statistics.
|
||||
# Ref: #7969
|
||||
@pytest.mark.xfail(reason="The order of picking indexes is currently arbitrary. Issue #7969")
def test_order_of_indexes(scylla_only, cql, test_keyspace):
    """For each query below, assert that the name of the expected index shows
    up in the description of at least one event of the query's trace.
    """
    schema = 'p int primary key, v1 int, v2 int, v3 int'
    with new_test_table(cql, test_keyspace, schema) as table:
        # Two global indexes (v3, v1) and one local index ((p), v2).
        for ddl in (
            f"CREATE INDEX my_v3_idx ON {table}(v3)",
            f"CREATE INDEX my_v1_idx ON {table}(v1)",
            f"CREATE INDEX my_v2_idx ON {table}((p),v2)",
        ):
            cql.execute(ddl)

        def index_used(query, index_name):
            trace = cql.execute(query, trace=True).get_query_trace()
            assert any(index_name in event.description for event in trace.events)

        # All queries below should use the first index they find in the list
        # of restrictions. Currently some of the cases succeed, because the
        # order is not well defined (and may, for instance, change upon
        # server restart), but some of them fail. Once a proper ordering
        # is implemented, all cases should succeed.
        expectations = [
            (f"SELECT * FROM {table} WHERE v3 = 1", "my_v3_idx"),
            (f"SELECT * FROM {table} WHERE v3 = 1 and v1 = 2 allow filtering", "my_v3_idx"),
            (f"SELECT * FROM {table} WHERE p = 1 and v1 = 1 and v3 = 2 allow filtering", "my_v1_idx"),
            (f"SELECT * FROM {table} WHERE p = 1 and v3 = 1 and v1 = 2 allow filtering", "my_v3_idx"),
            # Local indexes are still skipped if they cannot be used
            (f"SELECT * FROM {table} WHERE v2 = 1 and v1 = 2 allow filtering", "my_v1_idx"),
            (f"SELECT * FROM {table} WHERE v2 = 1 and v3 = 2 and v1 = 3 allow filtering", "my_v3_idx"),
            (f"SELECT * FROM {table} WHERE v1 = 1 and v2 = 2 and v3 = 3 allow filtering", "my_v1_idx"),
            # Local indexes are still preferred over global ones, if they can be used
            (f"SELECT * FROM {table} WHERE p = 1 and v1 = 1 and v3 = 2 and v2 = 2 allow filtering", "my_v2_idx"),
            (f"SELECT * FROM {table} WHERE p = 1 and v2 = 1 and v1 = 2 allow filtering", "my_v2_idx"),
        ]
        for query, index_name in expectations:
            index_used(query, index_name)
|
||||
|
||||
# Indexes can be created without an explicit name, in which case a default name is chosen.
|
||||
# However, due to #8620 it was possible to break the index creation mechanism by creating
|
||||
# a properly named regular table, which conflicts with the generated index name.
|
||||
def test_create_unnamed_index_when_its_name_is_taken(cql, test_keyspace):
    """Create a regular table named <table>_v_idx_index, then check that an
    unnamed CREATE INDEX on column v of the base table still succeeds.
    """
    schema = 'p int primary key, v int'
    with new_test_table(cql, test_keyspace, schema) as table:
        # Regular table occupying the name the unnamed index would use by default.
        squatter = f"{table}_v_idx_index"
        try:
            cql.execute(f"CREATE TABLE {squatter} (i_do_not_exist_in_the_base_table int primary key)")
            # Must succeed even though the default name is already taken.
            cql.execute(f"CREATE INDEX ON {table}(v)")
        finally:
            cql.execute(f"DROP TABLE {squatter}")
|
||||
|
||||
# Indexes created with an explicit name cause a materialized view to be created,
|
||||
# and this view has a specific name - <index-name>_index. If there happens to be
|
||||
# a regular table (or another view) named just like that, index creation should fail.
|
||||
def test_create_named_index_when_its_name_is_taken(scylla_only, cql, test_keyspace):
    """Create a regular table named <index-name>_index, then check that
    creating an index with that name is rejected with "already exists".
    """
    schema = 'p int primary key, v int'
    with new_test_table(cql, test_keyspace, schema) as table:
        index_name = unique_name()
        # Regular table occupying the name of the index's backing view.
        squatter = f"{test_keyspace}.{index_name}_index"
        try:
            cql.execute(f"CREATE TABLE {squatter} (i_do_not_exist_in_the_base_table int primary key)")
            # The underlying materialized view cannot be created because its
            # name is taken by the regular table, so index creation must fail.
            with pytest.raises(InvalidRequest, match="already exists"):
                cql.execute(f"CREATE INDEX {index_name} ON {table}(v)")
        finally:
            cql.execute(f"DROP TABLE {squatter}")
|
||||
|
||||
# Tests for CREATE INDEX IF NOT EXISTS
|
||||
# Reproduces issue #8717.
|
||||
def test_create_index_if_not_exists(cql, test_keyspace):
    """Check CREATE INDEX IF NOT EXISTS for unnamed, named and quoted
    mixed-case index names, and for a second name on an already-indexed column.
    """
    with new_test_table(cql, test_keyspace, 'p int primary key, v int') as table:
        def run(statement):
            cql.execute(statement)

        def run_rejected(statement, **expectation):
            # The statement must raise InvalidRequest; `match`, when given,
            # is forwarded to pytest.raises.
            with pytest.raises(InvalidRequest, **expectation):
                cql.execute(statement)

        # Unnamed index: can't create the same index again without
        # "IF NOT EXISTS", but can do it with "IF NOT EXISTS".
        default_name = f"{table.split('.')[1]}_v_idx"
        run(f"CREATE INDEX ON {table}(v)")
        run_rejected(f"CREATE INDEX ON {table}(v)", match="duplicate")
        run(f"CREATE INDEX IF NOT EXISTS ON {table}(v)")
        run(f"DROP INDEX {test_keyspace}.{default_name}")

        # The same thing for named indexes (this is what broke in #8717),
        # first with a lower-case name and then with a non-lower-case one.
        for name in ('xyz', '"CamelCase"'):
            run(f"CREATE INDEX {name} ON {table}(v)")
            run_rejected(f"CREATE INDEX {name} ON {table}(v)", match="already exists")
            run(f"CREATE INDEX IF NOT EXISTS {name} ON {table}(v)")
            run(f"DROP INDEX {test_keyspace}.{name}")

        # Creating an index for an attribute that's already indexed, but
        # with a different name. The "IF NOT EXISTS" appears to succeed
        # in this case, but does not actually create the new index name -
        # only the old one remains.
        run(f"CREATE INDEX xyz ON {table}(v)")
        run_rejected(f"CREATE INDEX abc ON {table}(v)", match="duplicate")
        run(f"CREATE INDEX IF NOT EXISTS abc ON {table}(v)")
        run_rejected(f"DROP INDEX {test_keyspace}.abc")
        run(f"DROP INDEX {test_keyspace}.xyz")
|
||||
|
||||
# Test that the paging state works properly for indexes on tables
|
||||
# with descending clustering order. There was a problem with indexes
|
||||
# created on clustering keys with DESC clustering order - they are represented
|
||||
# as "reverse" types internally and Scylla assertions failed that the base type
|
||||
# is different from the underlying view type, even though, from the perspective
|
||||
# of deserialization, they're equal. Issue #8666
|
||||
def test_paging_with_desc_clustering_order(cql, test_keyspace):
    """Insert three rows with c = 42 into a table clustered by c DESC, index
    column c, and check that a paged query (fetch_size=1) on c = 42 returns
    all three rows.
    """
    # SimpleStatement is not among the names imported at the top of this
    # file, so it is imported here; without it the test fails with NameError
    # before the query is ever sent.
    from cassandra.query import SimpleStatement

    schema = 'p int, c int, primary key (p,c)'
    extra = 'with clustering order by (c desc)'
    with new_test_table(cql, test_keyspace, schema, extra) as table:
        cql.execute(f"CREATE INDEX ON {table}(c)")
        for i in range(3):
            cql.execute(f"INSERT INTO {table}(p,c) VALUES ({i}, 42)")
        # fetch_size=1 forces one page per row, exercising the paging state.
        stmt = SimpleStatement(f"SELECT * FROM {table} WHERE c = 42", fetch_size=1)
        assert len([row for row in cql.execute(stmt)]) == 3
|
||||
@@ -1396,9 +1396,18 @@ private:
|
||||
return { };
|
||||
}
|
||||
}
|
||||
static void validate_key(const schema& s, const clustering_key& ck, bytes_view v) {
|
||||
auto ck_size = ck.size(s);
|
||||
if (ck_size > s.clustering_key_size()) {
|
||||
throw std::runtime_error(format("Cell name of {}.{} has too many components, expected {} but got {} in 0x{}",
|
||||
s.ks_name(), s.cf_name(), s.clustering_key_size(), ck_size, to_hex(v)));
|
||||
}
|
||||
}
|
||||
static clustering_key_prefix make_clustering_prefix(const schema& s, bytes_view v) {
|
||||
auto composite = composite_view(v, s.thrift().has_compound_comparator());
|
||||
return clustering_key_prefix::from_exploded(composite.values());
|
||||
auto ck = clustering_key_prefix::from_exploded(composite.values());
|
||||
validate_key(s, ck, v);
|
||||
return ck;
|
||||
}
|
||||
static query::clustering_range::bound make_clustering_bound(const schema& s, bytes_view v, composite::eoc exclusiveness_marker) {
|
||||
auto composite = composite_view(v, s.thrift().has_compound_comparator());
|
||||
@@ -1407,6 +1416,7 @@ private:
|
||||
last = c.second;
|
||||
return c.first;
|
||||
}));
|
||||
validate_key(s, ck, v);
|
||||
return query::clustering_range::bound(std::move(ck), last != exclusiveness_marker);
|
||||
}
|
||||
static range<clustering_key_prefix> make_clustering_range(const schema& s, const std::string& start, const std::string& end) {
|
||||
|
||||
Submodule tools/java updated: f2e8666d7e...86fb5c826d
Submodule tools/jmx updated: 47b355ec66...5fcba137c0
@@ -616,16 +616,23 @@ future<> cql_server::connection::process_request() {
|
||||
auto op = f.opcode;
|
||||
auto stream = f.stream;
|
||||
auto mem_estimate = f.length * 2 + 8000; // Allow for extra copies and bookkeeping
|
||||
|
||||
if (mem_estimate > _server._max_request_size) {
|
||||
return make_exception_future<>(exceptions::invalid_request_exception(format("request size too large (frame size {:d}; estimate {:d}; allowed {:d}",
|
||||
f.length, mem_estimate, _server._max_request_size)));
|
||||
return _read_buf.skip(f.length).then([length = f.length, stream = f.stream, mem_estimate, this] () {
|
||||
write_response(make_error(stream, exceptions::exception_code::INVALID,
|
||||
format("request size too large (frame size {:d}; estimate {:d}; allowed {:d}", length, mem_estimate, _server._max_request_size),
|
||||
tracing::trace_state_ptr()));
|
||||
return make_ready_future<>();
|
||||
});
|
||||
}
|
||||
|
||||
if (_server._requests_serving > _server._max_concurrent_requests) {
|
||||
++_server._requests_shed;
|
||||
return make_exception_future<>(
|
||||
exceptions::overloaded_exception(format("too many in-flight requests (configured via max_concurrent_requests_per_shard): {}", _server._requests_serving)));
|
||||
return _read_buf.skip(f.length).then([this, stream = f.stream] {
|
||||
write_response(make_error(stream, exceptions::exception_code::OVERLOADED,
|
||||
format("too many in-flight requests (configured via max_concurrent_requests_per_shard): {}", _server._requests_serving),
|
||||
tracing::trace_state_ptr()));
|
||||
return make_ready_future<>();
|
||||
});
|
||||
}
|
||||
|
||||
auto fut = get_units(_server._memory_available, mem_estimate);
|
||||
|
||||
3
types.hh
3
types.hh
@@ -588,6 +588,9 @@ public:
|
||||
cql3::cql3_type as_cql3_type() const;
|
||||
const sstring& cql3_type_name() const;
|
||||
virtual shared_ptr<const abstract_type> freeze() const { return shared_from_this(); }
|
||||
// Returns this type with any reversal stripped: when is_reversed() is true
// the result is the type referenced by underlying_type(), otherwise it is
// *this. The result is a reference, not a new object.
const abstract_type& without_reversed() const {
|
||||
return is_reversed() ? *underlying_type() : *this;
|
||||
}
|
||||
friend class list_type_impl;
|
||||
private:
|
||||
mutable sstring _cql3_type_name;
|
||||
|
||||
@@ -173,6 +173,10 @@ public:
|
||||
return res;
|
||||
}
|
||||
|
||||
// Returns _e.use_count() when _e is engaged (converts to true), and 0 when
// it is not. Declared noexcept.
long use_count() const noexcept {
|
||||
return _e ? _e.use_count() : 0;
|
||||
}
|
||||
|
||||
friend class loading_shared_values;
|
||||
};
|
||||
|
||||
|
||||
Reference in New Issue
Block a user