Compare commits
120 Commits
next
...
scylla-4.6
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
7c79c513d1 | ||
|
|
9a8e73f0c3 | ||
|
|
fac0443200 | ||
|
|
6bcfef2cfa | ||
|
|
d2c67a2429 | ||
|
|
d6c2f228e7 | ||
|
|
a1b1df2074 | ||
|
|
14e13ecbd4 | ||
|
|
b8740bde6e | ||
|
|
1b23f8d038 | ||
|
|
05a228e4c5 | ||
|
|
2ec293ab0e | ||
|
|
b60f14601e | ||
|
|
284dd21ef7 | ||
|
|
8b52f1d6e7 | ||
|
|
157951f756 | ||
|
|
4f643ed4a5 | ||
|
|
b598629b7f | ||
|
|
43f82047b9 | ||
|
|
ec3c07de6e | ||
|
|
82572e8cfe | ||
|
|
2b9ed79c6f | ||
|
|
ab0b6fd372 | ||
|
|
12f1718ef4 | ||
|
|
322dfe8403 | ||
|
|
11f008e8fd | ||
|
|
fd7314a362 | ||
|
|
d27468f078 | ||
|
|
74ef1ee961 | ||
|
|
07549d159c | ||
|
|
189bbcd82d | ||
|
|
70e6921125 | ||
|
|
e314158708 | ||
|
|
46586532c9 | ||
|
|
0114244363 | ||
|
|
f154c8b719 | ||
|
|
8bf149fdd6 | ||
|
|
0265d56173 | ||
|
|
e50452ba43 | ||
|
|
a205f644cb | ||
|
|
f136b5b950 | ||
|
|
69a1325884 | ||
|
|
ab153c9b94 | ||
|
|
eb372d7f03 | ||
|
|
e232711e7e | ||
|
|
0a440b6d4a | ||
|
|
00bb1e8145 | ||
|
|
e30dbee2db | ||
|
|
2309d6b51e | ||
|
|
b77ca07709 | ||
|
|
bb0a38f889 | ||
|
|
c48fd03463 | ||
|
|
eb78e6d4b8 | ||
|
|
4b1b0a55c0 | ||
|
|
172a8628d5 | ||
|
|
5688b125e6 | ||
|
|
6da4acb41e | ||
|
|
f09cc9a01d | ||
|
|
cd2e33ede4 | ||
|
|
32d0698d78 | ||
|
|
93cf43ae4b | ||
|
|
2f2d22a864 | ||
|
|
5f92f54f06 | ||
|
|
395f2459b4 | ||
|
|
019d50bb5c | ||
|
|
bbe775b926 | ||
|
|
469c94ea17 | ||
|
|
4c780d0265 | ||
|
|
0181de1f2c | ||
|
|
7597a79ef9 | ||
|
|
8f5148e921 | ||
|
|
5694ec189f | ||
|
|
34d470967a | ||
|
|
61db571a44 | ||
|
|
5b5a300a9e | ||
|
|
148a65d0d6 | ||
|
|
e3ad14d55f | ||
|
|
2b506c2d4a | ||
|
|
50aad1c668 | ||
|
|
7bf3f37cd1 | ||
|
|
0f7f8585f2 | ||
|
|
2c65c4a569 | ||
|
|
f85cd289bc | ||
|
|
5e661af9a4 | ||
|
|
5629b67d25 | ||
|
|
ad632cf7fc | ||
|
|
ca24bebcf2 | ||
|
|
7dc5abb6f8 | ||
|
|
e8a1cfb6f8 | ||
|
|
fc312b3021 | ||
|
|
7b82aaf939 | ||
|
|
894a4abfae | ||
|
|
4dcf023470 | ||
|
|
283788828e | ||
|
|
730a147ba6 | ||
|
|
9897e83029 | ||
|
|
1a9b64e6f6 | ||
|
|
49fe9e2c8e | ||
|
|
d0580c41ee | ||
|
|
542394c82f | ||
|
|
018ad3f6f4 | ||
|
|
9b8b7efb54 | ||
|
|
1c3e63975f | ||
|
|
11bb03e46d | ||
|
|
810e410c5d | ||
|
|
97f6da0c3e | ||
|
|
c229fe9694 | ||
|
|
ee1ca8ae4d | ||
|
|
6bfd322e3b | ||
|
|
afc18d5070 | ||
|
|
2ec22c2404 | ||
|
|
19da778271 | ||
|
|
cbd4c13ba6 | ||
|
|
338871802d | ||
|
|
8b5b1b8af6 | ||
|
|
ea89eff95d | ||
|
|
96421e7779 | ||
|
|
142336ca53 | ||
|
|
492f12248c | ||
|
|
7eb7a0e5fe |
2
.gitmodules
vendored
2
.gitmodules
vendored
@@ -1,6 +1,6 @@
|
||||
[submodule "seastar"]
|
||||
path = seastar
|
||||
url = ../seastar
|
||||
url = ../scylla-seastar
|
||||
ignore = dirty
|
||||
[submodule "swagger-ui"]
|
||||
path = swagger-ui
|
||||
|
||||
@@ -60,7 +60,7 @@ fi
|
||||
|
||||
# Default scylla product/version tags
|
||||
PRODUCT=scylla
|
||||
VERSION=4.6.dev
|
||||
VERSION=4.6.7
|
||||
|
||||
if test -f version
|
||||
then
|
||||
|
||||
@@ -1017,18 +1017,16 @@ future<executor::request_return_type> executor::update_table(client_state& clien
|
||||
_stats.api_operations.update_table++;
|
||||
elogger.trace("Updating table {}", request);
|
||||
|
||||
std::string table_name = get_table_name(request);
|
||||
if (table_name.find(INTERNAL_TABLE_PREFIX) == 0) {
|
||||
schema_ptr tab = get_table(_proxy, request);
|
||||
// the ugly but harmless conversion to string_view here is because
|
||||
// Seastar's sstring is missing a find(std::string_view) :-()
|
||||
if (std::string_view(tab->cf_name()).find(INTERNAL_TABLE_PREFIX) == 0) {
|
||||
return make_ready_future<request_return_type>(api_error::validation(
|
||||
format("Prefix {} is reserved for accessing internal tables", INTERNAL_TABLE_PREFIX)));
|
||||
}
|
||||
std::string keyspace_name = executor::KEYSPACE_NAME_PREFIX + table_name;
|
||||
tracing::add_table_name(trace_state, keyspace_name, table_name);
|
||||
tracing::add_table_name(trace_state, tab->ks_name(), tab->cf_name());
|
||||
|
||||
auto& db = _proxy.get_db().local();
|
||||
auto& cf = db.find_column_family(keyspace_name, table_name);
|
||||
|
||||
schema_builder builder(cf.schema());
|
||||
schema_builder builder(tab);
|
||||
|
||||
rjson::value* stream_specification = rjson::find(request, "StreamSpecification");
|
||||
if (stream_specification && stream_specification->IsObject()) {
|
||||
@@ -2080,6 +2078,9 @@ static attrs_to_get calculate_attrs_to_get(const rjson::value& req, std::unorder
|
||||
for (auto it = attributes_to_get.Begin(); it != attributes_to_get.End(); ++it) {
|
||||
attribute_path_map_add("AttributesToGet", ret, it->GetString());
|
||||
}
|
||||
if (ret.empty()) {
|
||||
throw api_error::validation("Empty AttributesToGet is not allowed. Consider using Select=COUNT instead.");
|
||||
}
|
||||
return ret;
|
||||
} else if (has_projection_expression) {
|
||||
const rjson::value& projection_expression = req["ProjectionExpression"];
|
||||
@@ -2481,8 +2482,8 @@ static bool hierarchy_actions(
|
||||
// attr member so we can use add()
|
||||
rjson::add_with_string_name(v, attr, std::move(*newv));
|
||||
} else {
|
||||
throw api_error::validation(format("Can't remove document path {} - not present in item",
|
||||
subh.get_value()._path));
|
||||
// Removing a.b when a is a map but a.b doesn't exist
|
||||
// is silently ignored. It's not considered an error.
|
||||
}
|
||||
} else {
|
||||
throw api_error::validation(format("UpdateExpression: document paths not valid for this item:{}", h));
|
||||
|
||||
@@ -94,10 +94,7 @@ future<executor::request_return_type> executor::update_time_to_live(client_state
|
||||
}
|
||||
|
||||
future<executor::request_return_type> executor::describe_time_to_live(client_state& client_state, service_permit permit, rjson::value request) {
|
||||
_stats.api_operations.update_time_to_live++;
|
||||
if (!_proxy.get_db().local().features().cluster_supports_alternator_ttl()) {
|
||||
co_return api_error::unknown_operation("DescribeTimeToLive not yet supported. Experimental support is available if the 'alternator_ttl' experimental feature is enabled on all nodes.");
|
||||
}
|
||||
_stats.api_operations.describe_time_to_live++;
|
||||
schema_ptr schema = get_table(_proxy, request);
|
||||
std::map<sstring, sstring> tags_map = get_tags_of_table(schema);
|
||||
rjson::value desc = rjson::empty_object();
|
||||
|
||||
@@ -79,6 +79,49 @@ atomic_cell::atomic_cell(const abstract_type& type, atomic_cell_view other)
|
||||
set_view(_data);
|
||||
}
|
||||
|
||||
// Based on:
|
||||
// - org.apache.cassandra.db.AbstractCell#reconcile()
|
||||
// - org.apache.cassandra.db.BufferExpiringCell#reconcile()
|
||||
// - org.apache.cassandra.db.BufferDeletedCell#reconcile()
|
||||
std::strong_ordering
|
||||
compare_atomic_cell_for_merge(atomic_cell_view left, atomic_cell_view right) {
|
||||
if (left.timestamp() != right.timestamp()) {
|
||||
return left.timestamp() <=> right.timestamp();
|
||||
}
|
||||
if (left.is_live() != right.is_live()) {
|
||||
return left.is_live() ? std::strong_ordering::less : std::strong_ordering::greater;
|
||||
}
|
||||
if (left.is_live()) {
|
||||
auto c = compare_unsigned(left.value(), right.value()) <=> 0;
|
||||
if (c != 0) {
|
||||
return c;
|
||||
}
|
||||
if (left.is_live_and_has_ttl() != right.is_live_and_has_ttl()) {
|
||||
// prefer expiring cells.
|
||||
return left.is_live_and_has_ttl() ? std::strong_ordering::greater : std::strong_ordering::less;
|
||||
}
|
||||
if (left.is_live_and_has_ttl()) {
|
||||
if (left.expiry() != right.expiry()) {
|
||||
return left.expiry() <=> right.expiry();
|
||||
} else {
|
||||
// prefer the cell that was written later,
|
||||
// so it survives longer after it expires, until purged.
|
||||
return right.ttl() <=> left.ttl();
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Both are deleted
|
||||
|
||||
// Origin compares big-endian serialized deletion time. That's because it
|
||||
// delegates to AbstractCell.reconcile() which compares values after
|
||||
// comparing timestamps, which in case of deleted cells will hold
|
||||
// serialized expiry.
|
||||
return (uint64_t) left.deletion_time().time_since_epoch().count()
|
||||
<=> (uint64_t) right.deletion_time().time_since_epoch().count();
|
||||
}
|
||||
return std::strong_ordering::equal;
|
||||
}
|
||||
|
||||
atomic_cell_or_collection atomic_cell_or_collection::copy(const abstract_type& type) const {
|
||||
if (_data.empty()) {
|
||||
return atomic_cell_or_collection();
|
||||
|
||||
@@ -593,8 +593,8 @@ void cache_flat_mutation_reader::move_to_range(query::clustering_row_ranges::con
|
||||
clogger.trace("csm {}: insert dummy at {}", fmt::ptr(this), _lower_bound);
|
||||
auto it = with_allocator(_lsa_manager.region().allocator(), [&] {
|
||||
auto& rows = _snp->version()->partition().mutable_clustered_rows();
|
||||
auto new_entry = current_allocator().construct<rows_entry>(*_schema, _lower_bound, is_dummy::yes, is_continuous::no);
|
||||
return rows.insert_before(_next_row.get_iterator_in_latest_version(), *new_entry);
|
||||
auto new_entry = alloc_strategy_unique_ptr<rows_entry>(current_allocator().construct<rows_entry>(*_schema, _lower_bound, is_dummy::yes, is_continuous::no));
|
||||
return rows.insert_before(_next_row.get_iterator_in_latest_version(), std::move(new_entry));
|
||||
});
|
||||
_snp->tracker()->insert(*it);
|
||||
_last_row = partition_snapshot_row_weakref(*_snp, it, true);
|
||||
|
||||
@@ -765,8 +765,12 @@ future<> generation_service::check_and_repair_cdc_streams() {
|
||||
std::optional<cdc::generation_id> latest = _gen_id;
|
||||
const auto& endpoint_states = _gossiper.get_endpoint_states();
|
||||
for (const auto& [addr, state] : endpoint_states) {
|
||||
if (!_gossiper.is_normal(addr)) {
|
||||
throw std::runtime_error(format("All nodes must be in NORMAL state while performing check_and_repair_cdc_streams"
|
||||
if (_gossiper.is_left(addr)) {
|
||||
cdc_log.info("check_and_repair_cdc_streams ignored node {} because it is in LEFT state", addr);
|
||||
continue;
|
||||
}
|
||||
if (!_gossiper.is_normal(addr)) {
|
||||
throw std::runtime_error(format("All nodes must be in NORMAL or LEFT state while performing check_and_repair_cdc_streams"
|
||||
" ({} is in state {})", addr, _gossiper.get_gossip_status(state)));
|
||||
}
|
||||
|
||||
@@ -830,6 +834,11 @@ future<> generation_service::check_and_repair_cdc_streams() {
|
||||
latest, db_clock::now());
|
||||
should_regenerate = true;
|
||||
} else {
|
||||
if (tmptr->sorted_tokens().size() != gen->entries().size()) {
|
||||
// We probably have garbage streams from old generations
|
||||
cdc_log.info("Generation size does not match the token ring, regenerating");
|
||||
should_regenerate = true;
|
||||
} else {
|
||||
std::unordered_set<dht::token> gen_ends;
|
||||
for (const auto& entry : gen->entries()) {
|
||||
gen_ends.insert(entry.token_range_end);
|
||||
@@ -841,6 +850,7 @@ future<> generation_service::check_and_repair_cdc_streams() {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
25
cdc/log.cc
25
cdc/log.cc
@@ -73,7 +73,7 @@ using namespace std::chrono_literals;
|
||||
logging::logger cdc_log("cdc");
|
||||
|
||||
namespace cdc {
|
||||
static schema_ptr create_log_schema(const schema&, std::optional<utils::UUID> = {});
|
||||
static schema_ptr create_log_schema(const schema&, std::optional<utils::UUID> = {}, schema_ptr = nullptr);
|
||||
}
|
||||
|
||||
static constexpr auto cdc_group_name = "cdc";
|
||||
@@ -220,7 +220,7 @@ public:
|
||||
return;
|
||||
}
|
||||
|
||||
auto new_log_schema = create_log_schema(new_schema, log_schema ? std::make_optional(log_schema->id()) : std::nullopt);
|
||||
auto new_log_schema = create_log_schema(new_schema, log_schema ? std::make_optional(log_schema->id()) : std::nullopt, log_schema);
|
||||
|
||||
auto log_mut = log_schema
|
||||
? db::schema_tables::make_update_table_mutations(db, keyspace.metadata(), log_schema, new_log_schema, timestamp, false)
|
||||
@@ -503,7 +503,7 @@ bytes log_data_column_deleted_elements_name_bytes(const bytes& column_name) {
|
||||
return to_bytes(cdc_deleted_elements_column_prefix) + column_name;
|
||||
}
|
||||
|
||||
static schema_ptr create_log_schema(const schema& s, std::optional<utils::UUID> uuid) {
|
||||
static schema_ptr create_log_schema(const schema& s, std::optional<utils::UUID> uuid, schema_ptr old) {
|
||||
schema_builder b(s.ks_name(), log_name(s.cf_name()));
|
||||
b.with_partitioner("com.scylladb.dht.CDCPartitioner");
|
||||
b.set_compaction_strategy(sstables::compaction_strategy_type::time_window);
|
||||
@@ -590,6 +590,20 @@ static schema_ptr create_log_schema(const schema& s, std::optional<utils::UUID>
|
||||
b.set_uuid(*uuid);
|
||||
}
|
||||
|
||||
/**
|
||||
* #10473 - if we are redefining the log table, we need to ensure any dropped
|
||||
* columns are registered in "dropped_columns" table, otherwise clients will not
|
||||
* be able to read data older than now.
|
||||
*/
|
||||
if (old) {
|
||||
// not super efficient, but we don't do this often.
|
||||
for (auto& col : old->all_columns()) {
|
||||
if (!b.has_column({col.name(), col.name_as_text() })) {
|
||||
b.without_column(col.name_as_text(), col.type, api::new_timestamp());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return b.build();
|
||||
}
|
||||
|
||||
@@ -1511,6 +1525,11 @@ public:
|
||||
}
|
||||
|
||||
auto process_cell = [&, this] (const column_definition& cdef) {
|
||||
// If table uses compact storage it may contain a column of type empty
|
||||
// and we need to ignore such a field because it is not present in CDC log.
|
||||
if (cdef.type->get_kind() == abstract_type::kind::empty) {
|
||||
return;
|
||||
}
|
||||
if (auto current = get_col_from_row_state(row_state, cdef)) {
|
||||
_builder->set_value(image_ck, cdef, *current);
|
||||
} else if (op == operation::pre_image) {
|
||||
|
||||
@@ -1634,7 +1634,7 @@ future<bool> scrub_validate_mode_validate_reader(flat_mutation_reader reader, co
|
||||
while (auto mf_opt = co_await reader()) {
|
||||
if (cdata.is_stop_requested()) [[unlikely]] {
|
||||
// Compaction manager will catch this exception and re-schedule the compaction.
|
||||
co_return coroutine::make_exception(compaction_stopped_exception(schema->ks_name(), schema->cf_name(), cdata.stop_requested));
|
||||
throw compaction_stopped_exception(schema->ks_name(), schema->cf_name(), cdata.stop_requested);
|
||||
}
|
||||
|
||||
const auto& mf = *mf_opt;
|
||||
|
||||
@@ -326,6 +326,11 @@ future<> compaction_manager::run_custom_job(column_family* cf, sstables::compact
|
||||
task->compaction_done = with_semaphore(_custom_job_sem, 1, [this, task, cf, &job = *job_ptr] () mutable {
|
||||
// take read lock for cf, so major compaction and resharding can't proceed in parallel.
|
||||
return with_lock(_compaction_locks[cf].for_read(), [this, task, cf, &job] () mutable {
|
||||
// Allow caller to know that task (e.g. reshape) was asked to stop while waiting for a chance to run.
|
||||
if (task->compaction_data.is_stop_requested()) {
|
||||
throw sstables::compaction_stopped_exception(task->compacting_cf->schema()->ks_name(), task->compacting_cf->schema()->cf_name(),
|
||||
task->compaction_data.stop_requested);
|
||||
}
|
||||
_stats.active_tasks++;
|
||||
if (!can_proceed(task)) {
|
||||
return make_ready_future<>();
|
||||
@@ -676,6 +681,7 @@ void compaction_manager::submit_offstrategy(column_family* cf) {
|
||||
_stats.active_tasks++;
|
||||
task->setup_new_compaction();
|
||||
|
||||
return with_scheduling_group(_maintenance_sg.cpu, [this, task, cf] {
|
||||
return cf->run_offstrategy_compaction(task->compaction_data).then_wrapped([this, task] (future<> f) mutable {
|
||||
_stats.active_tasks--;
|
||||
task->finish_compaction();
|
||||
@@ -698,6 +704,7 @@ void compaction_manager::submit_offstrategy(column_family* cf) {
|
||||
_tasks.remove(task);
|
||||
return make_ready_future<stop_iteration>(stop_iteration::yes);
|
||||
});
|
||||
});
|
||||
});
|
||||
});
|
||||
});
|
||||
@@ -714,9 +721,20 @@ inline bool compaction_manager::check_for_cleanup(column_family* cf) {
|
||||
|
||||
future<> compaction_manager::rewrite_sstables(column_family* cf, sstables::compaction_type_options options, get_candidates_func get_func, can_purge_tombstones can_purge) {
|
||||
auto task = make_lw_shared<compaction_manager::task>(cf, options.type());
|
||||
_tasks.push_back(task);
|
||||
|
||||
auto sstables = std::make_unique<std::vector<sstables::shared_sstable>>(get_func(*cf));
|
||||
std::unique_ptr<std::vector<sstables::shared_sstable>> sstables;
|
||||
lw_shared_ptr<compacting_sstable_registration> compacting;
|
||||
|
||||
// since we might potentially have ongoing compactions, and we
|
||||
// must ensure that all sstables created before we run are included
|
||||
// in the re-write, we need to barrier out any previously running
|
||||
// compaction.
|
||||
auto get_and_register_candidates_func = [this, &sstables, &compacting, &get_func] () mutable -> future<> {
|
||||
sstables = std::make_unique<std::vector<sstables::shared_sstable>>(co_await get_func());
|
||||
compacting = make_lw_shared<compacting_sstable_registration>(this, *sstables);
|
||||
};
|
||||
|
||||
co_await cf->run_with_compaction_disabled(std::ref(get_and_register_candidates_func));
|
||||
// sort sstables by size in descending order, such that the smallest files will be rewritten first
|
||||
// (as sstable to be rewritten is popped off from the back of container), so rewrite will have higher
|
||||
// chance to succeed when the biggest files are reached.
|
||||
@@ -724,10 +742,11 @@ future<> compaction_manager::rewrite_sstables(column_family* cf, sstables::compa
|
||||
return a->data_size() > b->data_size();
|
||||
});
|
||||
|
||||
auto compacting = make_lw_shared<compacting_sstable_registration>(this, *sstables);
|
||||
auto sstables_ptr = sstables.get();
|
||||
_stats.pending_tasks += sstables->size();
|
||||
|
||||
_tasks.push_back(task);
|
||||
|
||||
task->compaction_done = do_until([this, sstables_ptr, task] { return sstables_ptr->empty() || !can_proceed(task); },
|
||||
[this, task, options, sstables_ptr, compacting, can_purge] () mutable {
|
||||
auto sst = sstables_ptr->back();
|
||||
@@ -737,8 +756,10 @@ future<> compaction_manager::rewrite_sstables(column_family* cf, sstables::compa
|
||||
column_family& cf = *task->compacting_cf;
|
||||
auto sstable_level = sst->get_sstable_level();
|
||||
auto run_identifier = sst->run_identifier();
|
||||
|
||||
auto sstable_set_snapshot = can_purge ? std::make_optional(cf.get_sstable_set()) : std::nullopt;
|
||||
auto descriptor = sstables::compaction_descriptor({ sst }, std::move(sstable_set_snapshot), _maintenance_sg.io,
|
||||
// FIXME: this compaction should run with maintenance priority.
|
||||
auto descriptor = sstables::compaction_descriptor({ sst }, std::move(sstable_set_snapshot), service::get_local_compaction_priority(),
|
||||
sstable_level, sstables::compaction_descriptor::default_max_sstable_bytes, run_identifier, options);
|
||||
|
||||
// Releases reference to cleaned sstable such that respective used disk space can be freed.
|
||||
@@ -747,15 +768,14 @@ future<> compaction_manager::rewrite_sstables(column_family* cf, sstables::compa
|
||||
};
|
||||
|
||||
return with_semaphore(_rewrite_sstables_sem, 1, [this, task, &cf, descriptor = std::move(descriptor), compacting] () mutable {
|
||||
// Take write lock for cf to serialize cleanup/upgrade sstables/scrub with major compaction/reshape/reshard.
|
||||
return with_lock(_compaction_locks[&cf].for_write(), [this, task, &cf, descriptor = std::move(descriptor), compacting] () mutable {
|
||||
return with_lock(_compaction_locks[&cf].for_read(), [this, task, &cf, descriptor = std::move(descriptor), compacting] () mutable {
|
||||
_stats.pending_tasks--;
|
||||
_stats.active_tasks++;
|
||||
task->setup_new_compaction();
|
||||
task->output_run_identifier = descriptor.run_identifier;
|
||||
compaction_backlog_tracker user_initiated(std::make_unique<user_initiated_backlog_tracker>(_compaction_controller.backlog_of_shares(200), _available_memory));
|
||||
return do_with(std::move(user_initiated), [this, &cf, descriptor = std::move(descriptor), task] (compaction_backlog_tracker& bt) mutable {
|
||||
return with_scheduling_group(_maintenance_sg.cpu, [this, &cf, descriptor = std::move(descriptor), task]() mutable {
|
||||
return with_scheduling_group(_compaction_controller.sg(), [this, &cf, descriptor = std::move(descriptor), task]() mutable {
|
||||
return cf.compact_sstables(std::move(descriptor), task->compaction_data);
|
||||
});
|
||||
});
|
||||
@@ -783,7 +803,7 @@ future<> compaction_manager::rewrite_sstables(column_family* cf, sstables::compa
|
||||
_tasks.remove(task);
|
||||
});
|
||||
|
||||
return task->compaction_done.get_future().then([task] {});
|
||||
co_return co_await task->compaction_done.get_future();
|
||||
}
|
||||
|
||||
future<> compaction_manager::perform_sstable_scrub_validate_mode(column_family* cf) {
|
||||
@@ -865,31 +885,29 @@ future<> compaction_manager::perform_cleanup(database& db, column_family* cf) {
|
||||
return make_exception_future<>(std::runtime_error(format("cleanup request failed: there is an ongoing cleanup on {}.{}",
|
||||
cf->schema()->ks_name(), cf->schema()->cf_name())));
|
||||
}
|
||||
return seastar::async([this, cf, &db] {
|
||||
// FIXME: indentation
|
||||
auto sorted_owned_ranges = db.get_keyspace_local_ranges(cf->schema()->ks_name());
|
||||
auto get_sstables = [this, &db, cf, sorted_owned_ranges] () -> future<std::vector<sstables::shared_sstable>> {
|
||||
return seastar::async([this, &db, cf, sorted_owned_ranges = std::move(sorted_owned_ranges)] {
|
||||
auto schema = cf->schema();
|
||||
auto sorted_owned_ranges = db.get_keyspace_local_ranges(schema->ks_name());
|
||||
auto sstables = std::vector<sstables::shared_sstable>{};
|
||||
const auto candidates = get_candidates(*cf);
|
||||
std::copy_if(candidates.begin(), candidates.end(), std::back_inserter(sstables), [&sorted_owned_ranges, schema] (const sstables::shared_sstable& sst) {
|
||||
seastar::thread::maybe_yield();
|
||||
return sorted_owned_ranges.empty() || needs_cleanup(sst, sorted_owned_ranges, schema);
|
||||
});
|
||||
return std::tuple<dht::token_range_vector, std::vector<sstables::shared_sstable>>(sorted_owned_ranges, sstables);
|
||||
}).then_unpack([this, cf, &db] (dht::token_range_vector owned_ranges, std::vector<sstables::shared_sstable> sstables) {
|
||||
return rewrite_sstables(cf, sstables::compaction_type_options::make_cleanup(std::move(owned_ranges)),
|
||||
[sstables = std::move(sstables)] (const table&) { return sstables; });
|
||||
return sstables;
|
||||
});
|
||||
};
|
||||
return rewrite_sstables(cf, sstables::compaction_type_options::make_cleanup(std::move(sorted_owned_ranges)), std::move(get_sstables));
|
||||
}
|
||||
|
||||
// Submit a column family to be upgraded and wait for its termination.
|
||||
future<> compaction_manager::perform_sstable_upgrade(database& db, column_family* cf, bool exclude_current_version) {
|
||||
using shared_sstables = std::vector<sstables::shared_sstable>;
|
||||
return do_with(shared_sstables{}, [this, &db, cf, exclude_current_version](shared_sstables& tables) {
|
||||
// since we might potentially have ongoing compactions, and we
|
||||
// must ensure that all sstables created before we run are included
|
||||
// in the re-write, we need to barrier out any previously running
|
||||
// compaction.
|
||||
return cf->run_with_compaction_disabled([this, cf, &tables, exclude_current_version] {
|
||||
auto get_sstables = [this, &db, cf, exclude_current_version] {
|
||||
// FIXME: indentation
|
||||
std::vector<sstables::shared_sstable> tables;
|
||||
|
||||
auto last_version = cf->get_sstables_manager().get_highest_supported_format();
|
||||
|
||||
for (auto& sst : get_candidates(*cf)) {
|
||||
@@ -900,21 +918,17 @@ future<> compaction_manager::perform_sstable_upgrade(database& db, column_family
|
||||
tables.emplace_back(sst);
|
||||
}
|
||||
}
|
||||
return make_ready_future<>();
|
||||
}).then([&db, cf] {
|
||||
return db.get_keyspace_local_ranges(cf->schema()->ks_name());
|
||||
}).then([this, &db, cf, &tables] (dht::token_range_vector owned_ranges) {
|
||||
// doing a "cleanup" is about as compacting as we need
|
||||
// to be, provided we get to decide the tables to process,
|
||||
// and ignoring any existing operations.
|
||||
// Note that we potentially could be doing multiple
|
||||
// upgrades here in parallel, but that is really the users
|
||||
// problem.
|
||||
return rewrite_sstables(cf, sstables::compaction_type_options::make_upgrade(std::move(owned_ranges)), [&](auto&) mutable {
|
||||
return std::exchange(tables, {});
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
return make_ready_future<std::vector<sstables::shared_sstable>>(tables);
|
||||
};
|
||||
|
||||
// doing a "cleanup" is about as compacting as we need
|
||||
// to be, provided we get to decide the tables to process,
|
||||
// and ignoring any existing operations.
|
||||
// Note that we potentially could be doing multiple
|
||||
// upgrades here in parallel, but that is really the users
|
||||
// problem.
|
||||
return rewrite_sstables(cf, sstables::compaction_type_options::make_upgrade(db.get_keyspace_local_ranges(cf->schema()->ks_name())), std::move(get_sstables));
|
||||
}
|
||||
|
||||
// Submit a column family to be scrubbed and wait for its termination.
|
||||
@@ -922,14 +936,10 @@ future<> compaction_manager::perform_sstable_scrub(column_family* cf, sstables::
|
||||
if (scrub_mode == sstables::compaction_type_options::scrub::mode::validate) {
|
||||
return perform_sstable_scrub_validate_mode(cf);
|
||||
}
|
||||
// since we might potentially have ongoing compactions, and we
|
||||
// must ensure that all sstables created before we run are scrubbed,
|
||||
// we need to barrier out any previously running compaction.
|
||||
return cf->run_with_compaction_disabled([this, cf, scrub_mode] {
|
||||
return rewrite_sstables(cf, sstables::compaction_type_options::make_scrub(scrub_mode), [this] (const table& cf) {
|
||||
return get_candidates(cf);
|
||||
// FIXME: indentation
|
||||
return rewrite_sstables(cf, sstables::compaction_type_options::make_scrub(scrub_mode), [this, cf] {
|
||||
return make_ready_future<std::vector<sstables::shared_sstable>>(get_candidates(*cf));
|
||||
}, can_purge_tombstones::no);
|
||||
});
|
||||
}
|
||||
|
||||
future<> compaction_manager::remove(column_family* cf) {
|
||||
@@ -979,7 +989,7 @@ void compaction_manager::stop_compaction(sstring type) {
|
||||
}
|
||||
// FIXME: switch to task_stop(), and wait for their termination, so API user can know when compactions actually stopped.
|
||||
for (auto& task : _tasks) {
|
||||
if (task->compaction_running && target_type == task->type) {
|
||||
if (target_type == task->type) {
|
||||
task->compaction_data.stop("user request");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -178,7 +178,7 @@ private:
|
||||
maintenance_scheduling_group _maintenance_sg;
|
||||
size_t _available_memory;
|
||||
|
||||
using get_candidates_func = std::function<std::vector<sstables::shared_sstable>(const column_family&)>;
|
||||
using get_candidates_func = std::function<future<std::vector<sstables::shared_sstable>>()>;
|
||||
class can_purge_tombstones_tag;
|
||||
using can_purge_tombstones = bool_class<can_purge_tombstones_tag>;
|
||||
|
||||
|
||||
@@ -80,7 +80,11 @@ compaction_descriptor leveled_compaction_strategy::get_major_compaction_job(colu
|
||||
}
|
||||
|
||||
void leveled_compaction_strategy::notify_completion(const std::vector<shared_sstable>& removed, const std::vector<shared_sstable>& added) {
|
||||
if (removed.empty() || added.empty()) {
|
||||
// All the update here is only relevant for regular compaction's round-robin picking policy, and if
|
||||
// last_compacted_keys wasn't generated by regular, it means regular is disabled since last restart,
|
||||
// therefore we can skip the updates here until regular runs for the first time. Once it runs,
|
||||
// it will be able to generate last_compacted_keys correctly by looking at metadata of files.
|
||||
if (removed.empty() || added.empty() || !_last_compacted_keys) {
|
||||
return;
|
||||
}
|
||||
auto min_level = std::numeric_limits<uint32_t>::max();
|
||||
|
||||
@@ -225,6 +225,7 @@ time_window_compaction_strategy::get_sstables_for_compaction(column_family& cf,
|
||||
auto gc_before = gc_clock::now() - cf.schema()->gc_grace_seconds();
|
||||
|
||||
if (candidates.empty()) {
|
||||
_estimated_remaining_tasks = 0;
|
||||
return compaction_descriptor();
|
||||
}
|
||||
|
||||
|
||||
@@ -109,9 +109,7 @@ public:
|
||||
virtual seastar::future<seastar::shared_ptr<cql_transport::messages::result_message>>
|
||||
execute(query_processor& qp, service::query_state& state, const query_options& options) const = 0;
|
||||
|
||||
virtual bool depends_on_keyspace(const seastar::sstring& ks_name) const = 0;
|
||||
|
||||
virtual bool depends_on_column_family(const seastar::sstring& cf_name) const = 0;
|
||||
virtual bool depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const = 0;
|
||||
|
||||
virtual seastar::shared_ptr<const metadata> get_result_metadata() const = 0;
|
||||
|
||||
|
||||
@@ -117,10 +117,44 @@ managed_bytes_opt get_value(const column_value& col, const column_value_eval_bag
|
||||
if (!col_type->is_map()) {
|
||||
throw exceptions::invalid_request_exception(format("subscripting non-map column {}", cdef->name_as_text()));
|
||||
}
|
||||
const auto deserialized = cdef->type->deserialize(managed_bytes_view(*data.other_columns[data.sel.index_of(*cdef)]));
|
||||
int32_t index = data.sel.index_of(*cdef);
|
||||
if (index == -1) {
|
||||
throw std::runtime_error(
|
||||
format("Column definition {} does not match any column in the query selection",
|
||||
cdef->name_as_text()));
|
||||
}
|
||||
const managed_bytes_opt& serialized = data.other_columns[index];
|
||||
if (!serialized) {
|
||||
// For null[i] we return null.
|
||||
return std::nullopt;
|
||||
}
|
||||
const auto deserialized = cdef->type->deserialize(managed_bytes_view(*serialized));
|
||||
const auto& data_map = value_cast<map_type_impl::native_type>(deserialized);
|
||||
const auto key = evaluate_to_raw_view(col.sub, options);
|
||||
auto&& key_type = col_type->name_comparator();
|
||||
if (key.is_null()) {
|
||||
// For m[null] return null.
|
||||
// This is different from Cassandra - which treats m[null]
|
||||
// as an invalid request error. But m[null] -> null is more
|
||||
// consistent with our usual null treatement (e.g., both
|
||||
// null[2] and null < 2 return null). It will also allow us
|
||||
// to support non-constant subscripts (e.g., m[a]) where "a"
|
||||
// may be null in some rows and non-null in others, and it's
|
||||
// not an error.
|
||||
return std::nullopt;
|
||||
}
|
||||
if (key.is_unset_value()) {
|
||||
// An m[?] with ? bound to UNSET_VALUE is a invalid query.
|
||||
// We could have detected it earlier while binding, but since
|
||||
// we currently don't, we must protect the following code
|
||||
// which can't work with an UNSET_VALUE. Note that the
|
||||
// placement of this check here means that in an empty table,
|
||||
// where we never need to evaluate the filter expression, this
|
||||
// error will not be detected.
|
||||
throw exceptions::invalid_request_exception(
|
||||
format("Unsupported unset map key for column {}",
|
||||
cdef->name_as_text()));
|
||||
}
|
||||
const auto found = key.with_linearized([&] (bytes_view key_bv) {
|
||||
using entry = std::pair<data_value, data_value>;
|
||||
return std::find_if(data_map.cbegin(), data_map.cend(), [&] (const entry& element) {
|
||||
@@ -135,8 +169,16 @@ managed_bytes_opt get_value(const column_value& col, const column_value_eval_bag
|
||||
case column_kind::clustering_key:
|
||||
return managed_bytes(data.clustering_key[cdef->id]);
|
||||
case column_kind::static_column:
|
||||
case column_kind::regular_column:
|
||||
return managed_bytes_opt(data.other_columns[data.sel.index_of(*cdef)]);
|
||||
[[fallthrough]];
|
||||
case column_kind::regular_column: {
|
||||
int32_t index = data.sel.index_of(*cdef);
|
||||
if (index == -1) {
|
||||
throw std::runtime_error(
|
||||
format("Column definition {} does not match any column in the query selection",
|
||||
cdef->name_as_text()));
|
||||
}
|
||||
return managed_bytes_opt(data.other_columns[index]);
|
||||
}
|
||||
default:
|
||||
throw exceptions::unsupported_operation_exception("Unknown column kind");
|
||||
}
|
||||
|
||||
@@ -970,7 +970,7 @@ bool query_processor::migration_subscriber::should_invalidate(
|
||||
sstring ks_name,
|
||||
std::optional<sstring> cf_name,
|
||||
::shared_ptr<cql_statement> statement) {
|
||||
return statement->depends_on_keyspace(ks_name) && (!cf_name || statement->depends_on_column_family(*cf_name));
|
||||
return statement->depends_on(ks_name, cf_name);
|
||||
}
|
||||
|
||||
future<> query_processor::query_internal(
|
||||
|
||||
@@ -528,7 +528,7 @@ statement_restrictions::statement_restrictions(database& db,
|
||||
}
|
||||
|
||||
if (!_nonprimary_key_restrictions->empty()) {
|
||||
if (_has_queriable_regular_index) {
|
||||
if (_has_queriable_regular_index && _partition_range_is_simple) {
|
||||
_uses_secondary_indexing = true;
|
||||
} else if (!allow_filtering) {
|
||||
throw exceptions::invalid_request_exception("Cannot execute this query as it might involve data filtering and "
|
||||
|
||||
@@ -193,7 +193,7 @@ public:
|
||||
|
||||
template<typename RowComparator>
|
||||
void sort(const RowComparator& cmp) {
|
||||
std::sort(_rows.begin(), _rows.end(), std::ref(cmp));
|
||||
std::sort(_rows.begin(), _rows.end(), cmp);
|
||||
}
|
||||
|
||||
metadata& get_metadata();
|
||||
|
||||
@@ -46,13 +46,7 @@ uint32_t cql3::statements::authentication_statement::get_bound_terms() const {
|
||||
return 0;
|
||||
}
|
||||
|
||||
bool cql3::statements::authentication_statement::depends_on_keyspace(
|
||||
const sstring& ks_name) const {
|
||||
return false;
|
||||
}
|
||||
|
||||
bool cql3::statements::authentication_statement::depends_on_column_family(
|
||||
const sstring& cf_name) const {
|
||||
bool cql3::statements::authentication_statement::depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const {
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
@@ -55,9 +55,7 @@ public:
|
||||
|
||||
uint32_t get_bound_terms() const override;
|
||||
|
||||
bool depends_on_keyspace(const sstring& ks_name) const override;
|
||||
|
||||
bool depends_on_column_family(const sstring& cf_name) const override;
|
||||
bool depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const override;
|
||||
|
||||
future<> check_access(service::storage_proxy& proxy, const service::client_state& state) const override;
|
||||
|
||||
|
||||
@@ -48,13 +48,7 @@ uint32_t cql3::statements::authorization_statement::get_bound_terms() const {
|
||||
return 0;
|
||||
}
|
||||
|
||||
bool cql3::statements::authorization_statement::depends_on_keyspace(
|
||||
const sstring& ks_name) const {
|
||||
return false;
|
||||
}
|
||||
|
||||
bool cql3::statements::authorization_statement::depends_on_column_family(
|
||||
const sstring& cf_name) const {
|
||||
bool cql3::statements::authorization_statement::depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const {
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
@@ -59,9 +59,7 @@ public:
|
||||
|
||||
uint32_t get_bound_terms() const override;
|
||||
|
||||
bool depends_on_keyspace(const sstring& ks_name) const override;
|
||||
|
||||
bool depends_on_column_family(const sstring& cf_name) const override;
|
||||
bool depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const override;
|
||||
|
||||
future<> check_access(service::storage_proxy& proxy, const service::client_state& state) const override;
|
||||
|
||||
|
||||
@@ -98,14 +98,9 @@ batch_statement::batch_statement(type type_,
|
||||
{
|
||||
}
|
||||
|
||||
bool batch_statement::depends_on_keyspace(const sstring& ks_name) const
|
||||
bool batch_statement::depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
bool batch_statement::depends_on_column_family(const sstring& cf_name) const
|
||||
{
|
||||
return false;
|
||||
return boost::algorithm::any_of(_statements, [&ks_name, &cf_name] (auto&& s) { return s.statement->depends_on(ks_name, cf_name); });
|
||||
}
|
||||
|
||||
uint32_t batch_statement::get_bound_terms() const
|
||||
|
||||
@@ -115,9 +115,7 @@ public:
|
||||
std::unique_ptr<attributes> attrs,
|
||||
cql_stats& stats);
|
||||
|
||||
virtual bool depends_on_keyspace(const sstring& ks_name) const override;
|
||||
|
||||
virtual bool depends_on_column_family(const sstring& cf_name) const override;
|
||||
virtual bool depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const override;
|
||||
|
||||
virtual uint32_t get_bound_terms() const override;
|
||||
|
||||
|
||||
@@ -571,12 +571,8 @@ modification_statement::validate(service::storage_proxy&, const service::client_
|
||||
}
|
||||
}
|
||||
|
||||
bool modification_statement::depends_on_keyspace(const sstring& ks_name) const {
|
||||
return keyspace() == ks_name;
|
||||
}
|
||||
|
||||
bool modification_statement::depends_on_column_family(const sstring& cf_name) const {
|
||||
return column_family() == cf_name;
|
||||
bool modification_statement::depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const {
|
||||
return keyspace() == ks_name && (!cf_name || column_family() == *cf_name);
|
||||
}
|
||||
|
||||
void modification_statement::add_operation(::shared_ptr<operation> op) {
|
||||
|
||||
@@ -165,9 +165,7 @@ public:
|
||||
// Validate before execute, using client state and current schema
|
||||
void validate(service::storage_proxy&, const service::client_state& state) const override;
|
||||
|
||||
virtual bool depends_on_keyspace(const sstring& ks_name) const override;
|
||||
|
||||
virtual bool depends_on_column_family(const sstring& cf_name) const override;
|
||||
virtual bool depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const override;
|
||||
|
||||
void add_operation(::shared_ptr<operation> op);
|
||||
|
||||
|
||||
@@ -67,12 +67,7 @@ future<> schema_altering_statement::grant_permissions_to_creator(const service::
|
||||
return make_ready_future<>();
|
||||
}
|
||||
|
||||
bool schema_altering_statement::depends_on_keyspace(const sstring& ks_name) const
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
bool schema_altering_statement::depends_on_column_family(const sstring& cf_name) const
|
||||
bool schema_altering_statement::depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -79,9 +79,7 @@ protected:
|
||||
*/
|
||||
virtual future<> grant_permissions_to_creator(const service::client_state&) const;
|
||||
|
||||
virtual bool depends_on_keyspace(const sstring& ks_name) const override;
|
||||
|
||||
virtual bool depends_on_column_family(const sstring& cf_name) const override;
|
||||
virtual bool depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const override;
|
||||
|
||||
virtual uint32_t get_bound_terms() const override;
|
||||
|
||||
|
||||
@@ -194,12 +194,8 @@ void select_statement::validate(service::storage_proxy&, const service::client_s
|
||||
// Nothing to do, all validation has been done by raw_statement::prepare()
|
||||
}
|
||||
|
||||
bool select_statement::depends_on_keyspace(const sstring& ks_name) const {
|
||||
return keyspace() == ks_name;
|
||||
}
|
||||
|
||||
bool select_statement::depends_on_column_family(const sstring& cf_name) const {
|
||||
return column_family() == cf_name;
|
||||
bool select_statement::depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const {
|
||||
return keyspace() == ks_name && (!cf_name || column_family() == *cf_name);
|
||||
}
|
||||
|
||||
const sstring& select_statement::keyspace() const {
|
||||
@@ -995,6 +991,7 @@ lw_shared_ptr<const service::pager::paging_state> indexed_table_select_statement
|
||||
}
|
||||
|
||||
auto paging_state_copy = make_lw_shared<service::pager::paging_state>(service::pager::paging_state(*paging_state));
|
||||
paging_state_copy->set_remaining(internal_paging_size);
|
||||
paging_state_copy->set_partition_key(std::move(index_pk));
|
||||
paging_state_copy->set_clustering_key(std::move(index_ck));
|
||||
return std::move(paging_state_copy);
|
||||
|
||||
@@ -127,8 +127,7 @@ public:
|
||||
virtual uint32_t get_bound_terms() const override;
|
||||
virtual future<> check_access(service::storage_proxy& proxy, const service::client_state& state) const override;
|
||||
virtual void validate(service::storage_proxy&, const service::client_state& state) const override;
|
||||
virtual bool depends_on_keyspace(const sstring& ks_name) const override;
|
||||
virtual bool depends_on_column_family(const sstring& cf_name) const override;
|
||||
virtual bool depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const override;
|
||||
|
||||
virtual future<::shared_ptr<cql_transport::messages::result_message>> execute(query_processor& qp,
|
||||
service::query_state& state, const query_options& options) const override;
|
||||
|
||||
@@ -30,13 +30,7 @@ uint32_t service_level_statement::get_bound_terms() const {
|
||||
return 0;
|
||||
}
|
||||
|
||||
bool service_level_statement::depends_on_keyspace(
|
||||
const sstring &ks_name) const {
|
||||
return false;
|
||||
}
|
||||
|
||||
bool service_level_statement::depends_on_column_family(
|
||||
const sstring &cf_name) const {
|
||||
bool service_level_statement::depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const {
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
@@ -56,9 +56,7 @@ public:
|
||||
|
||||
uint32_t get_bound_terms() const override;
|
||||
|
||||
bool depends_on_keyspace(const sstring& ks_name) const override;
|
||||
|
||||
bool depends_on_column_family(const sstring& cf_name) const override;
|
||||
bool depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const override;
|
||||
|
||||
future<> check_access(service::storage_proxy& sp, const service::client_state& state) const override;
|
||||
|
||||
|
||||
@@ -43,7 +43,7 @@ void sl_prop_defs::validate() {
|
||||
data_value v = duration_type->deserialize(duration_type->from_string(*repr));
|
||||
cql_duration duration = static_pointer_cast<const duration_type_impl>(duration_type)->from_value(v);
|
||||
if (duration.months || duration.days) {
|
||||
throw exceptions::invalid_request_exception("Timeout values cannot be longer than 24h");
|
||||
throw exceptions::invalid_request_exception("Timeout values cannot be expressed in days/months");
|
||||
}
|
||||
if (duration.nanoseconds % 1'000'000 != 0) {
|
||||
throw exceptions::invalid_request_exception("Timeout values must be expressed in millisecond granularity");
|
||||
|
||||
@@ -67,12 +67,7 @@ std::unique_ptr<prepared_statement> truncate_statement::prepare(database& db,cql
|
||||
return std::make_unique<prepared_statement>(::make_shared<truncate_statement>(*this));
|
||||
}
|
||||
|
||||
bool truncate_statement::depends_on_keyspace(const sstring& ks_name) const
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
bool truncate_statement::depends_on_column_family(const sstring& cf_name) const
|
||||
bool truncate_statement::depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -58,9 +58,7 @@ public:
|
||||
|
||||
virtual std::unique_ptr<prepared_statement> prepare(database& db, cql_stats& stats) override;
|
||||
|
||||
virtual bool depends_on_keyspace(const sstring& ks_name) const override;
|
||||
|
||||
virtual bool depends_on_column_family(const sstring& cf_name) const override;
|
||||
virtual bool depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const override;
|
||||
|
||||
virtual future<> check_access(service::storage_proxy& proxy, const service::client_state& state) const override;
|
||||
|
||||
|
||||
@@ -53,6 +53,7 @@
|
||||
#include "types/list.hh"
|
||||
#include "types/user.hh"
|
||||
#include "concrete_types.hh"
|
||||
#include "validation.hh"
|
||||
|
||||
namespace cql3 {
|
||||
|
||||
@@ -251,6 +252,7 @@ insert_prepared_json_statement::build_partition_keys(const query_options& option
|
||||
exploded.emplace_back(json_value->second);
|
||||
}
|
||||
auto pkey = partition_key::from_optional_exploded(*s, std::move(exploded));
|
||||
validation::validate_cql_key(*s, pkey);
|
||||
auto k = query::range<query::ring_position>::make_singular(dht::decorate_key(*s, std::move(pkey)));
|
||||
ranges.emplace_back(std::move(k));
|
||||
return ranges;
|
||||
|
||||
@@ -74,12 +74,7 @@ std::unique_ptr<prepared_statement> use_statement::prepare(database& db, cql_sta
|
||||
|
||||
}
|
||||
|
||||
bool use_statement::depends_on_keyspace(const sstring& ks_name) const
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
bool use_statement::depends_on_column_family(const sstring& cf_name) const
|
||||
bool use_statement::depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -59,9 +59,7 @@ public:
|
||||
|
||||
virtual uint32_t get_bound_terms() const override;
|
||||
|
||||
virtual bool depends_on_keyspace(const seastar::sstring& ks_name) const override;
|
||||
|
||||
virtual bool depends_on_column_family(const seastar::sstring& cf_name) const override;
|
||||
virtual bool depends_on(std::string_view ks_name, std::optional<std::string_view> cf_name) const override;
|
||||
|
||||
virtual seastar::future<> check_access(service::storage_proxy& proxy, const service::client_state& state) const override;
|
||||
|
||||
|
||||
60
database.cc
60
database.cc
@@ -926,10 +926,9 @@ bool database::update_column_family(schema_ptr new_schema) {
|
||||
return columns_changed;
|
||||
}
|
||||
|
||||
future<> database::remove(const column_family& cf) noexcept {
|
||||
void database::remove(const table& cf) noexcept {
|
||||
auto s = cf.schema();
|
||||
auto& ks = find_keyspace(s->ks_name());
|
||||
co_await _querier_cache.evict_all_for_table(s->id());
|
||||
_column_families.erase(s->id());
|
||||
ks.metadata()->remove_column_family(s);
|
||||
_ks_cf_to_uuid.erase(std::make_pair(s->ks_name(), s->cf_name()));
|
||||
@@ -946,13 +945,20 @@ future<> database::drop_column_family(const sstring& ks_name, const sstring& cf_
|
||||
auto& ks = find_keyspace(ks_name);
|
||||
auto uuid = find_uuid(ks_name, cf_name);
|
||||
auto cf = _column_families.at(uuid);
|
||||
co_await remove(*cf);
|
||||
remove(*cf);
|
||||
cf->clear_views();
|
||||
co_return co_await cf->await_pending_ops().then([this, &ks, cf, tsf = std::move(tsf), snapshot] {
|
||||
return truncate(ks, *cf, std::move(tsf), snapshot).finally([this, cf] {
|
||||
return cf->stop();
|
||||
});
|
||||
}).finally([cf] {});
|
||||
co_await cf->await_pending_ops();
|
||||
co_await _querier_cache.evict_all_for_table(cf->schema()->id());
|
||||
std::exception_ptr ex;
|
||||
try {
|
||||
co_await truncate(ks, *cf, std::move(tsf), snapshot);
|
||||
} catch (...) {
|
||||
ex = std::current_exception();
|
||||
}
|
||||
co_await cf->stop();
|
||||
if (ex) {
|
||||
std::rethrow_exception(std::move(ex));
|
||||
}
|
||||
}
|
||||
|
||||
const utils::UUID& database::find_uuid(std::string_view ks, std::string_view cf) const {
|
||||
@@ -1348,44 +1354,6 @@ database::existing_index_names(const sstring& ks_name, const sstring& cf_to_excl
|
||||
return names;
|
||||
}
|
||||
|
||||
// Based on:
|
||||
// - org.apache.cassandra.db.AbstractCell#reconcile()
|
||||
// - org.apache.cassandra.db.BufferExpiringCell#reconcile()
|
||||
// - org.apache.cassandra.db.BufferDeletedCell#reconcile()
|
||||
std::strong_ordering
|
||||
compare_atomic_cell_for_merge(atomic_cell_view left, atomic_cell_view right) {
|
||||
if (left.timestamp() != right.timestamp()) {
|
||||
return left.timestamp() <=> right.timestamp();
|
||||
}
|
||||
if (left.is_live() != right.is_live()) {
|
||||
return left.is_live() ? std::strong_ordering::less : std::strong_ordering::greater;
|
||||
}
|
||||
if (left.is_live()) {
|
||||
auto c = compare_unsigned(left.value(), right.value()) <=> 0;
|
||||
if (c != 0) {
|
||||
return c;
|
||||
}
|
||||
if (left.is_live_and_has_ttl() != right.is_live_and_has_ttl()) {
|
||||
// prefer expiring cells.
|
||||
return left.is_live_and_has_ttl() ? std::strong_ordering::greater : std::strong_ordering::less;
|
||||
}
|
||||
if (left.is_live_and_has_ttl() && left.expiry() != right.expiry()) {
|
||||
return left.expiry() <=> right.expiry();
|
||||
}
|
||||
} else {
|
||||
// Both are deleted
|
||||
if (left.deletion_time() != right.deletion_time()) {
|
||||
// Origin compares big-endian serialized deletion time. That's because it
|
||||
// delegates to AbstractCell.reconcile() which compares values after
|
||||
// comparing timestamps, which in case of deleted cells will hold
|
||||
// serialized expiry.
|
||||
return (uint64_t) left.deletion_time().time_since_epoch().count()
|
||||
<=> (uint64_t) right.deletion_time().time_since_epoch().count();
|
||||
}
|
||||
}
|
||||
return std::strong_ordering::equal;
|
||||
}
|
||||
|
||||
future<std::tuple<lw_shared_ptr<query::result>, cache_temperature>>
|
||||
database::query(schema_ptr s, const query::read_command& cmd, query::result_options opts, const dht::partition_range_vector& ranges,
|
||||
tracing::trace_state_ptr trace_state, db::timeout_clock::time_point timeout) {
|
||||
|
||||
@@ -1384,6 +1384,7 @@ private:
|
||||
Future update_write_metrics(Future&& f);
|
||||
void update_write_metrics_for_timed_out_write();
|
||||
future<> create_keyspace(const lw_shared_ptr<keyspace_metadata>&, bool is_bootstrap, system_keyspace system);
|
||||
void remove(const table&) noexcept;
|
||||
public:
|
||||
static utils::UUID empty_version;
|
||||
|
||||
@@ -1582,7 +1583,6 @@ public:
|
||||
|
||||
bool update_column_family(schema_ptr s);
|
||||
future<> drop_column_family(const sstring& ks_name, const sstring& cf_name, timestamp_func, bool with_snapshot = true);
|
||||
future<> remove(const column_family&) noexcept;
|
||||
|
||||
const logalloc::region_group& dirty_memory_region_group() const {
|
||||
return _dirty_memory_manager.region_group();
|
||||
|
||||
@@ -428,6 +428,8 @@ private:
|
||||
void abort_recycled_list(std::exception_ptr);
|
||||
void abort_deletion_promise(std::exception_ptr);
|
||||
|
||||
future<> recalculate_footprint();
|
||||
|
||||
future<> rename_file(sstring, sstring) const;
|
||||
size_t max_request_controller_units() const;
|
||||
segment_id_type _ids = 0;
|
||||
@@ -444,6 +446,7 @@ private:
|
||||
seastar::gate _gate;
|
||||
uint64_t _new_counter = 0;
|
||||
std::optional<size_t> _disk_write_alignment;
|
||||
seastar::semaphore _reserve_recalculation_guard;
|
||||
};
|
||||
|
||||
template<typename T>
|
||||
@@ -512,6 +515,7 @@ class db::commitlog::segment : public enable_shared_from_this<segment>, public c
|
||||
uint64_t _file_pos = 0;
|
||||
uint64_t _flush_pos = 0;
|
||||
uint64_t _size_on_disk = 0;
|
||||
uint64_t _waste = 0;
|
||||
|
||||
size_t _alignment;
|
||||
|
||||
@@ -598,7 +602,7 @@ public:
|
||||
clogger.debug("Segment {} is no longer active and will submitted for delete now", *this);
|
||||
++_segment_manager->totals.segments_destroyed;
|
||||
_segment_manager->totals.active_size_on_disk -= file_position();
|
||||
_segment_manager->totals.wasted_size_on_disk -= (_size_on_disk - file_position());
|
||||
_segment_manager->totals.wasted_size_on_disk -= _waste;
|
||||
_segment_manager->add_file_to_delete(_file_name, _desc);
|
||||
} else if (_segment_manager->cfg.warn_about_segments_left_on_disk_after_shutdown) {
|
||||
clogger.warn("Segment {} is dirty and is left on disk.", *this);
|
||||
@@ -725,7 +729,8 @@ public:
|
||||
auto s = co_await sync();
|
||||
co_await flush();
|
||||
co_await terminate();
|
||||
_segment_manager->totals.wasted_size_on_disk += (_size_on_disk - file_position());
|
||||
_waste = _size_on_disk - file_position();
|
||||
_segment_manager->totals.wasted_size_on_disk += _waste;
|
||||
co_return s;
|
||||
}
|
||||
future<sseg_ptr> do_flush(uint64_t pos) {
|
||||
@@ -1223,6 +1228,7 @@ db::commitlog::segment_manager::segment_manager(config c)
|
||||
, _recycled_segments(std::numeric_limits<size_t>::max())
|
||||
, _reserve_replenisher(make_ready_future<>())
|
||||
, _background_sync(make_ready_future<>())
|
||||
, _reserve_recalculation_guard(1)
|
||||
{
|
||||
assert(max_size > 0);
|
||||
assert(max_mutation_size < segment::multi_entry_size_magic);
|
||||
@@ -1248,6 +1254,11 @@ future<> db::commitlog::segment_manager::replenish_reserve() {
|
||||
}
|
||||
try {
|
||||
gate::holder g(_gate);
|
||||
auto guard = co_await get_units(_reserve_recalculation_guard, 1);
|
||||
if (_reserve_segments.full()) {
|
||||
// can happen if we recalculate
|
||||
continue;
|
||||
}
|
||||
// note: if we were strict with disk size, we would refuse to do this
|
||||
// unless disk footprint is lower than threshold. but we cannot (yet?)
|
||||
// trust that flush logic will absolutely free up an existing
|
||||
@@ -1519,7 +1530,7 @@ future<db::commitlog::segment_manager::sseg_ptr> db::commitlog::segment_manager:
|
||||
|
||||
if (cfg.extensions && !cfg.extensions->commitlog_file_extensions().empty()) {
|
||||
for (auto * ext : cfg.extensions->commitlog_file_extensions()) {
|
||||
auto nf = co_await ext->wrap_file(std::move(filename), f, flags);
|
||||
auto nf = co_await ext->wrap_file(filename, f, flags);
|
||||
if (nf) {
|
||||
f = std::move(nf);
|
||||
align = is_overwrite ? f.disk_overwrite_dma_alignment() : f.disk_write_dma_alignment();
|
||||
@@ -1530,12 +1541,21 @@ future<db::commitlog::segment_manager::sseg_ptr> db::commitlog::segment_manager:
|
||||
f = make_checked_file(commit_error_handler, std::move(f));
|
||||
} catch (...) {
|
||||
ep = std::current_exception();
|
||||
commit_error_handler(ep);
|
||||
}
|
||||
if (ep) {
|
||||
// do this early, so iff we are to fast-fail server,
|
||||
// we do it before anything else can go wrong.
|
||||
try {
|
||||
commit_error_handler(ep);
|
||||
} catch (...) {
|
||||
ep = std::current_exception();
|
||||
}
|
||||
}
|
||||
if (ep && f) {
|
||||
co_await f.close();
|
||||
}
|
||||
if (ep) {
|
||||
add_file_to_delete(filename, d);
|
||||
co_return coroutine::exception(std::move(ep));
|
||||
}
|
||||
|
||||
@@ -1594,6 +1614,8 @@ future<db::commitlog::segment_manager::sseg_ptr> db::commitlog::segment_manager:
|
||||
}
|
||||
|
||||
future<db::commitlog::segment_manager::sseg_ptr> db::commitlog::segment_manager::new_segment() {
|
||||
gate::holder g(_gate);
|
||||
|
||||
if (_shutdown) {
|
||||
co_return coroutine::make_exception(std::runtime_error("Commitlog has been shut down. Cannot add data"));
|
||||
}
|
||||
@@ -1628,22 +1650,23 @@ future<db::commitlog::segment_manager::sseg_ptr> db::commitlog::segment_manager:
|
||||
co_return _segments.back();
|
||||
}
|
||||
|
||||
if (_segment_allocating) {
|
||||
co_await _segment_allocating->get_future(timeout);
|
||||
continue;
|
||||
}
|
||||
|
||||
promise<> p;
|
||||
_segment_allocating.emplace(p.get_future());
|
||||
auto finally = defer([&] () noexcept { _segment_allocating = std::nullopt; });
|
||||
try {
|
||||
gate::holder g(_gate);
|
||||
auto s = co_await with_timeout(timeout, new_segment());
|
||||
p.set_value();
|
||||
} catch (...) {
|
||||
p.set_exception(std::current_exception());
|
||||
throw;
|
||||
// #9896 - we don't want to issue a new_segment call until
|
||||
// the old one has terminated with either result or exception.
|
||||
// Do all waiting through the shared_future
|
||||
if (!_segment_allocating) {
|
||||
auto f = new_segment();
|
||||
// must check that we are not already done.
|
||||
if (f.available()) {
|
||||
f.get(); // maybe force exception
|
||||
continue;
|
||||
}
|
||||
_segment_allocating.emplace(f.discard_result().finally([this] {
|
||||
// clear the shared_future _before_ resolving its contents
|
||||
// (i.e. with result of this finally)
|
||||
_segment_allocating = std::nullopt;
|
||||
}));
|
||||
}
|
||||
co_await _segment_allocating->get_future(timeout);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1865,6 +1888,8 @@ future<> db::commitlog::segment_manager::delete_segments(std::vector<sstring> fi
|
||||
|
||||
std::exception_ptr recycle_error;
|
||||
|
||||
size_t num_deleted = 0;
|
||||
bool except = false;
|
||||
while (!files.empty()) {
|
||||
auto filename = std::move(files.back());
|
||||
files.pop_back();
|
||||
@@ -1914,8 +1939,10 @@ future<> db::commitlog::segment_manager::delete_segments(std::vector<sstring> fi
|
||||
}
|
||||
}
|
||||
co_await delete_file(filename);
|
||||
++num_deleted;
|
||||
} catch (...) {
|
||||
clogger.error("Could not delete segment {}: {}", filename, std::current_exception());
|
||||
except = true;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1928,6 +1955,16 @@ future<> db::commitlog::segment_manager::delete_segments(std::vector<sstring> fi
|
||||
if (recycle_error && _recycled_segments.empty()) {
|
||||
abort_recycled_list(recycle_error);
|
||||
}
|
||||
// If recycle failed and turned into a delete, we should fake-wakeup waiters
|
||||
// since we might still have cleaned up disk space.
|
||||
if (!recycle_error && num_deleted && cfg.reuse_segments && _recycled_segments.empty()) {
|
||||
abort_recycled_list(std::make_exception_ptr(std::runtime_error("deleted files")));
|
||||
}
|
||||
|
||||
// #9348 - if we had an exception, we can't trust our bookkeeping any more. recalculate.
|
||||
if (except) {
|
||||
co_await recalculate_footprint();
|
||||
}
|
||||
}
|
||||
|
||||
void db::commitlog::segment_manager::abort_recycled_list(std::exception_ptr ep) {
|
||||
@@ -1942,6 +1979,67 @@ void db::commitlog::segment_manager::abort_deletion_promise(std::exception_ptr e
|
||||
std::exchange(_disk_deletions, {}).set_exception(ep);
|
||||
}
|
||||
|
||||
future<> db::commitlog::segment_manager::recalculate_footprint() {
|
||||
try {
|
||||
co_await do_pending_deletes();
|
||||
|
||||
auto guard = co_await get_units(_reserve_recalculation_guard, 1);
|
||||
auto segments_copy = _segments;
|
||||
std::vector<sseg_ptr> reserves;
|
||||
std::vector<sstring> recycles;
|
||||
// this causes haywire things while we steal stuff, but...
|
||||
while (!_reserve_segments.empty()) {
|
||||
reserves.push_back(_reserve_segments.pop());
|
||||
}
|
||||
while (!_recycled_segments.empty()) {
|
||||
recycles.push_back(_recycled_segments.pop());
|
||||
}
|
||||
// #9955 - must re-stock the queues before we do anything
|
||||
// interruptable/continuation. Because both queues are
|
||||
// used with push/pop eventually which _waits_ for signal
|
||||
// but does _not_ verify that the condition is true once
|
||||
// we return. So copy the objects and look at instead.
|
||||
for (auto& filename : recycles) {
|
||||
_recycled_segments.push(sstring(filename));
|
||||
}
|
||||
for (auto& s : reserves) {
|
||||
_reserve_segments.push(sseg_ptr(s)); // you can have it back now.
|
||||
}
|
||||
|
||||
// first, guesstimate sizes
|
||||
uint64_t recycle_size = recycles.size() * max_size;
|
||||
auto old = totals.total_size_on_disk;
|
||||
|
||||
totals.total_size_on_disk = recycle_size;
|
||||
for (auto& s : _segments) {
|
||||
totals.total_size_on_disk += s->_size_on_disk;
|
||||
}
|
||||
for (auto& s : reserves) {
|
||||
totals.total_size_on_disk += s->_size_on_disk;
|
||||
}
|
||||
|
||||
// now we need to adjust the actual sizes of recycled files
|
||||
|
||||
uint64_t actual_recycled_size = 0;
|
||||
|
||||
try {
|
||||
for (auto& filename : recycles) {
|
||||
auto s = co_await seastar::file_size(filename);
|
||||
actual_recycled_size += s;
|
||||
}
|
||||
} catch (...) {
|
||||
clogger.error("Exception reading disk footprint ({}).", std::current_exception());
|
||||
actual_recycled_size = recycle_size; // best we got
|
||||
}
|
||||
|
||||
totals.total_size_on_disk += actual_recycled_size - recycle_size;
|
||||
// pushing things to reserve/recycled queues will have resumed any
|
||||
// waiters, so we should be done.
|
||||
} catch (...) {
|
||||
clogger.error("Exception recalculating disk footprint ({}). Values might be off...", std::current_exception());
|
||||
}
|
||||
}
|
||||
|
||||
future<> db::commitlog::segment_manager::do_pending_deletes() {
|
||||
auto ftc = std::exchange(_files_to_close, {});
|
||||
auto ftd = std::exchange(_files_to_delete, {});
|
||||
|
||||
@@ -119,8 +119,9 @@ future<> snapshot_ctl::take_column_family_snapshot(sstring ks_name, std::vector<
|
||||
return check_snapshot_not_exist(ks_name, tag, tables).then([this, ks_name, tables, tag, sf] {
|
||||
return do_with(std::vector<sstring>(std::move(tables)),[this, ks_name, tag, sf](const std::vector<sstring>& tables) {
|
||||
return do_for_each(tables, [ks_name, tag, sf, this] (const sstring& table_name) {
|
||||
if (table_name.find(".") != sstring::npos) {
|
||||
throw std::invalid_argument("Cannot take a snapshot of a secondary index by itself. Run snapshot on the table that owns the index.");
|
||||
auto& cf = _db.local().find_column_family(ks_name, table_name);
|
||||
if (cf.schema()->is_view()) {
|
||||
throw std::invalid_argument("Do not take a snapshot of a materialized view or a secondary index by itself. Run snapshot on the base table instead.");
|
||||
}
|
||||
return _db.invoke_on_all([ks_name, table_name, tag, sf] (database &db) {
|
||||
auto& cf = db.find_column_family(ks_name, table_name);
|
||||
|
||||
@@ -350,7 +350,11 @@ public:
|
||||
view_filter_checking_visitor(const schema& base, const view_info& view)
|
||||
: _base(base)
|
||||
, _view(view)
|
||||
, _selection(cql3::selection::selection::wildcard(_base.shared_from_this()))
|
||||
, _selection(cql3::selection::selection::for_columns(_base.shared_from_this(),
|
||||
boost::copy_range<std::vector<const column_definition*>>(
|
||||
_base.regular_columns() | boost::adaptors::transformed([] (const column_definition& cdef) { return &cdef; }))
|
||||
)
|
||||
)
|
||||
{}
|
||||
|
||||
void accept_new_partition(const partition_key& key, uint64_t row_count) {
|
||||
@@ -1320,7 +1324,7 @@ future<> mutate_MV(
|
||||
auto mut_ptr = remote_endpoints.empty() ? std::make_unique<frozen_mutation>(std::move(mut.fm)) : std::make_unique<frozen_mutation>(mut.fm);
|
||||
tracing::trace(tr_state, "Locally applying view update for {}.{}; base token = {}; view token = {}",
|
||||
mut.s->ks_name(), mut.s->cf_name(), base_token, view_token);
|
||||
local_view_update = service::get_local_storage_proxy().mutate_locally(mut.s, *mut_ptr, std::move(tr_state), db::commitlog::force_sync::no).then_wrapped(
|
||||
local_view_update = service::get_local_storage_proxy().mutate_locally(mut.s, *mut_ptr, tr_state, db::commitlog::force_sync::no).then_wrapped(
|
||||
[s = mut.s, &stats, &cf_stats, tr_state, base_token, view_token, my_address, mut_ptr = std::move(mut_ptr),
|
||||
units = sem_units.split(sem_units.count())] (future<>&& f) {
|
||||
--stats.writes;
|
||||
|
||||
@@ -215,6 +215,12 @@ public:
|
||||
});
|
||||
}
|
||||
|
||||
future<flush_permit> get_all_flush_permits() {
|
||||
return get_units(_background_work_flush_serializer, _max_background_work).then([this] (auto&& units) {
|
||||
return this->get_flush_permit(std::move(units));
|
||||
});
|
||||
}
|
||||
|
||||
bool has_extraneous_flushes_requested() const {
|
||||
return _extraneous_flushes > 0;
|
||||
}
|
||||
|
||||
1
dist/common/scripts/scylla-housekeeping
vendored
1
dist/common/scripts/scylla-housekeeping
vendored
@@ -100,6 +100,7 @@ def version_compare(a, b):
|
||||
def create_uuid_file(fl):
    """Create the UUID file at path ``fl``.

    Writes a freshly generated ``uuid.uuid1()`` value followed by a newline
    (overwriting any existing content), then sets the file mode to 0o644.

    :param fl: path of the UUID file to create.
    """
    # Use the path handed in by the caller instead of reaching for the global
    # ``args.uuid_file``, so the parameter is actually honoured.
    with open(fl, 'w') as myfile:
        myfile.write(str(uuid.uuid1()) + "\n")
    os.chmod(fl, 0o644)
|
||||
|
||||
|
||||
def sanitize_version(version):
|
||||
|
||||
6
dist/common/scripts/scylla_coredump_setup
vendored
6
dist/common/scripts/scylla_coredump_setup
vendored
@@ -127,10 +127,14 @@ WantedBy=multi-user.target
|
||||
# - Storage: /path/to/file (inacessible)
|
||||
# - Storage: /path/to/file
|
||||
#
|
||||
# After systemd-v248, available coredump file output changed like this:
|
||||
# - Storage: /path/to/file (present)
|
||||
# We need to support both versions.
|
||||
#
|
||||
# reference: https://github.com/systemd/systemd/commit/47f50642075a7a215c9f7b600599cbfee81a2913
|
||||
|
||||
corefail = False
|
||||
res = re.findall(r'Storage: (.*)$', coreinfo, flags=re.MULTILINE)
|
||||
res = re.findall(r'Storage: (\S+)(?: \(.+\))?$', coreinfo, flags=re.MULTILINE)
|
||||
# v232 or later
|
||||
if res:
|
||||
corepath = res[0]
|
||||
|
||||
60
dist/common/scripts/scylla_io_setup
vendored
60
dist/common/scripts/scylla_io_setup
vendored
@@ -278,6 +278,66 @@ if __name__ == "__main__":
|
||||
disk_properties["read_bandwidth"] = 2527296683 * nr_disks
|
||||
disk_properties["write_iops"] = 156326 * nr_disks
|
||||
disk_properties["write_bandwidth"] = 1063657088 * nr_disks
|
||||
elif idata.instance() == "im4gn.large":
|
||||
disk_properties["read_iops"] = 33943
|
||||
disk_properties["read_bandwidth"] = 288433525
|
||||
disk_properties["write_iops"] = 27877
|
||||
disk_properties["write_bandwidth"] = 126864680
|
||||
elif idata.instance() == "im4gn.xlarge":
|
||||
disk_properties["read_iops"] = 68122
|
||||
disk_properties["read_bandwidth"] = 576603520
|
||||
disk_properties["write_iops"] = 55246
|
||||
disk_properties["write_bandwidth"] = 254534954
|
||||
elif idata.instance() == "im4gn.2xlarge":
|
||||
disk_properties["read_iops"] = 136422
|
||||
disk_properties["read_bandwidth"] = 1152663765
|
||||
disk_properties["write_iops"] = 92184
|
||||
disk_properties["write_bandwidth"] = 508926453
|
||||
elif idata.instance() == "im4gn.4xlarge":
|
||||
disk_properties["read_iops"] = 273050
|
||||
disk_properties["read_bandwidth"] = 1638427264
|
||||
disk_properties["write_iops"] = 92173
|
||||
disk_properties["write_bandwidth"] = 1027966826
|
||||
elif idata.instance() == "im4gn.8xlarge":
|
||||
disk_properties["read_iops"] = 250241 * nr_disks
|
||||
disk_properties["read_bandwidth"] = 1163130709 * nr_disks
|
||||
disk_properties["write_iops"] = 86374 * nr_disks
|
||||
disk_properties["write_bandwidth"] = 977617664 * nr_disks
|
||||
elif idata.instance() == "im4gn.16xlarge":
|
||||
disk_properties["read_iops"] = 273030 * nr_disks
|
||||
disk_properties["read_bandwidth"] = 1638211413 * nr_disks
|
||||
disk_properties["write_iops"] = 92607 * nr_disks
|
||||
disk_properties["write_bandwidth"] = 1028340266 * nr_disks
|
||||
elif idata.instance() == "is4gen.medium":
|
||||
disk_properties["read_iops"] = 33965
|
||||
disk_properties["read_bandwidth"] = 288462506
|
||||
disk_properties["write_iops"] = 27876
|
||||
disk_properties["write_bandwidth"] = 126954200
|
||||
elif idata.instance() == "is4gen.large":
|
||||
disk_properties["read_iops"] = 68131
|
||||
disk_properties["read_bandwidth"] = 576654869
|
||||
disk_properties["write_iops"] = 55257
|
||||
disk_properties["write_bandwidth"] = 254551002
|
||||
elif idata.instance() == "is4gen.xlarge":
|
||||
disk_properties["read_iops"] = 136413
|
||||
disk_properties["read_bandwidth"] = 1152747904
|
||||
disk_properties["write_iops"] = 92180
|
||||
disk_properties["write_bandwidth"] = 508889546
|
||||
elif idata.instance() == "is4gen.2xlarge":
|
||||
disk_properties["read_iops"] = 273038
|
||||
disk_properties["read_bandwidth"] = 1628982613
|
||||
disk_properties["write_iops"] = 92182
|
||||
disk_properties["write_bandwidth"] = 1027983530
|
||||
elif idata.instance() == "is4gen.4xlarge":
|
||||
disk_properties["read_iops"] = 260493 * nr_disks
|
||||
disk_properties["read_bandwidth"] = 1217396928 * nr_disks
|
||||
disk_properties["write_iops"] = 83169 * nr_disks
|
||||
disk_properties["write_bandwidth"] = 1000390784 * nr_disks
|
||||
elif idata.instance() == "is4gen.8xlarge":
|
||||
disk_properties["read_iops"] = 273021 * nr_disks
|
||||
disk_properties["read_bandwidth"] = 1656354602 * nr_disks
|
||||
disk_properties["write_iops"] = 92233 * nr_disks
|
||||
disk_properties["write_bandwidth"] = 1028010325 * nr_disks
|
||||
properties_file = open(etcdir() + "/scylla.d/io_properties.yaml", "w")
|
||||
yaml.dump({ "disks": [ disk_properties ] }, properties_file, default_flow_style=False)
|
||||
ioconf = open(etcdir() + "/scylla.d/io.conf", "w")
|
||||
|
||||
6
dist/common/scripts/scylla_ntp_setup
vendored
6
dist/common/scripts/scylla_ntp_setup
vendored
@@ -66,18 +66,18 @@ if __name__ == '__main__':
|
||||
|
||||
target = None
|
||||
if os.path.exists('/lib/systemd/systemd-timesyncd'):
|
||||
if systemd_unit('systemd-timesyncd').is_active():
|
||||
if systemd_unit('systemd-timesyncd').is_active() == 'active':
|
||||
print('ntp is already configured, skip setup')
|
||||
sys.exit(0)
|
||||
target = 'systemd-timesyncd'
|
||||
if shutil.which('chronyd'):
|
||||
if get_chrony_unit().is_active():
|
||||
if get_chrony_unit().is_active() == 'active':
|
||||
print('ntp is already configured, skip setup')
|
||||
sys.exit(0)
|
||||
if not target:
|
||||
target = 'chrony'
|
||||
if shutil.which('ntpd'):
|
||||
if get_ntp_unit().is_active():
|
||||
if get_ntp_unit().is_active() == 'active':
|
||||
print('ntp is already configured, skip setup')
|
||||
sys.exit(0)
|
||||
if not target:
|
||||
|
||||
19
dist/common/scripts/scylla_raid_setup
vendored
19
dist/common/scripts/scylla_raid_setup
vendored
@@ -117,10 +117,11 @@ if __name__ == '__main__':
|
||||
pkg_install('xfsprogs')
|
||||
if not shutil.which('mdadm'):
|
||||
pkg_install('mdadm')
|
||||
try:
|
||||
md_service = systemd_unit('mdmonitor.service')
|
||||
except SystemdException:
|
||||
md_service = systemd_unit('mdadm.service')
|
||||
if args.raid_level != '0':
|
||||
try:
|
||||
md_service = systemd_unit('mdmonitor.service')
|
||||
except SystemdException:
|
||||
md_service = systemd_unit('mdadm.service')
|
||||
|
||||
# The RAID label must be an f-string so the configured level is interpolated
# (previously the literal text "fRAID{args.raid_level}" was printed).
print('Creating {type} for scylla using {nr_disk} disk(s): {disks}'.format(type=f'RAID{args.raid_level}' if raid else 'XFS volume', nr_disk=len(disks), disks=args.disks))
|
||||
procs=[]
|
||||
@@ -164,14 +165,15 @@ if __name__ == '__main__':
|
||||
|
||||
uuid = run(f'blkid -s UUID -o value {fsdev}', shell=True, check=True, capture_output=True, encoding='utf-8').stdout.strip()
|
||||
after = 'local-fs.target'
|
||||
if raid:
|
||||
wants = ''
|
||||
if raid and args.raid_level != '0':
|
||||
after += f' {md_service}'
|
||||
wants = f'\nWants={md_service}'
|
||||
unit_data = f'''
|
||||
[Unit]
|
||||
Description=Scylla data directory
|
||||
Before=scylla-server.service
|
||||
After={after}
|
||||
Wants={md_service}
|
||||
After={after}{wants}
|
||||
DefaultDependencies=no
|
||||
|
||||
[Mount]
|
||||
@@ -195,7 +197,8 @@ WantedBy=multi-user.target
|
||||
f.write(f'RequiresMountsFor={mount_at}\n')
|
||||
|
||||
systemd_unit.reload()
|
||||
md_service.start()
|
||||
if args.raid_level != '0':
|
||||
md_service.start()
|
||||
mount = systemd_unit(mntunit_bn)
|
||||
mount.start()
|
||||
if args.enable_on_nextboot:
|
||||
|
||||
4
dist/common/scripts/scylla_setup
vendored
4
dist/common/scripts/scylla_setup
vendored
@@ -370,6 +370,10 @@ if __name__ == '__main__':
|
||||
version_check = interactive_ask_service('Do you want to enable Scylla to check if there is a newer version of Scylla available?', 'Yes - start the Scylla-housekeeping service to check for a newer version. This check runs periodically. No - skips this step.', version_check)
|
||||
args.no_version_check = not version_check
|
||||
if version_check:
|
||||
cfg = sysconfig_parser(sysconfdir_p() / 'scylla-housekeeping')
|
||||
repo_files = cfg.get('REPO_FILES')
|
||||
for f in glob.glob(repo_files):
|
||||
os.chmod(f, 0o644)
|
||||
with open('/etc/scylla.d/housekeeping.cfg', 'w') as f:
|
||||
f.write('[housekeeping]\ncheck-version: True\n')
|
||||
os.chmod('/etc/scylla.d/housekeeping.cfg', 0o644)
|
||||
|
||||
6
dist/common/scripts/scylla_util.py
vendored
6
dist/common/scripts/scylla_util.py
vendored
@@ -674,7 +674,7 @@ class aws_instance:
|
||||
return self._type.split(".")[0]
|
||||
|
||||
def is_supported_instance_class(self):
|
||||
if self.instance_class() in ['i2', 'i3', 'i3en', 'c5d', 'm5d', 'm5ad', 'r5d', 'z1d', 'c6gd', 'm6gd', 'r6gd', 'x2gd']:
|
||||
if self.instance_class() in ['i2', 'i3', 'i3en', 'c5d', 'm5d', 'm5ad', 'r5d', 'z1d', 'c6gd', 'm6gd', 'r6gd', 'x2gd', 'im4gn', 'is4gen']:
|
||||
return True
|
||||
return False
|
||||
|
||||
@@ -683,7 +683,7 @@ class aws_instance:
|
||||
instance_size = self.instance_size()
|
||||
if instance_class in ['c3', 'c4', 'd2', 'i2', 'r3']:
|
||||
return 'ixgbevf'
|
||||
if instance_class in ['a1', 'c5', 'c5a', 'c5d', 'c5n', 'c6g', 'c6gd', 'f1', 'g3', 'g4', 'h1', 'i3', 'i3en', 'inf1', 'm5', 'm5a', 'm5ad', 'm5d', 'm5dn', 'm5n', 'm6g', 'm6gd', 'p2', 'p3', 'r4', 'r5', 'r5a', 'r5ad', 'r5b', 'r5d', 'r5dn', 'r5n', 't3', 't3a', 'u-6tb1', 'u-9tb1', 'u-12tb1', 'u-18tn1', 'u-24tb1', 'x1', 'x1e', 'z1d', 'c6g', 'c6gd', 'm6g', 'm6gd', 't4g', 'r6g', 'r6gd', 'x2gd']:
|
||||
if instance_class in ['a1', 'c5', 'c5a', 'c5d', 'c5n', 'c6g', 'c6gd', 'f1', 'g3', 'g4', 'h1', 'i3', 'i3en', 'inf1', 'm5', 'm5a', 'm5ad', 'm5d', 'm5dn', 'm5n', 'm6g', 'm6gd', 'p2', 'p3', 'r4', 'r5', 'r5a', 'r5ad', 'r5b', 'r5d', 'r5dn', 'r5n', 't3', 't3a', 'u-6tb1', 'u-9tb1', 'u-12tb1', 'u-18tn1', 'u-24tb1', 'x1', 'x1e', 'z1d', 'c6g', 'c6gd', 'm6g', 'm6gd', 't4g', 'r6g', 'r6gd', 'x2gd', 'im4gn', 'is4gen']:
|
||||
return 'ena'
|
||||
if instance_class == 'm4':
|
||||
if instance_size == '16xlarge':
|
||||
@@ -1041,7 +1041,7 @@ class systemd_unit:
|
||||
return run('systemctl {} disable {}'.format(self.ctlparam, self._unit), shell=True, check=True)
|
||||
|
||||
def is_active(self):
|
||||
return True if run('systemctl {} is-active {}'.format(self.ctlparam, self._unit), shell=True, capture_output=True, encoding='utf-8').stdout.strip() == 'active' else False
|
||||
return run('systemctl {} is-active {}'.format(self.ctlparam, self._unit), shell=True, capture_output=True, encoding='utf-8').stdout.strip()
|
||||
|
||||
def mask(self):
|
||||
return run('systemctl {} mask {}'.format(self.ctlparam, self._unit), shell=True, check=True)
|
||||
|
||||
6
dist/common/supervisor/scylla_util.sh
vendored
6
dist/common/supervisor/scylla_util.sh
vendored
@@ -6,12 +6,16 @@ is_nonroot() {
|
||||
[ -f "$scylladir"/SCYLLA-NONROOT-FILE ]
|
||||
}
|
||||
|
||||
# Succeeds when the marker file SCYLLA-CONTAINER-FILE exists as a regular
# file under "$scylladir".
is_container() {
    test -f "$scylladir"/SCYLLA-CONTAINER-FILE
}
|
||||
|
||||
# Succeeds when the effective user id is 0; falls back to $UID when $EUID
# is unset or empty.
is_privileged() {
    test ${EUID:-${UID}} = 0
}
|
||||
|
||||
execsudo() {
|
||||
if is_nonroot; then
|
||||
if is_nonroot || is_container; then
|
||||
exec "$@"
|
||||
else
|
||||
exec sudo -u scylla -g scylla "$@"
|
||||
|
||||
8
dist/docker/debian/build_docker.sh
vendored
8
dist/docker/debian/build_docker.sh
vendored
@@ -25,6 +25,10 @@ product="$(<build/SCYLLA-PRODUCT-FILE)"
|
||||
version="$(<build/SCYLLA-VERSION-FILE)"
|
||||
release="$(<build/SCYLLA-RELEASE-FILE)"
|
||||
|
||||
if [[ "$version" = *rc* ]]; then
|
||||
version=$(echo $version |sed 's/\(.*\)\.)*/\1~/')
|
||||
fi
|
||||
|
||||
mode="release"
|
||||
|
||||
if uname -m | grep x86_64 ; then
|
||||
@@ -93,12 +97,14 @@ run apt-get -y install hostname supervisor openssh-server openssh-client openjdk
|
||||
run locale-gen en_US.UTF-8
|
||||
run bash -ec "dpkg -i packages/*.deb"
|
||||
run apt-get -y clean all
|
||||
run bash -ec "cat /scylla_bashrc >> /etc/bashrc"
|
||||
run bash -ec "cat /scylla_bashrc >> /etc/bash.bashrc"
|
||||
run mkdir -p /etc/supervisor.conf.d
|
||||
run mkdir -p /var/log/scylla
|
||||
run chown -R scylla:scylla /var/lib/scylla
|
||||
run sed -i -e 's/^SCYLLA_ARGS=".*"$/SCYLLA_ARGS="--log-to-syslog 0 --log-to-stdout 1 --default-log-level info --network-stack posix"/' /etc/default/scylla-server
|
||||
|
||||
run mkdir -p /opt/scylladb/supervisor
|
||||
run touch /opt/scylladb/SCYLLA-CONTAINER-FILE
|
||||
bcp dist/common/supervisor/scylla-server.sh /opt/scylladb/supervisor/scylla-server.sh
|
||||
bcp dist/common/supervisor/scylla-jmx.sh /opt/scylladb/supervisor/scylla-jmx.sh
|
||||
bcp dist/common/supervisor/scylla-node-exporter.sh /opt/scylladb/supervisor/scylla-node-exporter.sh
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
[program:scylla-server]
|
||||
[program:scylla]
|
||||
command=/opt/scylladb/supervisor/scylla-server.sh
|
||||
stdout_logfile=/dev/stdout
|
||||
stdout_logfile_maxbytes=0
|
||||
|
||||
41
dist/docker/etc/sysconfig/scylla-server
vendored
41
dist/docker/etc/sysconfig/scylla-server
vendored
@@ -1,41 +0,0 @@
|
||||
# choose following mode: virtio, dpdk, posix
|
||||
NETWORK_MODE=posix
|
||||
|
||||
# tap device name(virtio)
|
||||
TAP=tap0
|
||||
|
||||
# bridge device name (virtio)
|
||||
BRIDGE=virbr0
|
||||
|
||||
# ethernet device name
|
||||
IFNAME=eth0
|
||||
|
||||
# setup NIC's and disks' interrupts, RPS, XPS, nomerges and I/O scheduler (posix)
|
||||
SET_NIC_AND_DISKS=no
|
||||
|
||||
# ethernet device driver (dpdk)
|
||||
ETHDRV=
|
||||
|
||||
# ethernet device PCI ID (dpdk)
|
||||
ETHPCIID=
|
||||
|
||||
# number of hugepages
|
||||
NR_HUGEPAGES=64
|
||||
|
||||
# user for process (must be root for dpdk)
|
||||
USER=scylla
|
||||
|
||||
# group for process
|
||||
GROUP=scylla
|
||||
|
||||
# scylla home dir
|
||||
SCYLLA_HOME=/var/lib/scylla
|
||||
|
||||
# scylla config dir
|
||||
SCYLLA_CONF=/etc/scylla
|
||||
|
||||
# scylla arguments
|
||||
SCYLLA_ARGS="--log-to-syslog 0 --log-to-stdout 1 --default-log-level info --network-stack posix"
|
||||
|
||||
# setup as AMI instance
|
||||
AMI=no
|
||||
5
dist/docker/scyllasetup.py
vendored
5
dist/docker/scyllasetup.py
vendored
@@ -121,12 +121,13 @@ class ScyllaSetup:
|
||||
if self._apiAddress is not None:
|
||||
args += ["--api-address %s" % self._apiAddress]
|
||||
|
||||
if self._alternatorPort is not None:
|
||||
if self._alternatorAddress is not None:
|
||||
args += ["--alternator-address %s" % self._alternatorAddress]
|
||||
|
||||
if self._alternatorPort is not None:
|
||||
args += ["--alternator-port %s" % self._alternatorPort]
|
||||
|
||||
if self._alternatorHttpsPort is not None:
|
||||
args += ["--alternator-address %s" % self._alternatorAddress]
|
||||
args += ["--alternator-https-port %s" % self._alternatorHttpsPort]
|
||||
|
||||
if self._alternatorWriteIsolation is not None:
|
||||
|
||||
@@ -184,14 +184,18 @@ future<> server::do_accepts(int which, bool keepalive, socket_address server_add
|
||||
_logger.info("exception while advertising new connection: {}", std::current_exception());
|
||||
}
|
||||
// Block while monitoring for lifetime/errors.
|
||||
return conn->process().finally([this, conn] {
|
||||
return unadvertise_connection(conn);
|
||||
}).handle_exception([this] (std::exception_ptr ep) {
|
||||
if (is_broken_pipe_or_connection_reset(ep)) {
|
||||
// expected if another side closes a connection or we're shutting down
|
||||
return;
|
||||
return conn->process().then_wrapped([this, conn] (auto f) {
|
||||
try {
|
||||
f.get();
|
||||
} catch (...) {
|
||||
auto ep = std::current_exception();
|
||||
if (!is_broken_pipe_or_connection_reset(ep)) {
|
||||
// some exceptions are expected if another side closes a connection
|
||||
// or we're shutting down
|
||||
_logger.info("exception while processing connection: {}", ep);
|
||||
}
|
||||
}
|
||||
_logger.info("exception while processing connection: {}", ep);
|
||||
return unadvertise_connection(conn);
|
||||
});
|
||||
});
|
||||
return stop_iteration::no;
|
||||
|
||||
@@ -477,49 +477,42 @@ gossiper::handle_get_endpoint_states_msg(gossip_get_endpoint_states_request requ
|
||||
return make_ready_future<gossip_get_endpoint_states_response>(gossip_get_endpoint_states_response{std::move(map)});
|
||||
}
|
||||
|
||||
// Runs `fn` against the gossiper instance on shard 0 without waiting for it.
//
// The work is started under the _background_msg gate and the resulting
// future is deliberately discarded; any exception it produces is logged as
// a warning tagged with `type` (the message name used in the log line).
// Always returns messaging_service::no_wait() to the caller.
rpc::no_wait_type gossiper::background_msg(sstring type, noncopyable_function<future<>(gossiper&)> fn) {
    (void)with_gate(_background_msg, [this, verb = std::move(type), handler = std::move(fn)] () mutable {
        auto on_shard0 = container().invoke_on(0, std::move(handler));
        return on_shard0.handle_exception([verb = std::move(verb)] (auto ep) {
            logger.warn("Failed to handle {}: {}", verb, ep);
        });
    });
    return messaging_service::no_wait();
}
|
||||
|
||||
void gossiper::init_messaging_service_handler() {
|
||||
_messaging.register_gossip_digest_syn([this] (const rpc::client_info& cinfo, gossip_digest_syn syn_msg) {
|
||||
auto from = netw::messaging_service::get_source(cinfo);
|
||||
// In a new fiber.
|
||||
(void)container().invoke_on(0, [from, syn_msg = std::move(syn_msg)] (gms::gossiper& gossiper) mutable {
|
||||
return background_msg("GOSSIP_DIGEST_SYN", [from, syn_msg = std::move(syn_msg)] (gms::gossiper& gossiper) mutable {
|
||||
return gossiper.handle_syn_msg(from, std::move(syn_msg));
|
||||
}).handle_exception([] (auto ep) {
|
||||
logger.warn("Fail to handle GOSSIP_DIGEST_SYN: {}", ep);
|
||||
});
|
||||
return messaging_service::no_wait();
|
||||
});
|
||||
_messaging.register_gossip_digest_ack([this] (const rpc::client_info& cinfo, gossip_digest_ack msg) {
|
||||
auto from = netw::messaging_service::get_source(cinfo);
|
||||
// In a new fiber.
|
||||
(void)container().invoke_on(0, [from, msg = std::move(msg)] (gms::gossiper& gossiper) mutable {
|
||||
return background_msg("GOSSIP_DIGEST_ACK", [from, msg = std::move(msg)] (gms::gossiper& gossiper) mutable {
|
||||
return gossiper.handle_ack_msg(from, std::move(msg));
|
||||
}).handle_exception([] (auto ep) {
|
||||
logger.warn("Fail to handle GOSSIP_DIGEST_ACK: {}", ep);
|
||||
});
|
||||
return messaging_service::no_wait();
|
||||
});
|
||||
_messaging.register_gossip_digest_ack2([this] (const rpc::client_info& cinfo, gossip_digest_ack2 msg) {
|
||||
auto from = netw::messaging_service::get_source(cinfo);
|
||||
// In a new fiber.
|
||||
(void)container().invoke_on(0, [from, msg = std::move(msg)] (gms::gossiper& gossiper) mutable {
|
||||
return background_msg("GOSSIP_DIGEST_ACK2", [from, msg = std::move(msg)] (gms::gossiper& gossiper) mutable {
|
||||
return gossiper.handle_ack2_msg(from, std::move(msg));
|
||||
}).handle_exception([] (auto ep) {
|
||||
logger.warn("Fail to handle GOSSIP_DIGEST_ACK2: {}", ep);
|
||||
});
|
||||
return messaging_service::no_wait();
|
||||
});
|
||||
_messaging.register_gossip_echo([this] (const rpc::client_info& cinfo, rpc::optional<int64_t> generation_number_opt) {
|
||||
auto from = cinfo.retrieve_auxiliary<gms::inet_address>("baddr");
|
||||
return handle_echo_msg(from, generation_number_opt);
|
||||
});
|
||||
_messaging.register_gossip_shutdown([this] (inet_address from, rpc::optional<int64_t> generation_number_opt) {
|
||||
// In a new fiber.
|
||||
(void)container().invoke_on(0, [from, generation_number_opt] (gms::gossiper& gossiper) {
|
||||
return background_msg("GOSSIP_SHUTDOWN", [from, generation_number_opt] (gms::gossiper& gossiper) {
|
||||
return gossiper.handle_shutdown_msg(from, generation_number_opt);
|
||||
}).handle_exception([] (auto ep) {
|
||||
logger.warn("Fail to handle GOSSIP_SHUTDOWN: {}", ep);
|
||||
});
|
||||
return messaging_service::no_wait();
|
||||
});
|
||||
_messaging.register_gossip_get_endpoint_states([this] (const rpc::client_info& cinfo, gossip_get_endpoint_states_request request) {
|
||||
return container().invoke_on(0, [request = std::move(request)] (gms::gossiper& gossiper) mutable {
|
||||
@@ -1679,6 +1672,10 @@ bool gossiper::is_normal(const inet_address& endpoint) const {
|
||||
return get_gossip_status(endpoint) == sstring(versioned_value::STATUS_NORMAL);
|
||||
}
|
||||
|
||||
bool gossiper::is_left(const inet_address& endpoint) const {
|
||||
return get_gossip_status(endpoint) == sstring(versioned_value::STATUS_LEFT);
|
||||
}
|
||||
|
||||
bool gossiper::is_normal_ring_member(const inet_address& endpoint) const {
|
||||
auto status = get_gossip_status(endpoint);
|
||||
return status == sstring(versioned_value::STATUS_NORMAL) || status == sstring(versioned_value::SHUTDOWN);
|
||||
@@ -2178,6 +2175,9 @@ future<> gossiper::start() {
|
||||
}
|
||||
|
||||
future<> gossiper::shutdown() {
|
||||
if (!_background_msg.is_closed()) {
|
||||
co_await _background_msg.close();
|
||||
}
|
||||
if (this_shard_id() == 0) {
|
||||
co_await do_stop_gossiping();
|
||||
}
|
||||
|
||||
@@ -41,7 +41,9 @@
|
||||
#include "unimplemented.hh"
|
||||
#include <seastar/core/distributed.hh>
|
||||
#include <seastar/core/shared_ptr.hh>
|
||||
#include <seastar/core/gate.hh>
|
||||
#include <seastar/core/print.hh>
|
||||
#include <seastar/rpc/rpc_types.hh>
|
||||
#include "utils/atomic_vector.hh"
|
||||
#include "utils/UUID.hh"
|
||||
#include "utils/fb_utilities.hh"
|
||||
@@ -138,12 +140,16 @@ private:
|
||||
bool _enabled = false;
|
||||
semaphore _callback_running{1};
|
||||
semaphore _apply_state_locally_semaphore{100};
|
||||
seastar::gate _background_msg;
|
||||
std::unordered_map<gms::inet_address, syn_msg_pending> _syn_handlers;
|
||||
std::unordered_map<gms::inet_address, ack_msg_pending> _ack_handlers;
|
||||
bool _advertise_myself = true;
|
||||
// Map ip address and generation number
|
||||
std::unordered_map<gms::inet_address, int32_t> _advertise_to_nodes;
|
||||
future<> _failure_detector_loop_done{make_ready_future<>()} ;
|
||||
|
||||
rpc::no_wait_type background_msg(sstring type, noncopyable_function<future<>(gossiper&)> fn);
|
||||
|
||||
public:
|
||||
// Get current generation number for the given nodes
|
||||
future<std::unordered_map<gms::inet_address, int32_t>>
|
||||
@@ -565,6 +571,7 @@ public:
|
||||
bool is_seed(const inet_address& endpoint) const;
|
||||
bool is_shutdown(const inet_address& endpoint) const;
|
||||
bool is_normal(const inet_address& endpoint) const;
|
||||
bool is_left(const inet_address& endpoint) const;
|
||||
// Check if a node is in NORMAL or SHUTDOWN status which means the node is
|
||||
// part of the token ring from the gossip point of view and operates in
|
||||
// normal status or was in normal status but is shutdown.
|
||||
|
||||
@@ -520,8 +520,13 @@ relocate_python3 "$rprefix"/scyllatop tools/scyllatop/scyllatop.py
|
||||
if $supervisor; then
|
||||
install -d -m755 `supervisor_dir $retc`
|
||||
for service in scylla-server scylla-jmx scylla-node-exporter; do
|
||||
if [ "$service" = "scylla-server" ]; then
|
||||
program="scylla"
|
||||
else
|
||||
program=$service
|
||||
fi
|
||||
cat << EOS > `supervisor_conf $retc $service`
|
||||
[program:$service]
|
||||
[program:$program]
|
||||
directory=$rprefix
|
||||
command=/bin/bash -c './supervisor/$service.sh'
|
||||
EOS
|
||||
|
||||
@@ -61,6 +61,10 @@ azure_snitch::azure_snitch(const sstring& fname, unsigned io_cpuid) : production
|
||||
}
|
||||
|
||||
future<> azure_snitch::load_config() {
|
||||
if (this_shard_id() != io_cpu_id()) {
|
||||
co_return;
|
||||
}
|
||||
|
||||
sstring region = co_await azure_api_call(REGION_NAME_QUERY_PATH);
|
||||
sstring azure_zone = co_await azure_api_call(ZONE_NAME_QUERY_PATH);
|
||||
|
||||
|
||||
33
main.cc
33
main.cc
@@ -377,11 +377,38 @@ static auto defer_verbose_shutdown(const char* what, Func&& func) {
|
||||
startlog.info("Shutting down {}", what);
|
||||
try {
|
||||
func();
|
||||
startlog.info("Shutting down {} was successful", what);
|
||||
} catch (...) {
|
||||
startlog.error("Unexpected error shutting down {}: {}", what, std::current_exception());
|
||||
throw;
|
||||
auto ex = std::current_exception();
|
||||
bool do_abort = true;
|
||||
try {
|
||||
std::rethrow_exception(ex);
|
||||
} catch (const std::system_error& e) {
|
||||
// System error codes we consider "environmental",
|
||||
// i.e. not scylla's fault, therefore there is no point in
|
||||
// aborting and dumping core.
|
||||
for (int i : {EIO, EACCES, ENOSPC}) {
|
||||
if (e.code() == std::error_code(i, std::system_category())) {
|
||||
do_abort = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
} catch (...) {
|
||||
}
|
||||
auto msg = fmt::format("Unexpected error shutting down {}: {}", what, ex);
|
||||
if (do_abort) {
|
||||
startlog.error("{}: aborting", msg);
|
||||
abort();
|
||||
} else {
|
||||
startlog.error("{}: exiting, at {}", msg, current_backtrace());
|
||||
|
||||
// Call _exit() rather than exit() to exit immediately
|
||||
// without calling exit handlers, avoiding
|
||||
// boost::intrusive::detail::destructor_impl assert failure
|
||||
// from ~segment_pool exit handler.
|
||||
_exit(255);
|
||||
}
|
||||
}
|
||||
startlog.info("Shutting down {} was successful", what);
|
||||
};
|
||||
|
||||
auto ret = deferred_action(std::move(vfunc));
|
||||
|
||||
@@ -613,7 +613,8 @@ static flat_mutation_reader make_partition_snapshot_flat_reader_from_snp_schema(
|
||||
schema_ptr rev_snp_schema = snp->schema()->make_reversed();
|
||||
return make_partition_snapshot_flat_reader<true, partition_snapshot_read_accounter>(std::move(rev_snp_schema), std::move(permit), std::move(dk), std::move(crr), std::move(snp), digest_requested, region, read_section, pointer_to_container, fwd, memtable);
|
||||
} else {
|
||||
return make_partition_snapshot_flat_reader<false, partition_snapshot_read_accounter>(snp->schema(), std::move(permit), std::move(dk), std::move(crr), std::move(snp), digest_requested, region, read_section, pointer_to_container, fwd, memtable);
|
||||
schema_ptr snp_schema = snp->schema();
|
||||
return make_partition_snapshot_flat_reader<false, partition_snapshot_read_accounter>(std::move(snp_schema), std::move(permit), std::move(dk), std::move(crr), std::move(snp), digest_requested, region, read_section, pointer_to_container, fwd, memtable);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -628,7 +628,12 @@ shared_ptr<messaging_service::rpc_protocol_client_wrapper> messaging_service::ge
|
||||
remove_error_rpc_client(verb, id);
|
||||
}
|
||||
|
||||
auto must_encrypt = [&id, &verb, this] {
|
||||
auto addr = get_preferred_ip(id.addr);
|
||||
auto broadcast_address = utils::fb_utilities::get_broadcast_address();
|
||||
bool listen_to_bc = _cfg.listen_on_broadcast_address && _cfg.ip != broadcast_address;
|
||||
auto laddr = socket_address(listen_to_bc ? broadcast_address : _cfg.ip, 0);
|
||||
|
||||
auto must_encrypt = [&] {
|
||||
if (_cfg.encrypt == encrypt_what::none) {
|
||||
return false;
|
||||
}
|
||||
@@ -646,13 +651,27 @@ shared_ptr<messaging_service::rpc_protocol_client_wrapper> messaging_service::ge
|
||||
auto& snitch_ptr = locator::i_endpoint_snitch::get_local_snitch_ptr();
|
||||
|
||||
// either rack/dc need to be in same dc to use non-tls
|
||||
if (snitch_ptr->get_datacenter(id.addr) != snitch_ptr->get_datacenter(utils::fb_utilities::get_broadcast_address())) {
|
||||
auto my_dc = snitch_ptr->get_datacenter(broadcast_address);
|
||||
if (snitch_ptr->get_datacenter(addr) != my_dc) {
|
||||
return true;
|
||||
}
|
||||
// #9653 - if our idea of dc for bind address differs from our official endpoint address,
|
||||
// we cannot trust downgrading. We need to ensure either (local) bind address is same as
|
||||
// broadcast or that the dc info we get for it is the same.
|
||||
if (broadcast_address != laddr && snitch_ptr->get_datacenter(laddr) != my_dc) {
|
||||
return true;
|
||||
}
|
||||
// if cross-rack tls, check rack.
|
||||
return _cfg.encrypt == encrypt_what::rack &&
|
||||
snitch_ptr->get_rack(id.addr) != snitch_ptr->get_rack(utils::fb_utilities::get_broadcast_address())
|
||||
;
|
||||
if (_cfg.encrypt == encrypt_what::dc) {
|
||||
return false;
|
||||
}
|
||||
auto my_rack = snitch_ptr->get_rack(broadcast_address);
|
||||
if (snitch_ptr->get_rack(addr) != my_rack) {
|
||||
return true;
|
||||
}
|
||||
// See above: We need to ensure either (local) bind address is same as
|
||||
// broadcast or that the rack info we get for it is the same.
|
||||
return broadcast_address != laddr && snitch_ptr->get_rack(laddr) != my_rack;
|
||||
}();
|
||||
|
||||
auto must_compress = [&id, this] {
|
||||
@@ -681,7 +700,7 @@ shared_ptr<messaging_service::rpc_protocol_client_wrapper> messaging_service::ge
|
||||
return true;
|
||||
}();
|
||||
|
||||
auto remote_addr = socket_address(get_preferred_ip(id.addr), must_encrypt ? _cfg.ssl_port : _cfg.port);
|
||||
auto remote_addr = socket_address(addr, must_encrypt ? _cfg.ssl_port : _cfg.port);
|
||||
|
||||
rpc::client_options opts;
|
||||
// send keepalive messages each minute if connection is idle, drop connection after 10 failures
|
||||
@@ -691,13 +710,8 @@ shared_ptr<messaging_service::rpc_protocol_client_wrapper> messaging_service::ge
|
||||
}
|
||||
opts.tcp_nodelay = must_tcp_nodelay;
|
||||
opts.reuseaddr = true;
|
||||
// We send cookies only for non-default statement tenant clients.
|
||||
if (idx > 3) {
|
||||
opts.isolation_cookie = _scheduling_info_for_connection_index[idx].isolation_cookie;
|
||||
}
|
||||
opts.isolation_cookie = _scheduling_info_for_connection_index[idx].isolation_cookie;
|
||||
|
||||
bool listen_to_bc = _cfg.listen_on_broadcast_address && _cfg.ip != utils::fb_utilities::get_broadcast_address();
|
||||
auto laddr = socket_address(listen_to_bc ? utils::fb_utilities::get_broadcast_address() : _cfg.ip, 0);
|
||||
auto client = must_encrypt ?
|
||||
::make_shared<rpc_protocol_client_wrapper>(_rpc->protocol(), std::move(opts),
|
||||
remote_addr, laddr, _credentials) :
|
||||
|
||||
@@ -694,11 +694,11 @@ future<typename ResultBuilder::result_type> do_query(
|
||||
ResultBuilder&& result_builder) {
|
||||
auto ctx = seastar::make_shared<read_context>(db, s, cmd, ranges, trace_state, timeout);
|
||||
|
||||
co_await ctx->lookup_readers(timeout);
|
||||
|
||||
std::exception_ptr ex;
|
||||
|
||||
try {
|
||||
co_await ctx->lookup_readers(timeout);
|
||||
|
||||
auto [last_ckey, result, unconsumed_buffer, compaction_state] = co_await read_page<ResultBuilder>(ctx, s, cmd, ranges, trace_state,
|
||||
std::move(result_builder));
|
||||
|
||||
|
||||
@@ -1545,18 +1545,20 @@ public:
|
||||
};
|
||||
|
||||
future<> shard_reader::close() noexcept {
|
||||
// Nothing to do if there was no reader created, nor is there a background
|
||||
// read ahead in progress which will create one.
|
||||
if (!_reader && !_read_ahead) {
|
||||
co_return;
|
||||
if (_read_ahead) {
|
||||
try {
|
||||
co_await *std::exchange(_read_ahead, std::nullopt);
|
||||
} catch (...) {
|
||||
mrlog.warn("shard_reader::close(): read_ahead on shard {} failed: {}", _shard, std::current_exception());
|
||||
}
|
||||
}
|
||||
|
||||
try {
|
||||
if (_read_ahead) {
|
||||
co_await *std::exchange(_read_ahead, std::nullopt);
|
||||
}
|
||||
|
||||
co_await smp::submit_to(_shard, [this] {
|
||||
if (!_reader) {
|
||||
return make_ready_future<>();
|
||||
}
|
||||
|
||||
auto irh = std::move(*_reader).inactive_read_handle();
|
||||
return with_closeable(flat_mutation_reader(_reader.release()), [this] (flat_mutation_reader& reader) mutable {
|
||||
auto permit = reader.permit();
|
||||
|
||||
@@ -54,7 +54,7 @@ future<> feed_writer(flat_mutation_reader&& rd_ref, Writer wr) {
|
||||
auto rd = std::move(rd_ref);
|
||||
std::exception_ptr ex;
|
||||
try {
|
||||
while (!rd.is_end_of_stream()) {
|
||||
while (!rd.is_end_of_stream() || !rd.is_buffer_empty()) {
|
||||
co_await rd.fill_buffer();
|
||||
while (!rd.is_buffer_empty()) {
|
||||
co_await rd.pop_mutation_fragment().consume(wr);
|
||||
|
||||
@@ -305,14 +305,23 @@ class partition_snapshot_flat_reader : public flat_mutation_reader::impl, public
|
||||
const std::optional<position_in_partition>& last_row,
|
||||
const std::optional<position_in_partition>& last_rts,
|
||||
position_in_partition_view pos) {
|
||||
if (!_rt_stream.empty()) {
|
||||
return _rt_stream.get_next(std::move(pos));
|
||||
}
|
||||
return in_alloc_section([&] () -> mutation_fragment_opt {
|
||||
maybe_refresh_state(ck_range_snapshot, last_row, last_rts);
|
||||
|
||||
position_in_partition::less_compare rt_less(_query_schema);
|
||||
|
||||
// The while below moves range tombstones from partition versions
|
||||
// into _rt_stream, just enough to produce the next range tombstone.
|
||||
// The main goal behind moving to _rt_stream is to deoverlap range tombstones
|
||||
// which have the same starting position. This is not in order to satisfy
|
||||
// flat_mutation_reader stream requirements, the reader can emit range tombstones
|
||||
// which have the same position incrementally. This is to guarantee forward
|
||||
// progress in the case iterators get invalidated and maybe_refresh_state()
|
||||
// above needs to restore them. It does so using last_rts, which tracks
|
||||
// the position of the last emitted range tombstone. All range tombstones
|
||||
// with positions <= last_rts are skipped on refresh. To make progress,
|
||||
// we need to make sure that all range tombstones with duplicated positions
|
||||
// are emitted before maybe_refresh_state().
|
||||
while (has_more_range_tombstones()
|
||||
&& !rt_less(pos, peek_range_tombstone().position())
|
||||
&& (_rt_stream.empty() || !rt_less(_rt_stream.peek_next().position(), peek_range_tombstone().position()))) {
|
||||
|
||||
@@ -325,7 +325,7 @@ public:
|
||||
// When throws, the cursor is invalidated and its position is not changed.
|
||||
bool advance_to(position_in_partition_view lower_bound) {
|
||||
prepare_heap(lower_bound);
|
||||
bool found = no_clustering_row_between(_schema, lower_bound, _heap[0].it->position());
|
||||
bool found = no_clustering_row_between_weak(_schema, lower_bound, _heap[0].it->position());
|
||||
recreate_current_row();
|
||||
return found;
|
||||
}
|
||||
@@ -411,11 +411,11 @@ public:
|
||||
} else {
|
||||
// Copy row from older version because rows in evictable versions must
|
||||
// hold values which are independently complete to be consistent on eviction.
|
||||
auto e = current_allocator().construct<rows_entry>(_schema, *_current_row[0].it);
|
||||
auto e = alloc_strategy_unique_ptr<rows_entry>(current_allocator().construct<rows_entry>(_schema, *_current_row[0].it));
|
||||
e->set_continuous(latest_i && latest_i->continuous());
|
||||
_snp.tracker()->insert(*e);
|
||||
rows.insert_before(latest_i, *e);
|
||||
return {*e, true};
|
||||
auto e_i = rows.insert_before(latest_i, std::move(e));
|
||||
return ensure_result{*e_i, true};
|
||||
}
|
||||
}
|
||||
|
||||
@@ -447,11 +447,11 @@ public:
|
||||
}
|
||||
auto&& rows = _snp.version()->partition().mutable_clustered_rows();
|
||||
auto latest_i = get_iterator_in_latest_version();
|
||||
auto e = current_allocator().construct<rows_entry>(_schema, pos, is_dummy(!pos.is_clustering_row()),
|
||||
is_continuous(latest_i && latest_i->continuous()));
|
||||
auto e = alloc_strategy_unique_ptr<rows_entry>(current_allocator().construct<rows_entry>(_schema, pos, is_dummy(!pos.is_clustering_row()),
|
||||
is_continuous(latest_i && latest_i->continuous())));
|
||||
_snp.tracker()->insert(*e);
|
||||
rows.insert_before(latest_i, *e);
|
||||
return ensure_result{*e, true};
|
||||
auto e_i = rows.insert_before(latest_i, std::move(e));
|
||||
return ensure_result{*e_i, true};
|
||||
}
|
||||
|
||||
// Brings the entry pointed to by the cursor to the front of the LRU
|
||||
|
||||
@@ -575,6 +575,20 @@ bool no_clustering_row_between(const schema& s, position_in_partition_view a, po
|
||||
}
|
||||
}
|
||||
|
||||
// Returns true if and only if there can't be any clustering_row with position >= a and < b.
|
||||
// It is assumed that a <= b.
|
||||
inline
|
||||
bool no_clustering_row_between_weak(const schema& s, position_in_partition_view a, position_in_partition_view b) {
|
||||
clustering_key_prefix::equality eq(s);
|
||||
if (a.has_key() && b.has_key()) {
|
||||
return eq(a.key(), b.key())
|
||||
&& (a.get_bound_weight() == bound_weight::after_all_prefixed
|
||||
|| b.get_bound_weight() != bound_weight::after_all_prefixed);
|
||||
} else {
|
||||
return !a.has_key() && !b.has_key();
|
||||
}
|
||||
}
|
||||
|
||||
// Includes all position_in_partition objects "p" for which: start <= p < end
|
||||
// And only those.
|
||||
class position_range {
|
||||
@@ -659,3 +673,9 @@ inline
|
||||
bool position_range::is_all_clustered_rows(const schema& s) const {
|
||||
return _start.is_before_all_clustered_rows(s) && _end.is_after_all_clustered_rows(s);
|
||||
}
|
||||
|
||||
// Assumes that the bounds of `r` are of 'clustered' type
|
||||
// and that `r` is non-empty (the left bound is smaller than the right bound).
|
||||
//
|
||||
// If `r` does not contain any keys, returns nullopt.
|
||||
std::optional<query::clustering_range> position_range_to_clustering_range(const position_range& r, const schema&);
|
||||
|
||||
49
query.cc
49
query.cc
@@ -379,3 +379,52 @@ foreign_ptr<lw_shared_ptr<query::result>> result_merger::get() {
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
std::optional<query::clustering_range> position_range_to_clustering_range(const position_range& r, const schema& s) {
|
||||
assert(r.start().get_type() == partition_region::clustered);
|
||||
assert(r.end().get_type() == partition_region::clustered);
|
||||
|
||||
if (r.start().has_key() && r.end().has_key()
|
||||
&& clustering_key_prefix::equality(s)(r.start().key(), r.end().key())) {
|
||||
assert(r.start().get_bound_weight() != r.end().get_bound_weight());
|
||||
|
||||
if (r.end().get_bound_weight() == bound_weight::after_all_prefixed
|
||||
&& r.start().get_bound_weight() != bound_weight::after_all_prefixed) {
|
||||
// [before x, after x) and [for x, after x) get converted to [x, x].
|
||||
return query::clustering_range::make_singular(r.start().key());
|
||||
}
|
||||
|
||||
// [before x, for x) does not contain any keys.
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
// position_range -> clustering_range
|
||||
// (recall that position_ranges are always left-closed, right opened):
|
||||
// [before x, ...), [for x, ...) -> [x, ...
|
||||
// [after x, ...) -> (x, ...
|
||||
// [..., before x), [..., for x) -> ..., x)
|
||||
// [..., after x) -> ..., x]
|
||||
|
||||
auto to_bound = [&s] (const position_in_partition& p, bool left) -> std::optional<query::clustering_range::bound> {
|
||||
if (p.is_before_all_clustered_rows(s)) {
|
||||
assert(left);
|
||||
return {};
|
||||
}
|
||||
|
||||
if (p.is_after_all_clustered_rows(s)) {
|
||||
assert(!left);
|
||||
return {};
|
||||
}
|
||||
|
||||
assert(p.has_key());
|
||||
|
||||
auto bw = p.get_bound_weight();
|
||||
bool inclusive = left
|
||||
? bw != bound_weight::after_all_prefixed
|
||||
: bw == bound_weight::after_all_prefixed;
|
||||
|
||||
return query::clustering_range::bound{p.key(), inclusive};
|
||||
};
|
||||
|
||||
return query::clustering_range{to_bound(r.start(), true), to_bound(r.end(), false)};
|
||||
}
|
||||
|
||||
@@ -42,28 +42,34 @@ static auto construct_range_tombstone_entry(Args&&... args) {
|
||||
}
|
||||
|
||||
void range_tombstone_list::apply_reversibly(const schema& s,
|
||||
clustering_key_prefix start, bound_kind start_kind,
|
||||
clustering_key_prefix end,
|
||||
clustering_key_prefix start_key, bound_kind start_kind,
|
||||
clustering_key_prefix end_key,
|
||||
bound_kind end_kind,
|
||||
tombstone tomb,
|
||||
reverter& rev)
|
||||
{
|
||||
position_in_partition::less_compare less(s);
|
||||
position_in_partition start(position_in_partition::range_tag_t(), bound_view(std::move(start_key), start_kind));
|
||||
position_in_partition end(position_in_partition::range_tag_t(), bound_view(std::move(end_key), end_kind));
|
||||
|
||||
if (!less(start, end)) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (!_tombstones.empty()) {
|
||||
bound_view::compare less(s);
|
||||
bound_view start_bound(start, start_kind);
|
||||
auto last = --_tombstones.end();
|
||||
range_tombstones_type::iterator it;
|
||||
if (less(start_bound, last->end_bound())) {
|
||||
it = _tombstones.upper_bound(start_bound, [less](auto&& sb, auto&& rt) {
|
||||
return less(sb, rt.end_bound());
|
||||
if (less(start, last->end_position())) {
|
||||
it = _tombstones.upper_bound(start, [less](auto&& sb, auto&& rt) {
|
||||
return less(sb, rt.end_position());
|
||||
});
|
||||
} else {
|
||||
it = _tombstones.end();
|
||||
}
|
||||
insert_from(s, std::move(it), std::move(start), start_kind, std::move(end), end_kind, std::move(tomb), rev);
|
||||
insert_from(s, std::move(it), std::move(start), std::move(end), std::move(tomb), rev);
|
||||
return;
|
||||
}
|
||||
auto rt = construct_range_tombstone_entry(std::move(start), start_kind, std::move(end), end_kind, std::move(tomb));
|
||||
auto rt = construct_range_tombstone_entry(std::move(start), std::move(end), std::move(tomb));
|
||||
rev.insert(_tombstones.end(), *rt);
|
||||
rt.release();
|
||||
}
|
||||
@@ -81,35 +87,31 @@ void range_tombstone_list::apply_reversibly(const schema& s,
|
||||
*/
|
||||
void range_tombstone_list::insert_from(const schema& s,
|
||||
range_tombstones_type::iterator it,
|
||||
clustering_key_prefix start,
|
||||
bound_kind start_kind,
|
||||
clustering_key_prefix end,
|
||||
bound_kind end_kind,
|
||||
position_in_partition start,
|
||||
position_in_partition end,
|
||||
tombstone tomb,
|
||||
reverter& rev)
|
||||
{
|
||||
bound_view::compare less(s);
|
||||
bound_view end_bound(end, end_kind);
|
||||
position_in_partition::tri_compare cmp(s);
|
||||
|
||||
if (it != _tombstones.begin()) {
|
||||
auto prev = std::prev(it);
|
||||
if (prev->tombstone().tomb == tomb && prev->end_bound().adjacent(s, bound_view(start, start_kind))) {
|
||||
start = prev->tombstone().start;
|
||||
start_kind = prev->tombstone().start_kind;
|
||||
if (prev->tombstone().tomb == tomb && cmp(prev->end_position(), start) == 0) {
|
||||
start = prev->position();
|
||||
rev.erase(prev);
|
||||
}
|
||||
}
|
||||
while (it != _tombstones.end()) {
|
||||
bound_view start_bound(start, start_kind);
|
||||
if (less(end_bound, start_bound)) {
|
||||
if (cmp(end, start) <= 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (less(end_bound, it->start_bound())) {
|
||||
if (cmp(end, it->position()) < 0) {
|
||||
// not overlapping
|
||||
if (it->tombstone().tomb == tomb && end_bound.adjacent(s, it->start_bound())) {
|
||||
rev.update(it, {std::move(start), start_kind, it->tombstone().end, it->tombstone().end_kind, tomb});
|
||||
if (it->tombstone().tomb == tomb && cmp(end, it->position()) == 0) {
|
||||
rev.update(it, {std::move(start), std::move(start), tomb});
|
||||
} else {
|
||||
auto rt = construct_range_tombstone_entry(std::move(start), start_kind, std::move(end), end_kind, tomb);
|
||||
auto rt = construct_range_tombstone_entry(std::move(start), std::move(end), tomb);
|
||||
rev.insert(it, *rt);
|
||||
rt.release();
|
||||
}
|
||||
@@ -119,34 +121,29 @@ void range_tombstone_list::insert_from(const schema& s,
|
||||
auto c = tomb <=> it->tombstone().tomb;
|
||||
if (c == 0) {
|
||||
// same timestamp, overlapping or adjacent, so merge.
|
||||
if (less(it->start_bound(), start_bound)) {
|
||||
start = it->tombstone().start;
|
||||
start_kind = it->tombstone().start_kind;
|
||||
if (cmp(it->position(), start) < 0) {
|
||||
start = it->position();
|
||||
}
|
||||
if (less(end_bound, it->end_bound())) {
|
||||
end = it->tombstone().end;
|
||||
end_kind = it->tombstone().end_kind;
|
||||
end_bound = bound_view(end, end_kind);
|
||||
if (cmp(end, it->end_position()) < 0) {
|
||||
end = it->end_position();
|
||||
}
|
||||
it = rev.erase(it);
|
||||
} else if (c > 0) {
|
||||
// We overwrite the current tombstone.
|
||||
|
||||
if (less(it->start_bound(), start_bound)) {
|
||||
auto new_end = bound_view(start, invert_kind(start_kind));
|
||||
if (!less(new_end, it->start_bound())) {
|
||||
// Here it->start < start
|
||||
auto rt = construct_range_tombstone_entry(it->start_bound(), new_end, it->tombstone().tomb);
|
||||
rev.update(it, {start_bound, it->end_bound(), it->tombstone().tomb});
|
||||
if (cmp(it->position(), start) < 0) {
|
||||
{
|
||||
auto rt = construct_range_tombstone_entry(it->position(), start, it->tombstone().tomb);
|
||||
rev.update(it, {start, it->end_position(), it->tombstone().tomb});
|
||||
rev.insert(it, *rt);
|
||||
rt.release();
|
||||
}
|
||||
}
|
||||
|
||||
if (less(end_bound, it->end_bound())) {
|
||||
if (cmp(end, it->end_position()) < 0) {
|
||||
// Here start <= it->start and end < it->end.
|
||||
auto rt = construct_range_tombstone_entry(std::move(start), start_kind, end, end_kind, std::move(tomb));
|
||||
rev.update(it, {std::move(end), invert_kind(end_kind), it->tombstone().end, it->tombstone().end_kind, it->tombstone().tomb});
|
||||
auto rt = construct_range_tombstone_entry(std::move(start), end, std::move(tomb));
|
||||
rev.update(it, {std::move(end), it->end_position(), it->tombstone().tomb});
|
||||
rev.insert(it, *rt);
|
||||
rt.release();
|
||||
return;
|
||||
@@ -157,30 +154,28 @@ void range_tombstone_list::insert_from(const schema& s,
|
||||
} else {
|
||||
// We don't overwrite the current tombstone.
|
||||
|
||||
if (less(start_bound, it->start_bound())) {
|
||||
if (cmp(start, it->position()) < 0) {
|
||||
// The new tombstone starts before the current one.
|
||||
if (less(it->start_bound(), end_bound)) {
|
||||
if (cmp(it->position(), end) < 0) {
|
||||
// Here start < it->start and it->start < end.
|
||||
auto new_end_kind = invert_kind(it->tombstone().start_kind);
|
||||
if (!less(bound_view(it->tombstone().start, new_end_kind), start_bound)) {
|
||||
auto rt = construct_range_tombstone_entry(std::move(start), start_kind, it->tombstone().start, new_end_kind, tomb);
|
||||
{
|
||||
auto rt = construct_range_tombstone_entry(std::move(start), it->position(), tomb);
|
||||
it = rev.insert(it, *rt);
|
||||
rt.release();
|
||||
++it;
|
||||
}
|
||||
} else {
|
||||
// Here start < it->start and end <= it->start, so just insert the new tombstone.
|
||||
auto rt = construct_range_tombstone_entry(std::move(start), start_kind, std::move(end), end_kind, std::move(tomb));
|
||||
auto rt = construct_range_tombstone_entry(std::move(start), std::move(end), std::move(tomb));
|
||||
rev.insert(it, *rt);
|
||||
rt.release();
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
if (less(it->end_bound(), end_bound)) {
|
||||
if (cmp(it->end_position(), end) < 0) {
|
||||
// Here the current tombstone overwrites a range of the new one.
|
||||
start = it->tombstone().end;
|
||||
start_kind = invert_kind(it->tombstone().end_kind);
|
||||
start = it->end_position();
|
||||
++it;
|
||||
} else {
|
||||
// Here the current tombstone completely overwrites the new one.
|
||||
@@ -190,7 +185,7 @@ void range_tombstone_list::insert_from(const schema& s,
|
||||
}
|
||||
|
||||
// If we got here, then just insert the remainder at the end.
|
||||
auto rt = construct_range_tombstone_entry(std::move(start), start_kind, std::move(end), end_kind, std::move(tomb));
|
||||
auto rt = construct_range_tombstone_entry(std::move(start), std::move(end), std::move(tomb));
|
||||
rev.insert(it, *rt);
|
||||
rt.release();
|
||||
}
|
||||
|
||||
@@ -297,7 +297,13 @@ public:
|
||||
private:
|
||||
void apply_reversibly(const schema& s, clustering_key_prefix start, bound_kind start_kind,
|
||||
clustering_key_prefix end, bound_kind end_kind, tombstone tomb, reverter& rev);
|
||||
void insert_from(const schema& s, range_tombstones_type::iterator it, clustering_key_prefix start,
|
||||
bound_kind start_kind, clustering_key_prefix end, bound_kind end_kind, tombstone tomb, reverter& rev);
|
||||
|
||||
void insert_from(const schema& s,
|
||||
range_tombstones_type::iterator it,
|
||||
position_in_partition start,
|
||||
position_in_partition end,
|
||||
tombstone tomb,
|
||||
reverter& rev);
|
||||
|
||||
range_tombstones_type::iterator find(const schema& s, const range_tombstone_entry& rt);
|
||||
};
|
||||
|
||||
@@ -249,6 +249,14 @@ public:
|
||||
return _base_resources;
|
||||
}
|
||||
|
||||
void release_base_resources() noexcept {
|
||||
if (_base_resources_consumed) {
|
||||
_resources -= _base_resources;
|
||||
_base_resources_consumed = false;
|
||||
}
|
||||
_semaphore.signal(std::exchange(_base_resources, {}));
|
||||
}
|
||||
|
||||
sstring description() const {
|
||||
return format("{}.{}:{}",
|
||||
_schema ? _schema->ks_name() : "*",
|
||||
@@ -394,6 +402,10 @@ reader_resources reader_permit::base_resources() const {
|
||||
return _impl->base_resources();
|
||||
}
|
||||
|
||||
void reader_permit::release_base_resources() noexcept {
|
||||
return _impl->release_base_resources();
|
||||
}
|
||||
|
||||
sstring reader_permit::description() const {
|
||||
return _impl->description();
|
||||
}
|
||||
|
||||
@@ -161,6 +161,8 @@ public:
|
||||
|
||||
reader_resources base_resources() const;
|
||||
|
||||
void release_base_resources() noexcept;
|
||||
|
||||
sstring description() const;
|
||||
|
||||
db::timeout_clock::time_point timeout() const noexcept;
|
||||
|
||||
@@ -407,6 +407,10 @@ public:
|
||||
{},
|
||||
mutation_reader::forwarding::no);
|
||||
} else {
|
||||
// We can't have two permits with count resource for 1 repair.
|
||||
// So we release the one on _permit so the only one is the one the
|
||||
// shard reader will obtain.
|
||||
_permit.release_base_resources();
|
||||
_reader = make_multishard_streaming_reader(db, _schema, _permit, [this] {
|
||||
auto shard_range = _sharder.next();
|
||||
if (shard_range) {
|
||||
|
||||
2
seastar
2
seastar
Submodule seastar updated: a189cdc45d...6217d6ff4e
@@ -635,16 +635,16 @@ void storage_service::bootstrap() {
|
||||
|
||||
// Update pending ranges now, so we correctly count ourselves as a pending replica
|
||||
// when inserting the new CDC generation.
|
||||
if (!bootstrap_rbno) {
|
||||
// When is_repair_based_node_ops_enabled is true, the bootstrap node
|
||||
// will use node_ops_cmd to bootstrap, node_ops_cmd will update the pending ranges.
|
||||
slogger.debug("bootstrap: update pending ranges: endpoint={} bootstrap_tokens={}", get_broadcast_address(), _bootstrap_tokens);
|
||||
mutate_token_metadata([this] (mutable_token_metadata_ptr tmptr) {
|
||||
auto endpoint = get_broadcast_address();
|
||||
tmptr->add_bootstrap_tokens(_bootstrap_tokens, endpoint);
|
||||
return update_pending_ranges(std::move(tmptr), format("bootstrapping node {}", endpoint));
|
||||
}).get();
|
||||
}
|
||||
if (!bootstrap_rbno) {
|
||||
// When is_repair_based_node_ops_enabled is true, the bootstrap node
|
||||
// will use node_ops_cmd to bootstrap, node_ops_cmd will update the pending ranges.
|
||||
slogger.debug("bootstrap: update pending ranges: endpoint={} bootstrap_tokens={}", get_broadcast_address(), _bootstrap_tokens);
|
||||
mutate_token_metadata([this] (mutable_token_metadata_ptr tmptr) {
|
||||
auto endpoint = get_broadcast_address();
|
||||
tmptr->add_bootstrap_tokens(_bootstrap_tokens, endpoint);
|
||||
return update_pending_ranges(std::move(tmptr), format("bootstrapping node {}", endpoint));
|
||||
}).get();
|
||||
}
|
||||
|
||||
// After we pick a generation timestamp, we start gossiping it, and we stick with it.
|
||||
// We don't do any other generation switches (unless we crash before completing bootstrap).
|
||||
@@ -652,19 +652,23 @@ void storage_service::bootstrap() {
|
||||
|
||||
_cdc_gen_id = _cdc_gen_service.local().make_new_generation(_bootstrap_tokens, !is_first_node()).get0();
|
||||
|
||||
if (!bootstrap_rbno) {
|
||||
// When is_repair_based_node_ops_enabled is true, the bootstrap node
|
||||
// will use node_ops_cmd to bootstrap, bootstrapping gossip status is not needed for bootstrap.
|
||||
_gossiper.add_local_application_state({
|
||||
// Order is important: both the CDC streams timestamp and tokens must be known when a node handles our status.
|
||||
{ gms::application_state::TOKENS, versioned_value::tokens(_bootstrap_tokens) },
|
||||
{ gms::application_state::CDC_GENERATION_ID, versioned_value::cdc_generation_id(_cdc_gen_id) },
|
||||
{ gms::application_state::STATUS, versioned_value::bootstrapping(_bootstrap_tokens) },
|
||||
}).get();
|
||||
if (!bootstrap_rbno) {
|
||||
// When is_repair_based_node_ops_enabled is true, the bootstrap node
|
||||
// will use node_ops_cmd to bootstrap, bootstrapping gossip status is not needed for bootstrap.
|
||||
_gossiper.add_local_application_state({
|
||||
{ gms::application_state::TOKENS, versioned_value::tokens(_bootstrap_tokens) },
|
||||
{ gms::application_state::CDC_GENERATION_ID, versioned_value::cdc_generation_id(_cdc_gen_id) },
|
||||
{ gms::application_state::STATUS, versioned_value::bootstrapping(_bootstrap_tokens) },
|
||||
}).get();
|
||||
|
||||
set_mode(mode::JOINING, format("sleeping {} ms for pending range setup", get_ring_delay().count()), true);
|
||||
_gossiper.wait_for_range_setup().get();
|
||||
}
|
||||
set_mode(mode::JOINING, format("sleeping {} ms for pending range setup", get_ring_delay().count()), true);
|
||||
_gossiper.wait_for_range_setup().get();
|
||||
} else {
|
||||
// Even with RBNO bootstrap we need to announce the new CDC generation immediately after it's created.
|
||||
_gossiper.add_local_application_state({
|
||||
{ gms::application_state::CDC_GENERATION_ID, versioned_value::cdc_generation_id(_cdc_gen_id) },
|
||||
}).get();
|
||||
}
|
||||
} else {
|
||||
// Wait until we know tokens of existing node before announcing replacing status.
|
||||
set_mode(mode::JOINING, fmt::format("Wait until local node knows tokens of peer nodes"), true);
|
||||
@@ -3670,7 +3674,7 @@ shared_ptr<abort_source> node_ops_meta_data::get_abort_source() {
|
||||
|
||||
void storage_service::node_ops_update_heartbeat(utils::UUID ops_uuid) {
|
||||
slogger.debug("node_ops_update_heartbeat: ops_uuid={}", ops_uuid);
|
||||
auto permit = seastar::get_units(_node_ops_abort_sem, 1);
|
||||
auto permit = seastar::get_units(_node_ops_abort_sem, 1).get0();
|
||||
auto it = _node_ops.find(ops_uuid);
|
||||
if (it != _node_ops.end()) {
|
||||
node_ops_meta_data& meta = it->second;
|
||||
@@ -3680,7 +3684,7 @@ void storage_service::node_ops_update_heartbeat(utils::UUID ops_uuid) {
|
||||
|
||||
void storage_service::node_ops_done(utils::UUID ops_uuid) {
|
||||
slogger.debug("node_ops_done: ops_uuid={}", ops_uuid);
|
||||
auto permit = seastar::get_units(_node_ops_abort_sem, 1);
|
||||
auto permit = seastar::get_units(_node_ops_abort_sem, 1).get0();
|
||||
auto it = _node_ops.find(ops_uuid);
|
||||
if (it != _node_ops.end()) {
|
||||
node_ops_meta_data& meta = it->second;
|
||||
@@ -3691,7 +3695,7 @@ void storage_service::node_ops_done(utils::UUID ops_uuid) {
|
||||
|
||||
void storage_service::node_ops_abort(utils::UUID ops_uuid) {
|
||||
slogger.debug("node_ops_abort: ops_uuid={}", ops_uuid);
|
||||
auto permit = seastar::get_units(_node_ops_abort_sem, 1);
|
||||
auto permit = seastar::get_units(_node_ops_abort_sem, 1).get0();
|
||||
auto it = _node_ops.find(ops_uuid);
|
||||
if (it != _node_ops.end()) {
|
||||
node_ops_meta_data& meta = it->second;
|
||||
|
||||
@@ -49,12 +49,13 @@ private:
|
||||
public:
|
||||
partition_index_cache* _parent;
|
||||
key_type _key;
|
||||
std::variant<shared_promise<>, partition_index_page> _page;
|
||||
std::variant<lw_shared_ptr<shared_promise<>>, partition_index_page> _page;
|
||||
size_t _size_in_allocator = 0;
|
||||
public:
|
||||
entry(partition_index_cache* parent, key_type key)
|
||||
: _parent(parent)
|
||||
, _key(key)
|
||||
, _page(make_lw_shared<shared_promise<>>())
|
||||
{ }
|
||||
|
||||
void set_page(partition_index_page&& page) noexcept {
|
||||
@@ -67,7 +68,12 @@ private:
|
||||
entry(entry&&) noexcept = default;
|
||||
|
||||
~entry() {
|
||||
assert(!is_referenced());
|
||||
if (is_referenced()) {
|
||||
// Live entry_ptr should keep the entry alive, except when the entry failed on loading.
|
||||
// In that case, entry_ptr holders are not supposed to use the pointer, so it's safe
|
||||
// to nullify those entry_ptrs.
|
||||
assert(!ready());
|
||||
}
|
||||
}
|
||||
|
||||
void on_evicted() noexcept override;
|
||||
@@ -76,7 +82,7 @@ private:
|
||||
// Always returns the same value for a given state of _page.
|
||||
size_t size_in_allocator() const { return _size_in_allocator; }
|
||||
|
||||
shared_promise<>& promise() { return std::get<shared_promise<>>(_page); }
|
||||
lw_shared_ptr<shared_promise<>> promise() { return std::get<lw_shared_ptr<shared_promise<>>>(_page); }
|
||||
bool ready() const { return std::holds_alternative<partition_index_page>(_page); }
|
||||
partition_index_page& page() { return std::get<partition_index_page>(_page); }
|
||||
const partition_index_page& page() const { return std::get<partition_index_page>(_page); }
|
||||
@@ -207,9 +213,7 @@ public:
|
||||
return make_ready_future<entry_ptr>(std::move(ptr));
|
||||
} else {
|
||||
++_shard_stats.blocks;
|
||||
return _as(_region, [ptr] () mutable {
|
||||
return ptr.get_entry().promise().get_shared_future();
|
||||
}).then([ptr] () mutable {
|
||||
return ptr.get_entry().promise()->get_shared_future().then([ptr] () mutable {
|
||||
return std::move(ptr);
|
||||
});
|
||||
}
|
||||
@@ -234,23 +238,23 @@ public:
|
||||
|
||||
// No exceptions before then_wrapped() is installed so that ptr will be eventually populated.
|
||||
|
||||
return futurize_invoke(loader, key).then_wrapped([this, key, ptr] (auto&& f) mutable {
|
||||
return futurize_invoke(loader, key).then_wrapped([this, key, ptr = std::move(ptr)] (auto&& f) mutable {
|
||||
entry& e = ptr.get_entry();
|
||||
try {
|
||||
partition_index_page&& page = f.get0();
|
||||
e.promise().set_value();
|
||||
e.promise()->set_value();
|
||||
e.set_page(std::move(page));
|
||||
_shard_stats.used_bytes += e.size_in_allocator();
|
||||
++_shard_stats.populations;
|
||||
return ptr;
|
||||
} catch (...) {
|
||||
e.promise().set_exception(std::current_exception());
|
||||
e.promise()->set_exception(std::current_exception());
|
||||
ptr = {};
|
||||
with_allocator(_region.allocator(), [&] {
|
||||
_cache.erase(key);
|
||||
});
|
||||
throw;
|
||||
}
|
||||
}).then([ptr] {
|
||||
return ptr;
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
@@ -400,10 +400,15 @@ void time_series_sstable_set::for_each_sstable(std::function<void(const shared_s
|
||||
|
||||
// O(log n)
|
||||
void time_series_sstable_set::insert(shared_sstable sst) {
|
||||
try {
|
||||
auto min_pos = sst->min_position();
|
||||
auto max_pos_reversed = sst->max_position().reversed();
|
||||
_sstables->emplace(std::move(min_pos), sst);
|
||||
_sstables_reversed->emplace(std::move(max_pos_reversed), std::move(sst));
|
||||
} catch (...) {
|
||||
erase(sst);
|
||||
throw;
|
||||
}
|
||||
}
|
||||
|
||||
// O(n) worst case, but should be close to O(log n) most of the time
|
||||
|
||||
3
table.cc
3
table.cc
@@ -1493,13 +1493,14 @@ bool table::can_flush() const {
|
||||
}
|
||||
|
||||
future<> table::clear() {
|
||||
auto permits = co_await _config.dirty_memory_manager->get_all_flush_permits();
|
||||
if (_commitlog) {
|
||||
for (auto& t : *_memtables) {
|
||||
_commitlog->discard_completed_segments(_schema->id(), t->get_and_discard_rp_set());
|
||||
}
|
||||
}
|
||||
_memtables->clear_and_add();
|
||||
return _cache.invalidate(row_cache::external_updater([] { /* There is no underlying mutation source */ }));
|
||||
co_await _cache.invalidate(row_cache::external_updater([] { /* There is no underlying mutation source */ }));
|
||||
}
|
||||
|
||||
// NOTE: does not need to be futurized, but might eventually, depending on
|
||||
|
||||
43
test.py
43
test.py
@@ -291,6 +291,8 @@ class Test:
|
||||
def print_summary(self):
|
||||
pass
|
||||
|
||||
def get_junit_etree(self):
    """Return the JUnit XML tree for this test; this implementation has none."""
    return None
|
||||
|
||||
def check_log(self, trim):
|
||||
"""Check and trim logs and xml output for tests which have it"""
|
||||
@@ -338,9 +340,36 @@ class BoostTest(UnitTest):
|
||||
boost_args += ['--color_output=false']
|
||||
boost_args += ['--']
|
||||
self.args = boost_args + self.args
|
||||
self.casename = casename
|
||||
self.__junit_etree = None
|
||||
|
||||
def get_junit_etree(self):
    """Return the parsed XML report of this test, parsing it on first use.

    On the first call the file at self.xmlout is parsed, every TestSuite
    element is renamed, TestCase children marked reason="disabled" are
    dropped, and the file is deleted. Later calls return the cached tree.
    """
    if self.__junit_etree is not None:
        return self.__junit_etree

    import re

    def jenkins_suite_name(path):
        # Jenkins expects the suite name to look like a class name, so
        # "path/to/file.cc" becomes "path.to.file": the "test/" prefix and
        # the ".cc" suffix carry no information and are dropped. The mode is
        # appended, otherwise failures in different modes would be
        # indistinguishable.
        trimmed = re.sub(r'^test/', '', path)
        trimmed = re.sub(r'\.cc$', '', trimmed)
        dotted = trimmed.replace('/', '.')
        return f'{dotted}.{self.mode}'

    tree = ET.parse(self.xmlout)
    # Cache before post-processing, so the tree is remembered as soon as parsing succeeds.
    self.__junit_etree = tree
    for suite in tree.getroot().findall('.//TestSuite'):
        suite.attrib['name'] = jenkins_suite_name(suite.attrib['name'])
        for disabled_case in suite.findall('./TestCase[@reason="disabled"]'):
            suite.remove(disabled_case)
    os.unlink(self.xmlout)
    return tree
|
||||
|
||||
def check_log(self, trim):
|
||||
ET.parse(self.xmlout)
|
||||
self.get_junit_etree()
|
||||
super().check_log(trim)
|
||||
|
||||
|
||||
@@ -800,6 +829,17 @@ def write_junit_report(tmpdir, mode):
|
||||
with open(junit_filename, "w") as f:
|
||||
ET.ElementTree(xml_results).write(f, encoding="unicode")
|
||||
|
||||
def write_consolidated_boost_junit_xml(tmpdir, mode):
    """Merge the XML trees of all tests run in `mode` into one report file.

    Every TestSuite element found in a test's tree is appended under a
    single TestLog root, which is written to
    {tmpdir}/{mode}/xml/boost.xunit.xml. Tests whose get_junit_etree()
    returns None contribute nothing.
    """
    merged_root = ET.Element("TestLog")
    for test_suite in TestSuite.suites.values():
        for candidate in test_suite.tests:
            if candidate.mode == mode:
                tree = candidate.get_junit_etree()
                if tree is not None:
                    merged_root.extend(tree.getroot().findall('.//TestSuite'))
    destination = f'{tmpdir}/{mode}/xml/boost.xunit.xml'
    ET.ElementTree(merged_root).write(destination, encoding='unicode')
|
||||
|
||||
def open_log(tmpdir):
|
||||
pathlib.Path(tmpdir).mkdir(parents=True, exist_ok=True)
|
||||
@@ -839,6 +879,7 @@ async def main():
|
||||
|
||||
for mode in options.modes:
|
||||
write_junit_report(options.tmpdir, mode)
|
||||
write_consolidated_boost_junit_xml(options.tmpdir, mode)
|
||||
|
||||
if 'coverage' in options.modes:
|
||||
coverage.generate_coverage_report("build/coverage", "tests")
|
||||
|
||||
@@ -374,6 +374,14 @@ def test_getitem_attributes_to_get_duplicate(dynamodb, test_table):
|
||||
with pytest.raises(ClientError, match='ValidationException.*Duplicate'):
|
||||
test_table.get_item(Key={'p': p, 'c': c}, AttributesToGet=['a', 'a'], ConsistentRead=True)
|
||||
|
||||
# GetItem with an empty AttributesToGet list must be rejected with a
# ValidationException. Reproduces issue #10332.
def test_getitem_attributes_to_get_empty(dynamodb, test_table):
    key = {'p': random_string(), 'c': random_string()}
    with pytest.raises(ClientError, match='ValidationException'):
        test_table.get_item(Key=key, AttributesToGet=[], ConsistentRead=True)
|
||||
|
||||
# Basic test for DeleteItem, with hash key only
|
||||
def test_delete_item_hash(test_table_s):
|
||||
p = random_string()
|
||||
|
||||
@@ -170,6 +170,13 @@ def test_query_attributes_to_get(dynamodb, test_table):
|
||||
expected_items = [{k: x[k] for k in wanted if k in x} for x in items]
|
||||
assert multiset(expected_items) == multiset(got_items)
|
||||
|
||||
# Query with an empty AttributesToGet list must be rejected with a
# ValidationException. Reproduces issue #10332.
def test_query_attributes_to_get_empty(dynamodb, test_table):
    partition_condition = {'AttributeValueList': [random_string()], 'ComparisonOperator': 'EQ'}
    with pytest.raises(ClientError, match='ValidationException'):
        full_query(test_table, KeyConditions={'p': partition_condition}, AttributesToGet=[])
|
||||
|
||||
# Test that in a table with both hash key and sort key, which keys we can
|
||||
# Query by: We can Query by the hash key, by a combination of both hash and
|
||||
# sort keys, but *cannot* query by just the sort key, and obviously not
|
||||
|
||||
@@ -16,6 +16,9 @@
|
||||
# along with Scylla. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
# Tests for basic table operations: CreateTable, DeleteTable, ListTables.
|
||||
# Also some basic tests for UpdateTable - although UpdateTable usually
|
||||
# enables more elaborate features (such as GSI or Streams) and those are
|
||||
# tested elsewhere.
|
||||
|
||||
import pytest
|
||||
from botocore.exceptions import ClientError
|
||||
@@ -311,3 +314,17 @@ def test_table_sse_off(dynamodb):
|
||||
KeySchema=[{ 'AttributeName': 'p', 'KeyType': 'HASH' }],
|
||||
AttributeDefinitions=[{ 'AttributeName': 'p', 'AttributeType': 'S' }]);
|
||||
table.delete();
|
||||
|
||||
# Test that trying to delete a table that doesn't exist fails in the
|
||||
# appropriate way (ResourceNotFoundException)
|
||||
def test_delete_table_non_existent(dynamodb, test_table):
|
||||
client = dynamodb.meta.client
|
||||
with pytest.raises(ClientError, match='ResourceNotFoundException'):
|
||||
client.delete_table(TableName=random_string(20))
|
||||
|
||||
# Test that trying to update a table that doesn't exist fails in the
|
||||
# appropriate way (ResourceNotFoundException)
|
||||
def test_update_table_non_existent(dynamodb, test_table):
|
||||
client = dynamodb.meta.client
|
||||
with pytest.raises(ClientError, match='ResourceNotFoundException'):
|
||||
client.update_table(TableName=random_string(20), BillingMode='PAY_PER_REQUEST')
|
||||
|
||||
@@ -1043,6 +1043,20 @@ def test_nested_attribute_remove_from_missing_item(test_table_s):
|
||||
test_table_s.update_item(Key={'p': p}, UpdateExpression='REMOVE x.y')
|
||||
test_table_s.update_item(Key={'p': p}, UpdateExpression='REMOVE x[0]')
|
||||
|
||||
# Though in an above test (test_nested_attribute_update_bad_path_dot) we
|
||||
# showed that DynamoDB does not allow REMOVE x.y if attribute x doesn't
|
||||
# exist - and generates a ValidationException, if x *does* exist but y
|
||||
# doesn't, it's fine and the removal should just be silently ignored.
|
||||
def test_nested_attribute_remove_missing_leaf(test_table_s):
|
||||
p = random_string()
|
||||
item = {'p': p, 'a': {'x': 3}, 'b': ['hi']}
|
||||
test_table_s.put_item(Item=item)
|
||||
test_table_s.update_item(Key={'p': p}, UpdateExpression='REMOVE a.y')
|
||||
test_table_s.update_item(Key={'p': p}, UpdateExpression='REMOVE b[7]')
|
||||
test_table_s.update_item(Key={'p': p}, UpdateExpression='REMOVE c')
|
||||
# The above UpdateItem calls didn't change anything...
|
||||
assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item'] == item
|
||||
|
||||
# Similarly for other types of bad paths - using [0] on something which
|
||||
# doesn't exist or isn't an array.
|
||||
def test_nested_attribute_update_bad_path_array(test_table_s):
|
||||
|
||||
@@ -19,6 +19,7 @@
|
||||
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#include <boost/range/irange.hpp>
|
||||
#include <seastar/testing/test_case.hh>
|
||||
#include <seastar/testing/thread_test_case.hh>
|
||||
#include <seastar/core/iostream.hh>
|
||||
@@ -49,6 +50,15 @@ static sstring read_to_string(cached_file::stream& s, size_t limit = std::numeri
|
||||
return b.substr(0, limit);
|
||||
}
|
||||
|
||||
static void read_to_void(cached_file::stream& s, size_t limit = std::numeric_limits<size_t>::max()) {
|
||||
while (auto buf = s.next().get0()) {
|
||||
if (buf.size() >= limit) {
|
||||
break;
|
||||
}
|
||||
limit -= buf.size();
|
||||
}
|
||||
}
|
||||
|
||||
static sstring read_to_string(file& f, size_t start, size_t len) {
|
||||
file_input_stream_options opt;
|
||||
auto in = make_file_input_stream(f, start, len, opt);
|
||||
@@ -61,6 +71,12 @@ static sstring read_to_string(cached_file& cf, size_t off, size_t limit = std::n
|
||||
return read_to_string(s, limit);
|
||||
}
|
||||
|
||||
[[gnu::unused]]
|
||||
static void read_to_void(cached_file& cf, size_t off, size_t limit = std::numeric_limits<size_t>::max()) {
|
||||
auto s = cf.read(off, default_priority_class(), std::nullopt);
|
||||
read_to_void(s, limit);
|
||||
}
|
||||
|
||||
struct test_file {
|
||||
tmpdir dir;
|
||||
file f;
|
||||
@@ -204,7 +220,9 @@ SEASTAR_THREAD_TEST_CASE(test_eviction_via_lru) {
|
||||
}
|
||||
|
||||
{
|
||||
cf_lru.evict_all();
|
||||
with_allocator(region.allocator(), [] {
|
||||
cf_lru.evict_all();
|
||||
});
|
||||
|
||||
BOOST_REQUIRE_EQUAL(0, metrics.cached_bytes); // change here
|
||||
BOOST_REQUIRE_EQUAL(0, cf.cached_bytes()); // change here
|
||||
@@ -212,6 +230,8 @@ SEASTAR_THREAD_TEST_CASE(test_eviction_via_lru) {
|
||||
BOOST_REQUIRE_EQUAL(3, metrics.page_evictions); // change here
|
||||
BOOST_REQUIRE_EQUAL(0, metrics.page_hits);
|
||||
BOOST_REQUIRE_EQUAL(3, metrics.page_populations);
|
||||
|
||||
BOOST_REQUIRE_EQUAL(region.occupancy().used_space(), 0);
|
||||
}
|
||||
|
||||
{
|
||||
@@ -255,6 +275,88 @@ SEASTAR_THREAD_TEST_CASE(test_eviction_via_lru) {
|
||||
}
|
||||
}
|
||||
|
||||
// A file which serves garbage but is very fast.
|
||||
class garbage_file_impl : public file_impl {
|
||||
private:
|
||||
[[noreturn]] void unsupported() {
|
||||
throw_with_backtrace<std::logic_error>("unsupported operation");
|
||||
}
|
||||
public:
|
||||
// unsupported
|
||||
virtual future<size_t> write_dma(uint64_t pos, const void* buffer, size_t len, const io_priority_class& pc) override { unsupported(); }
|
||||
virtual future<size_t> write_dma(uint64_t pos, std::vector<iovec> iov, const io_priority_class& pc) override { unsupported(); }
|
||||
virtual future<> flush(void) override { unsupported(); }
|
||||
virtual future<> truncate(uint64_t length) override { unsupported(); }
|
||||
virtual future<> discard(uint64_t offset, uint64_t length) override { unsupported(); }
|
||||
virtual future<> allocate(uint64_t position, uint64_t length) override { unsupported(); }
|
||||
virtual subscription<directory_entry> list_directory(std::function<future<>(directory_entry)>) override { unsupported(); }
|
||||
virtual future<struct stat> stat(void) override { unsupported(); }
|
||||
virtual future<uint64_t> size(void) override { unsupported(); }
|
||||
virtual std::unique_ptr<seastar::file_handle_impl> dup() override { unsupported(); }
|
||||
|
||||
virtual future<> close() override { return make_ready_future<>(); }
|
||||
|
||||
virtual future<temporary_buffer<uint8_t>> dma_read_bulk(uint64_t offset, size_t size, const io_priority_class& pc) override {
|
||||
return make_ready_future<temporary_buffer<uint8_t>>(temporary_buffer<uint8_t>(size));
|
||||
}
|
||||
|
||||
virtual future<size_t> read_dma(uint64_t pos, void* buffer, size_t len, const io_priority_class& pc) override {
|
||||
unsupported(); // FIXME
|
||||
}
|
||||
|
||||
virtual future<size_t> read_dma(uint64_t pos, std::vector<iovec> iov, const io_priority_class& pc) override {
|
||||
unsupported(); // FIXME
|
||||
}
|
||||
};
|
||||
|
||||
#ifndef SEASTAR_DEFAULT_ALLOCATOR // Eviction works only with the seastar allocator
|
||||
SEASTAR_THREAD_TEST_CASE(test_stress_eviction) {
|
||||
auto page_size = cached_file::page_size;
|
||||
auto n_pages = 8'000'000 / page_size;
|
||||
auto file_size = page_size * n_pages;
|
||||
auto cached_size = 4'000'000;
|
||||
|
||||
cached_file::metrics metrics;
|
||||
logalloc::region region;
|
||||
|
||||
auto f = file(make_shared<garbage_file_impl>());
|
||||
cached_file cf(f, metrics, cf_lru, region, file_size);
|
||||
|
||||
region.make_evictable([&] {
|
||||
testlog.trace("Evicting");
|
||||
cf.invalidate_at_most_front(file_size / 2);
|
||||
return cf_lru.evict();
|
||||
});
|
||||
|
||||
for (int i = 0; i < (cached_size / page_size); ++i) {
|
||||
read_to_string(cf, page_size * i, page_size);
|
||||
}
|
||||
|
||||
testlog.debug("Saturating memory...");
|
||||
|
||||
// Disable background reclaiming which will prevent bugs from reproducing
|
||||
// We want reclamation to happen synchronously with page cache population in read_to_void()
|
||||
seastar::memory::set_min_free_pages(0);
|
||||
|
||||
// Saturate std memory
|
||||
chunked_fifo<bytes> blobs;
|
||||
auto rc = region.reclaim_counter();
|
||||
while (region.reclaim_counter() == rc) {
|
||||
blobs.emplace_back(bytes(bytes::initialized_later(), 1024));
|
||||
}
|
||||
|
||||
testlog.debug("Memory: allocated={}, free={}", seastar::memory::stats().allocated_memory(), seastar::memory::stats().free_memory());
|
||||
testlog.debug("Starting test...");
|
||||
|
||||
for (int j = 0; j < n_pages * 16; ++j) {
|
||||
testlog.trace("Allocating");
|
||||
auto stride = tests::random::get_int(1, 20);
|
||||
auto page_idx = tests::random::get_int(n_pages - stride);
|
||||
read_to_void(cf, page_idx * page_size, page_size * stride);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
SEASTAR_THREAD_TEST_CASE(test_invalidation) {
|
||||
auto page_size = cached_file::page_size;
|
||||
test_file tf = make_test_file(page_size * 2);
|
||||
|
||||
@@ -25,6 +25,8 @@
|
||||
#include <deque>
|
||||
#include <random>
|
||||
#include "utils/lsa/chunked_managed_vector.hh"
|
||||
#include "utils/managed_ref.hh"
|
||||
#include "test/lib/log.hh"
|
||||
|
||||
#include <boost/range/algorithm/sort.hpp>
|
||||
#include <boost/range/algorithm/equal.hpp>
|
||||
@@ -216,3 +218,106 @@ SEASTAR_TEST_CASE(tests_reserve_partial) {
|
||||
});
|
||||
return make_ready_future<>();
|
||||
}
|
||||
|
||||
SEASTAR_TEST_CASE(test_clear_and_release) {
|
||||
region region;
|
||||
allocating_section as;
|
||||
|
||||
with_allocator(region.allocator(), [&] {
|
||||
lsa::chunked_managed_vector<managed_ref<uint64_t>> v;
|
||||
|
||||
for (uint64_t i = 1; i < 4000; ++i) {
|
||||
as(region, [&] {
|
||||
v.emplace_back(make_managed<uint64_t>(i));
|
||||
});
|
||||
}
|
||||
|
||||
v.clear_and_release();
|
||||
});
|
||||
|
||||
return make_ready_future<>();
|
||||
}
|
||||
|
||||
SEASTAR_TEST_CASE(test_chunk_reserve) {
|
||||
region region;
|
||||
allocating_section as;
|
||||
|
||||
for (auto conf :
|
||||
{ // std::make_pair(reserve size, push count)
|
||||
std::make_pair(0, 4000),
|
||||
std::make_pair(100, 4000),
|
||||
std::make_pair(200, 4000),
|
||||
std::make_pair(1000, 4000),
|
||||
std::make_pair(2000, 4000),
|
||||
std::make_pair(3000, 4000),
|
||||
std::make_pair(5000, 4000),
|
||||
std::make_pair(500, 8000),
|
||||
std::make_pair(1000, 8000),
|
||||
std::make_pair(2000, 8000),
|
||||
std::make_pair(8000, 500),
|
||||
})
|
||||
{
|
||||
with_allocator(region.allocator(), [&] {
|
||||
auto [reserve_size, push_count] = conf;
|
||||
testlog.info("Testing reserve({}), {}x emplace_back()", reserve_size, push_count);
|
||||
lsa::chunked_managed_vector<managed_ref<uint64_t>> v;
|
||||
v.reserve(reserve_size);
|
||||
uint64_t seed = rand();
|
||||
for (uint64_t i = 0; i < push_count; ++i) {
|
||||
as(region, [&] {
|
||||
v.emplace_back(make_managed<uint64_t>(seed + i));
|
||||
BOOST_REQUIRE(**v.begin() == seed);
|
||||
});
|
||||
}
|
||||
auto v_it = v.begin();
|
||||
for (uint64_t i = 0; i < push_count; ++i) {
|
||||
BOOST_REQUIRE(**v_it++ == seed + i);
|
||||
}
|
||||
v.clear_and_release();
|
||||
});
|
||||
}
|
||||
|
||||
return make_ready_future<>();
|
||||
}
|
||||
|
||||
// Tests the case of make_room() invoked with last_chunk_capacity_deficit but _size not in
|
||||
// the last reserved chunk.
|
||||
SEASTAR_TEST_CASE(test_shrinking_and_expansion_involving_chunk_boundary) {
|
||||
region region;
|
||||
allocating_section as;
|
||||
|
||||
with_allocator(region.allocator(), [&] {
|
||||
lsa::chunked_managed_vector<managed_ref<uint64_t>> v;
|
||||
|
||||
// Fill two chunks
|
||||
v.reserve(2000);
|
||||
for (uint64_t i = 0; i < 2000; ++i) {
|
||||
as(region, [&] {
|
||||
v.emplace_back(make_managed<uint64_t>(i));
|
||||
});
|
||||
}
|
||||
|
||||
// Make the last chunk smaller than max size to trigger the last_chunk_capacity_deficit path in make_room()
|
||||
v.shrink_to_fit();
|
||||
|
||||
// Leave the last chunk reserved but empty
|
||||
for (uint64_t i = 0; i < 1000; ++i) {
|
||||
v.pop_back();
|
||||
}
|
||||
|
||||
// Try to reserve more than the currently reserved capacity and trigger last_chunk_capacity_deficit path
|
||||
// with _size not in the last chunk. Should not sigsegv.
|
||||
v.reserve(8000);
|
||||
|
||||
for (uint64_t i = 0; i < 2000; ++i) {
|
||||
as(region, [&] {
|
||||
v.emplace_back(make_managed<uint64_t>(i));
|
||||
});
|
||||
}
|
||||
|
||||
v.clear_and_release();
|
||||
});
|
||||
|
||||
return make_ready_future<>();
|
||||
}
|
||||
|
||||
|
||||
@@ -191,3 +191,32 @@ BOOST_AUTO_TEST_CASE(tests_reserve_partial) {
|
||||
BOOST_REQUIRE_EQUAL(v.capacity(), orig_size);
|
||||
}
|
||||
}
|
||||
|
||||
// Tests the case of make_room() invoked with last_chunk_capacity_deficit but _size not in
|
||||
// the last reserved chunk.
|
||||
BOOST_AUTO_TEST_CASE(test_shrinking_and_expansion_involving_chunk_boundary) {
|
||||
using vector_type = utils::chunked_vector<std::unique_ptr<uint64_t>>;
|
||||
vector_type v;
|
||||
|
||||
// Fill two chunks
|
||||
v.reserve(vector_type::max_chunk_capacity() * 3 / 2);
|
||||
for (uint64_t i = 0; i < vector_type::max_chunk_capacity() * 3 / 2; ++i) {
|
||||
v.emplace_back(std::make_unique<uint64_t>(i));
|
||||
}
|
||||
|
||||
// Make the last chunk smaller than max size to trigger the last_chunk_capacity_deficit path in make_room()
|
||||
v.shrink_to_fit();
|
||||
|
||||
// Leave the last chunk reserved but empty
|
||||
for (uint64_t i = 0; i < vector_type::max_chunk_capacity(); ++i) {
|
||||
v.pop_back();
|
||||
}
|
||||
|
||||
// Try to reserve more than the currently reserved capacity and trigger last_chunk_capacity_deficit path
|
||||
// with _size not in the last chunk. Should not sigsegv.
|
||||
v.reserve(vector_type::max_chunk_capacity() * 4);
|
||||
|
||||
for (uint64_t i = 0; i < vector_type::max_chunk_capacity() * 2; ++i) {
|
||||
v.emplace_back(std::make_unique<uint64_t>(i));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -44,7 +44,9 @@
|
||||
#include "test/lib/tmpdir.hh"
|
||||
#include "db/commitlog/commitlog.hh"
|
||||
#include "db/commitlog/commitlog_replayer.hh"
|
||||
#include "db/commitlog/commitlog_extensions.hh"
|
||||
#include "db/commitlog/rp_set.hh"
|
||||
#include "db/extensions.hh"
|
||||
#include "log.hh"
|
||||
#include "service/priority_manager.hh"
|
||||
#include "test/lib/exception_utils.hh"
|
||||
@@ -947,3 +949,113 @@ SEASTAR_TEST_CASE(test_commitlog_deadlock_with_flush_threshold) {
|
||||
co_await log.clear();
|
||||
}
|
||||
}
|
||||
|
||||
static future<> do_test_exception_in_allocate_ex(bool do_file_delete, bool reuse = true) {
|
||||
commitlog::config cfg;
|
||||
|
||||
constexpr auto max_size_mb = 1;
|
||||
|
||||
cfg.commitlog_segment_size_in_mb = max_size_mb;
|
||||
cfg.commitlog_total_space_in_mb = 2 * max_size_mb * smp::count;
|
||||
cfg.commitlog_sync_period_in_ms = 10;
|
||||
cfg.reuse_segments = reuse;
|
||||
cfg.allow_going_over_size_limit = false; // #9348 - now can enforce size limit always
|
||||
cfg.use_o_dsync = true; // make sure we pre-allocate.
|
||||
|
||||
// not using cl_test, because we need to be able to abandon
|
||||
// the log.
|
||||
|
||||
tmpdir tmp;
|
||||
cfg.commit_log_location = tmp.path().string();
|
||||
|
||||
class myfail : public std::exception {
|
||||
public:
|
||||
using std::exception::exception;
|
||||
};
|
||||
|
||||
struct myext: public db::commitlog_file_extension {
|
||||
public:
|
||||
bool fail = false;
|
||||
bool thrown = false;
|
||||
bool do_file_delete;
|
||||
|
||||
myext(bool dd)
|
||||
: do_file_delete(dd)
|
||||
{}
|
||||
|
||||
seastar::future<seastar::file> wrap_file(const seastar::sstring& filename, seastar::file f, seastar::open_flags flags) override {
|
||||
if (fail && !thrown) {
|
||||
thrown = true;
|
||||
if (do_file_delete) {
|
||||
co_await f.close();
|
||||
co_await seastar::remove_file(filename);
|
||||
}
|
||||
throw myfail{};
|
||||
}
|
||||
co_return f;
|
||||
}
|
||||
seastar::future<> before_delete(const seastar::sstring&) override {
|
||||
co_return;
|
||||
}
|
||||
};
|
||||
|
||||
auto ep = std::make_unique<myext>(do_file_delete);
|
||||
auto& mx = *ep;
|
||||
|
||||
db::extensions myexts;
|
||||
myexts.add_commitlog_file_extension("hufflepuff", std::move(ep));
|
||||
|
||||
cfg.extensions = &myexts;
|
||||
|
||||
auto log = co_await commitlog::create_commitlog(cfg);
|
||||
|
||||
rp_set rps;
|
||||
// uncomment for verbosity
|
||||
// logging::logger_registry().set_logger_level("commitlog", logging::log_level::debug);
|
||||
|
||||
auto uuid = utils::UUID_gen::get_time_UUID();
|
||||
auto size = log.max_record_size();
|
||||
|
||||
auto r = log.add_flush_handler([&](cf_id_type id, replay_position pos) {
|
||||
log.discard_completed_segments(id, rps);
|
||||
mx.fail = true;
|
||||
});
|
||||
|
||||
try {
|
||||
while (!mx.thrown) {
|
||||
rp_handle h = co_await log.add_mutation(uuid, size, db::commitlog::force_sync::no, [&](db::commitlog::output& dst) {
|
||||
dst.fill('1', size);
|
||||
});
|
||||
rps.put(std::move(h));
|
||||
}
|
||||
} catch (...) {
|
||||
BOOST_FAIL("log write timed out. maybe it is deadlocked... Will not free log. ASAN errors and leaks will follow...");
|
||||
}
|
||||
|
||||
co_await log.shutdown();
|
||||
co_await log.clear();
|
||||
}
|
||||
|
||||
/**
|
||||
* Test generating an exception in segment file allocation
|
||||
*/
|
||||
SEASTAR_TEST_CASE(test_commitlog_exceptions_in_allocate_ex) {
|
||||
co_await do_test_exception_in_allocate_ex(false);
|
||||
}
|
||||
|
||||
SEASTAR_TEST_CASE(test_commitlog_exceptions_in_allocate_ex_no_recycle) {
|
||||
co_await do_test_exception_in_allocate_ex(false, false);
|
||||
}
|
||||
|
||||
/**
|
||||
* Test generating an exception in segment file allocation, but also
|
||||
* delete the file, which in turn should cause follow-up exceptions
|
||||
* in cleanup delete. Which CL should handle
|
||||
*/
|
||||
SEASTAR_TEST_CASE(test_commitlog_exceptions_in_allocate_ex_deleted_file) {
|
||||
co_await do_test_exception_in_allocate_ex(true, false);
|
||||
}
|
||||
|
||||
SEASTAR_TEST_CASE(test_commitlog_exceptions_in_allocate_ex_deleted_file_no_recycle) {
|
||||
co_await do_test_exception_in_allocate_ex(true);
|
||||
}
|
||||
|
||||
@@ -784,3 +784,38 @@ SEASTAR_TEST_CASE(upgrade_sstables) {
|
||||
}).get();
|
||||
});
|
||||
}
|
||||
|
||||
SEASTAR_TEST_CASE(database_drop_column_family_clears_querier_cache) {
|
||||
return do_with_cql_env_thread([] (cql_test_env& e) {
|
||||
e.execute_cql("create table ks.cf (k text, v int, primary key (k));").get();
|
||||
auto& db = e.local_db();
|
||||
const auto ts = db_clock::now();
|
||||
auto& tbl = db.find_column_family("ks", "cf");
|
||||
|
||||
auto op = std::optional(tbl.read_in_progress());
|
||||
auto s = tbl.schema();
|
||||
auto q = query::data_querier(
|
||||
tbl.as_mutation_source(),
|
||||
tbl.schema(),
|
||||
database_test(db).get_user_read_concurrency_semaphore().make_tracking_only_permit(s.get(), "test", db::no_timeout),
|
||||
query::full_partition_range,
|
||||
s->full_slice(),
|
||||
default_priority_class(),
|
||||
nullptr);
|
||||
|
||||
auto f = e.db().invoke_on_all([ts] (database& db) {
|
||||
return db.drop_column_family("ks", "cf", [ts] { return make_ready_future<db_clock::time_point>(ts); });
|
||||
});
|
||||
|
||||
// we add a querier to the querier cache while the drop is ongoing
|
||||
auto& qc = db.get_querier_cache();
|
||||
qc.insert(utils::make_random_uuid(), std::move(q), nullptr);
|
||||
BOOST_REQUIRE_EQUAL(qc.get_stats().population, 1);
|
||||
|
||||
op.reset(); // this should allow the drop to finish
|
||||
f.get();
|
||||
|
||||
// the drop should have cleaned up all entries belonging to that table
|
||||
BOOST_REQUIRE_EQUAL(qc.get_stats().population, 0);
|
||||
});
|
||||
}
|
||||
|
||||
@@ -22,6 +22,8 @@
|
||||
#include <seastar/testing/test_case.hh>
|
||||
#include "test/lib/cql_test_env.hh"
|
||||
#include "test/lib/cql_assertions.hh"
|
||||
#include "cql3/untyped_result_set.hh"
|
||||
#include "cql3/query_processor.hh"
|
||||
#include "transport/messages/result_message.hh"
|
||||
|
||||
SEASTAR_TEST_CASE(test_index_with_paging) {
|
||||
@@ -56,3 +58,51 @@ SEASTAR_TEST_CASE(test_index_with_paging) {
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
SEASTAR_TEST_CASE(test_index_with_paging_with_base_short_read) {
|
||||
return do_with_cql_env_thread([] (auto& e) {
|
||||
e.execute_cql("CREATE TABLE tab (pk int, ck text, v int, v2 int, v3 text, PRIMARY KEY (pk, ck))").get();
|
||||
e.execute_cql("CREATE INDEX ON tab (v)").get();
|
||||
|
||||
// Enough to trigger a short read on the base table during scan
|
||||
sstring big_string(2 * query::result_memory_limiter::maximum_result_size, 'j');
|
||||
|
||||
const int row_count = 67;
|
||||
for (int i = 0; i < row_count; ++i) {
|
||||
e.execute_cql(format("INSERT INTO tab (pk, ck, v, v2, v3) VALUES ({}, 'hello{}', 1, {}, '{}')", i % 3, i, i, big_string)).get();
|
||||
}
|
||||
|
||||
eventually([&] {
|
||||
uint64_t count = 0;
|
||||
e.qp().local().query_internal("SELECT * FROM ks.tab WHERE v = 1", [&] (const cql3::untyped_result_set_row&) {
|
||||
++count;
|
||||
return make_ready_future<stop_iteration>(stop_iteration::no);
|
||||
}).get();
|
||||
BOOST_REQUIRE_EQUAL(count, row_count);
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
SEASTAR_TEST_CASE(test_index_with_paging_with_base_short_read_no_ck) {
|
||||
return do_with_cql_env_thread([] (auto& e) {
|
||||
e.execute_cql("CREATE TABLE tab (pk int, v int, v2 int, v3 text, PRIMARY KEY (pk))").get();
|
||||
e.execute_cql("CREATE INDEX ON tab (v)").get();
|
||||
|
||||
// Enough to trigger a short read on the base table during scan
|
||||
sstring big_string(2 * query::result_memory_limiter::maximum_result_size, 'j');
|
||||
|
||||
const int row_count = 67;
|
||||
for (int i = 0; i < row_count; ++i) {
|
||||
e.execute_cql(format("INSERT INTO tab (pk, v, v2, v3) VALUES ({}, 1, {}, '{}')", i, i, big_string)).get();
|
||||
}
|
||||
|
||||
eventually([&] {
|
||||
uint64_t count = 0;
|
||||
e.qp().local().query_internal("SELECT * FROM ks.tab WHERE v = 1", [&] (const cql3::untyped_result_set_row&) {
|
||||
++count;
|
||||
return make_ready_future<stop_iteration>(stop_iteration::no);
|
||||
}).get();
|
||||
BOOST_REQUIRE_EQUAL(count, row_count);
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
@@ -391,3 +391,87 @@ SEASTAR_TEST_CASE(test_loading_cache_reload_during_eviction) {
|
||||
BOOST_REQUIRE_EQUAL(loading_cache.size(), 1);
|
||||
});
|
||||
}
|
||||
|
||||
SEASTAR_THREAD_TEST_CASE(test_loading_cache_remove_leaves_no_old_entries_behind) {
|
||||
using namespace std::chrono;
|
||||
load_count = 0;
|
||||
|
||||
auto load_v1 = [] (auto key) { return make_ready_future<sstring>("v1"); };
|
||||
auto load_v2 = [] (auto key) { return make_ready_future<sstring>("v2"); };
|
||||
auto load_v3 = [] (auto key) { return make_ready_future<sstring>("v3"); };
|
||||
|
||||
{
|
||||
utils::loading_cache<int, sstring> loading_cache(num_loaders, 100s, testlog);
|
||||
auto stop_cache_reload = seastar::defer([&loading_cache] { loading_cache.stop().get(); });
|
||||
|
||||
//
|
||||
// Test remove() concurrent with loading
|
||||
//
|
||||
|
||||
auto f = loading_cache.get_ptr(0, [&](auto key) {
|
||||
return later().then([&] {
|
||||
return load_v1(key);
|
||||
});
|
||||
});
|
||||
|
||||
loading_cache.remove(0);
|
||||
|
||||
BOOST_REQUIRE_EQUAL(loading_cache.find(0), nullptr);
|
||||
BOOST_REQUIRE_EQUAL(loading_cache.size(), 0);
|
||||
|
||||
auto ptr1 = f.get0();
|
||||
BOOST_REQUIRE_EQUAL(*ptr1, "v1");
|
||||
|
||||
BOOST_REQUIRE_EQUAL(loading_cache.find(0), nullptr);
|
||||
BOOST_REQUIRE_EQUAL(loading_cache.size(), 0);
|
||||
|
||||
ptr1 = loading_cache.get_ptr(0, load_v2).get0();
|
||||
loading_cache.remove(0);
|
||||
BOOST_REQUIRE_EQUAL(*ptr1, "v2");
|
||||
|
||||
//
|
||||
// Test that live ptr1, removed from cache, does not prevent reload of new value
|
||||
//
|
||||
auto ptr2 = loading_cache.get_ptr(0, load_v3).get0();
|
||||
ptr1 = nullptr;
|
||||
BOOST_REQUIRE_EQUAL(*ptr2, "v3");
|
||||
}
|
||||
|
||||
// Test remove_if()
|
||||
{
|
||||
utils::loading_cache<int, sstring> loading_cache(num_loaders, 100s, testlog);
|
||||
auto stop_cache_reload = seastar::defer([&loading_cache] { loading_cache.stop().get(); });
|
||||
|
||||
//
|
||||
// Test remove_if() concurrent with loading
|
||||
//
|
||||
auto f = loading_cache.get_ptr(0, [&](auto key) {
|
||||
return later().then([&] {
|
||||
return load_v1(key);
|
||||
});
|
||||
});
|
||||
|
||||
loading_cache.remove_if([] (auto&& v) { return v == "v1"; });
|
||||
|
||||
BOOST_REQUIRE_EQUAL(loading_cache.find(0), nullptr);
|
||||
BOOST_REQUIRE_EQUAL(loading_cache.size(), 0);
|
||||
|
||||
auto ptr1 = f.get0();
|
||||
BOOST_REQUIRE_EQUAL(*ptr1, "v1");
|
||||
|
||||
BOOST_REQUIRE_EQUAL(loading_cache.find(0), nullptr);
|
||||
BOOST_REQUIRE_EQUAL(loading_cache.size(), 0);
|
||||
|
||||
ptr1 = loading_cache.get_ptr(0, load_v2).get0();
|
||||
loading_cache.remove_if([] (auto&& v) { return v == "v2"; });
|
||||
BOOST_REQUIRE_EQUAL(*ptr1, "v2");
|
||||
|
||||
//
|
||||
// Test that live ptr1, removed from cache, does not prevent reload of new value
|
||||
//
|
||||
auto ptr2 = loading_cache.get_ptr(0, load_v3).get0();
|
||||
ptr1 = nullptr;
|
||||
BOOST_REQUIRE_EQUAL(*ptr2, "v3");
|
||||
ptr2 = nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -39,6 +39,9 @@
|
||||
#include "test/lib/random_utils.hh"
|
||||
#include "test/lib/log.hh"
|
||||
#include "test/lib/reader_concurrency_semaphore.hh"
|
||||
#include "test/lib/simple_schema.hh"
|
||||
#include "test/lib/make_random_string.hh"
|
||||
#include "utils/error_injection.hh"
|
||||
|
||||
static api::timestamp_type next_timestamp() {
|
||||
static thread_local api::timestamp_type next_timestamp = 1;
|
||||
@@ -528,6 +531,74 @@ SEASTAR_TEST_CASE(test_exception_safety_of_single_partition_reads) {
|
||||
});
|
||||
}
|
||||
|
||||
SEASTAR_THREAD_TEST_CASE(test_tombstone_merging_with_multiple_versions) {
|
||||
tests::reader_concurrency_semaphore_wrapper semaphore;
|
||||
simple_schema ss;
|
||||
auto s = ss.schema();
|
||||
auto mt = make_lw_shared<memtable>(ss.schema());
|
||||
|
||||
auto pk = ss.make_pkey(0);
|
||||
auto pr = dht::partition_range::make_singular(pk);
|
||||
|
||||
auto t0 = ss.new_tombstone();
|
||||
auto t1 = ss.new_tombstone();
|
||||
auto t2 = ss.new_tombstone();
|
||||
auto t3 = ss.new_tombstone();
|
||||
|
||||
mutation m1(s, pk);
|
||||
ss.delete_range(m1, *position_range_to_clustering_range(position_range(
|
||||
position_in_partition::before_key(ss.make_ckey(0)),
|
||||
position_in_partition::for_key(ss.make_ckey(3))), *s), t1);
|
||||
ss.add_row(m1, ss.make_ckey(0), "v");
|
||||
ss.add_row(m1, ss.make_ckey(1), "v");
|
||||
|
||||
// Fill so that rd1 stays in the partition snapshot
|
||||
int n_rows = 1000;
|
||||
auto v = make_random_string(512);
|
||||
for (int i = 0; i < n_rows; ++i) {
|
||||
ss.add_row(m1, ss.make_ckey(i), v);
|
||||
}
|
||||
|
||||
mutation m2(s, pk);
|
||||
ss.delete_range(m2, *position_range_to_clustering_range(position_range(
|
||||
position_in_partition::before_key(ss.make_ckey(0)),
|
||||
position_in_partition::before_key(ss.make_ckey(1))), *s), t2);
|
||||
ss.delete_range(m2, *position_range_to_clustering_range(position_range(
|
||||
position_in_partition::before_key(ss.make_ckey(1)),
|
||||
position_in_partition::for_key(ss.make_ckey(3))), *s), t3);
|
||||
|
||||
mutation m3(s, pk);
|
||||
ss.delete_range(m3, *position_range_to_clustering_range(position_range(
|
||||
position_in_partition::before_key(ss.make_ckey(0)),
|
||||
position_in_partition::for_key(ss.make_ckey(4))), *s), t0);
|
||||
|
||||
mt->apply(m1);
|
||||
|
||||
auto rd1 = mt->make_flat_reader(s, semaphore.make_permit(), pr, s->full_slice(), default_priority_class(),
|
||||
nullptr, streamed_mutation::forwarding::no, mutation_reader::forwarding::no);
|
||||
auto close_rd1 = defer([&] { rd1.close().get(); });
|
||||
|
||||
rd1.fill_buffer().get();
|
||||
BOOST_REQUIRE(!rd1.is_end_of_stream()); // rd1 must keep the m1 version alive
|
||||
|
||||
mt->apply(m2);
|
||||
|
||||
auto rd2 = mt->make_flat_reader(s, semaphore.make_permit(), pr, s->full_slice(), default_priority_class(),
|
||||
nullptr, streamed_mutation::forwarding::no, mutation_reader::forwarding::no);
|
||||
auto close_r2 = defer([&] { rd2.close().get(); });
|
||||
|
||||
rd2.fill_buffer().get();
|
||||
BOOST_REQUIRE(!rd2.is_end_of_stream()); // rd2 must keep the m1 version alive
|
||||
|
||||
mt->apply(m3);
|
||||
|
||||
assert_that(mt->make_flat_reader(s, semaphore.make_permit(), pr))
|
||||
.has_monotonic_positions();
|
||||
|
||||
assert_that(mt->make_flat_reader(s, semaphore.make_permit(), pr))
|
||||
.produces(m1 + m2 + m3);
|
||||
}
|
||||
|
||||
SEASTAR_TEST_CASE(test_hash_is_cached) {
|
||||
return seastar::async([] {
|
||||
auto s = schema_builder("ks", "cf")
|
||||
|
||||
@@ -702,6 +702,7 @@ SEASTAR_TEST_CASE(test_cell_ordering) {
|
||||
};
|
||||
|
||||
auto assert_equal = [] (atomic_cell_view c1, atomic_cell_view c2) {
|
||||
testlog.trace("Expected {} == {}", c1, c2);
|
||||
BOOST_REQUIRE(compare_atomic_cell_for_merge(c1, c2) == 0);
|
||||
BOOST_REQUIRE(compare_atomic_cell_for_merge(c2, c1) == 0);
|
||||
};
|
||||
@@ -723,9 +724,11 @@ SEASTAR_TEST_CASE(test_cell_ordering) {
|
||||
atomic_cell::make_live(*bytes_type, 1, bytes(), expiry_2, ttl_2));
|
||||
|
||||
// Origin doesn't compare ttl (is it wise?)
|
||||
assert_equal(
|
||||
atomic_cell::make_live(*bytes_type, 1, bytes("value"), expiry_1, ttl_1),
|
||||
atomic_cell::make_live(*bytes_type, 1, bytes("value"), expiry_1, ttl_2));
|
||||
// But we do. See https://github.com/scylladb/scylla/issues/10156
|
||||
// and https://github.com/scylladb/scylla/issues/10173
|
||||
assert_order(
|
||||
atomic_cell::make_live(*bytes_type, 1, bytes("value"), expiry_1, ttl_2),
|
||||
atomic_cell::make_live(*bytes_type, 1, bytes("value"), expiry_1, ttl_1));
|
||||
|
||||
assert_order(
|
||||
atomic_cell::make_live(*bytes_type, 0, bytes("value1")),
|
||||
|
||||
@@ -560,7 +560,7 @@ SEASTAR_TEST_CASE(test_apply_to_incomplete_respects_continuity) {
|
||||
static mutation_partition read_using_cursor(partition_snapshot& snap) {
|
||||
tests::reader_concurrency_semaphore_wrapper semaphore;
|
||||
partition_snapshot_row_cursor cur(*snap.schema(), snap);
|
||||
cur.maybe_refresh();
|
||||
cur.advance_to(position_in_partition::before_all_clustered_rows());
|
||||
auto mp = read_partition_from(*snap.schema(), cur);
|
||||
for (auto&& rt : snap.range_tombstones()) {
|
||||
mp.apply_delete(*snap.schema(), rt);
|
||||
|
||||
@@ -210,6 +210,35 @@ BOOST_AUTO_TEST_CASE(test_overlapping_addition) {
|
||||
BOOST_REQUIRE(it == l.end());
|
||||
}
|
||||
|
||||
BOOST_AUTO_TEST_CASE(test_adjacent_empty_range_tombstone) {
|
||||
range_tombstone_list l(*s);
|
||||
|
||||
l.apply(*s, rtie(1, 1, 2));
|
||||
l.apply(*s, rt(1, 2, 3));
|
||||
l.apply(*s, rtei(2, 2, 2));
|
||||
l.apply(*s, rtei(2, 4, 3));
|
||||
|
||||
auto it = l.begin();
|
||||
assert_rt(rt(1, 4, 3), *it++);
|
||||
BOOST_REQUIRE(it == l.end());
|
||||
}
|
||||
|
||||
BOOST_AUTO_TEST_CASE(test_empty_range_tombstones_are_dropped) {
|
||||
range_tombstone_list l(*s);
|
||||
|
||||
l.apply(*s, rtei(0, 0, 1));
|
||||
l.apply(*s, rtie(0, 0, 1));
|
||||
l.apply(*s, rt(1, 2, 1));
|
||||
l.apply(*s, rtei(4, 4, 1));
|
||||
l.apply(*s, rtie(5, 5, 1));
|
||||
l.apply(*s, rt(7, 8, 1));
|
||||
|
||||
auto it = l.begin();
|
||||
assert_rt(rt(1, 2, 1), *it++);
|
||||
assert_rt(rt(7, 8, 1), *it++);
|
||||
BOOST_REQUIRE(it == l.end());
|
||||
}
|
||||
|
||||
BOOST_AUTO_TEST_CASE(test_simple_overlap) {
|
||||
range_tombstone_list l1(*s);
|
||||
|
||||
@@ -473,6 +502,23 @@ static std::vector<range_tombstone> make_random() {
|
||||
rts.emplace_back(std::move(start_b), std::move(end_b), tombstone(dist(gen), gc_now));
|
||||
}
|
||||
|
||||
int32_t size_empty = dist(gen) / 2;
|
||||
for (int32_t i = 0; i < size_empty; ++i) {
|
||||
clustering_key_prefix key = make_random_ckey();
|
||||
bool start_incl = dist(gen) > 25;
|
||||
if (start_incl) {
|
||||
rts.emplace_back(
|
||||
position_in_partition::before_key(key),
|
||||
position_in_partition::before_key(key),
|
||||
tombstone(dist(gen), gc_now));
|
||||
} else {
|
||||
rts.emplace_back(
|
||||
position_in_partition::after_key(key),
|
||||
position_in_partition::after_key(key),
|
||||
tombstone(dist(gen), gc_now));
|
||||
}
|
||||
}
|
||||
|
||||
return rts;
|
||||
}
|
||||
|
||||
|
||||
@@ -1242,9 +1242,13 @@ SEASTAR_TEST_CASE(test_update_failure) {
|
||||
class throttle {
|
||||
unsigned _block_counter = 0;
|
||||
promise<> _p; // valid when _block_counter != 0, resolves when goes down to 0
|
||||
std::optional<promise<>> _entered;
|
||||
bool _one_shot;
|
||||
public:
|
||||
// one_shot means whether only the first enter() after block() will block.
|
||||
throttle(bool one_shot = false) : _one_shot(one_shot) {}
|
||||
future<> enter() {
|
||||
if (_block_counter) {
|
||||
if (_block_counter && (!_one_shot || _entered)) {
|
||||
promise<> p1;
|
||||
promise<> p2;
|
||||
|
||||
@@ -1256,16 +1260,21 @@ public:
|
||||
p3.set_value();
|
||||
});
|
||||
_p = std::move(p2);
|
||||
|
||||
if (_entered) {
|
||||
_entered->set_value();
|
||||
_entered.reset();
|
||||
}
|
||||
return f1;
|
||||
} else {
|
||||
return make_ready_future<>();
|
||||
}
|
||||
}
|
||||
|
||||
void block() {
|
||||
future<> block() {
|
||||
++_block_counter;
|
||||
_p = promise<>();
|
||||
_entered = promise<>();
|
||||
return _entered->get_future();
|
||||
}
|
||||
|
||||
void unblock() {
|
||||
@@ -1410,7 +1419,7 @@ SEASTAR_TEST_CASE(test_cache_population_and_update_race) {
|
||||
mt2->apply(m);
|
||||
}
|
||||
|
||||
thr.block();
|
||||
auto f = thr.block();
|
||||
|
||||
auto m0_range = dht::partition_range::make_singular(ring[0].ring_position());
|
||||
auto rd1 = cache.make_reader(s, semaphore.make_permit(), m0_range);
|
||||
@@ -1421,6 +1430,7 @@ SEASTAR_TEST_CASE(test_cache_population_and_update_race) {
|
||||
rd2.set_max_buffer_size(1);
|
||||
auto rd2_fill_buffer = rd2.fill_buffer();
|
||||
|
||||
f.get();
|
||||
sleep(10ms).get();
|
||||
|
||||
// This update should miss on all partitions
|
||||
@@ -1548,12 +1558,13 @@ SEASTAR_TEST_CASE(test_cache_population_and_clear_race) {
|
||||
mt2->apply(m);
|
||||
}
|
||||
|
||||
thr.block();
|
||||
auto f = thr.block();
|
||||
|
||||
auto rd1 = cache.make_reader(s, semaphore.make_permit());
|
||||
rd1.set_max_buffer_size(1);
|
||||
auto rd1_fill_buffer = rd1.fill_buffer();
|
||||
|
||||
f.get();
|
||||
sleep(10ms).get();
|
||||
|
||||
// This update should miss on all partitions
|
||||
@@ -3777,3 +3788,81 @@ SEASTAR_TEST_CASE(test_scans_erase_dummies) {
|
||||
BOOST_REQUIRE_EQUAL(tracker.get_stats().rows, 2);
|
||||
});
|
||||
}
|
||||
|
||||
// Scenario: a read of [0, 2] is parked inside the throttled underlying source, a
// second read populates (2, 3], two rows are evicted, and then the parked read is
// released. Every read performed through the cache must equal the same read done
// directly on the memtable.
SEASTAR_TEST_CASE(test_eviction_of_upper_bound_of_population_range) {
    return seastar::async([] {
        simple_schema ss;
        tests::reader_concurrency_semaphore_wrapper semaphore;
        auto underlying_mt = make_lw_shared<memtable>(ss.schema());

        auto pk = ss.make_pkey("pk");

        // Underlying data: one partition with rows ck(1) and ck(2).
        mutation base_rows(ss.schema(), pk);
        ss.add_row(base_rows, ss.make_ckey(1), "v1");
        ss.add_row(base_rows, ss.make_ckey(2), "v2");
        underlying_mt->apply(base_rows);

        cache_tracker tracker;
        // Constructed with one_shot == true.
        throttle read_throttle(true);
        auto cache_source = make_decorated_snapshot_source(
            snapshot_source([&] { return underlying_mt->as_data_source(); }),
            [&] (mutation_source src) {
                return throttled_mutation_source(read_throttle, std::move(src));
            });
        row_cache cache(ss.schema(), cache_source, tracker);

        auto single_partition = dht::partition_range::make_singular(pk);

        // Reads the clustering range made from ck(start) and ck(end) once through the
        // cache and once straight from the memtable, then requires both to be equal.
        auto check_read = [&] (int start, int end) {
            auto slice = partition_slice_builder(*ss.schema())
                .with_range(query::clustering_range::make(ss.make_ckey(start), ss.make_ckey(end)))
                .build();

            auto rd = cache.make_reader(ss.schema(), semaphore.make_permit(), single_partition, slice);
            auto close_cache_rd = deferred_close(rd);
            auto from_cache = read_mutation_from_flat_mutation_reader(rd).get0();
            // Close the cache reader before the variable is reused for the memtable reader.
            close_cache_rd.close_now();

            rd = underlying_mt->make_flat_reader(ss.schema(), semaphore.make_permit(), single_partition, slice);
            auto close_mt_rd = deferred_close(rd);
            auto from_memtable = read_mutation_from_flat_mutation_reader(rd).get0();

            BOOST_REQUIRE(from_memtable);
            assert_that(from_cache).has_mutation().is_equal_to(*from_memtable);
        };

        // populate [2]
        {
            auto only_ck2 = query::clustering_range::make_singular(ss.make_ckey(2));
            auto slice = partition_slice_builder(*ss.schema())
                .with_range(std::move(only_ck2))
                .build();
            assert_that(cache.make_reader(ss.schema(), semaphore.make_permit(), single_partition, slice))
                .has_monotonic_positions();
        }

        // From here on the throttle is blocking; the returned future is waited on below.
        auto reader_entered = read_throttle.block();

        // Read [0, 2]
        auto pending_read = seastar::async([&] {
            check_read(0, 2);
        });

        reader_entered.get();

        // populate (2, 3]
        {
            auto after_ck2_up_to_ck3 = query::clustering_range::make(
                query::clustering_range::bound(ss.make_ckey(2), false),
                query::clustering_range::bound(ss.make_ckey(3), true));
            auto slice = partition_slice_builder(*ss.schema())
                .with_range(std::move(after_ck2_up_to_ck3))
                .build();
            assert_that(cache.make_reader(ss.schema(), semaphore.make_permit(), single_partition, slice))
                .has_monotonic_positions();
        }

        testlog.trace("Evicting");
        evict_one_row(tracker); // Evicts before(0)
        evict_one_row(tracker); // Evicts ck(2)
        testlog.trace("Unblocking");

        // Release the parked read and wait for it to finish.
        read_throttle.unblock();
        pending_read.get();

        // Final consistency check over the wider range.
        check_read(0, 3);
    });
}
|
||||
|
||||
@@ -37,20 +37,30 @@ static void add_entry(logalloc::region& r,
|
||||
{
|
||||
logalloc::allocating_section as;
|
||||
as(r, [&] {
|
||||
sstables::key sst_key = sstables::key::from_partition_key(s, key);
|
||||
page._entries.push_back(make_managed<index_entry>(
|
||||
managed_bytes(sst_key.get_bytes()),
|
||||
position,
|
||||
managed_ref<promoted_index>()));
|
||||
with_allocator(r.allocator(), [&] {
|
||||
sstables::key sst_key = sstables::key::from_partition_key(s, key);
|
||||
page._entries.push_back(make_managed<index_entry>(
|
||||
managed_bytes(sst_key.get_bytes()),
|
||||
position,
|
||||
managed_ref<promoted_index>()));
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
// Builds a partition_index_page with four entries: the keys of s.make_pkey(0..3),
// each paired with a position equal to its index.
//
// Until all entries have been added, a deferred action is armed that moves the
// page into a local under r's allocator, so that local's destruction happens in
// that allocator context. The action is cancelled right before the page is returned.
static partition_index_page make_page0(logalloc::region& r, simple_schema& s) {
    partition_index_page page;

    auto page_guard = defer([&] {
        with_allocator(r.allocator(), [&] {
            // Moved-to temporary goes out of scope inside the allocator scope.
            auto doomed = std::move(page);
        });
    });

    for (int idx = 0; idx < 4; ++idx) {
        add_entry(r, *s.schema(), page, s.make_pkey(idx).key(), idx);
    }

    // Success: the caller takes over the page, so disarm the cleanup.
    page_guard.cancel();
    return page;
}
|
||||
|
||||
@@ -141,6 +151,47 @@ SEASTAR_THREAD_TEST_CASE(test_caching) {
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static future<> ignore_result(future<T>&& f) {
|
||||
return f.then_wrapped([] (auto&& f) {
|
||||
try {
|
||||
f.get();
|
||||
} catch (...) {
|
||||
// expected, silence warnings about ignored failed futures
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
// Runs two concurrent get_or_load() calls for key 0 under
// memory::with_allocation_failures(), ignoring whatever they produce, and then
// requires that a subsequent load of key 0 yields a page accepted by has_page0().
SEASTAR_THREAD_TEST_CASE(test_exception_while_loading) {
    ::lru eviction_lru;
    simple_schema s;
    logalloc::region lsa_region;
    partition_index_cache index_cache(eviction_lru, lsa_region);

    // On scope exit, evict everything from the LRU under the region's allocator.
    auto clear_lru = defer([&] {
        with_allocator(lsa_region.allocator(), [&] {
            eviction_lru.evict_all();
        });
    });

    // Loader for key 0: defers via later() and then builds the page with make_page0().
    // The key argument itself is not used.
    auto load_page0 = [&] (partition_index_cache::key_type) {
        return later().then([&] {
            return make_page0(lsa_region, s);
        });
    };

    memory::with_allocation_failures([&] {
        index_cache.evict_gently().get();
        // Both loads are started before either one is waited for.
        auto first_load = ignore_result(index_cache.get_or_load(0, load_page0));
        auto second_load = ignore_result(index_cache.get_or_load(0, load_page0));
        first_load.get();
        second_load.get();
    });

    auto page = index_cache.get_or_load(0, load_page0).get0();
    has_page0(page);
}
|
||||
|
||||
SEASTAR_THREAD_TEST_CASE(test_auto_clear) {
|
||||
::lru lru;
|
||||
simple_schema s;
|
||||
|
||||
@@ -19,6 +19,7 @@ from cassandra.cluster import ConsistencyLevel
|
||||
from cassandra.query import SimpleStatement
|
||||
|
||||
from util import new_test_table
|
||||
from nodetool import flush
|
||||
|
||||
def test_cdc_log_entries_use_cdc_streams(scylla_only, cql, test_keyspace):
|
||||
'''Test that the stream IDs chosen for CDC log entries come from the CDC generation
|
||||
@@ -44,3 +45,16 @@ def test_cdc_log_entries_use_cdc_streams(scylla_only, cql, test_keyspace):
|
||||
|
||||
assert(log_stream_ids.issubset(stream_ids))
|
||||
|
||||
|
||||
# Test for #10473 - reading logs (from sstable) after dropping
|
||||
# column in base.
|
||||
def test_cdc_alter_table_drop_column(scylla_only, cql, test_keyspace):
    """Select from the CDC log table after a column is dropped from the base table.

    Inserts two rows (one with a null ``v``) into a CDC-enabled table, flushes the
    base table and its ``_scylla_cdc_log`` table, drops column ``v`` from the base
    table and finally selects everything from the log table. There are no explicit
    assertions; the statements are simply expected to execute.
    """
    columns = "pk int primary key, v int"
    cdc_options = " with cdc = {'enabled': true}"
    with new_test_table(cql, test_keyspace, columns, cdc_options) as table:
        cql.execute(f"insert into {table} (pk, v) values (0, 0)")
        cql.execute(f"insert into {table} (pk, v) values (1, null)")
        # Flush the base table first, then its CDC log table.
        for name in (table, table + "_scylla_cdc_log"):
            flush(cql, name)
        cql.execute(f"alter table {table} drop v")
        cql.execute(f"select * from {table}_scylla_cdc_log")
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user