Compare commits: next...scylla-4.6 (84 commits)
Commits (SHA1):

8bf149fdd6, 0265d56173, e50452ba43, a205f644cb, f136b5b950, 69a1325884, ab153c9b94,
eb372d7f03, e232711e7e, 0a440b6d4a, 00bb1e8145, e30dbee2db, 2309d6b51e, b77ca07709,
bb0a38f889, c48fd03463, eb78e6d4b8, 4b1b0a55c0, 172a8628d5, 5688b125e6, 6da4acb41e,
f09cc9a01d, cd2e33ede4, 32d0698d78, 93cf43ae4b, 2f2d22a864, 5f92f54f06, 395f2459b4,
019d50bb5c, bbe775b926, 469c94ea17, 4c780d0265, 0181de1f2c, 7597a79ef9, 8f5148e921,
5694ec189f, 34d470967a, 61db571a44, 5b5a300a9e, 148a65d0d6, e3ad14d55f, 2b506c2d4a,
50aad1c668, 7bf3f37cd1, 0f7f8585f2, 2c65c4a569, f85cd289bc, 5e661af9a4, 5629b67d25,
ad632cf7fc, ca24bebcf2, 7dc5abb6f8, e8a1cfb6f8, fc312b3021, 7b82aaf939, 894a4abfae,
4dcf023470, 283788828e, 730a147ba6, 9897e83029, 1a9b64e6f6, 49fe9e2c8e, d0580c41ee,
542394c82f, 018ad3f6f4, 9b8b7efb54, 1c3e63975f, 11bb03e46d, 810e410c5d, 97f6da0c3e,
c229fe9694, ee1ca8ae4d, 6bfd322e3b, afc18d5070, 2ec22c2404, 19da778271, cbd4c13ba6,
338871802d, 8b5b1b8af6, ea89eff95d, 96421e7779, 142336ca53, 492f12248c, 7eb7a0e5fe
.gitmodules (2 changes, vendored)

```diff
@@ -1,6 +1,6 @@
 [submodule "seastar"]
 	path = seastar
-	url = ../seastar
+	url = ../scylla-seastar
 	ignore = dirty
 [submodule "swagger-ui"]
 	path = swagger-ui
```
```diff
@@ -60,7 +60,7 @@ fi
 
 # Default scylla product/version tags
 PRODUCT=scylla
-VERSION=4.6.dev
+VERSION=4.6.3
 
 if test -f version
 then
```
||||
```diff
@@ -1017,18 +1017,16 @@ future<executor::request_return_type> executor::update_table(client_state& clien
     _stats.api_operations.update_table++;
     elogger.trace("Updating table {}", request);
 
-    std::string table_name = get_table_name(request);
-    if (table_name.find(INTERNAL_TABLE_PREFIX) == 0) {
+    schema_ptr tab = get_table(_proxy, request);
+    // the ugly but harmless conversion to string_view here is because
+    // Seastar's sstring is missing a find(std::string_view) :-()
+    if (std::string_view(tab->cf_name()).find(INTERNAL_TABLE_PREFIX) == 0) {
         return make_ready_future<request_return_type>(api_error::validation(
             format("Prefix {} is reserved for accessing internal tables", INTERNAL_TABLE_PREFIX)));
     }
-    std::string keyspace_name = executor::KEYSPACE_NAME_PREFIX + table_name;
-    tracing::add_table_name(trace_state, keyspace_name, table_name);
+    tracing::add_table_name(trace_state, tab->ks_name(), tab->cf_name());
 
-    auto& db = _proxy.get_db().local();
-    auto& cf = db.find_column_family(keyspace_name, table_name);
-
-    schema_builder builder(cf.schema());
+    schema_builder builder(tab);
 
     rjson::value* stream_specification = rjson::find(request, "StreamSpecification");
     if (stream_specification && stream_specification->IsObject()) {
@@ -2481,8 +2479,8 @@ static bool hierarchy_actions(
             // attr member so we can use add()
             rjson::add_with_string_name(v, attr, std::move(*newv));
         } else {
-            throw api_error::validation(format("Can't remove document path {} - not present in item",
-                subh.get_value()._path));
+            // Removing a.b when a is a map but a.b doesn't exist
+            // is silently ignored. It's not considered an error.
         }
     } else {
         throw api_error::validation(format("UpdateExpression: document paths not valid for this item:{}", h));
```
```diff
@@ -79,6 +79,49 @@ atomic_cell::atomic_cell(const abstract_type& type, atomic_cell_view other)
     set_view(_data);
 }
 
+// Based on:
+//  - org.apache.cassandra.db.AbstractCell#reconcile()
+//  - org.apache.cassandra.db.BufferExpiringCell#reconcile()
+//  - org.apache.cassandra.db.BufferDeletedCell#reconcile()
+std::strong_ordering
+compare_atomic_cell_for_merge(atomic_cell_view left, atomic_cell_view right) {
+    if (left.timestamp() != right.timestamp()) {
+        return left.timestamp() <=> right.timestamp();
+    }
+    if (left.is_live() != right.is_live()) {
+        return left.is_live() ? std::strong_ordering::less : std::strong_ordering::greater;
+    }
+    if (left.is_live()) {
+        auto c = compare_unsigned(left.value(), right.value()) <=> 0;
+        if (c != 0) {
+            return c;
+        }
+        if (left.is_live_and_has_ttl() != right.is_live_and_has_ttl()) {
+            // prefer expiring cells.
+            return left.is_live_and_has_ttl() ? std::strong_ordering::greater : std::strong_ordering::less;
+        }
+        if (left.is_live_and_has_ttl()) {
+            if (left.expiry() != right.expiry()) {
+                return left.expiry() <=> right.expiry();
+            } else {
+                // prefer the cell that was written later,
+                // so it survives longer after it expires, until purged.
+                return right.ttl() <=> left.ttl();
+            }
+        }
+    } else {
+        // Both are deleted
+
+        // Origin compares big-endian serialized deletion time. That's because it
+        // delegates to AbstractCell.reconcile() which compares values after
+        // comparing timestamps, which in case of deleted cells will hold
+        // serialized expiry.
+        return (uint64_t) left.deletion_time().time_since_epoch().count()
+               <=> (uint64_t) right.deletion_time().time_since_epoch().count();
+    }
+    return std::strong_ordering::equal;
+}
+
 atomic_cell_or_collection atomic_cell_or_collection::copy(const abstract_type& type) const {
     if (_data.empty()) {
         return atomic_cell_or_collection();
```
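The hunk above moves cell reconciliation onto C++20's `std::strong_ordering`, so a single comparator defines a total order over conflicting cells. A minimal stand-alone sketch of how such a comparator drives a merge — toy types, not Scylla's actual ones:

```cpp
#include <compare>

// Toy cell: a write timestamp plus an opaque value.
struct cell { long ts; int value; };

// Same shape as compare_atomic_cell_for_merge(): compare timestamps first,
// then apply a deterministic tie-break so every replica picks the same winner.
std::strong_ordering compare_cells(const cell& a, const cell& b) {
    if (a.ts != b.ts) {
        return a.ts <=> b.ts;
    }
    return a.value <=> b.value;
}

const cell& reconcile(const cell& a, const cell& b) {
    // The "greater" cell wins the merge; a tie keeps the left operand.
    return compare_cells(a, b) < 0 ? b : a;
}
```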
```diff
@@ -593,8 +593,8 @@ void cache_flat_mutation_reader::move_to_range(query::clustering_row_ranges::con
     clogger.trace("csm {}: insert dummy at {}", fmt::ptr(this), _lower_bound);
     auto it = with_allocator(_lsa_manager.region().allocator(), [&] {
         auto& rows = _snp->version()->partition().mutable_clustered_rows();
-        auto new_entry = current_allocator().construct<rows_entry>(*_schema, _lower_bound, is_dummy::yes, is_continuous::no);
-        return rows.insert_before(_next_row.get_iterator_in_latest_version(), *new_entry);
+        auto new_entry = alloc_strategy_unique_ptr<rows_entry>(current_allocator().construct<rows_entry>(*_schema, _lower_bound, is_dummy::yes, is_continuous::no));
+        return rows.insert_before(_next_row.get_iterator_in_latest_version(), std::move(new_entry));
     });
     _snp->tracker()->insert(*it);
     _last_row = partition_snapshot_row_weakref(*_snp, it, true);
```
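This hunk (and the matching `partition_version` hunks further down) swaps a raw construct-then-insert sequence for a unique-pointer handoff, so the freshly constructed `rows_entry` cannot leak if the insertion path throws. A minimal sketch of the idiom, with `std::unique_ptr` and a toy container standing in for `alloc_strategy_unique_ptr` and the intrusive row tree:

```cpp
#include <memory>
#include <vector>

struct entry { int key; };

struct toy_rows {
    std::vector<std::unique_ptr<entry>> v;
    // Ownership transfers only when push_back() succeeds; if anything throws
    // before that point, the caller's unique_ptr still frees the entry.
    entry* insert_back(std::unique_ptr<entry> e) {
        v.push_back(std::move(e));
        return v.back().get();
    }
};

int main() {
    toy_rows rows;
    auto e = std::make_unique<entry>(entry{42});
    entry* inserted = rows.insert_back(std::move(e)); // no leak on throw
    (void)inserted;
}
```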
```diff
@@ -1511,6 +1511,11 @@ public:
     }
 
     auto process_cell = [&, this] (const column_definition& cdef) {
+        // If table uses compact storage it may contain a column of type empty
+        // and we need to ignore such a field because it is not present in CDC log.
+        if (cdef.type->get_kind() == abstract_type::kind::empty) {
+            return;
+        }
         if (auto current = get_col_from_row_state(row_state, cdef)) {
             _builder->set_value(image_ck, cdef, *current);
         } else if (op == operation::pre_image) {
```
```diff
@@ -1634,7 +1634,7 @@ future<bool> scrub_validate_mode_validate_reader(flat_mutation_reader reader, co
     while (auto mf_opt = co_await reader()) {
         if (cdata.is_stop_requested()) [[unlikely]] {
             // Compaction manager will catch this exception and re-schedule the compaction.
-            co_return coroutine::make_exception(compaction_stopped_exception(schema->ks_name(), schema->cf_name(), cdata.stop_requested));
+            throw compaction_stopped_exception(schema->ks_name(), schema->cf_name(), cdata.stop_requested);
         }
 
         const auto& mf = *mf_opt;
```
```diff
@@ -326,6 +326,11 @@ future<> compaction_manager::run_custom_job(column_family* cf, sstables::compact
     task->compaction_done = with_semaphore(_custom_job_sem, 1, [this, task, cf, &job = *job_ptr] () mutable {
         // take read lock for cf, so major compaction and resharding can't proceed in parallel.
         return with_lock(_compaction_locks[cf].for_read(), [this, task, cf, &job] () mutable {
+            // Allow caller to know that task (e.g. reshape) was asked to stop while waiting for a chance to run.
+            if (task->compaction_data.is_stop_requested()) {
+                throw sstables::compaction_stopped_exception(task->compacting_cf->schema()->ks_name(), task->compacting_cf->schema()->cf_name(),
+                    task->compaction_data.stop_requested);
+            }
             _stats.active_tasks++;
             if (!can_proceed(task)) {
                 return make_ready_future<>();
@@ -737,8 +742,10 @@ future<> compaction_manager::rewrite_sstables(column_family* cf, sstables::compa
     column_family& cf = *task->compacting_cf;
     auto sstable_level = sst->get_sstable_level();
     auto run_identifier = sst->run_identifier();
 
     auto sstable_set_snapshot = can_purge ? std::make_optional(cf.get_sstable_set()) : std::nullopt;
-    auto descriptor = sstables::compaction_descriptor({ sst }, std::move(sstable_set_snapshot), _maintenance_sg.io,
+    // FIXME: this compaction should run with maintenance priority.
+    auto descriptor = sstables::compaction_descriptor({ sst }, std::move(sstable_set_snapshot), service::get_local_compaction_priority(),
         sstable_level, sstables::compaction_descriptor::default_max_sstable_bytes, run_identifier, options);
 
     // Releases reference to cleaned sstable such that respective used disk space can be freed.
@@ -747,15 +754,14 @@ future<> compaction_manager::rewrite_sstables(column_family* cf, sstables::compa
     };
 
     return with_semaphore(_rewrite_sstables_sem, 1, [this, task, &cf, descriptor = std::move(descriptor), compacting] () mutable {
-        // Take write lock for cf to serialize cleanup/upgrade sstables/scrub with major compaction/reshape/reshard.
-        return with_lock(_compaction_locks[&cf].for_write(), [this, task, &cf, descriptor = std::move(descriptor), compacting] () mutable {
+        return with_lock(_compaction_locks[&cf].for_read(), [this, task, &cf, descriptor = std::move(descriptor), compacting] () mutable {
             _stats.pending_tasks--;
             _stats.active_tasks++;
             task->setup_new_compaction();
             task->output_run_identifier = descriptor.run_identifier;
             compaction_backlog_tracker user_initiated(std::make_unique<user_initiated_backlog_tracker>(_compaction_controller.backlog_of_shares(200), _available_memory));
             return do_with(std::move(user_initiated), [this, &cf, descriptor = std::move(descriptor), task] (compaction_backlog_tracker& bt) mutable {
-                return with_scheduling_group(_maintenance_sg.cpu, [this, &cf, descriptor = std::move(descriptor), task]() mutable {
+                return with_scheduling_group(_compaction_controller.sg(), [this, &cf, descriptor = std::move(descriptor), task]() mutable {
                     return cf.compact_sstables(std::move(descriptor), task->compaction_data);
                 });
             });
@@ -979,7 +985,7 @@ void compaction_manager::stop_compaction(sstring type) {
     }
     // FIXME: switch to task_stop(), and wait for their termination, so API user can know when compactions actually stopped.
     for (auto& task : _tasks) {
-        if (task->compaction_running && target_type == task->type) {
+        if (target_type == task->type) {
             task->compaction_data.stop("user request");
         }
     }
```
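The locking scheme in these hunks gives each table a reader/writer lock: custom jobs and (after this change) sstable rewrites take the read side so they can overlap each other, while operations that must run alone take the write side. A standard-C++ analogue of that split, assuming a hypothetical per-table `std::shared_mutex`:

```cpp
#include <mutex>
#include <shared_mutex>

std::shared_mutex table_lock; // hypothetical per-table lock

void run_custom_job() {
    // Shared (read) side: many jobs may hold this concurrently,
    // but none while an exclusive holder is active.
    std::shared_lock guard(table_lock);
    // ... run the job ...
}

void exclusive_maintenance() {
    // Exclusive (write) side: serializes against all shared holders.
    std::unique_lock guard(table_lock);
    // ... run the operation that must be alone ...
}
```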
```diff
@@ -117,7 +117,13 @@ managed_bytes_opt get_value(const column_value& col, const column_value_eval_bag
         if (!col_type->is_map()) {
             throw exceptions::invalid_request_exception(format("subscripting non-map column {}", cdef->name_as_text()));
         }
-        const auto deserialized = cdef->type->deserialize(managed_bytes_view(*data.other_columns[data.sel.index_of(*cdef)]));
+        int32_t index = data.sel.index_of(*cdef);
+        if (index == -1) {
+            throw std::runtime_error(
+                    format("Column definition {} does not match any column in the query selection",
+                    cdef->name_as_text()));
+        }
+        const auto deserialized = cdef->type->deserialize(managed_bytes_view(*data.other_columns[index]));
         const auto& data_map = value_cast<map_type_impl::native_type>(deserialized);
         const auto key = evaluate_to_raw_view(col.sub, options);
         auto&& key_type = col_type->name_comparator();
@@ -135,8 +141,16 @@ managed_bytes_opt get_value(const column_value& col, const column_value_eval_bag
     case column_kind::clustering_key:
         return managed_bytes(data.clustering_key[cdef->id]);
     case column_kind::static_column:
-    case column_kind::regular_column:
-        return managed_bytes_opt(data.other_columns[data.sel.index_of(*cdef)]);
+        [[fallthrough]];
+    case column_kind::regular_column: {
+        int32_t index = data.sel.index_of(*cdef);
+        if (index == -1) {
+            throw std::runtime_error(
+                    format("Column definition {} does not match any column in the query selection",
+                    cdef->name_as_text()));
+        }
+        return managed_bytes_opt(data.other_columns[index]);
+    }
     default:
         throw exceptions::unsupported_operation_exception("Unknown column kind");
     }
```
```diff
@@ -528,7 +528,7 @@ statement_restrictions::statement_restrictions(database& db,
     }
 
     if (!_nonprimary_key_restrictions->empty()) {
-        if (_has_queriable_regular_index) {
+        if (_has_queriable_regular_index && _partition_range_is_simple) {
             _uses_secondary_indexing = true;
         } else if (!allow_filtering) {
             throw exceptions::invalid_request_exception("Cannot execute this query as it might involve data filtering and "
```
```diff
@@ -193,7 +193,7 @@ public:
 
     template<typename RowComparator>
     void sort(const RowComparator& cmp) {
-        std::sort(_rows.begin(), _rows.end(), std::ref(cmp));
+        std::sort(_rows.begin(), _rows.end(), cmp);
    }
 
     metadata& get_metadata();
```
```diff
@@ -995,6 +995,7 @@ lw_shared_ptr<const service::pager::paging_state> indexed_table_select_statement
     }
 
     auto paging_state_copy = make_lw_shared<service::pager::paging_state>(service::pager::paging_state(*paging_state));
+    paging_state_copy->set_remaining(internal_paging_size);
     paging_state_copy->set_partition_key(std::move(index_pk));
     paging_state_copy->set_clustering_key(std::move(index_ck));
     return std::move(paging_state_copy);
```
```diff
@@ -53,6 +53,7 @@
 #include "types/list.hh"
 #include "types/user.hh"
 #include "concrete_types.hh"
+#include "validation.hh"
 
 namespace cql3 {
 
@@ -251,6 +252,7 @@ insert_prepared_json_statement::build_partition_keys(const query_options& option
         exploded.emplace_back(json_value->second);
     }
     auto pkey = partition_key::from_optional_exploded(*s, std::move(exploded));
+    validation::validate_cql_key(*s, pkey);
     auto k = query::range<query::ring_position>::make_singular(dht::decorate_key(*s, std::move(pkey)));
     ranges.emplace_back(std::move(k));
     return ranges;
```
database.cc (38 changes)

```diff
@@ -1348,44 +1348,6 @@ database::existing_index_names(const sstring& ks_name, const sstring& cf_to_excl
     return names;
 }
 
-// Based on:
-//  - org.apache.cassandra.db.AbstractCell#reconcile()
-//  - org.apache.cassandra.db.BufferExpiringCell#reconcile()
-//  - org.apache.cassandra.db.BufferDeletedCell#reconcile()
-std::strong_ordering
-compare_atomic_cell_for_merge(atomic_cell_view left, atomic_cell_view right) {
-    if (left.timestamp() != right.timestamp()) {
-        return left.timestamp() <=> right.timestamp();
-    }
-    if (left.is_live() != right.is_live()) {
-        return left.is_live() ? std::strong_ordering::less : std::strong_ordering::greater;
-    }
-    if (left.is_live()) {
-        auto c = compare_unsigned(left.value(), right.value()) <=> 0;
-        if (c != 0) {
-            return c;
-        }
-        if (left.is_live_and_has_ttl() != right.is_live_and_has_ttl()) {
-            // prefer expiring cells.
-            return left.is_live_and_has_ttl() ? std::strong_ordering::greater : std::strong_ordering::less;
-        }
-        if (left.is_live_and_has_ttl() && left.expiry() != right.expiry()) {
-            return left.expiry() <=> right.expiry();
-        }
-    } else {
-        // Both are deleted
-        if (left.deletion_time() != right.deletion_time()) {
-            // Origin compares big-endian serialized deletion time. That's because it
-            // delegates to AbstractCell.reconcile() which compares values after
-            // comparing timestamps, which in case of deleted cells will hold
-            // serialized expiry.
-            return (uint64_t) left.deletion_time().time_since_epoch().count()
-                   <=> (uint64_t) right.deletion_time().time_since_epoch().count();
-        }
-    }
-    return std::strong_ordering::equal;
-}
-
 future<std::tuple<lw_shared_ptr<query::result>, cache_temperature>>
 database::query(schema_ptr s, const query::read_command& cmd, query::result_options opts, const dht::partition_range_vector& ranges,
     tracing::trace_state_ptr trace_state, db::timeout_clock::time_point timeout) {
```
```diff
@@ -428,6 +428,8 @@ private:
     void abort_recycled_list(std::exception_ptr);
     void abort_deletion_promise(std::exception_ptr);
 
+    future<> recalculate_footprint();
+
     future<> rename_file(sstring, sstring) const;
     size_t max_request_controller_units() const;
     segment_id_type _ids = 0;
@@ -444,6 +446,7 @@ private:
     seastar::gate _gate;
     uint64_t _new_counter = 0;
     std::optional<size_t> _disk_write_alignment;
+    seastar::semaphore _reserve_recalculation_guard;
 };
 
 template<typename T>
@@ -512,6 +515,7 @@ class db::commitlog::segment : public enable_shared_from_this<segment>, public c
     uint64_t _file_pos = 0;
     uint64_t _flush_pos = 0;
     uint64_t _size_on_disk = 0;
+    uint64_t _waste = 0;
 
     size_t _alignment;
 
@@ -598,7 +602,7 @@ public:
             clogger.debug("Segment {} is no longer active and will submitted for delete now", *this);
             ++_segment_manager->totals.segments_destroyed;
             _segment_manager->totals.active_size_on_disk -= file_position();
-            _segment_manager->totals.wasted_size_on_disk -= (_size_on_disk - file_position());
+            _segment_manager->totals.wasted_size_on_disk -= _waste;
             _segment_manager->add_file_to_delete(_file_name, _desc);
         } else if (_segment_manager->cfg.warn_about_segments_left_on_disk_after_shutdown) {
             clogger.warn("Segment {} is dirty and is left on disk.", *this);
@@ -725,7 +729,8 @@ public:
         auto s = co_await sync();
         co_await flush();
         co_await terminate();
-        _segment_manager->totals.wasted_size_on_disk += (_size_on_disk - file_position());
+        _waste = _size_on_disk - file_position();
+        _segment_manager->totals.wasted_size_on_disk += _waste;
         co_return s;
     }
     future<sseg_ptr> do_flush(uint64_t pos) {
@@ -1223,6 +1228,7 @@ db::commitlog::segment_manager::segment_manager(config c)
     , _recycled_segments(std::numeric_limits<size_t>::max())
     , _reserve_replenisher(make_ready_future<>())
     , _background_sync(make_ready_future<>())
+    , _reserve_recalculation_guard(1)
 {
     assert(max_size > 0);
     assert(max_mutation_size < segment::multi_entry_size_magic);
@@ -1248,6 +1254,11 @@ future<> db::commitlog::segment_manager::replenish_reserve() {
         }
         try {
             gate::holder g(_gate);
+            auto guard = co_await get_units(_reserve_recalculation_guard, 1);
+            if (_reserve_segments.full()) {
+                // can happen if we recalculate
+                continue;
+            }
             // note: if we were strict with disk size, we would refuse to do this
             // unless disk footprint is lower than threshold. but we cannot (yet?)
             // trust that flush logic will absolutely free up an existing
@@ -1519,7 +1530,7 @@ future<db::commitlog::segment_manager::sseg_ptr> db::commitlog::segment_manager:
 
     if (cfg.extensions && !cfg.extensions->commitlog_file_extensions().empty()) {
         for (auto * ext : cfg.extensions->commitlog_file_extensions()) {
-            auto nf = co_await ext->wrap_file(std::move(filename), f, flags);
+            auto nf = co_await ext->wrap_file(filename, f, flags);
             if (nf) {
                 f = std::move(nf);
                 align = is_overwrite ? f.disk_overwrite_dma_alignment() : f.disk_write_dma_alignment();
@@ -1530,12 +1541,21 @@ future<db::commitlog::segment_manager::sseg_ptr> db::commitlog::segment_manager:
         f = make_checked_file(commit_error_handler, std::move(f));
     } catch (...) {
         ep = std::current_exception();
-        commit_error_handler(ep);
     }
     if (ep) {
+        // do this early, so iff we are to fast-fail server,
+        // we do it before anything else can go wrong.
+        try {
+            commit_error_handler(ep);
+        } catch (...) {
+            ep = std::current_exception();
+        }
+    }
+    if (ep && f) {
+        co_await f.close();
+    }
+    if (ep) {
         add_file_to_delete(filename, d);
         co_return coroutine::exception(std::move(ep));
     }
@@ -1594,6 +1614,8 @@ future<db::commitlog::segment_manager::sseg_ptr> db::commitlog::segment_manager:
 }
 
 future<db::commitlog::segment_manager::sseg_ptr> db::commitlog::segment_manager::new_segment() {
+    gate::holder g(_gate);
+
     if (_shutdown) {
         co_return coroutine::make_exception(std::runtime_error("Commitlog has been shut down. Cannot add data"));
     }
@@ -1628,22 +1650,23 @@
         co_return _segments.back();
     }
 
-    if (_segment_allocating) {
-        co_await _segment_allocating->get_future(timeout);
-        continue;
-    }
-
-    promise<> p;
-    _segment_allocating.emplace(p.get_future());
-    auto finally = defer([&] () noexcept { _segment_allocating = std::nullopt; });
-    try {
-        gate::holder g(_gate);
-        auto s = co_await with_timeout(timeout, new_segment());
-        p.set_value();
-    } catch (...) {
-        p.set_exception(std::current_exception());
-        throw;
-    }
+    // #9896 - we don't want to issue a new_segment call until
+    // the old one has terminated with either result or exception.
+    // Do all waiting through the shared_future
+    if (!_segment_allocating) {
+        auto f = new_segment();
+        // must check that we are not already done.
+        if (f.available()) {
+            f.get(); // maybe force exception
+            continue;
+        }
+        _segment_allocating.emplace(f.discard_result().finally([this] {
+            // clear the shared_future _before_ resolving its contents
+            // (i.e. with result of this finally)
+            _segment_allocating = std::nullopt;
+        }));
+    }
+    co_await _segment_allocating->get_future(timeout);
 }
```
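The #9896 hunk above de-duplicates concurrent segment allocation through a `seastar::shared_future`: only one fiber issues `new_segment()`, every other caller awaits the shared result, and the slot is cleared before waiters resume so a retry starts fresh. A condensed sketch of the same pattern, assuming Seastar and a hypothetical `allocate_resource()`:

```cpp
#include <seastar/core/future.hh>
#include <seastar/core/shared_future.hh>
#include <seastar/core/sleep.hh>
#include <chrono>
#include <optional>

seastar::future<> allocate_resource() {           // hypothetical slow operation
    using namespace std::chrono_literals;
    return seastar::sleep(100ms);
}

std::optional<seastar::shared_future<>> inflight; // at most one pending allocation

seastar::future<> get_resource() {
    if (!inflight) {
        auto f = allocate_resource();
        if (f.available()) {
            f.get();                              // propagate an immediate failure
            return seastar::make_ready_future<>();
        }
        inflight.emplace(f.finally([] {
            // Clear the slot *before* the shared state resolves, so a
            // retrying waiter starts a fresh allocation instead of
            // re-awaiting a finished one.
            inflight = std::nullopt;
        }));
    }
    return inflight->get_future();
}
```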
```diff
@@ -1865,6 +1888,8 @@ future<> db::commitlog::segment_manager::delete_segments(std::vector<sstring> fi
 
     std::exception_ptr recycle_error;
 
+    size_t num_deleted = 0;
+    bool except = false;
     while (!files.empty()) {
         auto filename = std::move(files.back());
         files.pop_back();
@@ -1914,8 +1939,10 @@ future<> db::commitlog::segment_manager::delete_segments(std::vector<sstring> fi
             }
         }
         co_await delete_file(filename);
+        ++num_deleted;
     } catch (...) {
         clogger.error("Could not delete segment {}: {}", filename, std::current_exception());
+        except = true;
     }
 }
@@ -1928,6 +1955,16 @@ future<> db::commitlog::segment_manager::delete_segments(std::vector<sstring> fi
     if (recycle_error && _recycled_segments.empty()) {
         abort_recycled_list(recycle_error);
     }
+    // If recycle failed and turned into a delete, we should fake-wakeup waiters
+    // since we might still have cleaned up disk space.
+    if (!recycle_error && num_deleted && cfg.reuse_segments && _recycled_segments.empty()) {
+        abort_recycled_list(std::make_exception_ptr(std::runtime_error("deleted files")));
+    }
+
+    // #9348 - if we had an exception, we can't trust our bookeep any more. recalculate.
+    if (except) {
+        co_await recalculate_footprint();
+    }
 }
 
 void db::commitlog::segment_manager::abort_recycled_list(std::exception_ptr ep) {
@@ -1942,6 +1979,67 @@ void db::commitlog::segment_manager::abort_deletion_promise(std::exception_ptr e
     std::exchange(_disk_deletions, {}).set_exception(ep);
 }
 
+future<> db::commitlog::segment_manager::recalculate_footprint() {
+    try {
+        co_await do_pending_deletes();
+
+        auto guard = co_await get_units(_reserve_recalculation_guard, 1);
+        auto segments_copy = _segments;
+        std::vector<sseg_ptr> reserves;
+        std::vector<sstring> recycles;
+        // this causes haywire things while we steal stuff, but...
+        while (!_reserve_segments.empty()) {
+            reserves.push_back(_reserve_segments.pop());
+        }
+        while (!_recycled_segments.empty()) {
+            recycles.push_back(_recycled_segments.pop());
+        }
+        // #9955 - must re-stock the queues before we do anything
+        // interruptable/continuation. Because both queues are
+        // used with push/pop eventually which _waits_ for signal
+        // but does _not_ verify that the condition is true once
+        // we return. So copy the objects and look at instead.
+        for (auto& filename : recycles) {
+            _recycled_segments.push(sstring(filename));
+        }
+        for (auto& s : reserves) {
+            _reserve_segments.push(sseg_ptr(s)); // you can have it back now.
+        }
+
+        // first, guesstimate sizes
+        uint64_t recycle_size = recycles.size() * max_size;
+        auto old = totals.total_size_on_disk;
+
+        totals.total_size_on_disk = recycle_size;
+        for (auto& s : _segments) {
+            totals.total_size_on_disk += s->_size_on_disk;
+        }
+        for (auto& s : reserves) {
+            totals.total_size_on_disk += s->_size_on_disk;
+        }
+
+        // now we need to adjust the actual sizes of recycled files
+
+        uint64_t actual_recycled_size = 0;
+
+        try {
+            for (auto& filename : recycles) {
+                auto s = co_await seastar::file_size(filename);
+                actual_recycled_size += s;
+            }
+        } catch (...) {
+            clogger.error("Exception reading disk footprint ({}).", std::current_exception());
+            actual_recycled_size = recycle_size; // best we got
+        }
+
+        totals.total_size_on_disk += actual_recycled_size - recycle_size;
+        // pushing things to reserve/recycled queues will have resumed any
+        // waiters, so we should be done.
+    } catch (...) {
+        clogger.error("Exception recalculating disk footprint ({}). Values might be off...", std::current_exception());
+    }
+}
+
 future<> db::commitlog::segment_manager::do_pending_deletes() {
     auto ftc = std::exchange(_files_to_close, {});
     auto ftd = std::exchange(_files_to_delete, {});
```
dist/common/scripts/scylla-housekeeping (1 change, vendored)

```diff
@@ -100,6 +100,7 @@ def version_compare(a, b):
 def create_uuid_file(fl):
     with open(args.uuid_file, 'w') as myfile:
         myfile.write(str(uuid.uuid1()) + "\n")
+    os.chmod(args.uuid_file, 0o644)
 
 
 def sanitize_version(version):
```
dist/common/scripts/scylla_io_setup (60 changes, vendored)

```diff
@@ -278,6 +278,66 @@ if __name__ == "__main__":
             disk_properties["read_bandwidth"] = 2527296683 * nr_disks
             disk_properties["write_iops"] = 156326 * nr_disks
             disk_properties["write_bandwidth"] = 1063657088 * nr_disks
+        elif idata.instance() == "im4gn.large":
+            disk_properties["read_iops"] = 33943
+            disk_properties["read_bandwidth"] = 288433525
+            disk_properties["write_iops"] = 27877
+            disk_properties["write_bandwidth"] = 126864680
+        elif idata.instance() == "im4gn.xlarge":
+            disk_properties["read_iops"] = 68122
+            disk_properties["read_bandwidth"] = 576603520
+            disk_properties["write_iops"] = 55246
+            disk_properties["write_bandwidth"] = 254534954
+        elif idata.instance() == "im4gn.2xlarge":
+            disk_properties["read_iops"] = 136422
+            disk_properties["read_bandwidth"] = 1152663765
+            disk_properties["write_iops"] = 92184
+            disk_properties["write_bandwidth"] = 508926453
+        elif idata.instance() == "im4gn.4xlarge":
+            disk_properties["read_iops"] = 273050
+            disk_properties["read_bandwidth"] = 1638427264
+            disk_properties["write_iops"] = 92173
+            disk_properties["write_bandwidth"] = 1027966826
+        elif idata.instance() == "im4gn.8xlarge":
+            disk_properties["read_iops"] = 250241 * nr_disks
+            disk_properties["read_bandwidth"] = 1163130709 * nr_disks
+            disk_properties["write_iops"] = 86374 * nr_disks
+            disk_properties["write_bandwidth"] = 977617664 * nr_disks
+        elif idata.instance() == "im4gn.16xlarge":
+            disk_properties["read_iops"] = 273030 * nr_disks
+            disk_properties["read_bandwidth"] = 1638211413 * nr_disks
+            disk_properties["write_iops"] = 92607 * nr_disks
+            disk_properties["write_bandwidth"] = 1028340266 * nr_disks
+        elif idata.instance() == "is4gen.medium":
+            disk_properties["read_iops"] = 33965
+            disk_properties["read_bandwidth"] = 288462506
+            disk_properties["write_iops"] = 27876
+            disk_properties["write_bandwidth"] = 126954200
+        elif idata.instance() == "is4gen.large":
+            disk_properties["read_iops"] = 68131
+            disk_properties["read_bandwidth"] = 576654869
+            disk_properties["write_iops"] = 55257
+            disk_properties["write_bandwidth"] = 254551002
+        elif idata.instance() == "is4gen.xlarge":
+            disk_properties["read_iops"] = 136413
+            disk_properties["read_bandwidth"] = 1152747904
+            disk_properties["write_iops"] = 92180
+            disk_properties["write_bandwidth"] = 508889546
+        elif idata.instance() == "is4gen.2xlarge":
+            disk_properties["read_iops"] = 273038
+            disk_properties["read_bandwidth"] = 1628982613
+            disk_properties["write_iops"] = 92182
+            disk_properties["write_bandwidth"] = 1027983530
+        elif idata.instance() == "is4gen.4xlarge":
+            disk_properties["read_iops"] = 260493 * nr_disks
+            disk_properties["read_bandwidth"] = 1217396928 * nr_disks
+            disk_properties["write_iops"] = 83169 * nr_disks
+            disk_properties["write_bandwidth"] = 1000390784 * nr_disks
+        elif idata.instance() == "is4gen.8xlarge":
+            disk_properties["read_iops"] = 273021 * nr_disks
+            disk_properties["read_bandwidth"] = 1656354602 * nr_disks
+            disk_properties["write_iops"] = 92233 * nr_disks
+            disk_properties["write_bandwidth"] = 1028010325 * nr_disks
         properties_file = open(etcdir() + "/scylla.d/io_properties.yaml", "w")
         yaml.dump({ "disks": [ disk_properties ] }, properties_file, default_flow_style=False)
         ioconf = open(etcdir() + "/scylla.d/io.conf", "w")
```
dist/common/scripts/scylla_ntp_setup (6 changes, vendored)

```diff
@@ -66,18 +66,18 @@ if __name__ == '__main__':
 
     target = None
     if os.path.exists('/lib/systemd/systemd-timesyncd'):
-        if systemd_unit('systemd-timesyncd').is_active():
+        if systemd_unit('systemd-timesyncd').is_active() == 'active':
             print('ntp is already configured, skip setup')
             sys.exit(0)
         target = 'systemd-timesyncd'
     if shutil.which('chronyd'):
-        if get_chrony_unit().is_active():
+        if get_chrony_unit().is_active() == 'active':
             print('ntp is already configured, skip setup')
             sys.exit(0)
         if not target:
             target = 'chrony'
     if shutil.which('ntpd'):
-        if get_ntp_unit().is_active():
+        if get_ntp_unit().is_active() == 'active':
             print('ntp is already configured, skip setup')
             sys.exit(0)
         if not target:
```
dist/common/scripts/scylla_raid_setup (19 changes, vendored)

```diff
@@ -117,10 +117,11 @@ if __name__ == '__main__':
     pkg_install('xfsprogs')
     if not shutil.which('mdadm'):
         pkg_install('mdadm')
-    try:
-        md_service = systemd_unit('mdmonitor.service')
-    except SystemdException:
-        md_service = systemd_unit('mdadm.service')
+    if args.raid_level != '0':
+        try:
+            md_service = systemd_unit('mdmonitor.service')
+        except SystemdException:
+            md_service = systemd_unit('mdadm.service')
 
     print('Creating {type} for scylla using {nr_disk} disk(s): {disks}'.format(type=f'RAID{args.raid_level}' if raid else 'XFS volume', nr_disk=len(disks), disks=args.disks))
     procs=[]
@@ -164,14 +165,15 @@ if __name__ == '__main__':
 
     uuid = run(f'blkid -s UUID -o value {fsdev}', shell=True, check=True, capture_output=True, encoding='utf-8').stdout.strip()
     after = 'local-fs.target'
-    if raid:
+    wants = ''
+    if raid and args.raid_level != '0':
         after += f' {md_service}'
+        wants = f'\nWants={md_service}'
     unit_data = f'''
 [Unit]
 Description=Scylla data directory
 Before=scylla-server.service
-After={after}
-Wants={md_service}
+After={after}{wants}
 DefaultDependencies=no
 
 [Mount]
@@ -195,7 +197,8 @@ WantedBy=multi-user.target
         f.write(f'RequiresMountsFor={mount_at}\n')
 
     systemd_unit.reload()
-    md_service.start()
+    if args.raid_level != '0':
+        md_service.start()
     mount = systemd_unit(mntunit_bn)
     mount.start()
     if args.enable_on_nextboot:
```
dist/common/scripts/scylla_setup (4 changes, vendored)

```diff
@@ -370,6 +370,10 @@ if __name__ == '__main__':
     version_check = interactive_ask_service('Do you want to enable Scylla to check if there is a newer version of Scylla available?', 'Yes - start the Scylla-housekeeping service to check for a newer version. This check runs periodically. No - skips this step.', version_check)
     args.no_version_check = not version_check
     if version_check:
+        cfg = sysconfig_parser(sysconfdir_p() / 'scylla-housekeeping')
+        repo_files = cfg.get('REPO_FILES')
+        for f in glob.glob(repo_files):
+            os.chmod(f, 0o644)
         with open('/etc/scylla.d/housekeeping.cfg', 'w') as f:
             f.write('[housekeeping]\ncheck-version: True\n')
         os.chmod('/etc/scylla.d/housekeeping.cfg', 0o644)
```
dist/common/scripts/scylla_util.py (6 changes, vendored)

```diff
@@ -674,7 +674,7 @@ class aws_instance:
         return self._type.split(".")[0]
 
     def is_supported_instance_class(self):
-        if self.instance_class() in ['i2', 'i3', 'i3en', 'c5d', 'm5d', 'm5ad', 'r5d', 'z1d', 'c6gd', 'm6gd', 'r6gd', 'x2gd']:
+        if self.instance_class() in ['i2', 'i3', 'i3en', 'c5d', 'm5d', 'm5ad', 'r5d', 'z1d', 'c6gd', 'm6gd', 'r6gd', 'x2gd', 'im4gn', 'is4gen']:
             return True
         return False
 
@@ -683,7 +683,7 @@ class aws_instance:
         instance_size = self.instance_size()
         if instance_class in ['c3', 'c4', 'd2', 'i2', 'r3']:
             return 'ixgbevf'
-        if instance_class in ['a1', 'c5', 'c5a', 'c5d', 'c5n', 'c6g', 'c6gd', 'f1', 'g3', 'g4', 'h1', 'i3', 'i3en', 'inf1', 'm5', 'm5a', 'm5ad', 'm5d', 'm5dn', 'm5n', 'm6g', 'm6gd', 'p2', 'p3', 'r4', 'r5', 'r5a', 'r5ad', 'r5b', 'r5d', 'r5dn', 'r5n', 't3', 't3a', 'u-6tb1', 'u-9tb1', 'u-12tb1', 'u-18tn1', 'u-24tb1', 'x1', 'x1e', 'z1d', 'c6g', 'c6gd', 'm6g', 'm6gd', 't4g', 'r6g', 'r6gd', 'x2gd']:
+        if instance_class in ['a1', 'c5', 'c5a', 'c5d', 'c5n', 'c6g', 'c6gd', 'f1', 'g3', 'g4', 'h1', 'i3', 'i3en', 'inf1', 'm5', 'm5a', 'm5ad', 'm5d', 'm5dn', 'm5n', 'm6g', 'm6gd', 'p2', 'p3', 'r4', 'r5', 'r5a', 'r5ad', 'r5b', 'r5d', 'r5dn', 'r5n', 't3', 't3a', 'u-6tb1', 'u-9tb1', 'u-12tb1', 'u-18tn1', 'u-24tb1', 'x1', 'x1e', 'z1d', 'c6g', 'c6gd', 'm6g', 'm6gd', 't4g', 'r6g', 'r6gd', 'x2gd', 'im4gn', 'is4gen']:
             return 'ena'
         if instance_class == 'm4':
             if instance_size == '16xlarge':
@@ -1041,7 +1041,7 @@ class systemd_unit:
         return run('systemctl {} disable {}'.format(self.ctlparam, self._unit), shell=True, check=True)
 
     def is_active(self):
-        return True if run('systemctl {} is-active {}'.format(self.ctlparam, self._unit), shell=True, capture_output=True, encoding='utf-8').stdout.strip() == 'active' else False
+        return run('systemctl {} is-active {}'.format(self.ctlparam, self._unit), shell=True, capture_output=True, encoding='utf-8').stdout.strip()
 
     def mask(self):
         return run('systemctl {} mask {}'.format(self.ctlparam, self._unit), shell=True, check=True)
```
dist/common/supervisor/scylla_util.sh (6 changes, vendored)

```diff
@@ -6,12 +6,16 @@ is_nonroot() {
     [ -f "$scylladir"/SCYLLA-NONROOT-FILE ]
 }
 
+is_container() {
+    [ -f "$scylladir"/SCYLLA-CONTAINER-FILE ]
+}
+
 is_privileged() {
     [ ${EUID:-${UID}} = 0 ]
 }
 
 execsudo() {
-    if is_nonroot; then
+    if is_nonroot || is_container; then
         exec "$@"
     else
         exec sudo -u scylla -g scylla "$@"
```
dist/docker/debian/build_docker.sh (8 changes, vendored)

```diff
@@ -25,6 +25,10 @@
 product="$(<build/SCYLLA-PRODUCT-FILE)"
 version="$(<build/SCYLLA-VERSION-FILE)"
 release="$(<build/SCYLLA-RELEASE-FILE)"
 
+if [[ "$version" = *rc* ]]; then
+    version=$(echo $version |sed 's/\(.*\)\.)*/\1~/')
+fi
+
 mode="release"
 
 if uname -m | grep x86_64 ; then
@@ -93,12 +97,14 @@ run apt-get -y install hostname supervisor openssh-server openssh-client openjdk
 run locale-gen en_US.UTF-8
 run bash -ec "dpkg -i packages/*.deb"
 run apt-get -y clean all
-run bash -ec "cat /scylla_bashrc >> /etc/bashrc"
+run bash -ec "cat /scylla_bashrc >> /etc/bash.bashrc"
 run mkdir -p /etc/supervisor.conf.d
 run mkdir -p /var/log/scylla
 run chown -R scylla:scylla /var/lib/scylla
 run sed -i -e 's/^SCYLLA_ARGS=".*"$/SCYLLA_ARGS="--log-to-syslog 0 --log-to-stdout 1 --default-log-level info --network-stack posix"/' /etc/default/scylla-server
 
 run mkdir -p /opt/scylladb/supervisor
+run touch /opt/scylladb/SCYLLA-CONTAINER-FILE
 bcp dist/common/supervisor/scylla-server.sh /opt/scylladb/supervisor/scylla-server.sh
 bcp dist/common/supervisor/scylla-jmx.sh /opt/scylladb/supervisor/scylla-jmx.sh
 bcp dist/common/supervisor/scylla-node-exporter.sh /opt/scylladb/supervisor/scylla-node-exporter.sh
```

```diff
@@ -1,4 +1,4 @@
-[program:scylla-server]
+[program:scylla]
 command=/opt/scylladb/supervisor/scylla-server.sh
 stdout_logfile=/dev/stdout
 stdout_logfile_maxbytes=0
```
dist/docker/etc/sysconfig/scylla-server (41 changes, vendored; file deleted)

```diff
@@ -1,41 +0,0 @@
-# choose following mode: virtio, dpdk, posix
-NETWORK_MODE=posix
-
-# tap device name(virtio)
-TAP=tap0
-
-# bridge device name (virtio)
-BRIDGE=virbr0
-
-# ethernet device name
-IFNAME=eth0
-
-# setup NIC's and disks' interrupts, RPS, XPS, nomerges and I/O scheduler (posix)
-SET_NIC_AND_DISKS=no
-
-# ethernet device driver (dpdk)
-ETHDRV=
-
-# ethernet device PCI ID (dpdk)
-ETHPCIID=
-
-# number of hugepages
-NR_HUGEPAGES=64
-
-# user for process (must be root for dpdk)
-USER=scylla
-
-# group for process
-GROUP=scylla
-
-# scylla home dir
-SCYLLA_HOME=/var/lib/scylla
-
-# scylla config dir
-SCYLLA_CONF=/etc/scylla
-
-# scylla arguments
-SCYLLA_ARGS="--log-to-syslog 0 --log-to-stdout 1 --default-log-level info --network-stack posix"
-
-# setup as AMI instance
-AMI=no
```
dist/docker/scyllasetup.py (5 changes, vendored)

```diff
@@ -121,12 +121,13 @@ class ScyllaSetup:
         if self._apiAddress is not None:
             args += ["--api-address %s" % self._apiAddress]
 
-        if self._alternatorPort is not None:
+        if self._alternatorAddress is not None:
+            args += ["--alternator-address %s" % self._alternatorAddress]
+
+        if self._alternatorPort is not None:
             args += ["--alternator-port %s" % self._alternatorPort]
 
         if self._alternatorHttpsPort is not None:
-            args += ["--alternator-address %s" % self._alternatorAddress]
             args += ["--alternator-https-port %s" % self._alternatorHttpsPort]
 
         if self._alternatorWriteIsolation is not None:
```
```diff
@@ -184,14 +184,18 @@ future<> server::do_accepts(int which, bool keepalive, socket_address server_add
             _logger.info("exception while advertising new connection: {}", std::current_exception());
         }
         // Block while monitoring for lifetime/errors.
-        return conn->process().finally([this, conn] {
-            return unadvertise_connection(conn);
-        }).handle_exception([this] (std::exception_ptr ep) {
-            if (is_broken_pipe_or_connection_reset(ep)) {
-                // expected if another side closes a connection or we're shutting down
-                return;
+        return conn->process().then_wrapped([this, conn] (auto f) {
+            try {
+                f.get();
+            } catch (...) {
+                auto ep = std::current_exception();
+                if (!is_broken_pipe_or_connection_reset(ep)) {
+                    // some exceptions are expected if another side closes a connection
+                    // or we're shutting down
+                    _logger.info("exception while processing connection: {}", ep);
+                }
             }
-            _logger.info("exception while processing connection: {}", ep);
+            return unadvertise_connection(conn);
         });
         return stop_iteration::no;
```
```diff
@@ -477,49 +477,42 @@ gossiper::handle_get_endpoint_states_msg(gossip_get_endpoint_states_request requ
     return make_ready_future<gossip_get_endpoint_states_response>(gossip_get_endpoint_states_response{std::move(map)});
 }
 
+rpc::no_wait_type gossiper::background_msg(sstring type, noncopyable_function<future<>(gossiper&)> fn) {
+    (void)with_gate(_background_msg, [this, type = std::move(type), fn = std::move(fn)] () mutable {
+        return container().invoke_on(0, std::move(fn)).handle_exception([type = std::move(type)] (auto ep) {
+            logger.warn("Failed to handle {}: {}", type, ep);
+        });
+    });
+    return messaging_service::no_wait();
+}
+
 void gossiper::init_messaging_service_handler() {
     _messaging.register_gossip_digest_syn([this] (const rpc::client_info& cinfo, gossip_digest_syn syn_msg) {
         auto from = netw::messaging_service::get_source(cinfo);
-        // In a new fiber.
-        (void)container().invoke_on(0, [from, syn_msg = std::move(syn_msg)] (gms::gossiper& gossiper) mutable {
+        return background_msg("GOSSIP_DIGEST_SYN", [from, syn_msg = std::move(syn_msg)] (gms::gossiper& gossiper) mutable {
             return gossiper.handle_syn_msg(from, std::move(syn_msg));
-        }).handle_exception([] (auto ep) {
-            logger.warn("Fail to handle GOSSIP_DIGEST_SYN: {}", ep);
         });
-        return messaging_service::no_wait();
     });
     _messaging.register_gossip_digest_ack([this] (const rpc::client_info& cinfo, gossip_digest_ack msg) {
         auto from = netw::messaging_service::get_source(cinfo);
-        // In a new fiber.
-        (void)container().invoke_on(0, [from, msg = std::move(msg)] (gms::gossiper& gossiper) mutable {
+        return background_msg("GOSSIP_DIGEST_ACK", [from, msg = std::move(msg)] (gms::gossiper& gossiper) mutable {
             return gossiper.handle_ack_msg(from, std::move(msg));
-        }).handle_exception([] (auto ep) {
-            logger.warn("Fail to handle GOSSIP_DIGEST_ACK: {}", ep);
         });
-        return messaging_service::no_wait();
     });
     _messaging.register_gossip_digest_ack2([this] (const rpc::client_info& cinfo, gossip_digest_ack2 msg) {
         auto from = netw::messaging_service::get_source(cinfo);
-        // In a new fiber.
-        (void)container().invoke_on(0, [from, msg = std::move(msg)] (gms::gossiper& gossiper) mutable {
+        return background_msg("GOSSIP_DIGEST_ACK2", [from, msg = std::move(msg)] (gms::gossiper& gossiper) mutable {
             return gossiper.handle_ack2_msg(from, std::move(msg));
-        }).handle_exception([] (auto ep) {
-            logger.warn("Fail to handle GOSSIP_DIGEST_ACK2: {}", ep);
         });
-        return messaging_service::no_wait();
     });
     _messaging.register_gossip_echo([this] (const rpc::client_info& cinfo, rpc::optional<int64_t> generation_number_opt) {
         auto from = cinfo.retrieve_auxiliary<gms::inet_address>("baddr");
         return handle_echo_msg(from, generation_number_opt);
     });
     _messaging.register_gossip_shutdown([this] (inet_address from, rpc::optional<int64_t> generation_number_opt) {
-        // In a new fiber.
-        (void)container().invoke_on(0, [from, generation_number_opt] (gms::gossiper& gossiper) {
+        return background_msg("GOSSIP_SHUTDOWN", [from, generation_number_opt] (gms::gossiper& gossiper) {
             return gossiper.handle_shutdown_msg(from, generation_number_opt);
-        }).handle_exception([] (auto ep) {
-            logger.warn("Fail to handle GOSSIP_SHUTDOWN: {}", ep);
         });
-        return messaging_service::no_wait();
     });
     _messaging.register_gossip_get_endpoint_states([this] (const rpc::client_info& cinfo, gossip_get_endpoint_states_request request) {
         return container().invoke_on(0, [request = std::move(request)] (gms::gossiper& gossiper) mutable {
@@ -2178,6 +2171,9 @@ future<> gossiper::start() {
 }
 
 future<> gossiper::shutdown() {
+    if (!_background_msg.is_closed()) {
+        co_await _background_msg.close();
+    }
     if (this_shard_id() == 0) {
         co_await do_stop_gossiping();
     }
```
```diff
@@ -41,7 +41,9 @@
 #include "unimplemented.hh"
 #include <seastar/core/distributed.hh>
 #include <seastar/core/shared_ptr.hh>
+#include <seastar/core/gate.hh>
 #include <seastar/core/print.hh>
+#include <seastar/rpc/rpc_types.hh>
 #include "utils/atomic_vector.hh"
 #include "utils/UUID.hh"
 #include "utils/fb_utilities.hh"
@@ -138,12 +140,16 @@ private:
     bool _enabled = false;
     semaphore _callback_running{1};
     semaphore _apply_state_locally_semaphore{100};
+    seastar::gate _background_msg;
     std::unordered_map<gms::inet_address, syn_msg_pending> _syn_handlers;
     std::unordered_map<gms::inet_address, ack_msg_pending> _ack_handlers;
     bool _advertise_myself = true;
     // Map ip address and generation number
     std::unordered_map<gms::inet_address, int32_t> _advertise_to_nodes;
     future<> _failure_detector_loop_done{make_ready_future<>()} ;
 
+    rpc::no_wait_type background_msg(sstring type, noncopyable_function<future<>(gossiper&)> fn);
+
 public:
     // Get current generation number for the given nodes
     future<std::unordered_map<gms::inet_address, int32_t>>
```
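The `background_msg()` refactor above centralizes a fire-and-forget pattern: handler work runs detached from the RPC callback, but inside a `seastar::gate` so `shutdown()` can wait for every in-flight fiber. A compact sketch of the pattern under the same assumptions (Seastar gate and logger):

```cpp
#include <seastar/core/future.hh>
#include <seastar/core/gate.hh>
#include <seastar/util/log.hh>

static seastar::logger lg("bg-example");
static seastar::gate background;

// Detach a handler, but keep it tracked: the gate counts the fiber and
// exceptions are logged rather than silently dropped.
template <typename Fn>
void run_in_background(const char* what, Fn fn) {
    (void)seastar::with_gate(background, [what, fn = std::move(fn)] () mutable {
        return fn().handle_exception([what] (std::exception_ptr ep) {
            lg.warn("failed to handle {}: {}", what, ep);
        });
    });
}

seastar::future<> shutdown() {
    return background.close(); // resolves once all tracked fibers finish
}
```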
```diff
@@ -520,8 +520,13 @@ relocate_python3 "$rprefix"/scyllatop tools/scyllatop/scyllatop.py
 if $supervisor; then
     install -d -m755 `supervisor_dir $retc`
     for service in scylla-server scylla-jmx scylla-node-exporter; do
+        if [ "$service" = "scylla-server" ]; then
+            program="scylla"
+        else
+            program=$service
+        fi
         cat << EOS > `supervisor_conf $retc $service`
-[program:$service]
+[program:$program]
 directory=$rprefix
 command=/bin/bash -c './supervisor/$service.sh'
 EOS
```
main.cc (33 changes)

```diff
@@ -377,11 +377,38 @@ static auto defer_verbose_shutdown(const char* what, Func&& func) {
         startlog.info("Shutting down {}", what);
         try {
             func();
-            startlog.info("Shutting down {} was successful", what);
         } catch (...) {
-            startlog.error("Unexpected error shutting down {}: {}", what, std::current_exception());
-            throw;
+            auto ex = std::current_exception();
+            bool do_abort = true;
+            try {
+                std::rethrow_exception(ex);
+            } catch (const std::system_error& e) {
+                // System error codes we consider "environmental",
+                // i.e. not scylla's fault, therefore there is no point in
+                // aborting and dumping core.
+                for (int i : {EIO, EACCES, ENOSPC}) {
+                    if (e.code() == std::error_code(i, std::system_category())) {
+                        do_abort = false;
+                        break;
+                    }
+                }
+            } catch (...) {
+            }
+            auto msg = fmt::format("Unexpected error shutting down {}: {}", what, ex);
+            if (do_abort) {
+                startlog.error("{}: aborting", msg);
+                abort();
+            } else {
+                startlog.error("{}: exiting, at {}", msg, current_backtrace());
+
+                // Call _exit() rather than exit() to exit immediately
+                // without calling exit handlers, avoiding
+                // boost::intrusive::detail::destructor_impl assert failure
+                // from ~segment_pool exit handler.
+                _exit(255);
+            }
         }
+        startlog.info("Shutting down {} was successful", what);
     };
 
     auto ret = deferred_action(std::move(vfunc));
```
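The rewritten shutdown handler classifies failures before deciding between `abort()` (core dump for logic errors) and `_exit(255)` (quiet exit for environmental errors). The classification itself is plain standard C++ and can be isolated like this:

```cpp
#include <cerrno>
#include <exception>
#include <system_error>

// True for error codes treated as "environmental" (disk full, I/O error,
// permissions) - failures that do not warrant a core dump.
bool is_environmental(const std::exception_ptr& ex) {
    try {
        std::rethrow_exception(ex);
    } catch (const std::system_error& e) {
        for (int err : {EIO, EACCES, ENOSPC}) {
            if (e.code() == std::error_code(err, std::system_category())) {
                return true;
            }
        }
    } catch (...) {
        // Any other exception type: not environmental.
    }
    return false;
}
```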
```diff
@@ -613,7 +613,8 @@ static flat_mutation_reader make_partition_snapshot_flat_reader_from_snp_schema(
         schema_ptr rev_snp_schema = snp->schema()->make_reversed();
         return make_partition_snapshot_flat_reader<true, partition_snapshot_read_accounter>(std::move(rev_snp_schema), std::move(permit), std::move(dk), std::move(crr), std::move(snp), digest_requested, region, read_section, pointer_to_container, fwd, memtable);
     } else {
-        return make_partition_snapshot_flat_reader<false, partition_snapshot_read_accounter>(snp->schema(), std::move(permit), std::move(dk), std::move(crr), std::move(snp), digest_requested, region, read_section, pointer_to_container, fwd, memtable);
+        schema_ptr snp_schema = snp->schema();
+        return make_partition_snapshot_flat_reader<false, partition_snapshot_read_accounter>(std::move(snp_schema), std::move(permit), std::move(dk), std::move(crr), std::move(snp), digest_requested, region, read_section, pointer_to_container, fwd, memtable);
     }
 }
```
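The one-line hoist above fixes a classic C++ hazard: function-argument evaluation is unsequenced, so `f(p->x(), std::move(p))` may move from `p` before `p->x()` runs. Naming the intermediate value forces the order:

```cpp
#include <memory>
#include <utility>

struct widget { int id() const { return 7; } };

void use(int, std::unique_ptr<widget>) {}

void fixed(std::unique_ptr<widget> p) {
    // Hazardous: use(p->id(), std::move(p)); - 'p' may be moved first.
    auto id = p->id();       // sequenced before the call below
    use(id, std::move(p));   // safe
}
```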
```diff
@@ -628,7 +628,12 @@ shared_ptr<messaging_service::rpc_protocol_client_wrapper> messaging_service::ge
         remove_error_rpc_client(verb, id);
     }
 
-    auto must_encrypt = [&id, &verb, this] {
+    auto addr = get_preferred_ip(id.addr);
+    auto broadcast_address = utils::fb_utilities::get_broadcast_address();
+    bool listen_to_bc = _cfg.listen_on_broadcast_address && _cfg.ip != broadcast_address;
+    auto laddr = socket_address(listen_to_bc ? broadcast_address : _cfg.ip, 0);
+
+    auto must_encrypt = [&] {
         if (_cfg.encrypt == encrypt_what::none) {
             return false;
         }
@@ -646,13 +651,27 @@ shared_ptr<messaging_service::rpc_protocol_client_wrapper> messaging_service::ge
         auto& snitch_ptr = locator::i_endpoint_snitch::get_local_snitch_ptr();
 
         // either rack/dc need to be in same dc to use non-tls
-        if (snitch_ptr->get_datacenter(id.addr) != snitch_ptr->get_datacenter(utils::fb_utilities::get_broadcast_address())) {
+        auto my_dc = snitch_ptr->get_datacenter(broadcast_address);
+        if (snitch_ptr->get_datacenter(addr) != my_dc) {
             return true;
         }
+        // #9653 - if our idea of dc for bind address differs from our official endpoint address,
+        // we cannot trust downgrading. We need to ensure either (local) bind address is same as
+        // broadcast or that the dc info we get for it is the same.
+        if (broadcast_address != laddr && snitch_ptr->get_datacenter(laddr) != my_dc) {
+            return true;
+        }
         // if cross-rack tls, check rack.
-        return _cfg.encrypt == encrypt_what::rack &&
-            snitch_ptr->get_rack(id.addr) != snitch_ptr->get_rack(utils::fb_utilities::get_broadcast_address())
-            ;
+        if (_cfg.encrypt == encrypt_what::dc) {
+            return false;
+        }
+        auto my_rack = snitch_ptr->get_rack(broadcast_address);
+        if (snitch_ptr->get_rack(addr) != my_rack) {
+            return true;
+        }
+        // See above: We need to ensure either (local) bind address is same as
+        // broadcast or that the rack info we get for it is the same.
+        return broadcast_address != laddr && snitch_ptr->get_rack(laddr) != my_rack;
     }();
 
     auto must_compress = [&id, this] {
@@ -681,7 +700,7 @@ shared_ptr<messaging_service::rpc_protocol_client_wrapper> messaging_service::ge
         return true;
     }();
 
-    auto remote_addr = socket_address(get_preferred_ip(id.addr), must_encrypt ? _cfg.ssl_port : _cfg.port);
+    auto remote_addr = socket_address(addr, must_encrypt ? _cfg.ssl_port : _cfg.port);
 
     rpc::client_options opts;
     // send keepalive messages each minute if connection is idle, drop connection after 10 failures
@@ -696,8 +715,6 @@ shared_ptr<messaging_service::rpc_protocol_client_wrapper> messaging_service::ge
         opts.isolation_cookie = _scheduling_info_for_connection_index[idx].isolation_cookie;
     }
 
-    bool listen_to_bc = _cfg.listen_on_broadcast_address && _cfg.ip != utils::fb_utilities::get_broadcast_address();
-    auto laddr = socket_address(listen_to_bc ? utils::fb_utilities::get_broadcast_address() : _cfg.ip, 0);
     auto client = must_encrypt ?
         ::make_shared<rpc_protocol_client_wrapper>(_rpc->protocol(), std::move(opts),
             remote_addr, laddr, _credentials) :
```
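The #9653 hunks tighten the "may we skip TLS?" decision: when the node binds on a different address than it broadcasts, the plaintext downgrade is trusted only if both addresses resolve to the same datacenter (and rack, for rack-level encryption). The DC half of the rule, sketched with a hypothetical snitch stand-in:

```cpp
#include <string>

// Hypothetical snitch stand-in: maps an address to its datacenter.
struct topology {
    std::string dc(const std::string& addr) const {
        return addr.substr(0, addr.find('.')); // placeholder mapping
    }
};

bool must_encrypt_dc(const topology& t, const std::string& peer,
                     const std::string& broadcast, const std::string& bind) {
    auto my_dc = t.dc(broadcast);
    if (t.dc(peer) != my_dc) {
        return true;                 // cross-DC traffic: always encrypt
    }
    // #9653: if the bind address maps to a different DC, the downgrade
    // cannot be trusted either.
    return bind != broadcast && t.dc(bind) != my_dc;
}
```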
```diff
@@ -1545,18 +1545,20 @@ public:
 };
 
 future<> shard_reader::close() noexcept {
-    // Nothing to do if there was no reader created, nor is there a background
-    // read ahead in progress which will create one.
-    if (!_reader && !_read_ahead) {
-        co_return;
+    if (_read_ahead) {
+        try {
+            co_await *std::exchange(_read_ahead, std::nullopt);
+        } catch (...) {
+            mrlog.warn("shard_reader::close(): read_ahead on shard {} failed: {}", _shard, std::current_exception());
+        }
     }
 
-    try {
-        if (_read_ahead) {
-            co_await *std::exchange(_read_ahead, std::nullopt);
-        }
-
-        co_await smp::submit_to(_shard, [this] {
+    co_await smp::submit_to(_shard, [this] {
+        if (!_reader) {
+            return make_ready_future<>();
+        }
+
         auto irh = std::move(*_reader).inactive_read_handle();
         return with_closeable(flat_mutation_reader(_reader.release()), [this] (flat_mutation_reader& reader) mutable {
             auto permit = reader.permit();
```
```diff
@@ -54,7 +54,7 @@ future<> feed_writer(flat_mutation_reader&& rd_ref, Writer wr) {
     auto rd = std::move(rd_ref);
     std::exception_ptr ex;
     try {
-        while (!rd.is_end_of_stream()) {
+        while (!rd.is_end_of_stream() || !rd.is_buffer_empty()) {
             co_await rd.fill_buffer();
             while (!rd.is_buffer_empty()) {
                 co_await rd.pop_mutation_fragment().consume(wr);
```
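The one-line condition change encodes the drain invariant for buffered readers: the stream is finished only when end-of-stream has been reached *and* the buffer is empty. A toy model of the fixed loop:

```cpp
#include <deque>
#include <vector>

struct toy_reader {
    std::deque<int> buffer;
    std::vector<int> source{1, 2, 3};
    size_t pos = 0;
    bool eof = false;
    void fill() {                   // refill one item; flag eof at the end
        if (pos < source.size()) {
            buffer.push_back(source[pos++]);
        } else {
            eof = true;
        }
    }
};

int consume(toy_reader& rd) {
    int sum = 0;
    // Fixed condition: keep draining while buffered data remains, even
    // after the underlying stream reports end-of-stream.
    while (!rd.eof || !rd.buffer.empty()) {
        rd.fill();
        while (!rd.buffer.empty()) {
            sum += rd.buffer.front();
            rd.buffer.pop_front();
        }
    }
    return sum;
}
```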
@@ -411,11 +411,11 @@ public:
        } else {
            // Copy row from older version because rows in evictable versions must
            // hold values which are independently complete to be consistent on eviction.
            auto e = current_allocator().construct<rows_entry>(_schema, *_current_row[0].it);
            auto e = alloc_strategy_unique_ptr<rows_entry>(current_allocator().construct<rows_entry>(_schema, *_current_row[0].it));
            e->set_continuous(latest_i && latest_i->continuous());
            _snp.tracker()->insert(*e);
            rows.insert_before(latest_i, *e);
            return {*e, true};
            auto e_i = rows.insert_before(latest_i, std::move(e));
            return ensure_result{*e_i, true};
        }
    }

@@ -447,11 +447,11 @@ public:
        }
        auto&& rows = _snp.version()->partition().mutable_clustered_rows();
        auto latest_i = get_iterator_in_latest_version();
        auto e = current_allocator().construct<rows_entry>(_schema, pos, is_dummy(!pos.is_clustering_row()),
            is_continuous(latest_i && latest_i->continuous()));
        auto e = alloc_strategy_unique_ptr<rows_entry>(current_allocator().construct<rows_entry>(_schema, pos, is_dummy(!pos.is_clustering_row()),
            is_continuous(latest_i && latest_i->continuous())));
        _snp.tracker()->insert(*e);
        rows.insert_before(latest_i, *e);
        return ensure_result{*e, true};
        auto e_i = rows.insert_before(latest_i, std::move(e));
        return ensure_result{*e_i, true};
    }

    // Brings the entry pointed to by the cursor to the front of the LRU
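Both hunks above switch the raw construct() result to a uniquely-owning handle so the entry is freed if a later step throws before the container takes ownership. A minimal sketch of the pattern, with std::unique_ptr standing in for alloc_strategy_unique_ptr (names are illustrative):

#include <list>
#include <memory>

struct entry { int v; explicit entry(int v) : v(v) {} };

void may_throw(entry&) { /* e.g. registering with an eviction tracker */ }

// Exception-safe insert: the unique_ptr owns the entry until the
// container has accepted it, so a throw in may_throw() cannot leak.
entry& insert_safely(std::list<std::unique_ptr<entry>>& rows, int v) {
    auto e = std::make_unique<entry>(v); // owned here, freed on throw
    may_throw(*e);                       // if this throws, ~unique_ptr cleans up
    rows.push_back(std::move(e));        // ownership transferred last
    return *rows.back();
}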
@@ -42,28 +42,34 @@ static auto construct_range_tombstone_entry(Args&&... args) {
}

void range_tombstone_list::apply_reversibly(const schema& s,
        clustering_key_prefix start, bound_kind start_kind,
        clustering_key_prefix end,
        clustering_key_prefix start_key, bound_kind start_kind,
        clustering_key_prefix end_key,
        bound_kind end_kind,
        tombstone tomb,
        reverter& rev)
{
    position_in_partition::less_compare less(s);
    position_in_partition start(position_in_partition::range_tag_t(), bound_view(std::move(start_key), start_kind));
    position_in_partition end(position_in_partition::range_tag_t(), bound_view(std::move(end_key), end_kind));

    if (!less(start, end)) {
        return;
    }

    if (!_tombstones.empty()) {
        bound_view::compare less(s);
        bound_view start_bound(start, start_kind);
        auto last = --_tombstones.end();
        range_tombstones_type::iterator it;
        if (less(start_bound, last->end_bound())) {
            it = _tombstones.upper_bound(start_bound, [less](auto&& sb, auto&& rt) {
                return less(sb, rt.end_bound());
        if (less(start, last->end_position())) {
            it = _tombstones.upper_bound(start, [less](auto&& sb, auto&& rt) {
                return less(sb, rt.end_position());
            });
        } else {
            it = _tombstones.end();
        }
        insert_from(s, std::move(it), std::move(start), start_kind, std::move(end), end_kind, std::move(tomb), rev);
        insert_from(s, std::move(it), std::move(start), std::move(end), std::move(tomb), rev);
        return;
    }
    auto rt = construct_range_tombstone_entry(std::move(start), start_kind, std::move(end), end_kind, std::move(tomb));
    auto rt = construct_range_tombstone_entry(std::move(start), std::move(end), std::move(tomb));
    rev.insert(_tombstones.end(), *rt);
    rt.release();
}
@@ -81,35 +87,31 @@ void range_tombstone_list::apply_reversibly(const schema& s,
 */
void range_tombstone_list::insert_from(const schema& s,
        range_tombstones_type::iterator it,
        clustering_key_prefix start,
        bound_kind start_kind,
        clustering_key_prefix end,
        bound_kind end_kind,
        position_in_partition start,
        position_in_partition end,
        tombstone tomb,
        reverter& rev)
{
    bound_view::compare less(s);
    bound_view end_bound(end, end_kind);
    position_in_partition::tri_compare cmp(s);

    if (it != _tombstones.begin()) {
        auto prev = std::prev(it);
        if (prev->tombstone().tomb == tomb && prev->end_bound().adjacent(s, bound_view(start, start_kind))) {
            start = prev->tombstone().start;
            start_kind = prev->tombstone().start_kind;
        if (prev->tombstone().tomb == tomb && cmp(prev->end_position(), start) == 0) {
            start = prev->position();
            rev.erase(prev);
        }
    }
    while (it != _tombstones.end()) {
        bound_view start_bound(start, start_kind);
        if (less(end_bound, start_bound)) {
        if (cmp(end, start) <= 0) {
            return;
        }

        if (less(end_bound, it->start_bound())) {
        if (cmp(end, it->position()) < 0) {
            // not overlapping
            if (it->tombstone().tomb == tomb && end_bound.adjacent(s, it->start_bound())) {
                rev.update(it, {std::move(start), start_kind, it->tombstone().end, it->tombstone().end_kind, tomb});
            if (it->tombstone().tomb == tomb && cmp(end, it->position()) == 0) {
                rev.update(it, {std::move(start), std::move(start), tomb});
            } else {
                auto rt = construct_range_tombstone_entry(std::move(start), start_kind, std::move(end), end_kind, tomb);
                auto rt = construct_range_tombstone_entry(std::move(start), std::move(end), tomb);
                rev.insert(it, *rt);
                rt.release();
            }
@@ -119,34 +121,29 @@ void range_tombstone_list::insert_from(const schema& s,
        auto c = tomb <=> it->tombstone().tomb;
        if (c == 0) {
            // same timestamp, overlapping or adjacent, so merge.
            if (less(it->start_bound(), start_bound)) {
                start = it->tombstone().start;
                start_kind = it->tombstone().start_kind;
            if (cmp(it->position(), start) < 0) {
                start = it->position();
            }
            if (less(end_bound, it->end_bound())) {
                end = it->tombstone().end;
                end_kind = it->tombstone().end_kind;
                end_bound = bound_view(end, end_kind);
            if (cmp(end, it->end_position()) < 0) {
                end = it->end_position();
            }
            it = rev.erase(it);
        } else if (c > 0) {
            // We overwrite the current tombstone.

            if (less(it->start_bound(), start_bound)) {
                auto new_end = bound_view(start, invert_kind(start_kind));
                if (!less(new_end, it->start_bound())) {
                    // Here it->start < start
                    auto rt = construct_range_tombstone_entry(it->start_bound(), new_end, it->tombstone().tomb);
                    rev.update(it, {start_bound, it->end_bound(), it->tombstone().tomb});
            if (cmp(it->position(), start) < 0) {
                {
                    auto rt = construct_range_tombstone_entry(it->position(), start, it->tombstone().tomb);
                    rev.update(it, {start, it->end_position(), it->tombstone().tomb});
                    rev.insert(it, *rt);
                    rt.release();
                }
            }

            if (less(end_bound, it->end_bound())) {
            if (cmp(end, it->end_position()) < 0) {
                // Here start <= it->start and end < it->end.
                auto rt = construct_range_tombstone_entry(std::move(start), start_kind, end, end_kind, std::move(tomb));
                rev.update(it, {std::move(end), invert_kind(end_kind), it->tombstone().end, it->tombstone().end_kind, it->tombstone().tomb});
                auto rt = construct_range_tombstone_entry(std::move(start), end, std::move(tomb));
                rev.update(it, {std::move(end), it->end_position(), it->tombstone().tomb});
                rev.insert(it, *rt);
                rt.release();
                return;
@@ -157,30 +154,28 @@ void range_tombstone_list::insert_from(const schema& s,
        } else {
            // We don't overwrite the current tombstone.

            if (less(start_bound, it->start_bound())) {
            if (cmp(start, it->position()) < 0) {
                // The new tombstone starts before the current one.
                if (less(it->start_bound(), end_bound)) {
                if (cmp(it->position(), end) < 0) {
                    // Here start < it->start and it->start < end.
                    auto new_end_kind = invert_kind(it->tombstone().start_kind);
                    if (!less(bound_view(it->tombstone().start, new_end_kind), start_bound)) {
                        auto rt = construct_range_tombstone_entry(std::move(start), start_kind, it->tombstone().start, new_end_kind, tomb);
                    {
                        auto rt = construct_range_tombstone_entry(std::move(start), it->position(), tomb);
                        it = rev.insert(it, *rt);
                        rt.release();
                        ++it;
                    }
                } else {
                    // Here start < it->start and end <= it->start, so just insert the new tombstone.
                    auto rt = construct_range_tombstone_entry(std::move(start), start_kind, std::move(end), end_kind, std::move(tomb));
                    auto rt = construct_range_tombstone_entry(std::move(start), std::move(end), std::move(tomb));
                    rev.insert(it, *rt);
                    rt.release();
                    return;
                }
            }

            if (less(it->end_bound(), end_bound)) {
            if (cmp(it->end_position(), end) < 0) {
                // Here the current tombstone overwrites a range of the new one.
                start = it->tombstone().end;
                start_kind = invert_kind(it->tombstone().end_kind);
                start = it->end_position();
                ++it;
            } else {
                // Here the current tombstone completely overwrites the new one.
@@ -190,7 +185,7 @@ void range_tombstone_list::insert_from(const schema& s,
    }

    // If we got here, then just insert the remainder at the end.
    auto rt = construct_range_tombstone_entry(std::move(start), start_kind, std::move(end), end_kind, std::move(tomb));
    auto rt = construct_range_tombstone_entry(std::move(start), std::move(end), std::move(tomb));
    rev.insert(it, *rt);
    rt.release();
}
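The rewritten loop is an interval merge over position_in_partition ranges: equal-timestamp neighbors are coalesced, and higher timestamps carve up lower ones. A compact sketch of the same merge rule over integer intervals (illustrative types, not the real data structures):

#include <algorithm>
#include <cstdint>
#include <vector>

struct interval { int64_t start, end, ts; }; // [start, end), deletion timestamp

// Apply `nu` to a sorted, non-overlapping list, keeping the higher
// timestamp wherever ranges overlap -- the rule insert_from() implements.
std::vector<interval> apply(std::vector<interval> l, interval nu) {
    std::vector<interval> out;
    for (auto& cur : l) {
        if (nu.start >= nu.end || cur.end <= nu.start || nu.end <= cur.start) {
            out.push_back(cur); // no overlap (or new range already consumed)
            continue;
        }
        if (cur.ts > nu.ts) {
            // Existing range wins: keep it, clip the new one around it.
            if (nu.start < cur.start) out.push_back({nu.start, cur.start, nu.ts});
            nu.start = std::max(nu.start, cur.end);
            out.push_back(cur);
        } else {
            // New range wins (equal timestamps cover the same data either way).
            if (cur.start < nu.start) out.push_back({cur.start, nu.start, cur.ts});
            if (nu.end < cur.end) {
                out.push_back({nu.start, nu.end, nu.ts}); // new range ends inside cur
                out.push_back({nu.end, cur.end, cur.ts}); // tail of cur survives
                nu.start = nu.end;                        // new range fully emitted
            }
        }
    }
    if (nu.start < nu.end) out.push_back({nu.start, nu.end, nu.ts});
    std::sort(out.begin(), out.end(),
              [](auto& a, auto& b) { return a.start < b.start; });
    return out;
}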
@@ -297,7 +297,13 @@ public:
private:
    void apply_reversibly(const schema& s, clustering_key_prefix start, bound_kind start_kind,
            clustering_key_prefix end, bound_kind end_kind, tombstone tomb, reverter& rev);
    void insert_from(const schema& s, range_tombstones_type::iterator it, clustering_key_prefix start,
            bound_kind start_kind, clustering_key_prefix end, bound_kind end_kind, tombstone tomb, reverter& rev);

    void insert_from(const schema& s,
            range_tombstones_type::iterator it,
            position_in_partition start,
            position_in_partition end,
            tombstone tomb,
            reverter& rev);

    range_tombstones_type::iterator find(const schema& s, const range_tombstone_entry& rt);
};
@@ -249,6 +249,14 @@ public:
        return _base_resources;
    }

    void release_base_resources() noexcept {
        if (_base_resources_consumed) {
            _resources -= _base_resources;
            _base_resources_consumed = false;
        }
        _semaphore.signal(std::exchange(_base_resources, {}));
    }

    sstring description() const {
        return format("{}.{}:{}",
                _schema ? _schema->ks_name() : "*",
@@ -394,6 +402,10 @@ reader_resources reader_permit::base_resources() const {
    return _impl->base_resources();
}

void reader_permit::release_base_resources() noexcept {
    return _impl->release_base_resources();
}

sstring reader_permit::description() const {
    return _impl->description();
}

@@ -161,6 +161,8 @@ public:

    reader_resources base_resources() const;

    void release_base_resources() noexcept;

    sstring description() const;

    db::timeout_clock::time_point timeout() const noexcept;

@@ -407,6 +407,10 @@ public:
            {},
            mutation_reader::forwarding::no);
    } else {
        // We can't have two permits with count resource for 1 repair.
        // So we release the one on _permit so the only one is the one the
        // shard reader will obtain.
        _permit.release_base_resources();
        _reader = make_multishard_streaming_reader(db, _schema, _permit, [this] {
            auto shard_range = _sharder.next();
            if (shard_range) {
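release_base_resources() returns the units a permit was initially charged back to the semaphore, so the repair path does not hold two count units for one operation. A minimal sketch of the idea with a counting budget (illustrative types, not the real reader_concurrency_semaphore API):

#include <cassert>

struct budget { int units = 0; };

// A permit holds `base` units taken from the shared budget at creation.
struct permit {
    budget* sem;
    int base;

    permit(budget& s, int b) : sem(&s), base(b) { sem->units -= b; }

    // Give the base units back early, e.g. before handing work to a
    // component that will take its own permit -- what the repair path does.
    void release_base_resources() noexcept {
        sem->units += base;
        base = 0; // released at most once
    }

    ~permit() { sem->units += base; } // return whatever is still held
};

int main() {
    budget sem{1};
    permit outer(sem, 1);
    assert(sem.units == 0);
    outer.release_base_resources(); // free the unit for the inner reader
    permit inner(sem, 1);           // would otherwise exceed the budget
    assert(sem.units == 0);
}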
2
seastar
Submodule seastar updated: a189cdc45d...94a462d94b
@@ -635,16 +635,16 @@ void storage_service::bootstrap() {

    // Update pending ranges now, so we correctly count ourselves as a pending replica
    // when inserting the new CDC generation.
    if (!bootstrap_rbno) {
        // When is_repair_based_node_ops_enabled is true, the bootstrap node
        // will use node_ops_cmd to bootstrap, node_ops_cmd will update the pending ranges.
        slogger.debug("bootstrap: update pending ranges: endpoint={} bootstrap_tokens={}", get_broadcast_address(), _bootstrap_tokens);
        mutate_token_metadata([this] (mutable_token_metadata_ptr tmptr) {
            auto endpoint = get_broadcast_address();
            tmptr->add_bootstrap_tokens(_bootstrap_tokens, endpoint);
            return update_pending_ranges(std::move(tmptr), format("bootstrapping node {}", endpoint));
        }).get();
    }
    if (!bootstrap_rbno) {
        // When is_repair_based_node_ops_enabled is true, the bootstrap node
        // will use node_ops_cmd to bootstrap, node_ops_cmd will update the pending ranges.
        slogger.debug("bootstrap: update pending ranges: endpoint={} bootstrap_tokens={}", get_broadcast_address(), _bootstrap_tokens);
        mutate_token_metadata([this] (mutable_token_metadata_ptr tmptr) {
            auto endpoint = get_broadcast_address();
            tmptr->add_bootstrap_tokens(_bootstrap_tokens, endpoint);
            return update_pending_ranges(std::move(tmptr), format("bootstrapping node {}", endpoint));
        }).get();
    }

    // After we pick a generation timestamp, we start gossiping it, and we stick with it.
    // We don't do any other generation switches (unless we crash before completing bootstrap).
@@ -652,19 +652,23 @@ void storage_service::bootstrap() {

    _cdc_gen_id = _cdc_gen_service.local().make_new_generation(_bootstrap_tokens, !is_first_node()).get0();

    if (!bootstrap_rbno) {
        // When is_repair_based_node_ops_enabled is true, the bootstrap node
        // will use node_ops_cmd to bootstrap, bootstrapping gossip status is not needed for bootstrap.
        _gossiper.add_local_application_state({
            // Order is important: both the CDC streams timestamp and tokens must be known when a node handles our status.
            { gms::application_state::TOKENS, versioned_value::tokens(_bootstrap_tokens) },
            { gms::application_state::CDC_GENERATION_ID, versioned_value::cdc_generation_id(_cdc_gen_id) },
            { gms::application_state::STATUS, versioned_value::bootstrapping(_bootstrap_tokens) },
        }).get();
    if (!bootstrap_rbno) {
        // When is_repair_based_node_ops_enabled is true, the bootstrap node
        // will use node_ops_cmd to bootstrap, bootstrapping gossip status is not needed for bootstrap.
        _gossiper.add_local_application_state({
            { gms::application_state::TOKENS, versioned_value::tokens(_bootstrap_tokens) },
            { gms::application_state::CDC_GENERATION_ID, versioned_value::cdc_generation_id(_cdc_gen_id) },
            { gms::application_state::STATUS, versioned_value::bootstrapping(_bootstrap_tokens) },
        }).get();

        set_mode(mode::JOINING, format("sleeping {} ms for pending range setup", get_ring_delay().count()), true);
        _gossiper.wait_for_range_setup().get();
    }
        set_mode(mode::JOINING, format("sleeping {} ms for pending range setup", get_ring_delay().count()), true);
        _gossiper.wait_for_range_setup().get();
    } else {
        // Even with RBNO bootstrap we need to announce the new CDC generation immediately after it's created.
        _gossiper.add_local_application_state({
            { gms::application_state::CDC_GENERATION_ID, versioned_value::cdc_generation_id(_cdc_gen_id) },
        }).get();
    }
} else {
    // Wait until we know tokens of existing node before announcing replacing status.
    set_mode(mode::JOINING, fmt::format("Wait until local node knows tokens of peer nodes"), true);
@@ -3670,7 +3674,7 @@ shared_ptr<abort_source> node_ops_meta_data::get_abort_source() {

void storage_service::node_ops_update_heartbeat(utils::UUID ops_uuid) {
    slogger.debug("node_ops_update_heartbeat: ops_uuid={}", ops_uuid);
    auto permit = seastar::get_units(_node_ops_abort_sem, 1);
    auto permit = seastar::get_units(_node_ops_abort_sem, 1).get0();
    auto it = _node_ops.find(ops_uuid);
    if (it != _node_ops.end()) {
        node_ops_meta_data& meta = it->second;
@@ -3680,7 +3684,7 @@ void storage_service::node_ops_update_heartbeat(utils::UUID ops_uuid) {

void storage_service::node_ops_done(utils::UUID ops_uuid) {
    slogger.debug("node_ops_done: ops_uuid={}", ops_uuid);
    auto permit = seastar::get_units(_node_ops_abort_sem, 1);
    auto permit = seastar::get_units(_node_ops_abort_sem, 1).get0();
    auto it = _node_ops.find(ops_uuid);
    if (it != _node_ops.end()) {
        node_ops_meta_data& meta = it->second;
@@ -3691,7 +3695,7 @@ void storage_service::node_ops_done(utils::UUID ops_uuid) {

void storage_service::node_ops_abort(utils::UUID ops_uuid) {
    slogger.debug("node_ops_abort: ops_uuid={}", ops_uuid);
    auto permit = seastar::get_units(_node_ops_abort_sem, 1);
    auto permit = seastar::get_units(_node_ops_abort_sem, 1).get0();
    auto it = _node_ops.find(ops_uuid);
    if (it != _node_ops.end()) {
        node_ops_meta_data& meta = it->second;
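Without .get0(), `permit` is a future<semaphore_units<>> rather than the units themselves, so the critical section runs without actually holding the semaphore. A minimal sketch of the difference (must run inside a seastar::thread context, which is where these functions execute):

#include <seastar/core/semaphore.hh>
#include <seastar/core/thread.hh>

seastar::semaphore sem{1};

void critical_section_wrong() {
    // BUG: `permit` is a future<semaphore_units<>>; nothing is held yet,
    // so two fibers can be in here at once.
    auto permit = seastar::get_units(sem, 1);
    // ... mutate shared state ...
}

void critical_section_right() {
    // get0() waits for and takes the units; they are released when
    // `permit` goes out of scope.
    auto permit = seastar::get_units(sem, 1).get0();
    // ... mutate shared state ...
}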
@@ -49,12 +49,13 @@ private:
public:
    partition_index_cache* _parent;
    key_type _key;
    std::variant<shared_promise<>, partition_index_page> _page;
    std::variant<lw_shared_ptr<shared_promise<>>, partition_index_page> _page;
    size_t _size_in_allocator = 0;
public:
    entry(partition_index_cache* parent, key_type key)
        : _parent(parent)
        , _key(key)
        , _page(make_lw_shared<shared_promise<>>())
    { }

    void set_page(partition_index_page&& page) noexcept {
@@ -76,7 +77,7 @@ private:
    // Always returns the same value for a given state of _page.
    size_t size_in_allocator() const { return _size_in_allocator; }

    shared_promise<>& promise() { return std::get<shared_promise<>>(_page); }
    lw_shared_ptr<shared_promise<>> promise() { return std::get<lw_shared_ptr<shared_promise<>>>(_page); }
    bool ready() const { return std::holds_alternative<partition_index_page>(_page); }
    partition_index_page& page() { return std::get<partition_index_page>(_page); }
    const partition_index_page& page() const { return std::get<partition_index_page>(_page); }
@@ -207,9 +208,7 @@ public:
            return make_ready_future<entry_ptr>(std::move(ptr));
        } else {
            ++_shard_stats.blocks;
            return _as(_region, [ptr] () mutable {
                return ptr.get_entry().promise().get_shared_future();
            }).then([ptr] () mutable {
            return ptr.get_entry().promise()->get_shared_future().then([ptr] () mutable {
                return std::move(ptr);
            });
        }
@@ -234,23 +233,23 @@ public:

        // No exceptions before then_wrapped() is installed so that ptr will be eventually populated.

        return futurize_invoke(loader, key).then_wrapped([this, key, ptr] (auto&& f) mutable {
        return futurize_invoke(loader, key).then_wrapped([this, key, ptr = std::move(ptr)] (auto&& f) mutable {
            entry& e = ptr.get_entry();
            try {
                partition_index_page&& page = f.get0();
                e.promise().set_value();
                e.promise()->set_value();
                e.set_page(std::move(page));
                _shard_stats.used_bytes += e.size_in_allocator();
                ++_shard_stats.populations;
                return ptr;
            } catch (...) {
                e.promise().set_exception(std::current_exception());
                e.promise()->set_exception(std::current_exception());
                ptr = {};
                with_allocator(_region.allocator(), [&] {
                    _cache.erase(key);
                });
                throw;
            }
        }).then([ptr] {
            return ptr;
        });
    }
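Moving the shared_promise behind an lw_shared_ptr lets waiters keep the promise alive even if the cache entry that created it goes away before the load completes. A minimal sketch of the pattern with std::shared_ptr and std::promise standing in for the real cache types (illustrative only):

#include <future>
#include <memory>
#include <variant>

struct page { int data; };

// Entry starts as a shared promise; once loaded it becomes the page.
// Waiters copy the shared_ptr, so the promise outlives the entry.
struct entry {
    std::variant<std::shared_ptr<std::promise<void>>, page> state{
        std::make_shared<std::promise<void>>()};

    std::shared_ptr<std::promise<void>> promise() {
        return std::get<std::shared_ptr<std::promise<void>>>(state);
    }

    void set_page(page p) {
        state = p; // drops the entry's reference; waiters keep theirs
    }
};

int main() {
    entry e;
    auto p = e.promise();   // waiter grabs its own reference
    auto fut = p->get_future();
    p->set_value();         // publish "loaded"
    e.set_page(page{42});   // the entry could even be destroyed now
    fut.wait();             // waiter still completes safely
}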
43
test.py
@@ -291,6 +291,8 @@ class Test:
    def print_summary(self):
        pass

    def get_junit_etree(self):
        return None

    def check_log(self, trim):
        """Check and trim logs and xml output for tests which have it"""
@@ -338,9 +340,36 @@ class BoostTest(UnitTest):
        boost_args += ['--color_output=false']
        boost_args += ['--']
        self.args = boost_args + self.args
        self.casename = casename
        self.__junit_etree = None

    def get_junit_etree(self):
        def adjust_suite_name(name):
            # Normalize "path/to/file.cc" to "path.to.file" to conform to
            # Jenkins expectations that the suite name is a class name. ".cc"
            # doesn't add any information. Add the mode, otherwise failures
            # in different modes are indistinguishable. The "test/" prefix adds
            # no information, so remove it.
            import re
            name = re.sub(r'^test/', '', name)
            name = re.sub(r'\.cc$', '', name)
            name = re.sub(r'/', '.', name)
            name = f'{name}.{self.mode}'
            return name
        if self.__junit_etree is None:
            self.__junit_etree = ET.parse(self.xmlout)
            root = self.__junit_etree.getroot()
            suites = root.findall('.//TestSuite')
            for suite in suites:
                suite.attrib['name'] = adjust_suite_name(suite.attrib['name'])
                skipped = suite.findall('./TestCase[@reason="disabled"]')
                for e in skipped:
                    suite.remove(e)
            os.unlink(self.xmlout)
        return self.__junit_etree

    def check_log(self, trim):
        ET.parse(self.xmlout)
        self.get_junit_etree()
        super().check_log(trim)
@@ -800,6 +829,17 @@ def write_junit_report(tmpdir, mode):
    with open(junit_filename, "w") as f:
        ET.ElementTree(xml_results).write(f, encoding="unicode")

def write_consolidated_boost_junit_xml(tmpdir, mode):
    xml = ET.Element("TestLog")
    for suite in TestSuite.suites.values():
        for test in suite.tests:
            if test.mode != mode:
                continue
            test_xml = test.get_junit_etree()
            if test_xml is not None:
                xml.extend(test_xml.getroot().findall('.//TestSuite'))
    et = ET.ElementTree(xml)
    et.write(f'{tmpdir}/{mode}/xml/boost.xunit.xml', encoding='unicode')

def open_log(tmpdir):
    pathlib.Path(tmpdir).mkdir(parents=True, exist_ok=True)
@@ -839,6 +879,7 @@ async def main():

    for mode in options.modes:
        write_junit_report(options.tmpdir, mode)
        write_consolidated_boost_junit_xml(options.tmpdir, mode)

    if 'coverage' in options.modes:
        coverage.generate_coverage_report("build/coverage", "tests")
@@ -16,6 +16,9 @@
# along with Scylla. If not, see <http://www.gnu.org/licenses/>.

# Tests for basic table operations: CreateTable, DeleteTable, ListTables.
# Also some basic tests for UpdateTable - although UpdateTable usually
# enables more elaborate features (such as GSI or Streams) and those are
# tested elsewhere.

import pytest
from botocore.exceptions import ClientError
@@ -311,3 +314,17 @@ def test_table_sse_off(dynamodb):
        KeySchema=[{ 'AttributeName': 'p', 'KeyType': 'HASH' }],
        AttributeDefinitions=[{ 'AttributeName': 'p', 'AttributeType': 'S' }]);
    table.delete();

# Test that trying to delete a table that doesn't exist fails in the
# appropriate way (ResourceNotFoundException)
def test_delete_table_non_existent(dynamodb, test_table):
    client = dynamodb.meta.client
    with pytest.raises(ClientError, match='ResourceNotFoundException'):
        client.delete_table(TableName=random_string(20))

# Test that trying to update a table that doesn't exist fails in the
# appropriate way (ResourceNotFoundException)
def test_update_table_non_existent(dynamodb, test_table):
    client = dynamodb.meta.client
    with pytest.raises(ClientError, match='ResourceNotFoundException'):
        client.update_table(TableName=random_string(20), BillingMode='PAY_PER_REQUEST')

@@ -1043,6 +1043,20 @@ def test_nested_attribute_remove_from_missing_item(test_table_s):
    test_table_s.update_item(Key={'p': p}, UpdateExpression='REMOVE x.y')
    test_table_s.update_item(Key={'p': p}, UpdateExpression='REMOVE x[0]')

# Though in an above test (test_nested_attribute_update_bad_path_dot) we
# showed that DynamoDB does not allow REMOVE x.y if attribute x doesn't
# exist - and generates a ValidationException, if x *does* exist but y
# doesn't, it's fine and the removal should just be silently ignored.
def test_nested_attribute_remove_missing_leaf(test_table_s):
    p = random_string()
    item = {'p': p, 'a': {'x': 3}, 'b': ['hi']}
    test_table_s.put_item(Item=item)
    test_table_s.update_item(Key={'p': p}, UpdateExpression='REMOVE a.y')
    test_table_s.update_item(Key={'p': p}, UpdateExpression='REMOVE b[7]')
    test_table_s.update_item(Key={'p': p}, UpdateExpression='REMOVE c')
    # The above UpdateItem calls didn't change anything...
    assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item'] == item

# Similarly for other types of bad paths - using [0] on something which
# doesn't exist or isn't an array.
def test_nested_attribute_update_bad_path_array(test_table_s):
@@ -19,6 +19,7 @@
 * along with Scylla. If not, see <http://www.gnu.org/licenses/>.
 */

#include <boost/range/irange.hpp>
#include <seastar/testing/test_case.hh>
#include <seastar/testing/thread_test_case.hh>
#include <seastar/core/iostream.hh>
@@ -49,6 +50,15 @@ static sstring read_to_string(cached_file::stream& s, size_t limit = std::numeri
    return b.substr(0, limit);
}

static void read_to_void(cached_file::stream& s, size_t limit = std::numeric_limits<size_t>::max()) {
    while (auto buf = s.next().get0()) {
        if (buf.size() >= limit) {
            break;
        }
        limit -= buf.size();
    }
}

static sstring read_to_string(file& f, size_t start, size_t len) {
    file_input_stream_options opt;
    auto in = make_file_input_stream(f, start, len, opt);
@@ -61,6 +71,12 @@ static sstring read_to_string(cached_file& cf, size_t off, size_t limit = std::n
    return read_to_string(s, limit);
}

[[gnu::unused]]
static void read_to_void(cached_file& cf, size_t off, size_t limit = std::numeric_limits<size_t>::max()) {
    auto s = cf.read(off, default_priority_class(), std::nullopt);
    read_to_void(s, limit);
}

struct test_file {
    tmpdir dir;
    file f;
@@ -204,7 +220,9 @@ SEASTAR_THREAD_TEST_CASE(test_eviction_via_lru) {
    }

    {
        cf_lru.evict_all();
        with_allocator(region.allocator(), [] {
            cf_lru.evict_all();
        });

        BOOST_REQUIRE_EQUAL(0, metrics.cached_bytes); // change here
        BOOST_REQUIRE_EQUAL(0, cf.cached_bytes()); // change here
@@ -212,6 +230,8 @@ SEASTAR_THREAD_TEST_CASE(test_eviction_via_lru) {
        BOOST_REQUIRE_EQUAL(3, metrics.page_evictions); // change here
        BOOST_REQUIRE_EQUAL(0, metrics.page_hits);
        BOOST_REQUIRE_EQUAL(3, metrics.page_populations);

        BOOST_REQUIRE_EQUAL(region.occupancy().used_space(), 0);
    }

    {
@@ -255,6 +275,88 @@ SEASTAR_THREAD_TEST_CASE(test_eviction_via_lru) {
    }
}

// A file which serves garbage but is very fast.
class garbage_file_impl : public file_impl {
private:
    [[noreturn]] void unsupported() {
        throw_with_backtrace<std::logic_error>("unsupported operation");
    }
public:
    // unsupported
    virtual future<size_t> write_dma(uint64_t pos, const void* buffer, size_t len, const io_priority_class& pc) override { unsupported(); }
    virtual future<size_t> write_dma(uint64_t pos, std::vector<iovec> iov, const io_priority_class& pc) override { unsupported(); }
    virtual future<> flush(void) override { unsupported(); }
    virtual future<> truncate(uint64_t length) override { unsupported(); }
    virtual future<> discard(uint64_t offset, uint64_t length) override { unsupported(); }
    virtual future<> allocate(uint64_t position, uint64_t length) override { unsupported(); }
    virtual subscription<directory_entry> list_directory(std::function<future<>(directory_entry)>) override { unsupported(); }
    virtual future<struct stat> stat(void) override { unsupported(); }
    virtual future<uint64_t> size(void) override { unsupported(); }
    virtual std::unique_ptr<seastar::file_handle_impl> dup() override { unsupported(); }

    virtual future<> close() override { return make_ready_future<>(); }

    virtual future<temporary_buffer<uint8_t>> dma_read_bulk(uint64_t offset, size_t size, const io_priority_class& pc) override {
        return make_ready_future<temporary_buffer<uint8_t>>(temporary_buffer<uint8_t>(size));
    }

    virtual future<size_t> read_dma(uint64_t pos, void* buffer, size_t len, const io_priority_class& pc) override {
        unsupported(); // FIXME
    }

    virtual future<size_t> read_dma(uint64_t pos, std::vector<iovec> iov, const io_priority_class& pc) override {
        unsupported(); // FIXME
    }
};

#ifndef SEASTAR_DEFAULT_ALLOCATOR // Eviction works only with the seastar allocator
SEASTAR_THREAD_TEST_CASE(test_stress_eviction) {
    auto page_size = cached_file::page_size;
    auto n_pages = 8'000'000 / page_size;
    auto file_size = page_size * n_pages;
    auto cached_size = 4'000'000;

    cached_file::metrics metrics;
    logalloc::region region;

    auto f = file(make_shared<garbage_file_impl>());
    cached_file cf(f, metrics, cf_lru, region, file_size);

    region.make_evictable([&] {
        testlog.trace("Evicting");
        cf.invalidate_at_most_front(file_size / 2);
        return cf_lru.evict();
    });

    for (int i = 0; i < (cached_size / page_size); ++i) {
        read_to_string(cf, page_size * i, page_size);
    }

    testlog.debug("Saturating memory...");

    // Disable background reclaiming, which would otherwise prevent bugs from reproducing.
    // We want reclamation to happen synchronously with page cache population in read_to_void()
    seastar::memory::set_min_free_pages(0);

    // Saturate std memory
    chunked_fifo<bytes> blobs;
    auto rc = region.reclaim_counter();
    while (region.reclaim_counter() == rc) {
        blobs.emplace_back(bytes(bytes::initialized_later(), 1024));
    }

    testlog.debug("Memory: allocated={}, free={}", seastar::memory::stats().allocated_memory(), seastar::memory::stats().free_memory());
    testlog.debug("Starting test...");

    for (int j = 0; j < n_pages * 16; ++j) {
        testlog.trace("Allocating");
        auto stride = tests::random::get_int(1, 20);
        auto page_idx = tests::random::get_int(n_pages - stride);
        read_to_void(cf, page_idx * page_size, page_size * stride);
    }
}
#endif

SEASTAR_THREAD_TEST_CASE(test_invalidation) {
    auto page_size = cached_file::page_size;
    test_file tf = make_test_file(page_size * 2);
@@ -25,6 +25,8 @@
#include <deque>
#include <random>
#include "utils/lsa/chunked_managed_vector.hh"
#include "utils/managed_ref.hh"
#include "test/lib/log.hh"

#include <boost/range/algorithm/sort.hpp>
#include <boost/range/algorithm/equal.hpp>
@@ -216,3 +218,106 @@ SEASTAR_TEST_CASE(tests_reserve_partial) {
    });
    return make_ready_future<>();
}

SEASTAR_TEST_CASE(test_clear_and_release) {
    region region;
    allocating_section as;

    with_allocator(region.allocator(), [&] {
        lsa::chunked_managed_vector<managed_ref<uint64_t>> v;

        for (uint64_t i = 1; i < 4000; ++i) {
            as(region, [&] {
                v.emplace_back(make_managed<uint64_t>(i));
            });
        }

        v.clear_and_release();
    });

    return make_ready_future<>();
}

SEASTAR_TEST_CASE(test_chunk_reserve) {
    region region;
    allocating_section as;

    for (auto conf :
        { // std::make_pair(reserve size, push count)
            std::make_pair(0, 4000),
            std::make_pair(100, 4000),
            std::make_pair(200, 4000),
            std::make_pair(1000, 4000),
            std::make_pair(2000, 4000),
            std::make_pair(3000, 4000),
            std::make_pair(5000, 4000),
            std::make_pair(500, 8000),
            std::make_pair(1000, 8000),
            std::make_pair(2000, 8000),
            std::make_pair(8000, 500),
        })
    {
        with_allocator(region.allocator(), [&] {
            auto [reserve_size, push_count] = conf;
            testlog.info("Testing reserve({}), {}x emplace_back()", reserve_size, push_count);
            lsa::chunked_managed_vector<managed_ref<uint64_t>> v;
            v.reserve(reserve_size);
            uint64_t seed = rand();
            for (uint64_t i = 0; i < push_count; ++i) {
                as(region, [&] {
                    v.emplace_back(make_managed<uint64_t>(seed + i));
                    BOOST_REQUIRE(**v.begin() == seed);
                });
            }
            auto v_it = v.begin();
            for (uint64_t i = 0; i < push_count; ++i) {
                BOOST_REQUIRE(**v_it++ == seed + i);
            }
            v.clear_and_release();
        });
    }

    return make_ready_future<>();
}

// Tests the case of make_room() invoked with last_chunk_capacity_deficit but _size not in
// the last reserved chunk.
SEASTAR_TEST_CASE(test_shrinking_and_expansion_involving_chunk_boundary) {
    region region;
    allocating_section as;

    with_allocator(region.allocator(), [&] {
        lsa::chunked_managed_vector<managed_ref<uint64_t>> v;

        // Fill two chunks
        v.reserve(2000);
        for (uint64_t i = 0; i < 2000; ++i) {
            as(region, [&] {
                v.emplace_back(make_managed<uint64_t>(i));
            });
        }

        // Make the last chunk smaller than max size to trigger the last_chunk_capacity_deficit path in make_room()
        v.shrink_to_fit();

        // Leave the last chunk reserved but empty
        for (uint64_t i = 0; i < 1000; ++i) {
            v.pop_back();
        }

        // Try to reserve more than the currently reserved capacity and trigger last_chunk_capacity_deficit path
        // with _size not in the last chunk. Should not sigsegv.
        v.reserve(8000);

        for (uint64_t i = 0; i < 2000; ++i) {
            as(region, [&] {
                v.emplace_back(make_managed<uint64_t>(i));
            });
        }

        v.clear_and_release();
    });

    return make_ready_future<>();
}
@@ -191,3 +191,32 @@ BOOST_AUTO_TEST_CASE(tests_reserve_partial) {
        BOOST_REQUIRE_EQUAL(v.capacity(), orig_size);
    }
}

// Tests the case of make_room() invoked with last_chunk_capacity_deficit but _size not in
// the last reserved chunk.
BOOST_AUTO_TEST_CASE(test_shrinking_and_expansion_involving_chunk_boundary) {
    using vector_type = utils::chunked_vector<std::unique_ptr<uint64_t>>;
    vector_type v;

    // Fill two chunks
    v.reserve(vector_type::max_chunk_capacity() * 3 / 2);
    for (uint64_t i = 0; i < vector_type::max_chunk_capacity() * 3 / 2; ++i) {
        v.emplace_back(std::make_unique<uint64_t>(i));
    }

    // Make the last chunk smaller than max size to trigger the last_chunk_capacity_deficit path in make_room()
    v.shrink_to_fit();

    // Leave the last chunk reserved but empty
    for (uint64_t i = 0; i < vector_type::max_chunk_capacity(); ++i) {
        v.pop_back();
    }

    // Try to reserve more than the currently reserved capacity and trigger last_chunk_capacity_deficit path
    // with _size not in the last chunk. Should not sigsegv.
    v.reserve(vector_type::max_chunk_capacity() * 4);

    for (uint64_t i = 0; i < vector_type::max_chunk_capacity() * 2; ++i) {
        v.emplace_back(std::make_unique<uint64_t>(i));
    }
}
@@ -44,7 +44,9 @@
#include "test/lib/tmpdir.hh"
#include "db/commitlog/commitlog.hh"
#include "db/commitlog/commitlog_replayer.hh"
#include "db/commitlog/commitlog_extensions.hh"
#include "db/commitlog/rp_set.hh"
#include "db/extensions.hh"
#include "log.hh"
#include "service/priority_manager.hh"
#include "test/lib/exception_utils.hh"
@@ -947,3 +949,113 @@ SEASTAR_TEST_CASE(test_commitlog_deadlock_with_flush_threshold) {
        co_await log.clear();
    }
}

static future<> do_test_exception_in_allocate_ex(bool do_file_delete, bool reuse = true) {
    commitlog::config cfg;

    constexpr auto max_size_mb = 1;

    cfg.commitlog_segment_size_in_mb = max_size_mb;
    cfg.commitlog_total_space_in_mb = 2 * max_size_mb * smp::count;
    cfg.commitlog_sync_period_in_ms = 10;
    cfg.reuse_segments = reuse;
    cfg.allow_going_over_size_limit = false; // #9348 - now can enforce size limit always
    cfg.use_o_dsync = true; // make sure we pre-allocate.

    // not using cl_test, because we need to be able to abandon
    // the log.

    tmpdir tmp;
    cfg.commit_log_location = tmp.path().string();

    class myfail : public std::exception {
    public:
        using std::exception::exception;
    };

    struct myext: public db::commitlog_file_extension {
    public:
        bool fail = false;
        bool thrown = false;
        bool do_file_delete;

        myext(bool dd)
            : do_file_delete(dd)
        {}

        seastar::future<seastar::file> wrap_file(const seastar::sstring& filename, seastar::file f, seastar::open_flags flags) override {
            if (fail && !thrown) {
                thrown = true;
                if (do_file_delete) {
                    co_await f.close();
                    co_await seastar::remove_file(filename);
                }
                throw myfail{};
            }
            co_return f;
        }
        seastar::future<> before_delete(const seastar::sstring&) override {
            co_return;
        }
    };

    auto ep = std::make_unique<myext>(do_file_delete);
    auto& mx = *ep;

    db::extensions myexts;
    myexts.add_commitlog_file_extension("hufflepuff", std::move(ep));

    cfg.extensions = &myexts;

    auto log = co_await commitlog::create_commitlog(cfg);

    rp_set rps;
    // uncomment for verbosity
    // logging::logger_registry().set_logger_level("commitlog", logging::log_level::debug);

    auto uuid = utils::UUID_gen::get_time_UUID();
    auto size = log.max_record_size();

    auto r = log.add_flush_handler([&](cf_id_type id, replay_position pos) {
        log.discard_completed_segments(id, rps);
        mx.fail = true;
    });

    try {
        while (!mx.thrown) {
            rp_handle h = co_await log.add_mutation(uuid, size, db::commitlog::force_sync::no, [&](db::commitlog::output& dst) {
                dst.fill('1', size);
            });
            rps.put(std::move(h));
        }
    } catch (...) {
        BOOST_FAIL("log write timed out. maybe it is deadlocked... Will not free log. ASAN errors and leaks will follow...");
    }

    co_await log.shutdown();
    co_await log.clear();
}

/**
 * Test generating an exception in segment file allocation
 */
SEASTAR_TEST_CASE(test_commitlog_exceptions_in_allocate_ex) {
    co_await do_test_exception_in_allocate_ex(false);
}

SEASTAR_TEST_CASE(test_commitlog_exceptions_in_allocate_ex_no_recycle) {
    co_await do_test_exception_in_allocate_ex(false, false);
}

/**
 * Test generating an exception in segment file allocation, but also
 * delete the file, which in turn should cause follow-up exceptions
 * in cleanup delete, which CL should handle.
 */
SEASTAR_TEST_CASE(test_commitlog_exceptions_in_allocate_ex_deleted_file) {
    co_await do_test_exception_in_allocate_ex(true, false);
}

SEASTAR_TEST_CASE(test_commitlog_exceptions_in_allocate_ex_deleted_file_no_recycle) {
    co_await do_test_exception_in_allocate_ex(true);
}
@@ -22,6 +22,8 @@
#include <seastar/testing/test_case.hh>
#include "test/lib/cql_test_env.hh"
#include "test/lib/cql_assertions.hh"
#include "cql3/untyped_result_set.hh"
#include "cql3/query_processor.hh"
#include "transport/messages/result_message.hh"

SEASTAR_TEST_CASE(test_index_with_paging) {
@@ -56,3 +58,51 @@ SEASTAR_TEST_CASE(test_index_with_paging) {
        });
    });
}

SEASTAR_TEST_CASE(test_index_with_paging_with_base_short_read) {
    return do_with_cql_env_thread([] (auto& e) {
        e.execute_cql("CREATE TABLE tab (pk int, ck text, v int, v2 int, v3 text, PRIMARY KEY (pk, ck))").get();
        e.execute_cql("CREATE INDEX ON tab (v)").get();

        // Enough to trigger a short read on the base table during scan
        sstring big_string(2 * query::result_memory_limiter::maximum_result_size, 'j');

        const int row_count = 67;
        for (int i = 0; i < row_count; ++i) {
            e.execute_cql(format("INSERT INTO tab (pk, ck, v, v2, v3) VALUES ({}, 'hello{}', 1, {}, '{}')", i % 3, i, i, big_string)).get();
        }

        eventually([&] {
            uint64_t count = 0;
            e.qp().local().query_internal("SELECT * FROM ks.tab WHERE v = 1", [&] (const cql3::untyped_result_set_row&) {
                ++count;
                return make_ready_future<stop_iteration>(stop_iteration::no);
            }).get();
            BOOST_REQUIRE_EQUAL(count, row_count);
        });
    });
}

SEASTAR_TEST_CASE(test_index_with_paging_with_base_short_read_no_ck) {
    return do_with_cql_env_thread([] (auto& e) {
        e.execute_cql("CREATE TABLE tab (pk int, v int, v2 int, v3 text, PRIMARY KEY (pk))").get();
        e.execute_cql("CREATE INDEX ON tab (v)").get();

        // Enough to trigger a short read on the base table during scan
        sstring big_string(2 * query::result_memory_limiter::maximum_result_size, 'j');

        const int row_count = 67;
        for (int i = 0; i < row_count; ++i) {
            e.execute_cql(format("INSERT INTO tab (pk, v, v2, v3) VALUES ({}, 1, {}, '{}')", i, i, big_string)).get();
        }

        eventually([&] {
            uint64_t count = 0;
            e.qp().local().query_internal("SELECT * FROM ks.tab WHERE v = 1", [&] (const cql3::untyped_result_set_row&) {
                ++count;
                return make_ready_future<stop_iteration>(stop_iteration::no);
            }).get();
            BOOST_REQUIRE_EQUAL(count, row_count);
        });
    });
}
@@ -702,6 +702,7 @@ SEASTAR_TEST_CASE(test_cell_ordering) {
    };

    auto assert_equal = [] (atomic_cell_view c1, atomic_cell_view c2) {
        testlog.trace("Expected {} == {}", c1, c2);
        BOOST_REQUIRE(compare_atomic_cell_for_merge(c1, c2) == 0);
        BOOST_REQUIRE(compare_atomic_cell_for_merge(c2, c1) == 0);
    };
@@ -723,9 +724,11 @@ SEASTAR_TEST_CASE(test_cell_ordering) {
        atomic_cell::make_live(*bytes_type, 1, bytes(), expiry_2, ttl_2));

    // Origin doesn't compare ttl (is it wise?)
    assert_equal(
        atomic_cell::make_live(*bytes_type, 1, bytes("value"), expiry_1, ttl_1),
        atomic_cell::make_live(*bytes_type, 1, bytes("value"), expiry_1, ttl_2));
    // But we do. See https://github.com/scylladb/scylla/issues/10156
    // and https://github.com/scylladb/scylla/issues/10173
    assert_order(
        atomic_cell::make_live(*bytes_type, 1, bytes("value"), expiry_1, ttl_2),
        atomic_cell::make_live(*bytes_type, 1, bytes("value"), expiry_1, ttl_1));

    assert_order(
        atomic_cell::make_live(*bytes_type, 0, bytes("value1")),
@@ -210,6 +210,35 @@ BOOST_AUTO_TEST_CASE(test_overlapping_addition) {
    BOOST_REQUIRE(it == l.end());
}

BOOST_AUTO_TEST_CASE(test_adjacent_empty_range_tombstone) {
    range_tombstone_list l(*s);

    l.apply(*s, rtie(1, 1, 2));
    l.apply(*s, rt(1, 2, 3));
    l.apply(*s, rtei(2, 2, 2));
    l.apply(*s, rtei(2, 4, 3));

    auto it = l.begin();
    assert_rt(rt(1, 4, 3), *it++);
    BOOST_REQUIRE(it == l.end());
}

BOOST_AUTO_TEST_CASE(test_empty_range_tombstones_are_dropped) {
    range_tombstone_list l(*s);

    l.apply(*s, rtei(0, 0, 1));
    l.apply(*s, rtie(0, 0, 1));
    l.apply(*s, rt(1, 2, 1));
    l.apply(*s, rtei(4, 4, 1));
    l.apply(*s, rtie(5, 5, 1));
    l.apply(*s, rt(7, 8, 1));

    auto it = l.begin();
    assert_rt(rt(1, 2, 1), *it++);
    assert_rt(rt(7, 8, 1), *it++);
    BOOST_REQUIRE(it == l.end());
}

BOOST_AUTO_TEST_CASE(test_simple_overlap) {
    range_tombstone_list l1(*s);

@@ -473,6 +502,23 @@ static std::vector<range_tombstone> make_random() {
        rts.emplace_back(std::move(start_b), std::move(end_b), tombstone(dist(gen), gc_now));
    }

    int32_t size_empty = dist(gen) / 2;
    for (int32_t i = 0; i < size_empty; ++i) {
        clustering_key_prefix key = make_random_ckey();
        bool start_incl = dist(gen) > 25;
        if (start_incl) {
            rts.emplace_back(
                position_in_partition::before_key(key),
                position_in_partition::before_key(key),
                tombstone(dist(gen), gc_now));
        } else {
            rts.emplace_back(
                position_in_partition::after_key(key),
                position_in_partition::after_key(key),
                tombstone(dist(gen), gc_now));
        }
    }

    return rts;
}
@@ -37,11 +37,13 @@ static void add_entry(logalloc::region& r,
{
    logalloc::allocating_section as;
    as(r, [&] {
        sstables::key sst_key = sstables::key::from_partition_key(s, key);
        page._entries.push_back(make_managed<index_entry>(
            managed_bytes(sst_key.get_bytes()),
            position,
            managed_ref<promoted_index>()));
        with_allocator(r.allocator(), [&] {
            sstables::key sst_key = sstables::key::from_partition_key(s, key);
            page._entries.push_back(make_managed<index_entry>(
                managed_bytes(sst_key.get_bytes()),
                position,
                managed_ref<promoted_index>()));
        });
    });
}
@@ -128,3 +128,16 @@ def test_operator_ne_not_supported(cql, table1):
        cql.execute(f'SELECT a FROM {table1} WHERE a != 0')
    with pytest.raises(InvalidRequest, match='Unsupported.*!='):
        cql.execute(f'SELECT a FROM {table1} WHERE token(a) != 0')

# Test that the fact that a column is indexed does not cause us to fetch
# incorrect results from a filtering query (issue #10300).
def test_index_with_in_relation(scylla_only, cql, test_keyspace):
    schema = 'p int, c int, v boolean, primary key (p,c)'
    with new_test_table(cql, test_keyspace, schema) as table:
        cql.execute(f"create index on {table}(v)")
        for p, c, v in [(0,0,True),(0,1,False),(0,2,True),(0,3,False),
                        (1,0,True),(1,1,False),(1,2,True),(1,3,False),
                        (2,0,True),(2,1,False),(2,2,True),(2,3,False)]:
            cql.execute(f"insert into {table} (p,c,v) values ({p}, {c}, {v})")
        res = cql.execute(f"select * from {table} where p in (0,1) and v = False ALLOW FILTERING")
        assert set(res) == set([(0,1,False),(0,3,False),(1,1,False),(1,3,False)])
@@ -63,8 +63,9 @@ def test_insert_null_key(cql, table1):
    with pytest.raises(InvalidRequest, match='null value'):
        cql.execute(stmt, [None, s])

# Tests handling of "key_column in ?" where ? is bound to null.
# Reproduces issue #8265.
def test_primary_key_in_null(cql, table1):
    '''Tests handling of "key_column in ?" where ? is bound to null.'''
    with pytest.raises(InvalidRequest, match='null value'):
        cql.execute(cql.prepare(f"SELECT p FROM {table1} WHERE p IN ?"), [None])
    with pytest.raises(InvalidRequest, match='null value'):
@@ -159,6 +160,20 @@ def test_delete_empty_string_key(cql, table1):
    with pytest.raises(InvalidRequest, match='Key may not be empty'):
        cql.execute(f"DELETE FROM {table1} WHERE p='' AND c='{s}'")

# Another test like test_insert_empty_string_key() just using an INSERT JSON
# instead of a regular INSERT. Because INSERT JSON takes a different code path
# from regular INSERT, we need the emptiness test in yet another place.
# Reproduces issue #9853 (the empty-string partition key was allowed, and
# actually inserted into the table.)
def test_insert_json_empty_string_key(cql, table1):
    s = random_string()
    # An empty-string clustering *is* allowed:
    cql.execute("""INSERT INTO %s JSON '{"p": "%s", "c": "", "v": "cat"}'""" % (table1, s))
    assert list(cql.execute(f"SELECT v FROM {table1} WHERE p='{s}' AND c=''")) == [('cat',)]
    # But an empty-string partition key is *not* allowed, with a specific
    # error that a "Key may not be empty":
    with pytest.raises(InvalidRequest, match='Key may not be empty'):
        cql.execute("""INSERT INTO %s JSON '{"p": "", "c": "%s", "v": "cat"}'""" % (table1, s))

# Although an empty string is not allowed as a partition key (as tested
# above by test_empty_string_key()), it turns out that in a *compound*
@@ -1,4 +1,14 @@
create table tb2 (pk int, ck int, PRIMARY KEY (pk, ck)) with compact storage and cdc = {'enabled': true, 'preimage': true, 'postimage': true};
-- Should add 3 rows (preimage + postimage + delta). Delta has only key columns and "pk" + "ck"
insert into tb2 (pk, ck) VALUES (2, 22) USING TTL 2222;
select "cdc$batch_seq_no", "cdc$operation", "cdc$ttl", pk, ck from tb2_scylla_cdc_log;
create table tb2 (pk int, ck int, v int, PRIMARY KEY (pk, ck)) with compact storage and cdc = {'enabled': true, 'preimage': true, 'postimage': true};
-- Should add 2 rows (postimage + delta).
insert into tb2 (pk, ck, v) VALUES (2, 22, 111) USING TTL 2222;
select "cdc$batch_seq_no", "cdc$operation", "cdc$ttl", pk, ck, v from tb2_scylla_cdc_log;
-- Should add 3 rows (preimage + postimage + delta).
insert into tb2 (pk, ck, v) VALUES (2, 22, 1111) USING TTL 2223;
select "cdc$batch_seq_no", "cdc$operation", "cdc$ttl", pk, ck, v from tb2_scylla_cdc_log;
create table tb3 (pk int, ck int, PRIMARY KEY (pk, ck)) with compact storage and cdc = {'enabled': true, 'preimage': true, 'postimage': true};
-- Should add 2 rows (postimage + delta).
insert into tb3 (pk, ck) VALUES (2, 22) USING TTL 2222;
select "cdc$batch_seq_no", "cdc$operation", "cdc$ttl", pk, ck from tb3_scylla_cdc_log;
-- Should add 3 rows (preimage + postimage + delta).
insert into tb3 (pk, ck) VALUES (2, 22) USING TTL 2223;
select "cdc$batch_seq_no", "cdc$operation", "cdc$ttl", pk, ck from tb3_scylla_cdc_log;
@@ -1,13 +1,91 @@
create table tb2 (pk int, ck int, PRIMARY KEY (pk, ck)) with compact storage and cdc = {'enabled': true, 'preimage': true, 'postimage': true};
create table tb2 (pk int, ck int, v int, PRIMARY KEY (pk, ck)) with compact storage and cdc = {'enabled': true, 'preimage': true, 'postimage': true};
{
    "status" : "ok"
}
-- Should add 3 rows (preimage + postimage + delta). Delta has only key columns and "pk" + "ck"
insert into tb2 (pk, ck) VALUES (2, 22) USING TTL 2222;
-- Should add 2 rows (postimage + delta).
insert into tb2 (pk, ck, v) VALUES (2, 22, 111) USING TTL 2222;
{
    "status" : "ok"
}
select "cdc$batch_seq_no", "cdc$operation", "cdc$ttl", pk, ck from tb2_scylla_cdc_log;
select "cdc$batch_seq_no", "cdc$operation", "cdc$ttl", pk, ck, v from tb2_scylla_cdc_log;
{
    "rows" :
    [
        {
            "cdc$batch_seq_no" : "0",
            "cdc$operation" : "1",
            "cdc$ttl" : "2222",
            "ck" : "22",
            "pk" : "2",
            "v" : "111"
        },
        {
            "cdc$batch_seq_no" : "1",
            "cdc$operation" : "9",
            "ck" : "22",
            "pk" : "2",
            "v" : "111"
        }
    ]
}
-- Should add 3 rows (preimage + postimage + delta).
insert into tb2 (pk, ck, v) VALUES (2, 22, 1111) USING TTL 2223;
{
    "status" : "ok"
}
select "cdc$batch_seq_no", "cdc$operation", "cdc$ttl", pk, ck, v from tb2_scylla_cdc_log;
{
    "rows" :
    [
        {
            "cdc$batch_seq_no" : "0",
            "cdc$operation" : "1",
            "cdc$ttl" : "2222",
            "ck" : "22",
            "pk" : "2",
            "v" : "111"
        },
        {
            "cdc$batch_seq_no" : "1",
            "cdc$operation" : "9",
            "ck" : "22",
            "pk" : "2",
            "v" : "111"
        },
        {
            "cdc$batch_seq_no" : "0",
            "cdc$operation" : "0",
            "ck" : "22",
            "pk" : "2",
            "v" : "111"
        },
        {
            "cdc$batch_seq_no" : "1",
            "cdc$operation" : "1",
            "cdc$ttl" : "2223",
            "ck" : "22",
            "pk" : "2",
            "v" : "1111"
        },
        {
            "cdc$batch_seq_no" : "2",
            "cdc$operation" : "9",
            "ck" : "22",
            "pk" : "2",
            "v" : "1111"
        }
    ]
}
create table tb3 (pk int, ck int, PRIMARY KEY (pk, ck)) with compact storage and cdc = {'enabled': true, 'preimage': true, 'postimage': true};
{
    "status" : "ok"
}
-- Should add 2 rows (postimage + delta).
insert into tb3 (pk, ck) VALUES (2, 22) USING TTL 2222;
{
    "status" : "ok"
}
select "cdc$batch_seq_no", "cdc$operation", "cdc$ttl", pk, ck from tb3_scylla_cdc_log;
{
    "rows" :
    [
@@ -26,3 +104,46 @@ select "cdc$batch_seq_no", "cdc$operation", "cdc$ttl", pk, ck from tb2_scylla_cd
        }
    ]
}
-- Should add 3 rows (preimage + postimage + delta).
insert into tb3 (pk, ck) VALUES (2, 22) USING TTL 2223;
{
    "status" : "ok"
}
select "cdc$batch_seq_no", "cdc$operation", "cdc$ttl", pk, ck from tb3_scylla_cdc_log;
{
    "rows" :
    [
        {
            "cdc$batch_seq_no" : "0",
            "cdc$operation" : "1",
            "cdc$ttl" : "2222",
            "ck" : "22",
            "pk" : "2"
        },
        {
            "cdc$batch_seq_no" : "1",
            "cdc$operation" : "9",
            "ck" : "22",
            "pk" : "2"
        },
        {
            "cdc$batch_seq_no" : "0",
            "cdc$operation" : "0",
            "ck" : "22",
            "pk" : "2"
        },
        {
            "cdc$batch_seq_no" : "1",
            "cdc$operation" : "1",
            "cdc$ttl" : "2223",
            "ck" : "22",
            "pk" : "2"
        },
        {
            "cdc$batch_seq_no" : "2",
            "cdc$operation" : "9",
            "ck" : "22",
            "pk" : "2"
        }
    ]
}
@@ -1230,7 +1230,7 @@ std::unique_ptr<cql_server::response> cql_server::connection::make_read_timeout_
std::unique_ptr<cql_server::response> cql_server::connection::make_read_failure_error(int16_t stream, exceptions::exception_code err, sstring msg, db::consistency_level cl, int32_t received, int32_t numfailures, int32_t blockfor, bool data_present, const tracing::trace_state_ptr& tr_state) const
{
if (_version < 4) {
return make_read_timeout_error(stream, err, std::move(msg), cl, received, blockfor, data_present, tr_state);
return make_read_timeout_error(stream, exceptions::exception_code::READ_TIMEOUT, std::move(msg), cl, received, blockfor, data_present, tr_state);
}
auto response = std::make_unique<cql_server::response>(stream, cql_binary_opcode::ERROR, tr_state);
response->write_int(static_cast<int32_t>(err));
@@ -1258,7 +1258,7 @@ std::unique_ptr<cql_server::response> cql_server::connection::make_mutation_writ
std::unique_ptr<cql_server::response> cql_server::connection::make_mutation_write_failure_error(int16_t stream, exceptions::exception_code err, sstring msg, db::consistency_level cl, int32_t received, int32_t numfailures, int32_t blockfor, db::write_type type, const tracing::trace_state_ptr& tr_state) const
{
if (_version < 4) {
return make_mutation_write_timeout_error(stream, err, std::move(msg), cl, received, blockfor, type, tr_state);
return make_mutation_write_timeout_error(stream, exceptions::exception_code::WRITE_TIMEOUT, std::move(msg), cl, received, blockfor, type, tr_state);
}
auto response = std::make_unique<cql_server::response>(stream, cql_binary_opcode::ERROR, tr_state);
response->write_int(static_cast<int32_t>(err));
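Both hunks above fix the same protocol downgrade: Read_failure and Write_failure error codes exist only from CQL binary protocol v4, so for a v3 client the server rewrites the error as a timeout, and it must substitute the corresponding timeout code rather than forward the v4 failure code the client cannot decode. A standalone sketch of the substitution, using the native-protocol error-code values (the helper name is illustrative, not Scylla's):

    #include <cassert>
    #include <cstdint>

    enum class exception_code : int32_t {       // subset of the protocol's error codes
        WRITE_TIMEOUT = 0x1100,
        READ_TIMEOUT  = 0x1200,
        READ_FAILURE  = 0x1300,
        WRITE_FAILURE = 0x1500,
    };

    // What the fixed code effectively does before writing a pre-v4 error frame.
    exception_code downgrade_for_pre_v4(exception_code err) {
        switch (err) {
        case exception_code::READ_FAILURE:  return exception_code::READ_TIMEOUT;
        case exception_code::WRITE_FAILURE: return exception_code::WRITE_TIMEOUT;
        default:                            return err;
        }
    }

    int main() {
        const int version = 3;                  // client's negotiated protocol version
        exception_code err = exception_code::READ_FAILURE;
        if (version < 4) {
            err = downgrade_for_pre_v4(err);
        }
        assert(err == exception_code::READ_TIMEOUT);
    }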
@@ -157,6 +157,7 @@ private:
metrics& _metrics;
lru& _lru;
logalloc::region& _region;
logalloc::allocating_section _as;

using cache_type = bplus::tree<page_idx_type, cached_page, page_idx_less_comparator, 12, bplus::key_search::linear>;
cache_type _cache;
@@ -187,10 +188,15 @@ private:
.then([this, idx] (temporary_buffer<char>&& buf) mutable {
cached_page::ptr_type first_page;
while (buf.size()) {
auto this_buf = buf.share();
this_buf.trim(std::min(page_size, buf.size()));
buf.trim_front(this_buf.size());
auto it_and_flag = _cache.emplace(idx, this, idx, std::move(this_buf));
auto this_size = std::min(page_size, buf.size());
// _cache.emplace() needs to run under allocating section even though it lives in the std space
// because bplus::tree operations are not reentrant, so we need to prevent memory reclamation.
auto it_and_flag = _as(_region, [&] {
auto this_buf = buf.share();
this_buf.trim(this_size);
return _cache.emplace(idx, this, idx, std::move(this_buf));
});
buf.trim_front(this_size);
++idx;
cached_page &cp = *it_and_flag.first;
if (it_and_flag.second) {
@@ -333,6 +339,7 @@ public:
}

size_t evict_range(cache_type::iterator start, cache_type::iterator end) noexcept {
return with_allocator(standard_allocator(), [&] {
size_t count = 0;
auto disposer = [] (auto* p) noexcept {};
while (start != end) {
@@ -345,6 +352,7 @@ public:
}
}
return count;
});
}
public:
/// \brief Constructs a cached_file.
@@ -471,8 +479,10 @@ public:
inline
void cached_file::cached_page::on_evicted() noexcept {
parent->on_evicted(*this);
cached_file::cache_type::iterator it(this);
it.erase(page_idx_less_comparator());
with_allocator(standard_allocator(), [this] {
cached_file::cache_type::iterator it(this);
it.erase(page_idx_less_comparator());
});
}

class cached_file_impl : public file_impl {
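The populate hunk above routes _cache.emplace() through logalloc::allocating_section so that memory reclamation, which can re-enter arbitrary code and would race with the non-reentrant bplus::tree mutation, stays out of the critical section; the evict_range and on_evicted hunks wrap the matching erases in with_allocator(standard_allocator(), ...) so nodes are freed in the same (std) space they were allocated in. A simplified standalone sketch of the first pattern, with hypothetical names (reclaimable_heap, with_reclaim_disabled) standing in for the region/allocating_section machinery:

    #include <cassert>
    #include <map>
    #include <vector>

    class reclaimable_heap {
        bool _reclaim_enabled = true;
    public:
        bool reclaim_enabled() const { return _reclaim_enabled; }
        void set_reclaim(bool on) { _reclaim_enabled = on; }
    };

    // Mirrors allocating_section::operator()(region&, func): disable
    // reclamation, run the non-reentrant mutation, restore on any exit.
    template <typename Func>
    auto with_reclaim_disabled(reclaimable_heap& heap, Func&& f) {
        bool prev = heap.reclaim_enabled();
        heap.set_reclaim(false);
        try {
            auto ret = f();
            heap.set_reclaim(prev);
            return ret;
        } catch (...) {
            heap.set_reclaim(prev);
            throw;
        }
    }

    int main() {
        reclaimable_heap heap;
        std::map<int, std::vector<char>> cache;  // stand-in for bplus::tree
        auto it_and_flag = with_reclaim_disabled(heap, [&] {
            assert(!heap.reclaim_enabled());     // reclamation cannot re-enter here
            return cache.emplace(0, std::vector<char>(4096));
        });
        assert(it_and_flag.second);
    }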
@@ -52,10 +52,11 @@ class chunked_vector {
utils::small_vector<chunk_ptr, 1> _chunks;
size_t _size = 0;
size_t _capacity = 0;
private:
public:
static size_t max_chunk_capacity() {
return std::max(max_contiguous_allocation / sizeof(T), size_t(1));
}
private:
void reserve_for_push_back() {
if (_size == _capacity) {
do_reserve_for_push_back();
@@ -387,7 +388,9 @@ chunked_vector<T, max_contiguous_allocation>::make_room(size_t n, bool stop_afte
auto new_last_chunk_capacity = last_chunk_capacity + capacity_increase;
// FIXME: realloc? maybe not worth the complication; only works for PODs
auto new_last_chunk = new_chunk(new_last_chunk_capacity);
migrate(addr(_capacity - last_chunk_capacity), addr(_size), new_last_chunk.get());
if (_size > _capacity - last_chunk_capacity) {
migrate(addr(_capacity - last_chunk_capacity), addr(_size), new_last_chunk.get());
}
_chunks.back() = std::move(new_last_chunk);
_capacity += capacity_increase;
}
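The make_room() hunk adds a guard around the migration: after reserve(), the last chunk can hold no live elements (_size <= _capacity - last_chunk_capacity), and the source range passed to migrate() would then be inverted. A small self-contained check of the guard condition (growth_state and needs_migration are illustrative stand-ins for the fields in the hunk):

    #include <cassert>
    #include <cstddef>

    struct growth_state {
        size_t size;                 // live elements
        size_t capacity;             // total slots across all chunks
        size_t last_chunk_capacity;  // slots in the chunk being regrown
    };

    // The guard added above: migrate only when the last chunk holds live
    // elements; otherwise addr(_capacity - last_chunk_capacity) > addr(_size).
    bool needs_migration(const growth_state& s) {
        return s.size > s.capacity - s.last_chunk_capacity;
    }

    int main() {
        // reserve() grew capacity ahead of size: the last chunk is still empty.
        assert(!needs_migration({8, 24, 16}));
        // push_back-driven growth: elements 8..19 live in the last chunk.
        assert(needs_migration({20, 24, 16}));
    }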
@@ -597,6 +597,10 @@ static constexpr auto max_used_space_ratio_for_compaction = 0.85;
static constexpr size_t max_used_space_for_compaction = segment_size * max_used_space_ratio_for_compaction;
static constexpr size_t min_free_space_for_compaction = segment_size - max_used_space_for_compaction;

struct [[gnu::packed]] non_lsa_object_cookie {
uint64_t value = 0xbadcaffe;
};

static_assert(min_free_space_for_compaction >= max_managed_object_size,
"Segments which cannot fit max_managed_object_size must not be considered compactible for the sake of forward progress of compaction");

@@ -840,9 +844,13 @@ public:
void clear_allocation_failure_flag() { _allocation_failure_flag = false; }
bool allocation_failure_flag() { return _allocation_failure_flag; }
void refill_emergency_reserve();
void update_non_lsa_memory_in_use(ssize_t n) {
void add_non_lsa_memory_in_use(size_t n) {
_non_lsa_memory_in_use += n;
}
void subtract_non_lsa_memory_in_use(size_t n) {
assert(_non_lsa_memory_in_use >= n);
_non_lsa_memory_in_use -= n;
}
size_t non_lsa_memory_in_use() const {
return _non_lsa_memory_in_use;
}
@@ -1395,6 +1403,8 @@ private:
}

lsa_buffer alloc_buf(size_t buf_size) {
// Note: Can be re-entered from allocation sites below due to memory reclamation which
// invokes segment compaction.
static_assert(segment::size % buf_align == 0);
if (buf_size > segment::size) {
throw_with_backtrace<std::runtime_error>(format("Buffer size {} too large", buf_size));
@@ -1447,6 +1457,7 @@ private:

if (seg != _buf_active) {
if (desc.is_empty()) {
assert(desc._buf_pointers.empty());
_segment_descs.erase(desc);
desc._buf_pointers = std::vector<entangled>();
free_segment(seg, desc);
@@ -1457,7 +1468,7 @@ private:
}
}

void compact_segment_locked(segment* seg, segment_descriptor& desc) {
void compact_segment_locked(segment* seg, segment_descriptor& desc) noexcept {
auto seg_occupancy = desc.occupancy();
llogger.debug("Compacting segment {} from region {}, {}", fmt::ptr(seg), id(), seg_occupancy);

@@ -1472,6 +1483,7 @@ private:
for (entangled& e : _buf_ptrs_for_compact_segment) {
if (e) {
lsa_buffer* old_ptr = e.get(&lsa_buffer::_link);
assert(&desc == old_ptr->_desc);
lsa_buffer dst = alloc_buf(old_ptr->_size);
memcpy(dst._buf, old_ptr->_buf, dst._size);
old_ptr->_link = std::move(dst._link);
@@ -1502,6 +1514,10 @@ private:
std::vector<entangled> ptrs;
ptrs.reserve(segment::size / buf_align);
segment* new_active = new_segment();
if (_buf_active) [[unlikely]] {
// Memory allocation above could allocate active buffer during segment compaction.
close_buf_active();
}
assert((uintptr_t)new_active->at(0) % buf_align == 0);
segment_descriptor& desc = shard_segment_pool.descriptor(new_active);
desc._buf_pointers = std::move(ptrs);
@@ -1635,17 +1651,18 @@ public:
memory::on_alloc_point();
shard_segment_pool.on_memory_allocation(size);
if (size > max_managed_object_size) {
auto ptr = standard_allocator().alloc(migrator, size, alignment);
auto ptr = standard_allocator().alloc(migrator, size + sizeof(non_lsa_object_cookie), alignment);
// This isn't very accurate, the correct free_space value would be
// malloc_usable_size(ptr) - size, but there is no way to get
// the exact object size at free.
auto allocated_size = malloc_usable_size(ptr);
new ((char*)ptr + allocated_size - sizeof(non_lsa_object_cookie)) non_lsa_object_cookie();
_non_lsa_occupancy += occupancy_stats(0, allocated_size);
if (_group) {
_evictable_space += allocated_size;
_group->increase_usage(_heap_handle, allocated_size);
}
shard_segment_pool.update_non_lsa_memory_in_use(allocated_size);
shard_segment_pool.add_non_lsa_memory_in_use(allocated_size);
return ptr;
} else {
auto ptr = alloc_small(object_descriptor(migrator), (segment::size_type) size, alignment);
@@ -1657,12 +1674,14 @@ public:
private:
void on_non_lsa_free(void* obj) noexcept {
auto allocated_size = malloc_usable_size(obj);
auto cookie = (non_lsa_object_cookie*)((char*)obj + allocated_size) - 1;
assert(cookie->value == non_lsa_object_cookie().value);
_non_lsa_occupancy -= occupancy_stats(0, allocated_size);
if (_group) {
_evictable_space -= allocated_size;
_group->decrease_usage(_heap_handle, allocated_size);
}
shard_segment_pool.update_non_lsa_memory_in_use(-allocated_size);
shard_segment_pool.subtract_non_lsa_memory_in_use(allocated_size);
}
public:
virtual void free(void* obj) noexcept override {
@@ -2188,8 +2207,8 @@ private:
auto info_level = _stall_detected ? log_level::info : log_level::debug;
auto MiB = 1024*1024;

timing_logger.log(time_level, "Reclamation cycle took {} ms, trying to release {:.3f} MiB {}preemptibly",
_duration.count(), (float)_memory_to_release / MiB, _preemptible ? "" : "non-");
timing_logger.log(time_level, "Reclamation cycle took {} us, trying to release {:.3f} MiB {}preemptibly",
_duration / 1us, (float)_memory_to_release / MiB, _preemptible ? "" : "non-");
log_if_any(info_level, "reserved segments", _reserve_segments);
if (_memory_released > 0) {
auto bytes_per_second =
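The non_lsa_object_cookie hunks add a trailing canary to oversized (non-LSA) allocations: the allocation is padded by sizeof(non_lsa_object_cookie), the cookie is constructed at the end of the usable size, and on_non_lsa_free() asserts it is intact, catching objects freed through the wrong path or overrun past their end. A standalone sketch of the same pattern (malloc_usable_size is glibc-specific; the names here are illustrative):

    #include <cassert>
    #include <cstdint>
    #include <cstdlib>
    #include <cstring>
    #include <malloc.h>
    #include <new>

    struct [[gnu::packed]] trailing_cookie {
        uint64_t value = 0xbadcaffe;  // same magic as non_lsa_object_cookie
    };

    void* alloc_with_cookie(size_t size) {
        // Over-allocate, then place the cookie at the end of the *usable*
        // allocation, exactly as the hunk does.
        void* ptr = std::malloc(size + sizeof(trailing_cookie));
        size_t usable = malloc_usable_size(ptr);
        new ((char*)ptr + usable - sizeof(trailing_cookie)) trailing_cookie();
        return ptr;
    }

    void free_with_cookie(void* ptr) {
        size_t usable = malloc_usable_size(ptr);
        auto* cookie = (trailing_cookie*)((char*)ptr + usable) - 1;
        assert(cookie->value == trailing_cookie{}.value);  // dies on mismatch
        std::free(ptr);
    }

    int main() {
        void* p = alloc_with_cookie(100);
        std::memset(p, 0xab, 100);  // writing the requested bytes is fine
        free_with_cookie(p);
    }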
@@ -73,6 +73,9 @@ private:
throw std::out_of_range("chunked_managed_vector out of range access");
}
}
chunk_ptr& back_chunk() {
return _chunks[_size / max_chunk_capacity()];
}
static void migrate(T* begin, T* end, managed_vector<T>& result);
public:
using value_type = T;
@@ -119,24 +122,24 @@ public:

void push_back(const T& x) {
reserve_for_push_back();
_chunks.back().emplace_back(x);
back_chunk().emplace_back(x);
++_size;
}
void push_back(T&& x) {
reserve_for_push_back();
_chunks.back().emplace_back(std::move(x));
back_chunk().emplace_back(std::move(x));
++_size;
}
template <typename... Args>
T& emplace_back(Args&&... args) {
reserve_for_push_back();
auto& ret = _chunks.back().emplace_back(std::forward<Args>(args)...);
auto& ret = back_chunk().emplace_back(std::forward<Args>(args)...);
++_size;
return ret;
}
void pop_back() {
--_size;
_chunks.back().pop_back();
back_chunk().pop_back();
}
const T& back() const {
return *addr(_size - 1);
@@ -394,7 +397,9 @@ chunked_managed_vector<T>::make_room(size_t n, bool stop_after_one) {
auto new_last_chunk_capacity = last_chunk_capacity + capacity_increase;
// FIXME: realloc? maybe not worth the complication; only works for PODs
auto new_last_chunk = new_chunk(new_last_chunk_capacity);
migrate(addr(_capacity - last_chunk_capacity), addr(_size), new_last_chunk);
if (_size > _capacity - last_chunk_capacity) {
migrate(addr(_capacity - last_chunk_capacity), addr(_size), new_last_chunk);
}
_chunks.back() = std::move(new_last_chunk);
_capacity += capacity_increase;
}
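The back_chunk() hunks fix appends after reserve(): _chunks.back() is the last allocated chunk, which after a large reserve() may lie beyond the chunk that element number _size actually belongs to, so push_back/emplace_back/pop_back must select the chunk by position. A toy illustration with 4-element chunks (toy_chunked_vector is a hypothetical stand-in):

    #include <cassert>
    #include <cstddef>
    #include <memory>
    #include <vector>

    struct toy_chunked_vector {
        static constexpr size_t max_chunk_capacity = 4;
        std::vector<std::unique_ptr<std::vector<int>>> chunks;
        size_t size = 0;

        // The fixed accessor: chunk chosen by element position.
        std::vector<int>& back_chunk() {
            return *chunks[size / max_chunk_capacity];
        }
        // reserve() can append several empty chunks ahead of any elements.
        void reserve_chunks(size_t n) {
            while (chunks.size() < n) {
                auto c = std::make_unique<std::vector<int>>();
                c->reserve(max_chunk_capacity);
                chunks.push_back(std::move(c));
            }
        }
        void push_back(int x) {
            back_chunk().push_back(x);  // chunks.back() would hit chunks[2]
            ++size;
        }
    };

    int main() {
        toy_chunked_vector v;
        v.reserve_chunks(3);   // room for 12 elements, none live yet
        v.push_back(42);       // must land in chunks[0]
        assert(v.chunks[0]->size() == 1);
        assert(v.chunks[2]->empty());
    }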