Merge 'Move the reader concurrency semaphore in front of the cache' from Botond
This patchset combines two important changes to the way reader permits are created and admitted: 1) It switches admission to be up-front. 2) It changes the admission algorithm. (1) Currently permits are created before the read is started, but they only wait for admission when going to the disk. This leaves the resource consumption of cache and memtable reads unbounded, possibly leading to OOM (rare but happens). This series changes this so that permits are admitted at the moment they are created, making admission up-front -- at least for those reads that pass admission at all (some don't). (2) Admission is currently based on availability of resources. We have a certain amount of memory available, which is derived from the memory available to the shard, as well as a hardcoded count resource. Reads are admitted when a count and a certain amount (base cost) of memory is available. This patchset adds a new aspect to this admission process beyond the existing resource availability: the number of used/blocked reads. Namely, it only admits new reads if, in addition to the necessary amount of resources being available, all currently used readers are blocked. In other words, we only admit new reads if all currently admitted reads require something other than CPU to progress. They are either waiting on I/O, a remote shard, or attention from their consumers (not used currently). The reason for making these two changes at the same time is that up-front admission means cache reads now need to obtain a permit too. For cache reads the optimal concurrency is 1. Anything above that just increases latency (without increasing throughput). So we want to make sure that if a cache read hits, it doesn't get any competition for CPU and it can run to completion. We admit new reads only if the read misses and has to go to disk. A side effect of these changes is that the execution stages from the replica-side read path are replaced with the reader concurrency semaphore acting as an execution stage. 
This is necessary due to a bad interaction between said execution stages and up-front admission. This has an important consequence: read timeouts are more strictly enforced, because the execution stage doesn't have a timeout, so it can execute already timed-out reads too. This is not the case with the semaphore's queue, which will drop timed-out reads. Another consequence is that now data and mutation reads share the same execution stage, which increases its effectiveness; on the other hand, system and user reads don't anymore. Fixes: #4758 Fixes: #5718 Tests: unit(dev, release, debug) * 'reader-concurrency-semaphore-in-front-of-the-cache/v5.3' of https://github.com/denesb/scylla: (54 commits) test/boost/reader_concurrency_semaphore_test: add used/blocked test test/boost/reader_concurrency_semaphore_test: add admission test reader_permit: add operator<< for reader_resources reader_concurrency_semaphore: add reads_{admitted,enqueued} stats table: make_sstable_reader(): fix indentation table: clean up make_sstable_reader() database: remove now unused query execution stages mutation_reader: remove now unused restricting_reader sstables: sstable_set: remove now unused make_restricted_range_sstable_reader() reader_permit: remove now unused wait_admission() reader_concurrency_semaphore: remove now unused obtain_permit_nowait() reader_concurrency_semaphore: admission: flip the switch database: increase semaphore max queue size test: index_with_paging_test: increase semaphore's queue size reader_concurrency_semaphore: add set_max_queue_size() test: mutation_reader_test: remove restricted reader tests reader_concurrency_semaphore: remove now unused make_permit() test: reader_concurrency_semaphore_test: move away from make_permit() test: move away from make_permit() treewide: use make_tracking_only_permit() ...
This commit is contained in:
@@ -539,7 +539,7 @@ protected:
|
||||
: _cf(cf)
|
||||
, _sstable_creator(std::move(descriptor.creator))
|
||||
, _schema(cf.schema())
|
||||
, _permit(_cf.compaction_concurrency_semaphore().make_permit(_cf.schema().get(), "compaction"))
|
||||
, _permit(_cf.compaction_concurrency_semaphore().make_tracking_only_permit(_cf.schema().get(), "compaction"))
|
||||
, _sstables(std::move(descriptor.sstables))
|
||||
, _max_sstable_size(descriptor.max_sstable_bytes)
|
||||
, _sstable_level(descriptor.level)
|
||||
@@ -1704,7 +1704,7 @@ static future<compaction_info> validate_sstables(sstables::compaction_descriptor
|
||||
|
||||
clogger.info("Validating {}", sstables_list_msg);
|
||||
|
||||
auto permit = cf.compaction_concurrency_semaphore().make_permit(schema.get(), "Validation");
|
||||
auto permit = cf.compaction_concurrency_semaphore().make_tracking_only_permit(schema.get(), "Validation");
|
||||
auto reader = sstables->make_local_shard_sstable_reader(schema, permit, query::full_partition_range, schema->full_slice(), descriptor.io_priority,
|
||||
tracing::trace_state_ptr(), ::streamed_mutation::forwarding::no, ::mutation_reader::forwarding::no, default_read_monitor_generator());
|
||||
|
||||
|
||||
154
database.cc
154
database.cc
@@ -272,8 +272,6 @@ void database::setup_scylla_memory_diagnostics_producer() {
|
||||
|
||||
writeln(" Execution Stages:\n");
|
||||
const std::pair<const char*, inheriting_execution_stage::stats> execution_stage_summaries[] = {
|
||||
{"data query stage", _data_query_stage.get_stats()},
|
||||
{"mutation query stage", _mutation_query_stage.get_stats()},
|
||||
{"apply stage", _apply_stage.get_stats()},
|
||||
};
|
||||
for (const auto& [name, exec_stage_summary] : execution_stage_summaries) {
|
||||
@@ -354,8 +352,6 @@ database::database(const db::config& cfg, database_config dbcfg, service::migrat
|
||||
max_memory_system_concurrent_reads(),
|
||||
"_system_read_concurrency_sem")
|
||||
, _row_cache_tracker(cache_tracker::register_metrics::yes)
|
||||
, _data_query_stage("data_query", &column_family::query)
|
||||
, _mutation_query_stage("mutation_query", &column_family::mutation_query)
|
||||
, _apply_stage("db_apply", &database::do_apply)
|
||||
, _version(empty_version)
|
||||
, _compaction_manager(make_compaction_manager(_cfg, dbcfg, as))
|
||||
@@ -1411,59 +1407,107 @@ database::query(schema_ptr s, const query::read_command& cmd, query::result_opti
|
||||
column_family& cf = find_column_family(cmd.cf_id);
|
||||
auto& semaphore = get_reader_concurrency_semaphore();
|
||||
auto class_config = query::query_class_config{.semaphore = semaphore, .max_memory_for_unlimited_query = *cmd.max_result_size};
|
||||
query::querier_cache_context cache_ctx(_querier_cache, cmd.query_uuid, cmd.is_first_page);
|
||||
return _data_query_stage(&cf,
|
||||
std::move(s),
|
||||
seastar::cref(cmd),
|
||||
class_config,
|
||||
opts,
|
||||
seastar::cref(ranges),
|
||||
std::move(trace_state),
|
||||
seastar::ref(get_result_memory_limiter()),
|
||||
timeout,
|
||||
std::move(cache_ctx)).then_wrapped([this, s = _stats, &semaphore, hit_rate = cf.get_global_cache_hit_rate(), op = cf.read_in_progress()] (auto f) {
|
||||
if (f.failed()) {
|
||||
++semaphore.get_stats().total_failed_reads;
|
||||
return make_exception_future<std::tuple<lw_shared_ptr<query::result>, cache_temperature>>(f.get_exception());
|
||||
|
||||
std::optional<query::data_querier> querier_opt;
|
||||
lw_shared_ptr<query::result> result;
|
||||
std::exception_ptr ex;
|
||||
|
||||
if (cmd.query_uuid != utils::UUID{} && !cmd.is_first_page) {
|
||||
querier_opt = _querier_cache.lookup_data_querier(cmd.query_uuid, *s, ranges.front(), cmd.slice, trace_state);
|
||||
}
|
||||
|
||||
auto read_func = [&, this] (reader_permit permit) {
|
||||
reader_permit::used_guard ug{permit};
|
||||
return cf.query(std::move(s), std::move(permit), cmd, class_config, opts, ranges, trace_state, get_result_memory_limiter(),
|
||||
timeout, &querier_opt).then([&result, ug = std::move(ug)] (lw_shared_ptr<query::result> res) {
|
||||
result = std::move(res);
|
||||
});
|
||||
};
|
||||
|
||||
try {
|
||||
auto op = cf.read_in_progress();
|
||||
|
||||
if (querier_opt) {
|
||||
co_await semaphore.with_ready_permit(querier_opt->permit(), read_func);
|
||||
} else {
|
||||
++semaphore.get_stats().total_successful_reads;
|
||||
auto result = f.get0();
|
||||
s->short_data_queries += bool(result->is_short_read());
|
||||
return make_ready_future<std::tuple<lw_shared_ptr<query::result>, cache_temperature>>(std::tuple(std::move(result), hit_rate));
|
||||
co_await semaphore.with_permit(s.get(), "data-query", cf.estimate_read_memory_cost(), timeout, read_func);
|
||||
}
|
||||
});
|
||||
|
||||
if (cmd.query_uuid != utils::UUID{} && querier_opt) {
|
||||
_querier_cache.insert(cmd.query_uuid, std::move(*querier_opt), std::move(trace_state));
|
||||
}
|
||||
} catch (...) {
|
||||
++semaphore.get_stats().total_failed_reads;
|
||||
ex = std::current_exception();
|
||||
}
|
||||
|
||||
if (querier_opt) {
|
||||
co_await querier_opt->close();
|
||||
}
|
||||
if(ex) {
|
||||
std::rethrow_exception(std::move(ex));
|
||||
}
|
||||
|
||||
auto hit_rate = cf.get_global_cache_hit_rate();
|
||||
++semaphore.get_stats().total_successful_reads;
|
||||
_stats->short_data_queries += bool(result->is_short_read());
|
||||
co_return std::tuple(std::move(result), hit_rate);
|
||||
}
|
||||
|
||||
future<std::tuple<reconcilable_result, cache_temperature>>
|
||||
database::query_mutations(schema_ptr s, const query::read_command& cmd, const dht::partition_range& range,
|
||||
tracing::trace_state_ptr trace_state, db::timeout_clock::time_point timeout) {
|
||||
const auto short_read_allwoed = query::short_read(cmd.slice.options.contains<query::partition_slice::option::allow_short_read>());
|
||||
return get_result_memory_limiter().new_mutation_read(*cmd.max_result_size, short_read_allwoed).then(
|
||||
[&, s = std::move(s), trace_state = std::move(trace_state), timeout] (query::result_memory_accounter accounter) {
|
||||
auto accounter = co_await get_result_memory_limiter().new_mutation_read(*cmd.max_result_size, short_read_allwoed);
|
||||
column_family& cf = find_column_family(cmd.cf_id);
|
||||
auto& semaphore = get_reader_concurrency_semaphore();
|
||||
auto class_config = query::query_class_config{.semaphore = semaphore, .max_memory_for_unlimited_query = *cmd.max_result_size};
|
||||
query::querier_cache_context cache_ctx(_querier_cache, cmd.query_uuid, cmd.is_first_page);
|
||||
return _mutation_query_stage(&cf,
|
||||
std::move(s),
|
||||
seastar::cref(cmd),
|
||||
class_config,
|
||||
seastar::cref(range),
|
||||
std::move(trace_state),
|
||||
std::move(accounter),
|
||||
timeout,
|
||||
std::move(cache_ctx)).then_wrapped([this, s = _stats, &semaphore, hit_rate = cf.get_global_cache_hit_rate(), op = cf.read_in_progress()] (auto f) {
|
||||
if (f.failed()) {
|
||||
++semaphore.get_stats().total_failed_reads;
|
||||
return make_exception_future<std::tuple<reconcilable_result, cache_temperature>>(f.get_exception());
|
||||
|
||||
std::optional<query::mutation_querier> querier_opt;
|
||||
reconcilable_result result;
|
||||
std::exception_ptr ex;
|
||||
|
||||
if (cmd.query_uuid != utils::UUID{} && !cmd.is_first_page) {
|
||||
querier_opt = _querier_cache.lookup_mutation_querier(cmd.query_uuid, *s, range, cmd.slice, trace_state);
|
||||
}
|
||||
|
||||
auto read_func = [&, this] (reader_permit permit) {
|
||||
reader_permit::used_guard ug{permit};
|
||||
return cf.mutation_query(std::move(s), std::move(permit), cmd, class_config, range,
|
||||
std::move(trace_state), std::move(accounter), timeout, &querier_opt).then([&result, ug = std::move(ug)] (reconcilable_result res) {
|
||||
result = std::move(res);
|
||||
});
|
||||
};
|
||||
|
||||
try {
|
||||
auto op = cf.read_in_progress();
|
||||
|
||||
if (querier_opt) {
|
||||
co_await semaphore.with_ready_permit(querier_opt->permit(), read_func);
|
||||
} else {
|
||||
++semaphore.get_stats().total_successful_reads;
|
||||
auto result = f.get0();
|
||||
s->short_mutation_queries += bool(result.is_short_read());
|
||||
return make_ready_future<std::tuple<reconcilable_result, cache_temperature>>(std::tuple(std::move(result), hit_rate));
|
||||
co_await semaphore.with_permit(s.get(), "mutation-query", cf.estimate_read_memory_cost(), timeout, read_func);
|
||||
}
|
||||
});
|
||||
});
|
||||
|
||||
if (cmd.query_uuid != utils::UUID{} && querier_opt) {
|
||||
_querier_cache.insert(cmd.query_uuid, std::move(*querier_opt), std::move(trace_state));
|
||||
}
|
||||
|
||||
} catch (...) {
|
||||
++semaphore.get_stats().total_failed_reads;
|
||||
ex = std::current_exception();
|
||||
}
|
||||
|
||||
if (querier_opt) {
|
||||
co_await querier_opt->close();
|
||||
}
|
||||
if(ex) {
|
||||
std::rethrow_exception(std::move(ex));
|
||||
}
|
||||
|
||||
auto hit_rate = cf.get_global_cache_hit_rate();
|
||||
++semaphore.get_stats().total_successful_reads;
|
||||
_stats->short_mutation_queries += bool(result.is_short_read());
|
||||
co_return std::tuple(std::move(result), hit_rate);
|
||||
}
|
||||
|
||||
std::unordered_set<sstring> database::get_initial_tokens() {
|
||||
@@ -1564,6 +1608,14 @@ reader_concurrency_semaphore& database::get_reader_concurrency_semaphore() {
|
||||
}
|
||||
}
|
||||
|
||||
future<reader_permit> database::obtain_reader_permit(table& tbl, const char* const op_name, db::timeout_clock::time_point timeout) {
|
||||
return get_reader_concurrency_semaphore().obtain_permit(tbl.schema().get(), op_name, tbl.estimate_read_memory_cost(), timeout);
|
||||
}
|
||||
|
||||
future<reader_permit> database::obtain_reader_permit(schema_ptr schema, const char* const op_name, db::timeout_clock::time_point timeout) {
|
||||
return obtain_reader_permit(find_column_family(std::move(schema)), op_name, timeout);
|
||||
}
|
||||
|
||||
std::ostream& operator<<(std::ostream& out, const column_family& cf) {
|
||||
return fmt_print(out, "{{column_family: {}/{}}}", cf._schema->ks_name(), cf._schema->cf_name());
|
||||
}
|
||||
@@ -1620,7 +1672,7 @@ future<mutation> database::do_apply_counter_update(column_family& cf, const froz
|
||||
// counter state for each modified cell...
|
||||
|
||||
tracing::trace(trace_state, "Reading counter values from the CF");
|
||||
auto permit = get_reader_concurrency_semaphore().make_permit(m_schema.get(), "counter-read-before-write");
|
||||
auto permit = get_reader_concurrency_semaphore().make_tracking_only_permit(m_schema.get(), "counter-read-before-write");
|
||||
return counter_write_query(m_schema, cf.as_mutation_source(), std::move(permit), m.decorated_key(), slice, trace_state, timeout)
|
||||
.then([this, &cf, &m, m_schema, timeout, trace_state] (auto mopt) {
|
||||
// ...now, that we got existing state of all affected counter
|
||||
@@ -2285,7 +2337,7 @@ std::ostream& operator<<(std::ostream& os, const keyspace_metadata& m) {
|
||||
template <typename T>
|
||||
using foreign_unique_ptr = foreign_ptr<std::unique_ptr<T>>;
|
||||
|
||||
flat_mutation_reader make_multishard_streaming_reader(distributed<database>& db, schema_ptr schema,
|
||||
flat_mutation_reader make_multishard_streaming_reader(distributed<database>& db, schema_ptr schema, reader_permit permit,
|
||||
std::function<std::optional<dht::partition_range>()> range_generator) {
|
||||
class streaming_reader_lifecycle_policy
|
||||
: public reader_lifecycle_policy
|
||||
@@ -2303,7 +2355,7 @@ flat_mutation_reader make_multishard_streaming_reader(distributed<database>& db,
|
||||
}
|
||||
virtual flat_mutation_reader create_reader(
|
||||
schema_ptr schema,
|
||||
reader_permit,
|
||||
reader_permit permit,
|
||||
const dht::partition_range& range,
|
||||
const query::partition_slice& slice,
|
||||
const io_priority_class& pc,
|
||||
@@ -2316,7 +2368,7 @@ flat_mutation_reader make_multishard_streaming_reader(distributed<database>& db,
|
||||
_contexts[shard].read_operation = make_foreign(std::make_unique<utils::phased_barrier::operation>(cf.read_in_progress()));
|
||||
_contexts[shard].semaphore = &cf.streaming_read_concurrency_semaphore();
|
||||
|
||||
return cf.make_streaming_reader(std::move(schema), *_contexts[shard].range, slice, fwd_mr);
|
||||
return cf.make_streaming_reader(std::move(schema), std::move(permit), *_contexts[shard].range, slice, fwd_mr);
|
||||
}
|
||||
virtual future<> destroy_reader(stopped_reader reader) noexcept override {
|
||||
auto ctx = std::move(_contexts[this_shard_id()]);
|
||||
@@ -2334,6 +2386,10 @@ flat_mutation_reader make_multishard_streaming_reader(distributed<database>& db,
|
||||
}
|
||||
return *_contexts[shard].semaphore;
|
||||
}
|
||||
virtual future<reader_permit> obtain_reader_permit(schema_ptr schema, const char* const description, db::timeout_clock::time_point timeout) override {
|
||||
auto& cf = _db.local().find_column_family(_table_id);
|
||||
return semaphore().obtain_permit(schema.get(), description, cf.estimate_read_memory_cost(), timeout);
|
||||
}
|
||||
};
|
||||
auto ms = mutation_source([&db] (schema_ptr s,
|
||||
reader_permit permit,
|
||||
@@ -2349,7 +2405,7 @@ flat_mutation_reader make_multishard_streaming_reader(distributed<database>& db,
|
||||
});
|
||||
auto&& full_slice = schema->full_slice();
|
||||
auto& cf = db.local().find_column_family(schema);
|
||||
return make_flat_multi_range_reader(schema, cf.streaming_read_concurrency_semaphore().make_permit(schema.get(), "multishard-streaming-reader"), std::move(ms),
|
||||
return make_flat_multi_range_reader(schema, std::move(permit), std::move(ms),
|
||||
std::move(range_generator), std::move(full_slice), service::get_local_streaming_priority(), {}, mutation_reader::forwarding::no);
|
||||
}
|
||||
|
||||
|
||||
65
database.hh
65
database.hh
@@ -319,6 +319,8 @@ class database_sstable_write_monitor;
|
||||
|
||||
using enable_backlog_tracker = bool_class<class enable_backlog_tracker_tag>;
|
||||
|
||||
extern const ssize_t new_reader_base_cost;
|
||||
|
||||
struct table_stats {
|
||||
/** Number of times flush has resulted in the memtable being switched out. */
|
||||
int64_t memtable_switch_count = 0;
|
||||
@@ -670,20 +672,20 @@ public:
|
||||
// reader and a _bounded_ amount of writes which arrive later.
|
||||
// - Does not populate the cache
|
||||
// Requires ranges to be sorted and disjoint.
|
||||
flat_mutation_reader make_streaming_reader(schema_ptr schema,
|
||||
flat_mutation_reader make_streaming_reader(schema_ptr schema, reader_permit permit,
|
||||
const dht::partition_range_vector& ranges) const;
|
||||
|
||||
// Single range overload.
|
||||
flat_mutation_reader make_streaming_reader(schema_ptr schema, const dht::partition_range& range,
|
||||
flat_mutation_reader make_streaming_reader(schema_ptr schema, reader_permit permit, const dht::partition_range& range,
|
||||
const query::partition_slice& slice,
|
||||
mutation_reader::forwarding fwd_mr = mutation_reader::forwarding::no) const;
|
||||
|
||||
flat_mutation_reader make_streaming_reader(schema_ptr schema, const dht::partition_range& range) {
|
||||
return make_streaming_reader(schema, range, schema->full_slice());
|
||||
flat_mutation_reader make_streaming_reader(schema_ptr schema, reader_permit permit, const dht::partition_range& range) {
|
||||
return make_streaming_reader(std::move(schema), std::move(permit), range, schema->full_slice());
|
||||
}
|
||||
|
||||
// Stream reader from the given sstables
|
||||
flat_mutation_reader make_streaming_reader(schema_ptr schema, const dht::partition_range& range,
|
||||
flat_mutation_reader make_streaming_reader(schema_ptr schema, reader_permit permit, const dht::partition_range& range,
|
||||
lw_shared_ptr<sstables::sstable_set> sstables) const;
|
||||
|
||||
sstables::shared_sstable make_streaming_sstable_for_write(std::optional<sstring> subdir = {});
|
||||
@@ -737,7 +739,13 @@ public:
|
||||
void apply(const mutation& m, db::rp_handle&& = {});
|
||||
|
||||
// Returns at most "cmd.limit" rows
|
||||
future<lw_shared_ptr<query::result>> query(schema_ptr,
|
||||
// The saved_querier parameter is an input-output parameter which contains
|
||||
// the saved querier from the previous page (if there was one) and after
|
||||
// completion it contains the to-be saved querier for the next page (if
|
||||
// there is one). Pass nullptr when queriers are not saved.
|
||||
future<lw_shared_ptr<query::result>>
|
||||
query(schema_ptr,
|
||||
reader_permit permit,
|
||||
const query::read_command& cmd,
|
||||
query::query_class_config class_config,
|
||||
query::result_options opts,
|
||||
@@ -745,7 +753,7 @@ public:
|
||||
tracing::trace_state_ptr trace_state,
|
||||
query::result_memory_limiter& memory_limiter,
|
||||
db::timeout_clock::time_point timeout,
|
||||
query::querier_cache_context cache_ctx = { });
|
||||
std::optional<query::data_querier>* saved_querier = { });
|
||||
|
||||
// Performs a query on given data source returning data in reconcilable form.
|
||||
//
|
||||
@@ -758,15 +766,21 @@ public:
|
||||
// is absent in the results.
|
||||
//
|
||||
// 'source' doesn't have to survive deferring.
|
||||
//
|
||||
// The saved_querier parameter is an input-output parameter which contains
|
||||
// the saved querier from the previous page (if there was one) and after
|
||||
// completion it contains the to-be saved querier for the next page (if
|
||||
// there is one). Pass nullptr when queriers are not saved.
|
||||
future<reconcilable_result>
|
||||
mutation_query(schema_ptr s,
|
||||
reader_permit permit,
|
||||
const query::read_command& cmd,
|
||||
query::query_class_config class_config,
|
||||
const dht::partition_range& range,
|
||||
tracing::trace_state_ptr trace_state,
|
||||
query::result_memory_accounter accounter,
|
||||
db::timeout_clock::time_point timeout,
|
||||
query::querier_cache_context cache_ctx = { });
|
||||
std::optional<query::mutation_querier>* saved_querier = { });
|
||||
|
||||
void start();
|
||||
future<> stop();
|
||||
@@ -998,6 +1012,8 @@ public:
|
||||
return *_config.compaction_concurrency_semaphore;
|
||||
}
|
||||
|
||||
size_t estimate_read_memory_cost() const;
|
||||
|
||||
private:
|
||||
future<row_locker::lock_holder> do_push_view_replica_updates(schema_ptr s, mutation m, db::timeout_clock::time_point timeout, mutation_source source,
|
||||
tracing::trace_state_ptr tr_state, reader_concurrency_semaphore& sem, const io_priority_class& io_priority, query::partition_slice::option_set custom_opts) const;
|
||||
@@ -1236,8 +1252,8 @@ private:
|
||||
::cf_stats _cf_stats;
|
||||
static constexpr size_t max_count_concurrent_reads{100};
|
||||
size_t max_memory_concurrent_reads() { return _dbcfg.available_memory * 0.02; }
|
||||
// Assume a queued read takes up 10kB of memory, and allow 2% of memory to be filled up with such reads.
|
||||
size_t max_inactive_queue_length() { return _dbcfg.available_memory * 0.02 / 10000; }
|
||||
// Assume a queued read takes up 1kB of memory, and allow 2% of memory to be filled up with such reads.
|
||||
size_t max_inactive_queue_length() { return _dbcfg.available_memory * 0.02 / 1000; }
|
||||
// They're rather heavyweight, so limit more
|
||||
static constexpr size_t max_count_streaming_concurrent_reads{10};
|
||||
size_t max_memory_streaming_concurrent_reads() { return _dbcfg.available_memory * 0.02; }
|
||||
@@ -1282,29 +1298,6 @@ private:
|
||||
|
||||
cache_tracker _row_cache_tracker;
|
||||
|
||||
inheriting_concrete_execution_stage<future<lw_shared_ptr<query::result>>,
|
||||
column_family*,
|
||||
schema_ptr,
|
||||
const query::read_command&,
|
||||
query::query_class_config,
|
||||
query::result_options,
|
||||
const dht::partition_range_vector&,
|
||||
tracing::trace_state_ptr,
|
||||
query::result_memory_limiter&,
|
||||
db::timeout_clock::time_point,
|
||||
query::querier_cache_context> _data_query_stage;
|
||||
|
||||
inheriting_concrete_execution_stage<future<reconcilable_result>,
|
||||
table*,
|
||||
schema_ptr,
|
||||
const query::read_command&,
|
||||
query::query_class_config,
|
||||
const dht::partition_range&,
|
||||
tracing::trace_state_ptr,
|
||||
query::result_memory_accounter,
|
||||
db::timeout_clock::time_point,
|
||||
query::querier_cache_context> _mutation_query_stage;
|
||||
|
||||
inheriting_concrete_execution_stage<
|
||||
future<>,
|
||||
database*,
|
||||
@@ -1609,6 +1602,10 @@ public:
|
||||
// which is deduced from the current scheduling group.
|
||||
reader_concurrency_semaphore& get_reader_concurrency_semaphore();
|
||||
|
||||
// Convenience method to obtain an admitted permit. See reader_concurrency_semaphore::obtain_permit().
|
||||
future<reader_permit> obtain_reader_permit(table& tbl, const char* const op_name, db::timeout_clock::time_point timeout);
|
||||
future<reader_permit> obtain_reader_permit(schema_ptr schema, const char* const op_name, db::timeout_clock::time_point timeout);
|
||||
|
||||
sharded<semaphore>& get_sharded_sst_dir_semaphore() {
|
||||
return _sst_dir_semaphore;
|
||||
}
|
||||
@@ -1621,7 +1618,7 @@ future<> stop_database(sharded<database>& db);
|
||||
//
|
||||
// Shard readers are created via `table::make_streaming_reader()`.
|
||||
// Range generator must generate disjoint, monotonically increasing ranges.
|
||||
flat_mutation_reader make_multishard_streaming_reader(distributed<database>& db, schema_ptr schema,
|
||||
flat_mutation_reader make_multishard_streaming_reader(distributed<database>& db, schema_ptr schema, reader_permit permit,
|
||||
std::function<std::optional<dht::partition_range>()> range_generator);
|
||||
|
||||
bool is_internal_keyspace(std::string_view name);
|
||||
|
||||
@@ -250,7 +250,9 @@ future<> size_estimates_mutation_reader::get_next_partition() {
|
||||
_end_of_stream = true;
|
||||
return make_ready_future<>();
|
||||
}
|
||||
return get_local_ranges(_db).then([this] (auto&& ranges) {
|
||||
return do_with(reader_permit::blocked_guard(_permit), [this] (reader_permit::blocked_guard&) {
|
||||
return get_local_ranges(_db);
|
||||
}).then([this] (auto&& ranges) {
|
||||
auto estimates = this->estimates_for_current_keyspace(std::move(ranges));
|
||||
auto mutations = db::system_keyspace::make_size_estimates_mutation(*_current_partition, std::move(estimates));
|
||||
++_current_partition;
|
||||
|
||||
@@ -1388,7 +1388,7 @@ view_builder::view_builder(database& db, db::system_distributed_keyspace& sys_di
|
||||
: _db(db)
|
||||
, _sys_dist_ks(sys_dist_ks)
|
||||
, _mnotifier(mn)
|
||||
, _permit(_db.get_reader_concurrency_semaphore().make_permit(nullptr, "view_builder")) {
|
||||
, _permit(_db.get_reader_concurrency_semaphore().make_tracking_only_permit(nullptr, "view_builder")) {
|
||||
setup_metrics();
|
||||
}
|
||||
|
||||
|
||||
@@ -73,7 +73,7 @@ future<> view_update_generator::start() {
|
||||
ssts->insert(sst);
|
||||
}
|
||||
|
||||
auto permit = _db.get_reader_concurrency_semaphore().make_permit(s.get(), "view_update_generator");
|
||||
auto permit = _db.obtain_reader_permit(*t, "view_update_generator", db::no_timeout).get0();
|
||||
auto ms = mutation_source([this, ssts] (
|
||||
schema_ptr s,
|
||||
reader_permit permit,
|
||||
@@ -83,7 +83,7 @@ future<> view_update_generator::start() {
|
||||
tracing::trace_state_ptr ts,
|
||||
streamed_mutation::forwarding fwd_ms,
|
||||
mutation_reader::forwarding fwd_mr) {
|
||||
return make_restricted_range_sstable_reader(std::move(ssts), s, std::move(permit), pr, ps, pc, std::move(ts), fwd_ms, fwd_mr);
|
||||
return ssts->make_range_sstable_reader(s, std::move(permit), pr, ps, pc, std::move(ts), fwd_ms, fwd_mr);
|
||||
});
|
||||
auto [staging_sstable_reader, staging_sstable_reader_handle] = make_manually_paused_evictable_reader(
|
||||
std::move(ms),
|
||||
|
||||
@@ -89,7 +89,7 @@ mutation_source memtable_filling_virtual_table::as_mutation_source() {
|
||||
};
|
||||
|
||||
return execute(mutation_sink, timeout).then([this, mt, s, units, &range, &slice, &pc, &trace_state, &fwd, &fwd_mr] () {
|
||||
auto rd = make_restricted_flat_reader(mt->as_data_source(), s, units->units.permit(), range, slice, pc, trace_state, fwd, fwd_mr);
|
||||
auto rd = mt->as_data_source().make_reader(s, units->units.permit(), range, slice, pc, trace_state, fwd, fwd_mr);
|
||||
|
||||
if (!_shard_aware) {
|
||||
rd = make_filtering_reader(std::move(rd), [this] (const dht::decorated_key& dk) -> bool {
|
||||
|
||||
@@ -223,7 +223,7 @@ public:
|
||||
tracing::trace_state_ptr trace_state)
|
||||
: _db(db)
|
||||
, _schema(std::move(s))
|
||||
, _permit(_db.local().get_reader_concurrency_semaphore().make_permit(_schema.get(), "multishard-mutation-query"))
|
||||
, _permit(_db.local().get_reader_concurrency_semaphore().make_tracking_only_permit(_schema.get(), "multishard-mutation-query"))
|
||||
, _cmd(cmd)
|
||||
, _ranges(ranges)
|
||||
, _trace_state(std::move(trace_state))
|
||||
@@ -264,6 +264,15 @@ public:
|
||||
return *_semaphores[shard];
|
||||
}
|
||||
|
||||
virtual future<reader_permit> obtain_reader_permit(schema_ptr schema, const char* const description, db::timeout_clock::time_point timeout) override {
|
||||
const auto shard = this_shard_id();
|
||||
auto& rm = _readers[shard];
|
||||
if (rm.state == reader_state::successful_lookup) {
|
||||
return make_ready_future<reader_permit>(rm.rparts->permit);
|
||||
}
|
||||
return _db.local().obtain_reader_permit(std::move(schema), description, timeout);
|
||||
}
|
||||
|
||||
future<> lookup_readers();
|
||||
|
||||
future<> save_readers(flat_mutation_reader::tracked_buffer unconsumed_buffer, detached_compaction_state compaction_state,
|
||||
@@ -317,13 +326,18 @@ flat_mutation_reader read_context::create_reader(
|
||||
|
||||
auto& table = _db.local().find_column_family(schema);
|
||||
|
||||
auto remote_parts = reader_meta::remote_parts(
|
||||
std::move(permit),
|
||||
std::make_unique<const dht::partition_range>(pr),
|
||||
std::make_unique<const query::partition_slice>(ps),
|
||||
table.read_in_progress());
|
||||
|
||||
if (!rm.rparts) {
|
||||
rm.rparts = make_foreign(std::make_unique<reader_meta::remote_parts>(std::move(permit)));
|
||||
rm.rparts = make_foreign(std::make_unique<reader_meta::remote_parts>(std::move(remote_parts)));
|
||||
} else {
|
||||
*rm.rparts = std::move(remote_parts);
|
||||
}
|
||||
|
||||
rm.rparts->range = std::make_unique<const dht::partition_range>(pr);
|
||||
rm.rparts->slice = std::make_unique<const query::partition_slice>(ps);
|
||||
rm.rparts->read_operation = table.read_in_progress();
|
||||
rm.state = reader_state::used;
|
||||
|
||||
return table.as_mutation_source().make_reader(std::move(schema), rm.rparts->permit, *rm.rparts->range, *rm.rparts->slice, pc,
|
||||
|
||||
@@ -715,123 +715,6 @@ flat_mutation_reader make_combined_reader(schema_ptr schema,
|
||||
return make_combined_reader(std::move(schema), std::move(permit), std::move(v), fwd_sm, fwd_mr);
|
||||
}
|
||||
|
||||
const ssize_t new_reader_base_cost{16 * 1024};
|
||||
|
||||
class restricting_mutation_reader : public flat_mutation_reader::impl {
|
||||
struct mutation_source_and_params {
|
||||
mutation_source _ms;
|
||||
schema_ptr _s;
|
||||
reader_permit _permit;
|
||||
std::reference_wrapper<const dht::partition_range> _range;
|
||||
std::reference_wrapper<const query::partition_slice> _slice;
|
||||
std::reference_wrapper<const io_priority_class> _pc;
|
||||
tracing::trace_state_ptr _trace_state;
|
||||
streamed_mutation::forwarding _fwd;
|
||||
mutation_reader::forwarding _fwd_mr;
|
||||
|
||||
flat_mutation_reader operator()() {
|
||||
return _ms.make_reader(std::move(_s), std::move(_permit), _range.get(), _slice.get(), _pc.get(), std::move(_trace_state), _fwd, _fwd_mr);
|
||||
}
|
||||
};
|
||||
|
||||
struct pending_state {
|
||||
mutation_source_and_params reader_factory;
|
||||
};
|
||||
struct admitted_state {
|
||||
flat_mutation_reader reader;
|
||||
reader_permit::resource_units units;
|
||||
};
|
||||
std::variant<pending_state, admitted_state> _state;
|
||||
|
||||
template<typename Function>
|
||||
requires std::is_move_constructible<Function>::value
|
||||
&& requires(Function fn, flat_mutation_reader& reader) {
|
||||
fn(reader);
|
||||
}
|
||||
decltype(auto) with_reader(Function fn, db::timeout_clock::time_point timeout) {
|
||||
if (auto* state = std::get_if<admitted_state>(&_state)) {
|
||||
return fn(state->reader);
|
||||
}
|
||||
|
||||
return std::get<pending_state>(_state).reader_factory._permit.wait_admission(new_reader_base_cost,
|
||||
timeout).then([this, fn = std::move(fn)] (reader_permit::resource_units units) mutable {
|
||||
auto reader_factory = std::move(std::get<pending_state>(_state).reader_factory);
|
||||
_state.emplace<admitted_state>(admitted_state{reader_factory(), std::move(units)});
|
||||
return fn(std::get<admitted_state>(_state).reader);
|
||||
});
|
||||
}
|
||||
public:
|
||||
restricting_mutation_reader(
|
||||
mutation_source ms,
|
||||
schema_ptr s,
|
||||
reader_permit permit,
|
||||
const dht::partition_range& range,
|
||||
const query::partition_slice& slice,
|
||||
const io_priority_class& pc,
|
||||
tracing::trace_state_ptr trace_state,
|
||||
streamed_mutation::forwarding fwd,
|
||||
mutation_reader::forwarding fwd_mr)
|
||||
: impl(s, permit)
|
||||
, _state(pending_state{
|
||||
mutation_source_and_params{std::move(ms), std::move(s), std::move(permit), range, slice, pc, std::move(trace_state), fwd, fwd_mr}}) {
|
||||
}
|
||||
|
||||
virtual future<> fill_buffer(db::timeout_clock::time_point timeout) override {
|
||||
return with_reader([this, timeout] (flat_mutation_reader& reader) {
|
||||
return reader.fill_buffer(timeout).then([this, &reader] {
|
||||
_end_of_stream = reader.is_end_of_stream();
|
||||
reader.move_buffer_content_to(*this);
|
||||
});
|
||||
}, timeout);
|
||||
}
|
||||
virtual future<> next_partition() override {
|
||||
clear_buffer_to_next_partition();
|
||||
if (!is_buffer_empty()) {
|
||||
return make_ready_future<>();
|
||||
}
|
||||
_end_of_stream = false;
|
||||
if (auto* state = std::get_if<admitted_state>(&_state)) {
|
||||
return state->reader.next_partition();
|
||||
}
|
||||
return make_ready_future<>();
|
||||
}
|
||||
virtual future<> fast_forward_to(const dht::partition_range& pr, db::timeout_clock::time_point timeout) override {
|
||||
clear_buffer();
|
||||
_end_of_stream = false;
|
||||
return with_reader([&pr, timeout] (flat_mutation_reader& reader) {
|
||||
return reader.fast_forward_to(pr, timeout);
|
||||
}, timeout);
|
||||
}
|
||||
virtual future<> fast_forward_to(position_range pr, db::timeout_clock::time_point timeout) override {
|
||||
forward_buffer_to(pr.start());
|
||||
_end_of_stream = false;
|
||||
return with_reader([pr = std::move(pr), timeout] (flat_mutation_reader& reader) mutable {
|
||||
return reader.fast_forward_to(std::move(pr), timeout);
|
||||
}, timeout);
|
||||
}
|
||||
virtual future<> close() noexcept override {
|
||||
if (auto* state = std::get_if<admitted_state>(&_state)) {
|
||||
return state->reader.close();
|
||||
}
|
||||
return make_ready_future<>();
|
||||
}
|
||||
};
|
||||
|
||||
flat_mutation_reader
|
||||
make_restricted_flat_reader(
|
||||
mutation_source ms,
|
||||
schema_ptr s,
|
||||
reader_permit permit,
|
||||
const dht::partition_range& range,
|
||||
const query::partition_slice& slice,
|
||||
const io_priority_class& pc,
|
||||
tracing::trace_state_ptr trace_state,
|
||||
streamed_mutation::forwarding fwd,
|
||||
mutation_reader::forwarding fwd_mr) {
|
||||
return make_flat_mutation_reader<restricting_mutation_reader>(std::move(ms), std::move(s), std::move(permit), range, slice, pc, std::move(trace_state), fwd, fwd_mr);
|
||||
}
|
||||
|
||||
|
||||
snapshot_source make_empty_snapshot_source() {
|
||||
return snapshot_source([] {
|
||||
return make_empty_mutation_source();
|
||||
@@ -907,6 +790,7 @@ class foreign_reader : public flat_mutation_reader::impl {
|
||||
// we don't have to wait on the remote reader filling its buffer.
|
||||
template <typename Operation, typename Result = futurize_t<std::result_of_t<Operation()>>>
|
||||
Result forward_operation(db::timeout_clock::time_point timeout, Operation op) {
|
||||
reader_permit::blocked_guard bg{_permit};
|
||||
return smp::submit_to(_reader.get_owner_shard(), [reader = _reader.get(),
|
||||
read_ahead_future = std::exchange(_read_ahead_future, nullptr),
|
||||
timeout,
|
||||
@@ -933,7 +817,7 @@ class foreign_reader : public flat_mutation_reader::impl {
|
||||
auto result = std::get<1>(std::move(fut_and_result));
|
||||
return make_ready_future<decltype(result)>(std::move(result));
|
||||
}
|
||||
});
|
||||
}).finally([bg = std::move(bg)] { });
|
||||
}
|
||||
public:
|
||||
foreign_reader(schema_ptr schema,
|
||||
@@ -1087,7 +971,7 @@ private:
|
||||
void update_next_position(flat_mutation_reader& reader);
|
||||
void adjust_partition_slice();
|
||||
flat_mutation_reader recreate_reader();
|
||||
flat_mutation_reader resume_or_create_reader();
|
||||
future<flat_mutation_reader> resume_or_create_reader(db::timeout_clock::time_point timeout);
|
||||
void maybe_validate_partition_start(const flat_mutation_reader::tracked_buffer& buffer);
|
||||
void validate_position_in_partition(position_in_partition_view pos) const;
|
||||
bool should_drop_fragment(const mutation_fragment& mf);
|
||||
@@ -1129,6 +1013,9 @@ public:
|
||||
do_pause(std::move(*_reader));
|
||||
}
|
||||
}
|
||||
reader_permit permit() {
|
||||
return _permit;
|
||||
}
|
||||
};
|
||||
|
||||
void evictable_reader::do_pause(flat_mutation_reader reader) {
|
||||
@@ -1253,14 +1140,15 @@ flat_mutation_reader evictable_reader::recreate_reader() {
|
||||
_fwd_mr);
|
||||
}
|
||||
|
||||
flat_mutation_reader evictable_reader::resume_or_create_reader() {
|
||||
future<flat_mutation_reader> evictable_reader::resume_or_create_reader(db::timeout_clock::time_point timeout) {
|
||||
if (_reader) {
|
||||
return std::move(*_reader);
|
||||
co_return std::move(*_reader);
|
||||
}
|
||||
if (auto reader_opt = try_resume()) {
|
||||
return std::move(*reader_opt);
|
||||
co_return std::move(*reader_opt);
|
||||
}
|
||||
return recreate_reader();
|
||||
co_await _permit.maybe_wait_readmission(timeout);
|
||||
co_return recreate_reader();
|
||||
}
|
||||
|
||||
template <typename... Arg>
|
||||
@@ -1505,14 +1393,12 @@ evictable_reader::evictable_reader(
|
||||
|
||||
future<> evictable_reader::fill_buffer(db::timeout_clock::time_point timeout) {
|
||||
if (is_end_of_stream()) {
|
||||
return make_ready_future<>();
|
||||
co_return;
|
||||
}
|
||||
return with_closeable(resume_or_create_reader(), [this, timeout] (flat_mutation_reader& reader) mutable {
|
||||
return fill_buffer(reader, timeout).then([this, &reader] {
|
||||
_end_of_stream = reader.is_end_of_stream() && reader.is_buffer_empty();
|
||||
maybe_pause(std::move(reader));
|
||||
});
|
||||
});
|
||||
_reader = co_await resume_or_create_reader(timeout);
|
||||
co_await fill_buffer(*_reader, timeout);
|
||||
_end_of_stream = _reader->is_end_of_stream() && _reader->is_buffer_empty();
|
||||
maybe_pause(std::move(*_reader));
|
||||
}
|
||||
|
||||
future<> evictable_reader::next_partition() {
|
||||
@@ -1521,7 +1407,7 @@ future<> evictable_reader::next_partition() {
|
||||
if (!is_buffer_empty()) {
|
||||
co_return;
|
||||
}
|
||||
auto reader = resume_or_create_reader();
|
||||
auto reader = co_await resume_or_create_reader(db::no_timeout);
|
||||
co_await reader.next_partition();
|
||||
maybe_pause(std::move(reader));
|
||||
}
|
||||
@@ -1694,7 +1580,7 @@ future<> shard_reader::do_fill_buffer(db::timeout_clock::time_point timeout) {
|
||||
};
|
||||
|
||||
if (!_reader) {
|
||||
fill_buf_fut = smp::submit_to(_shard, [this, gs = global_schema_ptr(_schema), timeout] {
|
||||
fill_buf_fut = smp::submit_to(_shard, [this, gs = global_schema_ptr(_schema), timeout] () -> future<reader_and_buffer_fill_result> {
|
||||
auto ms = mutation_source([lifecycle_policy = _lifecycle_policy.get()] (
|
||||
schema_ptr s,
|
||||
reader_permit permit,
|
||||
@@ -1707,21 +1593,30 @@ future<> shard_reader::do_fill_buffer(db::timeout_clock::time_point timeout) {
|
||||
return lifecycle_policy->create_reader(std::move(s), std::move(permit), pr, ps, pc, std::move(ts), fwd_mr);
|
||||
});
|
||||
auto s = gs.get();
|
||||
auto permit = co_await _lifecycle_policy->obtain_reader_permit(s, "shard-reader", timeout);
|
||||
auto rreader = make_foreign(std::make_unique<evictable_reader>(evictable_reader::auto_pause::yes, std::move(ms),
|
||||
s, _lifecycle_policy->semaphore().make_permit(s.get(), "shard-reader"), *_pr, _ps, _pc, _trace_state, _fwd_mr));
|
||||
tracing::trace(_trace_state, "Creating shard reader on shard: {}", this_shard_id());
|
||||
auto f = rreader->fill_buffer(timeout);
|
||||
return f.then([rreader = std::move(rreader)] () mutable {
|
||||
s, std::move(permit), *_pr, _ps, _pc, _trace_state, _fwd_mr));
|
||||
|
||||
std::exception_ptr ex;
|
||||
try {
|
||||
tracing::trace(_trace_state, "Creating shard reader on shard: {}", this_shard_id());
|
||||
reader_permit::used_guard ug{rreader->permit()};
|
||||
co_await rreader->fill_buffer(timeout);
|
||||
auto res = remote_fill_buffer_result(rreader->detach_buffer(), rreader->is_end_of_stream());
|
||||
return make_ready_future<reader_and_buffer_fill_result>(reader_and_buffer_fill_result{std::move(rreader), std::move(res)});
|
||||
});
|
||||
co_return reader_and_buffer_fill_result{std::move(rreader), std::move(res)};
|
||||
} catch (...) {
|
||||
ex = std::current_exception();
|
||||
}
|
||||
co_await rreader->close();
|
||||
std::rethrow_exception(std::move(ex));
|
||||
}).then([this, timeout] (reader_and_buffer_fill_result res) {
|
||||
_reader = std::move(res.reader);
|
||||
return std::move(res.result);
|
||||
});
|
||||
} else {
|
||||
fill_buf_fut = smp::submit_to(_shard, [this, timeout] () mutable {
|
||||
return _reader->fill_buffer(timeout).then([this] {
|
||||
reader_permit::used_guard ug{_reader->permit()};
|
||||
return _reader->fill_buffer(timeout).then([this, ug = std::move(ug)] {
|
||||
return remote_fill_buffer_result(_reader->detach_buffer(), _reader->is_end_of_stream());
|
||||
});
|
||||
});
|
||||
@@ -1736,19 +1631,26 @@ future<> shard_reader::do_fill_buffer(db::timeout_clock::time_point timeout) {
|
||||
}
|
||||
|
||||
future<> shard_reader::fill_buffer(db::timeout_clock::time_point timeout) {
|
||||
// FIXME: want to move this to the inner scopes but it makes clang miscompile the code.
|
||||
reader_permit::blocked_guard guard(_permit);
|
||||
if (_read_ahead) {
|
||||
return *std::exchange(_read_ahead, std::nullopt);
|
||||
co_await *std::exchange(_read_ahead, std::nullopt);
|
||||
co_return;
|
||||
}
|
||||
if (!is_buffer_empty()) {
|
||||
return make_ready_future<>();
|
||||
co_return;
|
||||
}
|
||||
return do_fill_buffer(timeout);
|
||||
co_await do_fill_buffer(timeout);
|
||||
}
|
||||
|
||||
future<> shard_reader::next_partition() {
|
||||
if (!_reader) {
|
||||
co_return;
|
||||
}
|
||||
|
||||
// FIXME: want to move this to the inner scopes but it makes clang miscompile the code.
|
||||
reader_permit::blocked_guard guard(_permit);
|
||||
|
||||
if (_read_ahead) {
|
||||
co_await *std::exchange(_read_ahead, std::nullopt);
|
||||
}
|
||||
@@ -1767,17 +1669,19 @@ future<> shard_reader::fast_forward_to(const dht::partition_range& pr, db::timeo
|
||||
if (!_reader && !_read_ahead) {
|
||||
// No need to fast-forward uncreated readers, they will be passed the new
|
||||
// range when created.
|
||||
return make_ready_future<>();
|
||||
co_return;
|
||||
}
|
||||
|
||||
auto f = _read_ahead ? *std::exchange(_read_ahead, std::nullopt) : make_ready_future<>();
|
||||
return f.then([this, &pr, timeout] {
|
||||
_end_of_stream = false;
|
||||
clear_buffer();
|
||||
reader_permit::blocked_guard guard(_permit);
|
||||
|
||||
return smp::submit_to(_shard, [this, &pr, timeout] {
|
||||
return _reader->fast_forward_to(pr, timeout);
|
||||
});
|
||||
if (_read_ahead) {
|
||||
co_await *std::exchange(_read_ahead, std::nullopt);
|
||||
}
|
||||
_end_of_stream = false;
|
||||
clear_buffer();
|
||||
|
||||
co_await smp::submit_to(_shard, [this, &pr, timeout] {
|
||||
return _reader->fast_forward_to(pr, timeout);
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
@@ -345,37 +345,6 @@ public:
|
||||
mutation_source make_empty_mutation_source();
|
||||
snapshot_source make_empty_snapshot_source();
|
||||
|
||||
extern const ssize_t new_reader_base_cost;
|
||||
|
||||
// Creates a restricted reader whose resource usages will be tracked
|
||||
// during it's lifetime. If there are not enough resources (dues to
|
||||
// existing readers) to create the new reader, it's construction will
|
||||
// be deferred until there are sufficient resources.
|
||||
// The internal reader once created will not be hindered in it's work
|
||||
// anymore. Reusorce limits are determined by the config which contains
|
||||
// a semaphore to track and limit the memory usage of readers. It also
|
||||
// contains a timeout and a maximum queue size for inactive readers
|
||||
// whose construction is blocked.
|
||||
flat_mutation_reader make_restricted_flat_reader(
|
||||
mutation_source ms,
|
||||
schema_ptr s,
|
||||
reader_permit permit,
|
||||
const dht::partition_range& range,
|
||||
const query::partition_slice& slice,
|
||||
const io_priority_class& pc = default_priority_class(),
|
||||
tracing::trace_state_ptr trace_state = nullptr,
|
||||
streamed_mutation::forwarding fwd = streamed_mutation::forwarding::no,
|
||||
mutation_reader::forwarding fwd_mr = mutation_reader::forwarding::yes);
|
||||
|
||||
inline flat_mutation_reader make_restricted_flat_reader(
|
||||
mutation_source ms,
|
||||
schema_ptr s,
|
||||
reader_permit permit,
|
||||
const dht::partition_range& range = query::full_partition_range) {
|
||||
auto& full_slice = s->full_slice();
|
||||
return make_restricted_flat_reader(std::move(ms), std::move(s), std::move(permit), range, full_slice);
|
||||
}
|
||||
|
||||
using mutation_source_opt = optimized_optional<mutation_source>;
|
||||
|
||||
// Adapts a non-movable FlattenedConsumer to a movable one.
|
||||
@@ -533,6 +502,14 @@ public:
|
||||
///
|
||||
/// This method will be called on the shard where the relevant reader lives.
|
||||
virtual reader_concurrency_semaphore& semaphore() = 0;
|
||||
|
||||
/// Obtain an admitted permit.
|
||||
///
|
||||
/// The permit will be associated with the semaphore returned by
|
||||
/// `semaphore()`.
|
||||
///
|
||||
/// This method will be called on the shard where the relevant reader lives.
|
||||
virtual future<reader_permit> obtain_reader_permit(schema_ptr schema, const char* const description, db::timeout_clock::time_point timeout) = 0;
|
||||
};
|
||||
|
||||
/// Make a multishard_combining_reader.
|
||||
|
||||
@@ -124,7 +124,7 @@ future<> multishard_writer::make_shard_writer(unsigned shard) {
|
||||
reader = make_foreign(std::make_unique<flat_mutation_reader>(std::move(reader)))] () mutable {
|
||||
auto s = gs.get();
|
||||
auto semaphore = std::make_unique<reader_concurrency_semaphore>(reader_concurrency_semaphore::no_limits{}, "shard_writer");
|
||||
auto permit = semaphore->make_permit(s.get(), "multishard-writer");
|
||||
auto permit = semaphore->make_tracking_only_permit(s.get(), "multishard-writer");
|
||||
auto this_shard_reader = make_foreign_reader(s, std::move(permit), std::move(reader));
|
||||
return make_foreign(std::make_unique<shard_writer>(gs.get(), std::move(semaphore), std::move(this_shard_reader), consumer));
|
||||
}).then([this, shard] (foreign_ptr<std::unique_ptr<shard_writer>> writer) {
|
||||
|
||||
59
querier.cc
59
querier.cc
@@ -231,7 +231,7 @@ struct querier_utils {
|
||||
};
|
||||
|
||||
template <typename Querier>
|
||||
static void insert_querier(
|
||||
void querier_cache::insert_querier(
|
||||
utils::UUID key,
|
||||
querier_cache::index& index,
|
||||
querier_cache::stats& stats,
|
||||
@@ -243,6 +243,9 @@ static void insert_querier(
|
||||
// current partition when the page ends so it cannot be reused across
|
||||
// pages.
|
||||
if (q.is_reversed()) {
|
||||
(void)with_gate(_closing_gate, [this, q = std::move(q)] () mutable {
|
||||
return q.close().finally([q = std::move(q)] {});
|
||||
});
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -450,58 +453,4 @@ future<> querier_cache::stop() noexcept {
|
||||
}
|
||||
}
|
||||
|
||||
querier_cache_context::querier_cache_context(querier_cache& cache, utils::UUID key, query::is_first_page is_first_page)
|
||||
: _cache(&cache)
|
||||
, _key(key)
|
||||
, _is_first_page(is_first_page) {
|
||||
}
|
||||
|
||||
void querier_cache_context::insert(data_querier&& q, tracing::trace_state_ptr trace_state) {
|
||||
if (_cache && _key != utils::UUID{}) {
|
||||
_cache->insert(_key, std::move(q), std::move(trace_state));
|
||||
}
|
||||
}
|
||||
|
||||
void querier_cache_context::insert(mutation_querier&& q, tracing::trace_state_ptr trace_state) {
|
||||
if (_cache && _key != utils::UUID{}) {
|
||||
_cache->insert(_key, std::move(q), std::move(trace_state));
|
||||
}
|
||||
}
|
||||
|
||||
void querier_cache_context::insert(shard_mutation_querier&& q, tracing::trace_state_ptr trace_state) {
|
||||
if (_cache && _key != utils::UUID{}) {
|
||||
_cache->insert(_key, std::move(q), std::move(trace_state));
|
||||
}
|
||||
}
|
||||
|
||||
std::optional<data_querier> querier_cache_context::lookup_data_querier(const schema& s,
|
||||
const dht::partition_range& range,
|
||||
const query::partition_slice& slice,
|
||||
tracing::trace_state_ptr trace_state) {
|
||||
if (_cache && _key != utils::UUID{} && !_is_first_page) {
|
||||
return _cache->lookup_data_querier(_key, s, range, slice, std::move(trace_state));
|
||||
}
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
std::optional<mutation_querier> querier_cache_context::lookup_mutation_querier(const schema& s,
|
||||
const dht::partition_range& range,
|
||||
const query::partition_slice& slice,
|
||||
tracing::trace_state_ptr trace_state) {
|
||||
if (_cache && _key != utils::UUID{} && !_is_first_page) {
|
||||
return _cache->lookup_mutation_querier(_key, s, range, slice, std::move(trace_state));
|
||||
}
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
std::optional<shard_mutation_querier> querier_cache_context::lookup_shard_mutation_querier(const schema& s,
|
||||
const dht::partition_range_vector& ranges,
|
||||
const query::partition_slice& slice,
|
||||
tracing::trace_state_ptr trace_state) {
|
||||
if (_cache && _key != utils::UUID{} && !_is_first_page) {
|
||||
return _cache->lookup_shard_mutation_querier(_key, s, ranges, slice, std::move(trace_state));
|
||||
}
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
} // namespace query
|
||||
|
||||
35
querier.hh
35
querier.hh
@@ -372,6 +372,16 @@ private:
|
||||
stats _stats;
|
||||
gate _closing_gate;
|
||||
|
||||
private:
|
||||
template <typename Querier>
|
||||
void insert_querier(
|
||||
utils::UUID key,
|
||||
querier_cache::index& index,
|
||||
querier_cache::stats& stats,
|
||||
Querier&& q,
|
||||
std::chrono::seconds ttl,
|
||||
tracing::trace_state_ptr trace_state);
|
||||
|
||||
template <typename Querier>
|
||||
std::optional<Querier> lookup_querier(
|
||||
querier_cache::index& index,
|
||||
@@ -460,29 +470,4 @@ public:
|
||||
}
|
||||
};
|
||||
|
||||
class querier_cache_context {
|
||||
querier_cache* _cache{};
|
||||
utils::UUID _key;
|
||||
query::is_first_page _is_first_page;
|
||||
|
||||
public:
|
||||
querier_cache_context() = default;
|
||||
querier_cache_context(querier_cache& cache, utils::UUID key, query::is_first_page is_first_page);
|
||||
void insert(data_querier&& q, tracing::trace_state_ptr trace_state);
|
||||
void insert(mutation_querier&& q, tracing::trace_state_ptr trace_state);
|
||||
void insert(shard_mutation_querier&& q, tracing::trace_state_ptr trace_state);
|
||||
std::optional<data_querier> lookup_data_querier(const schema& s,
|
||||
const dht::partition_range& range,
|
||||
const query::partition_slice& slice,
|
||||
tracing::trace_state_ptr trace_state);
|
||||
std::optional<mutation_querier> lookup_mutation_querier(const schema& s,
|
||||
const dht::partition_range& range,
|
||||
const query::partition_slice& slice,
|
||||
tracing::trace_state_ptr trace_state);
|
||||
std::optional<shard_mutation_querier> lookup_shard_mutation_querier(const schema& s,
|
||||
const dht::partition_range_vector& ranges,
|
||||
const query::partition_slice& slice,
|
||||
tracing::trace_state_ptr trace_state);
|
||||
};
|
||||
|
||||
} // namespace query
|
||||
|
||||
@@ -33,6 +33,11 @@
|
||||
|
||||
logger rcslog("reader_concurrency_semaphore");
|
||||
|
||||
std::ostream& operator<<(std::ostream& os , const reader_resources& r) {
|
||||
os << "{" << r.count << ", " << r.memory << "}";
|
||||
return os;
|
||||
}
|
||||
|
||||
reader_permit::resource_units::resource_units(reader_permit permit, reader_resources res) noexcept
|
||||
: _permit(std::move(permit)), _resources(res) {
|
||||
_permit.consume(res);
|
||||
@@ -72,33 +77,80 @@ void reader_permit::resource_units::reset(reader_resources res) {
|
||||
_resources = res;
|
||||
}
|
||||
|
||||
class reader_permit::impl : public boost::intrusive::list_base_hook<boost::intrusive::link_mode<boost::intrusive::auto_unlink>> {
|
||||
class reader_permit::impl
|
||||
: public boost::intrusive::list_base_hook<boost::intrusive::link_mode<boost::intrusive::auto_unlink>>
|
||||
, public enable_shared_from_this<reader_permit::impl> {
|
||||
reader_concurrency_semaphore& _semaphore;
|
||||
const schema* _schema;
|
||||
sstring _op_name;
|
||||
std::string_view _op_name_view;
|
||||
reader_resources _base_resources;
|
||||
bool _base_resources_consumed = false;
|
||||
reader_resources _resources;
|
||||
reader_permit::state _state = reader_permit::state::active;
|
||||
reader_permit::state _state = reader_permit::state::active_unused;
|
||||
uint64_t _used_branches = 0;
|
||||
bool _marked_as_used = false;
|
||||
uint64_t _blocked_branches = 0;
|
||||
bool _marked_as_blocked = false;
|
||||
|
||||
private:
|
||||
void on_permit_used() {
|
||||
_semaphore.on_permit_used();
|
||||
_marked_as_used = true;
|
||||
}
|
||||
void on_permit_unused() {
|
||||
_semaphore.on_permit_unused();
|
||||
_marked_as_used = false;
|
||||
}
|
||||
void on_permit_blocked() {
|
||||
_semaphore.on_permit_blocked();
|
||||
_marked_as_blocked = true;
|
||||
}
|
||||
void on_permit_unblocked() {
|
||||
_semaphore.on_permit_unblocked();
|
||||
_marked_as_blocked = false;
|
||||
}
|
||||
void on_permit_active() {
|
||||
if (_used_branches) {
|
||||
_state = reader_permit::state::active_used;
|
||||
on_permit_used();
|
||||
} else {
|
||||
_state = reader_permit::state::active_unused;
|
||||
}
|
||||
}
|
||||
|
||||
void on_permit_inactive(reader_permit::state st) {
|
||||
_state = st;
|
||||
if (_marked_as_used) {
|
||||
on_permit_unused();
|
||||
}
|
||||
}
|
||||
|
||||
public:
|
||||
struct value_tag {};
|
||||
|
||||
impl(reader_concurrency_semaphore& semaphore, const schema* const schema, const std::string_view& op_name)
|
||||
impl(reader_concurrency_semaphore& semaphore, const schema* const schema, const std::string_view& op_name, reader_resources base_resources)
|
||||
: _semaphore(semaphore)
|
||||
, _schema(schema)
|
||||
, _op_name_view(op_name)
|
||||
, _base_resources(base_resources)
|
||||
{
|
||||
_semaphore.on_permit_created(*this);
|
||||
}
|
||||
impl(reader_concurrency_semaphore& semaphore, const schema* const schema, sstring&& op_name)
|
||||
impl(reader_concurrency_semaphore& semaphore, const schema* const schema, sstring&& op_name, reader_resources base_resources)
|
||||
: _semaphore(semaphore)
|
||||
, _schema(schema)
|
||||
, _op_name(std::move(op_name))
|
||||
, _op_name_view(_op_name)
|
||||
, _base_resources(base_resources)
|
||||
{
|
||||
_semaphore.on_permit_created(*this);
|
||||
}
|
||||
~impl() {
|
||||
if (_base_resources_consumed) {
|
||||
signal(_base_resources);
|
||||
}
|
||||
|
||||
if (_resources) {
|
||||
on_internal_error_noexcept(rcslog, format("reader_permit::impl::~impl(): permit {} detected a leak of {{count={}, memory={}}} resources",
|
||||
description(),
|
||||
@@ -107,6 +159,24 @@ public:
|
||||
signal(_resources);
|
||||
}
|
||||
|
||||
if (_used_branches) {
|
||||
on_internal_error_noexcept(rcslog, format("reader_permit::impl::~impl(): permit {}.{}:{} destroyed with {} used branches",
|
||||
_schema ? _schema->ks_name() : "*",
|
||||
_schema ? _schema->cf_name() : "*",
|
||||
_op_name_view,
|
||||
_used_branches));
|
||||
_semaphore.on_permit_unused();
|
||||
}
|
||||
|
||||
if (_blocked_branches) {
|
||||
on_internal_error_noexcept(rcslog, format("reader_permit::impl::~impl(): permit {}.{}:{} destroyed with {} blocked branches",
|
||||
_schema ? _schema->ks_name() : "*",
|
||||
_schema ? _schema->cf_name() : "*",
|
||||
_op_name_view,
|
||||
_blocked_branches));
|
||||
_semaphore.on_permit_unblocked();
|
||||
}
|
||||
|
||||
_semaphore.on_permit_destroyed(*this);
|
||||
}
|
||||
|
||||
@@ -127,19 +197,33 @@ public:
|
||||
}
|
||||
|
||||
void on_waiting() {
|
||||
_state = reader_permit::state::waiting;
|
||||
on_permit_inactive(reader_permit::state::waiting);
|
||||
}
|
||||
|
||||
void on_admission() {
|
||||
_state = reader_permit::state::active;
|
||||
assert(_state != reader_permit::state::active_blocked);
|
||||
on_permit_active();
|
||||
consume(_base_resources);
|
||||
_base_resources_consumed = true;
|
||||
}
|
||||
|
||||
void on_register_as_inactive() {
|
||||
_state = reader_permit::state::inactive;
|
||||
assert(_state == reader_permit::state::active_unused || _state == reader_permit::state::active_used);
|
||||
on_permit_inactive(reader_permit::state::inactive);
|
||||
}
|
||||
|
||||
void on_unregister_as_inactive() {
|
||||
_state = reader_permit::state::active;
|
||||
assert(_state == reader_permit::state::inactive);
|
||||
on_permit_active();
|
||||
}
|
||||
|
||||
void on_evicted() {
|
||||
assert(_state == reader_permit::state::inactive);
|
||||
_state = reader_permit::state::evicted;
|
||||
if (_base_resources_consumed) {
|
||||
signal(_base_resources);
|
||||
_base_resources_consumed = false;
|
||||
}
|
||||
}
|
||||
|
||||
void consume(reader_resources res) {
|
||||
@@ -156,12 +240,67 @@ public:
|
||||
return _resources;
|
||||
}
|
||||
|
||||
reader_resources base_resources() const {
|
||||
return _base_resources;
|
||||
}
|
||||
|
||||
sstring description() const {
|
||||
return format("{}.{}:{}",
|
||||
_schema ? _schema->ks_name() : "*",
|
||||
_schema ? _schema->cf_name() : "*",
|
||||
_op_name_view);
|
||||
}
|
||||
|
||||
void mark_used() noexcept {
|
||||
++_used_branches;
|
||||
if (!_marked_as_used && _state == reader_permit::state::active_unused) {
|
||||
_state = reader_permit::state::active_used;
|
||||
on_permit_used();
|
||||
if (_blocked_branches && !_marked_as_blocked) {
|
||||
_state = reader_permit::state::active_blocked;
|
||||
on_permit_blocked();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void mark_unused() noexcept {
|
||||
assert(_used_branches);
|
||||
--_used_branches;
|
||||
if (_marked_as_used && !_used_branches) {
|
||||
// When an exception is thrown, blocked and used guards might be
|
||||
// destroyed out-of-order. Force an unblock here so that we maintain
|
||||
// used >= blocked.
|
||||
if (_marked_as_blocked) {
|
||||
on_permit_unblocked();
|
||||
}
|
||||
_state = reader_permit::state::active_unused;
|
||||
on_permit_unused();
|
||||
}
|
||||
}
|
||||
|
||||
void mark_blocked() noexcept {
|
||||
++_blocked_branches;
|
||||
if (_blocked_branches == 1 && _state == reader_permit::state::active_used) {
|
||||
_state = reader_permit::state::active_blocked;
|
||||
on_permit_blocked();
|
||||
}
|
||||
}
|
||||
|
||||
void mark_unblocked() noexcept {
|
||||
assert(_blocked_branches);
|
||||
--_blocked_branches;
|
||||
if (_marked_as_blocked && !_blocked_branches) {
|
||||
_state = reader_permit::state::active_used;
|
||||
on_permit_unblocked();
|
||||
}
|
||||
}
|
||||
|
||||
future<> maybe_wait_readmission(db::timeout_clock::time_point timeout) {
|
||||
if (_state != reader_permit::state::evicted) {
|
||||
return make_ready_future<>();
|
||||
}
|
||||
return _semaphore.do_wait_admission(shared_from_this(), timeout);
|
||||
}
|
||||
};
|
||||
|
||||
struct reader_concurrency_semaphore::permit_list {
|
||||
@@ -171,13 +310,19 @@ struct reader_concurrency_semaphore::permit_list {
|
||||
permit_stats stats;
|
||||
};
|
||||
|
||||
reader_permit::reader_permit(reader_concurrency_semaphore& semaphore, const schema* const schema, std::string_view op_name)
|
||||
: _impl(::seastar::make_shared<reader_permit::impl>(semaphore, schema, op_name))
|
||||
reader_permit::reader_permit(shared_ptr<impl> impl) : _impl(std::move(impl))
|
||||
{
|
||||
}
|
||||
|
||||
reader_permit::reader_permit(reader_concurrency_semaphore& semaphore, const schema* const schema, sstring&& op_name)
|
||||
: _impl(::seastar::make_shared<reader_permit::impl>(semaphore, schema, std::move(op_name)))
|
||||
reader_permit::reader_permit(reader_concurrency_semaphore& semaphore, const schema* const schema, std::string_view op_name,
|
||||
reader_resources base_resources)
|
||||
: _impl(::seastar::make_shared<reader_permit::impl>(semaphore, schema, op_name, base_resources))
|
||||
{
|
||||
}
|
||||
|
||||
reader_permit::reader_permit(reader_concurrency_semaphore& semaphore, const schema* const schema, sstring&& op_name,
|
||||
reader_resources base_resources)
|
||||
: _impl(::seastar::make_shared<reader_permit::impl>(semaphore, schema, std::move(op_name), base_resources))
|
||||
{
|
||||
}
|
||||
|
||||
@@ -196,8 +341,8 @@ reader_concurrency_semaphore& reader_permit::semaphore() {
|
||||
return _impl->semaphore();
|
||||
}
|
||||
|
||||
future<reader_permit::resource_units> reader_permit::wait_admission(size_t memory, db::timeout_clock::time_point timeout) {
|
||||
return _impl->semaphore().do_wait_admission(*this, memory, timeout);
|
||||
future<> reader_permit::maybe_wait_readmission(db::timeout_clock::time_point timeout) {
|
||||
return _impl->maybe_wait_readmission(timeout);
|
||||
}
|
||||
|
||||
void reader_permit::consume(reader_resources res) {
|
||||
@@ -220,21 +365,50 @@ reader_resources reader_permit::consumed_resources() const {
|
||||
return _impl->resources();
|
||||
}
|
||||
|
||||
reader_resources reader_permit::base_resources() const {
|
||||
return _impl->base_resources();
|
||||
}
|
||||
|
||||
sstring reader_permit::description() const {
|
||||
return _impl->description();
|
||||
}
|
||||
|
||||
void reader_permit::mark_used() noexcept {
|
||||
_impl->mark_used();
|
||||
}
|
||||
|
||||
void reader_permit::mark_unused() noexcept {
|
||||
_impl->mark_unused();
|
||||
}
|
||||
|
||||
void reader_permit::mark_blocked() noexcept {
|
||||
_impl->mark_blocked();
|
||||
}
|
||||
|
||||
void reader_permit::mark_unblocked() noexcept {
|
||||
_impl->mark_unblocked();
|
||||
}
|
||||
|
||||
std::ostream& operator<<(std::ostream& os, reader_permit::state s) {
|
||||
switch (s) {
|
||||
case reader_permit::state::waiting:
|
||||
os << "waiting";
|
||||
break;
|
||||
case reader_permit::state::active:
|
||||
os << "active";
|
||||
case reader_permit::state::active_unused:
|
||||
os << "active/unused";
|
||||
break;
|
||||
case reader_permit::state::active_used:
|
||||
os << "active/used";
|
||||
break;
|
||||
case reader_permit::state::active_blocked:
|
||||
os << "active/blocked";
|
||||
break;
|
||||
case reader_permit::state::inactive:
|
||||
os << "inactive";
|
||||
break;
|
||||
case reader_permit::state::evicted:
|
||||
os << "evicted";
|
||||
break;
|
||||
}
|
||||
return os;
|
||||
}
|
||||
@@ -379,18 +553,40 @@ void reader_concurrency_semaphore::inactive_read_handle::abandon() noexcept {
|
||||
}
|
||||
}
|
||||
|
||||
namespace {
|
||||
|
||||
struct stop_execution_loop {
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
future<> reader_concurrency_semaphore::execution_loop() noexcept {
|
||||
while (!_stopped) {
|
||||
try {
|
||||
co_await _ready_list.not_empty();
|
||||
} catch (stop_execution_loop) {
|
||||
co_return;
|
||||
}
|
||||
|
||||
while (!_ready_list.empty()) {
|
||||
auto e = _ready_list.pop();
|
||||
|
||||
try {
|
||||
e.func(std::move(e.permit)).forward_to(std::move(e.pr));
|
||||
} catch (...) {
|
||||
e.pr.set_exception(std::current_exception());
|
||||
}
|
||||
|
||||
if (need_preempt()) {
|
||||
co_await make_ready_future<>();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void reader_concurrency_semaphore::signal(const resources& r) noexcept {
|
||||
_resources += r;
|
||||
while (!_wait_list.empty() && has_available_units(_wait_list.front().res)) {
|
||||
auto& x = _wait_list.front();
|
||||
try {
|
||||
x.permit.on_admission();
|
||||
x.pr.set_value(reader_permit::resource_units(std::move(x.permit), x.res));
|
||||
} catch (...) {
|
||||
x.pr.set_exception(std::current_exception());
|
||||
}
|
||||
_wait_list.pop_front();
|
||||
}
|
||||
maybe_admit_waiters();
|
||||
}
|
||||
|
||||
reader_concurrency_semaphore::reader_concurrency_semaphore(int count, ssize_t memory, sstring name, size_t max_queue_length,
|
||||
@@ -398,6 +594,7 @@ reader_concurrency_semaphore::reader_concurrency_semaphore(int count, ssize_t me
|
||||
: _initial_resources(count, memory)
|
||||
, _resources(count, memory)
|
||||
, _wait_list(expiry_handler(*this))
|
||||
, _ready_list(max_queue_length)
|
||||
, _name(std::move(name))
|
||||
, _max_queue_length(max_queue_length)
|
||||
, _prethrow_action(std::move(prethrow_action))
|
||||
@@ -418,13 +615,14 @@ reader_concurrency_semaphore::~reader_concurrency_semaphore() {
|
||||
on_internal_error_noexcept(rcslog, format("~reader_concurrency_semaphore(): semaphore {} not stopped before destruction", _name));
|
||||
// With the below conditions, we can get away with the semaphore being
|
||||
// unstopped. In this case don't force an abort.
|
||||
assert(_inactive_reads.empty() && !_close_readers_gate.get_count() && !_permit_gate.get_count());
|
||||
assert(_inactive_reads.empty() && !_close_readers_gate.get_count() && !_permit_gate.get_count() && !_execution_loop_future);
|
||||
broken();
|
||||
}
|
||||
}
|
||||
|
||||
reader_concurrency_semaphore::inactive_read_handle reader_concurrency_semaphore::register_inactive_read(flat_mutation_reader reader) noexcept {
|
||||
auto& permit_impl = *reader.permit()._impl;
|
||||
permit_impl.on_register_as_inactive();
|
||||
// Implies _inactive_reads.empty(), we don't queue new readers before
|
||||
// evicting all inactive reads.
|
||||
// Checking the _wait_list covers the count resources only, so check memory
|
||||
@@ -435,7 +633,6 @@ reader_concurrency_semaphore::inactive_read_handle reader_concurrency_semaphore:
|
||||
auto& ir = *irp;
|
||||
_inactive_reads.push_back(ir);
|
||||
++_stats.inactive_reads;
|
||||
permit_impl.on_register_as_inactive();
|
||||
return inactive_read_handle(*this, *irp.release());
|
||||
} catch (...) {
|
||||
// It is okay to swallow the exception since
|
||||
@@ -446,6 +643,7 @@ reader_concurrency_semaphore::inactive_read_handle reader_concurrency_semaphore:
|
||||
rcslog.warn("Registering inactive read failed: {}. Ignored as if it was evicted.", std::current_exception());
|
||||
}
|
||||
} else {
|
||||
permit_impl.on_evicted();
|
||||
++_stats.permit_based_evictions;
|
||||
}
|
||||
close_reader(std::move(reader));
|
||||
@@ -517,6 +715,12 @@ future<> reader_concurrency_semaphore::stop() noexcept {
|
||||
clear_inactive_reads();
|
||||
co_await _close_readers_gate.close();
|
||||
co_await _permit_gate.close();
|
||||
if (_execution_loop_future) {
|
||||
if (_ready_list.has_blocked_consumer()) {
|
||||
_ready_list.abort(std::make_exception_ptr(stop_execution_loop{}));
|
||||
}
|
||||
co_await std::move(*_execution_loop_future);
|
||||
}
|
||||
broken(std::make_exception_ptr(stopped_exception()));
|
||||
co_return;
|
||||
}
|
||||
@@ -524,6 +728,7 @@ future<> reader_concurrency_semaphore::stop() noexcept {
|
||||
flat_mutation_reader reader_concurrency_semaphore::detach_inactive_reader(inactive_read& ir, evict_reason reason) noexcept {
|
||||
auto reader = std::move(ir.reader);
|
||||
ir.detach();
|
||||
reader.permit()._impl->on_evicted();
|
||||
std::unique_ptr<inactive_read> irp(&ir);
|
||||
try {
|
||||
if (ir.notify_handler) {
|
||||
@@ -564,23 +769,31 @@ bool reader_concurrency_semaphore::has_available_units(const resources& r) const
|
||||
return (bool(_resources) && _resources >= r) || _resources.count == _initial_resources.count;
|
||||
}
|
||||
|
||||
future<reader_permit::resource_units> reader_concurrency_semaphore::enqueue_waiter(reader_permit permit, resources r,
|
||||
db::timeout_clock::time_point timeout) {
|
||||
if (_wait_list.size() >= _max_queue_length) {
|
||||
bool reader_concurrency_semaphore::all_used_permits_are_stalled() const {
|
||||
return _permit_list->stats.used_permits == _permit_list->stats.blocked_permits;
|
||||
}
|
||||
|
||||
std::exception_ptr reader_concurrency_semaphore::check_queue_size(std::string_view queue_name) {
|
||||
if ((_wait_list.size() + _ready_list.size()) >= _max_queue_length) {
|
||||
_stats.total_reads_shed_due_to_overload++;
|
||||
if (_prethrow_action) {
|
||||
_prethrow_action();
|
||||
}
|
||||
maybe_dump_reader_permit_diagnostics(*this, *_permit_list, "wait queue overloaded");
|
||||
return make_exception_future<reader_permit::resource_units>(
|
||||
std::make_exception_ptr(std::runtime_error(
|
||||
format("{}: restricted mutation reader queue overload", _name))));
|
||||
maybe_dump_reader_permit_diagnostics(*this, *_permit_list, fmt::format("{} queue overload", queue_name));
|
||||
return std::make_exception_ptr(std::runtime_error(format("{}: {} queue overload", _name, queue_name)));
|
||||
}
|
||||
return {};
|
||||
}
|
||||
|
||||
promise<reader_permit::resource_units> pr;
|
||||
future<> reader_concurrency_semaphore::enqueue_waiter(reader_permit permit, db::timeout_clock::time_point timeout, read_func func) {
|
||||
if (auto ex = check_queue_size("wait")) {
|
||||
return make_exception_future<>(std::move(ex));
|
||||
}
|
||||
promise<> pr;
|
||||
auto fut = pr.get_future();
|
||||
permit.on_waiting();
|
||||
_wait_list.push_back(entry(std::move(pr), std::move(permit), r), timeout);
|
||||
_wait_list.push_back(entry(std::move(pr), std::move(permit), std::move(func)), timeout);
|
||||
++_stats.reads_enqueued;
|
||||
return fut;
|
||||
}
|
||||
|
||||
@@ -594,46 +807,128 @@ void reader_concurrency_semaphore::evict_readers_in_background() {
|
||||
});
|
||||
}
|
||||
|
||||
future<reader_permit::resource_units> reader_concurrency_semaphore::do_wait_admission(reader_permit permit, size_t memory,
|
||||
db::timeout_clock::time_point timeout) {
|
||||
auto r = resources(1, static_cast<ssize_t>(memory));
|
||||
auto first = _wait_list.empty();
|
||||
|
||||
if (first && has_available_units(r)) {
|
||||
permit.on_admission();
|
||||
return make_ready_future<reader_permit::resource_units>(reader_permit::resource_units(std::move(permit), r));
|
||||
future<> reader_concurrency_semaphore::do_wait_admission(reader_permit permit, db::timeout_clock::time_point timeout, read_func func) {
|
||||
if (!_execution_loop_future) {
|
||||
_execution_loop_future.emplace(execution_loop());
|
||||
}
|
||||
if (!_wait_list.empty() || !_ready_list.empty()) {
|
||||
return enqueue_waiter(std::move(permit), timeout, std::move(func));
|
||||
}
|
||||
|
||||
auto fut = enqueue_waiter(std::move(permit), r, timeout);
|
||||
|
||||
if (first && !_inactive_reads.empty()) {
|
||||
evict_readers_in_background();
|
||||
if (!has_available_units(permit.base_resources())) {
|
||||
auto fut = enqueue_waiter(std::move(permit), timeout, std::move(func));
|
||||
if (!_inactive_reads.empty()) {
|
||||
evict_readers_in_background();
|
||||
}
|
||||
return fut;
|
||||
}
|
||||
|
||||
return fut;
|
||||
if (!all_used_permits_are_stalled()) {
|
||||
return enqueue_waiter(std::move(permit), timeout, std::move(func));
|
||||
}
|
||||
|
||||
permit.on_admission();
|
||||
++_stats.reads_admitted;
|
||||
if (func) {
|
||||
return with_ready_permit(std::move(permit), std::move(func));
|
||||
}
|
||||
return make_ready_future<>();
|
||||
}
|
||||
|
||||
void reader_concurrency_semaphore::maybe_admit_waiters() noexcept {
|
||||
while (!_wait_list.empty() && _ready_list.empty() && has_available_units(_wait_list.front().permit.base_resources()) && all_used_permits_are_stalled()) {
|
||||
auto& x = _wait_list.front();
|
||||
try {
|
||||
x.permit.on_admission();
|
||||
++_stats.reads_admitted;
|
||||
if (x.func) {
|
||||
_ready_list.push(std::move(x));
|
||||
} else {
|
||||
x.pr.set_value();
|
||||
}
|
||||
} catch (...) {
|
||||
x.pr.set_exception(std::current_exception());
|
||||
}
|
||||
_wait_list.pop_front();
|
||||
}
|
||||
}
|
||||
|
||||
void reader_concurrency_semaphore::on_permit_created(reader_permit::impl& permit) {
|
||||
_permit_gate.enter();
|
||||
_permit_list->permits.push_back(permit);
|
||||
++_permit_list->stats.total_permits;
|
||||
++_permit_list->stats.current_permits;
|
||||
}
|
||||
|
||||
void reader_concurrency_semaphore::on_permit_destroyed(reader_permit::impl& permit) noexcept {
|
||||
permit.unlink();
|
||||
_permit_gate.leave();
|
||||
--_permit_list->stats.current_permits;
|
||||
}
|
||||
|
||||
reader_concurrency_semaphore::permit_stats reader_concurrency_semaphore::get_permit_stats() const {
|
||||
return _permit_list->stats;
|
||||
}
|
||||
|
||||
reader_permit reader_concurrency_semaphore::make_permit(const schema* const schema, const char* const op_name) {
|
||||
return reader_permit(*this, schema, std::string_view(op_name));
|
||||
void reader_concurrency_semaphore::on_permit_used() noexcept {
|
||||
++_permit_list->stats.used_permits;
|
||||
}
|
||||
|
||||
reader_permit reader_concurrency_semaphore::make_permit(const schema* const schema, sstring&& op_name) {
|
||||
return reader_permit(*this, schema, std::move(op_name));
|
||||
void reader_concurrency_semaphore::on_permit_unused() noexcept {
|
||||
assert(_permit_list->stats.used_permits);
|
||||
--_permit_list->stats.used_permits;
|
||||
assert(_permit_list->stats.used_permits >= _permit_list->stats.blocked_permits);
|
||||
maybe_admit_waiters();
|
||||
}
|
||||
|
||||
void reader_concurrency_semaphore::on_permit_blocked() noexcept {
|
||||
++_permit_list->stats.blocked_permits;
|
||||
assert(_permit_list->stats.used_permits >= _permit_list->stats.blocked_permits);
|
||||
maybe_admit_waiters();
|
||||
}
|
||||
|
||||
void reader_concurrency_semaphore::on_permit_unblocked() noexcept {
|
||||
assert(_permit_list->stats.blocked_permits);
|
||||
--_permit_list->stats.blocked_permits;
|
||||
}
|
||||
|
||||
future<reader_permit> reader_concurrency_semaphore::obtain_permit(const schema* const schema, const char* const op_name, size_t memory,
|
||||
db::timeout_clock::time_point timeout) {
|
||||
auto permit = reader_permit(*this, schema, std::string_view(op_name), {1, static_cast<ssize_t>(memory)});
|
||||
return do_wait_admission(permit, timeout).then([permit] () mutable {
|
||||
return std::move(permit);
|
||||
});
|
||||
}
|
||||
|
||||
future<reader_permit> reader_concurrency_semaphore::obtain_permit(const schema* const schema, sstring&& op_name, size_t memory,
|
||||
db::timeout_clock::time_point timeout) {
|
||||
auto permit = reader_permit(*this, schema, std::move(op_name), {1, static_cast<ssize_t>(memory)});
|
||||
return do_wait_admission(permit, timeout).then([permit] () mutable {
|
||||
return std::move(permit);
|
||||
});
|
||||
}
|
||||
|
||||
reader_permit reader_concurrency_semaphore::make_tracking_only_permit(const schema* const schema, const char* const op_name) {
|
||||
return reader_permit(*this, schema, std::string_view(op_name), {});
|
||||
}
|
||||
|
||||
reader_permit reader_concurrency_semaphore::make_tracking_only_permit(const schema* const schema, sstring&& op_name) {
|
||||
return reader_permit(*this, schema, std::move(op_name), {});
|
||||
}
|
||||
|
||||
future<> reader_concurrency_semaphore::with_permit(const schema* const schema, const char* const op_name, size_t memory,
|
||||
db::timeout_clock::time_point timeout, read_func func) {
|
||||
return do_wait_admission(reader_permit(*this, schema, std::string_view(op_name), {1, static_cast<ssize_t>(memory)}), timeout, std::move(func));
|
||||
}
|
||||
|
||||
future<> reader_concurrency_semaphore::with_ready_permit(reader_permit permit, read_func func) {
|
||||
if (auto ex = check_queue_size("ready")) {
|
||||
return make_exception_future<>(std::move(ex));
|
||||
}
|
||||
promise<> pr;
|
||||
auto fut = pr.get_future();
|
||||
_ready_list.push(entry(std::move(pr), std::move(permit), std::move(func)));
|
||||
return fut;
|
||||
}
|
||||
|
||||
void reader_concurrency_semaphore::broken(std::exception_ptr ex) {
|
||||
|
||||
@@ -47,10 +47,14 @@ using namespace seastar;
|
||||
/// It's also possible to specify the maximum allowed number of waiting
|
||||
/// readers by the `max_queue_length` constructor parameter. When the
|
||||
/// number of waiting readers becomes equal or greater than
|
||||
/// `max_queue_length` (upon calling `wait_admission()`) an exception of
|
||||
/// `max_queue_length` (upon calling `obtain_permit()`) an exception of
|
||||
/// type `std::runtime_error` is thrown. Optionally, some additional
|
||||
/// code can be executed just before throwing (`prethrow_action`
|
||||
/// constructor parameter).
|
||||
///
|
||||
/// The semaphore also acts as an execution stage for reads. This
|
||||
/// functionality is exposed via \ref with_permit() and \ref
|
||||
/// with_ready_permit().
|
||||
class reader_concurrency_semaphore {
|
||||
public:
|
||||
using resources = reader_resources;
|
||||
@@ -78,23 +82,35 @@ public:
|
||||
uint64_t total_failed_reads = 0;
|
||||
// Total number of reads rejected because the admission queue reached its max capacity
|
||||
uint64_t total_reads_shed_due_to_overload = 0;
|
||||
// Total number of reads admitted, via all admission paths.
|
||||
uint64_t reads_admitted = 0;
|
||||
// Total number of reads enqueued to wait for admission.
|
||||
uint64_t reads_enqueued = 0;
|
||||
};
|
||||
struct permit_stats {
|
||||
// Total number of permits created so far.
|
||||
uint64_t total_permits = 0;
|
||||
// Current number of permits.
|
||||
uint64_t current_permits = 0;
|
||||
// Current number of used permits.
|
||||
uint64_t used_permits = 0;
|
||||
// Current number of blocked permits.
|
||||
uint64_t blocked_permits = 0;
|
||||
};
|
||||
|
||||
struct permit_list;
|
||||
|
||||
class inactive_read_handle;
|
||||
|
||||
using read_func = noncopyable_function<future<>(reader_permit)>;
|
||||
|
||||
private:
|
||||
struct entry {
|
||||
promise<reader_permit::resource_units> pr;
|
||||
promise<> pr;
|
||||
reader_permit permit;
|
||||
resources res;
|
||||
entry(promise<reader_permit::resource_units>&& pr, reader_permit permit, resources r)
|
||||
: pr(std::move(pr)), permit(std::move(permit)), res(r) {}
|
||||
read_func func;
|
||||
entry(promise<>&& pr, reader_permit permit, read_func func)
|
||||
: pr(std::move(pr)), permit(std::move(permit)), func(std::move(func)) {}
|
||||
};
|
||||
|
||||
class expiry_handler {
|
||||
@@ -168,6 +184,7 @@ private:
|
||||
resources _resources;
|
||||
|
||||
expiring_fifo<entry, expiry_handler, db::timeout_clock> _wait_list;
|
||||
queue<entry> _ready_list;
|
||||
|
||||
sstring _name;
|
||||
size_t _max_queue_length = std::numeric_limits<size_t>::max();
|
||||
@@ -178,6 +195,7 @@ private:
|
||||
bool _stopped = false;
|
||||
gate _close_readers_gate;
|
||||
gate _permit_gate;
|
||||
std::optional<future<>> _execution_loop_future;
|
||||
|
||||
private:
|
||||
[[nodiscard]] flat_mutation_reader detach_inactive_reader(inactive_read&, evict_reason reason) noexcept;
|
||||
@@ -185,20 +203,33 @@ private:
|
||||
|
||||
bool has_available_units(const resources& r) const;
|
||||
|
||||
bool all_used_permits_are_stalled() const;
|
||||
|
||||
[[nodiscard]] std::exception_ptr check_queue_size(std::string_view queue_name);
|
||||
|
||||
// Add the permit to the wait queue and return the future which resolves when
|
||||
// the permit is admitted (popped from the queue).
|
||||
future<reader_permit::resource_units> enqueue_waiter(reader_permit permit, resources r, db::timeout_clock::time_point timeout);
|
||||
future<> enqueue_waiter(reader_permit permit, db::timeout_clock::time_point timeout, read_func func);
|
||||
void evict_readers_in_background();
|
||||
future<reader_permit::resource_units> do_wait_admission(reader_permit permit, size_t memory, db::timeout_clock::time_point timeout);
|
||||
future<> do_wait_admission(reader_permit permit, db::timeout_clock::time_point timeout, read_func func = {});
|
||||
void maybe_admit_waiters() noexcept;
|
||||
|
||||
void on_permit_created(reader_permit::impl&);
|
||||
void on_permit_destroyed(reader_permit::impl&) noexcept;
|
||||
|
||||
void on_permit_used() noexcept;
|
||||
void on_permit_unused() noexcept;
|
||||
|
||||
void on_permit_blocked() noexcept;
|
||||
void on_permit_unblocked() noexcept;
|
||||
|
||||
std::runtime_error stopped_exception();
|
||||
|
||||
// closes reader in the background.
|
||||
void close_reader(flat_mutation_reader reader);
|
||||
|
||||
future<> execution_loop() noexcept;
|
||||
|
||||
public:
|
||||
struct no_limits { };
|
||||
|
||||
@@ -289,7 +320,47 @@ public:
|
||||
/// Return stats about the currently existing permits.
|
||||
permit_stats get_permit_stats() const;
|
||||
|
||||
/// Make a permit
|
||||
/// Make an admitted permit
|
||||
///
|
||||
/// The permit is already in an admitted state after being created, this
|
||||
/// method includes waiting for admission.
|
||||
/// The permit is associated with a schema, which is the schema of the table
|
||||
/// the read is executed against, and the operation name, which should be a
|
||||
/// name such that we can identify the operation which created this permit.
|
||||
/// Ideally this should be a unique enough name that we not only can identify
|
||||
/// the kind of read, but the exact code-path that was taken.
|
||||
///
|
||||
/// Some permits cannot be associated with any table, so passing nullptr as
|
||||
/// the schema parameter is allowed.
|
||||
future<reader_permit> obtain_permit(const schema* const schema, const char* const op_name, size_t memory, db::timeout_clock::time_point timeout);
|
||||
future<reader_permit> obtain_permit(const schema* const schema, sstring&& op_name, size_t memory, db::timeout_clock::time_point timeout);
|
||||
|
||||
/// Make a tracking only permit
|
||||
///
|
||||
/// The permit is not admitted. It is intended for reads that bypass the
|
||||
/// normal concurrency control, but whose resource usage we still want to
|
||||
/// keep track of, as part of that concurrency control.
|
||||
/// The permit is associated with a schema, which is the schema of the table
|
||||
/// the read is executed against, and the operation name, which should be a
|
||||
/// name such that we can identify the operation which created this permit.
|
||||
/// Ideally this should be a unique enough name that we not only can identify
|
||||
/// the kind of read, but the exact code-path that was taken.
|
||||
///
|
||||
/// Some permits cannot be associated with any table, so passing nullptr as
|
||||
/// the schema parameter is allowed.
|
||||
reader_permit make_tracking_only_permit(const schema* const schema, const char* const op_name);
|
||||
reader_permit make_tracking_only_permit(const schema* const schema, sstring&& op_name);
|
||||
|
||||
/// Run the function through the semaphore's execution stage with an admitted permit
|
||||
///
|
||||
/// First a permit is obtained via the normal admission route, as if
|
||||
/// it was created with \ref obtain_permit(), then func is enqueued to be
|
||||
/// run by the semaphore's execution loop. This emulates an execution stage,
|
||||
/// as it allows batching multiple funcs to be run together. Unlike an
|
||||
/// execution stage, with_permit() accepts a type-erased function, which
|
||||
/// allows for more flexibility in what functions are batched together.
|
||||
/// Use only functions that share most of their code to benefit from the
|
||||
/// instruction-cache warm-up!
|
||||
///
|
||||
/// The permit is associated with a schema, which is the schema of the table
|
||||
/// the read is executed against, and the operation name, which should be a
|
||||
@@ -299,8 +370,16 @@ public:
|
||||
///
|
||||
/// Some permits cannot be associated with any table, so passing nullptr as
|
||||
/// the schema parameter is allowed.
|
||||
reader_permit make_permit(const schema* const schema, const char* const op_name);
|
||||
reader_permit make_permit(const schema* const schema, sstring&& op_name);
|
||||
future<> with_permit(const schema* const schema, const char* const op_name, size_t memory, db::timeout_clock::time_point timeout, read_func func);
|
||||
|
||||
/// Run the function through the semaphore's execution stage with a pre-admitted permit
|
||||
///
|
||||
/// Same as \ref with_permit(), but it uses an already admitted
|
||||
/// permit. Should only be used when a permit is already readily
|
||||
/// available, e.g. when resuming a saved read. Using
|
||||
/// \ref obtain_permit(), then \ref with_ready_permit() is less
|
||||
/// optimal then just using \ref with_permit().
|
||||
future<> with_ready_permit(reader_permit permit, read_func func);
|
||||
|
||||
const resources initial_resources() const {
|
||||
return _initial_resources;
|
||||
@@ -331,4 +410,8 @@ public:
|
||||
/// Use max-lines to cap the number of (permit) lines in the report.
|
||||
/// Use 0 for unlimited.
|
||||
std::string dump_diagnostics(unsigned max_lines = 0) const;
|
||||
|
||||
void set_max_queue_length(size_t size) {
|
||||
_max_queue_length = size;
|
||||
}
|
||||
};
|
||||
|
||||
@@ -21,6 +21,7 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <seastar/util/optimized_optional.hh>
|
||||
#include "seastarx.hh"
|
||||
|
||||
#include "db/timeout_clock.hh"
|
||||
@@ -76,26 +77,30 @@ inline bool operator==(const reader_resources& a, const reader_resources& b) {
|
||||
return a.count == b.count && a.memory == b.memory;
|
||||
}
|
||||
|
||||
std::ostream& operator<<(std::ostream& os, const reader_resources& r);
|
||||
|
||||
class reader_concurrency_semaphore;
|
||||
|
||||
/// A permit for a specific read.
|
||||
///
|
||||
/// Used to track the read's resource consumption and wait for admission to read
|
||||
/// from the disk.
|
||||
/// Use `consume_memory()` to register memory usage. Use `wait_admission()` to
|
||||
/// wait for admission, before reading from the disk. Both methods return a
|
||||
/// `resource_units` RAII object that should be held onto while the respective
|
||||
/// resources are in use.
|
||||
/// Used to track the read's resource consumption. Use `consume_memory()` to
|
||||
/// register memory usage, which returns a `resource_units` RAII object that
|
||||
/// should be held onto while the respective resources are in use.
|
||||
class reader_permit {
|
||||
friend class reader_concurrency_semaphore;
|
||||
|
||||
public:
|
||||
class resource_units;
|
||||
class used_guard;
|
||||
class blocked_guard;
|
||||
|
||||
enum class state {
|
||||
waiting, // waiting for admission
|
||||
active,
|
||||
active_unused,
|
||||
active_used,
|
||||
active_blocked,
|
||||
inactive,
|
||||
evicted,
|
||||
};
|
||||
|
||||
class impl;
|
||||
@@ -104,12 +109,28 @@ private:
|
||||
shared_ptr<impl> _impl;
|
||||
|
||||
private:
|
||||
explicit reader_permit(reader_concurrency_semaphore& semaphore, const schema* const schema, std::string_view op_name);
|
||||
explicit reader_permit(reader_concurrency_semaphore& semaphore, const schema* const schema, sstring&& op_name);
|
||||
reader_permit() = default;
|
||||
reader_permit(shared_ptr<impl>);
|
||||
explicit reader_permit(reader_concurrency_semaphore& semaphore, const schema* const schema, std::string_view op_name,
|
||||
reader_resources base_resources);
|
||||
explicit reader_permit(reader_concurrency_semaphore& semaphore, const schema* const schema, sstring&& op_name,
|
||||
reader_resources base_resources);
|
||||
|
||||
void on_waiting();
|
||||
void on_admission();
|
||||
|
||||
void mark_used() noexcept;
|
||||
|
||||
void mark_unused() noexcept;
|
||||
|
||||
void mark_blocked() noexcept;
|
||||
|
||||
void mark_unblocked() noexcept;
|
||||
|
||||
operator bool() const { return bool(_impl); }
|
||||
|
||||
friend class optimized_optional<reader_permit>;
|
||||
|
||||
public:
|
||||
~reader_permit();
|
||||
|
||||
@@ -125,7 +146,7 @@ public:
|
||||
|
||||
reader_concurrency_semaphore& semaphore();
|
||||
|
||||
future<resource_units> wait_admission(size_t memory, db::timeout_clock::time_point timeout);
|
||||
future<> maybe_wait_readmission(db::timeout_clock::time_point timeout);
|
||||
|
||||
void consume(reader_resources res);
|
||||
|
||||
@@ -137,9 +158,13 @@ public:
|
||||
|
||||
reader_resources consumed_resources() const;
|
||||
|
||||
reader_resources base_resources() const;
|
||||
|
||||
sstring description() const;
|
||||
};
|
||||
|
||||
using reader_permit_opt = optimized_optional<reader_permit>;
|
||||
|
||||
class reader_permit::resource_units {
|
||||
reader_permit _permit;
|
||||
reader_resources _resources;
|
||||
@@ -160,6 +185,55 @@ public:
|
||||
reader_resources resources() const { return _resources; }
|
||||
};
|
||||
|
||||
/// Mark a permit as used.
|
||||
///
|
||||
/// Conceptually, a permit is considered used, when at least one reader
|
||||
/// associated with it has an ongoing foreground operation initiated by
|
||||
/// its consumer. E.g. a pending `fill_buffer()` call.
|
||||
/// This class is an RAII used marker meant to be used by keeping it alive
|
||||
/// until the reader is used.
|
||||
class reader_permit::used_guard {
|
||||
reader_permit_opt _permit;
|
||||
public:
|
||||
explicit used_guard(reader_permit permit) noexcept : _permit(std::move(permit)) {
|
||||
_permit->mark_used();
|
||||
}
|
||||
used_guard(used_guard&&) noexcept = default;
|
||||
used_guard(const used_guard&) = delete;
|
||||
~used_guard() {
|
||||
if (_permit) {
|
||||
_permit->mark_unused();
|
||||
}
|
||||
}
|
||||
used_guard& operator=(used_guard&&) = delete;
|
||||
used_guard& operator=(const used_guard&) = delete;
|
||||
};
|
||||
|
||||
/// Mark a permit as blocked.
|
||||
///
|
||||
/// Conceptually, a permit is considered blocked, when at least one reader
|
||||
/// associated with it is waiting on I/O or a remote shard as part of a
|
||||
/// foreground operation initiated by its consumer. E.g. an sstable reader
|
||||
/// waiting on a disk read as part of its `fill_buffer()` call.
|
||||
/// This class is an RAII block marker meant to be used by keeping it alive
|
||||
/// until said block resolves.
|
||||
class reader_permit::blocked_guard {
|
||||
reader_permit_opt _permit;
|
||||
public:
|
||||
explicit blocked_guard(reader_permit permit) noexcept : _permit(std::move(permit)) {
|
||||
_permit->mark_blocked();
|
||||
}
|
||||
blocked_guard(blocked_guard&&) noexcept = default;
|
||||
blocked_guard(const blocked_guard&) = delete;
|
||||
~blocked_guard() {
|
||||
if (_permit) {
|
||||
_permit->mark_unblocked();
|
||||
}
|
||||
}
|
||||
blocked_guard& operator=(blocked_guard&&) = delete;
|
||||
blocked_guard& operator=(const blocked_guard&) = delete;
|
||||
};
|
||||
|
||||
template <typename Char>
|
||||
temporary_buffer<Char> make_tracked_temporary_buffer(temporary_buffer<Char> buf, reader_permit& permit) {
|
||||
return temporary_buffer<Char>(buf.get_write(), buf.size(),
|
||||
|
||||
@@ -35,6 +35,7 @@
|
||||
#include "database.hh"
|
||||
#include <seastar/util/bool_class.hh>
|
||||
#include <seastar/core/metrics_registration.hh>
|
||||
#include <seastar/core/coroutine.hh>
|
||||
#include <list>
|
||||
#include <vector>
|
||||
#include <algorithm>
|
||||
@@ -458,14 +459,14 @@ public:
|
||||
if (local_reader) {
|
||||
auto ms = mutation_source([&cf] (
|
||||
schema_ptr s,
|
||||
reader_permit,
|
||||
reader_permit permit,
|
||||
const dht::partition_range& pr,
|
||||
const query::partition_slice& ps,
|
||||
const io_priority_class& pc,
|
||||
tracing::trace_state_ptr,
|
||||
streamed_mutation::forwarding,
|
||||
mutation_reader::forwarding fwd_mr) {
|
||||
return cf.make_streaming_reader(std::move(s), pr, ps, fwd_mr);
|
||||
return cf.make_streaming_reader(std::move(s), std::move(permit), pr, ps, fwd_mr);
|
||||
});
|
||||
std::tie(_reader, _reader_handle) = make_manually_paused_evictable_reader(
|
||||
std::move(ms),
|
||||
@@ -477,7 +478,7 @@ public:
|
||||
{},
|
||||
mutation_reader::forwarding::no);
|
||||
} else {
|
||||
_reader = make_multishard_streaming_reader(db, _schema, [this] {
|
||||
_reader = make_multishard_streaming_reader(db, _schema, _permit, [this] {
|
||||
auto shard_range = _sharder.next();
|
||||
if (shard_range) {
|
||||
return std::optional<dht::partition_range>(dht::to_partition_range(*shard_range));
|
||||
@@ -788,6 +789,7 @@ public:
|
||||
seastar::sharded<db::view::view_update_generator>& view_update_generator,
|
||||
column_family& cf,
|
||||
schema_ptr s,
|
||||
reader_permit permit,
|
||||
dht::token_range range,
|
||||
row_level_diff_detect_algorithm algo,
|
||||
size_t max_row_buf_size,
|
||||
@@ -805,7 +807,7 @@ public:
|
||||
, _view_update_generator(view_update_generator)
|
||||
, _cf(cf)
|
||||
, _schema(s)
|
||||
, _permit(_cf.streaming_read_concurrency_semaphore().make_permit(_schema.get(), "repair-meta"))
|
||||
, _permit(std::move(permit))
|
||||
, _range(range)
|
||||
, _cmp(repair_sync_boundary::tri_compare(*_schema))
|
||||
, _algo(algo)
|
||||
@@ -931,6 +933,19 @@ public:
|
||||
reason] (schema_ptr s) {
|
||||
auto& db = repair.get_db();
|
||||
auto& cf = db.local().find_column_family(s->id());
|
||||
return db.local().obtain_reader_permit(cf, "repair-meta", db::no_timeout).then([s = std::move(s),
|
||||
&db,
|
||||
&cf,
|
||||
&repair,
|
||||
from,
|
||||
repair_meta_id,
|
||||
range,
|
||||
algo,
|
||||
max_row_buf_size,
|
||||
seed,
|
||||
master_node_shard_config,
|
||||
schema_version,
|
||||
reason] (reader_permit permit) mutable {
|
||||
node_repair_meta_id id{from, repair_meta_id};
|
||||
auto rm = make_lw_shared<repair_meta>(db,
|
||||
repair.get_messaging().container(),
|
||||
@@ -938,6 +953,7 @@ public:
|
||||
repair.get_view_update_generator(),
|
||||
cf,
|
||||
s,
|
||||
std::move(permit),
|
||||
range,
|
||||
algo,
|
||||
max_row_buf_size,
|
||||
@@ -956,6 +972,7 @@ public:
|
||||
} else {
|
||||
rlogger.debug("insert_repair_meta: Inserted repair_meta_id {} for node {}", id.repair_meta_id, id.ip);
|
||||
}
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
@@ -2827,12 +2844,15 @@ public:
|
||||
auto schema_version = s->version();
|
||||
bool table_dropped = false;
|
||||
|
||||
auto permit = _ri.db.local().obtain_reader_permit(_cf, "repair-meta", db::no_timeout).get0();
|
||||
|
||||
repair_meta master(_ri.db,
|
||||
_ri.messaging,
|
||||
_ri.sys_dist_ks,
|
||||
_ri.view_update_generator,
|
||||
_cf,
|
||||
s,
|
||||
std::move(permit),
|
||||
_range,
|
||||
algorithm,
|
||||
max_row_buf_size,
|
||||
|
||||
@@ -1744,7 +1744,7 @@ class scylla_memory(gdb.Command):
|
||||
**mem_stats))
|
||||
|
||||
gdb.write(' Execution Stages:\n')
|
||||
for es_path in [('_data_query_stage',), ('_mutation_query_stage', '_execution_stage'), ('_apply_stage',)]:
|
||||
for es_path in [('_apply_stage',)]:
|
||||
machine_name = es_path[0]
|
||||
human_name = machine_name.replace('_', ' ').strip()
|
||||
total = 0
|
||||
|
||||
@@ -3260,7 +3260,8 @@ future<> storage_service::load_and_stream(sstring ks_name, sstring cf_name,
|
||||
size_t num_partitions_processed = 0;
|
||||
size_t num_bytes_read = 0;
|
||||
nr_sst_current += sst_processed.size();
|
||||
auto reader = table.make_streaming_reader(s, full_partition_range, sst_set);
|
||||
auto permit = co_await _db.local().obtain_reader_permit(table, "storage_service::load_and_stream()", db::no_timeout);
|
||||
auto reader = table.make_streaming_reader(s, std::move(permit), full_partition_range, sst_set);
|
||||
std::exception_ptr eptr;
|
||||
bool failed = false;
|
||||
try {
|
||||
|
||||
@@ -491,6 +491,8 @@ protected:
|
||||
sstables::reader_position_tracker _stream_position;
|
||||
// remaining length of input to read (if <0, continue until end of file).
|
||||
uint64_t _remain;
|
||||
std::optional<reader_permit::blocked_guard> _blocked_guard;
|
||||
bool _first_invoke = true;
|
||||
public:
|
||||
using read_status = data_consumer::read_status;
|
||||
|
||||
@@ -501,6 +503,16 @@ public:
|
||||
, _remain(maxlen) {}
|
||||
|
||||
future<> consume_input() {
|
||||
// On first invoke we are guaranteed to go to the disk, so mark as
|
||||
// blocked unconditionally. On succeeding invokes, we determine whether
|
||||
// we need to block inside operator().
|
||||
// One corner case this misses is when the last operator() consumed all
|
||||
// data but didn't want more so the next invocation will block, we bet
|
||||
// on this being rare.
|
||||
if (_first_invoke) {
|
||||
_first_invoke = false;
|
||||
mark_blocked();
|
||||
}
|
||||
return _input.consume(state_processor());
|
||||
}
|
||||
|
||||
@@ -508,6 +520,14 @@ public:
|
||||
state_processor().verify_end_state();
|
||||
}
|
||||
|
||||
void mark_blocked() {
|
||||
_blocked_guard.emplace(_permit);
|
||||
}
|
||||
|
||||
void mark_unblocked() {
|
||||
_blocked_guard.reset();
|
||||
}
|
||||
|
||||
data_consumer::processing_result skip(temporary_buffer<char>& data, uint32_t len) {
|
||||
if (data.size() >= len) {
|
||||
data.trim_front(len);
|
||||
@@ -548,6 +568,7 @@ public:
|
||||
// called by input_stream::consume():
|
||||
future<consumption_result_type>
|
||||
operator()(temporary_buffer<char> data) {
|
||||
mark_unblocked();
|
||||
if (data.size() >= _remain) {
|
||||
// We received more data than we actually care about, so process
|
||||
// the beginning of the buffer, and return the rest to the stream
|
||||
@@ -575,6 +596,7 @@ public:
|
||||
_remain -= orig_data_size - data.size();
|
||||
_stream_position.position -= data.size();
|
||||
if (value == proceed::yes) {
|
||||
mark_blocked();
|
||||
return make_ready_future<consumption_result_type>(continue_consuming{});
|
||||
} else {
|
||||
return make_ready_future<consumption_result_type>(stop_consuming<char>{std::move(data)});
|
||||
@@ -592,6 +614,7 @@ public:
|
||||
}
|
||||
_stream_position.position += skip.get_value();
|
||||
_remain -= skip.get_value();
|
||||
mark_blocked();
|
||||
return make_ready_future<consumption_result_type>(std::move(skip));
|
||||
});
|
||||
}
|
||||
@@ -606,7 +629,8 @@ public:
|
||||
_remain = end - _stream_position.position;
|
||||
|
||||
primitive_consumer::reset();
|
||||
return _input.skip(n);
|
||||
reader_permit::blocked_guard _{_permit};
|
||||
co_await _input.skip(n);
|
||||
}
|
||||
|
||||
future<> skip_to(size_t begin) {
|
||||
|
||||
@@ -757,7 +757,7 @@ public:
|
||||
, _enc_stats(enc_stats)
|
||||
, _shard(shard)
|
||||
, _semaphore(reader_concurrency_semaphore::no_limits{}, "mx writer")
|
||||
, _range_tombstones(range_tombstone_stream(_schema, _semaphore.make_permit(&s, "mx-writer")))
|
||||
, _range_tombstones(range_tombstone_stream(_schema, _semaphore.make_tracking_only_permit(&s, "mx-writer")))
|
||||
, _tmp_bufs(_sst.sstable_buffer_size)
|
||||
, _sst_schema(make_sstable_schema(s, _enc_stats, _cfg))
|
||||
, _run_identifier(cfg.run_identifier)
|
||||
|
||||
@@ -1098,33 +1098,6 @@ sstable_set::make_local_shard_sstable_reader(
|
||||
fwd_mr);
|
||||
}
|
||||
|
||||
flat_mutation_reader make_restricted_range_sstable_reader(
|
||||
lw_shared_ptr<sstable_set> sstables,
|
||||
schema_ptr s,
|
||||
reader_permit permit,
|
||||
const dht::partition_range& pr,
|
||||
const query::partition_slice& slice,
|
||||
const io_priority_class& pc,
|
||||
tracing::trace_state_ptr trace_state,
|
||||
streamed_mutation::forwarding fwd,
|
||||
mutation_reader::forwarding fwd_mr,
|
||||
read_monitor_generator& monitor_generator)
|
||||
{
|
||||
auto ms = mutation_source([sstables=std::move(sstables), &monitor_generator] (
|
||||
schema_ptr s,
|
||||
reader_permit permit,
|
||||
const dht::partition_range& pr,
|
||||
const query::partition_slice& slice,
|
||||
const io_priority_class& pc,
|
||||
tracing::trace_state_ptr trace_state,
|
||||
streamed_mutation::forwarding fwd,
|
||||
mutation_reader::forwarding fwd_mr) {
|
||||
return sstables->make_range_sstable_reader(std::move(s), std::move(permit), pr, slice, pc,
|
||||
std::move(trace_state), fwd, fwd_mr, monitor_generator);
|
||||
});
|
||||
return make_restricted_flat_reader(std::move(ms), std::move(s), std::move(permit), pr, slice, pc, std::move(trace_state), fwd, fwd_mr);
|
||||
}
|
||||
|
||||
unsigned sstable_set_overlapping_count(const schema_ptr& schema, const std::vector<shared_sstable>& sstables) {
|
||||
unsigned overlapping_sstables = 0;
|
||||
auto prev_last = dht::ring_position::min();
|
||||
|
||||
@@ -158,22 +158,6 @@ public:
|
||||
friend class compound_sstable_set;
|
||||
};
|
||||
|
||||
/// Read a range from the passed-in sstables.
|
||||
///
|
||||
/// The reader is restricted, that is it will wait for admission on the semaphore
|
||||
/// belonging to the passed-in permit, before starting to read.
|
||||
flat_mutation_reader make_restricted_range_sstable_reader(
|
||||
lw_shared_ptr<sstable_set> sstables,
|
||||
schema_ptr,
|
||||
reader_permit,
|
||||
const dht::partition_range&,
|
||||
const query::partition_slice&,
|
||||
const io_priority_class&,
|
||||
tracing::trace_state_ptr,
|
||||
streamed_mutation::forwarding,
|
||||
mutation_reader::forwarding,
|
||||
read_monitor_generator& rmg = default_read_monitor_generator());
|
||||
|
||||
sstable_set make_partitioned_sstable_set(schema_ptr schema, lw_shared_ptr<sstable_list> all, bool use_level_metadata = true);
|
||||
|
||||
sstable_set make_compound_sstable_set(schema_ptr schema, std::vector<lw_shared_ptr<sstable_set>> sets);
|
||||
|
||||
@@ -1790,7 +1790,7 @@ future<> sstable::generate_summary(const io_priority_class& pc) {
|
||||
|
||||
auto s = summary_generator(_schema->get_partitioner(), _components->summary, _manager.config().sstable_summary_ratio());
|
||||
auto ctx = make_lw_shared<index_consume_entry_context<summary_generator>>(
|
||||
sem.make_permit(_schema.get(), "generate-summary"), s, trust_promoted_index::yes, *_schema, index_file, std::move(options), 0, index_size,
|
||||
sem.make_tracking_only_permit(_schema.get(), "generate-summary"), s, trust_promoted_index::yes, *_schema, index_file, std::move(options), 0, index_size,
|
||||
(_version >= sstable_version_types::mc
|
||||
? std::make_optional(get_clustering_values_fixed_lengths(get_serialization_header()))
|
||||
: std::optional<column_values_fixed_lengths>{}));
|
||||
@@ -2701,7 +2701,7 @@ future<bool> sstable::has_partition_key(const utils::hashed_key& hk, const dht::
|
||||
std::exception_ptr ex;
|
||||
auto sem = reader_concurrency_semaphore(reader_concurrency_semaphore::no_limits{}, "sstables::has_partition_key()");
|
||||
try {
|
||||
auto lh_index_ptr = std::make_unique<sstables::index_reader>(s, sem.make_permit(_schema.get(), s->get_filename()), default_priority_class(), tracing::trace_state_ptr());
|
||||
auto lh_index_ptr = std::make_unique<sstables::index_reader>(s, sem.make_tracking_only_permit(_schema.get(), s->get_filename()), default_priority_class(), tracing::trace_state_ptr());
|
||||
present = co_await lh_index_ptr->advance_lower_and_check_if_present(dk);
|
||||
} catch (...) {
|
||||
ex = std::current_exception();
|
||||
|
||||
@@ -146,13 +146,13 @@ void stream_session::init_messaging_service_handler(netw::messaging_service& ms,
|
||||
}
|
||||
|
||||
return mm->get_schema_for_write(schema_id, from, ms).then([from, estimated_partitions, plan_id, schema_id, &cf, source, reason] (schema_ptr s) mutable {
|
||||
return get_local_db().obtain_reader_permit(cf, "stream-session", db::no_timeout).then([from, estimated_partitions, plan_id, schema_id, &cf, source, reason, s] (reader_permit permit) mutable {
|
||||
auto sink = stream_session::ms().make_sink_for_stream_mutation_fragments(source);
|
||||
struct stream_mutation_fragments_cmd_status {
|
||||
bool got_cmd = false;
|
||||
bool got_end_of_stream = false;
|
||||
};
|
||||
auto cmd_status = make_lw_shared<stream_mutation_fragments_cmd_status>();
|
||||
auto permit = cf.streaming_read_concurrency_semaphore().make_permit(s.get(), "stream-session");
|
||||
auto get_next_mutation_fragment = [source, plan_id, from, s, cmd_status, permit] () mutable {
|
||||
return source().then([plan_id, from, s, cmd_status, permit] (std::optional<std::tuple<frozen_mutation_fragment, rpc::optional<stream_mutation_fragments_cmd>>> opt) mutable {
|
||||
if (opt) {
|
||||
@@ -215,6 +215,7 @@ void stream_session::init_messaging_service_handler(netw::messaging_service& ms,
|
||||
});
|
||||
return make_ready_future<rpc::sink<int>>(sink);
|
||||
});
|
||||
});
|
||||
});
|
||||
ms.register_stream_mutation_done([] (const rpc::client_info& cinfo, UUID plan_id, dht::token_range_vector ranges, UUID cf_id, unsigned dst_cpu_id) {
|
||||
const auto& from = cinfo.retrieve_auxiliary<gms::inet_address>("baddr");
|
||||
|
||||
@@ -87,20 +87,20 @@ struct send_info {
|
||||
dht::token_range_vector ranges;
|
||||
dht::partition_range_vector prs;
|
||||
flat_mutation_reader reader;
|
||||
send_info(database& db_, netw::messaging_service& ms_, utils::UUID plan_id_, utils::UUID cf_id_,
|
||||
send_info(database& db_, netw::messaging_service& ms_, utils::UUID plan_id_, table& tbl_, reader_permit permit_,
|
||||
dht::token_range_vector ranges_, netw::messaging_service::msg_addr id_,
|
||||
uint32_t dst_cpu_id_, stream_reason reason_)
|
||||
: db(db_)
|
||||
, ms(ms_)
|
||||
, plan_id(plan_id_)
|
||||
, cf_id(cf_id_)
|
||||
, cf_id(tbl_.schema()->id())
|
||||
, id(id_)
|
||||
, dst_cpu_id(dst_cpu_id_)
|
||||
, reason(reason_)
|
||||
, cf(db.find_column_family(cf_id))
|
||||
, cf(tbl_)
|
||||
, ranges(std::move(ranges_))
|
||||
, prs(dht::to_partition_ranges(ranges))
|
||||
, reader(cf.make_streaming_reader(cf.schema(), prs)) {
|
||||
, reader(cf.make_streaming_reader(cf.schema(), std::move(permit_), prs)) {
|
||||
}
|
||||
future<bool> has_relevant_range_on_this_shard() {
|
||||
return do_with(false, ranges.begin(), [this] (bool& found_relevant_range, dht::token_range_vector::iterator& ranges_it) {
|
||||
@@ -221,8 +221,10 @@ future<> stream_transfer_task::execute() {
|
||||
sslog.debug("[Stream #{}] stream_transfer_task: cf_id={}", plan_id, cf_id);
|
||||
sort_and_merge_ranges();
|
||||
auto reason = session->get_reason();
|
||||
return session->get_db().invoke_on_all([plan_id, cf_id, id, dst_cpu_id, ranges=this->_ranges, reason] (database& db) {
|
||||
auto si = make_lw_shared<send_info>(db, stream_session::ms(), plan_id, cf_id, std::move(ranges), id, dst_cpu_id, reason);
|
||||
return session->get_db().invoke_on_all([plan_id, cf_id, id, dst_cpu_id, ranges=this->_ranges, reason] (database& db) mutable {
|
||||
auto& tbl = db.find_column_family(cf_id);
|
||||
return db.obtain_reader_permit(tbl, "stream-transfer-task", db::no_timeout).then([&db, &tbl, plan_id, cf_id, id, dst_cpu_id, ranges=std::move(ranges), reason] (reader_permit permit) mutable {
|
||||
auto si = make_lw_shared<send_info>(db, stream_session::ms(), plan_id, tbl, std::move(permit), std::move(ranges), id, dst_cpu_id, reason);
|
||||
return si->has_relevant_range_on_this_shard().then([si, plan_id, cf_id] (bool has_relevant_range_on_this_shard) {
|
||||
if (!has_relevant_range_on_this_shard) {
|
||||
sslog.debug("[Stream #{}] stream_transfer_task: cf_id={}: ignore ranges on shard={}",
|
||||
@@ -233,6 +235,7 @@ future<> stream_transfer_task::execute() {
|
||||
}).finally([si] {
|
||||
return si->reader.close();
|
||||
});
|
||||
});
|
||||
}).then([this, plan_id, cf_id, id] {
|
||||
sslog.debug("[Stream #{}] SEND STREAM_MUTATION_DONE to {}, cf_id={}", plan_id, id, cf_id);
|
||||
return session->ms().send_stream_mutation_done(id, plan_id, _ranges,
|
||||
|
||||
139
table.cc
139
table.cc
@@ -72,52 +72,18 @@ table::make_sstable_reader(schema_ptr s,
|
||||
// we want to optimize and read exactly this partition. As a
|
||||
// consequence, fast_forward_to() will *NOT* work on the result,
|
||||
// regardless of what the fwd_mr parameter says.
|
||||
auto ms = [&] () -> mutation_source {
|
||||
if (pr.is_singular() && pr.start()->value().has_key()) {
|
||||
const dht::ring_position& pos = pr.start()->value();
|
||||
if (dht::shard_of(*s, pos.token()) != this_shard_id()) {
|
||||
return mutation_source([] (
|
||||
schema_ptr s,
|
||||
reader_permit permit,
|
||||
const dht::partition_range& pr,
|
||||
const query::partition_slice& slice,
|
||||
const io_priority_class& pc,
|
||||
tracing::trace_state_ptr trace_state,
|
||||
streamed_mutation::forwarding fwd,
|
||||
mutation_reader::forwarding fwd_mr) {
|
||||
return make_empty_flat_reader(s, std::move(permit)); // range doesn't belong to this shard
|
||||
});
|
||||
}
|
||||
|
||||
return mutation_source([this, sstables=std::move(sstables)] (
|
||||
schema_ptr s,
|
||||
reader_permit permit,
|
||||
const dht::partition_range& pr,
|
||||
const query::partition_slice& slice,
|
||||
const io_priority_class& pc,
|
||||
tracing::trace_state_ptr trace_state,
|
||||
streamed_mutation::forwarding fwd,
|
||||
mutation_reader::forwarding fwd_mr) {
|
||||
return sstables->create_single_key_sstable_reader(const_cast<column_family*>(this), std::move(s), std::move(permit),
|
||||
_stats.estimated_sstable_per_read, pr, slice, pc, std::move(trace_state), fwd, fwd_mr);
|
||||
});
|
||||
} else {
|
||||
return mutation_source([sstables=std::move(sstables)] (
|
||||
schema_ptr s,
|
||||
reader_permit permit,
|
||||
const dht::partition_range& pr,
|
||||
const query::partition_slice& slice,
|
||||
const io_priority_class& pc,
|
||||
tracing::trace_state_ptr trace_state,
|
||||
streamed_mutation::forwarding fwd,
|
||||
mutation_reader::forwarding fwd_mr) {
|
||||
return sstables->make_local_shard_sstable_reader(std::move(s), std::move(permit), pr, slice, pc,
|
||||
std::move(trace_state), fwd, fwd_mr);
|
||||
});
|
||||
if (pr.is_singular() && pr.start()->value().has_key()) {
|
||||
const dht::ring_position& pos = pr.start()->value();
|
||||
if (dht::shard_of(*s, pos.token()) != this_shard_id()) {
|
||||
return make_empty_flat_reader(s, std::move(permit)); // range doesn't belong to this shard
|
||||
}
|
||||
}();
|
||||
|
||||
return make_restricted_flat_reader(std::move(ms), std::move(s), std::move(permit), pr, slice, pc, std::move(trace_state), fwd, fwd_mr);
|
||||
return sstables->create_single_key_sstable_reader(const_cast<column_family*>(this), std::move(s), std::move(permit),
|
||||
_stats.estimated_sstable_per_read, pr, slice, pc, std::move(trace_state), fwd, fwd_mr);
|
||||
} else {
|
||||
return sstables->make_local_shard_sstable_reader(std::move(s), std::move(permit), pr, slice, pc,
|
||||
std::move(trace_state), fwd, fwd_mr);
|
||||
}
|
||||
}
|
||||
|
||||
lw_shared_ptr<sstables::sstable_set> table::make_compound_sstable_set() {
|
||||
@@ -236,9 +202,8 @@ sstables::shared_sstable table::make_streaming_sstable_for_write(std::optional<s
|
||||
}
|
||||
|
||||
flat_mutation_reader
|
||||
table::make_streaming_reader(schema_ptr s,
|
||||
table::make_streaming_reader(schema_ptr s, reader_permit permit,
|
||||
const dht::partition_range_vector& ranges) const {
|
||||
auto permit = _config.streaming_read_concurrency_semaphore->make_permit(s.get(), "stream-ranges");
|
||||
auto& slice = s->full_slice();
|
||||
auto& pc = service::get_local_streaming_priority();
|
||||
|
||||
@@ -256,9 +221,8 @@ table::make_streaming_reader(schema_ptr s,
|
||||
return make_flat_multi_range_reader(s, std::move(permit), std::move(source), ranges, slice, pc, nullptr, mutation_reader::forwarding::no);
|
||||
}
|
||||
|
||||
flat_mutation_reader table::make_streaming_reader(schema_ptr schema, const dht::partition_range& range,
|
||||
flat_mutation_reader table::make_streaming_reader(schema_ptr schema, reader_permit permit, const dht::partition_range& range,
|
||||
const query::partition_slice& slice, mutation_reader::forwarding fwd_mr) const {
|
||||
auto permit = _config.streaming_read_concurrency_semaphore->make_permit(schema.get(), "stream-range");
|
||||
const auto& pc = service::get_local_streaming_priority();
|
||||
auto trace_state = tracing::trace_state_ptr();
|
||||
const auto fwd = streamed_mutation::forwarding::no;
|
||||
@@ -272,9 +236,8 @@ flat_mutation_reader table::make_streaming_reader(schema_ptr schema, const dht::
|
||||
return make_combined_reader(std::move(schema), std::move(permit), std::move(readers), fwd, fwd_mr);
|
||||
}
|
||||
|
||||
flat_mutation_reader table::make_streaming_reader(schema_ptr schema, const dht::partition_range& range,
|
||||
flat_mutation_reader table::make_streaming_reader(schema_ptr schema, reader_permit permit, const dht::partition_range& range,
|
||||
lw_shared_ptr<sstables::sstable_set> sstables) const {
|
||||
auto permit = _config.streaming_read_concurrency_semaphore->make_permit(schema.get(), "load-and-stream");
|
||||
auto& slice = schema->full_slice();
|
||||
const auto& pc = service::get_local_streaming_priority();
|
||||
auto trace_state = tracing::trace_state_ptr();
|
||||
@@ -647,7 +610,7 @@ table::try_flush_memtable_to_sstable(lw_shared_ptr<memtable> old, sstable_write_
|
||||
|
||||
auto f = consumer(old->make_flush_reader(
|
||||
old->schema(),
|
||||
compaction_concurrency_semaphore().make_permit(old->schema().get(), "try_flush_memtable_to_sstable()"),
|
||||
compaction_concurrency_semaphore().make_tracking_only_permit(old->schema().get(), "try_flush_memtable_to_sstable()"),
|
||||
service::get_local_memtable_flush_priority()));
|
||||
|
||||
// Switch back to default scheduling group for post-flush actions, to avoid them being staved by the memtable flush
|
||||
@@ -1836,6 +1799,12 @@ future<> table::populate_views(
|
||||
}
|
||||
}
|
||||
|
||||
const ssize_t new_reader_base_cost{16 * 1024};
|
||||
|
||||
size_t table::estimate_read_memory_cost() const {
|
||||
return new_reader_base_cost;
|
||||
}
|
||||
|
||||
void table::set_hit_rate(gms::inet_address addr, cache_temperature rate) {
|
||||
auto& e = _cluster_cache_hit_rates[addr];
|
||||
e.rate = rate;
|
||||
@@ -1949,7 +1918,7 @@ write_memtable_to_sstable(memtable& mt, sstables::shared_sstable sst, sstables::
|
||||
std::make_unique<reader_concurrency_semaphore>(reader_concurrency_semaphore::no_limits{}, "write_memtable_to_sstable"),
|
||||
cfg,
|
||||
[&mt, sst] (auto& monitor, auto& semaphore, auto& cfg) {
|
||||
return write_memtable_to_sstable(semaphore->make_permit(mt.schema().get(), "mt_to_sst"), mt, std::move(sst), monitor, cfg)
|
||||
return write_memtable_to_sstable(semaphore->make_tracking_only_permit(mt.schema().get(), "mt_to_sst"), mt, std::move(sst), monitor, cfg)
|
||||
.finally([&semaphore] {
|
||||
return semaphore->stop();
|
||||
});
|
||||
@@ -1992,6 +1961,7 @@ struct query_state {
|
||||
|
||||
future<lw_shared_ptr<query::result>>
|
||||
table::query(schema_ptr s,
|
||||
reader_permit permit,
|
||||
const query::read_command& cmd,
|
||||
query::query_class_config class_config,
|
||||
query::result_options opts,
|
||||
@@ -1999,7 +1969,7 @@ table::query(schema_ptr s,
|
||||
tracing::trace_state_ptr trace_state,
|
||||
query::result_memory_limiter& memory_limiter,
|
||||
db::timeout_clock::time_point timeout,
|
||||
query::querier_cache_context cache_ctx) {
|
||||
std::optional<query::data_querier>* saved_querier) {
|
||||
if (cmd.get_row_limit() == 0 || cmd.slice.partition_row_limit() == 0 || cmd.partition_limit == 0) {
|
||||
co_return make_lw_shared<query::result>();
|
||||
}
|
||||
@@ -2022,63 +1992,84 @@ table::query(schema_ptr s,
|
||||
|
||||
query_state qs(s, cmd, opts, partition_ranges, std::move(accounter));
|
||||
|
||||
std::optional<query::data_querier> querier_opt;
|
||||
if (saved_querier) {
|
||||
querier_opt = std::move(*saved_querier);
|
||||
}
|
||||
|
||||
while (!qs.done()) {
|
||||
auto&& range = *qs.current_partition_range++;
|
||||
|
||||
auto querier_opt = cache_ctx.lookup_data_querier(*s, range, qs.cmd.slice, trace_state);
|
||||
auto q = querier_opt
|
||||
? std::move(*querier_opt)
|
||||
: query::data_querier(as_mutation_source(), s, class_config.semaphore.make_permit(s.get(), "data-query"), range, qs.cmd.slice,
|
||||
service::get_local_sstable_query_read_priority(), trace_state);
|
||||
if (!querier_opt) {
|
||||
querier_opt = query::data_querier(as_mutation_source(), s, permit, range, qs.cmd.slice,
|
||||
service::get_local_sstable_query_read_priority(), trace_state);
|
||||
}
|
||||
auto& q = *querier_opt;
|
||||
|
||||
std::exception_ptr ex;
|
||||
try {
|
||||
co_await q.consume_page(query_result_builder(*s, qs.builder), qs.remaining_rows(), qs.remaining_partitions(), qs.cmd.timestamp, timeout,
|
||||
class_config.max_memory_for_unlimited_query);
|
||||
|
||||
if (q.are_limits_reached() || qs.builder.is_short_read()) {
|
||||
cache_ctx.insert(std::move(q), std::move(trace_state));
|
||||
}
|
||||
} catch (...) {
|
||||
ex = std::current_exception();
|
||||
}
|
||||
co_await q.close();
|
||||
if (ex || !qs.done()) {
|
||||
co_await q.close();
|
||||
querier_opt = {};
|
||||
}
|
||||
if (ex) {
|
||||
std::rethrow_exception(std::move(ex));
|
||||
}
|
||||
}
|
||||
|
||||
if (!saved_querier || (querier_opt && !querier_opt->are_limits_reached() && !qs.builder.is_short_read())) {
|
||||
co_await querier_opt->close();
|
||||
querier_opt = {};
|
||||
}
|
||||
if (saved_querier) {
|
||||
*saved_querier = std::move(querier_opt);
|
||||
}
|
||||
|
||||
co_return make_lw_shared<query::result>(qs.builder.build());
|
||||
}
|
||||
|
||||
future<reconcilable_result>
|
||||
table::mutation_query(schema_ptr s,
|
||||
reader_permit permit,
|
||||
const query::read_command& cmd,
|
||||
query::query_class_config class_config,
|
||||
const dht::partition_range& range,
|
||||
tracing::trace_state_ptr trace_state,
|
||||
query::result_memory_accounter accounter,
|
||||
db::timeout_clock::time_point timeout,
|
||||
query::querier_cache_context cache_ctx) {
|
||||
std::optional<query::mutation_querier>* saved_querier) {
|
||||
if (cmd.get_row_limit() == 0 || cmd.slice.partition_row_limit() == 0 || cmd.partition_limit == 0) {
|
||||
co_return reconcilable_result();
|
||||
}
|
||||
|
||||
auto querier_opt = cache_ctx.lookup_mutation_querier(*s, range, cmd.slice, trace_state);
|
||||
auto q = querier_opt
|
||||
? std::move(*querier_opt)
|
||||
: query::mutation_querier(as_mutation_source(), s, class_config.semaphore.make_permit(s.get(), "mutation-query"), range, cmd.slice,
|
||||
service::get_local_sstable_query_read_priority(), trace_state);
|
||||
std::optional<query::mutation_querier> querier_opt;
|
||||
if (saved_querier) {
|
||||
querier_opt = std::move(*saved_querier);
|
||||
}
|
||||
if (!querier_opt) {
|
||||
querier_opt = query::mutation_querier(as_mutation_source(), s, permit, range, cmd.slice,
|
||||
service::get_local_sstable_query_read_priority(), trace_state);
|
||||
}
|
||||
auto& q = *querier_opt;
|
||||
|
||||
std::exception_ptr ex;
|
||||
try {
|
||||
auto rrb = reconcilable_result_builder(*s, cmd.slice, std::move(accounter));
|
||||
auto r = co_await q.consume_page(std::move(rrb), cmd.get_row_limit(), cmd.partition_limit, cmd.timestamp, timeout, class_config.max_memory_for_unlimited_query);
|
||||
|
||||
if (q.are_limits_reached() || r.is_short_read()) {
|
||||
cache_ctx.insert(std::move(q), std::move(trace_state));
|
||||
if (!saved_querier || (!q.are_limits_reached() && !r.is_short_read())) {
|
||||
co_await q.close();
|
||||
querier_opt = {};
|
||||
}
|
||||
co_await q.close();
|
||||
if (saved_querier) {
|
||||
*saved_querier = std::move(querier_opt);
|
||||
}
|
||||
|
||||
co_return r;
|
||||
} catch (...) {
|
||||
ex = std::current_exception();
|
||||
@@ -2259,7 +2250,7 @@ future<row_locker::lock_holder> table::do_push_view_replica_updates(schema_ptr s
|
||||
auto cr_ranges = co_await db::view::calculate_affected_clustering_ranges(*base, m.decorated_key(), m.partition(), views);
|
||||
if (cr_ranges.empty()) {
|
||||
tracing::trace(tr_state, "View updates do not require read-before-write");
|
||||
co_await generate_and_propagate_view_updates(base, sem.make_permit(s.get(), "push-view-updates-1"), std::move(views), std::move(m), { }, std::move(tr_state), now);
|
||||
co_await generate_and_propagate_view_updates(base, sem.make_tracking_only_permit(s.get(), "push-view-updates-1"), std::move(views), std::move(m), { }, std::move(tr_state), now);
|
||||
// In this case we are not doing a read-before-write, just a
|
||||
// write, so no lock is needed.
|
||||
co_return row_locker::lock_holder();
|
||||
@@ -2284,7 +2275,7 @@ future<row_locker::lock_holder> table::do_push_view_replica_updates(schema_ptr s
|
||||
co_await utils::get_local_injector().inject("table_push_view_replica_updates_timeout", timeout);
|
||||
auto lock = co_await std::move(lockf);
|
||||
auto pk = dht::partition_range::make_singular(m.decorated_key());
|
||||
auto permit = sem.make_permit(base.get(), "push-view-updates-2");
|
||||
auto permit = sem.make_tracking_only_permit(base.get(), "push-view-updates-2");
|
||||
auto reader = source.make_reader(base, permit, pk, slice, io_priority, tr_state, streamed_mutation::forwarding::no, mutation_reader::forwarding::no);
|
||||
co_await this->generate_and_propagate_view_updates(base, std::move(permit), std::move(views), std::move(m), std::move(reader), tr_state, now);
|
||||
tracing::trace(tr_state, "View updates for {}.{} were generated and propagated", base->ks_name(), base->cf_name());
|
||||
|
||||
@@ -647,7 +647,7 @@ SEASTAR_TEST_CASE(test_commitlog_replay_invalid_key){
|
||||
}
|
||||
|
||||
{
|
||||
auto rd = mt.make_flat_reader(s, db.get_reader_concurrency_semaphore().make_permit(s.get(), "test"));
|
||||
auto rd = mt.make_flat_reader(s, db.get_reader_concurrency_semaphore().make_tracking_only_permit(s.get(), "test"));
|
||||
auto close_rd = deferred_close(rd);
|
||||
auto mopt = read_mutation_from_flat_mutation_reader(rd, db::no_timeout).get0();
|
||||
BOOST_REQUIRE(mopt);
|
||||
|
||||
@@ -42,6 +42,7 @@ class test_consumer final : public data_consumer::continuous_data_consumer<test_
|
||||
uint64_t _tested_value;
|
||||
int _state = 0;
|
||||
int _count = 0;
|
||||
reader_permit::used_guard _used_guard;
|
||||
|
||||
void check(uint64_t got) {
|
||||
BOOST_REQUIRE_EQUAL(_tested_value, got);
|
||||
@@ -65,6 +66,7 @@ public:
|
||||
test_consumer(reader_permit permit, uint64_t tested_value)
|
||||
: continuous_data_consumer(std::move(permit), prepare_stream(tested_value), 0, calculate_length(tested_value))
|
||||
, _tested_value(tested_value)
|
||||
, _used_guard(_permit)
|
||||
{ }
|
||||
|
||||
bool non_consuming() { return false; }
|
||||
|
||||
@@ -35,6 +35,11 @@ SEASTAR_TEST_CASE(test_index_with_paging) {
|
||||
e.execute_cql(format("INSERT INTO tab (pk, ck, v, v2, v3) VALUES ({}, 'hello{}', 1, {}, '{}')", i % 3, i, i, big_string)).get();
|
||||
}
|
||||
|
||||
e.db().invoke_on_all([] (database& db) {
|
||||
// The semaphore's queue has to able to absorb one read / row in this test.
|
||||
db.get_reader_concurrency_semaphore().set_max_queue_length(64 * 1024);
|
||||
}).get();
|
||||
|
||||
eventually([&] {
|
||||
auto qo = std::make_unique<cql3::query_options>(db::consistency_level::LOCAL_ONE, std::vector<cql3::raw_value>{},
|
||||
cql3::query_options::specific_options{4321, nullptr, {}, api::new_timestamp()});
|
||||
|
||||
@@ -427,7 +427,7 @@ SEASTAR_THREAD_TEST_CASE(test_mutation_fragment_mutate_exception_safety) {
|
||||
|
||||
reader_concurrency_semaphore sem(1, 100, get_name());
|
||||
auto stop_sem = deferred_stop(sem);
|
||||
auto permit = sem.make_permit(s.schema().get(), get_name());
|
||||
auto permit = sem.make_tracking_only_permit(s.schema().get(), get_name());
|
||||
|
||||
const auto available_res = sem.available_resources();
|
||||
const sstring val(1024, 'a');
|
||||
|
||||
@@ -936,278 +936,6 @@ sstables::shared_sstable create_sstable(sstables::test_env& env, schema_ptr s, s
|
||||
}, mutations);
|
||||
}
|
||||
|
||||
class tracking_reader : public flat_mutation_reader::impl {
|
||||
flat_mutation_reader _reader;
|
||||
std::size_t _call_count{0};
|
||||
std::size_t _ff_count{0};
|
||||
public:
|
||||
tracking_reader(schema_ptr schema, reader_permit permit, lw_shared_ptr<sstables::sstable> sst)
|
||||
: impl(schema, permit)
|
||||
, _reader(sst->make_reader(
|
||||
schema,
|
||||
permit,
|
||||
query::full_partition_range,
|
||||
schema->full_slice(),
|
||||
default_priority_class(),
|
||||
tracing::trace_state_ptr(),
|
||||
streamed_mutation::forwarding::no,
|
||||
mutation_reader::forwarding::yes)) {
|
||||
}
|
||||
|
||||
virtual future<> fill_buffer(db::timeout_clock::time_point timeout) override {
|
||||
++_call_count;
|
||||
return _reader.fill_buffer(timeout).then([this] {
|
||||
_end_of_stream = _reader.is_end_of_stream();
|
||||
while (!_reader.is_buffer_empty()) {
|
||||
push_mutation_fragment(*_schema, _permit, _reader.pop_mutation_fragment());
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
virtual future<> next_partition() override {
|
||||
_end_of_stream = false;
|
||||
clear_buffer_to_next_partition();
|
||||
if (is_buffer_empty()) {
|
||||
return _reader.next_partition();
|
||||
}
|
||||
return make_ready_future<>();
|
||||
}
|
||||
|
||||
virtual future<> fast_forward_to(const dht::partition_range& pr, db::timeout_clock::time_point timeout) override {
|
||||
++_ff_count;
|
||||
// Don't forward this to the underlying reader, it will force us
|
||||
// to come up with meaningful partition-ranges which is hard and
|
||||
// unecessary for these tests.
|
||||
return make_ready_future<>();
|
||||
}
|
||||
|
||||
virtual future<> fast_forward_to(position_range, db::timeout_clock::time_point timeout) override {
|
||||
return make_exception_future<>(make_backtraced_exception_ptr<std::bad_function_call>());
|
||||
}
|
||||
|
||||
future<> close() noexcept {
|
||||
return _reader.close();
|
||||
}
|
||||
|
||||
std::size_t call_count() const {
|
||||
return _call_count;
|
||||
}
|
||||
|
||||
std::size_t ff_count() const {
|
||||
return _ff_count;
|
||||
}
|
||||
};
|
||||
|
||||
class reader_wrapper {
|
||||
flat_mutation_reader _reader;
|
||||
tracking_reader* _tracker{nullptr};
|
||||
db::timeout_clock::time_point _timeout;
|
||||
public:
|
||||
reader_wrapper(
|
||||
reader_concurrency_semaphore& semaphore,
|
||||
schema_ptr schema,
|
||||
lw_shared_ptr<sstables::sstable> sst,
|
||||
db::timeout_clock::time_point timeout = db::no_timeout)
|
||||
: _reader(make_empty_flat_reader(schema, semaphore.make_permit(schema.get(), "reader_wrapper")))
|
||||
, _timeout(timeout)
|
||||
{
|
||||
auto ms = mutation_source([this, sst=std::move(sst)] (schema_ptr schema,
|
||||
reader_permit permit,
|
||||
const dht::partition_range&,
|
||||
const query::partition_slice&,
|
||||
const io_priority_class&,
|
||||
tracing::trace_state_ptr,
|
||||
streamed_mutation::forwarding,
|
||||
mutation_reader::forwarding) {
|
||||
auto tracker_ptr = std::make_unique<tracking_reader>(std::move(schema), std::move(permit), std::move(sst));
|
||||
_tracker = tracker_ptr.get();
|
||||
return flat_mutation_reader(std::move(tracker_ptr));
|
||||
});
|
||||
|
||||
_reader.close().get();
|
||||
_reader = make_restricted_flat_reader(std::move(ms), schema, semaphore.make_permit(schema.get(), "reader-wrapper"));
|
||||
}
|
||||
|
||||
reader_wrapper(
|
||||
reader_concurrency_semaphore& semaphore,
|
||||
schema_ptr schema,
|
||||
lw_shared_ptr<sstables::sstable> sst,
|
||||
db::timeout_clock::duration timeout_duration)
|
||||
: reader_wrapper(semaphore, std::move(schema), std::move(sst), db::timeout_clock::now() + timeout_duration) {
|
||||
}
|
||||
|
||||
reader_wrapper(const reader_wrapper&) = delete;
|
||||
reader_wrapper(reader_wrapper&&) = default;
|
||||
|
||||
// must be called in a seastar thread.
|
||||
~reader_wrapper() {
|
||||
_reader.close().get();
|
||||
}
|
||||
|
||||
future<> operator()() {
|
||||
while (!_reader.is_buffer_empty()) {
|
||||
_reader.pop_mutation_fragment();
|
||||
}
|
||||
return _reader.fill_buffer(_timeout);
|
||||
}
|
||||
|
||||
future<> fast_forward_to(const dht::partition_range& pr) {
|
||||
return _reader.fast_forward_to(pr, _timeout);
|
||||
}
|
||||
|
||||
std::size_t call_count() const {
|
||||
return _tracker ? _tracker->call_count() : 0;
|
||||
}
|
||||
|
||||
std::size_t ff_count() const {
|
||||
return _tracker ? _tracker->ff_count() : 0;
|
||||
}
|
||||
|
||||
bool created() const {
|
||||
return bool(_tracker);
|
||||
}
|
||||
|
||||
future<> close() noexcept {
|
||||
return _reader.close();
|
||||
}
|
||||
};
|
||||
|
||||
SEASTAR_TEST_CASE(restricted_reader_reading) {
|
||||
return sstables::test_env::do_with_async([&] (sstables::test_env& env) {
|
||||
reader_concurrency_semaphore semaphore(2, new_reader_base_cost, get_name());
|
||||
auto stop_sem = deferred_stop(semaphore);
|
||||
|
||||
{
|
||||
simple_schema s;
|
||||
auto tmp = tmpdir();
|
||||
auto sst = create_sstable(env, s, tmp.path().string());
|
||||
|
||||
auto reader1 = reader_wrapper(semaphore, s.schema(), sst);
|
||||
|
||||
reader1().get();
|
||||
|
||||
BOOST_REQUIRE_LE(semaphore.available_resources().count, 1);
|
||||
BOOST_REQUIRE_LE(semaphore.available_resources().memory, 0);
|
||||
BOOST_REQUIRE_EQUAL(reader1.call_count(), 1);
|
||||
|
||||
auto reader2 = reader_wrapper(semaphore, s.schema(), sst);
|
||||
auto read2_fut = reader2();
|
||||
|
||||
// reader2 shouldn't be allowed yet
|
||||
BOOST_REQUIRE_EQUAL(reader2.call_count(), 0);
|
||||
BOOST_REQUIRE_EQUAL(semaphore.waiters(), 1);
|
||||
|
||||
auto reader3 = reader_wrapper(semaphore, s.schema(), sst);
|
||||
auto read3_fut = reader3();
|
||||
|
||||
// reader3 shouldn't be allowed yet
|
||||
BOOST_REQUIRE_EQUAL(reader3.call_count(), 0);
|
||||
BOOST_REQUIRE_EQUAL(semaphore.waiters(), 2);
|
||||
|
||||
// Move reader1 to the heap, so that we can safely destroy it.
|
||||
reader1.close().get();
|
||||
auto reader1_ptr = std::make_unique<reader_wrapper>(std::move(reader1));
|
||||
reader1_ptr.reset();
|
||||
|
||||
// reader1's destruction should've freed up enough memory for
|
||||
// reader2 by now.
|
||||
REQUIRE_EVENTUALLY_EQUAL(reader2.call_count(), 1);
|
||||
read2_fut.get();
|
||||
|
||||
// But reader3 should still not be allowed
|
||||
BOOST_REQUIRE_EQUAL(reader3.call_count(), 0);
|
||||
BOOST_REQUIRE_EQUAL(semaphore.waiters(), 1);
|
||||
|
||||
// Move reader2 to the heap, so that we can safely destroy it.
|
||||
reader2.close().get();
|
||||
auto reader2_ptr = std::make_unique<reader_wrapper>(std::move(reader2));
|
||||
reader2_ptr.reset();
|
||||
|
||||
// Again, reader2's destruction should've freed up enough memory
|
||||
// for reader3 by now.
|
||||
REQUIRE_EVENTUALLY_EQUAL(reader3.call_count(), 1);
|
||||
BOOST_REQUIRE_EQUAL(semaphore.waiters(), 0);
|
||||
read3_fut.get();
|
||||
|
||||
{
|
||||
BOOST_REQUIRE_LE(semaphore.available_resources().memory, 0);
|
||||
|
||||
// Already allowed readers should not be blocked anymore even if
|
||||
// there are no more units available.
|
||||
read3_fut = reader3();
|
||||
BOOST_REQUIRE_EQUAL(reader3.call_count(), 2);
|
||||
read3_fut.get();
|
||||
}
|
||||
reader3.close().get();
|
||||
}
|
||||
|
||||
// All units should have been deposited back.
|
||||
REQUIRE_EVENTUALLY_EQUAL(new_reader_base_cost, semaphore.available_resources().memory);
|
||||
});
|
||||
}
|
||||
|
||||
SEASTAR_TEST_CASE(restricted_reader_create_reader) {
|
||||
return sstables::test_env::do_with_async([&] (sstables::test_env& env) {
|
||||
reader_concurrency_semaphore semaphore(100, new_reader_base_cost, get_name());
|
||||
auto stop_sem = deferred_stop(semaphore);
|
||||
|
||||
{
|
||||
simple_schema s;
|
||||
auto tmp = tmpdir();
|
||||
auto sst = create_sstable(env, s, tmp.path().string());
|
||||
|
||||
{
|
||||
auto reader = reader_wrapper(semaphore, s.schema(), sst);
|
||||
auto close_reader = deferred_close(reader);
|
||||
// This fast-forward is stupid, I know but the
|
||||
// underlying dummy reader won't care, so it's fine.
|
||||
reader.fast_forward_to(query::full_partition_range).get();
|
||||
|
||||
BOOST_REQUIRE(reader.created());
|
||||
BOOST_REQUIRE_EQUAL(reader.call_count(), 0);
|
||||
BOOST_REQUIRE_EQUAL(reader.ff_count(), 1);
|
||||
}
|
||||
|
||||
{
|
||||
auto reader = reader_wrapper(semaphore, s.schema(), sst);
|
||||
auto close_reader = deferred_close(reader);
|
||||
reader().get();
|
||||
|
||||
BOOST_REQUIRE(reader.created());
|
||||
BOOST_REQUIRE_EQUAL(reader.call_count(), 1);
|
||||
BOOST_REQUIRE_EQUAL(reader.ff_count(), 0);
|
||||
}
|
||||
}
|
||||
|
||||
REQUIRE_EVENTUALLY_EQUAL(new_reader_base_cost, semaphore.available_resources().memory);
|
||||
});
|
||||
}
|
||||
|
||||
SEASTAR_TEST_CASE(test_restricted_reader_as_mutation_source) {
|
||||
return seastar::async([test_name = get_name()] {
|
||||
auto make_restricted_populator = [] (schema_ptr s, const std::vector<mutation> &muts) {
|
||||
auto mt = make_lw_shared<memtable>(s);
|
||||
for (auto &&mut : muts) {
|
||||
mt->apply(mut);
|
||||
}
|
||||
|
||||
auto ms = mt->as_data_source();
|
||||
return mutation_source([ms = std::move(ms)](schema_ptr schema,
|
||||
reader_permit permit,
|
||||
const dht::partition_range& range,
|
||||
const query::partition_slice& slice,
|
||||
const io_priority_class& pc,
|
||||
tracing::trace_state_ptr tr,
|
||||
streamed_mutation::forwarding fwd,
|
||||
mutation_reader::forwarding fwd_mr) {
|
||||
return make_restricted_flat_reader(std::move(ms), std::move(schema), std::move(permit), range, slice, pc, tr,
|
||||
fwd, fwd_mr);
|
||||
});
|
||||
};
|
||||
run_mutation_source_tests(make_restricted_populator);
|
||||
});
|
||||
}
|
||||
|
||||
static mutation compacted(const mutation& m) {
|
||||
auto result = m;
|
||||
result.partition().compact_for_compaction(*result.schema(), always_gc, gc_clock::now());
|
||||
@@ -2625,7 +2353,7 @@ SEASTAR_THREAD_TEST_CASE(test_multishard_streaming_reader) {
|
||||
auto& local_partitioner = schema->get_sharder();
|
||||
auto remote_partitioner = dht::sharder(local_partitioner.shard_count() - 1, local_partitioner.sharding_ignore_msb());
|
||||
|
||||
auto tested_reader = make_multishard_streaming_reader(env.db(), schema,
|
||||
auto tested_reader = make_multishard_streaming_reader(env.db(), schema, make_reader_permit(env),
|
||||
[sharder = dht::selective_token_range_sharder(remote_partitioner, token_range, 0)] () mutable -> std::optional<dht::partition_range> {
|
||||
if (auto next = sharder.next()) {
|
||||
return dht::to_partition_range(*next);
|
||||
@@ -3191,7 +2919,7 @@ SEASTAR_THREAD_TEST_CASE(test_evictable_reader_trim_range_tombstones) {
|
||||
reader_concurrency_semaphore semaphore(reader_concurrency_semaphore::no_limits{}, get_name());
|
||||
auto stop_sem = deferred_stop(semaphore);
|
||||
simple_schema s;
|
||||
auto permit = semaphore.make_permit(s.schema().get(), get_name());
|
||||
auto permit = semaphore.make_tracking_only_permit(s.schema().get(), get_name());
|
||||
|
||||
const auto pkey = s.make_pkey();
|
||||
size_t max_buffer_size = 512;
|
||||
@@ -3285,7 +3013,7 @@ SEASTAR_THREAD_TEST_CASE(test_evictable_reader_self_validation) {
|
||||
reader_concurrency_semaphore semaphore(reader_concurrency_semaphore::no_limits{}, get_name());
|
||||
auto stop_sem = deferred_stop(semaphore);
|
||||
simple_schema s;
|
||||
auto permit = semaphore.make_permit(s.schema().get(), get_name());
|
||||
auto permit = semaphore.make_tracking_only_permit(s.schema().get(), get_name());
|
||||
|
||||
auto pkeys = s.make_pkeys(4);
|
||||
std::ranges::sort(pkeys, dht::decorated_key::less_comparator(s.schema()));
|
||||
@@ -3643,7 +3371,7 @@ SEASTAR_THREAD_TEST_CASE(test_evictable_reader_recreate_before_fast_forward_to)
|
||||
reader_concurrency_semaphore semaphore(reader_concurrency_semaphore::no_limits{}, get_name());
|
||||
auto stop_sem = deferred_stop(semaphore);
|
||||
simple_schema s;
|
||||
auto permit = semaphore.make_permit(s.schema().get(), get_name());
|
||||
auto permit = semaphore.make_tracking_only_permit(s.schema().get(), get_name());
|
||||
auto pkeys = s.make_pkeys(6);
|
||||
boost::sort(pkeys, dht::decorated_key::less_comparator(s.schema()));
|
||||
|
||||
@@ -3694,7 +3422,7 @@ SEASTAR_THREAD_TEST_CASE(test_evictable_reader_drop_flags) {
|
||||
reader_concurrency_semaphore semaphore(1, 0, get_name());
|
||||
auto stop_sem = deferred_stop(semaphore);
|
||||
simple_schema s;
|
||||
auto permit = semaphore.make_permit(s.schema().get(), get_name());
|
||||
auto permit = semaphore.make_tracking_only_permit(s.schema().get(), get_name());
|
||||
|
||||
auto pkeys = s.make_pkeys(2);
|
||||
std::sort(pkeys.begin(), pkeys.end(), [&s] (const auto& pk1, const auto& pk2) {
|
||||
|
||||
@@ -116,7 +116,7 @@ private:
|
||||
Querier make_querier(const dht::partition_range& range) {
|
||||
return Querier(_mutation_source,
|
||||
_s.schema(),
|
||||
_sem.make_permit(_s.schema().get(), "make-querier"),
|
||||
_sem.make_tracking_only_permit(_s.schema().get(), "make-querier"),
|
||||
range,
|
||||
_s.schema()->full_slice(),
|
||||
service::get_local_sstable_query_read_priority(),
|
||||
@@ -757,16 +757,13 @@ SEASTAR_THREAD_TEST_CASE(test_immediate_evict_on_insert) {
|
||||
test_querier_cache t;
|
||||
|
||||
auto& sem = t.get_semaphore();
|
||||
auto permit1 = sem.make_permit(t.get_schema().get(), get_name());
|
||||
auto permit2 = sem.make_permit(t.get_schema().get(), get_name());
|
||||
|
||||
permit1.wait_admission(0, db::no_timeout).get();
|
||||
auto permit1 = sem.obtain_permit(t.get_schema().get(), get_name(), 0, db::no_timeout).get0();
|
||||
|
||||
auto resources = permit1.consume_resources(reader_resources(sem.available_resources().count, 0));
|
||||
|
||||
BOOST_CHECK_EQUAL(sem.available_resources().count, 0);
|
||||
|
||||
auto fut = permit2.wait_admission(1, db::no_timeout);
|
||||
auto fut = sem.obtain_permit(t.get_schema().get(), get_name(), 1, db::no_timeout);
|
||||
|
||||
BOOST_CHECK_EQUAL(sem.waiters(), 1);
|
||||
|
||||
@@ -792,8 +789,8 @@ SEASTAR_THREAD_TEST_CASE(test_unique_inactive_read_handle) {
|
||||
.with_column("v", int32_type)
|
||||
.build();
|
||||
|
||||
auto sem1_h1 = sem1.register_inactive_read(make_empty_flat_reader(schema, sem1.make_permit(schema.get(), get_name())));
|
||||
auto sem2_h1 = sem2.register_inactive_read(make_empty_flat_reader(schema, sem2.make_permit(schema.get(), get_name())));
|
||||
auto sem1_h1 = sem1.register_inactive_read(make_empty_flat_reader(schema, sem1.make_tracking_only_permit(schema.get(), get_name())));
|
||||
auto sem2_h1 = sem2.register_inactive_read(make_empty_flat_reader(schema, sem2.make_tracking_only_permit(schema.get(), get_name())));
|
||||
|
||||
// Sanity check that lookup still works with empty handle.
|
||||
BOOST_REQUIRE(!sem1.unregister_inactive_read(reader_concurrency_semaphore::inactive_read_handle{}));
|
||||
|
||||
@@ -38,7 +38,7 @@ SEASTAR_THREAD_TEST_CASE(test_reader_concurrency_semaphore_clear_inactive_reads)
|
||||
auto stop_sem = deferred_stop(semaphore);
|
||||
|
||||
for (int i = 0; i < 10; ++i) {
|
||||
handles.emplace_back(semaphore.register_inactive_read(make_empty_flat_reader(s.schema(), semaphore.make_permit(s.schema().get(), get_name()))));
|
||||
handles.emplace_back(semaphore.register_inactive_read(make_empty_flat_reader(s.schema(), semaphore.make_tracking_only_permit(s.schema().get(), get_name()))));
|
||||
}
|
||||
|
||||
BOOST_REQUIRE(std::all_of(handles.begin(), handles.end(), [] (const reader_concurrency_semaphore::inactive_read_handle& handle) { return bool(handle); }));
|
||||
@@ -50,7 +50,7 @@ SEASTAR_THREAD_TEST_CASE(test_reader_concurrency_semaphore_clear_inactive_reads)
|
||||
handles.clear();
|
||||
|
||||
for (int i = 0; i < 10; ++i) {
|
||||
handles.emplace_back(semaphore.register_inactive_read(make_empty_flat_reader(s.schema(), semaphore.make_permit(s.schema().get(), get_name()))));
|
||||
handles.emplace_back(semaphore.register_inactive_read(make_empty_flat_reader(s.schema(), semaphore.make_tracking_only_permit(s.schema().get(), get_name()))));
|
||||
}
|
||||
|
||||
BOOST_REQUIRE(std::all_of(handles.begin(), handles.end(), [] (const reader_concurrency_semaphore::inactive_read_handle& handle) { return bool(handle); }));
|
||||
@@ -68,39 +68,37 @@ SEASTAR_THREAD_TEST_CASE(test_reader_concurrency_semaphore_destroyed_permit_rele
|
||||
|
||||
// Not admitted, active
|
||||
{
|
||||
auto permit = semaphore.make_permit(s.schema().get(), get_name());
|
||||
auto permit = semaphore.make_tracking_only_permit(s.schema().get(), get_name());
|
||||
auto units2 = permit.consume_memory(1024);
|
||||
}
|
||||
BOOST_REQUIRE(semaphore.available_resources() == initial_resources);
|
||||
BOOST_REQUIRE_EQUAL(semaphore.available_resources(), initial_resources);
|
||||
|
||||
// Not admitted, inactive
|
||||
{
|
||||
auto permit = semaphore.make_permit(s.schema().get(), get_name());
|
||||
auto permit = semaphore.make_tracking_only_permit(s.schema().get(), get_name());
|
||||
auto units2 = permit.consume_memory(1024);
|
||||
|
||||
auto handle = semaphore.register_inactive_read(make_empty_flat_reader(s.schema(), permit));
|
||||
BOOST_REQUIRE(semaphore.try_evict_one_inactive_read());
|
||||
}
|
||||
BOOST_REQUIRE(semaphore.available_resources() == initial_resources);
|
||||
BOOST_REQUIRE_EQUAL(semaphore.available_resources(), initial_resources);
|
||||
|
||||
// Admitted, active
|
||||
{
|
||||
auto permit = semaphore.make_permit(s.schema().get(), get_name());
|
||||
auto units1 = permit.wait_admission(1024, db::no_timeout).get0();
|
||||
auto units2 = permit.consume_memory(1024);
|
||||
auto permit = semaphore.obtain_permit(s.schema().get(), get_name(), 1024, db::no_timeout).get0();
|
||||
auto units1 = permit.consume_memory(1024);
|
||||
}
|
||||
BOOST_REQUIRE(semaphore.available_resources() == initial_resources);
|
||||
BOOST_REQUIRE_EQUAL(semaphore.available_resources(), initial_resources);
|
||||
|
||||
// Admitted, inactive
|
||||
{
|
||||
auto permit = semaphore.make_permit(s.schema().get(), get_name());
|
||||
auto units1 = permit.wait_admission(1024, db::no_timeout).get0();
|
||||
auto units2 = permit.consume_memory(1024);
|
||||
auto permit = semaphore.obtain_permit(s.schema().get(), get_name(), 1024, db::no_timeout).get0();
|
||||
auto units1 = permit.consume_memory(1024);
|
||||
|
||||
auto handle = semaphore.register_inactive_read(make_empty_flat_reader(s.schema(), permit));
|
||||
BOOST_REQUIRE(semaphore.try_evict_one_inactive_read());
|
||||
}
|
||||
BOOST_REQUIRE(semaphore.available_resources() == initial_resources);
|
||||
BOOST_REQUIRE_EQUAL(semaphore.available_resources(), initial_resources);
|
||||
}
|
||||
|
||||
SEASTAR_THREAD_TEST_CASE(test_reader_concurrency_semaphore_abandoned_handle_closes_reader) {
|
||||
@@ -108,7 +106,7 @@ SEASTAR_THREAD_TEST_CASE(test_reader_concurrency_semaphore_abandoned_handle_clos
|
||||
reader_concurrency_semaphore semaphore(reader_concurrency_semaphore::no_limits{}, get_name());
|
||||
auto stop_sem = deferred_stop(semaphore);
|
||||
|
||||
auto permit = semaphore.make_permit(s.schema().get(), get_name());
|
||||
auto permit = semaphore.make_tracking_only_permit(s.schema().get(), get_name());
|
||||
{
|
||||
auto handle = semaphore.register_inactive_read(make_empty_flat_reader(s.schema(), permit));
|
||||
// The handle is destroyed here, triggering the destrution of the inactive read.
|
||||
@@ -124,54 +122,52 @@ SEASTAR_THREAD_TEST_CASE(test_reader_concurrency_semaphore_abandoned_handle_clos
|
||||
SEASTAR_THREAD_TEST_CASE(test_reader_concurrency_semaphore_readmission_preserves_units) {
|
||||
simple_schema s;
|
||||
const auto initial_resources = reader_concurrency_semaphore::resources{10, 1024 * 1024};
|
||||
const auto base_resources = reader_concurrency_semaphore::resources{1, 1024};
|
||||
reader_concurrency_semaphore semaphore(initial_resources.count, initial_resources.memory, get_name());
|
||||
|
||||
auto stop_sem = deferred_stop(semaphore);
|
||||
auto permit = semaphore.make_permit(s.schema().get(), get_name());
|
||||
|
||||
reader_permit_opt permit = semaphore.obtain_permit(s.schema().get(), get_name(), 1024, db::no_timeout).get();
|
||||
BOOST_REQUIRE_EQUAL(permit->consumed_resources(), base_resources);
|
||||
|
||||
std::optional<reader_permit::resource_units> residue_units;
|
||||
|
||||
for (int i = 0; i < 10; ++i) {
|
||||
const auto have_residue_units = bool(residue_units);
|
||||
residue_units.emplace(permit->consume_resources(reader_resources(0, 100)));
|
||||
BOOST_REQUIRE_EQUAL(semaphore.available_resources(), initial_resources - permit->consumed_resources());
|
||||
|
||||
auto current_resources = initial_resources;
|
||||
if (have_residue_units) {
|
||||
current_resources -= residue_units->resources();
|
||||
}
|
||||
BOOST_REQUIRE(semaphore.available_resources() == current_resources);
|
||||
auto handle = semaphore.register_inactive_read(make_empty_flat_reader(s.schema(), *permit));
|
||||
BOOST_REQUIRE(semaphore.try_evict_one_inactive_read());
|
||||
BOOST_REQUIRE_EQUAL(permit->consumed_resources(), residue_units->resources());
|
||||
|
||||
BOOST_REQUIRE_EQUAL(semaphore.available_resources(), initial_resources - permit->consumed_resources());
|
||||
|
||||
std::optional<reader_permit::resource_units> admitted_units;
|
||||
if (i % 2) {
|
||||
const auto consumed_resources = semaphore.available_resources();
|
||||
semaphore.consume(consumed_resources);
|
||||
|
||||
auto units_fut = permit.wait_admission(1024, db::no_timeout);
|
||||
BOOST_REQUIRE(!units_fut.available());
|
||||
auto fut = permit->maybe_wait_readmission(db::no_timeout);
|
||||
BOOST_REQUIRE(!fut.available());
|
||||
|
||||
semaphore.signal(consumed_resources);
|
||||
admitted_units = units_fut.get();
|
||||
fut.get();
|
||||
} else {
|
||||
admitted_units = permit.wait_admission(1024, db::no_timeout).get();
|
||||
permit->maybe_wait_readmission(db::no_timeout).get();
|
||||
}
|
||||
|
||||
current_resources -= admitted_units->resources();
|
||||
BOOST_REQUIRE(semaphore.available_resources() == current_resources);
|
||||
|
||||
residue_units.emplace(permit.consume_resources(reader_resources(0, 100)));
|
||||
if (!have_residue_units) {
|
||||
current_resources -= residue_units->resources();
|
||||
}
|
||||
BOOST_REQUIRE(semaphore.available_resources() == current_resources);
|
||||
|
||||
auto handle = semaphore.register_inactive_read(make_empty_flat_reader(s.schema(), permit));
|
||||
BOOST_REQUIRE(semaphore.try_evict_one_inactive_read());
|
||||
BOOST_REQUIRE_EQUAL(permit->consumed_resources(), residue_units->resources() + base_resources);
|
||||
BOOST_REQUIRE_EQUAL(semaphore.available_resources(), initial_resources - permit->consumed_resources());
|
||||
}
|
||||
|
||||
BOOST_REQUIRE(semaphore.available_resources() == initial_resources - residue_units->resources());
|
||||
BOOST_REQUIRE_EQUAL(semaphore.available_resources(), initial_resources - permit->consumed_resources());
|
||||
|
||||
residue_units.reset();
|
||||
|
||||
BOOST_REQUIRE(semaphore.available_resources() == initial_resources);
|
||||
BOOST_REQUIRE_EQUAL(semaphore.available_resources(), initial_resources - permit->consumed_resources());
|
||||
|
||||
permit = {};
|
||||
|
||||
BOOST_REQUIRE_EQUAL(semaphore.available_resources(), initial_resources);
|
||||
}
|
||||
|
||||
// This unit test checks that the semaphore doesn't get into a deadlock
|
||||
@@ -188,14 +184,14 @@ SEASTAR_THREAD_TEST_CASE(test_reader_concurrency_semaphore_readmission_preserves
|
||||
SEASTAR_THREAD_TEST_CASE(test_reader_concurrency_semaphore_forward_progress) {
|
||||
class reader {
|
||||
class skeleton_reader : public flat_mutation_reader::impl {
|
||||
reader_permit::resource_units _base_resources;
|
||||
std::optional<reader_permit::resource_units> _resources;
|
||||
public:
|
||||
skeleton_reader(schema_ptr s, reader_permit permit, reader_permit::resource_units res)
|
||||
: impl(std::move(s), std::move(permit)), _base_resources(std::move(res)) { }
|
||||
skeleton_reader(schema_ptr s, reader_permit permit)
|
||||
: impl(std::move(s), std::move(permit)) { }
|
||||
virtual future<> fill_buffer(db::timeout_clock::time_point timeout) override {
|
||||
reader_permit::blocked_guard _{_permit};
|
||||
_resources.emplace(_permit.consume_resources(reader_resources(0, tests::random::get_int(1024, 2048))));
|
||||
return make_ready_future<>();
|
||||
co_await sleep(std::chrono::milliseconds(1));
|
||||
}
|
||||
virtual future<> next_partition() override { return make_ready_future<>(); }
|
||||
virtual future<> fast_forward_to(const dht::partition_range& pr, db::timeout_clock::time_point timeout) override { return make_ready_future<>(); }
|
||||
@@ -214,48 +210,53 @@ SEASTAR_THREAD_TEST_CASE(test_reader_concurrency_semaphore_forward_progress) {
|
||||
|
||||
private:
|
||||
schema_ptr _schema;
|
||||
reader_permit _permit;
|
||||
reader_concurrency_semaphore& _semaphore;
|
||||
bool _memory_only = true;
|
||||
bool _evictable = false;
|
||||
reader_permit_opt _permit;
|
||||
std::optional<reader_permit::resource_units> _units;
|
||||
std::variant<std::monostate, flat_mutation_reader, reader_concurrency_semaphore::inactive_read_handle> _reader;
|
||||
|
||||
private:
|
||||
future<> make_reader() {
|
||||
auto res = _permit.consume_memory();
|
||||
if (!_memory_only) {
|
||||
res = co_await _permit.wait_admission(1024, db::no_timeout);
|
||||
}
|
||||
_reader = make_flat_mutation_reader<skeleton_reader>(_schema, _permit, std::move(res));
|
||||
void make_reader() {
|
||||
_reader = make_flat_mutation_reader<skeleton_reader>(_schema, *_permit);
|
||||
}
|
||||
future<> tick(std::monostate&) {
|
||||
co_await make_reader();
|
||||
make_reader();
|
||||
co_await tick(std::get<flat_mutation_reader>(_reader));
|
||||
}
|
||||
future<> tick(flat_mutation_reader& reader) {
|
||||
co_await reader.fill_buffer(db::no_timeout);
|
||||
if (_evictable) {
|
||||
_reader = _permit.semaphore().register_inactive_read(std::move(reader));
|
||||
_reader = _permit->semaphore().register_inactive_read(std::move(reader));
|
||||
}
|
||||
}
|
||||
future<> tick(reader_concurrency_semaphore::inactive_read_handle& handle) {
|
||||
if (auto reader = _permit.semaphore().unregister_inactive_read(std::move(handle)); reader) {
|
||||
if (auto reader = _permit->semaphore().unregister_inactive_read(std::move(handle)); reader) {
|
||||
_reader = std::move(*reader);
|
||||
} else {
|
||||
co_await make_reader();
|
||||
co_await _permit->maybe_wait_readmission(db::no_timeout);
|
||||
make_reader();
|
||||
}
|
||||
co_await tick(std::get<flat_mutation_reader>(_reader));
|
||||
}
|
||||
|
||||
public:
|
||||
reader(schema_ptr s, reader_permit permit, bool memory_only, bool evictable)
|
||||
reader(schema_ptr s, reader_concurrency_semaphore& semaphore, bool memory_only, bool evictable)
|
||||
: _schema(std::move(s))
|
||||
, _permit(std::move(permit))
|
||||
, _semaphore(semaphore)
|
||||
, _memory_only(memory_only)
|
||||
, _evictable(evictable)
|
||||
, _units(_permit.consume_memory(tests::random::get_int(128, 1024)))
|
||||
{
|
||||
}
|
||||
future<> obtain_permit() {
|
||||
if (_memory_only) {
|
||||
_permit = _semaphore.make_tracking_only_permit(_schema.get(), "reader_m");
|
||||
} else {
|
||||
_permit = co_await _semaphore.obtain_permit(_schema.get(), fmt::format("reader_{}", _evictable ? 'e' : 'a'), 1024, db::no_timeout);
|
||||
}
|
||||
_units = _permit->consume_memory(tests::random::get_int(128, 1024));
|
||||
}
|
||||
future<> tick() {
|
||||
return std::visit(reader_visitor{*this}, _reader);
|
||||
}
|
||||
@@ -267,15 +268,22 @@ SEASTAR_THREAD_TEST_CASE(test_reader_concurrency_semaphore_forward_progress) {
|
||||
}
|
||||
};
|
||||
|
||||
#ifdef DEBUG
|
||||
const auto count = 10;
|
||||
const auto num_readers = 512;
|
||||
const auto ticks = 1000;
|
||||
const auto ticks = 200;
|
||||
#else
|
||||
const auto count = 10;
|
||||
const auto num_readers = 128;
|
||||
const auto ticks = 10;
|
||||
#endif
|
||||
|
||||
simple_schema s;
|
||||
reader_concurrency_semaphore semaphore(count, count * 1024, get_name());
|
||||
auto stop_sem = deferred_stop(semaphore);
|
||||
|
||||
std::list<std::optional<reader>> readers;
|
||||
std::vector<std::unique_ptr<reader>> readers;
|
||||
|
||||
unsigned nr_memory_only = 0;
|
||||
unsigned nr_admitted = 0;
|
||||
unsigned nr_evictable = 0;
|
||||
@@ -290,7 +298,7 @@ SEASTAR_THREAD_TEST_CASE(test_reader_concurrency_semaphore_forward_progress) {
|
||||
} else {
|
||||
++nr_admitted;
|
||||
}
|
||||
readers.emplace_back(reader(s.schema(), semaphore.make_permit(s.schema().get(), fmt::format("reader{}", i)), memory_only, evictable));
|
||||
readers.emplace_back(std::make_unique<reader>(s.schema(), semaphore, memory_only, evictable));
|
||||
}
|
||||
|
||||
testlog.info("Created {} readers, memory_only={}, admitted={}, evictable={}", readers.size(), nr_memory_only, nr_admitted, nr_evictable);
|
||||
@@ -305,13 +313,25 @@ SEASTAR_THREAD_TEST_CASE(test_reader_concurrency_semaphore_forward_progress) {
|
||||
});
|
||||
watchdog.arm_periodic(std::chrono::seconds(30));
|
||||
|
||||
parallel_for_each(readers, [&] (std::optional<reader>& r) -> future<> {
|
||||
parallel_for_each(readers, [&] (std::unique_ptr<reader>& r_) -> future<> {
|
||||
auto r = std::move(r_);
|
||||
try {
|
||||
co_await r->obtain_permit();
|
||||
} catch (semaphore_timed_out&) {
|
||||
semaphore.broken(std::make_exception_ptr(std::runtime_error("test failed due to read timeout")));
|
||||
co_return;
|
||||
}
|
||||
|
||||
for (auto i = 0; i < ticks; ++i) {
|
||||
watchdog_touched = true;
|
||||
co_await r->tick();
|
||||
try {
|
||||
watchdog_touched = true;
|
||||
co_await r->tick();
|
||||
} catch (semaphore_timed_out&) {
|
||||
semaphore.broken(std::make_exception_ptr(std::runtime_error("test failed due to read timeout")));
|
||||
break;
|
||||
}
|
||||
}
|
||||
co_await r->close();
|
||||
r.reset();
|
||||
watchdog_touched = true;
|
||||
}).get();
|
||||
}
|
||||
@@ -378,8 +398,7 @@ SEASTAR_TEST_CASE(reader_restriction_file_tracking) {
|
||||
return async([&] {
|
||||
reader_concurrency_semaphore semaphore(100, 4 * 1024, get_name());
|
||||
auto stop_sem = deferred_stop(semaphore);
|
||||
auto permit = semaphore.make_permit(nullptr, get_name());
|
||||
permit.wait_admission(0, db::no_timeout).get();
|
||||
auto permit = semaphore.obtain_permit(nullptr, get_name(), 0, db::no_timeout).get();
|
||||
|
||||
{
|
||||
auto tracked_file = make_tracked_file(file(shared_ptr<file_impl>(make_shared<dummy_file_impl>())), permit);
|
||||
@@ -439,14 +458,11 @@ SEASTAR_TEST_CASE(reader_concurrency_semaphore_timeout) {
|
||||
{
|
||||
auto timeout = db::timeout_clock::now() + std::chrono::duration_cast<db::timeout_clock::time_point::duration>(std::chrono::milliseconds{1});
|
||||
|
||||
auto permit1 = semaphore.make_permit(nullptr, "permit1");
|
||||
std::optional<reader_permit::resource_units> permit1_res = permit1.wait_admission(new_reader_base_cost, timeout).get();
|
||||
reader_permit_opt permit1 = semaphore.obtain_permit(nullptr, "permit1", new_reader_base_cost, timeout).get();
|
||||
|
||||
auto permit2 = semaphore.make_permit(nullptr, "permit2");
|
||||
auto permit2_fut = permit2.wait_admission(new_reader_base_cost, timeout);
|
||||
auto permit2_fut = semaphore.obtain_permit(nullptr, "permit2", new_reader_base_cost, timeout);
|
||||
|
||||
auto permit3 = semaphore.make_permit(nullptr, "permit3");
|
||||
auto permit3_fut = permit3.wait_admission(new_reader_base_cost, timeout);
|
||||
auto permit3_fut = semaphore.obtain_permit(nullptr, "permit3", new_reader_base_cost, timeout);
|
||||
|
||||
BOOST_REQUIRE_EQUAL(semaphore.waiters(), 2);
|
||||
|
||||
@@ -459,7 +475,7 @@ SEASTAR_TEST_CASE(reader_concurrency_semaphore_timeout) {
|
||||
} else {
|
||||
// We need special cleanup when the test failed to avoid invalid
|
||||
// memory access.
|
||||
permit1_res.reset();
|
||||
permit1 = {};
|
||||
|
||||
BOOST_CHECK(eventually_true([&] { return permit2_fut.available(); }));
|
||||
{
|
||||
@@ -484,24 +500,20 @@ SEASTAR_TEST_CASE(reader_concurrency_semaphore_max_queue_length) {
|
||||
auto stop_sem = deferred_stop(semaphore);
|
||||
|
||||
{
|
||||
auto permit1 = semaphore.make_permit(nullptr, "permit1");
|
||||
auto permit1_res = permit1.wait_admission(new_reader_base_cost, db::no_timeout).get();
|
||||
reader_permit_opt permit1 = semaphore.obtain_permit(nullptr, "permit1", new_reader_base_cost, db::no_timeout).get();
|
||||
|
||||
auto permit2 = semaphore.make_permit(nullptr, "permit2");
|
||||
auto permit2_fut = permit2.wait_admission(new_reader_base_cost, db::no_timeout);
|
||||
auto permit2_fut = semaphore.obtain_permit(nullptr, "permit2", new_reader_base_cost, db::no_timeout);
|
||||
|
||||
auto permit3 = semaphore.make_permit(nullptr, "permit3");
|
||||
auto permit3_fut = permit3.wait_admission(new_reader_base_cost, db::no_timeout);
|
||||
auto permit3_fut = semaphore.obtain_permit(nullptr, "permit3", new_reader_base_cost, db::no_timeout);
|
||||
|
||||
BOOST_REQUIRE_EQUAL(semaphore.waiters(), 2);
|
||||
|
||||
auto permit4 = semaphore.make_permit(nullptr, "permit4");
|
||||
auto permit4_fut = permit4.wait_admission(new_reader_base_cost, db::no_timeout);
|
||||
auto permit4_fut = semaphore.obtain_permit(nullptr, "permit4", new_reader_base_cost, db::no_timeout);
|
||||
|
||||
// The queue should now be full.
|
||||
BOOST_REQUIRE_THROW(permit4_fut.get(), std::runtime_error);
|
||||
|
||||
permit1_res.reset();
|
||||
permit1 = {};
|
||||
{
|
||||
auto res = permit2_fut.get0();
|
||||
}
|
||||
@@ -536,7 +548,7 @@ SEASTAR_THREAD_TEST_CASE(reader_concurrency_semaphore_dump_reader_diganostics) {
|
||||
for (auto& schema : schemas) {
|
||||
const auto nr_permits = tests::random::get_int<unsigned>(2, 32);
|
||||
for (unsigned i = 0; i < nr_permits; ++i) {
|
||||
auto permit = semaphore.make_permit(schema.get(), op_names.at(tests::random::get_int<unsigned>(0, nr_ops - 1)));
|
||||
auto permit = semaphore.make_tracking_only_permit(schema.get(), op_names.at(tests::random::get_int<unsigned>(0, nr_ops - 1)));
|
||||
if (tests::random::get_int<unsigned>(0, 4)) {
|
||||
auto units = permit.consume_resources(reader_resources(tests::random::get_int<unsigned>(0, 1), tests::random::get_int<unsigned>(1024, 16 * 1024 * 1024)));
|
||||
permits.push_back(std::pair(std::move(permit), std::move(units)));
|
||||
@@ -552,7 +564,6 @@ SEASTAR_THREAD_TEST_CASE(reader_concurrency_semaphore_dump_reader_diganostics) {
|
||||
testlog.info("With max-lines=4: {}", semaphore.dump_diagnostics(4));
|
||||
testlog.info("With no max-lines: {}", semaphore.dump_diagnostics(0));
|
||||
}
|
||||
|
||||
SEASTAR_THREAD_TEST_CASE(test_reader_concurrency_semaphore_stop_waits_on_permits) {
|
||||
BOOST_TEST_MESSAGE("unused");
|
||||
{
|
||||
@@ -570,7 +581,7 @@ SEASTAR_THREAD_TEST_CASE(test_reader_concurrency_semaphore_stop_waits_on_permits
|
||||
BOOST_TEST_MESSAGE("1 permit");
|
||||
{
|
||||
auto semaphore = std::make_unique<reader_concurrency_semaphore>(reader_concurrency_semaphore::no_limits{}, get_name());
|
||||
auto permit = std::make_unique<reader_permit>(semaphore->make_permit(nullptr, "permit1"));
|
||||
auto permit = std::make_unique<reader_permit>(semaphore->make_tracking_only_permit(nullptr, "permit1"));
|
||||
|
||||
// Test will fail via use-after-free
|
||||
auto f = semaphore->stop().then([semaphore = std::move(semaphore)] { });
|
||||
@@ -583,3 +594,326 @@ SEASTAR_THREAD_TEST_CASE(test_reader_concurrency_semaphore_stop_waits_on_permits
|
||||
f.get();
|
||||
}
|
||||
}
|
||||
|
||||
SEASTAR_THREAD_TEST_CASE(test_reader_concurrency_semaphore_admission) {
|
||||
simple_schema s;
|
||||
const auto schema_ptr = s.schema().get();
|
||||
const auto initial_resources = reader_concurrency_semaphore::resources{2, 2 * 1024};
|
||||
reader_concurrency_semaphore semaphore(initial_resources.count, initial_resources.memory, get_name());
|
||||
auto stop_sem = deferred_stop(semaphore);
|
||||
|
||||
auto require_can_admit = [&] (bool expected_can_admit, const char* description,
|
||||
std::experimental::source_location sl = std::experimental::source_location::current()) {
|
||||
testlog.trace("Running admission scenario {}, with exepcted_can_admit={}", description, expected_can_admit);
|
||||
const auto stats_before = semaphore.get_stats();
|
||||
|
||||
auto admit_fut = semaphore.obtain_permit(schema_ptr, get_name(), 1024, db::timeout_clock::now());
|
||||
admit_fut.wait();
|
||||
const bool can_admit = !admit_fut.failed();
|
||||
if (can_admit) {
|
||||
admit_fut.ignore_ready_future();
|
||||
} else {
|
||||
// Make sure we have a timeout exception, not something else
|
||||
BOOST_REQUIRE_THROW(std::rethrow_exception(admit_fut.get_exception()), semaphore_timed_out);
|
||||
}
|
||||
|
||||
const auto stats_after = semaphore.get_stats();
|
||||
BOOST_REQUIRE_EQUAL(stats_after.reads_admitted, stats_before.reads_admitted + uint64_t(can_admit));
|
||||
// Deliberately not checking `reads_enqueued`, a read can be enqueued temporarily during the admission process.
|
||||
|
||||
if (can_admit == expected_can_admit) {
|
||||
testlog.trace("admission scenario '{}' with expected_can_admit={} passed at {}:{}", description, expected_can_admit, sl.file_name(),
|
||||
sl.line());
|
||||
} else {
|
||||
BOOST_FAIL(fmt::format("admission scenario '{}' with expected_can_admit={} failed at {}:{}\ndiagnostics: {}", description,
|
||||
expected_can_admit, sl.file_name(), sl.line(), semaphore.dump_diagnostics()));
|
||||
}
|
||||
};
|
||||
|
||||
require_can_admit(true, "semaphore in initial state");
|
||||
|
||||
// resources and waitlist
|
||||
{
|
||||
reader_permit_opt permit = semaphore.obtain_permit(schema_ptr, get_name(), 1024, db::timeout_clock::now()).get();
|
||||
|
||||
require_can_admit(true, "enough resources");
|
||||
|
||||
const auto stats_before = semaphore.get_stats();
|
||||
|
||||
auto enqueued_permit_fut = semaphore.obtain_permit(schema_ptr, get_name(), 2 * 1024, db::no_timeout);
|
||||
{
|
||||
const auto stats_after = semaphore.get_stats();
|
||||
BOOST_REQUIRE(!enqueued_permit_fut.available());
|
||||
BOOST_REQUIRE_EQUAL(stats_after.reads_enqueued, stats_before.reads_enqueued + 1);
|
||||
BOOST_REQUIRE_EQUAL(stats_after.reads_admitted, stats_before.reads_admitted);
|
||||
BOOST_REQUIRE_EQUAL(semaphore.waiters(), 1);
|
||||
}
|
||||
|
||||
BOOST_REQUIRE(semaphore.available_resources().count >= 1);
|
||||
BOOST_REQUIRE(semaphore.available_resources().memory >= 1024);
|
||||
require_can_admit(false, "enough resources but waitlist not empty");
|
||||
|
||||
permit = {};
|
||||
|
||||
reader_permit _(enqueued_permit_fut.get());
|
||||
}
|
||||
BOOST_REQUIRE_EQUAL(semaphore.available_resources(), initial_resources);
|
||||
require_can_admit(true, "semaphore in initial state");
|
||||
|
||||
// used and blocked
|
||||
{
|
||||
auto permit = semaphore.obtain_permit(schema_ptr, get_name(), 1024, db::timeout_clock::now()).get();
|
||||
|
||||
require_can_admit(true, "!used");
|
||||
{
|
||||
reader_permit::used_guard ug{permit};
|
||||
|
||||
require_can_admit(false, "used > blocked");
|
||||
{
|
||||
reader_permit::blocked_guard bg{permit};
|
||||
require_can_admit(true, "used == blocked");
|
||||
}
|
||||
require_can_admit(false, "used > blocked");
|
||||
}
|
||||
require_can_admit(true, "!used");
|
||||
}
|
||||
BOOST_REQUIRE_EQUAL(semaphore.available_resources(), initial_resources);
|
||||
require_can_admit(true, "semaphore in initial state");
|
||||
|
||||
// forward progress -- resources
|
||||
{
|
||||
const auto resources = reader_resources::with_memory(semaphore.available_resources().memory);
|
||||
semaphore.consume(resources);
|
||||
require_can_admit(true, "semaphore with no memory but all count available");
|
||||
semaphore.signal(resources);
|
||||
}
|
||||
BOOST_REQUIRE_EQUAL(semaphore.available_resources(), initial_resources);
|
||||
require_can_admit(true, "semaphore in initial state");
|
||||
|
||||
// forward progress -- readmission
|
||||
{
|
||||
auto permit = semaphore.obtain_permit(schema_ptr, get_name(), 1024, db::timeout_clock::now()).get();
|
||||
|
||||
auto irh = semaphore.register_inactive_read(make_empty_flat_reader(s.schema(), permit));
|
||||
BOOST_REQUIRE(semaphore.try_evict_one_inactive_read());
|
||||
BOOST_REQUIRE(!irh);
|
||||
|
||||
reader_permit::used_guard _{permit};
|
||||
|
||||
const auto stats_before = semaphore.get_stats();
|
||||
|
||||
auto wait_fut = permit.maybe_wait_readmission(db::timeout_clock::now());
|
||||
wait_fut.wait();
|
||||
BOOST_REQUIRE(!wait_fut.failed());
|
||||
|
||||
const auto stats_after = semaphore.get_stats();
|
||||
BOOST_REQUIRE_EQUAL(stats_after.reads_admitted, stats_before.reads_admitted + 1);
|
||||
BOOST_REQUIRE_EQUAL(stats_after.reads_enqueued, stats_before.reads_enqueued);
|
||||
}
|
||||
BOOST_REQUIRE_EQUAL(semaphore.available_resources(), initial_resources);
|
||||
require_can_admit(true, "semaphore in initial state");
|
||||
|
||||
// inactive readers
|
||||
{
|
||||
auto permit = semaphore.obtain_permit(schema_ptr, get_name(), 1024, db::timeout_clock::now()).get();
|
||||
|
||||
require_can_admit(true, "!used");
|
||||
{
|
||||
auto irh = semaphore.register_inactive_read(make_empty_flat_reader(s.schema(), permit));
|
||||
require_can_admit(true, "inactive");
|
||||
|
||||
reader_permit::used_guard ug{permit};
|
||||
|
||||
require_can_admit(true, "inactive (used)");
|
||||
|
||||
{
|
||||
auto rd = semaphore.unregister_inactive_read(std::move(irh));
|
||||
rd->close().get();
|
||||
}
|
||||
|
||||
require_can_admit(false, "used > blocked");
|
||||
|
||||
irh = semaphore.register_inactive_read(make_empty_flat_reader(s.schema(), permit));
|
||||
require_can_admit(true, "inactive (used)");
|
||||
}
|
||||
require_can_admit(true, "!used");
|
||||
}
|
||||
BOOST_REQUIRE_EQUAL(semaphore.available_resources(), initial_resources);
|
||||
require_can_admit(true, "semaphore in initial state");
|
||||
|
||||
// evicting inactive readers for admission
|
||||
{
|
||||
auto permit1 = semaphore.obtain_permit(schema_ptr, get_name(), 1024, db::timeout_clock::now()).get();
|
||||
auto irh1 = semaphore.register_inactive_read(make_empty_flat_reader(s.schema(), permit1));
|
||||
|
||||
auto permit2 = semaphore.obtain_permit(schema_ptr, get_name(), 1024, db::timeout_clock::now()).get();
|
||||
auto irh2 = semaphore.register_inactive_read(make_empty_flat_reader(s.schema(), permit2));
|
||||
|
||||
require_can_admit(true, "evictable reads");
|
||||
}
|
||||
BOOST_REQUIRE_EQUAL(semaphore.available_resources(), initial_resources);
|
||||
require_can_admit(true, "semaphore in initial state");
|
||||
|
||||
auto check_admitting_enqueued_read = [&] (auto pre_admission_hook, auto post_enqueue_hook) {
|
||||
auto cookie1 = pre_admission_hook();
|
||||
|
||||
require_can_admit(false, "admission blocked");
|
||||
|
||||
const auto stats_before = semaphore.get_stats();
|
||||
|
||||
auto permit2_fut = semaphore.obtain_permit(schema_ptr, get_name(), 1024, db::no_timeout);
|
||||
|
||||
const auto stats_after = semaphore.get_stats();
|
||||
BOOST_REQUIRE_EQUAL(stats_after.reads_admitted, stats_before.reads_admitted);
|
||||
BOOST_REQUIRE_EQUAL(stats_after.reads_enqueued, stats_before.reads_enqueued + 1);
|
||||
BOOST_REQUIRE_EQUAL(semaphore.waiters(), 1);
|
||||
|
||||
auto cookie2 = post_enqueue_hook(cookie1);
|
||||
|
||||
if (!eventually_true([&] { return permit2_fut.available(); })) {
|
||||
semaphore.broken();
|
||||
permit2_fut.wait();
|
||||
permit2_fut.ignore_ready_future();
|
||||
BOOST_FAIL("Enqueued permit didn't get admitted as expected");
|
||||
}
|
||||
};
|
||||
|
||||
// admitting enqueued reads -- permit owning resources destroyed
|
||||
{
|
||||
check_admitting_enqueued_read(
|
||||
[&] {
|
||||
return reader_permit_opt(semaphore.obtain_permit(schema_ptr, get_name(), 2 * 1024, db::timeout_clock::now()).get());
|
||||
},
|
||||
[] (reader_permit_opt& permit1) {
|
||||
permit1 = {};
|
||||
return 0;
|
||||
}
|
||||
);
|
||||
}
|
||||
BOOST_REQUIRE_EQUAL(semaphore.available_resources(), initial_resources);
|
||||
require_can_admit(true, "semaphore in initial state");
|
||||
|
||||
// admitting enqueued reads -- permit owning resources becomes inactive
|
||||
{
|
||||
check_admitting_enqueued_read(
|
||||
[&] {
|
||||
return reader_permit_opt(semaphore.obtain_permit(schema_ptr, get_name(), 2 * 1024, db::timeout_clock::now()).get());
|
||||
},
|
||||
[&] (reader_permit_opt& permit1) {
|
||||
return semaphore.register_inactive_read(make_empty_flat_reader(s.schema(), *permit1));
|
||||
}
|
||||
);
|
||||
}
|
||||
BOOST_REQUIRE_EQUAL(semaphore.available_resources(), initial_resources);
|
||||
require_can_admit(true, "semaphore in initial state");
|
||||
|
||||
// admitting enqueued reads -- permit becomes unused
|
||||
{
|
||||
check_admitting_enqueued_read(
|
||||
[&] {
|
||||
auto permit = semaphore.obtain_permit(schema_ptr, get_name(), 1024, db::timeout_clock::now()).get();
|
||||
require_can_admit(true, "enough resources");
|
||||
return std::pair(permit, std::optional<reader_permit::used_guard>{permit});
|
||||
}, [&] (std::pair<reader_permit, std::optional<reader_permit::used_guard>>& permit_and_used_guard) {
|
||||
permit_and_used_guard.second.reset();
|
||||
return 0;
|
||||
}
|
||||
);
|
||||
}
|
||||
BOOST_REQUIRE_EQUAL(semaphore.available_resources(), initial_resources);
|
||||
require_can_admit(true, "semaphore in initial state");
|
||||
|
||||
// admitting enqueued reads -- permit becomes blocked
|
||||
{
|
||||
check_admitting_enqueued_read(
|
||||
[&] {
|
||||
auto permit = semaphore.obtain_permit(schema_ptr, get_name(), 1024, db::timeout_clock::now()).get();
|
||||
require_can_admit(true, "enough resources");
|
||||
return std::pair(permit, reader_permit::used_guard{permit});
|
||||
}, [&] (std::pair<reader_permit, reader_permit::used_guard>& permit_and_used_guard) {
|
||||
return reader_permit::blocked_guard{permit_and_used_guard.first};
|
||||
}
|
||||
);
|
||||
}
|
||||
BOOST_REQUIRE_EQUAL(semaphore.available_resources(), initial_resources);
|
||||
require_can_admit(true, "semaphore in initial state");
|
||||
}
|
||||
|
||||
SEASTAR_THREAD_TEST_CASE(test_reader_concurrency_semaphore_used_blocked) {
|
||||
const auto initial_resources = reader_concurrency_semaphore::resources{2, 2 * 1024};
|
||||
reader_concurrency_semaphore semaphore(initial_resources.count, initial_resources.memory, get_name());
|
||||
auto stop_sem = deferred_stop(semaphore);
|
||||
|
||||
BOOST_REQUIRE_EQUAL(semaphore.get_permit_stats().current_permits, 0);
|
||||
BOOST_REQUIRE_EQUAL(semaphore.get_permit_stats().used_permits, 0);
|
||||
BOOST_REQUIRE_EQUAL(semaphore.get_permit_stats().blocked_permits, 0);
|
||||
|
||||
auto permit = semaphore.obtain_permit(nullptr, get_name(), 1024, db::no_timeout).get0();
|
||||
|
||||
for (auto scenario = 0; scenario < 5; ++scenario) {
|
||||
testlog.info("Running scenario {}", scenario);
|
||||
|
||||
std::vector<reader_permit::used_guard> used;
|
||||
std::vector<reader_permit::blocked_guard> blocked;
|
||||
unsigned count;
|
||||
|
||||
switch (scenario) {
|
||||
case 0:
|
||||
used.emplace_back(permit);
|
||||
|
||||
BOOST_REQUIRE_EQUAL(semaphore.get_permit_stats().current_permits, 1);
|
||||
BOOST_REQUIRE_EQUAL(semaphore.get_permit_stats().used_permits, 1);
|
||||
BOOST_REQUIRE_EQUAL(semaphore.get_permit_stats().blocked_permits, 0);
|
||||
break;
|
||||
case 1:
|
||||
used.emplace_back(permit);
|
||||
blocked.emplace_back(permit);
|
||||
|
||||
BOOST_REQUIRE_EQUAL(semaphore.get_permit_stats().current_permits, 1);
|
||||
BOOST_REQUIRE_EQUAL(semaphore.get_permit_stats().used_permits, 1);
|
||||
BOOST_REQUIRE_EQUAL(semaphore.get_permit_stats().blocked_permits, 1);
|
||||
break;
|
||||
case 2:
|
||||
blocked.emplace_back(permit);
|
||||
|
||||
BOOST_REQUIRE_EQUAL(semaphore.get_permit_stats().current_permits, 1);
|
||||
BOOST_REQUIRE_EQUAL(semaphore.get_permit_stats().used_permits, 0);
|
||||
BOOST_REQUIRE_EQUAL(semaphore.get_permit_stats().blocked_permits, 0);
|
||||
break;
|
||||
case 3:
|
||||
blocked.emplace_back(permit);
|
||||
used.emplace_back(permit);
|
||||
|
||||
BOOST_REQUIRE_EQUAL(semaphore.get_permit_stats().current_permits, 1);
|
||||
BOOST_REQUIRE_EQUAL(semaphore.get_permit_stats().used_permits, 1);
|
||||
BOOST_REQUIRE_EQUAL(semaphore.get_permit_stats().blocked_permits, 1);
|
||||
break;
|
||||
default:
|
||||
count = tests::random::get_int<unsigned>(3, 100);
|
||||
for (unsigned i = 0; i < count; ++i) {
|
||||
if (tests::random::get_bool()) {
|
||||
used.emplace_back(permit);
|
||||
} else {
|
||||
blocked.emplace_back(permit);
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
while (!used.empty() && !blocked.empty()) {
|
||||
const bool pop_used = !used.empty() && tests::random::get_bool();
|
||||
|
||||
if (pop_used) {
|
||||
used.pop_back();
|
||||
if (used.empty()) {
|
||||
BOOST_REQUIRE_EQUAL(semaphore.get_permit_stats().used_permits, 0);
|
||||
}
|
||||
} else {
|
||||
blocked.pop_back();
|
||||
if (blocked.empty()) {
|
||||
BOOST_REQUIRE_EQUAL(semaphore.get_permit_stats().blocked_permits, 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -439,7 +439,7 @@ SEASTAR_TEST_CASE(test_view_update_generator) {
|
||||
sstables::sstable_writer_config sst_cfg = e.db().local().get_user_sstables_manager().configure_writer("test");
|
||||
auto& pc = service::get_local_streaming_priority();
|
||||
|
||||
auto permit = e.local_db().get_reader_concurrency_semaphore().make_permit(s.get(), "test");
|
||||
auto permit = e.local_db().get_reader_concurrency_semaphore().make_tracking_only_permit(s.get(), "test");
|
||||
sst->write_components(flat_mutation_reader_from_mutations(std::move(permit), {m}), 1ul, s, sst_cfg, {}, pc).get();
|
||||
sst->open_data().get();
|
||||
t->add_sstable_and_update_cache(sst).get();
|
||||
@@ -549,7 +549,7 @@ SEASTAR_THREAD_TEST_CASE(test_view_update_generator_deadlock) {
|
||||
sstables::sstable_writer_config sst_cfg = e.local_db().get_user_sstables_manager().configure_writer("test");
|
||||
auto& pc = service::get_local_streaming_priority();
|
||||
|
||||
auto permit = e.local_db().get_reader_concurrency_semaphore().make_permit(s.get(), "test");
|
||||
auto permit = e.local_db().get_reader_concurrency_semaphore().make_tracking_only_permit(s.get(), "test");
|
||||
sst->write_components(flat_mutation_reader_from_mutations(std::move(permit), {m}), 1ul, s, sst_cfg, {}, pc).get();
|
||||
sst->open_data().get();
|
||||
t->add_sstable_and_update_cache(sst).get();
|
||||
@@ -626,7 +626,7 @@ SEASTAR_THREAD_TEST_CASE(test_view_update_generator_register_semaphore_unit_leak
|
||||
sstables::sstable_writer_config sst_cfg = e.local_db().get_user_sstables_manager().configure_writer("test");
|
||||
auto& pc = service::get_local_streaming_priority();
|
||||
|
||||
auto permit = e.local_db().get_reader_concurrency_semaphore().make_permit(s.get(), "test");
|
||||
auto permit = e.local_db().get_reader_concurrency_semaphore().make_tracking_only_permit(s.get(), "test");
|
||||
sst->write_components(flat_mutation_reader_from_mutations(std::move(permit), {m}), 1ul, s, sst_cfg, {}, pc).get();
|
||||
sst->open_data().get();
|
||||
t->add_sstable_and_update_cache(sst).get();
|
||||
@@ -702,7 +702,7 @@ SEASTAR_THREAD_TEST_CASE(test_view_update_generator_buffering) {
|
||||
|
||||
class consumer_verifier {
|
||||
schema_ptr _schema;
|
||||
reader_permit _permit;
|
||||
reader_concurrency_semaphore& _semaphore;
|
||||
const partition_size_map& _partition_rows;
|
||||
std::vector<mutation>& _collected_muts;
|
||||
std::unique_ptr<row_locker> _rl;
|
||||
@@ -727,7 +727,7 @@ SEASTAR_THREAD_TEST_CASE(test_view_update_generator_buffering) {
|
||||
void check(mutation mut) {
|
||||
// First we check that we would be able to create a reader, even
|
||||
// though the staging reader consumed all resources.
|
||||
auto res_units = _permit.wait_admission(new_reader_base_cost, db::timeout_clock::now()).get0();
|
||||
auto permit = _semaphore.obtain_permit(_schema.get(), "consumer_verifier", new_reader_base_cost, db::timeout_clock::now()).get0();
|
||||
|
||||
const size_t current_rows = rows_in_mut(mut);
|
||||
const auto total_rows = _partition_rows.at(mut.decorated_key());
|
||||
@@ -773,7 +773,7 @@ SEASTAR_THREAD_TEST_CASE(test_view_update_generator_buffering) {
|
||||
public:
|
||||
consumer_verifier(schema_ptr schema, reader_concurrency_semaphore& sem, const partition_size_map& partition_rows, std::vector<mutation>& collected_muts, bool& ok)
|
||||
: _schema(std::move(schema))
|
||||
, _permit(sem.make_permit(_schema.get(), "consumer_verifier"))
|
||||
, _semaphore(sem)
|
||||
, _partition_rows(partition_rows)
|
||||
, _collected_muts(collected_muts)
|
||||
, _rl(std::make_unique<row_locker>(_schema))
|
||||
@@ -832,26 +832,15 @@ SEASTAR_THREAD_TEST_CASE(test_view_update_generator_buffering) {
|
||||
return less(a.decorated_key(), b.decorated_key());
|
||||
});
|
||||
|
||||
auto permit = sem.make_permit(schema.get(), get_name());
|
||||
auto permit = sem.obtain_permit(schema.get(), get_name(), new_reader_base_cost, db::no_timeout).get0();
|
||||
|
||||
auto mt = make_lw_shared<memtable>(schema);
|
||||
for (const auto& mut : muts) {
|
||||
mt->apply(mut);
|
||||
}
|
||||
|
||||
auto ms = mutation_source([mt] (
|
||||
schema_ptr s,
|
||||
reader_permit permit,
|
||||
const dht::partition_range& pr,
|
||||
const query::partition_slice& ps,
|
||||
const io_priority_class& pc,
|
||||
tracing::trace_state_ptr ts,
|
||||
streamed_mutation::forwarding fwd_ms,
|
||||
mutation_reader::forwarding fwd_mr) {
|
||||
return make_restricted_flat_reader(mt->as_data_source(), s, std::move(permit), pr, ps, pc, std::move(ts), fwd_ms, fwd_mr);
|
||||
});
|
||||
auto p = make_manually_paused_evictable_reader(
|
||||
std::move(ms),
|
||||
mt->as_data_source(),
|
||||
schema,
|
||||
permit,
|
||||
query::full_partition_range,
|
||||
|
||||
@@ -320,7 +320,7 @@ public:
|
||||
table_name = std::move(table_name)] (database& db) mutable {
|
||||
auto& cf = db.find_column_family(ks_name, table_name);
|
||||
auto schema = cf.schema();
|
||||
auto permit = db.get_reader_concurrency_semaphore().make_permit(schema.get(), "require_column_has_value()");
|
||||
auto permit = db.get_reader_concurrency_semaphore().make_tracking_only_permit(schema.get(), "require_column_has_value()");
|
||||
return cf.find_partition_slow(schema, permit, pkey)
|
||||
.then([schema, ckey, column_name, exp] (column_family::const_mutation_partition_ptr p) {
|
||||
assert(p != nullptr);
|
||||
@@ -784,7 +784,7 @@ future<> do_with_cql_env_thread(std::function<void(cql_test_env&)> func, cql_tes
|
||||
}
|
||||
|
||||
reader_permit make_reader_permit(cql_test_env& env) {
|
||||
return env.local_db().get_reader_concurrency_semaphore().make_permit(nullptr, "test");
|
||||
return env.local_db().get_reader_concurrency_semaphore().make_tracking_only_permit(nullptr, "test");
|
||||
}
|
||||
|
||||
namespace debug {
|
||||
|
||||
@@ -39,7 +39,7 @@ public:
|
||||
}
|
||||
|
||||
reader_concurrency_semaphore& semaphore() { return *_semaphore; };
|
||||
reader_permit make_permit() { return _semaphore->make_permit(nullptr, "test"); }
|
||||
reader_permit make_permit() { return _semaphore->make_tracking_only_permit(nullptr, "test"); }
|
||||
};
|
||||
|
||||
} // namespace tests
|
||||
|
||||
@@ -102,5 +102,8 @@ public:
|
||||
}
|
||||
return *_contexts[shard]->semaphore;
|
||||
}
|
||||
virtual future<reader_permit> obtain_reader_permit(schema_ptr schema, const char* const description, db::timeout_clock::time_point timeout) override {
|
||||
return semaphore().obtain_permit(schema.get(), description, 128 * 1024, timeout);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -82,7 +82,7 @@ public:
|
||||
|
||||
test_env_sstables_manager& manager() { return *_mgr; }
|
||||
reader_concurrency_semaphore& semaphore() { return *_semaphore; }
|
||||
reader_permit make_reader_permit(const schema* const s = nullptr, const char* n = "test") { return _semaphore->make_permit(s, n); }
|
||||
reader_permit make_reader_permit(const schema* const s = nullptr, const char* n = "test") { return _semaphore->make_tracking_only_permit(s, n); }
|
||||
|
||||
future<> working_sst(schema_ptr schema, sstring dir, unsigned long generation) {
|
||||
return reusable_sst(std::move(schema), dir, generation).then([] (auto ptr) { return make_ready_future<>(); });
|
||||
|
||||
@@ -187,8 +187,7 @@ void execute_reads(const schema& s, reader_concurrency_semaphore& sem, unsigned
|
||||
|
||||
if (sem.waiters()) {
|
||||
testlog.trace("Waiting for queue to drain");
|
||||
auto permit = sem.make_permit(&s, "drain");
|
||||
permit.wait_admission(1, db::no_timeout).get();
|
||||
sem.obtain_permit(&s, "drain", 1, db::no_timeout).get();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -79,7 +79,7 @@ reader_concurrency_semaphore_wrapper::~reader_concurrency_semaphore_wrapper() {
|
||||
}
|
||||
|
||||
reader_permit reader_concurrency_semaphore_wrapper::make_permit() {
|
||||
return _semaphore->make_permit(nullptr, "perf");
|
||||
return _semaphore->make_tracking_only_permit(nullptr, "perf");
|
||||
}
|
||||
|
||||
} // namespace perf
|
||||
|
||||
@@ -67,7 +67,7 @@ struct table {
|
||||
}
|
||||
|
||||
reader_permit make_permit() {
|
||||
return semaphore.make_permit(s.schema().get(), "test");
|
||||
return semaphore.make_tracking_only_permit(s.schema().get(), "test");
|
||||
}
|
||||
future<> stop() noexcept {
|
||||
return semaphore.stop();
|
||||
|
||||
@@ -177,7 +177,7 @@ Note: UDT is not supported for now.
|
||||
sst->load().get();
|
||||
|
||||
{
|
||||
sstables::index_reader idx_reader(sst, rcs_sem.make_permit(primary_key_schema.get(), "idx"), default_priority_class(), {});
|
||||
sstables::index_reader idx_reader(sst, rcs_sem.make_tracking_only_permit(primary_key_schema.get(), "idx"), default_priority_class(), {});
|
||||
|
||||
list_partitions(*primary_key_schema, idx_reader);
|
||||
|
||||
|
||||
Reference in New Issue
Block a user