Merge "Limit non-paged query memory consumption" from Botond

"
Non-paged queries completely ignore the query result size limiter
mechanism. They consume all the memory they want. With sufficiently
large datasets this can easily lead to a handful or even a single
unpaged query producing an OOM.

This series continues the work started by 134d5a5f7, by introducing a
configurable pair of soft/hard limits (defaulting to 1MB/100MB) that is
applied to otherwise unlimited queries, like reverse and unpaged ones.
When an unlimited query reaches the soft limit a warning is logged. This
should give users some heads-up to adjust their application. When the
hard limit is reached the query is aborted. The idea is to not greet
users with failing queries after an upgrade while at the same time
protect the database from the really bad queries. The hard limit should
be decreased from time to time gradually approaching the desired goal of
1MB.

We don't want to limit internal queries, we trust ourselves to either
use another form of memory usage control, or read only small datasets.
So the limit is selected according to the query class. User reads use
the `max_memory_for_unlimited_query_{soft,hard}_limit` configuration
items, while internal reads are not limited. The limit is obtained by
the coordinator, who passes it down to replicas using the existing
`max_result_size` parameter (which is now a special type containing the
two limits), which is now passed on every verb, instead of once per
connection. This ensures that all replicas work with the same limits.
For normal paged queries `max_result_size` is set to the usual
`query::result_memory_limiter::maximum_result_size`. For queries that can
consume unlimited amount of memory -- unpaged and reverse queries --
this is set to the value of the aforementioned
`max_memory_for_unlimited_query_{soft,hard}_limit` configuration item,
but only for user reads, internal reads are not limited.

This has the side-effect that reverse reads now send entire
partitions in a single page, but this is not that bad. The data was
already read, and its size was below the limit, so the replica might as
well send it all.

Fixes: #5870
"

* 'nonpaged-query-limit/v5' of https://github.com/denesb/scylla: (26 commits)
  test: database_test: add test for enforced max result limit
  mutation_partition: abort read when hard limit is exceeded for non-paged reads
  query-result.hh: move the definition of short_read to the top
  test: cql_test_env: set the max_memory_unlimited_query_{soft,hard}_limit
  test: set the allow_short_read slice option for paged queries
  partition_slice_builder: add with_option()
  result_memory_accounter: remove default constructor
  query_*(): use the coordinator specified memory limit for unlimited queries
  storage_proxy: use read_command::max_result_size to pass max result size around
  query: result_memory_limiter: use the new max_result_size type
  query: read_command: add max_result_size
  query: read_command: use tagged ints for limit ctor params
  query: read_command: add separate convenience constructor
  service: query_pager: set the allow_short_read flag
  result_memory_accounter: check(): use _maximum_result_size instead of hardcoded limit
  storage_proxy: add get_max_result_size()
  result_memory_limiter: add unlimited_result_size constant
  database: add get_statement_scheduling_group()
  database: query_mutations(): obtain the memory accounter inside
  query: query_class_config: use max_result_size for the max_memory_for_unlimited_query field
  ...
This commit is contained in:
Avi Kivity
2020-07-29 13:41:53 +03:00
58 changed files with 717 additions and 339 deletions

View File

@@ -1225,7 +1225,8 @@ static future<std::unique_ptr<rjson::value>> get_previous_item(
service_permit permit,
alternator::stats& stats);
static lw_shared_ptr<query::read_command> previous_item_read_command(schema_ptr schema,
static lw_shared_ptr<query::read_command> previous_item_read_command(service::storage_proxy& proxy,
schema_ptr schema,
const clustering_key& ck,
shared_ptr<cql3::selection::selection> selection) {
std::vector<query::clustering_range> bounds;
@@ -1240,7 +1241,7 @@ static lw_shared_ptr<query::read_command> previous_item_read_command(schema_ptr
auto regular_columns = boost::copy_range<query::column_id_vector>(
schema->regular_columns() | boost::adaptors::transformed([] (const column_definition& cdef) { return cdef.id; }));
auto partition_slice = query::partition_slice(std::move(bounds), {}, std::move(regular_columns), selection->get_query_options());
return ::make_lw_shared<query::read_command>(schema->id(), schema->version(), partition_slice, query::max_partitions);
return ::make_lw_shared<query::read_command>(schema->id(), schema->version(), partition_slice, proxy.get_max_result_size(partition_slice));
}
static dht::partition_range_vector to_partition_ranges(const schema& schema, const partition_key& pk) {
@@ -1354,7 +1355,7 @@ static future<std::unique_ptr<rjson::value>> get_previous_item(
{
stats.reads_before_write++;
auto selection = cql3::selection::selection::wildcard(schema);
auto command = previous_item_read_command(schema, ck, selection);
auto command = previous_item_read_command(proxy, schema, ck, selection);
auto cl = db::consistency_level::LOCAL_QUORUM;
return proxy.query(schema, command, to_partition_ranges(*schema, pk), cl, service::storage_proxy::coordinator_query_options(executor::default_timeout(), std::move(permit), client_state)).then(
@@ -1405,7 +1406,7 @@ future<executor::request_return_type> rmw_operation::execute(service::storage_pr
auto timeout = executor::default_timeout();
auto selection = cql3::selection::selection::wildcard(schema());
auto read_command = needs_read_before_write ?
previous_item_read_command(schema(), _ck, selection) :
previous_item_read_command(proxy, schema(), _ck, selection) :
nullptr;
return proxy.cas(schema(), shared_from_this(), read_command, to_partition_ranges(*schema(), _pk),
{timeout, std::move(permit), client_state, trace_state},
@@ -2405,7 +2406,7 @@ future<executor::request_return_type> executor::get_item(client_state& client_st
auto selection = cql3::selection::selection::wildcard(schema);
auto partition_slice = query::partition_slice(std::move(bounds), {}, std::move(regular_columns), selection->get_query_options());
auto command = ::make_lw_shared<query::read_command>(schema->id(), schema->version(), partition_slice, query::max_partitions);
auto command = ::make_lw_shared<query::read_command>(schema->id(), schema->version(), partition_slice, _proxy.get_max_result_size(partition_slice));
std::unordered_set<std::string> used_attribute_names;
auto attrs_to_get = calculate_attrs_to_get(request, used_attribute_names);
@@ -2475,7 +2476,7 @@ future<executor::request_return_type> executor::batch_get_item(client_state& cli
rs.schema->regular_columns() | boost::adaptors::transformed([] (const column_definition& cdef) { return cdef.id; }));
auto selection = cql3::selection::selection::wildcard(rs.schema);
auto partition_slice = query::partition_slice(std::move(bounds), {}, std::move(regular_columns), selection->get_query_options());
auto command = ::make_lw_shared<query::read_command>(rs.schema->id(), rs.schema->version(), partition_slice, query::max_partitions);
auto command = ::make_lw_shared<query::read_command>(rs.schema->id(), rs.schema->version(), partition_slice, _proxy.get_max_result_size(partition_slice));
future<std::tuple<std::string, std::optional<rjson::value>>> f = _proxy.query(rs.schema, std::move(command), std::move(partition_ranges), rs.cl,
service::storage_proxy::coordinator_query_options(executor::default_timeout(), permit, client_state, trace_state)).then(
[schema = rs.schema, partition_slice = std::move(partition_slice), selection = std::move(selection), attrs_to_get = rs.attrs_to_get] (service::storage_proxy::coordinator_query_result qr) mutable {
@@ -2728,7 +2729,8 @@ static rjson::value encode_paging_state(const schema& schema, const service::pag
return last_evaluated_key;
}
static future<executor::request_return_type> do_query(schema_ptr schema,
static future<executor::request_return_type> do_query(service::storage_proxy& proxy,
schema_ptr schema,
const rjson::value* exclusive_start_key,
dht::partition_range_vector&& partition_ranges,
std::vector<query::clustering_range>&& ck_bounds,
@@ -2762,7 +2764,7 @@ static future<executor::request_return_type> do_query(schema_ptr schema,
query::partition_slice::option_set opts = selection->get_query_options();
opts.add(custom_opts);
auto partition_slice = query::partition_slice(std::move(ck_bounds), std::move(static_columns), std::move(regular_columns), opts);
auto command = ::make_lw_shared<query::read_command>(schema->id(), schema->version(), partition_slice, query::max_partitions);
auto command = ::make_lw_shared<query::read_command>(schema->id(), schema->version(), partition_slice, proxy.get_max_result_size(partition_slice));
auto query_state_ptr = std::make_unique<service::query_state>(client_state, trace_state, std::move(permit));
@@ -2883,7 +2885,7 @@ future<executor::request_return_type> executor::scan(client_state& client_state,
verify_all_are_used(request, "ExpressionAttributeNames", used_attribute_names, "Scan");
verify_all_are_used(request, "ExpressionAttributeValues", used_attribute_values, "Scan");
return do_query(schema, exclusive_start_key, std::move(partition_ranges), std::move(ck_bounds), std::move(attrs_to_get), limit, cl,
return do_query(_proxy, schema, exclusive_start_key, std::move(partition_ranges), std::move(ck_bounds), std::move(attrs_to_get), limit, cl,
std::move(filter), query::partition_slice::option_set(), client_state, _stats.cql_stats, trace_state, std::move(permit));
}
@@ -3357,7 +3359,7 @@ future<executor::request_return_type> executor::query(client_state& client_state
verify_all_are_used(request, "ExpressionAttributeNames", used_attribute_names, "Query");
query::partition_slice::option_set opts;
opts.set_if<query::partition_slice::option::reversed>(!forward);
return do_query(schema, exclusive_start_key, std::move(partition_ranges), std::move(ck_bounds), std::move(attrs_to_get), limit, cl,
return do_query(_proxy, schema, exclusive_start_key, std::move(partition_ranges), std::move(ck_bounds), std::move(attrs_to_get), limit, cl,
std::move(filter), opts, client_state, _stats.cql_stats, std::move(trace_state), std::move(permit));
}

View File

@@ -717,7 +717,8 @@ future<executor::request_return_type> executor::get_records(client_state& client
auto partition_slice = query::partition_slice(
std::move(bounds)
, {}, std::move(regular_columns), selection->get_query_options());
auto command = ::make_lw_shared<query::read_command>(schema->id(), schema->version(), partition_slice, limit * 4);
auto command = ::make_lw_shared<query::read_command>(schema->id(), schema->version(), partition_slice, _proxy.get_max_result_size(partition_slice),
query::row_limit(limit * 4));
return _proxy.query(schema, std::move(command), std::move(partition_ranges), cl, service::storage_proxy::coordinator_query_options(default_timeout(), std::move(permit), client_state)).then(
[this, schema, partition_slice = std::move(partition_slice), selection = std::move(selection), start_time = std::move(start_time), limit, key_names = std::move(key_names), attr_names = std::move(attr_names), type, iter, high_ts] (service::storage_proxy::coordinator_query_result qr) mutable {

View File

@@ -1356,7 +1356,8 @@ public:
opts.set_if<query::partition_slice::option::always_return_static_content>(!p.static_row().empty());
auto partition_slice = query::partition_slice(std::move(bounds), std::move(static_columns), std::move(regular_columns), std::move(opts));
auto command = ::make_lw_shared<query::read_command>(_schema->id(), _schema->version(), partition_slice, row_limit);
const auto max_result_size = _ctx._proxy.get_max_result_size(partition_slice);
auto command = ::make_lw_shared<query::read_command>(_schema->id(), _schema->version(), partition_slice, query::max_result_size(max_result_size), query::row_limit(row_limit));
const auto select_cl = adjust_cl(write_cl);

View File

@@ -385,7 +385,7 @@ future<shared_ptr<cql_transport::messages::result_message>> batch_statement::exe
make_shared<cql_transport::messages::result_message::bounce_to_shard>(shard));
}
return proxy.cas(schema, request, request->read_command(), request->key(),
return proxy.cas(schema, request, request->read_command(proxy), request->key(),
{read_timeout, qs.get_permit(), qs.get_client_state(), qs.get_trace_state()},
cl_for_paxos, cl_for_learn, batch_timeout, cas_timeout).then([this, request] (bool is_applied) {
return modification_statement::build_cas_result_set(_metadata, _columns_of_cas_result_set, is_applied, request->rows());

View File

@@ -81,7 +81,7 @@ std::optional<mutation> cas_request::apply_updates(api::timestamp_type ts) const
return mutation_set;
}
lw_shared_ptr<query::read_command> cas_request::read_command() const {
lw_shared_ptr<query::read_command> cas_request::read_command(service::storage_proxy& proxy) const {
column_set columns_to_read(_schema->all_columns_count());
std::vector<query::clustering_range> ranges;
@@ -116,7 +116,7 @@ lw_shared_ptr<query::read_command> cas_request::read_command() const {
options.set(query::partition_slice::option::always_return_static_content);
query::partition_slice ps(std::move(ranges), *_schema, columns_to_read, options);
ps.set_partition_row_limit(max_rows);
return make_lw_shared<query::read_command>(_schema->id(), _schema->version(), std::move(ps));
return make_lw_shared<query::read_command>(_schema->id(), _schema->version(), std::move(ps), proxy.get_max_result_size(ps));
}
bool cas_request::applies_to() const {

View File

@@ -90,7 +90,7 @@ public:
return _rows;
}
lw_shared_ptr<query::read_command> read_command() const;
lw_shared_ptr<query::read_command> read_command(service::storage_proxy& proxy) const;
void add_row_update(const modification_statement& stmt_arg, std::vector<query::clustering_range> ranges_arg,
modification_statement::json_cache_opt json_cache_arg, const query_options& options_arg);

View File

@@ -168,7 +168,7 @@ modification_statement::get_mutations(service::storage_proxy& proxy, const query
}
if (requires_read()) {
lw_shared_ptr<query::read_command> cmd = read_command(ranges, cl);
lw_shared_ptr<query::read_command> cmd = read_command(proxy, ranges, cl);
// FIXME: ignoring "local"
f = proxy.query(s, cmd, dht::partition_range_vector(keys), cl,
{timeout, qs.get_permit(), qs.get_client_state(), qs.get_trace_state()}).then(
@@ -246,14 +246,15 @@ std::vector<mutation> modification_statement::apply_updates(
}
lw_shared_ptr<query::read_command>
modification_statement::read_command(query::clustering_row_ranges ranges, db::consistency_level cl) const {
modification_statement::read_command(service::storage_proxy& proxy, query::clustering_row_ranges ranges, db::consistency_level cl) const {
try {
validate_for_read(cl);
} catch (exceptions::invalid_request_exception& e) {
throw exceptions::invalid_request_exception(format("Write operation require a read but consistency {} is not supported on reads", cl));
}
query::partition_slice ps(std::move(ranges), *s, columns_to_read(), update_parameters::options);
return make_lw_shared<query::read_command>(s->id(), s->version(), std::move(ps));
const auto max_result_size = proxy.get_max_result_size(ps);
return make_lw_shared<query::read_command>(s->id(), s->version(), std::move(ps), query::max_result_size(max_result_size));
}
std::vector<query::clustering_range>
@@ -351,7 +352,7 @@ modification_statement::execute_with_condition(service::storage_proxy& proxy, se
make_shared<cql_transport::messages::result_message::bounce_to_shard>(shard));
}
return proxy.cas(s, request, request->read_command(), request->key(),
return proxy.cas(s, request, request->read_command(proxy), request->key(),
{read_timeout, qs.get_permit(), qs.get_client_state(), qs.get_trace_state()},
cl_for_paxos, cl_for_learn, statement_timeout, cas_timeout).then([this, request] (bool is_applied) {
return build_cas_result_set(_metadata, _columns_of_cas_result_set, is_applied, request->rows());

View File

@@ -242,7 +242,7 @@ public:
// Build a read_command instance to fetch the previous mutation from storage. The mutation is
// fetched if we need to check LWT conditions or apply updates to non-frozen list elements.
lw_shared_ptr<query::read_command> read_command(query::clustering_row_ranges ranges, db::consistency_level cl) const;
lw_shared_ptr<query::read_command> read_command(service::storage_proxy& proxy, query::clustering_row_ranges ranges, db::consistency_level cl) const;
// Create a mutation object for the update operation represented by this modification statement.
// A single mutation object for lightweight transactions, which can only span one partition, or a vector
// of mutations, one per partition key, for statements which affect multiple partition keys,

View File

@@ -319,8 +319,19 @@ select_statement::do_execute(service::storage_proxy& proxy,
_stats.select_partition_range_scan += _range_scan;
_stats.select_partition_range_scan_no_bypass_cache += _range_scan_no_bypass_cache;
auto command = ::make_lw_shared<query::read_command>(_schema->id(), _schema->version(),
make_partition_slice(options), limit, now, tracing::make_trace_info(state.get_trace_state()), query::max_partitions, utils::UUID(), query::is_first_page::no, options.get_timestamp(state));
auto slice = make_partition_slice(options);
auto command = ::make_lw_shared<query::read_command>(
_schema->id(),
_schema->version(),
std::move(slice),
proxy.get_max_result_size(slice),
query::row_limit(limit),
query::partition_limit(query::max_partitions),
now,
tracing::make_trace_info(state.get_trace_state()),
utils::UUID(),
query::is_first_page::no,
options.get_timestamp(state));
int32_t page_size = options.get_page_size();
@@ -471,25 +482,28 @@ generate_base_key_from_index_pk(const partition_key& index_pk, const std::option
}
lw_shared_ptr<query::read_command>
indexed_table_select_statement::prepare_command_for_base_query(const query_options& options, service::query_state& state, gc_clock::time_point now, bool use_paging) const {
indexed_table_select_statement::prepare_command_for_base_query(service::storage_proxy& proxy, const query_options& options,
service::query_state& state, gc_clock::time_point now, bool use_paging) const {
auto slice = make_partition_slice(options);
if (use_paging) {
slice.options.set<query::partition_slice::option::allow_short_read>();
slice.options.set<query::partition_slice::option::send_partition_key>();
if (_schema->clustering_key_size() > 0) {
slice.options.set<query::partition_slice::option::send_clustering_key>();
}
}
lw_shared_ptr<query::read_command> cmd = ::make_lw_shared<query::read_command>(
_schema->id(),
_schema->version(),
make_partition_slice(options),
get_limit(options),
std::move(slice),
proxy.get_max_result_size(slice),
query::row_limit(get_limit(options)),
query::partition_limit(query::max_partitions),
now,
tracing::make_trace_info(state.get_trace_state()),
query::max_partitions,
utils::UUID(),
query::is_first_page::no,
options.get_timestamp(state));
if (use_paging) {
cmd->slice.options.set<query::partition_slice::option::allow_short_read>();
cmd->slice.options.set<query::partition_slice::option::send_partition_key>();
if (_schema->clustering_key_size() > 0) {
cmd->slice.options.set<query::partition_slice::option::send_clustering_key>();
}
}
return cmd;
}
@@ -502,7 +516,7 @@ indexed_table_select_statement::do_execute_base_query(
gc_clock::time_point now,
lw_shared_ptr<const service::pager::paging_state> paging_state) const {
using value_type = std::tuple<foreign_ptr<lw_shared_ptr<query::result>>, lw_shared_ptr<query::read_command>>;
auto cmd = prepare_command_for_base_query(options, state, now, bool(paging_state));
auto cmd = prepare_command_for_base_query(proxy, options, state, now, bool(paging_state));
auto timeout = db::timeout_clock::now() + options.get_timeout_config().*get_timeout_config_selector();
uint32_t queried_ranges_count = partition_ranges.size();
service::query_ranges_to_vnodes_generator ranges_to_vnodes(proxy.get_token_metadata(), _schema, std::move(partition_ranges));
@@ -578,7 +592,7 @@ indexed_table_select_statement::do_execute_base_query(
gc_clock::time_point now,
lw_shared_ptr<const service::pager::paging_state> paging_state) const {
using value_type = std::tuple<foreign_ptr<lw_shared_ptr<query::result>>, lw_shared_ptr<query::read_command>>;
auto cmd = prepare_command_for_base_query(options, state, now, bool(paging_state));
auto cmd = prepare_command_for_base_query(proxy, options, state, now, bool(paging_state));
auto timeout = db::timeout_clock::now() + options.get_timeout_config().*get_timeout_config_selector();
struct base_query_state {
@@ -1146,10 +1160,11 @@ indexed_table_select_statement::read_posting_list(service::storage_proxy& proxy,
_view_schema->id(),
_view_schema->version(),
partition_slice,
limit,
proxy.get_max_result_size(partition_slice),
query::row_limit(limit),
query::partition_limit(query::max_partitions),
now,
tracing::make_trace_info(state.get_trace_state()),
query::max_partitions,
utils::UUID(),
query::is_first_page::no,
options.get_timestamp(state));

View File

@@ -237,7 +237,8 @@ private:
lw_shared_ptr<const service::pager::paging_state> paging_state) const;
lw_shared_ptr<query::read_command>
prepare_command_for_base_query(const query_options& options, service::query_state& state, gc_clock::time_point now, bool use_paging) const;
prepare_command_for_base_query(service::storage_proxy& proxy, const query_options& options, service::query_state& state, gc_clock::time_point now,
bool use_paging) const;
future<std::tuple<foreign_ptr<lw_shared_ptr<query::result>>, lw_shared_ptr<query::read_command>>>
do_execute_base_query(

View File

@@ -1178,18 +1178,18 @@ compare_atomic_cell_for_merge(atomic_cell_view left, atomic_cell_view right) {
future<std::tuple<lw_shared_ptr<query::result>, cache_temperature>>
database::query(schema_ptr s, const query::read_command& cmd, query::result_options opts, const dht::partition_range_vector& ranges,
tracing::trace_state_ptr trace_state, uint64_t max_result_size, db::timeout_clock::time_point timeout) {
tracing::trace_state_ptr trace_state, db::timeout_clock::time_point timeout) {
column_family& cf = find_column_family(cmd.cf_id);
auto class_config = query::query_class_config{.semaphore = get_reader_concurrency_semaphore(), .max_memory_for_unlimited_query = *cmd.max_result_size};
query::querier_cache_context cache_ctx(_querier_cache, cmd.query_uuid, cmd.is_first_page);
return _data_query_stage(&cf,
std::move(s),
seastar::cref(cmd),
make_query_class_config(),
class_config,
opts,
seastar::cref(ranges),
std::move(trace_state),
seastar::ref(get_result_memory_limiter()),
max_result_size,
timeout,
std::move(cache_ctx)).then_wrapped([this, s = _stats, hit_rate = cf.get_global_cache_hit_rate(), op = cf.read_in_progress()] (auto f) {
if (f.failed()) {
@@ -1206,8 +1206,12 @@ database::query(schema_ptr s, const query::read_command& cmd, query::result_opti
future<std::tuple<reconcilable_result, cache_temperature>>
database::query_mutations(schema_ptr s, const query::read_command& cmd, const dht::partition_range& range,
query::result_memory_accounter&& accounter, tracing::trace_state_ptr trace_state, db::timeout_clock::time_point timeout) {
tracing::trace_state_ptr trace_state, db::timeout_clock::time_point timeout) {
const auto short_read_allwoed = query::short_read(cmd.slice.options.contains<query::partition_slice::option::allow_short_read>());
return get_result_memory_limiter().new_mutation_read(*cmd.max_result_size, short_read_allwoed).then(
[&, s = std::move(s), trace_state = std::move(trace_state), timeout] (query::result_memory_accounter accounter) {
column_family& cf = find_column_family(cmd.cf_id);
auto class_config = query::query_class_config{.semaphore = get_reader_concurrency_semaphore(), .max_memory_for_unlimited_query = *cmd.max_result_size};
query::querier_cache_context cache_ctx(_querier_cache, cmd.query_uuid, cmd.is_first_page);
return _mutation_query_stage(std::move(s),
cf.as_mutation_source(),
@@ -1217,7 +1221,7 @@ database::query_mutations(schema_ptr s, const query::read_command& cmd, const dh
cmd.partition_limit,
cmd.timestamp,
timeout,
make_query_class_config(),
class_config,
std::move(accounter),
std::move(trace_state),
std::move(cache_ctx)).then_wrapped([this, s = _stats, hit_rate = cf.get_global_cache_hit_rate(), op = cf.read_in_progress()] (auto f) {
@@ -1231,6 +1235,7 @@ database::query_mutations(schema_ptr s, const query::read_command& cmd, const dh
return make_ready_future<std::tuple<reconcilable_result, cache_temperature>>(std::tuple(std::move(result), hit_rate));
}
});
});
}
std::unordered_set<sstring> database::get_initial_tokens() {
@@ -1279,16 +1284,16 @@ void database::register_connection_drop_notifier(netw::messaging_service& ms) {
});
}
query_class_config database::make_query_class_config() {
reader_concurrency_semaphore& database::get_reader_concurrency_semaphore() {
// Everything running in the statement group is considered a user query
if (current_scheduling_group() == _dbcfg.statement_scheduling_group) {
return query_class_config{_read_concurrency_sem, _cfg.max_memory_for_unlimited_query()};
return _read_concurrency_sem;
// Reads done on behalf of view update generation run in the streaming group
} else if (current_scheduling_group() == _dbcfg.streaming_scheduling_group) {
return query_class_config{_streaming_concurrency_sem, std::numeric_limits<uint64_t>::max()};
return _streaming_concurrency_sem;
// Everything else is considered a system query
} else {
return query_class_config{_system_read_concurrency_sem, std::numeric_limits<uint64_t>::max()};
return _system_read_concurrency_sem;
}
}
@@ -1348,7 +1353,7 @@ future<mutation> database::do_apply_counter_update(column_family& cf, const froz
// counter state for each modified cell...
tracing::trace(trace_state, "Reading counter values from the CF");
return counter_write_query(m_schema, cf.as_mutation_source(), make_query_class_config().semaphore.make_permit(), m.decorated_key(), slice, trace_state, timeout)
return counter_write_query(m_schema, cf.as_mutation_source(), get_reader_concurrency_semaphore().make_permit(), m.decorated_key(), slice, trace_state, timeout)
.then([this, &cf, &m, m_schema, timeout, trace_state] (auto mopt) {
// ...now, that we got existing state of all affected counter
// cells we can look for our shard in each of them, increment
@@ -1559,7 +1564,7 @@ future<> database::do_apply(schema_ptr s, const frozen_mutation& m, tracing::tra
if (cf.views().empty()) {
return apply_with_commitlog(std::move(s), cf, std::move(uuid), m, timeout, sync).finally([op = std::move(op)] { });
}
future<row_locker::lock_holder> f = cf.push_view_replica_updates(s, m, timeout, std::move(tr_state), make_query_class_config().semaphore);
future<row_locker::lock_holder> f = cf.push_view_replica_updates(s, m, timeout, std::move(tr_state), get_reader_concurrency_semaphore());
return f.then([this, s = std::move(s), uuid = std::move(uuid), &m, timeout, &cf, op = std::move(op), sync] (row_locker::lock_holder lock) mutable {
return apply_with_commitlog(std::move(s), cf, std::move(uuid), m, timeout, sync).finally(
// Hold the local lock on the base-table partition or row

View File

@@ -748,12 +748,11 @@ public:
// Returns at most "cmd.limit" rows
future<lw_shared_ptr<query::result>> query(schema_ptr,
const query::read_command& cmd,
query_class_config class_config,
query::query_class_config class_config,
query::result_options opts,
const dht::partition_range_vector& ranges,
tracing::trace_state_ptr trace_state,
query::result_memory_limiter& memory_limiter,
uint64_t max_result_size,
db::timeout_clock::time_point timeout,
query::querier_cache_context cache_ctx = { });
@@ -1294,12 +1293,11 @@ private:
column_family*,
schema_ptr,
const query::read_command&,
query_class_config,
query::query_class_config,
query::result_options,
const dht::partition_range_vector&,
tracing::trace_state_ptr,
query::result_memory_limiter&,
uint64_t,
db::timeout_clock::time_point,
query::querier_cache_context> _data_query_stage;
@@ -1396,6 +1394,7 @@ public:
return _commitlog.get();
}
seastar::scheduling_group get_statement_scheduling_group() const { return _dbcfg.statement_scheduling_group; }
seastar::scheduling_group get_streaming_scheduling_group() const { return _dbcfg.streaming_scheduling_group; }
size_t get_available_memory() const { return _dbcfg.available_memory; }
@@ -1463,10 +1462,9 @@ public:
unsigned shard_of(const frozen_mutation& m);
future<std::tuple<lw_shared_ptr<query::result>, cache_temperature>> query(schema_ptr, const query::read_command& cmd, query::result_options opts,
const dht::partition_range_vector& ranges, tracing::trace_state_ptr trace_state,
uint64_t max_result_size, db::timeout_clock::time_point timeout);
db::timeout_clock::time_point timeout);
future<std::tuple<reconcilable_result, cache_temperature>> query_mutations(schema_ptr, const query::read_command& cmd, const dht::partition_range& range,
query::result_memory_accounter&& accounter, tracing::trace_state_ptr trace_state,
db::timeout_clock::time_point timeout);
tracing::trace_state_ptr trace_state, db::timeout_clock::time_point timeout);
// Apply the mutation atomically.
// Throws timed_out_error when timeout is reached.
future<> apply(schema_ptr, const frozen_mutation&, tracing::trace_state_ptr tr_state, db::commitlog::force_sync sync, db::timeout_clock::time_point timeout);
@@ -1594,7 +1592,9 @@ public:
return _supports_infinite_bound_range_deletions;
}
query_class_config make_query_class_config();
// Get the reader concurrency semaphore, appropriate for the query class,
// which is deduced from the current scheduling group.
reader_concurrency_semaphore& get_reader_concurrency_semaphore();
};
future<> start_large_data_handler(sharded<database>& db);

View File

@@ -224,7 +224,7 @@ db::config::config(std::shared_ptr<db::extensions> exts)
"The directory in which Scylla will put all its subdirectories. The location of individual subdirs can be overriden by the respective *_directory options.")
, commitlog_directory(this, "commitlog_directory", value_status::Used, "",
"The directory where the commit log is stored. For optimal write performance, it is recommended the commit log be on a separate disk partition (ideally, a separate physical device) from the data file directories.")
, data_file_directories(this, "data_file_directories", value_status::Used, { },
, data_file_directories(this, "data_file_directories", "datadir", value_status::Used, { },
"The directory location where table data (SSTables) is stored")
, hints_directory(this, "hints_directory", value_status::Used, "",
"The directory where hints files are stored if hinted handoff is enabled.")
@@ -517,7 +517,7 @@ db::config::config(std::shared_ptr<db::extensions> exts)
/* Native transport (CQL Binary Protocol) */
, start_native_transport(this, "start_native_transport", value_status::Used, true,
"Enable or disable the native transport server. Uses the same address as the rpc_address, but the port is different from the rpc_port. See native_transport_port.")
, native_transport_port(this, "native_transport_port", value_status::Used, 9042,
, native_transport_port(this, "native_transport_port", "cql_port", value_status::Used, 9042,
"Port on which the CQL native transport listens for clients.")
, native_transport_port_ssl(this, "native_transport_port_ssl", value_status::Used, 9142,
"Port on which the CQL TLS native transport listens for clients."
@@ -536,7 +536,7 @@ db::config::config(std::shared_ptr<db::extensions> exts)
/* Settings for configuring and tuning client connections. */
, broadcast_rpc_address(this, "broadcast_rpc_address", value_status::Used, {/* unset */},
"RPC address to broadcast to drivers and other Scylla nodes. This cannot be set to 0.0.0.0. If blank, it is set to the value of the rpc_address or rpc_interface. If rpc_address or rpc_interfaceis set to 0.0.0.0, this property must be set.\n")
, rpc_port(this, "rpc_port", value_status::Used, 9160,
, rpc_port(this, "rpc_port", "thrift_port", value_status::Used, 9160,
"Thrift port for client connections.")
, start_rpc(this, "start_rpc", value_status::Used, true,
"Starts the Thrift RPC server")
@@ -722,8 +722,12 @@ db::config::config(std::shared_ptr<db::extensions> exts)
, max_clustering_key_restrictions_per_query(this, "max_clustering_key_restrictions_per_query", liveness::LiveUpdate, value_status::Used, 100,
"Maximum number of distinct clustering key restrictions per query. This limit places a bound on the size of IN tuples, "
"especially when multiple clustering key columns have IN restrictions. Increasing this value can result in server instability.")
, max_memory_for_unlimited_query(this, "max_memory_for_unlimited_query", liveness::LiveUpdate, value_status::Used, size_t(1) << 20,
"Maximum amount of memory a query, whose memory consumption is not naturally limited, is allowed to consume, e.g. non-paged and reverse queries.")
, max_memory_for_unlimited_query_soft_limit(this, "max_memory_for_unlimited_query_soft_limit", liveness::LiveUpdate, value_status::Used, uint64_t(1) << 20,
"Maximum amount of memory a query, whose memory consumption is not naturally limited, is allowed to consume, e.g. non-paged and reverse queries. "
"This is the soft limit, there will be a warning logged for queries violating this limit.")
, max_memory_for_unlimited_query_hard_limit(this, "max_memory_for_unlimited_query_hard_limit", "max_memory_for_unlimited_query", liveness::LiveUpdate, value_status::Used, (uint64_t(100) << 20),
"Maximum amount of memory a query, whose memory consumption is not naturally limited, is allowed to consume, e.g. non-paged and reverse queries. "
"This is the hard limit, queries violating this limit will be aborted.")
, initial_sstable_loading_concurrency(this, "initial_sstable_loading_concurrency", value_status::Used, 4u,
"Maximum amount of sstables to load in parallel during initialization. A higher number can lead to more memory consumption. You should not need to touch this")
, enable_3_1_0_compatibility_mode(this, "enable_3_1_0_compatibility_mode", value_status::Used, false,
@@ -808,39 +812,27 @@ namespace utils {
template<>
void config_file::named_value<db::config::seed_provider_type>::add_command_line_option(
boost::program_options::options_description_easy_init& init,
const std::string_view& name, const std::string_view& desc) {
init((hyphenate(name) + "-class-name").data(),
boost::program_options::options_description_easy_init& init) {
init((hyphenate(name()) + "-class-name").data(),
value_ex<sstring>()->notifier(
[this](sstring new_class_name) {
auto old_seed_provider = operator()();
old_seed_provider.class_name = new_class_name;
set(std::move(old_seed_provider), config_source::CommandLine);
}),
desc.data());
init((hyphenate(name) + "-parameters").data(),
desc().data());
init((hyphenate(name()) + "-parameters").data(),
value_ex<std::unordered_map<sstring, sstring>>()->notifier(
[this](std::unordered_map<sstring, sstring> new_parameters) {
auto old_seed_provider = operator()();
old_seed_provider.parameters = new_parameters;
set(std::move(old_seed_provider), config_source::CommandLine);
}),
desc.data());
desc().data());
}
}
boost::program_options::options_description_easy_init&
db::config::add_options(boost::program_options::options_description_easy_init& init) {
config_file::add_options(init);
data_file_directories.add_command_line_option(init, "datadir", "alias for 'data-file-directories'");
rpc_port.add_command_line_option(init, "thrift-port", "alias for 'rpc-port'");
native_transport_port.add_command_line_option(init, "cql-port", "alias for 'native-transport-port'");
return init;
}
db::fs::path db::config::get_conf_dir() {
using namespace db::fs;

View File

@@ -303,7 +303,8 @@ public:
named_value<bool> abort_on_internal_error;
named_value<uint32_t> max_partition_key_restrictions_per_query;
named_value<uint32_t> max_clustering_key_restrictions_per_query;
named_value<uint64_t> max_memory_for_unlimited_query;
named_value<uint64_t> max_memory_for_unlimited_query_soft_limit;
named_value<uint64_t> max_memory_for_unlimited_query_hard_limit;
named_value<unsigned> initial_sstable_loading_concurrency;
named_value<bool> enable_3_1_0_compatibility_mode;
named_value<bool> enable_user_defined_functions;
@@ -329,9 +330,6 @@ public:
seastar::logging_settings logging_settings(const boost::program_options::variables_map&) const;
boost::program_options::options_description_easy_init&
add_options(boost::program_options::options_description_easy_init&);
const db::extensions& extensions() const;
static const sstring default_tls_priority;
@@ -346,8 +344,7 @@ private:
return this->is_set() ? (*this)() : t;
}
// do not add to boost::options. We only care about yaml config
void add_command_line_option(boost::program_options::options_description_easy_init&,
const std::string_view&, const std::string_view&) override {}
void add_command_line_option(boost::program_options::options_description_easy_init&) override {}
};
log_legacy_value<seastar::log_level> default_log_level;

View File

@@ -743,7 +743,7 @@ future<mutation> query_partition_mutation(service::storage_proxy& proxy,
{
auto dk = dht::decorate_key(*s, pkey);
return do_with(dht::partition_range::make_singular(dk), [&proxy, dk, s = std::move(s), cmd = std::move(cmd)] (auto& range) {
return proxy.query_mutations_locally(s, std::move(cmd), range, db::no_timeout)
return proxy.query_mutations_locally(s, std::move(cmd), range, db::no_timeout, tracing::trace_state_ptr{})
.then([dk = std::move(dk), s](rpc::tuple<foreign_ptr<lw_shared_ptr<reconcilable_result>>, cache_temperature> res_hit_rate) {
auto&& [res, hit_rate] = res_hit_rate;
auto&& partitions = res->partitions();
@@ -778,7 +778,8 @@ read_schema_partition_for_table(distributed<service::storage_proxy>& proxy, sche
auto slice = partition_slice_builder(*schema)
.with_range(std::move(clustering_range))
.build();
auto cmd = make_lw_shared<query::read_command>(schema->id(), schema->version(), std::move(slice), query::max_rows);
auto cmd = make_lw_shared<query::read_command>(schema->id(), schema->version(), std::move(slice), proxy.local().get_max_result_size(slice),
query::row_limit(query::max_rows));
return query_partition_mutation(proxy.local(), std::move(schema), std::move(cmd), std::move(keyspace_key)).then([&proxy] (mutation mut) {
return redact_columns_for_missing_features(std::move(mut), proxy.local().get_db().local().features().cluster_schema_features());
});
@@ -788,7 +789,8 @@ future<mutation>
read_keyspace_mutation(distributed<service::storage_proxy>& proxy, const sstring& keyspace_name) {
schema_ptr s = keyspaces();
auto key = partition_key::from_singular(*s, keyspace_name);
auto cmd = make_lw_shared<query::read_command>(s->id(), s->version(), s->full_slice());
auto slice = s->full_slice();
auto cmd = make_lw_shared<query::read_command>(s->id(), s->version(), std::move(slice), proxy.local().get_max_result_size(slice));
return query_partition_mutation(proxy.local(), std::move(s), std::move(cmd), std::move(key));
}

View File

@@ -1984,8 +1984,7 @@ query_mutations(distributed<service::storage_proxy>& proxy, const sstring& ks_na
database& db = proxy.local().get_db().local();
schema_ptr schema = db.find_schema(ks_name, cf_name);
auto slice = partition_slice_builder(*schema).build();
auto cmd = make_lw_shared<query::read_command>(schema->id(), schema->version(),
std::move(slice), std::numeric_limits<uint32_t>::max());
auto cmd = make_lw_shared<query::read_command>(schema->id(), schema->version(), std::move(slice), proxy.local().get_max_result_size(slice));
return proxy.local().query_mutations_locally(std::move(schema), std::move(cmd), query::full_partition_range, db::no_timeout)
.then([] (rpc::tuple<foreign_ptr<lw_shared_ptr<reconcilable_result>>, cache_temperature> rr_ht) { return std::get<0>(std::move(rr_ht)); });
}
@@ -1995,8 +1994,7 @@ query(distributed<service::storage_proxy>& proxy, const sstring& ks_name, const
database& db = proxy.local().get_db().local();
schema_ptr schema = db.find_schema(ks_name, cf_name);
auto slice = partition_slice_builder(*schema).build();
auto cmd = make_lw_shared<query::read_command>(schema->id(), schema->version(),
std::move(slice), std::numeric_limits<uint32_t>::max());
auto cmd = make_lw_shared<query::read_command>(schema->id(), schema->version(), std::move(slice), proxy.local().get_max_result_size(slice));
return proxy.local().query(schema, cmd, {query::full_partition_range}, db::consistency_level::ONE,
{db::no_timeout, empty_service_permit(), service::client_state::for_internal_calls(), nullptr}).then([schema, cmd] (auto&& qr) {
return make_lw_shared(query::result_set::from_raw_result(schema, cmd->slice, *qr.query_result));
@@ -2011,7 +2009,7 @@ query(distributed<service::storage_proxy>& proxy, const sstring& ks_name, const
auto slice = partition_slice_builder(*schema)
.with_range(std::move(row_range))
.build();
auto cmd = make_lw_shared<query::read_command>(schema->id(), schema->version(), std::move(slice), query::max_rows);
auto cmd = make_lw_shared<query::read_command>(schema->id(), schema->version(), std::move(slice), proxy.local().get_max_result_size(slice));
return proxy.local().query(schema, cmd, {dht::partition_range::make_singular(key)}, db::consistency_level::ONE,
{db::no_timeout, empty_service_permit(), service::client_state::for_internal_calls(), nullptr}).then([schema, cmd] (auto&& qr) {

View File

@@ -1285,7 +1285,7 @@ view_builder::build_step& view_builder::get_or_create_build_step(utils::UUID bas
void view_builder::initialize_reader_at_current_token(build_step& step) {
step.pslice = make_partition_slice(*step.base->schema());
step.prange = dht::partition_range(dht::ring_position::starting_at(step.current_token()), dht::ring_position::max());
auto permit = _db.make_query_class_config().semaphore.make_permit();
auto permit = _db.get_reader_concurrency_semaphore().make_permit();
step.reader = make_local_shard_sstable_reader(
step.base->schema(),
std::move(permit),

View File

@@ -78,7 +78,7 @@ future<> view_update_generator::start() {
auto [staging_sstable_reader, staging_sstable_reader_handle] = make_manually_paused_evictable_reader(
std::move(ms),
s,
_db.make_query_class_config().semaphore.make_permit(),
_db.get_reader_concurrency_semaphore().make_permit(),
query::full_partition_range,
s->full_slice(),
service::get_local_streaming_priority(),

View File

@@ -60,14 +60,15 @@ void flat_mutation_reader::impl::clear_buffer_to_next_partition() {
_buffer_size = compute_buffer_size(*_schema, _buffer);
}
flat_mutation_reader make_reversing_reader(flat_mutation_reader& original, size_t max_memory_consumption) {
flat_mutation_reader make_reversing_reader(flat_mutation_reader& original, query::max_result_size max_size) {
class partition_reversing_mutation_reader final : public flat_mutation_reader::impl {
flat_mutation_reader* _source;
range_tombstone_list _range_tombstones;
std::stack<mutation_fragment> _mutation_fragments;
mutation_fragment_opt _partition_end;
size_t _stack_size = 0;
const size_t _max_stack_size;
const query::max_result_size _max_size;
bool _below_soft_limit = true;
private:
stop_iteration emit_partition() {
auto emit_range_tombstone = [&] {
@@ -119,7 +120,7 @@ flat_mutation_reader make_reversing_reader(flat_mutation_reader& original, size_
} else {
_mutation_fragments.emplace(std::move(mf));
_stack_size += _mutation_fragments.top().memory_usage(*_schema);
if (_stack_size >= _max_stack_size) {
if (_stack_size > _max_size.hard_limit || (_stack_size > _max_size.soft_limit && _below_soft_limit)) {
const partition_key* key = nullptr;
auto it = buffer().end();
--it;
@@ -129,21 +130,30 @@ flat_mutation_reader make_reversing_reader(flat_mutation_reader& original, size_
--it;
key = &it->as_partition_start().key().key();
}
throw std::runtime_error(fmt::format(
"Aborting reverse partition read because partition {} is larger than the maximum safe size of {} for reversible partitions.",
key->with_schema(*_schema),
_max_stack_size));
if (_stack_size > _max_size.hard_limit) {
throw std::runtime_error(fmt::format(
"Memory usage of reversed read exceeds hard limit of {} (configured via max_memory_for_unlimited_query_hard_limit), while reading partition {}",
_max_size.hard_limit,
key->with_schema(*_schema)));
} else {
fmr_logger.warn(
"Memory usage of reversed read exceeds soft limit of {} (configured via max_memory_for_unlimited_query_soft_limit), while reading partition {}",
_max_size.soft_limit,
key->with_schema(*_schema));
_below_soft_limit = false;
}
}
}
}
return make_ready_future<stop_iteration>(is_buffer_full());
}
public:
explicit partition_reversing_mutation_reader(flat_mutation_reader& mr, size_t max_stack_size)
explicit partition_reversing_mutation_reader(flat_mutation_reader& mr, query::max_result_size max_size)
: flat_mutation_reader::impl(mr.schema())
, _source(&mr)
, _range_tombstones(*_schema)
, _max_stack_size(max_stack_size)
, _max_size(max_size)
{ }
virtual future<> fill_buffer(db::timeout_clock::time_point timeout) override {
@@ -185,7 +195,7 @@ flat_mutation_reader make_reversing_reader(flat_mutation_reader& original, size_
}
};
return make_flat_mutation_reader<partition_reversing_mutation_reader>(original, max_memory_consumption);
return make_flat_mutation_reader<partition_reversing_mutation_reader>(original, max_size);
}
template<typename Source>

View File

@@ -29,6 +29,7 @@
#include "mutation_fragment.hh"
#include "tracing/trace_state.hh"
#include "mutation.hh"
#include "query_class_config.hh"
#include <seastar/core/thread.hh>
#include <seastar/core/file.hh>
@@ -720,15 +721,17 @@ make_generating_reader(schema_ptr s, std::function<future<mutation_fragment_opt>
///
/// \param original the reader to be reversed, has to be kept alive while the
/// reversing reader is in use.
/// \param max_memory_consumption the maximum amount of memory the reader is
/// allowed to use for reversing. The reverse reader reads entire partitions
/// into memory, before reversing them. Since partitions can be larger than
/// the available memory, we need to enforce a limit on memory consumption.
/// If the read uses more memory then this limit, the read is aborted.
/// \param max_size the maximum amount of memory the reader is allowed to use
/// for reversing and conversely the maximum size of the results. The
/// reverse reader reads entire partitions into memory, before reversing
/// them. Since partitions can be larger than the available memory, we need
/// to enforce a limit on memory consumption. When reaching the soft limit
/// a warning will be logged. When reaching the hard limit the read will be
/// aborted.
///
/// FIXME: reversing should be done in the sstable layer, see #1413.
flat_mutation_reader
make_reversing_reader(flat_mutation_reader& original, size_t max_memory_consumption);
make_reversing_reader(flat_mutation_reader& original, query::max_result_size max_size);
/// Low level fragment stream validator.
///

View File

@@ -40,6 +40,11 @@ class partition_slice {
uint32_t partition_row_limit() [[version 1.3]] = std::numeric_limits<uint32_t>::max();
};
struct max_result_size {
uint64_t soft_limit;
uint64_t hard_limit;
}
class read_command {
utils::UUID cf_id;
utils::UUID schema_version;
@@ -50,6 +55,7 @@ class read_command {
uint32_t partition_limit [[version 1.3]] = std::numeric_limits<uint32_t>::max();
utils::UUID query_uuid [[version 2.2]] = utils::UUID();
query::is_first_page is_first_page [[version 2.2]] = query::is_first_page::no;
std::optional<query::max_result_size> max_result_size [[version 4.3]] = std::nullopt;
};
}

View File

@@ -244,7 +244,7 @@ public:
virtual reader_concurrency_semaphore& semaphore() override {
const auto shard = this_shard_id();
if (!_semaphores[shard]) {
_semaphores[shard] = &_db.local().make_query_class_config().semaphore;
_semaphores[shard] = &_db.local().get_reader_concurrency_semaphore();
}
return *_semaphores[shard];
}
@@ -618,18 +618,17 @@ static future<reconcilable_result> do_query_mutations(
mutation_reader::forwarding fwd_mr) {
return make_multishard_combining_reader(ctx, std::move(s), pr, ps, pc, std::move(trace_state), fwd_mr);
});
auto class_config = ctx->db().local().make_query_class_config();
auto reader = make_flat_multi_range_reader(s, class_config.semaphore.make_permit(), std::move(ms), ranges, cmd.slice,
service::get_local_sstable_query_read_priority(), trace_state, mutation_reader::forwarding::no);
auto reader = make_flat_multi_range_reader(s, ctx->db().local().get_reader_concurrency_semaphore().make_permit(), std::move(ms), ranges,
cmd.slice, service::get_local_sstable_query_read_priority(), trace_state, mutation_reader::forwarding::no);
auto compaction_state = make_lw_shared<compact_for_mutation_query_state>(*s, cmd.timestamp, cmd.slice, cmd.row_limit,
cmd.partition_limit);
return do_with(std::move(reader), std::move(compaction_state), [&, class_config, accounter = std::move(accounter), timeout] (
return do_with(std::move(reader), std::move(compaction_state), [&, accounter = std::move(accounter), timeout] (
flat_mutation_reader& reader, lw_shared_ptr<compact_for_mutation_query_state>& compaction_state) mutable {
auto rrb = reconcilable_result_builder(*reader.schema(), cmd.slice, std::move(accounter));
return query::consume_page(reader, compaction_state, cmd.slice, std::move(rrb), cmd.row_limit, cmd.partition_limit, cmd.timestamp,
timeout, class_config.max_memory_for_unlimited_query).then([&] (consume_result&& result) mutable {
timeout, *cmd.max_result_size).then([&] (consume_result&& result) mutable {
return make_ready_future<page_consume_result>(page_consume_result(std::move(result), reader.detach_buffer(), std::move(compaction_state)));
});
}).then_wrapped([&ctx] (future<page_consume_result>&& result_fut) {
@@ -659,7 +658,6 @@ future<std::tuple<foreign_ptr<lw_shared_ptr<reconcilable_result>>, cache_tempera
const query::read_command& cmd,
const dht::partition_range_vector& ranges,
tracing::trace_state_ptr trace_state,
uint64_t max_size,
db::timeout_clock::time_point timeout) {
if (cmd.row_limit == 0 || cmd.slice.partition_row_limit() == 0 || cmd.partition_limit == 0) {
return make_ready_future<std::tuple<foreign_ptr<lw_shared_ptr<reconcilable_result>>, cache_temperature>>(
@@ -668,8 +666,9 @@ future<std::tuple<foreign_ptr<lw_shared_ptr<reconcilable_result>>, cache_tempera
db.local().find_column_family(s).get_global_cache_hit_rate()));
}
return db.local().get_result_memory_limiter().new_mutation_read(max_size).then([&, s = std::move(s), trace_state = std::move(trace_state),
timeout] (query::result_memory_accounter accounter) mutable {
const auto short_read_allwoed = query::short_read(cmd.slice.options.contains<query::partition_slice::option::allow_short_read>());
return db.local().get_result_memory_limiter().new_mutation_read(*cmd.max_result_size, short_read_allwoed).then([&, s = std::move(s),
trace_state = std::move(trace_state), timeout] (query::result_memory_accounter accounter) mutable {
return do_query_mutations(db, s, cmd, ranges, std::move(trace_state), timeout, std::move(accounter)).then_wrapped(
[&db, s = std::move(s)] (future<reconcilable_result>&& f) {
auto& local_db = db.local();

View File

@@ -67,5 +67,4 @@ future<std::tuple<foreign_ptr<lw_shared_ptr<reconcilable_result>>, cache_tempera
const query::read_command& cmd,
const dht::partition_range_vector& ranges,
tracing::trace_state_ptr trace_state,
uint64_t max_size,
db::timeout_clock::time_point timeout);

View File

@@ -122,20 +122,22 @@ mutation::query(query::result::builder& builder,
query::result
mutation::query(const query::partition_slice& slice,
query::result_memory_accounter&& accounter,
query::result_options opts,
gc_clock::time_point now, uint32_t row_limit) &&
{
query::result::builder builder(slice, opts, { });
query::result::builder builder(slice, opts, std::move(accounter));
std::move(*this).query(builder, slice, now, row_limit);
return builder.build();
}
query::result
mutation::query(const query::partition_slice& slice,
query::result_memory_accounter&& accounter,
query::result_options opts,
gc_clock::time_point now, uint32_t row_limit) const&
{
return mutation(*this).query(slice, opts, now, row_limit);
return mutation(*this).query(slice, std::move(accounter), opts, now, row_limit);
}
size_t

View File

@@ -108,6 +108,7 @@ public:
public:
// The supplied partition_slice must be governed by this mutation's schema
query::result query(const query::partition_slice&,
query::result_memory_accounter&& accounter,
query::result_options opts = query::result_options::only_result(),
gc_clock::time_point now = gc_clock::now(),
uint32_t row_limit = query::max_rows) &&;
@@ -115,6 +116,7 @@ public:
// The supplied partition_slice must be governed by this mutation's schema
// FIXME: Slower than the r-value version
query::result query(const query::partition_slice&,
query::result_memory_accounter&& accounter,
query::result_options opts = query::result_options::only_result(),
gc_clock::time_point now = gc_clock::now(),
uint32_t row_limit = query::max_rows) const&;

View File

@@ -1976,7 +1976,6 @@ class mutation_querier {
bool _live_data_in_static_row{};
uint32_t _live_clustering_rows = 0;
std::optional<ser::qr_partition__rows<bytes_ostream>> _rows_wr;
bool _short_reads_allowed;
private:
void query_static_row(const row& r, tombstone current_tombstone);
void prepare_writers();
@@ -1998,7 +1997,6 @@ mutation_querier::mutation_querier(const schema& s, query::result::partition_wri
, _memory_accounter(memory_accounter)
, _pw(std::move(pw))
, _static_cells_wr(pw.start().start_static_row().start_cells())
, _short_reads_allowed(pw.slice().options.contains<query::partition_slice::option::allow_short_read>())
{
}
@@ -2011,7 +2009,7 @@ void mutation_querier::query_static_row(const row& r, tombstone current_tombston
get_compacted_row_slice(_schema, slice, column_kind::static_column,
r, slice.static_columns, _static_cells_wr);
_memory_accounter.update(_static_cells_wr._out.size() - start);
} else if (_short_reads_allowed) {
} else {
seastar::measuring_output_stream stream;
ser::qr_partition__static_row__cells<seastar::measuring_output_stream> out(stream, { });
auto start = stream.size();
@@ -2075,7 +2073,7 @@ stop_iteration mutation_querier::consume(clustering_row&& cr, row_tombstone curr
auto start = _rows_wr->_out.size();
write_row(*_rows_wr);
stop = _memory_accounter.update_and_check(_rows_wr->_out.size() - start);
} else if (_short_reads_allowed) {
} else {
seastar::measuring_output_stream stream;
ser::qr_partition__rows<seastar::measuring_output_stream> out(stream, { });
auto start = stream.size();
@@ -2084,7 +2082,7 @@ stop_iteration mutation_querier::consume(clustering_row&& cr, row_tombstone curr
}
_live_clustering_rows++;
return stop && stop_iteration(_short_reads_allowed);
return stop;
}
uint32_t mutation_querier::consume_end_of_stream() {
@@ -2115,11 +2113,9 @@ class query_result_builder {
query::result::builder& _rb;
std::optional<mutation_querier> _mutation_consumer;
stop_iteration _stop;
stop_iteration _short_read_allowed;
public:
query_result_builder(const schema& s, query::result::builder& rb)
: _schema(s), _rb(rb)
, _short_read_allowed(_rb.slice().options.contains<query::partition_slice::option::allow_short_read>())
{ }
void consume_new_partition(const dht::decorated_key& dk) {
@@ -2130,21 +2126,21 @@ public:
_mutation_consumer->consume(t);
}
stop_iteration consume(static_row&& sr, tombstone t, bool) {
_stop = _mutation_consumer->consume(std::move(sr), t) && _short_read_allowed;
_stop = _mutation_consumer->consume(std::move(sr), t);
return _stop;
}
stop_iteration consume(clustering_row&& cr, row_tombstone t, bool) {
_stop = _mutation_consumer->consume(std::move(cr), t) && _short_read_allowed;
_stop = _mutation_consumer->consume(std::move(cr), t);
return _stop;
}
stop_iteration consume(range_tombstone&& rt) {
_stop = _mutation_consumer->consume(std::move(rt)) && _short_read_allowed;
_stop = _mutation_consumer->consume(std::move(rt));
return _stop;
}
stop_iteration consume_end_of_partition() {
auto live_rows_in_partition = _mutation_consumer->consume_end_of_stream();
if (_short_read_allowed && live_rows_in_partition > 0 && !_stop) {
if (live_rows_in_partition > 0 && !_stop) {
_stop = _rb.memory_accounter().check();
}
if (_stop) {
@@ -2167,7 +2163,7 @@ future<> data_query(
gc_clock::time_point query_time,
query::result::builder& builder,
db::timeout_clock::time_point timeout,
query_class_config class_config,
query::query_class_config class_config,
tracing::trace_state_ptr trace_ptr,
query::querier_cache_context cache_ctx)
{
@@ -2191,6 +2187,24 @@ future<> data_query(
});
}
stop_iteration query::result_memory_accounter::check_local_limit() const {
if (_total_used_memory > _maximum_result_size.hard_limit) {
if (_short_read_allowed) {
return stop_iteration::yes;
}
throw std::runtime_error(fmt::format(
"Memory usage of unpaged query exceeds hard limit of {} (configured via max_memory_for_unlimited_query_hard_limit)",
_maximum_result_size.hard_limit));
}
if (_below_soft_limit && !_short_read_allowed && _total_used_memory > _maximum_result_size.soft_limit) {
mplog.warn(
"Memory usage of unpaged query exceeds soft limit of {} (configured via max_memory_for_unlimited_query_soft_limit)",
_maximum_result_size.soft_limit);
_below_soft_limit = false;
}
return stop_iteration::no;
}
void reconcilable_result_builder::consume_new_partition(const dht::decorated_key& dk) {
_return_static_content_on_partition_with_no_rows =
_slice.options.contains(query::partition_slice::option::always_return_static_content) ||
@@ -2220,7 +2234,7 @@ stop_iteration reconcilable_result_builder::consume(clustering_row&& cr, row_tom
// guarantee progress, not ending the result on a live row would
// mean that the next page fetch will read all tombstones after the
// last live row again.
_stop = stop && stop_iteration(_short_read_allowed);
_stop = stop;
}
return _mutation_consumer->consume(std::move(cr)) || _stop;
}
@@ -2239,7 +2253,7 @@ stop_iteration reconcilable_result_builder::consume_end_of_partition() {
// well. Next page fetch will ask for the next partition and if we
// don't do that we could end up with an unbounded number of
// partitions with only a static row.
_stop = _stop || (_memory_accounter.check() && stop_iteration(_short_read_allowed));
_stop = _stop || _memory_accounter.check();
}
_total_live_rows += _live_rows;
_result.emplace_back(partition { _live_rows, _mutation_consumer->consume_end_of_stream() });
@@ -2261,7 +2275,7 @@ static do_mutation_query(schema_ptr s,
uint32_t partition_limit,
gc_clock::time_point query_time,
db::timeout_clock::time_point timeout,
query_class_config class_config,
query::query_class_config class_config,
query::result_memory_accounter&& accounter,
tracing::trace_state_ptr trace_ptr,
query::querier_cache_context cache_ctx)
@@ -2301,7 +2315,7 @@ mutation_query(schema_ptr s,
uint32_t partition_limit,
gc_clock::time_point query_time,
db::timeout_clock::time_point timeout,
query_class_config class_config,
query::query_class_config class_config,
query::result_memory_accounter&& accounter,
tracing::trace_state_ptr trace_ptr,
query::querier_cache_context cache_ctx)

View File

@@ -58,7 +58,8 @@ bool reconcilable_result::operator!=(const reconcilable_result& other) const {
query::result
to_data_query_result(const reconcilable_result& r, schema_ptr s, const query::partition_slice& slice, uint32_t max_rows, uint32_t max_partitions, query::result_options opts) {
query::result::builder builder(slice, opts, { });
// This result was already built with a limit, don't apply another one.
query::result::builder builder(slice, opts, query::result_memory_accounter{ query::result_memory_limiter::unlimited_result_size });
for (const partition& p : r.partitions()) {
if (builder.row_count() >= max_rows || builder.partition_count() >= max_partitions) {
break;

View File

@@ -122,14 +122,12 @@ class reconcilable_result_builder {
uint32_t _total_live_rows = 0;
query::result_memory_accounter _memory_accounter;
stop_iteration _stop;
bool _short_read_allowed;
std::optional<streamed_mutation_freezer> _mutation_consumer;
public:
reconcilable_result_builder(const schema& s, const query::partition_slice& slice,
query::result_memory_accounter&& accounter)
: _schema(s), _slice(slice)
, _memory_accounter(std::move(accounter))
, _short_read_allowed(slice.options.contains<query::partition_slice::option::allow_short_read>())
{ }
void consume_new_partition(const dht::decorated_key& dk);
@@ -163,8 +161,8 @@ future<reconcilable_result> mutation_query(
uint32_t partition_limit,
gc_clock::time_point query_time,
db::timeout_clock::time_point timeout,
query_class_config class_config,
query::result_memory_accounter&& accounter = { },
query::query_class_config class_config,
query::result_memory_accounter&& accounter,
tracing::trace_state_ptr trace_ptr = nullptr,
query::querier_cache_context cache_ctx = { });
@@ -178,7 +176,7 @@ future<> data_query(
gc_clock::time_point query_time,
query::result::builder& builder,
db::timeout_clock::time_point timeout,
query_class_config class_config,
query::query_class_config class_config,
tracing::trace_state_ptr trace_ptr = nullptr,
query::querier_cache_context cache_ctx = { });
@@ -193,7 +191,7 @@ class mutation_query_stage {
uint32_t,
gc_clock::time_point,
db::timeout_clock::time_point,
query_class_config,
query::query_class_config,
query::result_memory_accounter&&,
tracing::trace_state_ptr,
query::querier_cache_context> _execution_stage;

View File

@@ -54,6 +54,11 @@ public:
partition_slice_builder& without_partition_key_columns();
partition_slice_builder& without_clustering_key_columns();
partition_slice_builder& reversed();
template <query::partition_slice::option OPTION>
partition_slice_builder& with_option() {
_options.set<OPTION>();
return *this;
}
query::partition_slice build();
};

View File

@@ -83,7 +83,7 @@ auto consume_page(flat_mutation_reader& reader,
uint32_t partition_limit,
gc_clock::time_point query_time,
db::timeout_clock::time_point timeout,
size_t reverse_read_max_memory) {
query::max_result_size max_size) {
return reader.peek(timeout).then([=, &reader, consumer = std::move(consumer), &slice] (
mutation_fragment* next_fragment) mutable {
const auto next_fragment_kind = next_fragment ? next_fragment->mutation_fragment_kind() : mutation_fragment::kind::partition_end;
@@ -94,9 +94,9 @@ auto consume_page(flat_mutation_reader& reader,
compaction_state,
clustering_position_tracker(std::move(consumer), last_ckey));
auto consume = [&reader, &slice, reader_consumer = std::move(reader_consumer), timeout, reverse_read_max_memory] () mutable {
auto consume = [&reader, &slice, reader_consumer = std::move(reader_consumer), timeout, max_size] () mutable {
if (slice.options.contains(query::partition_slice::option::reversed)) {
return do_with(make_reversing_reader(reader, reverse_read_max_memory),
return do_with(make_reversing_reader(reader, max_size),
[reader_consumer = std::move(reader_consumer), timeout] (flat_mutation_reader& reversing_reader) mutable {
return reversing_reader.consume(std::move(reader_consumer), timeout);
});
@@ -223,9 +223,9 @@ public:
uint32_t partition_limit,
gc_clock::time_point query_time,
db::timeout_clock::time_point timeout,
size_t reverse_read_max_memory) {
query::max_result_size max_size) {
return ::query::consume_page(_reader, _compaction_state, *_slice, std::move(consumer), row_limit, partition_limit, query_time,
timeout, reverse_read_max_memory).then([this] (auto&& results) {
timeout, max_size).then([this] (auto&& results) {
_last_ckey = std::get<std::optional<clustering_key>>(std::move(results));
constexpr auto size = std::tuple_size<std::decay_t<decltype(results)>>::value;
static_assert(size <= 2);

View File

@@ -30,6 +30,7 @@
#include "range.hh"
#include "tracing/tracing.hh"
#include "utils/small_vector.hh"
#include "query_class_config.hh"
class position_in_partition_view;
@@ -206,6 +207,10 @@ public:
constexpr auto max_partitions = std::numeric_limits<uint32_t>::max();
// Tagged integers to disambiguate constructor arguments.
enum class row_limit : uint32_t { max = max_rows };
enum class partition_limit : uint32_t { max = max_partitions };
using is_first_page = bool_class<class is_first_page_tag>;
// Full specification of a query to the database.
@@ -233,18 +238,23 @@ public:
// to avoid doing work normally done on paged requests, e.g. attempting to
// reused suspended readers.
query::is_first_page is_first_page;
// The maximum size of the query result, for all queries.
// We use the entire value range, so we need an optional for the case when
// the remote doesn't send it.
std::optional<query::max_result_size> max_result_size;
api::timestamp_type read_timestamp; // not serialized
public:
// IDL constructor
read_command(utils::UUID cf_id,
table_schema_version schema_version,
partition_slice slice,
uint32_t row_limit = max_rows,
gc_clock::time_point now = gc_clock::now(),
std::optional<tracing::trace_info> ti = std::nullopt,
uint32_t partition_limit = max_partitions,
utils::UUID query_uuid = utils::UUID(),
query::is_first_page is_first_page = is_first_page::no,
api::timestamp_type rt = api::new_timestamp())
uint32_t row_limit,
gc_clock::time_point now,
std::optional<tracing::trace_info> ti,
uint32_t partition_limit,
utils::UUID query_uuid,
query::is_first_page is_first_page,
std::optional<query::max_result_size> max_result_size)
: cf_id(std::move(cf_id))
, schema_version(std::move(schema_version))
, slice(std::move(slice))
@@ -254,6 +264,31 @@ public:
, partition_limit(partition_limit)
, query_uuid(query_uuid)
, is_first_page(is_first_page)
, max_result_size(max_result_size)
, read_timestamp(api::new_timestamp())
{ }
// Convenience constructor for commands built locally, where the result-size
// limits are known up front: max_result_size is mandatory, while the row and
// partition limits default to unlimited via the tagged enum maxima.
// NOTE(review): presumably the coordinator-side counterpart of the IDL
// constructor above — confirm against callers.
read_command(utils::UUID cf_id,
table_schema_version schema_version,
partition_slice slice,
query::max_result_size max_result_size,
query::row_limit row_limit = query::row_limit::max,
query::partition_limit partition_limit = query::partition_limit::max,
gc_clock::time_point now = gc_clock::now(),
std::optional<tracing::trace_info> ti = std::nullopt,
utils::UUID query_uuid = utils::UUID(),
query::is_first_page is_first_page = query::is_first_page::no,
api::timestamp_type rt = api::new_timestamp())
: cf_id(std::move(cf_id))
, schema_version(std::move(schema_version))
, slice(std::move(slice))
, row_limit(static_cast<uint32_t>(row_limit)) // tagged enum -> raw serialized field
, timestamp(now)
, trace_info(std::move(ti))
, partition_limit(static_cast<uint32_t>(partition_limit)) // tagged enum -> raw serialized field
, query_uuid(query_uuid)
, is_first_page(is_first_page)
, max_result_size(max_result_size)
, read_timestamp(rt)
{ }

View File

@@ -225,7 +225,7 @@ result_set::from_raw_result(schema_ptr s, const partition_slice& slice, const re
result_set::result_set(const mutation& m) : result_set([&m] {
auto slice = partition_slice_builder(*m.schema()).build();
auto qr = mutation(m).query(slice, result_options::only_result());
auto qr = mutation(m).query(slice, query::result_memory_accounter{ query::result_memory_limiter::unlimited_result_size }, result_options::only_result());
return result_set::from_raw_result(m.schema(), slice, qr);
}())
{ }

View File

@@ -30,6 +30,9 @@
namespace query {
struct short_read_tag { };
using short_read = bool_class<short_read_tag>;
// result_memory_limiter, result_memory_accounter and result_memory_tracker
// form an infrastructure for limiting size of query results.
//
@@ -51,6 +54,7 @@ class result_memory_limiter {
public:
static constexpr size_t minimum_result_size = 4 * 1024;
static constexpr size_t maximum_result_size = 1 * 1024 * 1024;
static constexpr size_t unlimited_result_size = std::numeric_limits<size_t>::max();
public:
explicit result_memory_limiter(size_t maximum_total_result_memory)
: _maximum_total_result_memory(maximum_total_result_memory)
@@ -67,18 +71,18 @@ public:
// Reserves minimum_result_size and creates new memory accounter for
// mutation query. Uses the specified maximum result size and may be
// stopped before reaching it due to memory pressure on shard.
future<result_memory_accounter> new_mutation_read(size_t max_result_size);
future<result_memory_accounter> new_mutation_read(query::max_result_size max_result_size, short_read short_read_allowed);
// Reserves minimum_result_size and creates new memory accounter for
// data query. Uses the specified maximum result size, result will *not*
// be stopped due to on shard memory pressure in order to avoid digest
// mismatches.
future<result_memory_accounter> new_data_read(size_t max_result_size);
future<result_memory_accounter> new_data_read(query::max_result_size max_result_size, short_read short_read_allowed);
// Creates a memory accounter for digest reads. Such accounter doesn't
// contribute to the shard memory usage, but still stops producing the
// result after individual limit has been reached.
future<result_memory_accounter> new_digest_read(size_t max_result_size);
future<result_memory_accounter> new_digest_read(query::max_result_size max_result_size, short_read short_read_allowed);
// Checks whether the result can grow any more, takes into account only
// the per shard limit.
@@ -118,40 +122,50 @@ class result_memory_accounter {
size_t _blocked_bytes = 0;
size_t _used_memory = 0;
size_t _total_used_memory = 0;
size_t _maximum_result_size = 0;
query::max_result_size _maximum_result_size;
stop_iteration _stop_on_global_limit;
short_read _short_read_allowed;
mutable bool _below_soft_limit = true;
private:
// Mutation query accounter. Uses provided individual result size limit and
// will stop when shard memory pressure grows too high.
struct mutation_query_tag { };
explicit result_memory_accounter(mutation_query_tag, result_memory_limiter& limiter, size_t max_size) noexcept
explicit result_memory_accounter(mutation_query_tag, result_memory_limiter& limiter, query::max_result_size max_size, short_read short_read_allowed) noexcept
: _limiter(&limiter)
, _blocked_bytes(result_memory_limiter::minimum_result_size)
, _maximum_result_size(max_size)
, _stop_on_global_limit(true)
, _short_read_allowed(short_read_allowed)
{ }
// Data query accounter. Uses provided individual result size limit and
// will *not* stop even though shard memory pressure grows too high.
struct data_query_tag { };
explicit result_memory_accounter(data_query_tag, result_memory_limiter& limiter, size_t max_size) noexcept
explicit result_memory_accounter(data_query_tag, result_memory_limiter& limiter, query::max_result_size max_size, short_read short_read_allowed) noexcept
: _limiter(&limiter)
, _blocked_bytes(result_memory_limiter::minimum_result_size)
, _maximum_result_size(max_size)
, _short_read_allowed(short_read_allowed)
{ }
// Digest query accounter. Uses provided individual result size limit and
// will *not* stop even though shard memory pressure grows too high. This
// accounter does not contribute to the shard memory limits.
struct digest_query_tag { };
explicit result_memory_accounter(digest_query_tag, result_memory_limiter&, size_t max_size) noexcept
explicit result_memory_accounter(digest_query_tag, result_memory_limiter&, query::max_result_size max_size, short_read short_read_allowed) noexcept
: _blocked_bytes(0)
, _maximum_result_size(max_size)
, _short_read_allowed(short_read_allowed)
{ }
stop_iteration check_local_limit() const;
friend class result_memory_limiter;
public:
result_memory_accounter() = default;
explicit result_memory_accounter(size_t max_size) noexcept
: _blocked_bytes(0)
, _maximum_result_size(max_size) {
}
result_memory_accounter(result_memory_accounter&& other) noexcept
: _limiter(std::exchange(other._limiter, nullptr))
@@ -160,6 +174,8 @@ public:
, _total_used_memory(other._total_used_memory)
, _maximum_result_size(other._maximum_result_size)
, _stop_on_global_limit(other._stop_on_global_limit)
, _short_read_allowed(other._short_read_allowed)
, _below_soft_limit(other._below_soft_limit)
{ }
result_memory_accounter& operator=(result_memory_accounter&& other) noexcept {
@@ -184,7 +200,7 @@ public:
stop_iteration update_and_check(size_t n) {
_used_memory += n;
_total_used_memory += n;
auto stop = stop_iteration(_total_used_memory > _maximum_result_size);
auto stop = check_local_limit();
if (_limiter && _used_memory > _blocked_bytes) {
auto to_block = std::min(_used_memory - _blocked_bytes, n);
_blocked_bytes += to_block;
@@ -195,7 +211,7 @@ public:
// Checks whether the result can grow any more.
stop_iteration check() const {
stop_iteration stop { _total_used_memory > result_memory_limiter::maximum_result_size };
auto stop = check_local_limit();
if (!stop && _used_memory >= _blocked_bytes && _limiter) {
return _limiter->check() && _stop_on_global_limit;
}
@@ -216,20 +232,20 @@ public:
}
};
inline future<result_memory_accounter> result_memory_limiter::new_mutation_read(size_t max_size) {
return _memory_limiter.wait(minimum_result_size).then([this, max_size] {
return result_memory_accounter(result_memory_accounter::mutation_query_tag(), *this, max_size);
inline future<result_memory_accounter> result_memory_limiter::new_mutation_read(query::max_result_size max_size, short_read short_read_allowed) {
return _memory_limiter.wait(minimum_result_size).then([this, max_size, short_read_allowed] {
return result_memory_accounter(result_memory_accounter::mutation_query_tag(), *this, max_size, short_read_allowed);
});
}
inline future<result_memory_accounter> result_memory_limiter::new_data_read(size_t max_size) {
return _memory_limiter.wait(minimum_result_size).then([this, max_size] {
return result_memory_accounter(result_memory_accounter::data_query_tag(), *this, max_size);
inline future<result_memory_accounter> result_memory_limiter::new_data_read(query::max_result_size max_size, short_read short_read_allowed) {
return _memory_limiter.wait(minimum_result_size).then([this, max_size, short_read_allowed] {
return result_memory_accounter(result_memory_accounter::data_query_tag(), *this, max_size, short_read_allowed);
});
}
inline future<result_memory_accounter> result_memory_limiter::new_digest_read(size_t max_size) {
return make_ready_future<result_memory_accounter>(result_memory_accounter(result_memory_accounter::digest_query_tag(), *this, max_size));
inline future<result_memory_accounter> result_memory_limiter::new_digest_read(query::max_result_size max_size, short_read short_read_allowed) {
return make_ready_future<result_memory_accounter>(result_memory_accounter(result_memory_accounter::digest_query_tag(), *this, max_size, short_read_allowed));
}
enum class result_request {
@@ -304,9 +320,6 @@ public:
// - query-result-reader.hh
// - query-result-writer.hh
struct short_read_tag { };
using short_read = bool_class<short_read_tag>;
class result {
bytes_ostream _w;
std::optional<result_digest> _digest;

View File

@@ -34,6 +34,7 @@ namespace query {
constexpr size_t result_memory_limiter::minimum_result_size;
constexpr size_t result_memory_limiter::maximum_result_size;
constexpr size_t result_memory_limiter::unlimited_result_size;
thread_local semaphore result_memory_tracker::_dummy { 0 };

View File

@@ -25,7 +25,20 @@
class reader_concurrency_semaphore;
namespace query {
struct max_result_size {
    // Byte limits applied to a query result: crossing soft_limit is expected
    // to produce a warning, crossing hard_limit aborts the query.
    uint64_t soft_limit = 0;
    uint64_t hard_limit = 0;

    max_result_size() = default;
    // Set both limits independently.
    explicit max_result_size(uint64_t soft_limit, uint64_t hard_limit) : soft_limit(soft_limit), hard_limit(hard_limit) { }
    // A single size acts as both the soft and the hard limit.
    explicit max_result_size(uint64_t max_size) : max_result_size(max_size, max_size) { }
};
struct query_class_config {
reader_concurrency_semaphore& semaphore;
uint64_t max_memory_for_unlimited_query;
max_result_size max_memory_for_unlimited_query;
};
}

View File

@@ -75,7 +75,8 @@ public:
future<lw_shared_ptr<strings_result>> read_strings(service::storage_proxy& proxy, const redis_options& options, const bytes& key, service_permit permit) {
auto schema = get_schema(proxy, options.get_keyspace_name(), redis::STRINGs);
auto ps = partition_slice_builder(*schema).build();
query::read_command cmd(schema->id(), schema->version(), ps, 1, gc_clock::now(), std::nullopt, 1);
const auto max_result_size = proxy.get_max_result_size(ps);
query::read_command cmd(schema->id(), schema->version(), ps, 1, gc_clock::now(), std::nullopt, 1, utils::UUID(), query::is_first_page::no, max_result_size);
auto pkey = partition_key::from_single_value(*schema, key);
auto partition_range = dht::partition_range::make_singular(dht::decorate_key(*schema, std::move(pkey)));
dht::partition_range_vector partition_ranges;

View File

@@ -84,6 +84,15 @@ static bool has_clustering_keys(const schema& s, const query::read_command& cmd)
future<service::storage_proxy::coordinator_query_result> query_pager::do_fetch_page(uint32_t page_size, gc_clock::time_point now, db::timeout_clock::time_point timeout) {
auto state = _options.get_paging_state();
auto& proxy = get_local_storage_proxy();
// Most callers should set this but we want to make sure, as results
// won't be paged without it.
_cmd->slice.options.set<query::partition_slice::option::allow_short_read>();
// Override this, to make sure we use the value appropriate for paging
// (with allow_short_read set).
_cmd->max_result_size = proxy.get_max_result_size(_cmd->slice);
if (!_last_pkey && state) {
_max = state->get_remaining();
_last_pkey = state->get_partition_key();
@@ -187,7 +196,7 @@ static bool has_clustering_keys(const schema& s, const query::read_command& cmd)
auto ranges = _ranges;
auto command = ::make_lw_shared<query::read_command>(*_cmd);
return get_local_storage_proxy().query(_schema,
return proxy.query(_schema,
std::move(command),
std::move(ranges),
_options.get_consistency(),

View File

@@ -81,7 +81,7 @@ future<prepare_response> paxos_state::prepare(tracing::trace_state_ptr tr_state,
[tr_state, schema, &cmd, only_digest, da, timeout] (const dht::partition_range_vector& prv) {
return get_local_storage_proxy().get_db().local().query(schema, cmd,
{only_digest ? query::result_request::only_digest : query::result_request::result_and_digest, da},
prv, tr_state, query::result_memory_limiter::maximum_result_size, timeout);
prv, tr_state, timeout);
});
});
return when_all(std::move(f1), std::move(f2)).then([state = std::move(state), only_digest] (auto t) {

View File

@@ -1316,6 +1316,21 @@ endpoints_to_replica_ids(locator::token_metadata& tm, const std::vector<gms::ine
return replica_ids;
}
// Selects the result-size limits for a query based on its slice options.
// Paged, non-reversed queries get the usual per-result maximum; queries that
// can consume unbounded memory (unpaged or reversed) get the configured
// soft/hard limits — but only for user reads, internal reads stay unlimited.
query::max_result_size storage_proxy::get_max_result_size(const query::partition_slice& slice) const {
    const bool paged = slice.options.contains<query::partition_slice::option::allow_short_read>();
    const bool reversed = slice.options.contains<query::partition_slice::option::reversed>();
    if (paged && !reversed) {
        // Normal paged query: the standard per-result ceiling applies.
        return query::max_result_size(query::result_memory_limiter::maximum_result_size);
    }
    auto& db = _db.local();
    if (current_scheduling_group() != db.get_statement_scheduling_group()) {
        // Internal read: trusted, no limit imposed here.
        return query::max_result_size(query::result_memory_limiter::unlimited_result_size);
    }
    // User-issued unpaged/reverse read: apply the configurable limits.
    return query::max_result_size(db.get_config().max_memory_for_unlimited_query_soft_limit(),
            db.get_config().max_memory_for_unlimited_query_hard_limit());
}
// Returns true when write backpressure should kick in: either the background
// write backlog exceeds the configured threshold, or the queued write bytes
// exceed a fixed 6 MiB cap.
bool storage_proxy::need_throttle_writes() const {
    if (get_global_stats().background_write_bytes > _background_write_throttle_threahsold) {
        return true;
    }
    return get_global_stats().queued_write_bytes > 6 * 1024 * 1024;
}
@@ -3857,8 +3872,8 @@ db::read_repair_decision storage_proxy::new_read_repair_decision(const schema& s
}
future<rpc::tuple<query::result_digest, api::timestamp_type, cache_temperature>>
storage_proxy::query_result_local_digest(schema_ptr s, lw_shared_ptr<query::read_command> cmd, const dht::partition_range& pr, tracing::trace_state_ptr trace_state, storage_proxy::clock_type::time_point timeout, query::digest_algorithm da, uint64_t max_size) {
return query_result_local(std::move(s), std::move(cmd), pr, query::result_options::only_digest(da), std::move(trace_state), timeout, max_size).then([] (rpc::tuple<foreign_ptr<lw_shared_ptr<query::result>>, cache_temperature> result_and_hit_rate) {
storage_proxy::query_result_local_digest(schema_ptr s, lw_shared_ptr<query::read_command> cmd, const dht::partition_range& pr, tracing::trace_state_ptr trace_state, storage_proxy::clock_type::time_point timeout, query::digest_algorithm da) {
return query_result_local(std::move(s), std::move(cmd), pr, query::result_options::only_digest(da), std::move(trace_state), timeout).then([] (rpc::tuple<foreign_ptr<lw_shared_ptr<query::result>>, cache_temperature> result_and_hit_rate) {
auto&& [result, hit_rate] = result_and_hit_rate;
return make_ready_future<rpc::tuple<query::result_digest, api::timestamp_type, cache_temperature>>(rpc::tuple(*result->digest(), result->last_modified(), hit_rate));
});
@@ -3866,15 +3881,15 @@ storage_proxy::query_result_local_digest(schema_ptr s, lw_shared_ptr<query::read
future<rpc::tuple<foreign_ptr<lw_shared_ptr<query::result>>, cache_temperature>>
storage_proxy::query_result_local(schema_ptr s, lw_shared_ptr<query::read_command> cmd, const dht::partition_range& pr, query::result_options opts,
tracing::trace_state_ptr trace_state, storage_proxy::clock_type::time_point timeout, uint64_t max_size) {
tracing::trace_state_ptr trace_state, storage_proxy::clock_type::time_point timeout) {
cmd->slice.options.set_if<query::partition_slice::option::with_digest>(opts.request != query::result_request::only_result);
if (pr.is_singular()) {
unsigned shard = dht::shard_of(*s, pr.start()->value().token());
get_stats().replica_cross_shard_ops += shard != this_shard_id();
return _db.invoke_on(shard, _read_smp_service_group, [max_size, gs = global_schema_ptr(s), prv = dht::partition_range_vector({pr}) /* FIXME: pr is copied */, cmd, opts, timeout, gt = tracing::global_trace_state_ptr(std::move(trace_state))] (database& db) mutable {
return _db.invoke_on(shard, _read_smp_service_group, [gs = global_schema_ptr(s), prv = dht::partition_range_vector({pr}) /* FIXME: pr is copied */, cmd, opts, timeout, gt = tracing::global_trace_state_ptr(std::move(trace_state))] (database& db) mutable {
auto trace_state = gt.get();
tracing::trace(trace_state, "Start querying the token range that starts with {}", seastar::value_of([&prv] { return prv.begin()->start()->value().token(); }));
return db.query(gs, *cmd, opts, prv, trace_state, max_size, timeout).then([trace_state](std::tuple<lw_shared_ptr<query::result>, cache_temperature>&& f_ht) {
return db.query(gs, *cmd, opts, prv, trace_state, timeout).then([trace_state](std::tuple<lw_shared_ptr<query::result>, cache_temperature>&& f_ht) {
auto&& [f, ht] = f_ht;
tracing::trace(trace_state, "Querying is done");
return make_ready_future<rpc::tuple<foreign_ptr<lw_shared_ptr<query::result>>, cache_temperature>>(rpc::tuple(make_foreign(std::move(f)), ht));
@@ -3882,7 +3897,7 @@ storage_proxy::query_result_local(schema_ptr s, lw_shared_ptr<query::read_comman
});
} else {
// FIXME: adjust multishard_mutation_query to accept an smp_service_group and propagate it there
return query_nonsingular_mutations_locally(s, cmd, {pr}, std::move(trace_state), max_size, timeout).then([s, cmd, opts] (rpc::tuple<foreign_ptr<lw_shared_ptr<reconcilable_result>>, cache_temperature>&& r_ht) {
return query_nonsingular_mutations_locally(s, cmd, {pr}, std::move(trace_state), timeout).then([s, cmd, opts] (rpc::tuple<foreign_ptr<lw_shared_ptr<reconcilable_result>>, cache_temperature>&& r_ht) {
auto&& [r, ht] = r_ht;
return make_ready_future<rpc::tuple<foreign_ptr<lw_shared_ptr<query::result>>, cache_temperature>>(
rpc::tuple(::make_foreign(::make_lw_shared(to_data_query_result(*r, s, cmd->slice, cmd->row_limit, cmd->partition_limit, opts))), ht));
@@ -4330,7 +4345,8 @@ static lw_shared_ptr<query::read_command> read_nothing_read_command(schema_ptr s
// Note that because this read-nothing command has an empty slice,
// storage_proxy::query() returns immediately - without any networking.
auto partition_slice = query::partition_slice({}, {}, {}, query::partition_slice::option_set());
return ::make_lw_shared<query::read_command>(schema->id(), schema->version(), partition_slice, query::max_partitions);
return ::make_lw_shared<query::read_command>(schema->id(), schema->version(), partition_slice,
query::max_result_size(query::result_memory_limiter::unlimited_result_size));
}
static read_timeout_exception write_timeout_to_read(schema_ptr s, mutation_write_timeout_exception& ex) {
@@ -4920,11 +4936,13 @@ void storage_proxy::init_messaging_service() {
tracing::trace(trace_state_ptr, "read_data: message received from /{}", src_addr.addr);
}
auto da = oda.value_or(query::digest_algorithm::MD5);
auto max_size = cinfo.retrieve_auxiliary<uint64_t>("max_result_size");
return do_with(std::move(pr), get_local_shared_storage_proxy(), std::move(trace_state_ptr), [&cinfo, cmd = make_lw_shared<query::read_command>(std::move(cmd)), src_addr = std::move(src_addr), da, max_size, t] (::compat::wrapping_partition_range& pr, shared_ptr<storage_proxy>& p, tracing::trace_state_ptr& trace_state_ptr) mutable {
if (!cmd.max_result_size) {
cmd.max_result_size.emplace(cinfo.retrieve_auxiliary<uint64_t>("max_result_size"));
}
return do_with(std::move(pr), get_local_shared_storage_proxy(), std::move(trace_state_ptr), [&cinfo, cmd = make_lw_shared<query::read_command>(std::move(cmd)), src_addr = std::move(src_addr), da, t] (::compat::wrapping_partition_range& pr, shared_ptr<storage_proxy>& p, tracing::trace_state_ptr& trace_state_ptr) mutable {
p->get_stats().replica_data_reads++;
auto src_ip = src_addr.addr;
return get_schema_for_read(cmd->schema_version, std::move(src_addr)).then([cmd, da, &pr, &p, &trace_state_ptr, max_size, t] (schema_ptr s) {
return get_schema_for_read(cmd->schema_version, std::move(src_addr)).then([cmd, da, &pr, &p, &trace_state_ptr, t] (schema_ptr s) {
auto pr2 = ::compat::unwrap(std::move(pr), *s);
if (pr2.second) {
// this function assumes singular queries but doesn't validate
@@ -4934,7 +4952,7 @@ void storage_proxy::init_messaging_service() {
opts.digest_algo = da;
opts.request = da == query::digest_algorithm::none ? query::result_request::only_result : query::result_request::result_and_digest;
auto timeout = t ? *t : db::no_timeout;
return p->query_result_local(std::move(s), cmd, std::move(pr2.first), opts, trace_state_ptr, timeout, max_size);
return p->query_result_local(std::move(s), cmd, std::move(pr2.first), opts, trace_state_ptr, timeout);
}).finally([&trace_state_ptr, src_ip] () mutable {
tracing::trace(trace_state_ptr, "read_data handling is done, sending a response to /{}", src_ip);
});
@@ -4948,22 +4966,24 @@ void storage_proxy::init_messaging_service() {
tracing::begin(trace_state_ptr);
tracing::trace(trace_state_ptr, "read_mutation_data: message received from /{}", src_addr.addr);
}
auto max_size = cinfo.retrieve_auxiliary<uint64_t>("max_result_size");
if (!cmd.max_result_size) {
cmd.max_result_size.emplace(cinfo.retrieve_auxiliary<uint64_t>("max_result_size"));
}
return do_with(std::move(pr),
get_local_shared_storage_proxy(),
std::move(trace_state_ptr),
::compat::one_or_two_partition_ranges({}),
[&cinfo, cmd = make_lw_shared<query::read_command>(std::move(cmd)), src_addr = std::move(src_addr), max_size, t] (
[&cinfo, cmd = make_lw_shared<query::read_command>(std::move(cmd)), src_addr = std::move(src_addr), t] (
::compat::wrapping_partition_range& pr,
shared_ptr<storage_proxy>& p,
tracing::trace_state_ptr& trace_state_ptr,
::compat::one_or_two_partition_ranges& unwrapped) mutable {
p->get_stats().replica_mutation_data_reads++;
auto src_ip = src_addr.addr;
return get_schema_for_read(cmd->schema_version, std::move(src_addr)).then([cmd, &pr, &p, &trace_state_ptr, max_size, &unwrapped, t] (schema_ptr s) mutable {
return get_schema_for_read(cmd->schema_version, std::move(src_addr)).then([cmd, &pr, &p, &trace_state_ptr, &unwrapped, t] (schema_ptr s) mutable {
unwrapped = ::compat::unwrap(std::move(pr), *s);
auto timeout = t ? *t : db::no_timeout;
return p->query_mutations_locally(std::move(s), std::move(cmd), unwrapped, timeout, trace_state_ptr, max_size);
return p->query_mutations_locally(std::move(s), std::move(cmd), unwrapped, timeout, trace_state_ptr);
}).finally([&trace_state_ptr, src_ip] () mutable {
tracing::trace(trace_state_ptr, "read_mutation_data handling is done, sending a response to /{}", src_ip);
});
@@ -4978,18 +4998,20 @@ void storage_proxy::init_messaging_service() {
tracing::trace(trace_state_ptr, "read_digest: message received from /{}", src_addr.addr);
}
auto da = oda.value_or(query::digest_algorithm::MD5);
auto max_size = cinfo.retrieve_auxiliary<uint64_t>("max_result_size");
return do_with(std::move(pr), get_local_shared_storage_proxy(), std::move(trace_state_ptr), [&cinfo, cmd = make_lw_shared<query::read_command>(std::move(cmd)), src_addr = std::move(src_addr), da, max_size, t] (::compat::wrapping_partition_range& pr, shared_ptr<storage_proxy>& p, tracing::trace_state_ptr& trace_state_ptr) mutable {
if (!cmd.max_result_size) {
cmd.max_result_size.emplace(cinfo.retrieve_auxiliary<uint64_t>("max_result_size"));
}
return do_with(std::move(pr), get_local_shared_storage_proxy(), std::move(trace_state_ptr), [&cinfo, cmd = make_lw_shared<query::read_command>(std::move(cmd)), src_addr = std::move(src_addr), da, t] (::compat::wrapping_partition_range& pr, shared_ptr<storage_proxy>& p, tracing::trace_state_ptr& trace_state_ptr) mutable {
p->get_stats().replica_digest_reads++;
auto src_ip = src_addr.addr;
return get_schema_for_read(cmd->schema_version, std::move(src_addr)).then([cmd, &pr, &p, &trace_state_ptr, max_size, t, da] (schema_ptr s) {
return get_schema_for_read(cmd->schema_version, std::move(src_addr)).then([cmd, &pr, &p, &trace_state_ptr, t, da] (schema_ptr s) {
auto pr2 = ::compat::unwrap(std::move(pr), *s);
if (pr2.second) {
// this function assumes singular queries but doesn't validate
throw std::runtime_error("READ_DIGEST called with wrapping range");
}
auto timeout = t ? *t : db::no_timeout;
return p->query_result_local_digest(std::move(s), cmd, std::move(pr2.first), trace_state_ptr, timeout, da, max_size);
return p->query_result_local_digest(std::move(s), cmd, std::move(pr2.first), trace_state_ptr, timeout, da);
}).finally([&trace_state_ptr, src_ip] () mutable {
tracing::trace(trace_state_ptr, "read_digest handling is done, sending a response to /{}", src_ip);
});
@@ -5025,6 +5047,9 @@ void storage_proxy::init_messaging_service() {
tracing::begin(tr_state);
tracing::trace(tr_state, "paxos_prepare: message received from /{} ballot {}", src_ip, ballot);
}
if (!cmd.max_result_size) {
cmd.max_result_size.emplace(cinfo.retrieve_auxiliary<uint64_t>("max_result_size"));
}
return get_schema_for_read(cmd.schema_version, src_addr).then([this, cmd = std::move(cmd), key = std::move(key), ballot,
only_digest, da, timeout, tr_state = std::move(tr_state), src_ip] (schema_ptr schema) mutable {
@@ -5137,31 +5162,29 @@ future<> storage_proxy::uninit_messaging_service() {
future<rpc::tuple<foreign_ptr<lw_shared_ptr<reconcilable_result>>, cache_temperature>>
storage_proxy::query_mutations_locally(schema_ptr s, lw_shared_ptr<query::read_command> cmd, const dht::partition_range& pr,
storage_proxy::clock_type::time_point timeout,
tracing::trace_state_ptr trace_state, uint64_t max_size) {
tracing::trace_state_ptr trace_state) {
if (pr.is_singular()) {
unsigned shard = dht::shard_of(*s, pr.start()->value().token());
get_stats().replica_cross_shard_ops += shard != this_shard_id();
return _db.invoke_on(shard, _read_smp_service_group, [max_size, cmd, &pr, gs=global_schema_ptr(s), timeout, gt = tracing::global_trace_state_ptr(std::move(trace_state))] (database& db) mutable {
return db.get_result_memory_limiter().new_mutation_read(max_size).then([&] (query::result_memory_accounter ma) {
return db.query_mutations(gs, *cmd, pr, std::move(ma), gt, timeout).then([] (std::tuple<reconcilable_result, cache_temperature> result_ht) {
return _db.invoke_on(shard, _read_smp_service_group, [cmd, &pr, gs=global_schema_ptr(s), timeout, gt = tracing::global_trace_state_ptr(std::move(trace_state))] (database& db) mutable {
return db.query_mutations(gs, *cmd, pr, gt, timeout).then([] (std::tuple<reconcilable_result, cache_temperature> result_ht) {
auto&& [result, ht] = result_ht;
return make_ready_future<rpc::tuple<foreign_ptr<lw_shared_ptr<reconcilable_result>>, cache_temperature>>(rpc::tuple(make_foreign(make_lw_shared(std::move(result))), ht));
});
});
});
} else {
return query_nonsingular_mutations_locally(std::move(s), std::move(cmd), {pr}, std::move(trace_state), max_size, timeout);
return query_nonsingular_mutations_locally(std::move(s), std::move(cmd), {pr}, std::move(trace_state), timeout);
}
}
future<rpc::tuple<foreign_ptr<lw_shared_ptr<reconcilable_result>>, cache_temperature>>
storage_proxy::query_mutations_locally(schema_ptr s, lw_shared_ptr<query::read_command> cmd, const ::compat::one_or_two_partition_ranges& pr,
storage_proxy::clock_type::time_point timeout,
tracing::trace_state_ptr trace_state, uint64_t max_size) {
tracing::trace_state_ptr trace_state) {
if (!pr.second) {
return query_mutations_locally(std::move(s), std::move(cmd), pr.first, timeout, std::move(trace_state), max_size);
return query_mutations_locally(std::move(s), std::move(cmd), pr.first, timeout, std::move(trace_state));
} else {
return query_nonsingular_mutations_locally(std::move(s), std::move(cmd), pr, std::move(trace_state), max_size, timeout);
return query_nonsingular_mutations_locally(std::move(s), std::move(cmd), pr, std::move(trace_state), timeout);
}
}
@@ -5170,11 +5193,10 @@ storage_proxy::query_nonsingular_mutations_locally(schema_ptr s,
lw_shared_ptr<query::read_command> cmd,
const dht::partition_range_vector&& prs,
tracing::trace_state_ptr trace_state,
uint64_t max_size,
storage_proxy::clock_type::time_point timeout) {
return do_with(cmd, std::move(prs), [this, max_size, timeout, s = std::move(s), trace_state = std::move(trace_state)] (lw_shared_ptr<query::read_command>& cmd,
return do_with(cmd, std::move(prs), [this, timeout, s = std::move(s), trace_state = std::move(trace_state)] (lw_shared_ptr<query::read_command>& cmd,
const dht::partition_range_vector& prs) mutable {
return query_mutations_on_all_shards(_db, std::move(s), *cmd, prs, std::move(trace_state), max_size, timeout).then([] (std::tuple<foreign_ptr<lw_shared_ptr<reconcilable_result>>, cache_temperature> t) {
return query_mutations_on_all_shards(_db, std::move(s), *cmd, prs, std::move(trace_state), timeout).then([] (std::tuple<foreign_ptr<lw_shared_ptr<reconcilable_result>>, cache_temperature> t) {
return make_ready_future<rpc::tuple<foreign_ptr<lw_shared_ptr<reconcilable_result>>, cache_temperature>>(std::move(t));
});
});

View File

@@ -247,6 +247,8 @@ public:
const locator::token_metadata& get_token_metadata() const { return _token_metadata; }
locator::token_metadata& get_token_metadata() { return _token_metadata; }
query::max_result_size get_max_result_size(const query::partition_slice& slice) const;
private:
distributed<database>& _db;
locator::token_metadata& _token_metadata;
@@ -358,13 +360,11 @@ private:
future<rpc::tuple<foreign_ptr<lw_shared_ptr<query::result>>, cache_temperature>> query_result_local(schema_ptr, lw_shared_ptr<query::read_command> cmd, const dht::partition_range& pr,
query::result_options opts,
tracing::trace_state_ptr trace_state,
clock_type::time_point timeout,
uint64_t max_size = query::result_memory_limiter::maximum_result_size);
clock_type::time_point timeout);
future<rpc::tuple<query::result_digest, api::timestamp_type, cache_temperature>> query_result_local_digest(schema_ptr, lw_shared_ptr<query::read_command> cmd, const dht::partition_range& pr,
tracing::trace_state_ptr trace_state,
clock_type::time_point timeout,
query::digest_algorithm da,
uint64_t max_size = query::result_memory_limiter::maximum_result_size);
query::digest_algorithm da);
future<coordinator_query_result> query_partition_key_range(lw_shared_ptr<query::read_command> cmd,
dht::partition_range_vector partition_ranges,
db::consistency_level cl,
@@ -406,7 +406,7 @@ private:
future<> mutate_internal(Range mutations, db::consistency_level cl, bool counter_write, tracing::trace_state_ptr tr_state, service_permit permit, std::optional<clock_type::time_point> timeout_opt = { }, lw_shared_ptr<cdc::operation_result_tracker> cdc_tracker = { });
future<rpc::tuple<foreign_ptr<lw_shared_ptr<reconcilable_result>>, cache_temperature>> query_nonsingular_mutations_locally(
schema_ptr s, lw_shared_ptr<query::read_command> cmd, const dht::partition_range_vector&& pr, tracing::trace_state_ptr trace_state,
uint64_t max_size, clock_type::time_point timeout);
clock_type::time_point timeout);
future<> mutate_counters_on_leader(std::vector<frozen_mutation_and_schema> mutations, db::consistency_level cl, clock_type::time_point timeout,
tracing::trace_state_ptr trace_state, service_permit permit);
@@ -558,21 +558,18 @@ public:
future<rpc::tuple<foreign_ptr<lw_shared_ptr<reconcilable_result>>, cache_temperature>> query_mutations_locally(
schema_ptr, lw_shared_ptr<query::read_command> cmd, const dht::partition_range&,
clock_type::time_point timeout,
tracing::trace_state_ptr trace_state = nullptr,
uint64_t max_size = query::result_memory_limiter::maximum_result_size);
tracing::trace_state_ptr trace_state = nullptr);
future<rpc::tuple<foreign_ptr<lw_shared_ptr<reconcilable_result>>, cache_temperature>> query_mutations_locally(
schema_ptr, lw_shared_ptr<query::read_command> cmd, const ::compat::one_or_two_partition_ranges&,
clock_type::time_point timeout,
tracing::trace_state_ptr trace_state = nullptr,
uint64_t max_size = query::result_memory_limiter::maximum_result_size);
tracing::trace_state_ptr trace_state = nullptr);
future<rpc::tuple<foreign_ptr<lw_shared_ptr<reconcilable_result>>, cache_temperature>> query_mutations_locally(
schema_ptr s, lw_shared_ptr<query::read_command> cmd, const dht::partition_range_vector& pr,
clock_type::time_point timeout,
tracing::trace_state_ptr trace_state = nullptr,
uint64_t max_size = query::result_memory_limiter::maximum_result_size);
tracing::trace_state_ptr trace_state = nullptr);
future<bool> cas(schema_ptr schema, shared_ptr<cas_request> request, lw_shared_ptr<query::read_command> cmd,
dht::partition_range_vector&& partition_ranges, coordinator_query_options query_options,

View File

@@ -2001,7 +2001,7 @@ struct query_state {
const query::read_command& cmd,
query::result_options opts,
const dht::partition_range_vector& ranges,
query::result_memory_accounter memory_accounter = { })
query::result_memory_accounter memory_accounter)
: schema(std::move(s))
, cmd(cmd)
, builder(cmd.slice, opts, std::move(memory_accounter))
@@ -2032,18 +2032,18 @@ struct query_state {
future<lw_shared_ptr<query::result>>
table::query(schema_ptr s,
const query::read_command& cmd,
query_class_config class_config,
query::query_class_config class_config,
query::result_options opts,
const dht::partition_range_vector& partition_ranges,
tracing::trace_state_ptr trace_state,
query::result_memory_limiter& memory_limiter,
uint64_t max_size,
db::timeout_clock::time_point timeout,
query::querier_cache_context cache_ctx) {
utils::latency_counter lc;
_stats.reads.set_latency(lc);
const auto short_read_allwoed = query::short_read(cmd.slice.options.contains<query::partition_slice::option::allow_short_read>());
auto f = opts.request == query::result_request::only_digest
? memory_limiter.new_digest_read(max_size) : memory_limiter.new_data_read(max_size);
? memory_limiter.new_digest_read(*cmd.max_result_size, short_read_allwoed) : memory_limiter.new_data_read(*cmd.max_result_size, short_read_allwoed);
return f.then([this, lc, s = std::move(s), &cmd, class_config, opts, &partition_ranges,
trace_state = std::move(trace_state), timeout, cache_ctx = std::move(cache_ctx)] (query::result_memory_accounter accounter) mutable {
auto qs_ptr = std::make_unique<query_state>(std::move(s), cmd, opts, partition_ranges, std::move(accounter));

View File

@@ -2765,7 +2765,8 @@ SEASTAR_TEST_CASE(test_reversed_slice_with_empty_range_before_all_rows) {
// See #6171
SEASTAR_TEST_CASE(test_reversed_slice_with_many_clustering_ranges) {
cql_test_config cfg;
cfg.db_config->max_memory_for_unlimited_query(std::numeric_limits<uint64_t>::max());
cfg.db_config->max_memory_for_unlimited_query_soft_limit(std::numeric_limits<uint64_t>::max());
cfg.db_config->max_memory_for_unlimited_query_hard_limit(std::numeric_limits<uint64_t>::max());
return do_with_cql_env_thread([] (cql_test_env& e) {
e.execute_cql("CREATE TABLE test (pk int, ck int, v text, PRIMARY KEY (pk, ck));").get();
auto id = e.prepare("INSERT INTO test (pk, ck, v) VALUES (?, ?, ?);").get0();
@@ -4574,3 +4575,81 @@ SEASTAR_TEST_CASE(test_impossible_where) {
require_rows(e, "SELECT * FROM t2 WHERE c>=10 AND c<=0 ALLOW FILTERING", {});
});
}
// Verifies that the max_memory_for_unlimited_query_{soft,hard}_limit config
// items abort "unlimited" reads (unpaged or reversed) issued by the user
// (statement) query class, while paged forward reads and reads from other
// scheduling groups are unaffected.
SEASTAR_THREAD_TEST_CASE(test_query_limit) {
cql_test_config cfg;
// Tiny limits so the ~10KB of data written below is enough to trip them.
cfg.db_config->max_memory_for_unlimited_query_soft_limit.set(256, utils::config_file::config_source::CommandLine);
cfg.db_config->max_memory_for_unlimited_query_hard_limit.set(1024, utils::config_file::config_source::CommandLine);
// Set up distinct scheduling groups so the same query can be issued as
// different query classes: statement (user), streaming, and default.
cfg.dbcfg.emplace();
cfg.dbcfg->available_memory = memory::stats().total_memory();
cfg.dbcfg->statement_scheduling_group = seastar::create_scheduling_group("statement", 1000).get0();
cfg.dbcfg->streaming_scheduling_group = seastar::create_scheduling_group("streaming", 200).get0();
do_with_cql_env_thread([] (cql_test_env& e) {
e.execute_cql("CREATE TABLE test (pk int, ck int, v text, PRIMARY KEY (pk, ck));").get();
auto id = e.prepare("INSERT INTO test (pk, ck, v) VALUES (?, ?, ?);").get0();
const int pk = 0;
const auto raw_pk = int32_type->decompose(data_value(pk));
const auto cql3_pk = cql3::raw_value::make_value(raw_pk);
// Each row carries a 1KB value, so num_rows rows comfortably exceed the
// 1024-byte hard limit configured above.
const auto value = sstring(1024, 'a');
const auto raw_value = utf8_type->decompose(data_value(value));
const auto cql3_value = cql3::raw_value::make_value(raw_value);
const int num_rows = 10;
for (int i = 0; i != num_rows; ++i) {
const auto cql3_ck = cql3::raw_value::make_value(int32_type->decompose(data_value(i)));
e.execute_prepared(id, {cql3_pk, cql3_ck, cql3_value}).get();
}
auto& db = e.local_db();
const auto make_expected_row = [&] (int ck) -> std::vector<bytes_opt> {
return {raw_pk, int32_type->decompose(ck), raw_value};
};
// Expected result sets for ORDER BY ck ASC and ORDER BY ck DESC.
const auto normal_rows = boost::copy_range<std::vector<std::vector<bytes_opt>>>(boost::irange(0, num_rows) | boost::adaptors::transformed(make_expected_row));
const auto reversed_rows = boost::copy_range<std::vector<std::vector<bytes_opt>>>(boost::irange(0, num_rows) | boost::adaptors::reversed | boost::adaptors::transformed(make_expected_row));
for (auto is_paged : {true, false}) {
for (auto is_reversed : {true, false}) {
for (auto scheduling_group : {db.get_statement_scheduling_group(), db.get_streaming_scheduling_group(), default_scheduling_group()}) {
// Only "unlimited" reads (unpaged or reversed) are limited, and only
// for the user (statement) class; other groups should never fail.
const auto should_fail = (!is_paged || is_reversed) && scheduling_group == db.get_statement_scheduling_group();
testlog.info("checking: is_paged={}, is_reversed={}, scheduling_group={}, should_fail={}", is_paged, is_reversed, scheduling_group.name(), should_fail);
const auto select_query = format("SELECT * FROM test WHERE pk = {} ORDER BY ck {};", pk, is_reversed ? "DESC" : "ASC");
// page_size of -1 disables paging for this query.
int32_t page_size = is_paged ? 10000 : -1;
auto qo = std::make_unique<cql3::query_options>(db::consistency_level::LOCAL_ONE, infinite_timeout_config, std::vector<cql3::raw_value>{},
cql3::query_options::specific_options{page_size, nullptr, {}, api::new_timestamp()});
const auto* expected_rows = is_reversed ? &reversed_rows : &normal_rows;
try {
// Run the query under the scheduling group being tested, so the
// read is classified (and limited) accordingly.
auto result = with_scheduling_group(scheduling_group, [&e] (const sstring& q, std::unique_ptr<cql3::query_options> qo) {
return e.execute_cql(q, std::move(qo));
}, select_query, std::move(qo)).get0();
assert_that(std::move(result))
.is_rows()
.with_rows(*expected_rows);
// NOTE: BOOST_FAIL throws a Boost.Test exception, which is not a
// read_failure_exception and thus propagates past the catch below.
if (should_fail) {
BOOST_FAIL("Expected exception, but none was thrown.");
} else {
testlog.trace("No exception thrown, as expected.");
}
} catch (exceptions::read_failure_exception& e) {
if (should_fail) {
testlog.trace("Exception thrown, as expected: {}", e);
} else {
BOOST_FAIL(fmt::format("Expected no exception, but caught: {}", e));
}
}
}
}
}
}, std::move(cfg)).get();
}

View File

@@ -28,6 +28,7 @@
#include "test/lib/cql_test_env.hh"
#include "test/lib/result_set_assertions.hh"
#include "test/lib/reader_permit.hh"
#include "test/lib/log.hh"
#include "database.hh"
#include "partition_slice_builder.hh"
@@ -40,6 +41,7 @@
#include "db/commitlog/commitlog_replayer.hh"
#include "test/lib/tmpdir.hh"
#include "db/data_listeners.hh"
#include "multishard_mutation_query.hh"
using namespace std::chrono_literals;
@@ -62,8 +64,8 @@ SEASTAR_TEST_CASE(test_safety_after_truncate) {
auto assert_query_result = [&] (size_t expected_size) {
auto max_size = std::numeric_limits<size_t>::max();
auto cmd = query::read_command(s->id(), s->version(), partition_slice_builder(*s).build(), 1000);
auto&& [result, cache_tempature] = db.query(s, cmd, query::result_options::only_result(), pranges, nullptr, max_size, db::no_timeout).get0();
auto cmd = query::read_command(s->id(), s->version(), partition_slice_builder(*s).build(), query::max_result_size(max_size), query::row_limit(1000));
auto&& [result, cache_tempature] = db.query(s, cmd, query::result_options::only_result(), pranges, nullptr, db::no_timeout).get0();
assert_that(query::result_set::from_raw_result(s, cmd.slice, *result)).has_size(expected_size);
};
assert_query_result(1000);
@@ -105,22 +107,22 @@ SEASTAR_TEST_CASE(test_querying_with_limits) {
auto max_size = std::numeric_limits<size_t>::max();
{
auto cmd = query::read_command(s->id(), s->version(), partition_slice_builder(*s).build(), 3);
auto result = std::get<0>(db.query(s, cmd, query::result_options::only_result(), pranges, nullptr, max_size, db::no_timeout).get0());
auto cmd = query::read_command(s->id(), s->version(), partition_slice_builder(*s).build(), query::max_result_size(max_size), query::row_limit(3));
auto result = std::get<0>(db.query(s, cmd, query::result_options::only_result(), pranges, nullptr, db::no_timeout).get0());
assert_that(query::result_set::from_raw_result(s, cmd.slice, *result)).has_size(3);
}
{
auto cmd = query::read_command(s->id(), s->version(), partition_slice_builder(*s).build(),
query::max_rows, gc_clock::now(), std::nullopt, 5);
auto result = std::get<0>(db.query(s, cmd, query::result_options::only_result(), pranges, nullptr, max_size, db::no_timeout).get0());
auto cmd = query::read_command(s->id(), s->version(), partition_slice_builder(*s).build(), query::max_result_size(max_size),
query::row_limit(query::max_rows), query::partition_limit(5));
auto result = std::get<0>(db.query(s, cmd, query::result_options::only_result(), pranges, nullptr, db::no_timeout).get0());
assert_that(query::result_set::from_raw_result(s, cmd.slice, *result)).has_size(5);
}
{
auto cmd = query::read_command(s->id(), s->version(), partition_slice_builder(*s).build(),
query::max_rows, gc_clock::now(), std::nullopt, 3);
auto result = std::get<0>(db.query(s, cmd, query::result_options::only_result(), pranges, nullptr, max_size, db::no_timeout).get0());
auto cmd = query::read_command(s->id(), s->version(), partition_slice_builder(*s).build(), query::max_result_size(max_size),
query::row_limit(query::max_rows), query::partition_limit(3));
auto result = std::get<0>(db.query(s, cmd, query::result_options::only_result(), pranges, nullptr, db::no_timeout).get0());
assert_that(query::result_set::from_raw_result(s, cmd.slice, *result)).has_size(3);
}
});
@@ -472,3 +474,86 @@ SEASTAR_TEST_CASE(toppartitions_cross_shard_schema_ptr) {
tq.gather().get();
});
}
// Verifies that the max_result_size carried in the read_command is enforced by
// all three local read paths — query_mutations(), query() and
// query_mutations_on_all_shards() — and that reads exceeding it either throw
// (short reads disallowed) or are cut short (short reads allowed).
SEASTAR_THREAD_TEST_CASE(read_max_size) {
do_with_cql_env([] (cql_test_env& e) {
e.execute_cql("CREATE TABLE test (pk text, ck int, v text, PRIMARY KEY (pk, ck));").get();
auto id = e.prepare("INSERT INTO test (pk, ck, v) VALUES (?, ?, ?);").get0();
auto& db = e.local_db();
auto& tab = db.find_column_family("ks", "test");
auto s = tab.schema();
// Single partition owned by the local shard, so every query method sees
// the same data.
auto pk = make_local_key(s);
const auto raw_pk = utf8_type->decompose(data_value(pk));
const auto cql3_pk = cql3::raw_value::make_value(raw_pk);
const auto value = sstring(1024, 'a');
const auto raw_value = utf8_type->decompose(data_value(value));
const auto cql3_value = cql3::raw_value::make_value(raw_value);
// 1024 rows x 1KB values: ~1MB of payload, enough to exceed the smaller
// max_size values tested below.
const int num_rows = 1024;
for (int i = 0; i != num_rows; ++i) {
const auto cql3_ck = cql3::raw_value::make_value(int32_type->decompose(data_value(i)));
e.execute_prepared(id, {cql3_pk, cql3_ck, cql3_value}).get();
}
const auto partition_ranges = std::vector<dht::partition_range>{query::full_partition_range};
// Each entry maps a human-readable name to a callable that runs the read
// via one of the local query entry points and returns the result's size.
const std::vector<std::pair<sstring, std::function<future<size_t>(schema_ptr, const query::read_command&)>>> query_methods{
{"query_mutations()", [&db, &partition_ranges] (schema_ptr s, const query::read_command& cmd) -> future<size_t> {
return db.query_mutations(s, cmd, partition_ranges.front(), {}, db::no_timeout).then(
[] (const std::tuple<reconcilable_result, cache_temperature>& res) {
return std::get<0>(res).memory_usage();
});
}},
{"query()", [&db, &partition_ranges] (schema_ptr s, const query::read_command& cmd) -> future<size_t> {
return db.query(s, cmd, query::result_options::only_result(), partition_ranges, {}, db::no_timeout).then(
[] (const std::tuple<lw_shared_ptr<query::result>, cache_temperature>& res) {
return size_t(std::get<0>(res)->buf().size());
});
}},
{"query_mutations_on_all_shards()", [&e, &partition_ranges] (schema_ptr s, const query::read_command& cmd) -> future<size_t> {
return query_mutations_on_all_shards(e.db(), s, cmd, partition_ranges, {}, db::no_timeout).then(
[] (const std::tuple<foreign_ptr<lw_shared_ptr<reconcilable_result>>, cache_temperature>& res) {
return std::get<0>(res)->memory_usage();
});
}}
};
for (auto [query_method_name, query_method] : query_methods) {
for (auto allow_short_read : {true, false}) {
for (auto max_size : {1024u, 1024u * 1024u, 1024u * 1024u * 1024u}) {
// A read should throw when the limit is below the dataset size
// (the factor of 2 presumably allows for per-row bookkeeping
// overhead in the accounting — TODO confirm) and the slice does
// not allow cutting the read short.
const auto should_throw = max_size < (num_rows * value.size() * 2) && !allow_short_read;
testlog.info("checking: query_method={}, allow_short_read={}, max_size={}, should_throw={}", query_method_name, allow_short_read, max_size, should_throw);
auto slice = s->full_slice();
if (allow_short_read) {
slice.options.set<query::partition_slice::option::allow_short_read>();
} else {
slice.options.remove<query::partition_slice::option::allow_short_read>();
}
// The limit under test travels inside the read_command itself.
query::read_command cmd(s->id(), s->version(), slice, query::max_result_size(max_size));
try {
auto size = query_method(s, cmd).get0();
// Just to ensure we are not interpreting empty results as success.
BOOST_REQUIRE(size != 0);
if (should_throw) {
BOOST_FAIL("Expected exception, but none was thrown.");
} else {
testlog.trace("No exception thrown, as expected.");
}
} catch (std::runtime_error& e) {
if (should_throw) {
testlog.trace("Exception thrown, as expected: {}", e);
} else {
BOOST_FAIL(fmt::format("Expected no exception, but caught: {}", e));
}
}
}
}
}
return make_ready_future<>();
}).get();
}

View File

@@ -592,7 +592,7 @@ void test_flat_stream(schema_ptr s, std::vector<mutation> muts, reversed_partiti
return fmr.consume_in_thread(std::move(fsc), db::no_timeout);
} else {
if (reversed) {
auto reverse_reader = make_reversing_reader(fmr, size_t(1) << 20);
auto reverse_reader = make_reversing_reader(fmr, query::max_result_size(size_t(1) << 20));
return reverse_reader.consume(std::move(fsc), db::no_timeout).get0();
}
return fmr.consume(std::move(fsc), db::no_timeout).get0();
@@ -805,7 +805,8 @@ SEASTAR_THREAD_TEST_CASE(test_reverse_reader_memory_limit) {
auto test_with_partition = [&] (bool with_static_row) {
testlog.info("Testing with_static_row={}", with_static_row);
auto mut = schema.new_mutation("pk1");
const auto pk = "pk1";
auto mut = schema.new_mutation(pk);
const size_t desired_mut_size = 1 * 1024 * 1024;
const size_t row_size = 10 * 1024;
@@ -817,8 +818,9 @@ SEASTAR_THREAD_TEST_CASE(test_reverse_reader_memory_limit) {
schema.add_row(mut, schema.make_ckey(++i), sstring(row_size, '0'));
}
const uint64_t hard_limit = size_t(1) << 18;
auto reader = flat_mutation_reader_from_mutations({mut});
auto reverse_reader = make_reversing_reader(reader, size_t(1) << 10);
auto reverse_reader = make_reversing_reader(reader, query::max_result_size(size_t(1) << 10, hard_limit));
try {
reverse_reader.consume(phony_consumer{}, db::no_timeout).get();
@@ -826,7 +828,12 @@ SEASTAR_THREAD_TEST_CASE(test_reverse_reader_memory_limit) {
} catch (const std::runtime_error& e) {
testlog.info("Got exception with message: {}", e.what());
auto str = sstring(e.what());
BOOST_REQUIRE_EQUAL(str.find("Aborting reverse partition read because partition pk1"), 0);
const auto expected_str = format(
"Memory usage of reversed read exceeds hard limit of {} (configured via max_memory_for_unlimited_query_hard_limit), while reading partition {}",
hard_limit,
pk);
BOOST_REQUIRE_EQUAL(str.find(expected_str), 0);
} catch (...) {
throw;
}

View File

@@ -82,9 +82,9 @@ SEASTAR_THREAD_TEST_CASE(test_abandoned_read) {
(void)_;
auto cmd = query::read_command(s->id(), s->version(), s->full_slice(), 7, gc_clock::now(), std::nullopt, query::max_partitions,
utils::make_random_uuid(), query::is_first_page::yes);
utils::make_random_uuid(), query::is_first_page::yes, query::max_result_size(query::result_memory_limiter::unlimited_result_size));
query_mutations_on_all_shards(env.db(), s, cmd, {query::full_partition_range}, nullptr, std::numeric_limits<uint64_t>::max(), db::no_timeout).get();
query_mutations_on_all_shards(env.db(), s, cmd, {query::full_partition_range}, nullptr, db::no_timeout).get();
check_cache_population(env.db(), 1);
@@ -104,11 +104,11 @@ static std::vector<mutation> read_all_partitions_one_by_one(distributed<database
for (const auto& pkey : pkeys) {
const auto res = db.invoke_on(sharder.shard_of(pkey.token()), [gs = global_schema_ptr(s), &pkey] (database& db) {
return async([s = gs.get(), &pkey, &db] () mutable {
auto accounter = db.get_result_memory_limiter().new_mutation_read(std::numeric_limits<size_t>::max()).get0();
const auto cmd = query::read_command(s->id(), s->version(), s->full_slice(), query::max_rows);
const auto cmd = query::read_command(s->id(), s->version(), s->full_slice(),
query::max_result_size(query::result_memory_limiter::unlimited_result_size));
const auto range = dht::partition_range::make_singular(pkey);
return make_foreign(std::make_unique<reconcilable_result>(
std::get<0>(db.query_mutations(std::move(s), cmd, range, std::move(accounter), nullptr, db::no_timeout).get0())));
std::get<0>(db.query_mutations(std::move(s), cmd, range, nullptr, db::no_timeout).get0())));
});
}).get0();
@@ -126,13 +126,14 @@ read_partitions_with_paged_scan(distributed<database>& db, schema_ptr s, uint32_
const dht::partition_range& range, const query::partition_slice& slice, const std::function<void(size_t)>& page_hook = {}) {
const auto query_uuid = is_stateful ? utils::make_random_uuid() : utils::UUID{};
std::vector<mutation> results;
auto cmd = query::read_command(s->id(), s->version(), slice, page_size, gc_clock::now(), std::nullopt, query::max_partitions, query_uuid, query::is_first_page::yes);
auto cmd = query::read_command(s->id(), s->version(), slice, page_size, gc_clock::now(), std::nullopt, query::max_partitions, query_uuid,
query::is_first_page::yes, query::max_result_size(max_size));
bool has_more = true;
// First page is special, needs to have `is_first_page` set.
{
auto res = std::get<0>(query_mutations_on_all_shards(db, s, cmd, {range}, nullptr, max_size, db::no_timeout).get0());
auto res = std::get<0>(query_mutations_on_all_shards(db, s, cmd, {range}, nullptr, db::no_timeout).get0());
for (auto& part : res->partitions()) {
auto mut = part.mut().unfreeze(s);
results.emplace_back(std::move(mut));
@@ -176,7 +177,7 @@ read_partitions_with_paged_scan(distributed<database>& db, schema_ptr s, uint32_
cmd.slice.set_range(*s, last_pkey.key(), std::move(ckranges));
}
auto res = std::get<0>(query_mutations_on_all_shards(db, s, cmd, {pkrange}, nullptr, max_size, db::no_timeout).get0());
auto res = std::get<0>(query_mutations_on_all_shards(db, s, cmd, {pkrange}, nullptr, db::no_timeout).get0());
if (is_stateful) {
BOOST_REQUIRE(aggregate_querier_cache_stat(db, &query::querier_cache::stats::lookups) >= npages);
@@ -880,6 +881,7 @@ run_fuzzy_test_scan(size_t i, fuzzy_test_config cfg, distributed<database>& db,
const auto partition_slice = partition_slice_builder(*schema)
.with_ranges(generate_clustering_ranges(rnd_engine, *schema, part_descs))
.with_option<query::partition_slice::option::allow_short_read>()
.build();
const auto is_stateful = stateful_query(std::uniform_int_distribution<int>(0, 3)(rnd_engine));
@@ -972,7 +974,7 @@ SEASTAR_THREAD_TEST_CASE(fuzzy_test) {
const auto& partitions = pop_desc.partitions;
smp::invoke_on_all([cfg, db = &env.db(), gs = global_schema_ptr(pop_desc.schema), &partitions] {
auto& sem = db->local().make_query_class_config().semaphore;
auto& sem = db->local().get_reader_concurrency_semaphore();
auto resources = sem.available_resources();
resources -= reader_concurrency_semaphore::resources{1, 0};

View File

@@ -78,6 +78,10 @@ static query::partition_slice make_full_slice(const schema& s) {
static auto inf32 = std::numeric_limits<unsigned>::max();
static query::result_memory_accounter make_accounter() {
return query::result_memory_accounter{ query::result_memory_limiter::unlimited_result_size };
}
query::result_set to_result_set(const reconcilable_result& r, schema_ptr s, const query::partition_slice& slice) {
return query::result_set::from_raw_result(s, slice, to_data_query_result(r, s, slice, inf32, inf32));
}
@@ -101,7 +105,7 @@ SEASTAR_TEST_CASE(test_reading_from_single_partition) {
auto slice = make_full_slice(*s);
reconcilable_result result = mutation_query(s, src,
query::full_partition_range, slice, 2, query::max_partitions, now, db::no_timeout, tests::make_query_class_config()).get0();
query::full_partition_range, slice, 2, query::max_partitions, now, db::no_timeout, tests::make_query_class_config(), make_accounter()).get0();
// FIXME: use mutation assertions
assert_that(to_result_set(result, s, slice))
@@ -124,7 +128,7 @@ SEASTAR_TEST_CASE(test_reading_from_single_partition) {
.build();
reconcilable_result result = mutation_query(s, src,
query::full_partition_range, slice, query::max_rows, query::max_partitions, now, db::no_timeout, tests::make_query_class_config()).get0();
query::full_partition_range, slice, query::max_rows, query::max_partitions, now, db::no_timeout, tests::make_query_class_config(), make_accounter()).get0();
assert_that(to_result_set(result, s, slice))
.has_only(a_row()
@@ -160,7 +164,7 @@ SEASTAR_TEST_CASE(test_cells_are_expired_according_to_query_timestamp) {
auto slice = make_full_slice(*s);
reconcilable_result result = mutation_query(s, src,
query::full_partition_range, slice, 1, query::max_partitions, now, db::no_timeout, tests::make_query_class_config()).get0();
query::full_partition_range, slice, 1, query::max_partitions, now, db::no_timeout, tests::make_query_class_config(), make_accounter()).get0();
assert_that(to_result_set(result, s, slice))
.has_only(a_row()
@@ -174,7 +178,7 @@ SEASTAR_TEST_CASE(test_cells_are_expired_according_to_query_timestamp) {
auto slice = make_full_slice(*s);
reconcilable_result result = mutation_query(s, src,
query::full_partition_range, slice, 1, query::max_partitions, now + 2s, db::no_timeout, tests::make_query_class_config()).get0();
query::full_partition_range, slice, 1, query::max_partitions, now + 2s, db::no_timeout, tests::make_query_class_config(), make_accounter()).get0();
assert_that(to_result_set(result, s, slice))
.has_only(a_row()
@@ -207,7 +211,7 @@ SEASTAR_TEST_CASE(test_reverse_ordering_is_respected) {
.build();
reconcilable_result result = mutation_query(s, src,
query::full_partition_range, slice, 3, query::max_partitions, now, db::no_timeout, tests::make_query_class_config()).get0();
query::full_partition_range, slice, 3, query::max_partitions, now, db::no_timeout, tests::make_query_class_config(), make_accounter()).get0();
assert_that(to_result_set(result, s, slice))
.has_size(3)
@@ -237,7 +241,7 @@ SEASTAR_TEST_CASE(test_reverse_ordering_is_respected) {
.build();
reconcilable_result result = mutation_query(s, src,
query::full_partition_range, slice, 3, query::max_partitions, now, db::no_timeout, tests::make_query_class_config()).get0();
query::full_partition_range, slice, 3, query::max_partitions, now, db::no_timeout, tests::make_query_class_config(), make_accounter()).get0();
assert_that(to_result_set(result, s, slice))
.has_size(3)
@@ -265,7 +269,7 @@ SEASTAR_TEST_CASE(test_reverse_ordering_is_respected) {
{
reconcilable_result result = mutation_query(s, src,
query::full_partition_range, slice, 10, query::max_partitions, now, db::no_timeout, tests::make_query_class_config()).get0();
query::full_partition_range, slice, 10, query::max_partitions, now, db::no_timeout, tests::make_query_class_config(), make_accounter()).get0();
assert_that(to_result_set(result, s, slice))
.has_size(3)
@@ -285,7 +289,7 @@ SEASTAR_TEST_CASE(test_reverse_ordering_is_respected) {
{
reconcilable_result result = mutation_query(s, src,
query::full_partition_range, slice, 1, query::max_partitions, now, db::no_timeout, tests::make_query_class_config()).get0();
query::full_partition_range, slice, 1, query::max_partitions, now, db::no_timeout, tests::make_query_class_config(), make_accounter()).get0();
assert_that(to_result_set(result, s, slice))
.has_size(1)
@@ -297,7 +301,7 @@ SEASTAR_TEST_CASE(test_reverse_ordering_is_respected) {
{
reconcilable_result result = mutation_query(s, src,
query::full_partition_range, slice, 2, query::max_partitions, now, db::no_timeout, tests::make_query_class_config()).get0();
query::full_partition_range, slice, 2, query::max_partitions, now, db::no_timeout, tests::make_query_class_config(), make_accounter()).get0();
assert_that(to_result_set(result, s, slice))
.has_size(2)
@@ -324,7 +328,7 @@ SEASTAR_TEST_CASE(test_reverse_ordering_is_respected) {
.build();
reconcilable_result result = mutation_query(s, src,
query::full_partition_range, slice, 2, query::max_partitions, now, db::no_timeout, tests::make_query_class_config()).get0();
query::full_partition_range, slice, 2, query::max_partitions, now, db::no_timeout, tests::make_query_class_config(), make_accounter()).get0();
assert_that(to_result_set(result, s, slice))
.has_size(2)
@@ -348,7 +352,7 @@ SEASTAR_TEST_CASE(test_reverse_ordering_is_respected) {
.build();
reconcilable_result result = mutation_query(s, src,
query::full_partition_range, slice, 3, query::max_partitions, now, db::no_timeout, tests::make_query_class_config()).get0();
query::full_partition_range, slice, 3, query::max_partitions, now, db::no_timeout, tests::make_query_class_config(), make_accounter()).get0();
assert_that(to_result_set(result, s, slice))
.has_size(2)
@@ -370,7 +374,7 @@ SEASTAR_TEST_CASE(test_reverse_ordering_is_respected) {
.build();
reconcilable_result result = mutation_query(s, src,
query::full_partition_range, slice, 3, query::max_partitions, now, db::no_timeout, tests::make_query_class_config()).get0();
query::full_partition_range, slice, 3, query::max_partitions, now, db::no_timeout, tests::make_query_class_config(), make_accounter()).get0();
assert_that(to_result_set(result, s, slice))
.has_only(a_row()
@@ -396,7 +400,7 @@ SEASTAR_TEST_CASE(test_query_when_partition_tombstone_covers_live_cells) {
auto slice = make_full_slice(*s);
reconcilable_result result = mutation_query(s, src,
query::full_partition_range, slice, query::max_rows, query::max_partitions, now, db::no_timeout, tests::make_query_class_config()).get0();
query::full_partition_range, slice, query::max_rows, query::max_partitions, now, db::no_timeout, tests::make_query_class_config(), make_accounter()).get0();
assert_that(to_result_set(result, s, slice))
.is_empty();
@@ -447,7 +451,7 @@ SEASTAR_TEST_CASE(test_partitions_with_only_expired_tombstones_are_dropped) {
auto query_time = now + std::chrono::seconds(1);
reconcilable_result result = mutation_query(s, src, query::full_partition_range, slice, query::max_rows, query::max_partitions, query_time,
db::no_timeout, tests::make_query_class_config()).get0();
db::no_timeout, tests::make_query_class_config(), make_accounter()).get0();
BOOST_REQUIRE_EQUAL(result.partitions().size(), 2);
BOOST_REQUIRE_EQUAL(result.row_count(), 2);
@@ -466,28 +470,28 @@ SEASTAR_TEST_CASE(test_result_row_count) {
auto src = make_source({m1});
auto r = to_data_query_result(mutation_query(s, make_source({m1}), query::full_partition_range, slice, 10000, query::max_partitions, now,
db::no_timeout, tests::make_query_class_config()).get0(), s, slice, inf32, inf32);
db::no_timeout, tests::make_query_class_config(), make_accounter()).get0(), s, slice, inf32, inf32);
BOOST_REQUIRE_EQUAL(r.row_count().value(), 0);
m1.set_static_cell("s1", data_value(bytes("S_v1")), 1);
r = to_data_query_result(mutation_query(s, make_source({m1}), query::full_partition_range, slice, 10000, query::max_partitions, now,
db::no_timeout, tests::make_query_class_config()).get0(), s, slice, inf32, inf32);
db::no_timeout, tests::make_query_class_config(), make_accounter()).get0(), s, slice, inf32, inf32);
BOOST_REQUIRE_EQUAL(r.row_count().value(), 1);
m1.set_clustered_cell(clustering_key::from_single_value(*s, bytes("A")), "v1", data_value(bytes("A_v1")), 1);
r = to_data_query_result(mutation_query(s, make_source({m1}), query::full_partition_range, slice, 10000, query::max_partitions, now,
db::no_timeout, tests::make_query_class_config()).get0(), s, slice, inf32, inf32);
db::no_timeout, tests::make_query_class_config(), make_accounter()).get0(), s, slice, inf32, inf32);
BOOST_REQUIRE_EQUAL(r.row_count().value(), 1);
m1.set_clustered_cell(clustering_key::from_single_value(*s, bytes("B")), "v1", data_value(bytes("B_v1")), 1);
r = to_data_query_result(mutation_query(s, make_source({m1}), query::full_partition_range, slice, 10000, query::max_partitions, now,
db::no_timeout, tests::make_query_class_config()).get0(), s, slice, inf32, inf32);
db::no_timeout, tests::make_query_class_config(), make_accounter()).get0(), s, slice, inf32, inf32);
BOOST_REQUIRE_EQUAL(r.row_count().value(), 2);
mutation m2(s, partition_key::from_single_value(*s, "key2"));
m2.set_static_cell("s1", data_value(bytes("S_v1")), 1);
r = to_data_query_result(mutation_query(s, make_source({m1, m2}), query::full_partition_range, slice, 10000, query::max_partitions, now,
db::no_timeout, tests::make_query_class_config()).get0(), s, slice, inf32, inf32);
db::no_timeout, tests::make_query_class_config(), make_accounter()).get0(), s, slice, inf32, inf32);
BOOST_REQUIRE_EQUAL(r.row_count().value(), 3);
});
}
@@ -510,7 +514,7 @@ SEASTAR_TEST_CASE(test_partition_limit) {
{
reconcilable_result result = mutation_query(s, src,
query::full_partition_range, slice, query::max_rows, 10, now, db::no_timeout, tests::make_query_class_config()).get0();
query::full_partition_range, slice, query::max_rows, 10, now, db::no_timeout, tests::make_query_class_config(), make_accounter()).get0();
assert_that(to_result_set(result, s, slice))
.has_size(2)
@@ -526,7 +530,7 @@ SEASTAR_TEST_CASE(test_partition_limit) {
{
reconcilable_result result = mutation_query(s, src,
query::full_partition_range, slice, query::max_rows, 1, now, db::no_timeout, tests::make_query_class_config()).get0();
query::full_partition_range, slice, query::max_rows, 1, now, db::no_timeout, tests::make_query_class_config(), make_accounter()).get0();
assert_that(to_result_set(result, s, slice))
.has_size(1)
@@ -547,11 +551,13 @@ SEASTAR_THREAD_TEST_CASE(test_result_size_calculation) {
query::partition_slice slice = make_full_slice(*s);
slice.options.set<query::partition_slice::option::allow_short_read>();
query::result::builder digest_only_builder(slice, query::result_options{query::result_request::only_digest, query::digest_algorithm::xxHash}, l.new_digest_read(query::result_memory_limiter::maximum_result_size).get0());
query::result::builder digest_only_builder(slice, query::result_options{query::result_request::only_digest, query::digest_algorithm::xxHash},
l.new_digest_read(query::max_result_size(query::result_memory_limiter::maximum_result_size), query::short_read::yes).get0());
data_query(s, source, query::full_partition_range, slice, std::numeric_limits<uint32_t>::max(), std::numeric_limits<uint32_t>::max(),
gc_clock::now(), digest_only_builder, db::no_timeout, tests::make_query_class_config()).get0();
query::result::builder result_and_digest_builder(slice, query::result_options{query::result_request::result_and_digest, query::digest_algorithm::xxHash}, l.new_data_read(query::result_memory_limiter::maximum_result_size).get0());
query::result::builder result_and_digest_builder(slice, query::result_options{query::result_request::result_and_digest, query::digest_algorithm::xxHash},
l.new_data_read(query::max_result_size(query::result_memory_limiter::maximum_result_size), query::short_read::yes).get0());
data_query(s, source, query::full_partition_range, slice, std::numeric_limits<uint32_t>::max(), std::numeric_limits<uint32_t>::max(),
gc_clock::now(), result_and_digest_builder, db::no_timeout, tests::make_query_class_config()).get0();

View File

@@ -776,7 +776,8 @@ SEASTAR_TEST_CASE(test_querying_of_mutation) {
auto resultify = [s] (const mutation& m) -> query::result_set {
auto slice = make_full_slice(*s);
return query::result_set::from_raw_result(s, slice, m.query(slice));
return query::result_set::from_raw_result(s, slice,
m.query(slice, query::result_memory_accounter{ query::result_memory_limiter::unlimited_result_size }));
};
mutation m(s, partition_key::from_single_value(*s, "key1"));
@@ -811,7 +812,8 @@ SEASTAR_TEST_CASE(test_partition_with_no_live_data_is_absent_in_data_query_resul
auto slice = make_full_slice(*s);
assert_that(query::result_set::from_raw_result(s, slice, m.query(slice)))
assert_that(query::result_set::from_raw_result(s, slice,
m.query(slice, query::result_memory_accounter{ query::result_memory_limiter::unlimited_result_size })))
.is_empty();
});
}
@@ -834,7 +836,8 @@ SEASTAR_TEST_CASE(test_partition_with_live_data_in_static_row_is_present_in_the_
.with_regular_column("v")
.build();
assert_that(query::result_set::from_raw_result(s, slice, m.query(slice)))
assert_that(query::result_set::from_raw_result(s, slice,
m.query(slice, query::result_memory_accounter{ query::result_memory_limiter::unlimited_result_size })))
.has_only(a_row()
.with_column("pk", data_value(bytes("key1")))
.with_column("v", data_value::make_null(bytes_type)));
@@ -857,7 +860,8 @@ SEASTAR_TEST_CASE(test_query_result_with_one_regular_column_missing) {
auto slice = partition_slice_builder(*s).build();
assert_that(query::result_set::from_raw_result(s, slice, m.query(slice)))
assert_that(query::result_set::from_raw_result(s, slice,
m.query(slice, query::result_memory_accounter{ query::result_memory_limiter::unlimited_result_size })))
.has_only(a_row()
.with_column("pk", data_value(bytes("key1")))
.with_column("ck", data_value(bytes("ck:A")))
@@ -1243,8 +1247,10 @@ SEASTAR_TEST_CASE(test_query_digest) {
auto check_digests_equal = [] (const mutation& m1, const mutation& m2) {
auto ps1 = partition_slice_builder(*m1.schema()).build();
auto ps2 = partition_slice_builder(*m2.schema()).build();
auto digest1 = *m1.query(ps1, query::result_options::only_digest(query::digest_algorithm::xxHash)).digest();
auto digest2 = *m2.query(ps2, query::result_options::only_digest(query::digest_algorithm::xxHash)).digest();
auto digest1 = *m1.query(ps1, query::result_memory_accounter{ query::result_memory_limiter::unlimited_result_size },
query::result_options::only_digest(query::digest_algorithm::xxHash)).digest();
auto digest2 = *m2.query(ps2, query::result_memory_accounter{ query::result_memory_limiter::unlimited_result_size },
query::result_options::only_digest(query::digest_algorithm::xxHash)).digest();
if (digest1 != digest2) {
BOOST_FAIL(format("Digest should be the same for {} and {}", m1, m2));
}
@@ -1493,7 +1499,8 @@ SEASTAR_THREAD_TEST_CASE(test_querying_expired_rows) {
.without_partition_key_columns()
.build();
auto opts = query::result_options{query::result_request::result_and_digest, query::digest_algorithm::xxHash};
return query::result_set::from_raw_result(s, slice, m.query(slice, opts, t));
return query::result_set::from_raw_result(s, slice,
m.query(slice, query::result_memory_accounter{ query::result_memory_limiter::unlimited_result_size }, opts, t));
};
mutation m(s, pk);
@@ -1557,7 +1564,8 @@ SEASTAR_TEST_CASE(test_querying_expired_cells) {
.without_partition_key_columns()
.build();
auto opts = query::result_options{query::result_request::result_and_digest, query::digest_algorithm::xxHash};
return query::result_set::from_raw_result(s, slice, m.query(slice, opts, t));
return query::result_set::from_raw_result(s, slice,
m.query(slice, query::result_memory_accounter{ query::result_memory_limiter::unlimited_result_size }, opts, t));
};
{

View File

@@ -214,7 +214,7 @@ public:
auto querier = make_querier<Querier>(range);
auto [dk, ck] = querier.consume_page(dummy_result_builder{}, row_limit, std::numeric_limits<uint32_t>::max(),
gc_clock::now(), db::no_timeout, std::numeric_limits<uint64_t>::max()).get0();
gc_clock::now(), db::no_timeout, query::max_result_size(std::numeric_limits<uint64_t>::max())).get0();
const auto memory_usage = querier.memory_usage();
_cache.insert(cache_key, std::move(querier), nullptr);
@@ -658,25 +658,28 @@ SEASTAR_THREAD_TEST_CASE(test_resources_based_cache_eviction) {
auto s = cf.schema();
cf.flush().get();
auto slice = s->full_slice();
slice.options.set<query::partition_slice::option::allow_short_read>();
auto cmd1 = query::read_command(s->id(),
s->version(),
s->full_slice(),
slice,
1,
gc_clock::now(),
std::nullopt,
1,
utils::make_random_uuid());
utils::make_random_uuid(),
query::is_first_page::yes,
query::max_result_size(1024 * 1024));
// Should save the querier in cache.
db.query_mutations(s,
cmd1,
query::full_partition_range,
db.get_result_memory_limiter().new_mutation_read(1024 * 1024).get0(),
nullptr,
db::no_timeout).get();
auto& semaphore = db.make_query_class_config().semaphore;
auto& semaphore = db.get_reader_concurrency_semaphore();
auto permit = semaphore.make_permit();
BOOST_CHECK_EQUAL(db.get_querier_cache_stats().resource_based_evictions, 0);
@@ -695,18 +698,19 @@ SEASTAR_THREAD_TEST_CASE(test_resources_based_cache_eviction) {
auto cmd2 = query::read_command(s->id(),
s->version(),
s->full_slice(),
slice,
1,
gc_clock::now(),
std::nullopt,
1,
utils::make_random_uuid());
utils::make_random_uuid(),
query::is_first_page::no,
query::max_result_size(1024 * 1024));
// Should evict the already cached querier.
db.query_mutations(s,
cmd2,
query::full_partition_range,
db.get_result_memory_limiter().new_mutation_read(1024 * 1024).get0(),
nullptr,
db::no_timeout).get();
@@ -720,10 +724,10 @@ SEASTAR_THREAD_TEST_CASE(test_resources_based_cache_eviction) {
// of the tracked buffers.
cmd2.row_limit = query::max_rows;
cmd2.partition_limit = query::max_partitions;
cmd2.max_result_size.emplace(query::result_memory_limiter::unlimited_result_size);
db.query_mutations(s,
cmd2,
query::full_partition_range,
db.get_result_memory_limiter().new_mutation_read(1024 * 1024 * 1024 * 1024).get0(),
nullptr,
db::no_timeout).get();
return make_ready_future<>();

View File

@@ -555,7 +555,7 @@ SEASTAR_THREAD_TEST_CASE(test_view_update_generator_deadlock) {
t->add_sstable_and_update_cache(sst).get();
auto& sem = *with_scheduling_group(e.local_db().get_streaming_scheduling_group(), [&] () {
return &e.local_db().make_query_class_config().semaphore;
return &e.local_db().get_reader_concurrency_semaphore();
}).get0();
// consume all units except what is needed to admit a single reader.

View File

@@ -409,6 +409,13 @@ public:
create_directories((cfg->view_hints_directory() + "/" + std::to_string(i)).c_str());
}
if (!cfg->max_memory_for_unlimited_query_soft_limit.is_set()) {
cfg->max_memory_for_unlimited_query_soft_limit.set(uint64_t(query::result_memory_limiter::unlimited_result_size));
}
if (!cfg->max_memory_for_unlimited_query_hard_limit.is_set()) {
cfg->max_memory_for_unlimited_query_hard_limit.set(uint64_t(query::result_memory_limiter::unlimited_result_size));
}
sharded<locator::token_metadata> token_metadata;
token_metadata.start().get();
auto stop_token_metadata = defer([&token_metadata] { token_metadata.stop().get(); });

View File

@@ -33,8 +33,8 @@ reader_permit make_permit() {
return the_semaphore.make_permit();
}
query_class_config make_query_class_config() {
return query_class_config{the_semaphore, std::numeric_limits<uint64_t>::max()};
query::query_class_config make_query_class_config() {
return query::query_class_config{the_semaphore, query::max_result_size(std::numeric_limits<uint64_t>::max())};
}
} // namespace tests

View File

@@ -30,6 +30,6 @@ reader_concurrency_semaphore& semaphore();
reader_permit make_permit();
query_class_config make_query_class_config();
query::query_class_config make_query_class_config();
} // namespace tests

View File

@@ -308,7 +308,7 @@ int main(int argc, char** argv) {
auto prev_occupancy = logalloc::shard_tracker().occupancy();
testlog.info("Occupancy before: {}", prev_occupancy);
auto& sem = env.local_db().make_query_class_config().semaphore;
auto& sem = env.local_db().get_reader_concurrency_semaphore();
testlog.info("Reading");
stats_collector sc(sem, stats_collector_params);

View File

@@ -196,7 +196,8 @@ static sizes calculate_sizes(cache_tracker& tracker, const mutation_settings& se
result.cache = tracker.region().occupancy().used_space() - cache_initial_occupancy;
result.frozen = freeze(m).representation().size();
result.canonical = canonical_mutation(m).representation().size();
result.query_result = m.query(partition_slice_builder(*s).build(), query::result_options::only_result()).buf().size();
result.query_result = m.query(partition_slice_builder(*s).build(),
query::result_memory_accounter{ query::result_memory_limiter::unlimited_result_size }, query::result_options::only_result()).buf().size();
tmpdir sstable_dir;
sstables::test_env env;

View File

@@ -281,13 +281,14 @@ public:
if (!column_parent.super_column.empty()) {
fail(unimplemented::cause::SUPER);
}
auto cmd = slice_pred_to_read_cmd(*schema, predicate);
auto& proxy = service::get_local_storage_proxy();
auto cmd = slice_pred_to_read_cmd(proxy, *schema, predicate);
auto cell_limit = predicate.__isset.slice_range ? static_cast<uint32_t>(predicate.slice_range.count) : std::numeric_limits<uint32_t>::max();
auto pranges = make_partition_ranges(*schema, keys);
auto f = _query_state.get_client_state().has_schema_access(*schema, auth::permission::SELECT);
return f.then([this, schema, cmd, pranges = std::move(pranges), cell_limit, consistency_level, keys]() mutable {
return f.then([this, &proxy, schema, cmd, pranges = std::move(pranges), cell_limit, consistency_level, keys]() mutable {
auto timeout = db::timeout_clock::now() + _timeout_config.read_timeout;
return service::get_local_storage_proxy().query(schema, cmd, std::move(pranges), cl_from_thrift(consistency_level), {timeout, empty_service_permit(), _query_state.get_client_state()}).then(
return proxy.query(schema, cmd, std::move(pranges), cl_from_thrift(consistency_level), {timeout, empty_service_permit(), _query_state.get_client_state()}).then(
[schema, cmd, cell_limit, keys = std::move(keys)](service::storage_proxy::coordinator_query_result qr) {
return query::result_view::do_with(*qr.query_result, [schema, cmd, cell_limit, keys = std::move(keys)](query::result_view v) mutable {
if (schema->is_counter()) {
@@ -309,13 +310,14 @@ public:
if (!column_parent.super_column.empty()) {
fail(unimplemented::cause::SUPER);
}
auto cmd = slice_pred_to_read_cmd(*schema, predicate);
auto& proxy = service::get_local_storage_proxy();
auto cmd = slice_pred_to_read_cmd(proxy, *schema, predicate);
auto cell_limit = predicate.__isset.slice_range ? static_cast<uint32_t>(predicate.slice_range.count) : std::numeric_limits<uint32_t>::max();
auto pranges = make_partition_ranges(*schema, keys);
auto f = _query_state.get_client_state().has_schema_access(*schema, auth::permission::SELECT);
return f.then([this, schema, cmd, pranges = std::move(pranges), cell_limit, consistency_level, keys]() mutable {
return f.then([this, &proxy, schema, cmd, pranges = std::move(pranges), cell_limit, consistency_level, keys]() mutable {
auto timeout = db::timeout_clock::now() + _timeout_config.read_timeout;
return service::get_local_storage_proxy().query(schema, cmd, std::move(pranges), cl_from_thrift(consistency_level), {timeout, empty_service_permit(), _query_state.get_client_state()}).then(
return proxy.query(schema, cmd, std::move(pranges), cl_from_thrift(consistency_level), {timeout, empty_service_permit(), _query_state.get_client_state()}).then(
[schema, cmd, cell_limit, keys = std::move(keys)](service::storage_proxy::coordinator_query_result qr) {
return query::result_view::do_with(*qr.query_result, [schema, cmd, cell_limit, keys = std::move(keys)](query::result_view v) mutable {
column_counter counter(*schema, cmd->slice, cell_limit, std::move(keys));
@@ -338,8 +340,9 @@ public:
if (!column_parent.super_column.empty()) {
fail(unimplemented::cause::SUPER);
}
auto& proxy = service::get_local_storage_proxy();
auto&& prange = make_partition_range(*schema, range);
auto cmd = slice_pred_to_read_cmd(*schema, predicate);
auto cmd = slice_pred_to_read_cmd(proxy, *schema, predicate);
// KeyRange::count is the number of thrift rows to return, while
// SlicePredicte::slice_range::count limits the number of thrift colums.
if (schema->thrift().is_dynamic()) {
@@ -350,9 +353,9 @@ public:
cmd->row_limit = range.count;
}
auto f = _query_state.get_client_state().has_schema_access(*schema, auth::permission::SELECT);
return f.then([this, schema, cmd, prange = std::move(prange), consistency_level] () mutable {
return f.then([this, &proxy, schema, cmd, prange = std::move(prange), consistency_level] () mutable {
auto timeout = db::timeout_clock::now() + _timeout_config.range_read_timeout;
return service::get_local_storage_proxy().query(schema, cmd, std::move(prange), cl_from_thrift(consistency_level), {timeout, empty_service_permit(), _query_state.get_client_state()}).then(
return proxy.query(schema, cmd, std::move(prange), cl_from_thrift(consistency_level), {timeout, empty_service_permit(), _query_state.get_client_state()}).then(
[schema, cmd](service::storage_proxy::coordinator_query_result qr) {
return query::result_view::do_with(*qr.query_result, [schema, cmd](query::result_view v) {
return to_key_slices(*schema, cmd->slice, v, std::numeric_limits<uint32_t>::max());
@@ -362,7 +365,7 @@ public:
});
}
static lw_shared_ptr<query::read_command> make_paged_read_cmd(const schema& s, uint32_t column_limit, const std::string* start_column, const dht::partition_range_vector& range) {
static lw_shared_ptr<query::read_command> make_paged_read_cmd(service::storage_proxy& proxy, const schema& s, uint32_t column_limit, const std::string* start_column, const dht::partition_range_vector& range) {
auto opts = query_opts(s);
std::vector<query::clustering_range> clustering_ranges;
query::column_id_vector regular_columns;
@@ -394,7 +397,8 @@ public:
clustering_ranges.emplace_back(query::clustering_range::make_open_ended_both_sides());
auto slice = query::partition_slice(std::move(clustering_ranges), { }, std::move(regular_columns), opts,
std::move(specific_ranges), cql_serialization_format::internal());
return make_lw_shared<query::read_command>(s.id(), s.version(), std::move(slice), row_limit, gc_clock::now(), std::nullopt, partition_limit);
return make_lw_shared<query::read_command>(s.id(), s.version(), std::move(slice), proxy.get_max_result_size(slice),
query::row_limit(row_limit), query::partition_limit(partition_limit));
}
static future<> do_get_paged_slice(
@@ -406,7 +410,8 @@ public:
const ::timeout_config& timeout_config,
std::vector<KeySlice>& output,
service::query_state& qs) {
auto cmd = make_paged_read_cmd(*schema, column_limit, start_column, range);
auto& proxy = service::get_local_storage_proxy();
auto cmd = make_paged_read_cmd(proxy, *schema, column_limit, start_column, range);
std::optional<partition_key> start_key;
auto end = range[0].end();
if (start_column && !schema->thrift().is_dynamic()) {
@@ -417,7 +422,7 @@ public:
}
auto range1 = range; // query() below accepts an rvalue, so need a copy to reuse later
auto timeout = db::timeout_clock::now() + timeout_config.range_read_timeout;
return service::get_local_storage_proxy().query(schema, cmd, std::move(range), consistency_level, {timeout, empty_service_permit(), qs.get_client_state()}).then(
return proxy.query(schema, cmd, std::move(range), consistency_level, {timeout, empty_service_permit(), qs.get_client_state()}).then(
[schema, cmd, column_limit](service::storage_proxy::coordinator_query_result qr) {
return query::result_view::do_with(*qr.query_result, [schema, cmd, column_limit](query::result_view v) {
return to_key_slices(*schema, cmd->slice, v, column_limit);
@@ -664,11 +669,13 @@ public:
}
}
auto slice = query::partition_slice(std::move(clustering_ranges), {}, std::move(regular_columns), opts, nullptr);
auto cmd = make_lw_shared<query::read_command>(schema->id(), schema->version(), std::move(slice), row_limit);
auto& proxy = service::get_local_storage_proxy();
auto cmd = make_lw_shared<query::read_command>(schema->id(), schema->version(), std::move(slice), proxy.get_max_result_size(slice),
query::row_limit(row_limit));
auto f = _query_state.get_client_state().has_schema_access(*schema, auth::permission::SELECT);
return f.then([this, dk = std::move(dk), cmd, schema, column_limit = request.count, cl = request.consistency_level] {
return f.then([this, &proxy, dk = std::move(dk), cmd, schema, column_limit = request.count, cl = request.consistency_level] {
auto timeout = db::timeout_clock::now() + _timeout_config.read_timeout;
return service::get_local_storage_proxy().query(schema, cmd, {dht::partition_range::make_singular(dk)}, cl_from_thrift(cl), {timeout, /* FIXME: pass real permit */empty_service_permit(), _query_state.get_client_state()}).then(
return proxy.query(schema, cmd, {dht::partition_range::make_singular(dk)}, cl_from_thrift(cl), {timeout, /* FIXME: pass real permit */empty_service_permit(), _query_state.get_client_state()}).then(
[schema, cmd, column_limit](service::storage_proxy::coordinator_query_result qr) {
return query::result_view::do_with(*qr.query_result, [schema, cmd, column_limit](query::result_view v) {
column_aggregator<query_order::no> aggregator(*schema, cmd->slice, column_limit, { });
@@ -1464,7 +1471,7 @@ private:
opts.set(query::partition_slice::option::send_partition_key);
return opts;
}
static lw_shared_ptr<query::read_command> slice_pred_to_read_cmd(const schema& s, const SlicePredicate& predicate) {
static lw_shared_ptr<query::read_command> slice_pred_to_read_cmd(service::storage_proxy& proxy, const schema& s, const SlicePredicate& predicate) {
auto opts = query_opts(s);
std::vector<query::clustering_range> clustering_ranges;
query::column_id_vector regular_columns;
@@ -1511,7 +1518,7 @@ private:
}
auto slice = query::partition_slice(std::move(clustering_ranges), {}, std::move(regular_columns), opts,
nullptr, cql_serialization_format::internal(), per_partition_row_limit);
return make_lw_shared<query::read_command>(s.id(), s.version(), std::move(slice));
return make_lw_shared<query::read_command>(s.id(), s.version(), std::move(slice), proxy.get_max_result_size(slice));
}
static ColumnParent column_path_to_column_parent(const ColumnPath& column_path) {
ColumnParent ret;

View File

@@ -210,6 +210,17 @@ utils::config_type::to_json(const void* value) const {
return _to_json(value);
}
bool
utils::config_file::config_src::matches(std::string_view name) const {
    // A config source is addressable by its primary name or, when one is
    // set, by its alias. An unset alias is the empty string_view and must
    // never match (not even an empty query name), hence the emptiness guard.
    return _name == name || (!_alias.empty() && _alias == name);
}
json::json_return_type
utils::config_file::config_src::value_as_json() const {
return _type->to_json(current_value());
@@ -259,10 +270,7 @@ bpo::options_description_easy_init&
utils::config_file::add_options(bpo::options_description_easy_init& init) {
for (config_src& src : _cfgs) {
if (src.status() == value_status::Used) {
auto&& name = src.name();
sstring tmp(name.begin(), name.end());
std::replace(tmp.begin(), tmp.end(), '_', '-');
src.add_command_line_option(init, tmp, src.desc());
src.add_command_line_option(init);
}
}
return init;
@@ -291,7 +299,7 @@ void utils::config_file::read_from_yaml(const char* yaml, error_handler h) {
for (auto node : doc) {
auto label = node.first.as<sstring>();
auto i = std::find_if(_cfgs.begin(), _cfgs.end(), [&label](const config_src& cfg) { return cfg.name() == label; });
auto i = std::find_if(_cfgs.begin(), _cfgs.end(), [&label](const config_src& cfg) { return cfg.matches(label); });
if (i == _cfgs.end()) {
h(label, "Unknown option", std::nullopt);
continue;

View File

@@ -98,7 +98,7 @@ public:
struct config_src {
config_file* _cf;
std::string_view _name, _desc;
std::string_view _name, _alias, _desc;
const config_type* _type;
size_t _per_shard_values_offset;
protected:
@@ -110,11 +110,21 @@ public:
, _desc(desc)
, _type(type)
{}
// Construct a config source that is recognized under both a primary `name`
// and an alternative `alias` name (see matches()); `alias` may be empty,
// in which case only the primary name is recognized.
config_src(config_file* cf, std::string_view name, std::string_view alias, const config_type* type, std::string_view desc)
    : _cf(cf)
    , _name(name)
    , _alias(alias)
    , _desc(desc)
    , _type(type)
{}
virtual ~config_src() {}
// Primary (canonical) option name.
const std::string_view & name() const {
return _name;
}
// Alternative name this option also answers to; empty when no alias is set.
std::string_view alias() const {
return _alias;
}
// Human-readable description; passed through to the command-line
// option registration (see add_command_line_option).
const std::string_view & desc() const {
return _desc;
}
@@ -124,9 +134,8 @@ public:
config_file * get_config_file() const {
return _cf;
}
virtual void add_command_line_option(
bpo::options_description_easy_init&, const std::string_view&,
const std::string_view&) = 0;
bool matches(std::string_view name) const;
virtual void add_command_line_option(bpo::options_description_easy_init&) = 0;
virtual void set_value(const YAML::Node&) = 0;
virtual value_status status() const = 0;
virtual config_source source() const = 0;
@@ -168,18 +177,25 @@ public:
typedef T type;
typedef named_value<T> MyType;
named_value(config_file* file, std::string_view name, liveness liveness_, value_status vs, const T& t = T(), std::string_view desc = {},
named_value(config_file* file, std::string_view name, std::string_view alias, liveness liveness_, value_status vs, const T& t = T(), std::string_view desc = {},
std::initializer_list<T> allowed_values = {})
: config_src(file, name, &config_type_for<T>, desc)
: config_src(file, name, alias, &config_type_for<T>, desc)
, _value_status(vs)
, _liveness(liveness_)
, _allowed_values(std::move(allowed_values))
{
, _allowed_values(std::move(allowed_values)) {
file->add(*this, std::make_unique<the_value_type>(std::move(t)));
}
named_value(config_file* file, std::string_view name, liveness liveness_, value_status vs, const T& t = T(), std::string_view desc = {},
std::initializer_list<T> allowed_values = {})
: named_value(file, name, {}, liveness_, vs, t, desc) {
}
named_value(config_file* file, std::string_view name, std::string_view alias, value_status vs, const T& t = T(), std::string_view desc = {},
std::initializer_list<T> allowed_values = {})
: named_value(file, name, alias, liveness::MustRestart, vs, t, desc, allowed_values) {
}
named_value(config_file* file, std::string_view name, value_status vs, const T& t = T(), std::string_view desc = {},
std::initializer_list<T> allowed_values = {})
: named_value(file, name, liveness::MustRestart, vs, t, desc, allowed_values) {
: named_value(file, name, {}, liveness::MustRestart, vs, t, desc, allowed_values) {
}
value_status status() const override {
return _value_status;
@@ -222,8 +238,7 @@ public:
return the_value().observe(std::move(callback));
}
void add_command_line_option(bpo::options_description_easy_init&,
const std::string_view&, const std::string_view&) override;
void add_command_line_option(bpo::options_description_easy_init&) override;
void set_value(const YAML::Node&) override;
};

View File

@@ -192,13 +192,17 @@ sstring hyphenate(const std::string_view&);
}
template<typename T>
void utils::config_file::named_value<T>::add_command_line_option(
boost::program_options::options_description_easy_init& init,
const std::string_view& name, const std::string_view& desc) {
void utils::config_file::named_value<T>::add_command_line_option(boost::program_options::options_description_easy_init& init) {
const auto hyphenated_name = hyphenate(name());
// NOTE. We are not adding default values. We could, but must in that case manually (in some way) geenrate the textual
// version, since the available ostream operators for things like pairs and collections don't match what we can deal with parser-wise.
// See removed ostream operators above.
init(hyphenate(name).data(), value_ex<T>()->notifier([this](T new_val) { set(std::move(new_val), config_source::CommandLine); }), desc.data());
init(hyphenated_name.data(), value_ex<T>()->notifier([this](T new_val) { set(std::move(new_val), config_source::CommandLine); }), desc().data());
if (!alias().empty()) {
const auto alias_desc = fmt::format("Alias for {}", hyphenated_name);
init(hyphenate(alias()).data(), value_ex<T>()->notifier([this](T new_val) { set(std::move(new_val), config_source::CommandLine); }), alias_desc.data());
}
}
template<typename T>