From 003f5e9e54625bb1cf2c72de0a1ef93f4bd14d38 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Botond=20D=C3=A9nes?= Date: Mon, 27 Jul 2020 14:24:39 +0300 Subject: [PATCH 01/26] utils: config: add alias support Allow configuration items to also have an alias, besides the name. This allows easy replacement of configuration items, with newer names, while still supporting the old name for backward compatibility. The alias mechanism takes care of registering both the name and the alias as command line arguments, as well as parsing them from YAML. The command line documentation of the alias will just refer to the name for documentation. --- utils/config_file.cc | 13 ++++++++++++- utils/config_file.hh | 30 ++++++++++++++++++++++++------ utils/config_file_impl.hh | 8 +++++++- 3 files changed, 43 insertions(+), 8 deletions(-) diff --git a/utils/config_file.cc b/utils/config_file.cc index 843343e4ec..a366e0b487 100644 --- a/utils/config_file.cc +++ b/utils/config_file.cc @@ -210,6 +210,17 @@ utils::config_type::to_json(const void* value) const { return _to_json(value); } +bool +utils::config_file::config_src::matches(std::string_view name) const { + if (_name == name) { + return true; + } + if (!_alias.empty() && _alias == name) { + return true; + } + return false; +} + json::json_return_type utils::config_file::config_src::value_as_json() const { return _type->to_json(current_value()); @@ -291,7 +302,7 @@ void utils::config_file::read_from_yaml(const char* yaml, error_handler h) { for (auto node : doc) { auto label = node.first.as(); - auto i = std::find_if(_cfgs.begin(), _cfgs.end(), [&label](const config_src& cfg) { return cfg.name() == label; }); + auto i = std::find_if(_cfgs.begin(), _cfgs.end(), [&label](const config_src& cfg) { return cfg.matches(label); }); if (i == _cfgs.end()) { h(label, "Unknown option", std::nullopt); continue; diff --git a/utils/config_file.hh b/utils/config_file.hh index b7390f12ec..0220ed96c1 100644 --- a/utils/config_file.hh +++ 
b/utils/config_file.hh @@ -98,7 +98,7 @@ public: struct config_src { config_file* _cf; - std::string_view _name, _desc; + std::string_view _name, _alias, _desc; const config_type* _type; size_t _per_shard_values_offset; protected: @@ -110,11 +110,21 @@ public: , _desc(desc) , _type(type) {} + config_src(config_file* cf, std::string_view name, std::string_view alias, const config_type* type, std::string_view desc) + : _cf(cf) + , _name(name) + , _alias(alias) + , _desc(desc) + , _type(type) + {} virtual ~config_src() {} const std::string_view & name() const { return _name; } + std::string_view alias() const { + return _alias; + } const std::string_view & desc() const { return _desc; } @@ -124,6 +134,7 @@ public: config_file * get_config_file() const { return _cf; } + bool matches(std::string_view name) const; virtual void add_command_line_option( bpo::options_description_easy_init&, const std::string_view&, const std::string_view&) = 0; @@ -168,18 +179,25 @@ public: typedef T type; typedef named_value MyType; - named_value(config_file* file, std::string_view name, liveness liveness_, value_status vs, const T& t = T(), std::string_view desc = {}, + named_value(config_file* file, std::string_view name, std::string_view alias, liveness liveness_, value_status vs, const T& t = T(), std::string_view desc = {}, std::initializer_list allowed_values = {}) - : config_src(file, name, &config_type_for, desc) + : config_src(file, name, alias, &config_type_for, desc) , _value_status(vs) , _liveness(liveness_) - , _allowed_values(std::move(allowed_values)) - { + , _allowed_values(std::move(allowed_values)) { file->add(*this, std::make_unique(std::move(t))); } + named_value(config_file* file, std::string_view name, liveness liveness_, value_status vs, const T& t = T(), std::string_view desc = {}, + std::initializer_list allowed_values = {}) + : named_value(file, name, {}, liveness_, vs, t, desc) { + } + named_value(config_file* file, std::string_view name, std::string_view alias, 
value_status vs, const T& t = T(), std::string_view desc = {}, + std::initializer_list allowed_values = {}) + : named_value(file, name, alias, liveness::MustRestart, vs, t, desc, allowed_values) { + } named_value(config_file* file, std::string_view name, value_status vs, const T& t = T(), std::string_view desc = {}, std::initializer_list allowed_values = {}) - : named_value(file, name, liveness::MustRestart, vs, t, desc, allowed_values) { + : named_value(file, name, {}, liveness::MustRestart, vs, t, desc, allowed_values) { } value_status status() const override { return _value_status; diff --git a/utils/config_file_impl.hh b/utils/config_file_impl.hh index c898ecad40..f6b3a2670a 100644 --- a/utils/config_file_impl.hh +++ b/utils/config_file_impl.hh @@ -195,10 +195,16 @@ template void utils::config_file::named_value::add_command_line_option( boost::program_options::options_description_easy_init& init, const std::string_view& name, const std::string_view& desc) { + const auto hyphenated_name = hyphenate(name); // NOTE. We are not adding default values. We could, but must in that case manually (in some way) geenrate the textual // version, since the available ostream operators for things like pairs and collections don't match what we can deal with parser-wise. // See removed ostream operators above. 
- init(hyphenate(name).data(), value_ex()->notifier([this](T new_val) { set(std::move(new_val), config_source::CommandLine); }), desc.data()); + init(hyphenated_name.data(), value_ex()->notifier([this](T new_val) { set(std::move(new_val), config_source::CommandLine); }), desc.data()); + + if (!alias().empty()) { + const auto alias_desc = fmt::format("Alias for {}", hyphenated_name); + init(hyphenate(alias()).data(), value_ex()->notifier([this](T new_val) { set(std::move(new_val), config_source::CommandLine); }), alias_desc.data()); + } } template From dc23736d0c5d43d551663b34e99648092286dcaf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Botond=20D=C3=A9nes?= Date: Mon, 27 Jul 2020 15:35:00 +0300 Subject: [PATCH 02/26] db/config: replace ad-hoc aliases with alias mechanism We already use aliases for some configuration items, although these are created with an ad-hoc mechanism that only registers them on the command line. Replace this with the built-in alias mechanism in the previous patch, which has the benefit of conflict resolution and also working with YAML. --- db/config.cc | 17 +++-------------- db/config.hh | 3 --- 2 files changed, 3 insertions(+), 17 deletions(-) diff --git a/db/config.cc b/db/config.cc index b3af507875..6c1bfcde4e 100644 --- a/db/config.cc +++ b/db/config.cc @@ -224,7 +224,7 @@ db::config::config(std::shared_ptr exts) "The directory in which Scylla will put all its subdirectories. The location of individual subdirs can be overriden by the respective *_directory options.") , commitlog_directory(this, "commitlog_directory", value_status::Used, "", "The directory where the commit log is stored. 
For optimal write performance, it is recommended the commit log be on a separate disk partition (ideally, a separate physical device) from the data file directories.") - , data_file_directories(this, "data_file_directories", value_status::Used, { }, + , data_file_directories(this, "data_file_directories", "datadir", value_status::Used, { }, "The directory location where table data (SSTables) is stored") , hints_directory(this, "hints_directory", value_status::Used, "", "The directory where hints files are stored if hinted handoff is enabled.") @@ -517,7 +517,7 @@ db::config::config(std::shared_ptr exts) /* Native transport (CQL Binary Protocol) */ , start_native_transport(this, "start_native_transport", value_status::Used, true, "Enable or disable the native transport server. Uses the same address as the rpc_address, but the port is different from the rpc_port. See native_transport_port.") - , native_transport_port(this, "native_transport_port", value_status::Used, 9042, + , native_transport_port(this, "native_transport_port", "cql_port", value_status::Used, 9042, "Port on which the CQL native transport listens for clients.") , native_transport_port_ssl(this, "native_transport_port_ssl", value_status::Used, 9142, "Port on which the CQL TLS native transport listens for clients." @@ -536,7 +536,7 @@ db::config::config(std::shared_ptr exts) /* Settings for configuring and tuning client connections. */ , broadcast_rpc_address(this, "broadcast_rpc_address", value_status::Used, {/* unset */}, "RPC address to broadcast to drivers and other Scylla nodes. This cannot be set to 0.0.0.0. If blank, it is set to the value of the rpc_address or rpc_interface. 
If rpc_address or rpc_interfaceis set to 0.0.0.0, this property must be set.\n") - , rpc_port(this, "rpc_port", value_status::Used, 9160, + , rpc_port(this, "rpc_port", "thrift_port", value_status::Used, 9160, "Thrift port for client connections.") , start_rpc(this, "start_rpc", value_status::Used, true, "Starts the Thrift RPC server") @@ -830,17 +830,6 @@ void config_file::named_value::add_command_line_ } -boost::program_options::options_description_easy_init& -db::config::add_options(boost::program_options::options_description_easy_init& init) { - config_file::add_options(init); - - data_file_directories.add_command_line_option(init, "datadir", "alias for 'data-file-directories'"); - rpc_port.add_command_line_option(init, "thrift-port", "alias for 'rpc-port'"); - native_transport_port.add_command_line_option(init, "cql-port", "alias for 'native-transport-port'"); - - return init; -} - db::fs::path db::config::get_conf_dir() { using namespace db::fs; diff --git a/db/config.hh b/db/config.hh index d85d61ed68..86bb1d1bf6 100644 --- a/db/config.hh +++ b/db/config.hh @@ -329,9 +329,6 @@ public: seastar::logging_settings logging_settings(const boost::program_options::variables_map&) const; - boost::program_options::options_description_easy_init& - add_options(boost::program_options::options_description_easy_init&); - const db::extensions& extensions() const; static const sstring default_tls_priority; From 9faaf46d4b2df63d778091aba934e10b38195e58 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Botond=20D=C3=A9nes?= Date: Mon, 27 Jul 2020 15:42:49 +0300 Subject: [PATCH 03/26] utils: config_src::add_command_line_options(): drop name and desc args Now that there are no ad-hoc aliases needing to overwrite the name and description parameter of this method, we can drop these and have each config item just use `name()` and `desc()` to access these. 
--- db/config.cc | 11 +++++------ db/config.hh | 3 +-- utils/config_file.cc | 5 +---- utils/config_file.hh | 7 ++----- utils/config_file_impl.hh | 8 +++----- 5 files changed, 12 insertions(+), 22 deletions(-) diff --git a/db/config.cc b/db/config.cc index 6c1bfcde4e..ee27edc368 100644 --- a/db/config.cc +++ b/db/config.cc @@ -808,24 +808,23 @@ namespace utils { template<> void config_file::named_value::add_command_line_option( - boost::program_options::options_description_easy_init& init, - const std::string_view& name, const std::string_view& desc) { - init((hyphenate(name) + "-class-name").data(), + boost::program_options::options_description_easy_init& init) { + init((hyphenate(name()) + "-class-name").data(), value_ex()->notifier( [this](sstring new_class_name) { auto old_seed_provider = operator()(); old_seed_provider.class_name = new_class_name; set(std::move(old_seed_provider), config_source::CommandLine); }), - desc.data()); - init((hyphenate(name) + "-parameters").data(), + desc().data()); + init((hyphenate(name()) + "-parameters").data(), value_ex>()->notifier( [this](std::unordered_map new_parameters) { auto old_seed_provider = operator()(); old_seed_provider.parameters = new_parameters; set(std::move(old_seed_provider), config_source::CommandLine); }), - desc.data()); + desc().data()); } } diff --git a/db/config.hh b/db/config.hh index 86bb1d1bf6..3d225f987c 100644 --- a/db/config.hh +++ b/db/config.hh @@ -343,8 +343,7 @@ private: return this->is_set() ? (*this)() : t; } // do not add to boost::options. 
We only care about yaml config - void add_command_line_option(boost::program_options::options_description_easy_init&, - const std::string_view&, const std::string_view&) override {} + void add_command_line_option(boost::program_options::options_description_easy_init&) override {} }; log_legacy_value default_log_level; diff --git a/utils/config_file.cc b/utils/config_file.cc index a366e0b487..e68aaef20a 100644 --- a/utils/config_file.cc +++ b/utils/config_file.cc @@ -270,10 +270,7 @@ bpo::options_description_easy_init& utils::config_file::add_options(bpo::options_description_easy_init& init) { for (config_src& src : _cfgs) { if (src.status() == value_status::Used) { - auto&& name = src.name(); - sstring tmp(name.begin(), name.end()); - std::replace(tmp.begin(), tmp.end(), '_', '-'); - src.add_command_line_option(init, tmp, src.desc()); + src.add_command_line_option(init); } } return init; diff --git a/utils/config_file.hh b/utils/config_file.hh index 0220ed96c1..c9b8978c7f 100644 --- a/utils/config_file.hh +++ b/utils/config_file.hh @@ -135,9 +135,7 @@ public: return _cf; } bool matches(std::string_view name) const; - virtual void add_command_line_option( - bpo::options_description_easy_init&, const std::string_view&, - const std::string_view&) = 0; + virtual void add_command_line_option(bpo::options_description_easy_init&) = 0; virtual void set_value(const YAML::Node&) = 0; virtual value_status status() const = 0; virtual config_source source() const = 0; @@ -240,8 +238,7 @@ public: return the_value().observe(std::move(callback)); } - void add_command_line_option(bpo::options_description_easy_init&, - const std::string_view&, const std::string_view&) override; + void add_command_line_option(bpo::options_description_easy_init&) override; void set_value(const YAML::Node&) override; }; diff --git a/utils/config_file_impl.hh b/utils/config_file_impl.hh index f6b3a2670a..4049bfc40c 100644 --- a/utils/config_file_impl.hh +++ b/utils/config_file_impl.hh @@ -192,14 +192,12 
@@ sstring hyphenate(const std::string_view&); } template -void utils::config_file::named_value::add_command_line_option( - boost::program_options::options_description_easy_init& init, - const std::string_view& name, const std::string_view& desc) { - const auto hyphenated_name = hyphenate(name); +void utils::config_file::named_value::add_command_line_option(boost::program_options::options_description_easy_init& init) { + const auto hyphenated_name = hyphenate(name()); // NOTE. We are not adding default values. We could, but must in that case manually (in some way) geenrate the textual // version, since the available ostream operators for things like pairs and collections don't match what we can deal with parser-wise. // See removed ostream operators above. - init(hyphenated_name.data(), value_ex()->notifier([this](T new_val) { set(std::move(new_val), config_source::CommandLine); }), desc.data()); + init(hyphenated_name.data(), value_ex()->notifier([this](T new_val) { set(std::move(new_val), config_source::CommandLine); }), desc().data()); if (!alias().empty()) { const auto alias_desc = fmt::format("Alias for {}", hyphenated_name); From 46d5b651eb4ed445ac0bc3d2aca067c643181695 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Botond=20D=C3=A9nes?= Date: Mon, 27 Jul 2020 15:25:58 +0300 Subject: [PATCH 04/26] db/config: introduce max_memory_for_unlimited_query_soft_limit and max_memory_for_unlimited_query_hard_limit This pair of limits replace the old max_memory_for_unlimited_query one, which remains as an alias to the hard limit. The soft limit inherits the previous value of the limit (1MB), when this limit is reached a warning will be logged allowing the users to adjust their client codes without downtime. The hard limit starts out with a more permissive default of 100MB. When this is reached queries are aborted, the same behaviour as with the previous single limit. 
The idea is to allow clients a grace period for fixing their code, while at the same time protecting the database from the really bad queries. --- database.cc | 2 +- db/config.cc | 8 ++++++-- db/config.hh | 3 ++- test/boost/cql_query_test.cc | 3 ++- 4 files changed, 11 insertions(+), 5 deletions(-) diff --git a/database.cc b/database.cc index be1d5c9e5c..dcb19880a0 100644 --- a/database.cc +++ b/database.cc @@ -1282,7 +1282,7 @@ void database::register_connection_drop_notifier(netw::messaging_service& ms) { query_class_config database::make_query_class_config() { // Everything running in the statement group is considered a user query if (current_scheduling_group() == _dbcfg.statement_scheduling_group) { - return query_class_config{_read_concurrency_sem, _cfg.max_memory_for_unlimited_query()}; + return query_class_config{_read_concurrency_sem, _cfg.max_memory_for_unlimited_query_hard_limit()}; // Reads done on behalf of view update generation run in the streaming group } else if (current_scheduling_group() == _dbcfg.streaming_scheduling_group) { return query_class_config{_streaming_concurrency_sem, std::numeric_limits::max()}; diff --git a/db/config.cc b/db/config.cc index ee27edc368..0c98b9c4ff 100644 --- a/db/config.cc +++ b/db/config.cc @@ -722,8 +722,12 @@ db::config::config(std::shared_ptr exts) , max_clustering_key_restrictions_per_query(this, "max_clustering_key_restrictions_per_query", liveness::LiveUpdate, value_status::Used, 100, "Maximum number of distinct clustering key restrictions per query. This limit places a bound on the size of IN tuples, " "especially when multiple clustering key columns have IN restrictions. Increasing this value can result in server instability.") - , max_memory_for_unlimited_query(this, "max_memory_for_unlimited_query", liveness::LiveUpdate, value_status::Used, size_t(1) << 20, - "Maximum amount of memory a query, whose memory consumption is not naturally limited, is allowed to consume, e.g. 
non-paged and reverse queries.") + , max_memory_for_unlimited_query_soft_limit(this, "max_memory_for_unlimited_query_soft_limit", liveness::LiveUpdate, value_status::Used, uint64_t(1) << 20, + "Maximum amount of memory a query, whose memory consumption is not naturally limited, is allowed to consume, e.g. non-paged and reverse queries. " + "This is the soft limit, there will be a warning logged for queries violating this limit.") + , max_memory_for_unlimited_query_hard_limit(this, "max_memory_for_unlimited_query_hard_limit", "max_memory_for_unlimited_query", liveness::LiveUpdate, value_status::Used, (uint64_t(100) << 20), + "Maximum amount of memory a query, whose memory consumption is not naturally limited, is allowed to consume, e.g. non-paged and reverse queries. " + "This is the hard limit, queries violating this limit will be aborted.") , initial_sstable_loading_concurrency(this, "initial_sstable_loading_concurrency", value_status::Used, 4u, "Maximum amount of sstables to load in parallel during initialization. A higher number can lead to more memory consumption. 
You should not need to touch this") , enable_3_1_0_compatibility_mode(this, "enable_3_1_0_compatibility_mode", value_status::Used, false, diff --git a/db/config.hh b/db/config.hh index 3d225f987c..feadfa0dda 100644 --- a/db/config.hh +++ b/db/config.hh @@ -303,7 +303,8 @@ public: named_value abort_on_internal_error; named_value max_partition_key_restrictions_per_query; named_value max_clustering_key_restrictions_per_query; - named_value max_memory_for_unlimited_query; + named_value max_memory_for_unlimited_query_soft_limit; + named_value max_memory_for_unlimited_query_hard_limit; named_value initial_sstable_loading_concurrency; named_value enable_3_1_0_compatibility_mode; named_value enable_user_defined_functions; diff --git a/test/boost/cql_query_test.cc b/test/boost/cql_query_test.cc index 0722295bd7..7a14262168 100644 --- a/test/boost/cql_query_test.cc +++ b/test/boost/cql_query_test.cc @@ -2765,7 +2765,8 @@ SEASTAR_TEST_CASE(test_reversed_slice_with_empty_range_before_all_rows) { // See #6171 SEASTAR_TEST_CASE(test_reversed_slice_with_many_clustering_ranges) { cql_test_config cfg; - cfg.db_config->max_memory_for_unlimited_query(std::numeric_limits::max()); + cfg.db_config->max_memory_for_unlimited_query_soft_limit(std::numeric_limits::max()); + cfg.db_config->max_memory_for_unlimited_query_hard_limit(std::numeric_limits::max()); return do_with_cql_env_thread([] (cql_test_env& e) { e.execute_cql("CREATE TABLE test (pk int, ck int, v text, PRIMARY KEY (pk, ck));").get(); auto id = e.prepare("INSERT INTO test (pk, ck, v) VALUES (?, ?, ?);").get0(); From 517a941feb2dd6268840d7cfb8f09f364bec4c02 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Botond=20D=C3=A9nes?= Date: Tue, 28 Jul 2020 11:39:56 +0300 Subject: [PATCH 05/26] query_class_config: move into the query namespace It belongs there, its name even starts with "query". 
--- database.cc | 8 ++++---- database.hh | 6 +++--- mutation_partition.cc | 6 +++--- mutation_query.hh | 6 +++--- query_class_config.hh | 4 ++++ table.cc | 2 +- test/lib/reader_permit.cc | 4 ++-- test/lib/reader_permit.hh | 2 +- 8 files changed, 21 insertions(+), 17 deletions(-) diff --git a/database.cc b/database.cc index dcb19880a0..1471f6ae70 100644 --- a/database.cc +++ b/database.cc @@ -1279,16 +1279,16 @@ void database::register_connection_drop_notifier(netw::messaging_service& ms) { }); } -query_class_config database::make_query_class_config() { +query::query_class_config database::make_query_class_config() { // Everything running in the statement group is considered a user query if (current_scheduling_group() == _dbcfg.statement_scheduling_group) { - return query_class_config{_read_concurrency_sem, _cfg.max_memory_for_unlimited_query_hard_limit()}; + return query::query_class_config{_read_concurrency_sem, _cfg.max_memory_for_unlimited_query_hard_limit()}; // Reads done on behalf of view update generation run in the streaming group } else if (current_scheduling_group() == _dbcfg.streaming_scheduling_group) { - return query_class_config{_streaming_concurrency_sem, std::numeric_limits::max()}; + return query::query_class_config{_streaming_concurrency_sem, std::numeric_limits::max()}; // Everything else is considered a system query } else { - return query_class_config{_system_read_concurrency_sem, std::numeric_limits::max()}; + return query::query_class_config{_system_read_concurrency_sem, std::numeric_limits::max()}; } } diff --git a/database.hh b/database.hh index d0fb002b74..b5866183fb 100644 --- a/database.hh +++ b/database.hh @@ -748,7 +748,7 @@ public: // Returns at most "cmd.limit" rows future> query(schema_ptr, const query::read_command& cmd, - query_class_config class_config, + query::query_class_config class_config, query::result_options opts, const dht::partition_range_vector& ranges, tracing::trace_state_ptr trace_state, @@ -1294,7 +1294,7 @@ 
private: column_family*, schema_ptr, const query::read_command&, - query_class_config, + query::query_class_config, query::result_options, const dht::partition_range_vector&, tracing::trace_state_ptr, @@ -1594,7 +1594,7 @@ public: return _supports_infinite_bound_range_deletions; } - query_class_config make_query_class_config(); + query::query_class_config make_query_class_config(); }; future<> start_large_data_handler(sharded& db); diff --git a/mutation_partition.cc b/mutation_partition.cc index 59c7611028..113a8cbeba 100644 --- a/mutation_partition.cc +++ b/mutation_partition.cc @@ -2167,7 +2167,7 @@ future<> data_query( gc_clock::time_point query_time, query::result::builder& builder, db::timeout_clock::time_point timeout, - query_class_config class_config, + query::query_class_config class_config, tracing::trace_state_ptr trace_ptr, query::querier_cache_context cache_ctx) { @@ -2261,7 +2261,7 @@ static do_mutation_query(schema_ptr s, uint32_t partition_limit, gc_clock::time_point query_time, db::timeout_clock::time_point timeout, - query_class_config class_config, + query::query_class_config class_config, query::result_memory_accounter&& accounter, tracing::trace_state_ptr trace_ptr, query::querier_cache_context cache_ctx) @@ -2301,7 +2301,7 @@ mutation_query(schema_ptr s, uint32_t partition_limit, gc_clock::time_point query_time, db::timeout_clock::time_point timeout, - query_class_config class_config, + query::query_class_config class_config, query::result_memory_accounter&& accounter, tracing::trace_state_ptr trace_ptr, query::querier_cache_context cache_ctx) diff --git a/mutation_query.hh b/mutation_query.hh index ce2309a11e..dbd0374b09 100644 --- a/mutation_query.hh +++ b/mutation_query.hh @@ -163,7 +163,7 @@ future mutation_query( uint32_t partition_limit, gc_clock::time_point query_time, db::timeout_clock::time_point timeout, - query_class_config class_config, + query::query_class_config class_config, query::result_memory_accounter&& accounter = { }, 
tracing::trace_state_ptr trace_ptr = nullptr, query::querier_cache_context cache_ctx = { }); @@ -178,7 +178,7 @@ future<> data_query( gc_clock::time_point query_time, query::result::builder& builder, db::timeout_clock::time_point timeout, - query_class_config class_config, + query::query_class_config class_config, tracing::trace_state_ptr trace_ptr = nullptr, query::querier_cache_context cache_ctx = { }); @@ -193,7 +193,7 @@ class mutation_query_stage { uint32_t, gc_clock::time_point, db::timeout_clock::time_point, - query_class_config, + query::query_class_config, query::result_memory_accounter&&, tracing::trace_state_ptr, query::querier_cache_context> _execution_stage; diff --git a/query_class_config.hh b/query_class_config.hh index c1ad431904..d634bd5ad6 100644 --- a/query_class_config.hh +++ b/query_class_config.hh @@ -25,7 +25,11 @@ class reader_concurrency_semaphore; +namespace query { + struct query_class_config { reader_concurrency_semaphore& semaphore; uint64_t max_memory_for_unlimited_query; }; + +} diff --git a/table.cc b/table.cc index fc51b4967e..b7d77e674c 100644 --- a/table.cc +++ b/table.cc @@ -2032,7 +2032,7 @@ struct query_state { future> table::query(schema_ptr s, const query::read_command& cmd, - query_class_config class_config, + query::query_class_config class_config, query::result_options opts, const dht::partition_range_vector& partition_ranges, tracing::trace_state_ptr trace_state, diff --git a/test/lib/reader_permit.cc b/test/lib/reader_permit.cc index 3a9e4604c8..bf292784e7 100644 --- a/test/lib/reader_permit.cc +++ b/test/lib/reader_permit.cc @@ -33,8 +33,8 @@ reader_permit make_permit() { return the_semaphore.make_permit(); } -query_class_config make_query_class_config() { - return query_class_config{the_semaphore, std::numeric_limits::max()}; +query::query_class_config make_query_class_config() { + return query::query_class_config{the_semaphore, std::numeric_limits::max()}; } } // namespace tests diff --git a/test/lib/reader_permit.hh 
b/test/lib/reader_permit.hh index bc3d9b1a14..d6942c4813 100644 --- a/test/lib/reader_permit.hh +++ b/test/lib/reader_permit.hh @@ -30,6 +30,6 @@ reader_concurrency_semaphore& semaphore(); reader_permit make_permit(); -query_class_config make_query_class_config(); +query::query_class_config make_query_class_config(); } // namespace tests From 8aee7662a916c7f44498c1cecc63b870b03dcc61 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Botond=20D=C3=A9nes?= Date: Mon, 27 Jul 2020 16:20:12 +0300 Subject: [PATCH 06/26] query: introduce max_result_size To be used to pass around the soft/hard limit configured via `max_memory_for_unlimited_query_{soft,hard}_limit` in the codebase. --- query_class_config.hh | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/query_class_config.hh b/query_class_config.hh index d634bd5ad6..5119934e33 100644 --- a/query_class_config.hh +++ b/query_class_config.hh @@ -27,6 +27,15 @@ class reader_concurrency_semaphore; namespace query { +struct max_result_size { + uint64_t soft_limit = 0; + uint64_t hard_limit = 0; + + max_result_size() = default; + explicit max_result_size(uint64_t max_size) : soft_limit(max_size), hard_limit(max_size) { } + explicit max_result_size(uint64_t soft_limit, uint64_t hard_limit) : soft_limit(soft_limit), hard_limit(hard_limit) { } +}; + struct query_class_config { reader_concurrency_semaphore& semaphore; uint64_t max_memory_for_unlimited_query; From 92ce39f01476b1b2b9c278221dc0d1556f72d2dc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Botond=20D=C3=A9nes?= Date: Tue, 28 Jul 2020 11:56:25 +0300 Subject: [PATCH 07/26] query: query_class_config: use max_result_size for the max_memory_for_unlimited_query field We want to switch from using a single limit to a dual soft/hard limit. As a first step we switch the limit field of `query_class_config` to use the recently introduced type for this. 
As this field has a single user at the moment -- reverse queries (and not a lot of propagation) -- we update it in this same patch to use the soft/hard limit: warn on reaching the soft limit and abort on the hard limit (the previous behaviour). --- database.cc | 7 +++--- flat_mutation_reader.cc | 30 ++++++++++++++++--------- flat_mutation_reader.hh | 15 ++++++++----- querier.hh | 10 ++++----- query_class_config.hh | 2 +- test/boost/flat_mutation_reader_test.cc | 15 +++++++++---- test/boost/querier_cache_test.cc | 2 +- test/lib/reader_permit.cc | 2 +- 8 files changed, 52 insertions(+), 31 deletions(-) diff --git a/database.cc b/database.cc index 1471f6ae70..24e937896b 100644 --- a/database.cc +++ b/database.cc @@ -1282,13 +1282,14 @@ void database::register_connection_drop_notifier(netw::messaging_service& ms) { query::query_class_config database::make_query_class_config() { // Everything running in the statement group is considered a user query if (current_scheduling_group() == _dbcfg.statement_scheduling_group) { - return query::query_class_config{_read_concurrency_sem, _cfg.max_memory_for_unlimited_query_hard_limit()}; + return query::query_class_config{_read_concurrency_sem, + query::max_result_size(_cfg.max_memory_for_unlimited_query_soft_limit(), _cfg.max_memory_for_unlimited_query_hard_limit())}; // Reads done on behalf of view update generation run in the streaming group } else if (current_scheduling_group() == _dbcfg.streaming_scheduling_group) { - return query::query_class_config{_streaming_concurrency_sem, std::numeric_limits::max()}; + return query::query_class_config{_streaming_concurrency_sem, query::max_result_size(std::numeric_limits::max())}; // Everything else is considered a system query } else { - return query::query_class_config{_system_read_concurrency_sem, std::numeric_limits::max()}; + return query::query_class_config{_system_read_concurrency_sem, query::max_result_size(std::numeric_limits::max())}; } } diff --git a/flat_mutation_reader.cc 
b/flat_mutation_reader.cc index 746c09e6f8..c054a80c18 100644 --- a/flat_mutation_reader.cc +++ b/flat_mutation_reader.cc @@ -60,14 +60,15 @@ void flat_mutation_reader::impl::clear_buffer_to_next_partition() { _buffer_size = compute_buffer_size(*_schema, _buffer); } -flat_mutation_reader make_reversing_reader(flat_mutation_reader& original, size_t max_memory_consumption) { +flat_mutation_reader make_reversing_reader(flat_mutation_reader& original, query::max_result_size max_size) { class partition_reversing_mutation_reader final : public flat_mutation_reader::impl { flat_mutation_reader* _source; range_tombstone_list _range_tombstones; std::stack _mutation_fragments; mutation_fragment_opt _partition_end; size_t _stack_size = 0; - const size_t _max_stack_size; + const query::max_result_size _max_size; + bool _below_soft_limit = true; private: stop_iteration emit_partition() { auto emit_range_tombstone = [&] { @@ -119,7 +120,7 @@ flat_mutation_reader make_reversing_reader(flat_mutation_reader& original, size_ } else { _mutation_fragments.emplace(std::move(mf)); _stack_size += _mutation_fragments.top().memory_usage(*_schema); - if (_stack_size >= _max_stack_size) { + if (_stack_size > _max_size.hard_limit || (_stack_size > _max_size.soft_limit && _below_soft_limit)) { const partition_key* key = nullptr; auto it = buffer().end(); --it; @@ -129,21 +130,30 @@ flat_mutation_reader make_reversing_reader(flat_mutation_reader& original, size_ --it; key = &it->as_partition_start().key().key(); } - throw std::runtime_error(fmt::format( - "Aborting reverse partition read because partition {} is larger than the maximum safe size of {} for reversible partitions.", - key->with_schema(*_schema), - _max_stack_size)); + + if (_stack_size > _max_size.hard_limit) { + throw std::runtime_error(fmt::format( + "Memory usage of reversed read exceeds hard limit of {} (configured via max_memory_for_unlimited_query_hard_limit), while reading partition {}", + _max_size.hard_limit, + 
key->with_schema(*_schema))); + } else { + fmr_logger.warn( + "Memory usage of reversed read exceeds soft limit of {} (configured via max_memory_for_unlimited_query_soft_limit), while reading partition {}", + _max_size.soft_limit, + key->with_schema(*_schema)); + _below_soft_limit = false; + } } } } return make_ready_future(is_buffer_full()); } public: - explicit partition_reversing_mutation_reader(flat_mutation_reader& mr, size_t max_stack_size) + explicit partition_reversing_mutation_reader(flat_mutation_reader& mr, query::max_result_size max_size) : flat_mutation_reader::impl(mr.schema()) , _source(&mr) , _range_tombstones(*_schema) - , _max_stack_size(max_stack_size) + , _max_size(max_size) { } virtual future<> fill_buffer(db::timeout_clock::time_point timeout) override { @@ -185,7 +195,7 @@ flat_mutation_reader make_reversing_reader(flat_mutation_reader& original, size_ } }; - return make_flat_mutation_reader(original, max_memory_consumption); + return make_flat_mutation_reader(original, max_size); } template diff --git a/flat_mutation_reader.hh b/flat_mutation_reader.hh index 31373a44f6..a7b0a4ca23 100644 --- a/flat_mutation_reader.hh +++ b/flat_mutation_reader.hh @@ -29,6 +29,7 @@ #include "mutation_fragment.hh" #include "tracing/trace_state.hh" #include "mutation.hh" +#include "query_class_config.hh" #include #include @@ -720,15 +721,17 @@ make_generating_reader(schema_ptr s, std::function /// /// \param original the reader to be reversed, has to be kept alive while the /// reversing reader is in use. -/// \param max_memory_consumption the maximum amount of memory the reader is -/// allowed to use for reversing. The reverse reader reads entire partitions -/// into memory, before reversing them. Since partitions can be larger than -/// the available memory, we need to enforce a limit on memory consumption. -/// If the read uses more memory then this limit, the read is aborted. 
+/// \param max_size the maximum amount of memory the reader is allowed to use +/// for reversing and conversely the maximum size of the results. The +/// reverse reader reads entire partitions into memory, before reversing +/// them. Since partitions can be larger than the available memory, we need +/// to enforce a limit on memory consumption. When reaching the soft limit +/// a warning will be logged. When reaching the hard limit the read will be +/// aborted. /// /// FIXME: reversing should be done in the sstable layer, see #1413. flat_mutation_reader -make_reversing_reader(flat_mutation_reader& original, size_t max_memory_consumption); +make_reversing_reader(flat_mutation_reader& original, query::max_result_size max_size); /// Low level fragment stream validator. /// diff --git a/querier.hh b/querier.hh index 1e3f5691f5..04efbccc21 100644 --- a/querier.hh +++ b/querier.hh @@ -83,7 +83,7 @@ auto consume_page(flat_mutation_reader& reader, uint32_t partition_limit, gc_clock::time_point query_time, db::timeout_clock::time_point timeout, - size_t reverse_read_max_memory) { + query::max_result_size max_size) { return reader.peek(timeout).then([=, &reader, consumer = std::move(consumer), &slice] ( mutation_fragment* next_fragment) mutable { const auto next_fragment_kind = next_fragment ? 
next_fragment->mutation_fragment_kind() : mutation_fragment::kind::partition_end; @@ -94,9 +94,9 @@ auto consume_page(flat_mutation_reader& reader, compaction_state, clustering_position_tracker(std::move(consumer), last_ckey)); - auto consume = [&reader, &slice, reader_consumer = std::move(reader_consumer), timeout, reverse_read_max_memory] () mutable { + auto consume = [&reader, &slice, reader_consumer = std::move(reader_consumer), timeout, max_size] () mutable { if (slice.options.contains(query::partition_slice::option::reversed)) { - return do_with(make_reversing_reader(reader, reverse_read_max_memory), + return do_with(make_reversing_reader(reader, max_size), [reader_consumer = std::move(reader_consumer), timeout] (flat_mutation_reader& reversing_reader) mutable { return reversing_reader.consume(std::move(reader_consumer), timeout); }); @@ -223,9 +223,9 @@ public: uint32_t partition_limit, gc_clock::time_point query_time, db::timeout_clock::time_point timeout, - size_t reverse_read_max_memory) { + query::max_result_size max_size) { return ::query::consume_page(_reader, _compaction_state, *_slice, std::move(consumer), row_limit, partition_limit, query_time, - timeout, reverse_read_max_memory).then([this] (auto&& results) { + timeout, max_size).then([this] (auto&& results) { _last_ckey = std::get>(std::move(results)); constexpr auto size = std::tuple_size>::value; static_assert(size <= 2); diff --git a/query_class_config.hh b/query_class_config.hh index 5119934e33..168143219a 100644 --- a/query_class_config.hh +++ b/query_class_config.hh @@ -38,7 +38,7 @@ struct max_result_size { struct query_class_config { reader_concurrency_semaphore& semaphore; - uint64_t max_memory_for_unlimited_query; + max_result_size max_memory_for_unlimited_query; }; } diff --git a/test/boost/flat_mutation_reader_test.cc b/test/boost/flat_mutation_reader_test.cc index 7e2ffefe2f..f6ce5426b8 100644 --- a/test/boost/flat_mutation_reader_test.cc +++ b/test/boost/flat_mutation_reader_test.cc 
@@ -592,7 +592,7 @@ void test_flat_stream(schema_ptr s, std::vector muts, reversed_partiti return fmr.consume_in_thread(std::move(fsc), db::no_timeout); } else { if (reversed) { - auto reverse_reader = make_reversing_reader(fmr, size_t(1) << 20); + auto reverse_reader = make_reversing_reader(fmr, query::max_result_size(size_t(1) << 20)); return reverse_reader.consume(std::move(fsc), db::no_timeout).get0(); } return fmr.consume(std::move(fsc), db::no_timeout).get0(); @@ -805,7 +805,8 @@ SEASTAR_THREAD_TEST_CASE(test_reverse_reader_memory_limit) { auto test_with_partition = [&] (bool with_static_row) { testlog.info("Testing with_static_row={}", with_static_row); - auto mut = schema.new_mutation("pk1"); + const auto pk = "pk1"; + auto mut = schema.new_mutation(pk); const size_t desired_mut_size = 1 * 1024 * 1024; const size_t row_size = 10 * 1024; @@ -817,8 +818,9 @@ SEASTAR_THREAD_TEST_CASE(test_reverse_reader_memory_limit) { schema.add_row(mut, schema.make_ckey(++i), sstring(row_size, '0')); } + const uint64_t hard_limit = size_t(1) << 18; auto reader = flat_mutation_reader_from_mutations({mut}); - auto reverse_reader = make_reversing_reader(reader, size_t(1) << 10); + auto reverse_reader = make_reversing_reader(reader, query::max_result_size(size_t(1) << 10, hard_limit)); try { reverse_reader.consume(phony_consumer{}, db::no_timeout).get(); @@ -826,7 +828,12 @@ SEASTAR_THREAD_TEST_CASE(test_reverse_reader_memory_limit) { } catch (const std::runtime_error& e) { testlog.info("Got exception with message: {}", e.what()); auto str = sstring(e.what()); - BOOST_REQUIRE_EQUAL(str.find("Aborting reverse partition read because partition pk1"), 0); + const auto expected_str = format( + "Memory usage of reversed read exceeds hard limit of {} (configured via max_memory_for_unlimited_query_hard_limit), while reading partition {}", + hard_limit, + pk); + + BOOST_REQUIRE_EQUAL(str.find(expected_str), 0); } catch (...) 
{ throw; } diff --git a/test/boost/querier_cache_test.cc b/test/boost/querier_cache_test.cc index 79e6203156..a393a06f35 100644 --- a/test/boost/querier_cache_test.cc +++ b/test/boost/querier_cache_test.cc @@ -214,7 +214,7 @@ public: auto querier = make_querier(range); auto [dk, ck] = querier.consume_page(dummy_result_builder{}, row_limit, std::numeric_limits::max(), - gc_clock::now(), db::no_timeout, std::numeric_limits::max()).get0(); + gc_clock::now(), db::no_timeout, query::max_result_size(std::numeric_limits::max())).get0(); const auto memory_usage = querier.memory_usage(); _cache.insert(cache_key, std::move(querier), nullptr); diff --git a/test/lib/reader_permit.cc b/test/lib/reader_permit.cc index bf292784e7..cb0e429375 100644 --- a/test/lib/reader_permit.cc +++ b/test/lib/reader_permit.cc @@ -34,7 +34,7 @@ reader_permit make_permit() { } query::query_class_config make_query_class_config() { - return query::query_class_config{the_semaphore, std::numeric_limits::max()}; + return query::query_class_config{the_semaphore, query::max_result_size(std::numeric_limits::max())}; } } // namespace tests From d5cc932a0b40f9dcf4283b97f9f9572dec67bab5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Botond=20D=C3=A9nes?= Date: Tue, 19 May 2020 15:09:48 +0300 Subject: [PATCH 08/26] database: query_mutations(): obtain the memory accounter inside Instead of requesting callers to do it and pass it as a parameter. This is in line with data_query(). 
--- database.cc | 5 ++++- database.hh | 2 +- service/storage_proxy.cc | 4 +--- test/boost/multishard_mutation_query_test.cc | 3 +-- test/boost/querier_cache_test.cc | 6 +++--- 5 files changed, 10 insertions(+), 10 deletions(-) diff --git a/database.cc b/database.cc index 24e937896b..0340798244 100644 --- a/database.cc +++ b/database.cc @@ -1206,7 +1206,9 @@ database::query(schema_ptr s, const query::read_command& cmd, query::result_opti future> database::query_mutations(schema_ptr s, const query::read_command& cmd, const dht::partition_range& range, - query::result_memory_accounter&& accounter, tracing::trace_state_ptr trace_state, db::timeout_clock::time_point timeout) { + size_t result_max_size, tracing::trace_state_ptr trace_state, db::timeout_clock::time_point timeout) { + return get_result_memory_limiter().new_mutation_read(result_max_size).then( + [&, s = std::move(s), trace_state = std::move(trace_state), timeout] (query::result_memory_accounter accounter) { column_family& cf = find_column_family(cmd.cf_id); query::querier_cache_context cache_ctx(_querier_cache, cmd.query_uuid, cmd.is_first_page); return _mutation_query_stage(std::move(s), @@ -1231,6 +1233,7 @@ database::query_mutations(schema_ptr s, const query::read_command& cmd, const dh return make_ready_future>(std::tuple(std::move(result), hit_rate)); } }); + }); } std::unordered_set database::get_initial_tokens() { diff --git a/database.hh b/database.hh index b5866183fb..61e337c6d4 100644 --- a/database.hh +++ b/database.hh @@ -1465,7 +1465,7 @@ public: const dht::partition_range_vector& ranges, tracing::trace_state_ptr trace_state, uint64_t max_result_size, db::timeout_clock::time_point timeout); future> query_mutations(schema_ptr, const query::read_command& cmd, const dht::partition_range& range, - query::result_memory_accounter&& accounter, tracing::trace_state_ptr trace_state, + size_t max_result_size, tracing::trace_state_ptr trace_state, db::timeout_clock::time_point timeout); // Apply the 
mutation atomically. // Throws timed_out_error when timeout is reached. diff --git a/service/storage_proxy.cc b/service/storage_proxy.cc index 47678ab4b1..b4c623d9a9 100644 --- a/service/storage_proxy.cc +++ b/service/storage_proxy.cc @@ -5142,12 +5142,10 @@ storage_proxy::query_mutations_locally(schema_ptr s, lw_shared_ptrvalue().token()); get_stats().replica_cross_shard_ops += shard != this_shard_id(); return _db.invoke_on(shard, _read_smp_service_group, [max_size, cmd, &pr, gs=global_schema_ptr(s), timeout, gt = tracing::global_trace_state_ptr(std::move(trace_state))] (database& db) mutable { - return db.get_result_memory_limiter().new_mutation_read(max_size).then([&] (query::result_memory_accounter ma) { - return db.query_mutations(gs, *cmd, pr, std::move(ma), gt, timeout).then([] (std::tuple result_ht) { + return db.query_mutations(gs, *cmd, pr, max_size, gt, timeout).then([] (std::tuple result_ht) { auto&& [result, ht] = result_ht; return make_ready_future>, cache_temperature>>(rpc::tuple(make_foreign(make_lw_shared(std::move(result))), ht)); }); - }); }); } else { return query_nonsingular_mutations_locally(std::move(s), std::move(cmd), {pr}, std::move(trace_state), max_size, timeout); diff --git a/test/boost/multishard_mutation_query_test.cc b/test/boost/multishard_mutation_query_test.cc index 45d823613a..db345b123b 100644 --- a/test/boost/multishard_mutation_query_test.cc +++ b/test/boost/multishard_mutation_query_test.cc @@ -104,11 +104,10 @@ static std::vector read_all_partitions_one_by_one(distributed::max()).get0(); const auto cmd = query::read_command(s->id(), s->version(), s->full_slice(), query::max_rows); const auto range = dht::partition_range::make_singular(pkey); return make_foreign(std::make_unique( - std::get<0>(db.query_mutations(std::move(s), cmd, range, std::move(accounter), nullptr, db::no_timeout).get0()))); + std::get<0>(db.query_mutations(std::move(s), cmd, range, std::numeric_limits::max(), nullptr, db::no_timeout).get0()))); }); 
}).get0(); diff --git a/test/boost/querier_cache_test.cc b/test/boost/querier_cache_test.cc index a393a06f35..3925ad848e 100644 --- a/test/boost/querier_cache_test.cc +++ b/test/boost/querier_cache_test.cc @@ -672,7 +672,7 @@ SEASTAR_THREAD_TEST_CASE(test_resources_based_cache_eviction) { db.query_mutations(s, cmd1, query::full_partition_range, - db.get_result_memory_limiter().new_mutation_read(1024 * 1024).get0(), + 1024 * 1024, nullptr, db::no_timeout).get(); @@ -706,7 +706,7 @@ SEASTAR_THREAD_TEST_CASE(test_resources_based_cache_eviction) { db.query_mutations(s, cmd2, query::full_partition_range, - db.get_result_memory_limiter().new_mutation_read(1024 * 1024).get0(), + 1024 * 1024, nullptr, db::no_timeout).get(); @@ -723,7 +723,7 @@ SEASTAR_THREAD_TEST_CASE(test_resources_based_cache_eviction) { db.query_mutations(s, cmd2, query::full_partition_range, - db.get_result_memory_limiter().new_mutation_read(1024 * 1024 * 1024 * 1024).get0(), + 1024 * 1024 * 1024 * 1024, nullptr, db::no_timeout).get(); return make_ready_future<>(); From a64d9b8883c0fd43eef4cced55c254ba0760dd00 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Botond=20D=C3=A9nes?= Date: Thu, 28 May 2020 10:17:57 +0300 Subject: [PATCH 09/26] database: add get_statement_scheduling_group() --- database.hh | 1 + 1 file changed, 1 insertion(+) diff --git a/database.hh b/database.hh index 61e337c6d4..090d3be332 100644 --- a/database.hh +++ b/database.hh @@ -1396,6 +1396,7 @@ public: return _commitlog.get(); } + seastar::scheduling_group get_statement_scheduling_group() const { return _dbcfg.statement_scheduling_group; } seastar::scheduling_group get_streaming_scheduling_group() const { return _dbcfg.streaming_scheduling_group; } size_t get_available_memory() const { return _dbcfg.available_memory; } From c364c7c6a24c8d1c585854b9708811883e7bf891 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Botond=20D=C3=A9nes?= Date: Thu, 28 May 2020 10:31:30 +0300 Subject: [PATCH 10/26] result_memory_limiter: add unlimited_result_size 
constant To be used as the max result size for internal queries. --- query-result.hh | 1 + query.cc | 1 + test/boost/querier_cache_test.cc | 2 +- 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/query-result.hh b/query-result.hh index 7b6e1d3d29..9634dca6ee 100644 --- a/query-result.hh +++ b/query-result.hh @@ -51,6 +51,7 @@ class result_memory_limiter { public: static constexpr size_t minimum_result_size = 4 * 1024; static constexpr size_t maximum_result_size = 1 * 1024 * 1024; + static constexpr size_t unlimited_result_size = std::numeric_limits::max(); public: explicit result_memory_limiter(size_t maximum_total_result_memory) : _maximum_total_result_memory(maximum_total_result_memory) diff --git a/query.cc b/query.cc index 2d22194d06..4ba98a0c51 100644 --- a/query.cc +++ b/query.cc @@ -34,6 +34,7 @@ namespace query { constexpr size_t result_memory_limiter::minimum_result_size; constexpr size_t result_memory_limiter::maximum_result_size; +constexpr size_t result_memory_limiter::unlimited_result_size; thread_local semaphore result_memory_tracker::_dummy { 0 }; diff --git a/test/boost/querier_cache_test.cc b/test/boost/querier_cache_test.cc index 3925ad848e..39b5464269 100644 --- a/test/boost/querier_cache_test.cc +++ b/test/boost/querier_cache_test.cc @@ -723,7 +723,7 @@ SEASTAR_THREAD_TEST_CASE(test_resources_based_cache_eviction) { db.query_mutations(s, cmd2, query::full_partition_range, - 1024 * 1024 * 1024 * 1024, + query::result_memory_limiter::unlimited_result_size, nullptr, db::no_timeout).get(); return make_ready_future<>(); From 9eb6d704b235ab57f032f32fc66df800c5ca8fcb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Botond=20D=C3=A9nes?= Date: Thu, 28 May 2020 10:31:11 +0300 Subject: [PATCH 11/26] storage_proxy: add get_max_result_size() Meant to be used by the coordinator node to obtain the max result size applicable to the query-class (determined based on the current scheduling group). 
For normal paged queries the previously used `query::result_memory_limiter::maximum_result_size` is used uniformly. For reverse and unpaged queries, a query class dependent value is used. For user reads, the value of the `max_memory_for_unlimited_query_{soft,hard}_limit` configuration items is used, for other classes no limit is used (`query::result_memory_limiter::unlimited_result_size`). --- service/storage_proxy.cc | 15 +++++++++++++++ service/storage_proxy.hh | 2 ++ 2 files changed, 17 insertions(+) diff --git a/service/storage_proxy.cc b/service/storage_proxy.cc index b4c623d9a9..e1ebab1a04 100644 --- a/service/storage_proxy.cc +++ b/service/storage_proxy.cc @@ -1316,6 +1316,21 @@ endpoints_to_replica_ids(locator::token_metadata& tm, const std::vector() || slice.options.contains()) { + auto& db = _db.local(); + // We only limit user queries. + if (current_scheduling_group() == db.get_statement_scheduling_group()) { + return query::max_result_size(db.get_config().max_memory_for_unlimited_query_soft_limit(), db.get_config().max_memory_for_unlimited_query_hard_limit()); + } else { + return query::max_result_size(query::result_memory_limiter::unlimited_result_size); + } + } else { + return query::max_result_size(query::result_memory_limiter::maximum_result_size); + } +} + bool storage_proxy::need_throttle_writes() const { return get_global_stats().background_write_bytes > _background_write_throttle_threahsold || get_global_stats().queued_write_bytes > 6*1024*1024; } diff --git a/service/storage_proxy.hh b/service/storage_proxy.hh index 73f42cacec..13ed45a92e 100644 --- a/service/storage_proxy.hh +++ b/service/storage_proxy.hh @@ -247,6 +247,8 @@ public: const locator::token_metadata& get_token_metadata() const { return _token_metadata; } locator::token_metadata& get_token_metadata() { return _token_metadata; } + query::max_result_size get_max_result_size(const query::partition_slice& slice) const; + private: distributed& _db; locator::token_metadata& 
_token_metadata; From 989142464ce4466d42e034eacca725e6b353f16f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Botond=20D=C3=A9nes?= Date: Thu, 28 May 2020 10:18:49 +0300 Subject: [PATCH 12/26] result_memory_accounter: check(): use _maximum_result_size instead of hardcoded limit The use of the global `result_memory_limiter::maximum_result_size` is probably a leftover from before the `_maximum_result_size` member was introduced (aa083d3d85). --- query-result.hh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/query-result.hh b/query-result.hh index 9634dca6ee..fdff5d3740 100644 --- a/query-result.hh +++ b/query-result.hh @@ -196,7 +196,7 @@ public: // Checks whether the result can grow any more. stop_iteration check() const { - stop_iteration stop { _total_used_memory > result_memory_limiter::maximum_result_size }; + stop_iteration stop { _total_used_memory > _maximum_result_size }; if (!stop && _used_memory >= _blocked_bytes && _limiter) { return _limiter->check() && _stop_on_global_limit; } From 1615fe4c5ee41c64ca224d08bfd32c05e086dae6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Botond=20D=C3=A9nes?= Date: Wed, 10 Jun 2020 12:32:42 +0300 Subject: [PATCH 13/26] service: query_pager: set the allow_short_read flag All callers should set this already before passing the slice to the pager, however not all actually do (e.g. `cql3::indexed_table_select_statement::read_posting_list()`). Instead of auditing each call site, just make sure this is set in the pager itself. If someone is creating a pager we can be sure they mean to use paging. 
--- service/pager/query_pagers.cc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/service/pager/query_pagers.cc b/service/pager/query_pagers.cc index 30493a59fd..a2b603cb50 100644 --- a/service/pager/query_pagers.cc +++ b/service/pager/query_pagers.cc @@ -84,6 +84,10 @@ static bool has_clustering_keys(const schema& s, const query::read_command& cmd) future query_pager::do_fetch_page(uint32_t page_size, gc_clock::time_point now, db::timeout_clock::time_point timeout) { auto state = _options.get_paging_state(); + // Most callers should set this but we want to make sure, as results + // won't be paged without it. + _cmd->slice.options.set(); + if (!_last_pkey && state) { _max = state->get_remaining(); _last_pkey = state->get_partition_key(); From 2ca118b2d5262360a07869d4f6efb51ecc287b31 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Botond=20D=C3=A9nes?= Date: Tue, 9 Jun 2020 15:18:06 +0300 Subject: [PATCH 14/26] query: read_command: add separate convenience constructor query::read_command currently has a single constructor, which serves both as an idl constructor (order of parameters is fixed) and a convenience one (most parameters have default values). This makes it very error prone to add new parameters, that everyone should fill. The new parameter has to be added as last, with a default value, as the previous ones have a default value as well. This means the compiler's help cannot be enlisted to make sure all usages are updated. This patch adds a separate convenience constructor to be used by normal code. The idl constructor loses all default parameters. New parameters can be added to any position in the convenience constructor (to force users to fill in a meaningful value) while the removed default parameters from the idl constructor mean code cannot accidentally use it without noticing.
--- cql3/statements/select_statement.cc | 17 ++++++++++---- query-request.hh | 36 +++++++++++++++++++++++------ redis/query_utils.cc | 2 +- test/boost/database_test.cc | 4 ++-- test/boost/querier_cache_test.cc | 6 +++-- thrift/handler.cc | 2 +- 6 files changed, 50 insertions(+), 17 deletions(-) diff --git a/cql3/statements/select_statement.cc b/cql3/statements/select_statement.cc index 315656d22d..014ae1df57 100644 --- a/cql3/statements/select_statement.cc +++ b/cql3/statements/select_statement.cc @@ -319,8 +319,17 @@ select_statement::do_execute(service::storage_proxy& proxy, _stats.select_partition_range_scan += _range_scan; _stats.select_partition_range_scan_no_bypass_cache += _range_scan_no_bypass_cache; - auto command = ::make_lw_shared(_schema->id(), _schema->version(), - make_partition_slice(options), limit, now, tracing::make_trace_info(state.get_trace_state()), query::max_partitions, utils::UUID(), query::is_first_page::no, options.get_timestamp(state)); + auto command = ::make_lw_shared( + _schema->id(), + _schema->version(), + make_partition_slice(options), + limit, + query::max_partitions, + now, + tracing::make_trace_info(state.get_trace_state()), + utils::UUID(), + query::is_first_page::no, + options.get_timestamp(state)); int32_t page_size = options.get_page_size(); @@ -477,9 +486,9 @@ indexed_table_select_statement::prepare_command_for_base_query(const query_optio _schema->version(), make_partition_slice(options), get_limit(options), + query::max_partitions, now, tracing::make_trace_info(state.get_trace_state()), - query::max_partitions, utils::UUID(), query::is_first_page::no, options.get_timestamp(state)); @@ -1147,9 +1156,9 @@ indexed_table_select_statement::read_posting_list(service::storage_proxy& proxy, _view_schema->version(), partition_slice, limit, + query::max_partitions, now, tracing::make_trace_info(state.get_trace_state()), - query::max_partitions, utils::UUID(), query::is_first_page::no, options.get_timestamp(state)); diff --git 
a/query-request.hh b/query-request.hh index 56f965c627..609ed93448 100644 --- a/query-request.hh +++ b/query-request.hh @@ -235,16 +235,38 @@ public: query::is_first_page is_first_page; api::timestamp_type read_timestamp; // not serialized public: + // IDL constructor read_command(utils::UUID cf_id, table_schema_version schema_version, partition_slice slice, - uint32_t row_limit = max_rows, - gc_clock::time_point now = gc_clock::now(), - std::optional ti = std::nullopt, - uint32_t partition_limit = max_partitions, - utils::UUID query_uuid = utils::UUID(), - query::is_first_page is_first_page = is_first_page::no, - api::timestamp_type rt = api::new_timestamp()) + uint32_t row_limit, + gc_clock::time_point now, + std::optional ti, + uint32_t partition_limit, + utils::UUID query_uuid, + query::is_first_page is_first_page) + : cf_id(std::move(cf_id)) + , schema_version(std::move(schema_version)) + , slice(std::move(slice)) + , row_limit(row_limit) + , timestamp(now) + , trace_info(std::move(ti)) + , partition_limit(partition_limit) + , query_uuid(query_uuid) + , is_first_page(is_first_page) + , read_timestamp(api::new_timestamp()) + { } + + read_command(utils::UUID cf_id, + table_schema_version schema_version, + partition_slice slice, + uint32_t row_limit = max_rows, + uint32_t partition_limit = max_partitions, + gc_clock::time_point now = gc_clock::now(), + std::optional ti = std::nullopt, + utils::UUID query_uuid = utils::UUID(), + query::is_first_page is_first_page = query::is_first_page::no, + api::timestamp_type rt = api::new_timestamp()) : cf_id(std::move(cf_id)) , schema_version(std::move(schema_version)) , slice(std::move(slice)) diff --git a/redis/query_utils.cc b/redis/query_utils.cc index 37179dfeb5..c6fd7b73d7 100644 --- a/redis/query_utils.cc +++ b/redis/query_utils.cc @@ -75,7 +75,7 @@ public: future> read_strings(service::storage_proxy& proxy, const redis_options& options, const bytes& key, service_permit permit) { auto schema = get_schema(proxy, 
options.get_keyspace_name(), redis::STRINGs); auto ps = partition_slice_builder(*schema).build(); - query::read_command cmd(schema->id(), schema->version(), ps, 1, gc_clock::now(), std::nullopt, 1); + query::read_command cmd(schema->id(), schema->version(), ps, 1, gc_clock::now(), std::nullopt, 1, utils::UUID(), query::is_first_page::no); auto pkey = partition_key::from_single_value(*schema, key); auto partition_range = dht::partition_range::make_singular(dht::decorate_key(*schema, std::move(pkey))); dht::partition_range_vector partition_ranges; diff --git a/test/boost/database_test.cc b/test/boost/database_test.cc index 155d51ec0c..8eef7bfbe1 100644 --- a/test/boost/database_test.cc +++ b/test/boost/database_test.cc @@ -112,14 +112,14 @@ SEASTAR_TEST_CASE(test_querying_with_limits) { { auto cmd = query::read_command(s->id(), s->version(), partition_slice_builder(*s).build(), - query::max_rows, gc_clock::now(), std::nullopt, 5); + query::max_rows, 5); auto result = std::get<0>(db.query(s, cmd, query::result_options::only_result(), pranges, nullptr, max_size, db::no_timeout).get0()); assert_that(query::result_set::from_raw_result(s, cmd.slice, *result)).has_size(5); } { auto cmd = query::read_command(s->id(), s->version(), partition_slice_builder(*s).build(), - query::max_rows, gc_clock::now(), std::nullopt, 3); + query::max_rows, 3); auto result = std::get<0>(db.query(s, cmd, query::result_options::only_result(), pranges, nullptr, max_size, db::no_timeout).get0()); assert_that(query::result_set::from_raw_result(s, cmd.slice, *result)).has_size(3); } diff --git a/test/boost/querier_cache_test.cc b/test/boost/querier_cache_test.cc index 39b5464269..cc5ef71e14 100644 --- a/test/boost/querier_cache_test.cc +++ b/test/boost/querier_cache_test.cc @@ -666,7 +666,8 @@ SEASTAR_THREAD_TEST_CASE(test_resources_based_cache_eviction) { gc_clock::now(), std::nullopt, 1, - utils::make_random_uuid()); + utils::make_random_uuid(), + query::is_first_page::yes); // Should save the 
querier in cache. db.query_mutations(s, @@ -700,7 +701,8 @@ SEASTAR_THREAD_TEST_CASE(test_resources_based_cache_eviction) { gc_clock::now(), std::nullopt, 1, - utils::make_random_uuid()); + utils::make_random_uuid(), + query::is_first_page::no); // Should evict the already cached querier. db.query_mutations(s, diff --git a/thrift/handler.cc b/thrift/handler.cc index 5686a7c47c..8d28bc08d7 100644 --- a/thrift/handler.cc +++ b/thrift/handler.cc @@ -394,7 +394,7 @@ public: clustering_ranges.emplace_back(query::clustering_range::make_open_ended_both_sides()); auto slice = query::partition_slice(std::move(clustering_ranges), { }, std::move(regular_columns), opts, std::move(specific_ranges), cql_serialization_format::internal()); - return make_lw_shared(s.id(), s.version(), std::move(slice), row_limit, gc_clock::now(), std::nullopt, partition_limit); + return make_lw_shared(s.id(), s.version(), std::move(slice), row_limit, partition_limit); } static future<> do_get_paged_slice( From 8992bcd1f83f0b14b46b9fca3140271e3467ce22 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Botond=20D=C3=A9nes?= Date: Tue, 9 Jun 2020 16:15:13 +0300 Subject: [PATCH 15/26] query: read_command: use tagged ints for limit ctor params The convenience constructor of read_command now has two integer parameters next to each other. In the next patch we intend to add another one. This is a recipe for disaster, so to avoid mistakes this patch converts these parameters to tagged integers. This makes sure callers pass what they meant to pass. As a matter of fact, while fixing up call-sites, I already found several ones passing `query::max_partitions` to the `row_limit` parameter. No harm done yet, as `query::max_partitions` == `query::max_rows` but this shows just how easy it is to mix up parameters with the same type.
--- alternator/executor.cc | 8 ++++---- alternator/streams.cc | 2 +- cdc/log.cc | 2 +- cql3/statements/select_statement.cc | 12 ++++++------ db/schema_tables.cc | 2 +- db/system_keyspace.cc | 8 +++----- query-request.hh | 12 ++++++++---- service/storage_proxy.cc | 2 +- test/boost/database_test.cc | 8 ++++---- test/boost/multishard_mutation_query_test.cc | 2 +- thrift/handler.cc | 4 ++-- 11 files changed, 32 insertions(+), 30 deletions(-) diff --git a/alternator/executor.cc b/alternator/executor.cc index a9fb41f916..8b8b5dc1a0 100644 --- a/alternator/executor.cc +++ b/alternator/executor.cc @@ -1240,7 +1240,7 @@ static lw_shared_ptr previous_item_read_command(schema_ptr auto regular_columns = boost::copy_range( schema->regular_columns() | boost::adaptors::transformed([] (const column_definition& cdef) { return cdef.id; })); auto partition_slice = query::partition_slice(std::move(bounds), {}, std::move(regular_columns), selection->get_query_options()); - return ::make_lw_shared(schema->id(), schema->version(), partition_slice, query::max_partitions); + return ::make_lw_shared(schema->id(), schema->version(), partition_slice); } static dht::partition_range_vector to_partition_ranges(const schema& schema, const partition_key& pk) { @@ -2405,7 +2405,7 @@ future executor::get_item(client_state& client_st auto selection = cql3::selection::selection::wildcard(schema); auto partition_slice = query::partition_slice(std::move(bounds), {}, std::move(regular_columns), selection->get_query_options()); - auto command = ::make_lw_shared(schema->id(), schema->version(), partition_slice, query::max_partitions); + auto command = ::make_lw_shared(schema->id(), schema->version(), partition_slice); std::unordered_set used_attribute_names; auto attrs_to_get = calculate_attrs_to_get(request, used_attribute_names); @@ -2475,7 +2475,7 @@ future executor::batch_get_item(client_state& cli rs.schema->regular_columns() | boost::adaptors::transformed([] (const column_definition& cdef) { return 
cdef.id; })); auto selection = cql3::selection::selection::wildcard(rs.schema); auto partition_slice = query::partition_slice(std::move(bounds), {}, std::move(regular_columns), selection->get_query_options()); - auto command = ::make_lw_shared(rs.schema->id(), rs.schema->version(), partition_slice, query::max_partitions); + auto command = ::make_lw_shared(rs.schema->id(), rs.schema->version(), partition_slice); future>> f = _proxy.query(rs.schema, std::move(command), std::move(partition_ranges), rs.cl, service::storage_proxy::coordinator_query_options(executor::default_timeout(), permit, client_state, trace_state)).then( [schema = rs.schema, partition_slice = std::move(partition_slice), selection = std::move(selection), attrs_to_get = rs.attrs_to_get] (service::storage_proxy::coordinator_query_result qr) mutable { @@ -2762,7 +2762,7 @@ static future do_query(schema_ptr schema, query::partition_slice::option_set opts = selection->get_query_options(); opts.add(custom_opts); auto partition_slice = query::partition_slice(std::move(ck_bounds), std::move(static_columns), std::move(regular_columns), opts); - auto command = ::make_lw_shared(schema->id(), schema->version(), partition_slice, query::max_partitions); + auto command = ::make_lw_shared(schema->id(), schema->version(), partition_slice); auto query_state_ptr = std::make_unique(client_state, trace_state, std::move(permit)); diff --git a/alternator/streams.cc b/alternator/streams.cc index 8fa3e5a8b5..d00d3d93b9 100644 --- a/alternator/streams.cc +++ b/alternator/streams.cc @@ -717,7 +717,7 @@ future executor::get_records(client_state& client auto partition_slice = query::partition_slice( std::move(bounds) , {}, std::move(regular_columns), selection->get_query_options()); - auto command = ::make_lw_shared(schema->id(), schema->version(), partition_slice, limit * 4); + auto command = ::make_lw_shared(schema->id(), schema->version(), partition_slice, query::row_limit(limit * 4)); return _proxy.query(schema, 
std::move(command), std::move(partition_ranges), cl, service::storage_proxy::coordinator_query_options(default_timeout(), std::move(permit), client_state)).then( [this, schema, partition_slice = std::move(partition_slice), selection = std::move(selection), start_time = std::move(start_time), limit, key_names = std::move(key_names), attr_names = std::move(attr_names), type, iter, high_ts] (service::storage_proxy::coordinator_query_result qr) mutable { diff --git a/cdc/log.cc b/cdc/log.cc index 52040c6bd9..806dfadaf6 100644 --- a/cdc/log.cc +++ b/cdc/log.cc @@ -1356,7 +1356,7 @@ public: opts.set_if(!p.static_row().empty()); auto partition_slice = query::partition_slice(std::move(bounds), std::move(static_columns), std::move(regular_columns), std::move(opts)); - auto command = ::make_lw_shared(_schema->id(), _schema->version(), partition_slice, row_limit); + auto command = ::make_lw_shared(_schema->id(), _schema->version(), partition_slice, query::row_limit(row_limit)); const auto select_cl = adjust_cl(write_cl); diff --git a/cql3/statements/select_statement.cc b/cql3/statements/select_statement.cc index 014ae1df57..8af941c7c0 100644 --- a/cql3/statements/select_statement.cc +++ b/cql3/statements/select_statement.cc @@ -323,8 +323,8 @@ select_statement::do_execute(service::storage_proxy& proxy, _schema->id(), _schema->version(), make_partition_slice(options), - limit, - query::max_partitions, + query::row_limit(limit), + query::partition_limit(query::max_partitions), now, tracing::make_trace_info(state.get_trace_state()), utils::UUID(), @@ -485,8 +485,8 @@ indexed_table_select_statement::prepare_command_for_base_query(const query_optio _schema->id(), _schema->version(), make_partition_slice(options), - get_limit(options), - query::max_partitions, + query::row_limit(get_limit(options)), + query::partition_limit(query::max_partitions), now, tracing::make_trace_info(state.get_trace_state()), utils::UUID(), @@ -1155,8 +1155,8 @@ 
indexed_table_select_statement::read_posting_list(service::storage_proxy& proxy, _view_schema->id(), _view_schema->version(), partition_slice, - limit, - query::max_partitions, + query::row_limit(limit), + query::partition_limit(query::max_partitions), now, tracing::make_trace_info(state.get_trace_state()), utils::UUID(), diff --git a/db/schema_tables.cc b/db/schema_tables.cc index 7f65f9c7e7..68157acb46 100644 --- a/db/schema_tables.cc +++ b/db/schema_tables.cc @@ -778,7 +778,7 @@ read_schema_partition_for_table(distributed& proxy, sche auto slice = partition_slice_builder(*schema) .with_range(std::move(clustering_range)) .build(); - auto cmd = make_lw_shared(schema->id(), schema->version(), std::move(slice), query::max_rows); + auto cmd = make_lw_shared(schema->id(), schema->version(), std::move(slice), query::row_limit(query::max_rows)); return query_partition_mutation(proxy.local(), std::move(schema), std::move(cmd), std::move(keyspace_key)).then([&proxy] (mutation mut) { return redact_columns_for_missing_features(std::move(mut), proxy.local().get_db().local().features().cluster_schema_features()); }); diff --git a/db/system_keyspace.cc b/db/system_keyspace.cc index 0df4181fd6..41e443f8f2 100644 --- a/db/system_keyspace.cc +++ b/db/system_keyspace.cc @@ -1984,8 +1984,7 @@ query_mutations(distributed& proxy, const sstring& ks_na database& db = proxy.local().get_db().local(); schema_ptr schema = db.find_schema(ks_name, cf_name); auto slice = partition_slice_builder(*schema).build(); - auto cmd = make_lw_shared(schema->id(), schema->version(), - std::move(slice), std::numeric_limits::max()); + auto cmd = make_lw_shared(schema->id(), schema->version(), std::move(slice)); return proxy.local().query_mutations_locally(std::move(schema), std::move(cmd), query::full_partition_range, db::no_timeout) .then([] (rpc::tuple>, cache_temperature> rr_ht) { return std::get<0>(std::move(rr_ht)); }); } @@ -1995,8 +1994,7 @@ query(distributed& proxy, const sstring& ks_name, const 
database& db = proxy.local().get_db().local(); schema_ptr schema = db.find_schema(ks_name, cf_name); auto slice = partition_slice_builder(*schema).build(); - auto cmd = make_lw_shared(schema->id(), schema->version(), - std::move(slice), std::numeric_limits::max()); + auto cmd = make_lw_shared(schema->id(), schema->version(), std::move(slice)); return proxy.local().query(schema, cmd, {query::full_partition_range}, db::consistency_level::ONE, {db::no_timeout, empty_service_permit(), service::client_state::for_internal_calls(), nullptr}).then([schema, cmd] (auto&& qr) { return make_lw_shared(query::result_set::from_raw_result(schema, cmd->slice, *qr.query_result)); @@ -2011,7 +2009,7 @@ query(distributed& proxy, const sstring& ks_name, const auto slice = partition_slice_builder(*schema) .with_range(std::move(row_range)) .build(); - auto cmd = make_lw_shared(schema->id(), schema->version(), std::move(slice), query::max_rows); + auto cmd = make_lw_shared(schema->id(), schema->version(), std::move(slice)); return proxy.local().query(schema, cmd, {dht::partition_range::make_singular(key)}, db::consistency_level::ONE, {db::no_timeout, empty_service_permit(), service::client_state::for_internal_calls(), nullptr}).then([schema, cmd] (auto&& qr) { diff --git a/query-request.hh b/query-request.hh index 609ed93448..8cd7505180 100644 --- a/query-request.hh +++ b/query-request.hh @@ -206,6 +206,10 @@ public: constexpr auto max_partitions = std::numeric_limits::max(); +// Tagged integers to disambiguate constructor arguments. +enum class row_limit : uint32_t { max = max_rows }; +enum class partition_limit : uint32_t { max = max_partitions }; + using is_first_page = bool_class; // Full specification of a query to the database. 
@@ -260,8 +264,8 @@ public: read_command(utils::UUID cf_id, table_schema_version schema_version, partition_slice slice, - uint32_t row_limit = max_rows, - uint32_t partition_limit = max_partitions, + query::row_limit row_limit = query::row_limit::max, + query::partition_limit partition_limit = query::partition_limit::max, gc_clock::time_point now = gc_clock::now(), std::optional ti = std::nullopt, utils::UUID query_uuid = utils::UUID(), @@ -270,10 +274,10 @@ public: : cf_id(std::move(cf_id)) , schema_version(std::move(schema_version)) , slice(std::move(slice)) - , row_limit(row_limit) + , row_limit(static_cast(row_limit)) , timestamp(now) , trace_info(std::move(ti)) - , partition_limit(partition_limit) + , partition_limit(static_cast(partition_limit)) , query_uuid(query_uuid) , is_first_page(is_first_page) , read_timestamp(rt) diff --git a/service/storage_proxy.cc b/service/storage_proxy.cc index e1ebab1a04..eb4af0bb14 100644 --- a/service/storage_proxy.cc +++ b/service/storage_proxy.cc @@ -4345,7 +4345,7 @@ static lw_shared_ptr read_nothing_read_command(schema_ptr s // Note that because this read-nothing command has an empty slice, // storage_proxy::query() returns immediately - without any networking. 
auto partition_slice = query::partition_slice({}, {}, {}, query::partition_slice::option_set()); - return ::make_lw_shared(schema->id(), schema->version(), partition_slice, query::max_partitions); + return ::make_lw_shared(schema->id(), schema->version(), partition_slice); } static read_timeout_exception write_timeout_to_read(schema_ptr s, mutation_write_timeout_exception& ex) { diff --git a/test/boost/database_test.cc b/test/boost/database_test.cc index 8eef7bfbe1..faaae1b1ef 100644 --- a/test/boost/database_test.cc +++ b/test/boost/database_test.cc @@ -62,7 +62,7 @@ SEASTAR_TEST_CASE(test_safety_after_truncate) { auto assert_query_result = [&] (size_t expected_size) { auto max_size = std::numeric_limits::max(); - auto cmd = query::read_command(s->id(), s->version(), partition_slice_builder(*s).build(), 1000); + auto cmd = query::read_command(s->id(), s->version(), partition_slice_builder(*s).build(), query::row_limit(1000)); auto&& [result, cache_tempature] = db.query(s, cmd, query::result_options::only_result(), pranges, nullptr, max_size, db::no_timeout).get0(); assert_that(query::result_set::from_raw_result(s, cmd.slice, *result)).has_size(expected_size); }; @@ -105,21 +105,21 @@ SEASTAR_TEST_CASE(test_querying_with_limits) { auto max_size = std::numeric_limits::max(); { - auto cmd = query::read_command(s->id(), s->version(), partition_slice_builder(*s).build(), 3); + auto cmd = query::read_command(s->id(), s->version(), partition_slice_builder(*s).build(), query::row_limit(3)); auto result = std::get<0>(db.query(s, cmd, query::result_options::only_result(), pranges, nullptr, max_size, db::no_timeout).get0()); assert_that(query::result_set::from_raw_result(s, cmd.slice, *result)).has_size(3); } { auto cmd = query::read_command(s->id(), s->version(), partition_slice_builder(*s).build(), - query::max_rows, 5); + query::row_limit(query::max_rows), query::partition_limit(5)); auto result = std::get<0>(db.query(s, cmd, query::result_options::only_result(), pranges, 
nullptr, max_size, db::no_timeout).get0()); assert_that(query::result_set::from_raw_result(s, cmd.slice, *result)).has_size(5); } { auto cmd = query::read_command(s->id(), s->version(), partition_slice_builder(*s).build(), - query::max_rows, 3); + query::row_limit(query::max_rows), query::partition_limit(3)); auto result = std::get<0>(db.query(s, cmd, query::result_options::only_result(), pranges, nullptr, max_size, db::no_timeout).get0()); assert_that(query::result_set::from_raw_result(s, cmd.slice, *result)).has_size(3); } diff --git a/test/boost/multishard_mutation_query_test.cc b/test/boost/multishard_mutation_query_test.cc index db345b123b..768fb85ac9 100644 --- a/test/boost/multishard_mutation_query_test.cc +++ b/test/boost/multishard_mutation_query_test.cc @@ -104,7 +104,7 @@ static std::vector read_all_partitions_one_by_one(distributedid(), s->version(), s->full_slice(), query::max_rows); + const auto cmd = query::read_command(s->id(), s->version(), s->full_slice()); const auto range = dht::partition_range::make_singular(pkey); return make_foreign(std::make_unique( std::get<0>(db.query_mutations(std::move(s), cmd, range, std::numeric_limits::max(), nullptr, db::no_timeout).get0()))); diff --git a/thrift/handler.cc b/thrift/handler.cc index 8d28bc08d7..f0625010a2 100644 --- a/thrift/handler.cc +++ b/thrift/handler.cc @@ -394,7 +394,7 @@ public: clustering_ranges.emplace_back(query::clustering_range::make_open_ended_both_sides()); auto slice = query::partition_slice(std::move(clustering_ranges), { }, std::move(regular_columns), opts, std::move(specific_ranges), cql_serialization_format::internal()); - return make_lw_shared(s.id(), s.version(), std::move(slice), row_limit, partition_limit); + return make_lw_shared(s.id(), s.version(), std::move(slice), query::row_limit(row_limit), query::partition_limit(partition_limit)); } static future<> do_get_paged_slice( @@ -664,7 +664,7 @@ public: } } auto slice = query::partition_slice(std::move(clustering_ranges), {}, 
std::move(regular_columns), opts, nullptr); - auto cmd = make_lw_shared(schema->id(), schema->version(), std::move(slice), row_limit); + auto cmd = make_lw_shared(schema->id(), schema->version(), std::move(slice), query::row_limit(row_limit)); auto f = _query_state.get_client_state().has_schema_access(*schema, auth::permission::SELECT); return f.then([this, dk = std::move(dk), cmd, schema, column_limit = request.count, cl = request.consistency_level] { auto timeout = db::timeout_clock::now() + _timeout_config.read_timeout; From 92a7b16cba437501a02394e19c58f34f50ffa0be Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Botond=20D=C3=A9nes?= Date: Tue, 9 Jun 2020 10:34:59 +0300 Subject: [PATCH 16/26] query: read_command: add max_result_size This field will replace max size which is currently passed once per established rpc connection via the CLIENT_ID verb and stored as an auxiliary value on the client_info. For now it is unused, but we update all sites creating a read command to pass the correct value to it. In the next patch we will phase out the old max size and use this field to pass max size on each verb instead. 
--- alternator/executor.cc | 22 +++++----- alternator/streams.cc | 3 +- cdc/log.cc | 3 +- cql3/statements/batch_statement.cc | 2 +- cql3/statements/cas_request.cc | 4 +- cql3/statements/cas_request.hh | 2 +- cql3/statements/modification_statement.cc | 9 ++-- cql3/statements/modification_statement.hh | 2 +- cql3/statements/select_statement.cc | 30 ++++++++------ cql3/statements/select_statement.hh | 3 +- db/schema_tables.cc | 6 ++- db/system_keyspace.cc | 6 +-- idl/read_command.idl.hh | 6 +++ query-request.hh | 11 ++++- redis/query_utils.cc | 3 +- service/pager/query_pagers.cc | 7 +++- service/storage_proxy.cc | 3 +- test/boost/database_test.cc | 8 ++-- test/boost/multishard_mutation_query_test.cc | 8 ++-- test/boost/querier_cache_test.cc | 6 ++- thrift/handler.cc | 43 ++++++++++++-------- 21 files changed, 117 insertions(+), 70 deletions(-) diff --git a/alternator/executor.cc b/alternator/executor.cc index 8b8b5dc1a0..b65d0be76b 100644 --- a/alternator/executor.cc +++ b/alternator/executor.cc @@ -1225,7 +1225,8 @@ static future> get_previous_item( service_permit permit, alternator::stats& stats); -static lw_shared_ptr previous_item_read_command(schema_ptr schema, +static lw_shared_ptr previous_item_read_command(service::storage_proxy& proxy, + schema_ptr schema, const clustering_key& ck, shared_ptr selection) { std::vector bounds; @@ -1240,7 +1241,7 @@ static lw_shared_ptr previous_item_read_command(schema_ptr auto regular_columns = boost::copy_range( schema->regular_columns() | boost::adaptors::transformed([] (const column_definition& cdef) { return cdef.id; })); auto partition_slice = query::partition_slice(std::move(bounds), {}, std::move(regular_columns), selection->get_query_options()); - return ::make_lw_shared(schema->id(), schema->version(), partition_slice); + return ::make_lw_shared(schema->id(), schema->version(), partition_slice, proxy.get_max_result_size(partition_slice)); } static dht::partition_range_vector to_partition_ranges(const schema& schema, 
const partition_key& pk) { @@ -1354,7 +1355,7 @@ static future> get_previous_item( { stats.reads_before_write++; auto selection = cql3::selection::selection::wildcard(schema); - auto command = previous_item_read_command(schema, ck, selection); + auto command = previous_item_read_command(proxy, schema, ck, selection); auto cl = db::consistency_level::LOCAL_QUORUM; return proxy.query(schema, command, to_partition_ranges(*schema, pk), cl, service::storage_proxy::coordinator_query_options(executor::default_timeout(), std::move(permit), client_state)).then( @@ -1405,7 +1406,7 @@ future rmw_operation::execute(service::storage_pr auto timeout = executor::default_timeout(); auto selection = cql3::selection::selection::wildcard(schema()); auto read_command = needs_read_before_write ? - previous_item_read_command(schema(), _ck, selection) : + previous_item_read_command(proxy, schema(), _ck, selection) : nullptr; return proxy.cas(schema(), shared_from_this(), read_command, to_partition_ranges(*schema(), _pk), {timeout, std::move(permit), client_state, trace_state}, @@ -2405,7 +2406,7 @@ future executor::get_item(client_state& client_st auto selection = cql3::selection::selection::wildcard(schema); auto partition_slice = query::partition_slice(std::move(bounds), {}, std::move(regular_columns), selection->get_query_options()); - auto command = ::make_lw_shared(schema->id(), schema->version(), partition_slice); + auto command = ::make_lw_shared(schema->id(), schema->version(), partition_slice, _proxy.get_max_result_size(partition_slice)); std::unordered_set used_attribute_names; auto attrs_to_get = calculate_attrs_to_get(request, used_attribute_names); @@ -2475,7 +2476,7 @@ future executor::batch_get_item(client_state& cli rs.schema->regular_columns() | boost::adaptors::transformed([] (const column_definition& cdef) { return cdef.id; })); auto selection = cql3::selection::selection::wildcard(rs.schema); auto partition_slice = query::partition_slice(std::move(bounds), {}, 
std::move(regular_columns), selection->get_query_options()); - auto command = ::make_lw_shared(rs.schema->id(), rs.schema->version(), partition_slice); + auto command = ::make_lw_shared(rs.schema->id(), rs.schema->version(), partition_slice, _proxy.get_max_result_size(partition_slice)); future>> f = _proxy.query(rs.schema, std::move(command), std::move(partition_ranges), rs.cl, service::storage_proxy::coordinator_query_options(executor::default_timeout(), permit, client_state, trace_state)).then( [schema = rs.schema, partition_slice = std::move(partition_slice), selection = std::move(selection), attrs_to_get = rs.attrs_to_get] (service::storage_proxy::coordinator_query_result qr) mutable { @@ -2728,7 +2729,8 @@ static rjson::value encode_paging_state(const schema& schema, const service::pag return last_evaluated_key; } -static future do_query(schema_ptr schema, +static future do_query(service::storage_proxy& proxy, + schema_ptr schema, const rjson::value* exclusive_start_key, dht::partition_range_vector&& partition_ranges, std::vector&& ck_bounds, @@ -2762,7 +2764,7 @@ static future do_query(schema_ptr schema, query::partition_slice::option_set opts = selection->get_query_options(); opts.add(custom_opts); auto partition_slice = query::partition_slice(std::move(ck_bounds), std::move(static_columns), std::move(regular_columns), opts); - auto command = ::make_lw_shared(schema->id(), schema->version(), partition_slice); + auto command = ::make_lw_shared(schema->id(), schema->version(), partition_slice, proxy.get_max_result_size(partition_slice)); auto query_state_ptr = std::make_unique(client_state, trace_state, std::move(permit)); @@ -2883,7 +2885,7 @@ future executor::scan(client_state& client_state, verify_all_are_used(request, "ExpressionAttributeNames", used_attribute_names, "Scan"); verify_all_are_used(request, "ExpressionAttributeValues", used_attribute_values, "Scan"); - return do_query(schema, exclusive_start_key, std::move(partition_ranges), 
std::move(ck_bounds), std::move(attrs_to_get), limit, cl, + return do_query(_proxy, schema, exclusive_start_key, std::move(partition_ranges), std::move(ck_bounds), std::move(attrs_to_get), limit, cl, std::move(filter), query::partition_slice::option_set(), client_state, _stats.cql_stats, trace_state, std::move(permit)); } @@ -3357,7 +3359,7 @@ future executor::query(client_state& client_state verify_all_are_used(request, "ExpressionAttributeNames", used_attribute_names, "Query"); query::partition_slice::option_set opts; opts.set_if(!forward); - return do_query(schema, exclusive_start_key, std::move(partition_ranges), std::move(ck_bounds), std::move(attrs_to_get), limit, cl, + return do_query(_proxy, schema, exclusive_start_key, std::move(partition_ranges), std::move(ck_bounds), std::move(attrs_to_get), limit, cl, std::move(filter), opts, client_state, _stats.cql_stats, std::move(trace_state), std::move(permit)); } diff --git a/alternator/streams.cc b/alternator/streams.cc index d00d3d93b9..25c854421a 100644 --- a/alternator/streams.cc +++ b/alternator/streams.cc @@ -717,7 +717,8 @@ future executor::get_records(client_state& client auto partition_slice = query::partition_slice( std::move(bounds) , {}, std::move(regular_columns), selection->get_query_options()); - auto command = ::make_lw_shared(schema->id(), schema->version(), partition_slice, query::row_limit(limit * 4)); + auto command = ::make_lw_shared(schema->id(), schema->version(), partition_slice, _proxy.get_max_result_size(partition_slice), + query::row_limit(limit * 4)); return _proxy.query(schema, std::move(command), std::move(partition_ranges), cl, service::storage_proxy::coordinator_query_options(default_timeout(), std::move(permit), client_state)).then( [this, schema, partition_slice = std::move(partition_slice), selection = std::move(selection), start_time = std::move(start_time), limit, key_names = std::move(key_names), attr_names = std::move(attr_names), type, iter, high_ts] 
(service::storage_proxy::coordinator_query_result qr) mutable { diff --git a/cdc/log.cc b/cdc/log.cc index 806dfadaf6..2ae0530b52 100644 --- a/cdc/log.cc +++ b/cdc/log.cc @@ -1356,7 +1356,8 @@ public: opts.set_if(!p.static_row().empty()); auto partition_slice = query::partition_slice(std::move(bounds), std::move(static_columns), std::move(regular_columns), std::move(opts)); - auto command = ::make_lw_shared(_schema->id(), _schema->version(), partition_slice, query::row_limit(row_limit)); + const auto max_result_size = _ctx._proxy.get_max_result_size(partition_slice); + auto command = ::make_lw_shared(_schema->id(), _schema->version(), partition_slice, query::max_result_size(max_result_size), query::row_limit(row_limit)); const auto select_cl = adjust_cl(write_cl); diff --git a/cql3/statements/batch_statement.cc b/cql3/statements/batch_statement.cc index 63f7998a86..bbf6212ad1 100644 --- a/cql3/statements/batch_statement.cc +++ b/cql3/statements/batch_statement.cc @@ -385,7 +385,7 @@ future> batch_statement::exe make_shared(shard)); } - return proxy.cas(schema, request, request->read_command(), request->key(), + return proxy.cas(schema, request, request->read_command(proxy), request->key(), {read_timeout, qs.get_permit(), qs.get_client_state(), qs.get_trace_state()}, cl_for_paxos, cl_for_learn, batch_timeout, cas_timeout).then([this, request] (bool is_applied) { return modification_statement::build_cas_result_set(_metadata, _columns_of_cas_result_set, is_applied, request->rows()); diff --git a/cql3/statements/cas_request.cc b/cql3/statements/cas_request.cc index f597bad302..058b312ad4 100644 --- a/cql3/statements/cas_request.cc +++ b/cql3/statements/cas_request.cc @@ -81,7 +81,7 @@ std::optional cas_request::apply_updates(api::timestamp_type ts) const return mutation_set; } -lw_shared_ptr cas_request::read_command() const { +lw_shared_ptr cas_request::read_command(service::storage_proxy& proxy) const { column_set columns_to_read(_schema->all_columns_count()); 
std::vector ranges; @@ -116,7 +116,7 @@ lw_shared_ptr cas_request::read_command() const { options.set(query::partition_slice::option::always_return_static_content); query::partition_slice ps(std::move(ranges), *_schema, columns_to_read, options); ps.set_partition_row_limit(max_rows); - return make_lw_shared(_schema->id(), _schema->version(), std::move(ps)); + return make_lw_shared(_schema->id(), _schema->version(), std::move(ps), proxy.get_max_result_size(ps)); } bool cas_request::applies_to() const { diff --git a/cql3/statements/cas_request.hh b/cql3/statements/cas_request.hh index 48246f83e4..5f254b6d20 100644 --- a/cql3/statements/cas_request.hh +++ b/cql3/statements/cas_request.hh @@ -90,7 +90,7 @@ public: return _rows; } - lw_shared_ptr read_command() const; + lw_shared_ptr read_command(service::storage_proxy& proxy) const; void add_row_update(const modification_statement& stmt_arg, std::vector ranges_arg, modification_statement::json_cache_opt json_cache_arg, const query_options& options_arg); diff --git a/cql3/statements/modification_statement.cc b/cql3/statements/modification_statement.cc index c0a062cc5d..bdcc13f0fb 100644 --- a/cql3/statements/modification_statement.cc +++ b/cql3/statements/modification_statement.cc @@ -168,7 +168,7 @@ modification_statement::get_mutations(service::storage_proxy& proxy, const query } if (requires_read()) { - lw_shared_ptr cmd = read_command(ranges, cl); + lw_shared_ptr cmd = read_command(proxy, ranges, cl); // FIXME: ignoring "local" f = proxy.query(s, cmd, dht::partition_range_vector(keys), cl, {timeout, qs.get_permit(), qs.get_client_state(), qs.get_trace_state()}).then( @@ -246,14 +246,15 @@ std::vector modification_statement::apply_updates( } lw_shared_ptr -modification_statement::read_command(query::clustering_row_ranges ranges, db::consistency_level cl) const { +modification_statement::read_command(service::storage_proxy& proxy, query::clustering_row_ranges ranges, db::consistency_level cl) const { try { 
validate_for_read(cl); } catch (exceptions::invalid_request_exception& e) { throw exceptions::invalid_request_exception(format("Write operation require a read but consistency {} is not supported on reads", cl)); } query::partition_slice ps(std::move(ranges), *s, columns_to_read(), update_parameters::options); - return make_lw_shared(s->id(), s->version(), std::move(ps)); + const auto max_result_size = proxy.get_max_result_size(ps); + return make_lw_shared(s->id(), s->version(), std::move(ps), query::max_result_size(max_result_size)); } std::vector @@ -351,7 +352,7 @@ modification_statement::execute_with_condition(service::storage_proxy& proxy, se make_shared(shard)); } - return proxy.cas(s, request, request->read_command(), request->key(), + return proxy.cas(s, request, request->read_command(proxy), request->key(), {read_timeout, qs.get_permit(), qs.get_client_state(), qs.get_trace_state()}, cl_for_paxos, cl_for_learn, statement_timeout, cas_timeout).then([this, request] (bool is_applied) { return build_cas_result_set(_metadata, _columns_of_cas_result_set, is_applied, request->rows()); diff --git a/cql3/statements/modification_statement.hh b/cql3/statements/modification_statement.hh index de5dfeaa6e..435e9ce863 100644 --- a/cql3/statements/modification_statement.hh +++ b/cql3/statements/modification_statement.hh @@ -242,7 +242,7 @@ public: // Build a read_command instance to fetch the previous mutation from storage. The mutation is // fetched if we need to check LWT conditions or apply updates to non-frozen list elements. - lw_shared_ptr read_command(query::clustering_row_ranges ranges, db::consistency_level cl) const; + lw_shared_ptr read_command(service::storage_proxy& proxy, query::clustering_row_ranges ranges, db::consistency_level cl) const; // Create a mutation object for the update operation represented by this modification statement. 
// A single mutation object for lightweight transactions, which can only span one partition, or a vector // of mutations, one per partition key, for statements which affect multiple partition keys, diff --git a/cql3/statements/select_statement.cc b/cql3/statements/select_statement.cc index 8af941c7c0..64185cc6be 100644 --- a/cql3/statements/select_statement.cc +++ b/cql3/statements/select_statement.cc @@ -319,10 +319,12 @@ select_statement::do_execute(service::storage_proxy& proxy, _stats.select_partition_range_scan += _range_scan; _stats.select_partition_range_scan_no_bypass_cache += _range_scan_no_bypass_cache; + auto slice = make_partition_slice(options); auto command = ::make_lw_shared( _schema->id(), _schema->version(), - make_partition_slice(options), + std::move(slice), + proxy.get_max_result_size(slice), query::row_limit(limit), query::partition_limit(query::max_partitions), now, @@ -480,11 +482,21 @@ generate_base_key_from_index_pk(const partition_key& index_pk, const std::option } lw_shared_ptr -indexed_table_select_statement::prepare_command_for_base_query(const query_options& options, service::query_state& state, gc_clock::time_point now, bool use_paging) const { +indexed_table_select_statement::prepare_command_for_base_query(service::storage_proxy& proxy, const query_options& options, + service::query_state& state, gc_clock::time_point now, bool use_paging) const { + auto slice = make_partition_slice(options); + if (use_paging) { + slice.options.set(); + slice.options.set(); + if (_schema->clustering_key_size() > 0) { + slice.options.set(); + } + } lw_shared_ptr cmd = ::make_lw_shared( _schema->id(), _schema->version(), - make_partition_slice(options), + std::move(slice), + proxy.get_max_result_size(slice), query::row_limit(get_limit(options)), query::partition_limit(query::max_partitions), now, @@ -492,13 +504,6 @@ indexed_table_select_statement::prepare_command_for_base_query(const query_optio utils::UUID(), query::is_first_page::no, 
options.get_timestamp(state)); - if (use_paging) { - cmd->slice.options.set(); - cmd->slice.options.set(); - if (_schema->clustering_key_size() > 0) { - cmd->slice.options.set(); - } - } return cmd; } @@ -511,7 +516,7 @@ indexed_table_select_statement::do_execute_base_query( gc_clock::time_point now, lw_shared_ptr paging_state) const { using value_type = std::tuple>, lw_shared_ptr>; - auto cmd = prepare_command_for_base_query(options, state, now, bool(paging_state)); + auto cmd = prepare_command_for_base_query(proxy, options, state, now, bool(paging_state)); auto timeout = db::timeout_clock::now() + options.get_timeout_config().*get_timeout_config_selector(); uint32_t queried_ranges_count = partition_ranges.size(); service::query_ranges_to_vnodes_generator ranges_to_vnodes(proxy.get_token_metadata(), _schema, std::move(partition_ranges)); @@ -587,7 +592,7 @@ indexed_table_select_statement::do_execute_base_query( gc_clock::time_point now, lw_shared_ptr paging_state) const { using value_type = std::tuple>, lw_shared_ptr>; - auto cmd = prepare_command_for_base_query(options, state, now, bool(paging_state)); + auto cmd = prepare_command_for_base_query(proxy, options, state, now, bool(paging_state)); auto timeout = db::timeout_clock::now() + options.get_timeout_config().*get_timeout_config_selector(); struct base_query_state { @@ -1155,6 +1160,7 @@ indexed_table_select_statement::read_posting_list(service::storage_proxy& proxy, _view_schema->id(), _view_schema->version(), partition_slice, + proxy.get_max_result_size(partition_slice), query::row_limit(limit), query::partition_limit(query::max_partitions), now, diff --git a/cql3/statements/select_statement.hh b/cql3/statements/select_statement.hh index 800457cab1..e82ed58c99 100644 --- a/cql3/statements/select_statement.hh +++ b/cql3/statements/select_statement.hh @@ -237,7 +237,8 @@ private: lw_shared_ptr paging_state) const; lw_shared_ptr - prepare_command_for_base_query(const query_options& options, 
service::query_state& state, gc_clock::time_point now, bool use_paging) const; + prepare_command_for_base_query(service::storage_proxy& proxy, const query_options& options, service::query_state& state, gc_clock::time_point now, + bool use_paging) const; future>, lw_shared_ptr>> do_execute_base_query( diff --git a/db/schema_tables.cc b/db/schema_tables.cc index 68157acb46..cb3d5dbf03 100644 --- a/db/schema_tables.cc +++ b/db/schema_tables.cc @@ -778,7 +778,8 @@ read_schema_partition_for_table(distributed& proxy, sche auto slice = partition_slice_builder(*schema) .with_range(std::move(clustering_range)) .build(); - auto cmd = make_lw_shared(schema->id(), schema->version(), std::move(slice), query::row_limit(query::max_rows)); + auto cmd = make_lw_shared(schema->id(), schema->version(), std::move(slice), proxy.local().get_max_result_size(slice), + query::row_limit(query::max_rows)); return query_partition_mutation(proxy.local(), std::move(schema), std::move(cmd), std::move(keyspace_key)).then([&proxy] (mutation mut) { return redact_columns_for_missing_features(std::move(mut), proxy.local().get_db().local().features().cluster_schema_features()); }); @@ -788,7 +789,8 @@ future read_keyspace_mutation(distributed& proxy, const sstring& keyspace_name) { schema_ptr s = keyspaces(); auto key = partition_key::from_singular(*s, keyspace_name); - auto cmd = make_lw_shared(s->id(), s->version(), s->full_slice()); + auto slice = s->full_slice(); + auto cmd = make_lw_shared(s->id(), s->version(), std::move(slice), proxy.local().get_max_result_size(slice)); return query_partition_mutation(proxy.local(), std::move(s), std::move(cmd), std::move(key)); } diff --git a/db/system_keyspace.cc b/db/system_keyspace.cc index 41e443f8f2..026f930afd 100644 --- a/db/system_keyspace.cc +++ b/db/system_keyspace.cc @@ -1984,7 +1984,7 @@ query_mutations(distributed& proxy, const sstring& ks_na database& db = proxy.local().get_db().local(); schema_ptr schema = db.find_schema(ks_name, cf_name); auto 
slice = partition_slice_builder(*schema).build(); - auto cmd = make_lw_shared(schema->id(), schema->version(), std::move(slice)); + auto cmd = make_lw_shared(schema->id(), schema->version(), std::move(slice), proxy.local().get_max_result_size(slice)); return proxy.local().query_mutations_locally(std::move(schema), std::move(cmd), query::full_partition_range, db::no_timeout) .then([] (rpc::tuple>, cache_temperature> rr_ht) { return std::get<0>(std::move(rr_ht)); }); } @@ -1994,7 +1994,7 @@ query(distributed& proxy, const sstring& ks_name, const database& db = proxy.local().get_db().local(); schema_ptr schema = db.find_schema(ks_name, cf_name); auto slice = partition_slice_builder(*schema).build(); - auto cmd = make_lw_shared(schema->id(), schema->version(), std::move(slice)); + auto cmd = make_lw_shared(schema->id(), schema->version(), std::move(slice), proxy.local().get_max_result_size(slice)); return proxy.local().query(schema, cmd, {query::full_partition_range}, db::consistency_level::ONE, {db::no_timeout, empty_service_permit(), service::client_state::for_internal_calls(), nullptr}).then([schema, cmd] (auto&& qr) { return make_lw_shared(query::result_set::from_raw_result(schema, cmd->slice, *qr.query_result)); @@ -2009,7 +2009,7 @@ query(distributed& proxy, const sstring& ks_name, const auto slice = partition_slice_builder(*schema) .with_range(std::move(row_range)) .build(); - auto cmd = make_lw_shared(schema->id(), schema->version(), std::move(slice)); + auto cmd = make_lw_shared(schema->id(), schema->version(), std::move(slice), proxy.local().get_max_result_size(slice)); return proxy.local().query(schema, cmd, {dht::partition_range::make_singular(key)}, db::consistency_level::ONE, {db::no_timeout, empty_service_permit(), service::client_state::for_internal_calls(), nullptr}).then([schema, cmd] (auto&& qr) { diff --git a/idl/read_command.idl.hh b/idl/read_command.idl.hh index 29d8a4bcb5..50db525cd1 100644 --- a/idl/read_command.idl.hh +++ 
b/idl/read_command.idl.hh @@ -40,6 +40,11 @@ class partition_slice { uint32_t partition_row_limit() [[version 1.3]] = std::numeric_limits::max(); }; +struct max_result_size { + uint64_t soft_limit; + uint64_t hard_limit; +} + class read_command { utils::UUID cf_id; utils::UUID schema_version; @@ -50,6 +55,7 @@ class read_command { uint32_t partition_limit [[version 1.3]] = std::numeric_limits::max(); utils::UUID query_uuid [[version 2.2]] = utils::UUID(); query::is_first_page is_first_page [[version 2.2]] = query::is_first_page::no; + std::optional max_result_size [[version 4.3]] = std::nullopt; }; } diff --git a/query-request.hh b/query-request.hh index 8cd7505180..e0d45fd9a7 100644 --- a/query-request.hh +++ b/query-request.hh @@ -30,6 +30,7 @@ #include "range.hh" #include "tracing/tracing.hh" #include "utils/small_vector.hh" +#include "query_class_config.hh" class position_in_partition_view; @@ -237,6 +238,10 @@ public: // to avoid doing work normally done on paged requests, e.g. attempting to // reused suspended readers. query::is_first_page is_first_page; + // The maximum size of the query result, for all queries. + // We use the entire value range, so we need an optional for the case when + // the remote doesn't send it. 
+ std::optional max_result_size; api::timestamp_type read_timestamp; // not serialized public: // IDL constructor @@ -248,7 +253,8 @@ public: std::optional ti, uint32_t partition_limit, utils::UUID query_uuid, - query::is_first_page is_first_page) + query::is_first_page is_first_page, + std::optional max_result_size) : cf_id(std::move(cf_id)) , schema_version(std::move(schema_version)) , slice(std::move(slice)) @@ -258,12 +264,14 @@ public: , partition_limit(partition_limit) , query_uuid(query_uuid) , is_first_page(is_first_page) + , max_result_size(max_result_size) , read_timestamp(api::new_timestamp()) { } read_command(utils::UUID cf_id, table_schema_version schema_version, partition_slice slice, + query::max_result_size max_result_size, query::row_limit row_limit = query::row_limit::max, query::partition_limit partition_limit = query::partition_limit::max, gc_clock::time_point now = gc_clock::now(), @@ -280,6 +288,7 @@ public: , partition_limit(static_cast(partition_limit)) , query_uuid(query_uuid) , is_first_page(is_first_page) + , max_result_size(max_result_size) , read_timestamp(rt) { } diff --git a/redis/query_utils.cc b/redis/query_utils.cc index c6fd7b73d7..e77d21824e 100644 --- a/redis/query_utils.cc +++ b/redis/query_utils.cc @@ -75,7 +75,8 @@ public: future> read_strings(service::storage_proxy& proxy, const redis_options& options, const bytes& key, service_permit permit) { auto schema = get_schema(proxy, options.get_keyspace_name(), redis::STRINGs); auto ps = partition_slice_builder(*schema).build(); - query::read_command cmd(schema->id(), schema->version(), ps, 1, gc_clock::now(), std::nullopt, 1, utils::UUID(), query::is_first_page::no); + const auto max_result_size = proxy.get_max_result_size(ps); + query::read_command cmd(schema->id(), schema->version(), ps, 1, gc_clock::now(), std::nullopt, 1, utils::UUID(), query::is_first_page::no, max_result_size); auto pkey = partition_key::from_single_value(*schema, key); auto partition_range = 
dht::partition_range::make_singular(dht::decorate_key(*schema, std::move(pkey))); dht::partition_range_vector partition_ranges; diff --git a/service/pager/query_pagers.cc b/service/pager/query_pagers.cc index a2b603cb50..17811f934c 100644 --- a/service/pager/query_pagers.cc +++ b/service/pager/query_pagers.cc @@ -84,9 +84,14 @@ static bool has_clustering_keys(const schema& s, const query::read_command& cmd) future query_pager::do_fetch_page(uint32_t page_size, gc_clock::time_point now, db::timeout_clock::time_point timeout) { auto state = _options.get_paging_state(); + auto& proxy = get_local_storage_proxy(); + // Most callers should set this but we want to make sure, as results // won't be paged without it. _cmd->slice.options.set(); + // Override this, to make sure we use the value appropriate for paging + // (with allow_short_read set). + _cmd->max_result_size = proxy.get_max_result_size(_cmd->slice); if (!_last_pkey && state) { _max = state->get_remaining(); @@ -191,7 +196,7 @@ static bool has_clustering_keys(const schema& s, const query::read_command& cmd) auto ranges = _ranges; auto command = ::make_lw_shared(*_cmd); - return get_local_storage_proxy().query(_schema, + return proxy.query(_schema, std::move(command), std::move(ranges), _options.get_consistency(), diff --git a/service/storage_proxy.cc b/service/storage_proxy.cc index eb4af0bb14..b6554d7959 100644 --- a/service/storage_proxy.cc +++ b/service/storage_proxy.cc @@ -4345,7 +4345,8 @@ static lw_shared_ptr read_nothing_read_command(schema_ptr s // Note that because this read-nothing command has an empty slice, // storage_proxy::query() returns immediately - without any networking. 
auto partition_slice = query::partition_slice({}, {}, {}, query::partition_slice::option_set()); - return ::make_lw_shared(schema->id(), schema->version(), partition_slice); + return ::make_lw_shared(schema->id(), schema->version(), partition_slice, + query::max_result_size(query::result_memory_limiter::unlimited_result_size)); } static read_timeout_exception write_timeout_to_read(schema_ptr s, mutation_write_timeout_exception& ex) { diff --git a/test/boost/database_test.cc b/test/boost/database_test.cc index faaae1b1ef..a2ef47712e 100644 --- a/test/boost/database_test.cc +++ b/test/boost/database_test.cc @@ -62,7 +62,7 @@ SEASTAR_TEST_CASE(test_safety_after_truncate) { auto assert_query_result = [&] (size_t expected_size) { auto max_size = std::numeric_limits::max(); - auto cmd = query::read_command(s->id(), s->version(), partition_slice_builder(*s).build(), query::row_limit(1000)); + auto cmd = query::read_command(s->id(), s->version(), partition_slice_builder(*s).build(), query::max_result_size(max_size), query::row_limit(1000)); auto&& [result, cache_tempature] = db.query(s, cmd, query::result_options::only_result(), pranges, nullptr, max_size, db::no_timeout).get0(); assert_that(query::result_set::from_raw_result(s, cmd.slice, *result)).has_size(expected_size); }; @@ -105,20 +105,20 @@ SEASTAR_TEST_CASE(test_querying_with_limits) { auto max_size = std::numeric_limits::max(); { - auto cmd = query::read_command(s->id(), s->version(), partition_slice_builder(*s).build(), query::row_limit(3)); + auto cmd = query::read_command(s->id(), s->version(), partition_slice_builder(*s).build(), query::max_result_size(max_size), query::row_limit(3)); auto result = std::get<0>(db.query(s, cmd, query::result_options::only_result(), pranges, nullptr, max_size, db::no_timeout).get0()); assert_that(query::result_set::from_raw_result(s, cmd.slice, *result)).has_size(3); } { - auto cmd = query::read_command(s->id(), s->version(), partition_slice_builder(*s).build(), + auto cmd = 
query::read_command(s->id(), s->version(), partition_slice_builder(*s).build(), query::max_result_size(max_size), query::row_limit(query::max_rows), query::partition_limit(5)); auto result = std::get<0>(db.query(s, cmd, query::result_options::only_result(), pranges, nullptr, max_size, db::no_timeout).get0()); assert_that(query::result_set::from_raw_result(s, cmd.slice, *result)).has_size(5); } { - auto cmd = query::read_command(s->id(), s->version(), partition_slice_builder(*s).build(), + auto cmd = query::read_command(s->id(), s->version(), partition_slice_builder(*s).build(), query::max_result_size(max_size), query::row_limit(query::max_rows), query::partition_limit(3)); auto result = std::get<0>(db.query(s, cmd, query::result_options::only_result(), pranges, nullptr, max_size, db::no_timeout).get0()); assert_that(query::result_set::from_raw_result(s, cmd.slice, *result)).has_size(3); diff --git a/test/boost/multishard_mutation_query_test.cc b/test/boost/multishard_mutation_query_test.cc index 768fb85ac9..a4b3651797 100644 --- a/test/boost/multishard_mutation_query_test.cc +++ b/test/boost/multishard_mutation_query_test.cc @@ -82,7 +82,7 @@ SEASTAR_THREAD_TEST_CASE(test_abandoned_read) { (void)_; auto cmd = query::read_command(s->id(), s->version(), s->full_slice(), 7, gc_clock::now(), std::nullopt, query::max_partitions, - utils::make_random_uuid(), query::is_first_page::yes); + utils::make_random_uuid(), query::is_first_page::yes, query::max_result_size(query::result_memory_limiter::unlimited_result_size)); query_mutations_on_all_shards(env.db(), s, cmd, {query::full_partition_range}, nullptr, std::numeric_limits::max(), db::no_timeout).get(); @@ -104,7 +104,8 @@ static std::vector read_all_partitions_one_by_one(distributedid(), s->version(), s->full_slice()); + const auto cmd = query::read_command(s->id(), s->version(), s->full_slice(), + query::max_result_size(query::result_memory_limiter::unlimited_result_size)); const auto range = 
dht::partition_range::make_singular(pkey); return make_foreign(std::make_unique( std::get<0>(db.query_mutations(std::move(s), cmd, range, std::numeric_limits::max(), nullptr, db::no_timeout).get0()))); @@ -125,7 +126,8 @@ read_partitions_with_paged_scan(distributed& db, schema_ptr s, uint32_ const dht::partition_range& range, const query::partition_slice& slice, const std::function& page_hook = {}) { const auto query_uuid = is_stateful ? utils::make_random_uuid() : utils::UUID{}; std::vector results; - auto cmd = query::read_command(s->id(), s->version(), slice, page_size, gc_clock::now(), std::nullopt, query::max_partitions, query_uuid, query::is_first_page::yes); + auto cmd = query::read_command(s->id(), s->version(), slice, page_size, gc_clock::now(), std::nullopt, query::max_partitions, query_uuid, + query::is_first_page::yes, query::max_result_size(max_size)); bool has_more = true; diff --git a/test/boost/querier_cache_test.cc b/test/boost/querier_cache_test.cc index cc5ef71e14..93eeff994a 100644 --- a/test/boost/querier_cache_test.cc +++ b/test/boost/querier_cache_test.cc @@ -667,7 +667,8 @@ SEASTAR_THREAD_TEST_CASE(test_resources_based_cache_eviction) { std::nullopt, 1, utils::make_random_uuid(), - query::is_first_page::yes); + query::is_first_page::yes, + query::max_result_size(1024 * 1024)); // Should save the querier in cache. db.query_mutations(s, @@ -702,7 +703,8 @@ SEASTAR_THREAD_TEST_CASE(test_resources_based_cache_eviction) { std::nullopt, 1, utils::make_random_uuid(), - query::is_first_page::no); + query::is_first_page::no, + query::max_result_size(1024 * 1024)); // Should evict the already cached querier. 
db.query_mutations(s, diff --git a/thrift/handler.cc b/thrift/handler.cc index f0625010a2..2fea15f932 100644 --- a/thrift/handler.cc +++ b/thrift/handler.cc @@ -281,13 +281,14 @@ public: if (!column_parent.super_column.empty()) { fail(unimplemented::cause::SUPER); } - auto cmd = slice_pred_to_read_cmd(*schema, predicate); + auto& proxy = service::get_local_storage_proxy(); + auto cmd = slice_pred_to_read_cmd(proxy, *schema, predicate); auto cell_limit = predicate.__isset.slice_range ? static_cast(predicate.slice_range.count) : std::numeric_limits::max(); auto pranges = make_partition_ranges(*schema, keys); auto f = _query_state.get_client_state().has_schema_access(*schema, auth::permission::SELECT); - return f.then([this, schema, cmd, pranges = std::move(pranges), cell_limit, consistency_level, keys]() mutable { + return f.then([this, &proxy, schema, cmd, pranges = std::move(pranges), cell_limit, consistency_level, keys]() mutable { auto timeout = db::timeout_clock::now() + _timeout_config.read_timeout; - return service::get_local_storage_proxy().query(schema, cmd, std::move(pranges), cl_from_thrift(consistency_level), {timeout, empty_service_permit(), _query_state.get_client_state()}).then( + return proxy.query(schema, cmd, std::move(pranges), cl_from_thrift(consistency_level), {timeout, empty_service_permit(), _query_state.get_client_state()}).then( [schema, cmd, cell_limit, keys = std::move(keys)](service::storage_proxy::coordinator_query_result qr) { return query::result_view::do_with(*qr.query_result, [schema, cmd, cell_limit, keys = std::move(keys)](query::result_view v) mutable { if (schema->is_counter()) { @@ -309,13 +310,14 @@ public: if (!column_parent.super_column.empty()) { fail(unimplemented::cause::SUPER); } - auto cmd = slice_pred_to_read_cmd(*schema, predicate); + auto& proxy = service::get_local_storage_proxy(); + auto cmd = slice_pred_to_read_cmd(proxy, *schema, predicate); auto cell_limit = predicate.__isset.slice_range ? 
static_cast(predicate.slice_range.count) : std::numeric_limits::max(); auto pranges = make_partition_ranges(*schema, keys); auto f = _query_state.get_client_state().has_schema_access(*schema, auth::permission::SELECT); - return f.then([this, schema, cmd, pranges = std::move(pranges), cell_limit, consistency_level, keys]() mutable { + return f.then([this, &proxy, schema, cmd, pranges = std::move(pranges), cell_limit, consistency_level, keys]() mutable { auto timeout = db::timeout_clock::now() + _timeout_config.read_timeout; - return service::get_local_storage_proxy().query(schema, cmd, std::move(pranges), cl_from_thrift(consistency_level), {timeout, empty_service_permit(), _query_state.get_client_state()}).then( + return proxy.query(schema, cmd, std::move(pranges), cl_from_thrift(consistency_level), {timeout, empty_service_permit(), _query_state.get_client_state()}).then( [schema, cmd, cell_limit, keys = std::move(keys)](service::storage_proxy::coordinator_query_result qr) { return query::result_view::do_with(*qr.query_result, [schema, cmd, cell_limit, keys = std::move(keys)](query::result_view v) mutable { column_counter counter(*schema, cmd->slice, cell_limit, std::move(keys)); @@ -338,8 +340,9 @@ public: if (!column_parent.super_column.empty()) { fail(unimplemented::cause::SUPER); } + auto& proxy = service::get_local_storage_proxy(); auto&& prange = make_partition_range(*schema, range); - auto cmd = slice_pred_to_read_cmd(*schema, predicate); + auto cmd = slice_pred_to_read_cmd(proxy, *schema, predicate); // KeyRange::count is the number of thrift rows to return, while // SlicePredicte::slice_range::count limits the number of thrift colums. 
if (schema->thrift().is_dynamic()) { @@ -350,9 +353,9 @@ public: cmd->row_limit = range.count; } auto f = _query_state.get_client_state().has_schema_access(*schema, auth::permission::SELECT); - return f.then([this, schema, cmd, prange = std::move(prange), consistency_level] () mutable { + return f.then([this, &proxy, schema, cmd, prange = std::move(prange), consistency_level] () mutable { auto timeout = db::timeout_clock::now() + _timeout_config.range_read_timeout; - return service::get_local_storage_proxy().query(schema, cmd, std::move(prange), cl_from_thrift(consistency_level), {timeout, empty_service_permit(), _query_state.get_client_state()}).then( + return proxy.query(schema, cmd, std::move(prange), cl_from_thrift(consistency_level), {timeout, empty_service_permit(), _query_state.get_client_state()}).then( [schema, cmd](service::storage_proxy::coordinator_query_result qr) { return query::result_view::do_with(*qr.query_result, [schema, cmd](query::result_view v) { return to_key_slices(*schema, cmd->slice, v, std::numeric_limits::max()); @@ -362,7 +365,7 @@ public: }); } - static lw_shared_ptr make_paged_read_cmd(const schema& s, uint32_t column_limit, const std::string* start_column, const dht::partition_range_vector& range) { + static lw_shared_ptr make_paged_read_cmd(service::storage_proxy& proxy, const schema& s, uint32_t column_limit, const std::string* start_column, const dht::partition_range_vector& range) { auto opts = query_opts(s); std::vector clustering_ranges; query::column_id_vector regular_columns; @@ -394,7 +397,8 @@ public: clustering_ranges.emplace_back(query::clustering_range::make_open_ended_both_sides()); auto slice = query::partition_slice(std::move(clustering_ranges), { }, std::move(regular_columns), opts, std::move(specific_ranges), cql_serialization_format::internal()); - return make_lw_shared(s.id(), s.version(), std::move(slice), query::row_limit(row_limit), query::partition_limit(partition_limit)); + return make_lw_shared(s.id(), 
s.version(), std::move(slice), proxy.get_max_result_size(slice), + query::row_limit(row_limit), query::partition_limit(partition_limit)); } static future<> do_get_paged_slice( @@ -406,7 +410,8 @@ public: const ::timeout_config& timeout_config, std::vector& output, service::query_state& qs) { - auto cmd = make_paged_read_cmd(*schema, column_limit, start_column, range); + auto& proxy = service::get_local_storage_proxy(); + auto cmd = make_paged_read_cmd(proxy, *schema, column_limit, start_column, range); std::optional start_key; auto end = range[0].end(); if (start_column && !schema->thrift().is_dynamic()) { @@ -417,7 +422,7 @@ public: } auto range1 = range; // query() below accepts an rvalue, so need a copy to reuse later auto timeout = db::timeout_clock::now() + timeout_config.range_read_timeout; - return service::get_local_storage_proxy().query(schema, cmd, std::move(range), consistency_level, {timeout, empty_service_permit(), qs.get_client_state()}).then( + return proxy.query(schema, cmd, std::move(range), consistency_level, {timeout, empty_service_permit(), qs.get_client_state()}).then( [schema, cmd, column_limit](service::storage_proxy::coordinator_query_result qr) { return query::result_view::do_with(*qr.query_result, [schema, cmd, column_limit](query::result_view v) { return to_key_slices(*schema, cmd->slice, v, column_limit); @@ -664,11 +669,13 @@ public: } } auto slice = query::partition_slice(std::move(clustering_ranges), {}, std::move(regular_columns), opts, nullptr); - auto cmd = make_lw_shared(schema->id(), schema->version(), std::move(slice), query::row_limit(row_limit)); + auto& proxy = service::get_local_storage_proxy(); + auto cmd = make_lw_shared(schema->id(), schema->version(), std::move(slice), proxy.get_max_result_size(slice), + query::row_limit(row_limit)); auto f = _query_state.get_client_state().has_schema_access(*schema, auth::permission::SELECT); - return f.then([this, dk = std::move(dk), cmd, schema, column_limit = request.count, cl = 
request.consistency_level] { + return f.then([this, &proxy, dk = std::move(dk), cmd, schema, column_limit = request.count, cl = request.consistency_level] { auto timeout = db::timeout_clock::now() + _timeout_config.read_timeout; - return service::get_local_storage_proxy().query(schema, cmd, {dht::partition_range::make_singular(dk)}, cl_from_thrift(cl), {timeout, /* FIXME: pass real permit */empty_service_permit(), _query_state.get_client_state()}).then( + return proxy.query(schema, cmd, {dht::partition_range::make_singular(dk)}, cl_from_thrift(cl), {timeout, /* FIXME: pass real permit */empty_service_permit(), _query_state.get_client_state()}).then( [schema, cmd, column_limit](service::storage_proxy::coordinator_query_result qr) { return query::result_view::do_with(*qr.query_result, [schema, cmd, column_limit](query::result_view v) { column_aggregator aggregator(*schema, cmd->slice, column_limit, { }); @@ -1464,7 +1471,7 @@ private: opts.set(query::partition_slice::option::send_partition_key); return opts; } - static lw_shared_ptr slice_pred_to_read_cmd(const schema& s, const SlicePredicate& predicate) { + static lw_shared_ptr slice_pred_to_read_cmd(service::storage_proxy& proxy, const schema& s, const SlicePredicate& predicate) { auto opts = query_opts(s); std::vector clustering_ranges; query::column_id_vector regular_columns; @@ -1511,7 +1518,7 @@ private: } auto slice = query::partition_slice(std::move(clustering_ranges), {}, std::move(regular_columns), opts, nullptr, cql_serialization_format::internal(), per_partition_row_limit); - return make_lw_shared(s.id(), s.version(), std::move(slice)); + return make_lw_shared(s.id(), s.version(), std::move(slice), proxy.get_max_result_size(slice)); } static ColumnParent column_path_to_column_parent(const ColumnPath& column_path) { ColumnParent ret; From fbbbc3e05c5fc13c4aaa9ac11871648de207d954 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Botond=20D=C3=A9nes?= Date: Tue, 28 Jul 2020 09:56:22 +0300 Subject: [PATCH 17/26] 
query: result_memory_limiter: use the new max_result_size type --- database.cc | 2 +- multishard_mutation_query.cc | 2 +- query-result.hh | 24 ++++++++++++------------ table.cc | 2 +- test/boost/mutation_query_test.cc | 6 ++++-- 5 files changed, 19 insertions(+), 17 deletions(-) diff --git a/database.cc b/database.cc index 0340798244..c655096cca 100644 --- a/database.cc +++ b/database.cc @@ -1207,7 +1207,7 @@ database::query(schema_ptr s, const query::read_command& cmd, query::result_opti future> database::query_mutations(schema_ptr s, const query::read_command& cmd, const dht::partition_range& range, size_t result_max_size, tracing::trace_state_ptr trace_state, db::timeout_clock::time_point timeout) { - return get_result_memory_limiter().new_mutation_read(result_max_size).then( + return get_result_memory_limiter().new_mutation_read(query::max_result_size(result_max_size)).then( [&, s = std::move(s), trace_state = std::move(trace_state), timeout] (query::result_memory_accounter accounter) { column_family& cf = find_column_family(cmd.cf_id); query::querier_cache_context cache_ctx(_querier_cache, cmd.query_uuid, cmd.is_first_page); diff --git a/multishard_mutation_query.cc b/multishard_mutation_query.cc index f2177f1c57..d280a04a0d 100644 --- a/multishard_mutation_query.cc +++ b/multishard_mutation_query.cc @@ -668,7 +668,7 @@ future>, cache_tempera db.local().find_column_family(s).get_global_cache_hit_rate())); } - return db.local().get_result_memory_limiter().new_mutation_read(max_size).then([&, s = std::move(s), trace_state = std::move(trace_state), + return db.local().get_result_memory_limiter().new_mutation_read(query::max_result_size(max_size)).then([&, s = std::move(s), trace_state = std::move(trace_state), timeout] (query::result_memory_accounter accounter) mutable { return do_query_mutations(db, s, cmd, ranges, std::move(trace_state), timeout, std::move(accounter)).then_wrapped( [&db, s = std::move(s)] (future&& f) { diff --git a/query-result.hh 
b/query-result.hh index fdff5d3740..2dc70377ba 100644 --- a/query-result.hh +++ b/query-result.hh @@ -68,18 +68,18 @@ public: // Reserves minimum_result_size and creates new memory accounter for // mutation query. Uses the specified maximum result size and may be // stopped before reaching it due to memory pressure on shard. - future new_mutation_read(size_t max_result_size); + future new_mutation_read(query::max_result_size max_result_size); // Reserves minimum_result_size and creates new memory accounter for // data query. Uses the specified maximum result size, result will *not* // be stopped due to on shard memory pressure in order to avoid digest // mismatches. - future new_data_read(size_t max_result_size); + future new_data_read(query::max_result_size max_result_size); // Creates a memory accounter for digest reads. Such accounter doesn't // contribute to the shard memory usage, but still stops producing the // result after individual limit has been reached. - future new_digest_read(size_t max_result_size); + future new_digest_read(query::max_result_size max_result_size); // Checks whether the result can grow any more, takes into account only // the per shard limit. @@ -119,13 +119,13 @@ class result_memory_accounter { size_t _blocked_bytes = 0; size_t _used_memory = 0; size_t _total_used_memory = 0; - size_t _maximum_result_size = 0; + query::max_result_size _maximum_result_size; stop_iteration _stop_on_global_limit; private: // Mutation query accounter. Uses provided individual result size limit and // will stop when shard memory pressure grows too high. 
struct mutation_query_tag { }; - explicit result_memory_accounter(mutation_query_tag, result_memory_limiter& limiter, size_t max_size) noexcept + explicit result_memory_accounter(mutation_query_tag, result_memory_limiter& limiter, query::max_result_size max_size) noexcept : _limiter(&limiter) , _blocked_bytes(result_memory_limiter::minimum_result_size) , _maximum_result_size(max_size) @@ -135,7 +135,7 @@ private: // Data query accounter. Uses provided individual result size limit and // will *not* stop even though shard memory pressure grows too high. struct data_query_tag { }; - explicit result_memory_accounter(data_query_tag, result_memory_limiter& limiter, size_t max_size) noexcept + explicit result_memory_accounter(data_query_tag, result_memory_limiter& limiter, query::max_result_size max_size) noexcept : _limiter(&limiter) , _blocked_bytes(result_memory_limiter::minimum_result_size) , _maximum_result_size(max_size) @@ -145,7 +145,7 @@ private: // will *not* stop even though shard memory pressure grows too high. This // accounter does not contribute to the shard memory limits. struct digest_query_tag { }; - explicit result_memory_accounter(digest_query_tag, result_memory_limiter&, size_t max_size) noexcept + explicit result_memory_accounter(digest_query_tag, result_memory_limiter&, query::max_result_size max_size) noexcept : _blocked_bytes(0) , _maximum_result_size(max_size) { } @@ -185,7 +185,7 @@ public: stop_iteration update_and_check(size_t n) { _used_memory += n; _total_used_memory += n; - auto stop = stop_iteration(_total_used_memory > _maximum_result_size); + auto stop = stop_iteration(_total_used_memory > _maximum_result_size.hard_limit); if (_limiter && _used_memory > _blocked_bytes) { auto to_block = std::min(_used_memory - _blocked_bytes, n); _blocked_bytes += to_block; @@ -196,7 +196,7 @@ public: // Checks whether the result can grow any more. 
stop_iteration check() const { - stop_iteration stop { _total_used_memory > _maximum_result_size }; + stop_iteration stop { _total_used_memory > _maximum_result_size.hard_limit }; if (!stop && _used_memory >= _blocked_bytes && _limiter) { return _limiter->check() && _stop_on_global_limit; } @@ -217,19 +217,19 @@ public: } }; -inline future result_memory_limiter::new_mutation_read(size_t max_size) { +inline future result_memory_limiter::new_mutation_read(query::max_result_size max_size) { return _memory_limiter.wait(minimum_result_size).then([this, max_size] { return result_memory_accounter(result_memory_accounter::mutation_query_tag(), *this, max_size); }); } -inline future result_memory_limiter::new_data_read(size_t max_size) { +inline future result_memory_limiter::new_data_read(query::max_result_size max_size) { return _memory_limiter.wait(minimum_result_size).then([this, max_size] { return result_memory_accounter(result_memory_accounter::data_query_tag(), *this, max_size); }); } -inline future result_memory_limiter::new_digest_read(size_t max_size) { +inline future result_memory_limiter::new_digest_read(query::max_result_size max_size) { return make_ready_future(result_memory_accounter(result_memory_accounter::digest_query_tag(), *this, max_size)); } diff --git a/table.cc b/table.cc index b7d77e674c..23ccb61441 100644 --- a/table.cc +++ b/table.cc @@ -2043,7 +2043,7 @@ table::query(schema_ptr s, utils::latency_counter lc; _stats.reads.set_latency(lc); auto f = opts.request == query::result_request::only_digest - ? memory_limiter.new_digest_read(max_size) : memory_limiter.new_data_read(max_size); + ? 
memory_limiter.new_digest_read(query::max_result_size(max_size)) : memory_limiter.new_data_read(query::max_result_size(max_size)); return f.then([this, lc, s = std::move(s), &cmd, class_config, opts, &partition_ranges, trace_state = std::move(trace_state), timeout, cache_ctx = std::move(cache_ctx)] (query::result_memory_accounter accounter) mutable { auto qs_ptr = std::make_unique(std::move(s), cmd, opts, partition_ranges, std::move(accounter)); diff --git a/test/boost/mutation_query_test.cc b/test/boost/mutation_query_test.cc index 38056eb1a5..1c268759a5 100644 --- a/test/boost/mutation_query_test.cc +++ b/test/boost/mutation_query_test.cc @@ -547,11 +547,13 @@ SEASTAR_THREAD_TEST_CASE(test_result_size_calculation) { query::partition_slice slice = make_full_slice(*s); slice.options.set(); - query::result::builder digest_only_builder(slice, query::result_options{query::result_request::only_digest, query::digest_algorithm::xxHash}, l.new_digest_read(query::result_memory_limiter::maximum_result_size).get0()); + query::result::builder digest_only_builder(slice, query::result_options{query::result_request::only_digest, query::digest_algorithm::xxHash}, + l.new_digest_read(query::max_result_size(query::result_memory_limiter::maximum_result_size)).get0()); data_query(s, source, query::full_partition_range, slice, std::numeric_limits::max(), std::numeric_limits::max(), gc_clock::now(), digest_only_builder, db::no_timeout, tests::make_query_class_config()).get0(); - query::result::builder result_and_digest_builder(slice, query::result_options{query::result_request::result_and_digest, query::digest_algorithm::xxHash}, l.new_data_read(query::result_memory_limiter::maximum_result_size).get0()); + query::result::builder result_and_digest_builder(slice, query::result_options{query::result_request::result_and_digest, query::digest_algorithm::xxHash}, + l.new_data_read(query::max_result_size(query::result_memory_limiter::maximum_result_size)).get0()); data_query(s, source, 
query::full_partition_range, slice, std::numeric_limits::max(), std::numeric_limits::max(), gc_clock::now(), result_and_digest_builder, db::no_timeout, tests::make_query_class_config()).get0(); From 159d37053da18fc4aec1c3007a567d324858b8ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Botond=20D=C3=A9nes?= Date: Thu, 28 May 2020 10:33:13 +0300 Subject: [PATCH 18/26] storage_proxy: use read_command::max_result_size to pass max result size around Use the recently added `max_result_size` field of `query::read_command` to pass the max result size around, including passing it to remote nodes. This means that the max result size will be sent along each read, instead of once per connection. As we want to select the appropriate `max_result_size` based on the type of the query as well as based on the query class (user or internal) the previous method won't do anymore. If the remote doesn't fill this field, the old per-connection value is used. --- database.cc | 7 +-- database.hh | 7 +-- db/schema_tables.cc | 2 +- multishard_mutation_query.cc | 3 +- multishard_mutation_query.hh | 1 - service/paxos/paxos_state.cc | 2 +- service/storage_proxy.cc | 64 +++++++++++--------- service/storage_proxy.hh | 17 ++---- table.cc | 3 +- test/boost/database_test.cc | 8 +-- test/boost/multishard_mutation_query_test.cc | 8 +-- test/boost/querier_cache_test.cc | 4 +- 12 files changed, 60 insertions(+), 66 deletions(-) diff --git a/database.cc b/database.cc index c655096cca..5a7b79f498 100644 --- a/database.cc +++ b/database.cc @@ -1178,7 +1178,7 @@ compare_atomic_cell_for_merge(atomic_cell_view left, atomic_cell_view right) { future, cache_temperature>> database::query(schema_ptr s, const query::read_command& cmd, query::result_options opts, const dht::partition_range_vector& ranges, - tracing::trace_state_ptr trace_state, uint64_t max_result_size, db::timeout_clock::time_point timeout) { + tracing::trace_state_ptr trace_state, db::timeout_clock::time_point timeout) { column_family& cf = 
find_column_family(cmd.cf_id); query::querier_cache_context cache_ctx(_querier_cache, cmd.query_uuid, cmd.is_first_page); return _data_query_stage(&cf, @@ -1189,7 +1189,6 @@ database::query(schema_ptr s, const query::read_command& cmd, query::result_opti seastar::cref(ranges), std::move(trace_state), seastar::ref(get_result_memory_limiter()), - max_result_size, timeout, std::move(cache_ctx)).then_wrapped([this, s = _stats, hit_rate = cf.get_global_cache_hit_rate(), op = cf.read_in_progress()] (auto f) { if (f.failed()) { @@ -1206,8 +1205,8 @@ database::query(schema_ptr s, const query::read_command& cmd, query::result_opti future> database::query_mutations(schema_ptr s, const query::read_command& cmd, const dht::partition_range& range, - size_t result_max_size, tracing::trace_state_ptr trace_state, db::timeout_clock::time_point timeout) { - return get_result_memory_limiter().new_mutation_read(query::max_result_size(result_max_size)).then( + tracing::trace_state_ptr trace_state, db::timeout_clock::time_point timeout) { + return get_result_memory_limiter().new_mutation_read(*cmd.max_result_size).then( [&, s = std::move(s), trace_state = std::move(trace_state), timeout] (query::result_memory_accounter accounter) { column_family& cf = find_column_family(cmd.cf_id); query::querier_cache_context cache_ctx(_querier_cache, cmd.query_uuid, cmd.is_first_page); diff --git a/database.hh b/database.hh index 090d3be332..26b5e92055 100644 --- a/database.hh +++ b/database.hh @@ -753,7 +753,6 @@ public: const dht::partition_range_vector& ranges, tracing::trace_state_ptr trace_state, query::result_memory_limiter& memory_limiter, - uint64_t max_result_size, db::timeout_clock::time_point timeout, query::querier_cache_context cache_ctx = { }); @@ -1299,7 +1298,6 @@ private: const dht::partition_range_vector&, tracing::trace_state_ptr, query::result_memory_limiter&, - uint64_t, db::timeout_clock::time_point, query::querier_cache_context> _data_query_stage; @@ -1464,10 +1462,9 @@ public: 
unsigned shard_of(const frozen_mutation& m); future, cache_temperature>> query(schema_ptr, const query::read_command& cmd, query::result_options opts, const dht::partition_range_vector& ranges, tracing::trace_state_ptr trace_state, - uint64_t max_result_size, db::timeout_clock::time_point timeout); + db::timeout_clock::time_point timeout); future> query_mutations(schema_ptr, const query::read_command& cmd, const dht::partition_range& range, - size_t max_result_size, tracing::trace_state_ptr trace_state, - db::timeout_clock::time_point timeout); + tracing::trace_state_ptr trace_state, db::timeout_clock::time_point timeout); // Apply the mutation atomically. // Throws timed_out_error when timeout is reached. future<> apply(schema_ptr, const frozen_mutation&, tracing::trace_state_ptr tr_state, db::commitlog::force_sync sync, db::timeout_clock::time_point timeout); diff --git a/db/schema_tables.cc b/db/schema_tables.cc index cb3d5dbf03..0014cf22e3 100644 --- a/db/schema_tables.cc +++ b/db/schema_tables.cc @@ -743,7 +743,7 @@ future query_partition_mutation(service::storage_proxy& proxy, { auto dk = dht::decorate_key(*s, pkey); return do_with(dht::partition_range::make_singular(dk), [&proxy, dk, s = std::move(s), cmd = std::move(cmd)] (auto& range) { - return proxy.query_mutations_locally(s, std::move(cmd), range, db::no_timeout) + return proxy.query_mutations_locally(s, std::move(cmd), range, db::no_timeout, tracing::trace_state_ptr{}) .then([dk = std::move(dk), s](rpc::tuple>, cache_temperature> res_hit_rate) { auto&& [res, hit_rate] = res_hit_rate; auto&& partitions = res->partitions(); diff --git a/multishard_mutation_query.cc b/multishard_mutation_query.cc index d280a04a0d..fc10c97bc6 100644 --- a/multishard_mutation_query.cc +++ b/multishard_mutation_query.cc @@ -659,7 +659,6 @@ future>, cache_tempera const query::read_command& cmd, const dht::partition_range_vector& ranges, tracing::trace_state_ptr trace_state, - uint64_t max_size, db::timeout_clock::time_point 
timeout) { if (cmd.row_limit == 0 || cmd.slice.partition_row_limit() == 0 || cmd.partition_limit == 0) { return make_ready_future>, cache_temperature>>( @@ -668,7 +667,7 @@ future>, cache_tempera db.local().find_column_family(s).get_global_cache_hit_rate())); } - return db.local().get_result_memory_limiter().new_mutation_read(query::max_result_size(max_size)).then([&, s = std::move(s), trace_state = std::move(trace_state), + return db.local().get_result_memory_limiter().new_mutation_read(*cmd.max_result_size).then([&, s = std::move(s), trace_state = std::move(trace_state), timeout] (query::result_memory_accounter accounter) mutable { return do_query_mutations(db, s, cmd, ranges, std::move(trace_state), timeout, std::move(accounter)).then_wrapped( [&db, s = std::move(s)] (future&& f) { diff --git a/multishard_mutation_query.hh b/multishard_mutation_query.hh index 8d49a4dd5b..f0baed9c78 100644 --- a/multishard_mutation_query.hh +++ b/multishard_mutation_query.hh @@ -67,5 +67,4 @@ future>, cache_tempera const query::read_command& cmd, const dht::partition_range_vector& ranges, tracing::trace_state_ptr trace_state, - uint64_t max_size, db::timeout_clock::time_point timeout); diff --git a/service/paxos/paxos_state.cc b/service/paxos/paxos_state.cc index 4fabfa5c6d..a106f39b68 100644 --- a/service/paxos/paxos_state.cc +++ b/service/paxos/paxos_state.cc @@ -81,7 +81,7 @@ future paxos_state::prepare(tracing::trace_state_ptr tr_state, [tr_state, schema, &cmd, only_digest, da, timeout] (const dht::partition_range_vector& prv) { return get_local_storage_proxy().get_db().local().query(schema, cmd, {only_digest ? 
query::result_request::only_digest : query::result_request::result_and_digest, da}, - prv, tr_state, query::result_memory_limiter::maximum_result_size, timeout); + prv, tr_state, timeout); }); }); return when_all(std::move(f1), std::move(f2)).then([state = std::move(state), only_digest] (auto t) { diff --git a/service/storage_proxy.cc b/service/storage_proxy.cc index b6554d7959..bd0bc6e221 100644 --- a/service/storage_proxy.cc +++ b/service/storage_proxy.cc @@ -3872,8 +3872,8 @@ db::read_repair_decision storage_proxy::new_read_repair_decision(const schema& s } future> -storage_proxy::query_result_local_digest(schema_ptr s, lw_shared_ptr cmd, const dht::partition_range& pr, tracing::trace_state_ptr trace_state, storage_proxy::clock_type::time_point timeout, query::digest_algorithm da, uint64_t max_size) { - return query_result_local(std::move(s), std::move(cmd), pr, query::result_options::only_digest(da), std::move(trace_state), timeout, max_size).then([] (rpc::tuple>, cache_temperature> result_and_hit_rate) { +storage_proxy::query_result_local_digest(schema_ptr s, lw_shared_ptr cmd, const dht::partition_range& pr, tracing::trace_state_ptr trace_state, storage_proxy::clock_type::time_point timeout, query::digest_algorithm da) { + return query_result_local(std::move(s), std::move(cmd), pr, query::result_options::only_digest(da), std::move(trace_state), timeout).then([] (rpc::tuple>, cache_temperature> result_and_hit_rate) { auto&& [result, hit_rate] = result_and_hit_rate; return make_ready_future>(rpc::tuple(*result->digest(), result->last_modified(), hit_rate)); }); @@ -3881,15 +3881,15 @@ storage_proxy::query_result_local_digest(schema_ptr s, lw_shared_ptr>, cache_temperature>> storage_proxy::query_result_local(schema_ptr s, lw_shared_ptr cmd, const dht::partition_range& pr, query::result_options opts, - tracing::trace_state_ptr trace_state, storage_proxy::clock_type::time_point timeout, uint64_t max_size) { + tracing::trace_state_ptr trace_state, 
storage_proxy::clock_type::time_point timeout) { cmd->slice.options.set_if(opts.request != query::result_request::only_result); if (pr.is_singular()) { unsigned shard = dht::shard_of(*s, pr.start()->value().token()); get_stats().replica_cross_shard_ops += shard != this_shard_id(); - return _db.invoke_on(shard, _read_smp_service_group, [max_size, gs = global_schema_ptr(s), prv = dht::partition_range_vector({pr}) /* FIXME: pr is copied */, cmd, opts, timeout, gt = tracing::global_trace_state_ptr(std::move(trace_state))] (database& db) mutable { + return _db.invoke_on(shard, _read_smp_service_group, [gs = global_schema_ptr(s), prv = dht::partition_range_vector({pr}) /* FIXME: pr is copied */, cmd, opts, timeout, gt = tracing::global_trace_state_ptr(std::move(trace_state))] (database& db) mutable { auto trace_state = gt.get(); tracing::trace(trace_state, "Start querying the token range that starts with {}", seastar::value_of([&prv] { return prv.begin()->start()->value().token(); })); - return db.query(gs, *cmd, opts, prv, trace_state, max_size, timeout).then([trace_state](std::tuple, cache_temperature>&& f_ht) { + return db.query(gs, *cmd, opts, prv, trace_state, timeout).then([trace_state](std::tuple, cache_temperature>&& f_ht) { auto&& [f, ht] = f_ht; tracing::trace(trace_state, "Querying is done"); return make_ready_future>, cache_temperature>>(rpc::tuple(make_foreign(std::move(f)), ht)); @@ -3897,7 +3897,7 @@ storage_proxy::query_result_local(schema_ptr s, lw_shared_ptr>, cache_temperature>&& r_ht) { + return query_nonsingular_mutations_locally(s, cmd, {pr}, std::move(trace_state), timeout).then([s, cmd, opts] (rpc::tuple>, cache_temperature>&& r_ht) { auto&& [r, ht] = r_ht; return make_ready_future>, cache_temperature>>( rpc::tuple(::make_foreign(::make_lw_shared(to_data_query_result(*r, s, cmd->slice, cmd->row_limit, cmd->partition_limit, opts))), ht)); @@ -4936,11 +4936,13 @@ void storage_proxy::init_messaging_service() { tracing::trace(trace_state_ptr, 
"read_data: message received from /{}", src_addr.addr); } auto da = oda.value_or(query::digest_algorithm::MD5); - auto max_size = cinfo.retrieve_auxiliary("max_result_size"); - return do_with(std::move(pr), get_local_shared_storage_proxy(), std::move(trace_state_ptr), [&cinfo, cmd = make_lw_shared(std::move(cmd)), src_addr = std::move(src_addr), da, max_size, t] (::compat::wrapping_partition_range& pr, shared_ptr& p, tracing::trace_state_ptr& trace_state_ptr) mutable { + if (!cmd.max_result_size) { + cmd.max_result_size.emplace(cinfo.retrieve_auxiliary("max_result_size")); + } + return do_with(std::move(pr), get_local_shared_storage_proxy(), std::move(trace_state_ptr), [&cinfo, cmd = make_lw_shared(std::move(cmd)), src_addr = std::move(src_addr), da, t] (::compat::wrapping_partition_range& pr, shared_ptr& p, tracing::trace_state_ptr& trace_state_ptr) mutable { p->get_stats().replica_data_reads++; auto src_ip = src_addr.addr; - return get_schema_for_read(cmd->schema_version, std::move(src_addr)).then([cmd, da, &pr, &p, &trace_state_ptr, max_size, t] (schema_ptr s) { + return get_schema_for_read(cmd->schema_version, std::move(src_addr)).then([cmd, da, &pr, &p, &trace_state_ptr, t] (schema_ptr s) { auto pr2 = ::compat::unwrap(std::move(pr), *s); if (pr2.second) { // this function assumes singular queries but doesn't validate @@ -4950,7 +4952,7 @@ void storage_proxy::init_messaging_service() { opts.digest_algo = da; opts.request = da == query::digest_algorithm::none ? query::result_request::only_result : query::result_request::result_and_digest; auto timeout = t ? 
*t : db::no_timeout; - return p->query_result_local(std::move(s), cmd, std::move(pr2.first), opts, trace_state_ptr, timeout, max_size); + return p->query_result_local(std::move(s), cmd, std::move(pr2.first), opts, trace_state_ptr, timeout); }).finally([&trace_state_ptr, src_ip] () mutable { tracing::trace(trace_state_ptr, "read_data handling is done, sending a response to /{}", src_ip); }); @@ -4964,22 +4966,24 @@ void storage_proxy::init_messaging_service() { tracing::begin(trace_state_ptr); tracing::trace(trace_state_ptr, "read_mutation_data: message received from /{}", src_addr.addr); } - auto max_size = cinfo.retrieve_auxiliary("max_result_size"); + if (!cmd.max_result_size) { + cmd.max_result_size.emplace(cinfo.retrieve_auxiliary("max_result_size")); + } return do_with(std::move(pr), get_local_shared_storage_proxy(), std::move(trace_state_ptr), ::compat::one_or_two_partition_ranges({}), - [&cinfo, cmd = make_lw_shared(std::move(cmd)), src_addr = std::move(src_addr), max_size, t] ( + [&cinfo, cmd = make_lw_shared(std::move(cmd)), src_addr = std::move(src_addr), t] ( ::compat::wrapping_partition_range& pr, shared_ptr& p, tracing::trace_state_ptr& trace_state_ptr, ::compat::one_or_two_partition_ranges& unwrapped) mutable { p->get_stats().replica_mutation_data_reads++; auto src_ip = src_addr.addr; - return get_schema_for_read(cmd->schema_version, std::move(src_addr)).then([cmd, &pr, &p, &trace_state_ptr, max_size, &unwrapped, t] (schema_ptr s) mutable { + return get_schema_for_read(cmd->schema_version, std::move(src_addr)).then([cmd, &pr, &p, &trace_state_ptr, &unwrapped, t] (schema_ptr s) mutable { unwrapped = ::compat::unwrap(std::move(pr), *s); auto timeout = t ? 
*t : db::no_timeout; - return p->query_mutations_locally(std::move(s), std::move(cmd), unwrapped, timeout, trace_state_ptr, max_size); + return p->query_mutations_locally(std::move(s), std::move(cmd), unwrapped, timeout, trace_state_ptr); }).finally([&trace_state_ptr, src_ip] () mutable { tracing::trace(trace_state_ptr, "read_mutation_data handling is done, sending a response to /{}", src_ip); }); @@ -4994,18 +4998,20 @@ void storage_proxy::init_messaging_service() { tracing::trace(trace_state_ptr, "read_digest: message received from /{}", src_addr.addr); } auto da = oda.value_or(query::digest_algorithm::MD5); - auto max_size = cinfo.retrieve_auxiliary("max_result_size"); - return do_with(std::move(pr), get_local_shared_storage_proxy(), std::move(trace_state_ptr), [&cinfo, cmd = make_lw_shared(std::move(cmd)), src_addr = std::move(src_addr), da, max_size, t] (::compat::wrapping_partition_range& pr, shared_ptr& p, tracing::trace_state_ptr& trace_state_ptr) mutable { + if (!cmd.max_result_size) { + cmd.max_result_size.emplace(cinfo.retrieve_auxiliary("max_result_size")); + } + return do_with(std::move(pr), get_local_shared_storage_proxy(), std::move(trace_state_ptr), [&cinfo, cmd = make_lw_shared(std::move(cmd)), src_addr = std::move(src_addr), da, t] (::compat::wrapping_partition_range& pr, shared_ptr& p, tracing::trace_state_ptr& trace_state_ptr) mutable { p->get_stats().replica_digest_reads++; auto src_ip = src_addr.addr; - return get_schema_for_read(cmd->schema_version, std::move(src_addr)).then([cmd, &pr, &p, &trace_state_ptr, max_size, t, da] (schema_ptr s) { + return get_schema_for_read(cmd->schema_version, std::move(src_addr)).then([cmd, &pr, &p, &trace_state_ptr, t, da] (schema_ptr s) { auto pr2 = ::compat::unwrap(std::move(pr), *s); if (pr2.second) { // this function assumes singular queries but doesn't validate throw std::runtime_error("READ_DIGEST called with wrapping range"); } auto timeout = t ? 
*t : db::no_timeout; - return p->query_result_local_digest(std::move(s), cmd, std::move(pr2.first), trace_state_ptr, timeout, da, max_size); + return p->query_result_local_digest(std::move(s), cmd, std::move(pr2.first), trace_state_ptr, timeout, da); }).finally([&trace_state_ptr, src_ip] () mutable { tracing::trace(trace_state_ptr, "read_digest handling is done, sending a response to /{}", src_ip); }); @@ -5041,6 +5047,9 @@ void storage_proxy::init_messaging_service() { tracing::begin(tr_state); tracing::trace(tr_state, "paxos_prepare: message received from /{} ballot {}", src_ip, ballot); } + if (!cmd.max_result_size) { + cmd.max_result_size.emplace(cinfo.retrieve_auxiliary("max_result_size")); + } return get_schema_for_read(cmd.schema_version, src_addr).then([this, cmd = std::move(cmd), key = std::move(key), ballot, only_digest, da, timeout, tr_state = std::move(tr_state), src_ip] (schema_ptr schema) mutable { @@ -5153,29 +5162,29 @@ future<> storage_proxy::uninit_messaging_service() { future>, cache_temperature>> storage_proxy::query_mutations_locally(schema_ptr s, lw_shared_ptr cmd, const dht::partition_range& pr, storage_proxy::clock_type::time_point timeout, - tracing::trace_state_ptr trace_state, uint64_t max_size) { + tracing::trace_state_ptr trace_state) { if (pr.is_singular()) { unsigned shard = dht::shard_of(*s, pr.start()->value().token()); get_stats().replica_cross_shard_ops += shard != this_shard_id(); - return _db.invoke_on(shard, _read_smp_service_group, [max_size, cmd, &pr, gs=global_schema_ptr(s), timeout, gt = tracing::global_trace_state_ptr(std::move(trace_state))] (database& db) mutable { - return db.query_mutations(gs, *cmd, pr, max_size, gt, timeout).then([] (std::tuple result_ht) { + return _db.invoke_on(shard, _read_smp_service_group, [cmd, &pr, gs=global_schema_ptr(s), timeout, gt = tracing::global_trace_state_ptr(std::move(trace_state))] (database& db) mutable { + return db.query_mutations(gs, *cmd, pr, gt, timeout).then([] (std::tuple 
result_ht) { auto&& [result, ht] = result_ht; return make_ready_future>, cache_temperature>>(rpc::tuple(make_foreign(make_lw_shared(std::move(result))), ht)); }); }); } else { - return query_nonsingular_mutations_locally(std::move(s), std::move(cmd), {pr}, std::move(trace_state), max_size, timeout); + return query_nonsingular_mutations_locally(std::move(s), std::move(cmd), {pr}, std::move(trace_state), timeout); } } future>, cache_temperature>> storage_proxy::query_mutations_locally(schema_ptr s, lw_shared_ptr cmd, const ::compat::one_or_two_partition_ranges& pr, storage_proxy::clock_type::time_point timeout, - tracing::trace_state_ptr trace_state, uint64_t max_size) { + tracing::trace_state_ptr trace_state) { if (!pr.second) { - return query_mutations_locally(std::move(s), std::move(cmd), pr.first, timeout, std::move(trace_state), max_size); + return query_mutations_locally(std::move(s), std::move(cmd), pr.first, timeout, std::move(trace_state)); } else { - return query_nonsingular_mutations_locally(std::move(s), std::move(cmd), pr, std::move(trace_state), max_size, timeout); + return query_nonsingular_mutations_locally(std::move(s), std::move(cmd), pr, std::move(trace_state), timeout); } } @@ -5184,11 +5193,10 @@ storage_proxy::query_nonsingular_mutations_locally(schema_ptr s, lw_shared_ptr cmd, const dht::partition_range_vector&& prs, tracing::trace_state_ptr trace_state, - uint64_t max_size, storage_proxy::clock_type::time_point timeout) { - return do_with(cmd, std::move(prs), [this, max_size, timeout, s = std::move(s), trace_state = std::move(trace_state)] (lw_shared_ptr& cmd, + return do_with(cmd, std::move(prs), [this, timeout, s = std::move(s), trace_state = std::move(trace_state)] (lw_shared_ptr& cmd, const dht::partition_range_vector& prs) mutable { - return query_mutations_on_all_shards(_db, std::move(s), *cmd, prs, std::move(trace_state), max_size, timeout).then([] (std::tuple>, cache_temperature> t) { + return query_mutations_on_all_shards(_db, 
std::move(s), *cmd, prs, std::move(trace_state), timeout).then([] (std::tuple>, cache_temperature> t) { return make_ready_future>, cache_temperature>>(std::move(t)); }); }); diff --git a/service/storage_proxy.hh b/service/storage_proxy.hh index 13ed45a92e..6d728d5fba 100644 --- a/service/storage_proxy.hh +++ b/service/storage_proxy.hh @@ -360,13 +360,11 @@ private: future>, cache_temperature>> query_result_local(schema_ptr, lw_shared_ptr cmd, const dht::partition_range& pr, query::result_options opts, tracing::trace_state_ptr trace_state, - clock_type::time_point timeout, - uint64_t max_size = query::result_memory_limiter::maximum_result_size); + clock_type::time_point timeout); future> query_result_local_digest(schema_ptr, lw_shared_ptr cmd, const dht::partition_range& pr, tracing::trace_state_ptr trace_state, clock_type::time_point timeout, - query::digest_algorithm da, - uint64_t max_size = query::result_memory_limiter::maximum_result_size); + query::digest_algorithm da); future query_partition_key_range(lw_shared_ptr cmd, dht::partition_range_vector partition_ranges, db::consistency_level cl, @@ -408,7 +406,7 @@ private: future<> mutate_internal(Range mutations, db::consistency_level cl, bool counter_write, tracing::trace_state_ptr tr_state, service_permit permit, std::optional timeout_opt = { }, lw_shared_ptr cdc_tracker = { }); future>, cache_temperature>> query_nonsingular_mutations_locally( schema_ptr s, lw_shared_ptr cmd, const dht::partition_range_vector&& pr, tracing::trace_state_ptr trace_state, - uint64_t max_size, clock_type::time_point timeout); + clock_type::time_point timeout); future<> mutate_counters_on_leader(std::vector mutations, db::consistency_level cl, clock_type::time_point timeout, tracing::trace_state_ptr trace_state, service_permit permit); @@ -560,21 +558,18 @@ public: future>, cache_temperature>> query_mutations_locally( schema_ptr, lw_shared_ptr cmd, const dht::partition_range&, clock_type::time_point timeout, - 
tracing::trace_state_ptr trace_state = nullptr, - uint64_t max_size = query::result_memory_limiter::maximum_result_size); + tracing::trace_state_ptr trace_state = nullptr); future>, cache_temperature>> query_mutations_locally( schema_ptr, lw_shared_ptr cmd, const ::compat::one_or_two_partition_ranges&, clock_type::time_point timeout, - tracing::trace_state_ptr trace_state = nullptr, - uint64_t max_size = query::result_memory_limiter::maximum_result_size); + tracing::trace_state_ptr trace_state = nullptr); future>, cache_temperature>> query_mutations_locally( schema_ptr s, lw_shared_ptr cmd, const dht::partition_range_vector& pr, clock_type::time_point timeout, - tracing::trace_state_ptr trace_state = nullptr, - uint64_t max_size = query::result_memory_limiter::maximum_result_size); + tracing::trace_state_ptr trace_state = nullptr); future cas(schema_ptr schema, shared_ptr request, lw_shared_ptr cmd, dht::partition_range_vector&& partition_ranges, coordinator_query_options query_options, diff --git a/table.cc b/table.cc index 23ccb61441..c5323278bb 100644 --- a/table.cc +++ b/table.cc @@ -2037,13 +2037,12 @@ table::query(schema_ptr s, const dht::partition_range_vector& partition_ranges, tracing::trace_state_ptr trace_state, query::result_memory_limiter& memory_limiter, - uint64_t max_size, db::timeout_clock::time_point timeout, query::querier_cache_context cache_ctx) { utils::latency_counter lc; _stats.reads.set_latency(lc); auto f = opts.request == query::result_request::only_digest - ? memory_limiter.new_digest_read(query::max_result_size(max_size)) : memory_limiter.new_data_read(query::max_result_size(max_size)); + ? 
memory_limiter.new_digest_read(*cmd.max_result_size) : memory_limiter.new_data_read(*cmd.max_result_size); return f.then([this, lc, s = std::move(s), &cmd, class_config, opts, &partition_ranges, trace_state = std::move(trace_state), timeout, cache_ctx = std::move(cache_ctx)] (query::result_memory_accounter accounter) mutable { auto qs_ptr = std::make_unique(std::move(s), cmd, opts, partition_ranges, std::move(accounter)); diff --git a/test/boost/database_test.cc b/test/boost/database_test.cc index a2ef47712e..e36949d40c 100644 --- a/test/boost/database_test.cc +++ b/test/boost/database_test.cc @@ -63,7 +63,7 @@ SEASTAR_TEST_CASE(test_safety_after_truncate) { auto assert_query_result = [&] (size_t expected_size) { auto max_size = std::numeric_limits::max(); auto cmd = query::read_command(s->id(), s->version(), partition_slice_builder(*s).build(), query::max_result_size(max_size), query::row_limit(1000)); - auto&& [result, cache_tempature] = db.query(s, cmd, query::result_options::only_result(), pranges, nullptr, max_size, db::no_timeout).get0(); + auto&& [result, cache_tempature] = db.query(s, cmd, query::result_options::only_result(), pranges, nullptr, db::no_timeout).get0(); assert_that(query::result_set::from_raw_result(s, cmd.slice, *result)).has_size(expected_size); }; assert_query_result(1000); @@ -106,21 +106,21 @@ SEASTAR_TEST_CASE(test_querying_with_limits) { auto max_size = std::numeric_limits::max(); { auto cmd = query::read_command(s->id(), s->version(), partition_slice_builder(*s).build(), query::max_result_size(max_size), query::row_limit(3)); - auto result = std::get<0>(db.query(s, cmd, query::result_options::only_result(), pranges, nullptr, max_size, db::no_timeout).get0()); + auto result = std::get<0>(db.query(s, cmd, query::result_options::only_result(), pranges, nullptr, db::no_timeout).get0()); assert_that(query::result_set::from_raw_result(s, cmd.slice, *result)).has_size(3); } { auto cmd = query::read_command(s->id(), s->version(), 
partition_slice_builder(*s).build(), query::max_result_size(max_size), query::row_limit(query::max_rows), query::partition_limit(5)); - auto result = std::get<0>(db.query(s, cmd, query::result_options::only_result(), pranges, nullptr, max_size, db::no_timeout).get0()); + auto result = std::get<0>(db.query(s, cmd, query::result_options::only_result(), pranges, nullptr, db::no_timeout).get0()); assert_that(query::result_set::from_raw_result(s, cmd.slice, *result)).has_size(5); } { auto cmd = query::read_command(s->id(), s->version(), partition_slice_builder(*s).build(), query::max_result_size(max_size), query::row_limit(query::max_rows), query::partition_limit(3)); - auto result = std::get<0>(db.query(s, cmd, query::result_options::only_result(), pranges, nullptr, max_size, db::no_timeout).get0()); + auto result = std::get<0>(db.query(s, cmd, query::result_options::only_result(), pranges, nullptr, db::no_timeout).get0()); assert_that(query::result_set::from_raw_result(s, cmd.slice, *result)).has_size(3); } }); diff --git a/test/boost/multishard_mutation_query_test.cc b/test/boost/multishard_mutation_query_test.cc index a4b3651797..9e43b2d166 100644 --- a/test/boost/multishard_mutation_query_test.cc +++ b/test/boost/multishard_mutation_query_test.cc @@ -84,7 +84,7 @@ SEASTAR_THREAD_TEST_CASE(test_abandoned_read) { auto cmd = query::read_command(s->id(), s->version(), s->full_slice(), 7, gc_clock::now(), std::nullopt, query::max_partitions, utils::make_random_uuid(), query::is_first_page::yes, query::max_result_size(query::result_memory_limiter::unlimited_result_size)); - query_mutations_on_all_shards(env.db(), s, cmd, {query::full_partition_range}, nullptr, std::numeric_limits::max(), db::no_timeout).get(); + query_mutations_on_all_shards(env.db(), s, cmd, {query::full_partition_range}, nullptr, db::no_timeout).get(); check_cache_population(env.db(), 1); @@ -108,7 +108,7 @@ static std::vector read_all_partitions_one_by_one(distributed( - 
std::get<0>(db.query_mutations(std::move(s), cmd, range, std::numeric_limits::max(), nullptr, db::no_timeout).get0()))); + std::get<0>(db.query_mutations(std::move(s), cmd, range, nullptr, db::no_timeout).get0()))); }); }).get0(); @@ -133,7 +133,7 @@ read_partitions_with_paged_scan(distributed& db, schema_ptr s, uint32_ // First page is special, needs to have `is_first_page` set. { - auto res = std::get<0>(query_mutations_on_all_shards(db, s, cmd, {range}, nullptr, max_size, db::no_timeout).get0()); + auto res = std::get<0>(query_mutations_on_all_shards(db, s, cmd, {range}, nullptr, db::no_timeout).get0()); for (auto& part : res->partitions()) { auto mut = part.mut().unfreeze(s); results.emplace_back(std::move(mut)); @@ -177,7 +177,7 @@ read_partitions_with_paged_scan(distributed& db, schema_ptr s, uint32_ cmd.slice.set_range(*s, last_pkey.key(), std::move(ckranges)); } - auto res = std::get<0>(query_mutations_on_all_shards(db, s, cmd, {pkrange}, nullptr, max_size, db::no_timeout).get0()); + auto res = std::get<0>(query_mutations_on_all_shards(db, s, cmd, {pkrange}, nullptr, db::no_timeout).get0()); if (is_stateful) { BOOST_REQUIRE(aggregate_querier_cache_stat(db, &query::querier_cache::stats::lookups) >= npages); diff --git a/test/boost/querier_cache_test.cc b/test/boost/querier_cache_test.cc index 93eeff994a..63940fadca 100644 --- a/test/boost/querier_cache_test.cc +++ b/test/boost/querier_cache_test.cc @@ -674,7 +674,6 @@ SEASTAR_THREAD_TEST_CASE(test_resources_based_cache_eviction) { db.query_mutations(s, cmd1, query::full_partition_range, - 1024 * 1024, nullptr, db::no_timeout).get(); @@ -710,7 +709,6 @@ SEASTAR_THREAD_TEST_CASE(test_resources_based_cache_eviction) { db.query_mutations(s, cmd2, query::full_partition_range, - 1024 * 1024, nullptr, db::no_timeout).get(); @@ -724,10 +722,10 @@ SEASTAR_THREAD_TEST_CASE(test_resources_based_cache_eviction) { // of the tracked buffers. 
cmd2.row_limit = query::max_rows; cmd2.partition_limit = query::max_partitions; + cmd2.max_result_size.emplace(query::result_memory_limiter::unlimited_result_size); db.query_mutations(s, cmd2, query::full_partition_range, - query::result_memory_limiter::unlimited_result_size, nullptr, db::no_timeout).get(); return make_ready_future<>(); From 9eab5bca27be01e8d2db5ac49e7a8339a283be03 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Botond=20D=C3=A9nes?= Date: Thu, 28 May 2020 12:21:37 +0300 Subject: [PATCH 19/26] query_*(): use the coordinator specified memory limit for unlimited queries It is important that all replicas participating in a read use the same memory limits to avoid artificial differences due to different amount of results. The coordinator now passes down its own memory limit for reads, in the form of max_result_size (or max_size). For unpaged or reverse queries this has to be used now instead of the locally set max_memory_unlimited_query configuration item. To avoid the replicas accidentally using the local limit contained in the `query_class_config` returned from `database::make_query_class_config()`, we refactor the latter into `database::get_reader_concurrency_semaphore()`. Most of its callers were only interested in the semaphore only anyway and those that were interested in the limit as well should get it from the coordinator instead, so this refactoring is a win-win. 
--- database.cc | 19 ++++++++++--------- database.hh | 4 +++- db/view/view.cc | 2 +- db/view/view_update_generator.cc | 2 +- multishard_mutation_query.cc | 11 +++++------ test/boost/multishard_mutation_query_test.cc | 2 +- test/boost/querier_cache_test.cc | 2 +- test/boost/view_build_test.cc | 2 +- test/manual/sstable_scan_footprint_test.cc | 2 +- 9 files changed, 24 insertions(+), 22 deletions(-) diff --git a/database.cc b/database.cc index 5a7b79f498..1a6bdbf0b8 100644 --- a/database.cc +++ b/database.cc @@ -1180,11 +1180,12 @@ future, cache_temperature>> database::query(schema_ptr s, const query::read_command& cmd, query::result_options opts, const dht::partition_range_vector& ranges, tracing::trace_state_ptr trace_state, db::timeout_clock::time_point timeout) { column_family& cf = find_column_family(cmd.cf_id); + auto class_config = query::query_class_config{.semaphore = get_reader_concurrency_semaphore(), .max_memory_for_unlimited_query = *cmd.max_result_size}; query::querier_cache_context cache_ctx(_querier_cache, cmd.query_uuid, cmd.is_first_page); return _data_query_stage(&cf, std::move(s), seastar::cref(cmd), - make_query_class_config(), + class_config, opts, seastar::cref(ranges), std::move(trace_state), @@ -1209,6 +1210,7 @@ database::query_mutations(schema_ptr s, const query::read_command& cmd, const dh return get_result_memory_limiter().new_mutation_read(*cmd.max_result_size).then( [&, s = std::move(s), trace_state = std::move(trace_state), timeout] (query::result_memory_accounter accounter) { column_family& cf = find_column_family(cmd.cf_id); + auto class_config = query::query_class_config{.semaphore = get_reader_concurrency_semaphore(), .max_memory_for_unlimited_query = *cmd.max_result_size}; query::querier_cache_context cache_ctx(_querier_cache, cmd.query_uuid, cmd.is_first_page); return _mutation_query_stage(std::move(s), cf.as_mutation_source(), @@ -1218,7 +1220,7 @@ database::query_mutations(schema_ptr s, const query::read_command& cmd, const dh 
cmd.partition_limit, cmd.timestamp, timeout, - make_query_class_config(), + class_config, std::move(accounter), std::move(trace_state), std::move(cache_ctx)).then_wrapped([this, s = _stats, hit_rate = cf.get_global_cache_hit_rate(), op = cf.read_in_progress()] (auto f) { @@ -1281,17 +1283,16 @@ void database::register_connection_drop_notifier(netw::messaging_service& ms) { }); } -query::query_class_config database::make_query_class_config() { +reader_concurrency_semaphore& database::get_reader_concurrency_semaphore() { // Everything running in the statement group is considered a user query if (current_scheduling_group() == _dbcfg.statement_scheduling_group) { - return query::query_class_config{_read_concurrency_sem, - query::max_result_size(_cfg.max_memory_for_unlimited_query_soft_limit(), _cfg.max_memory_for_unlimited_query_hard_limit())}; + return _read_concurrency_sem; // Reads done on behalf of view update generation run in the streaming group } else if (current_scheduling_group() == _dbcfg.streaming_scheduling_group) { - return query::query_class_config{_streaming_concurrency_sem, query::max_result_size(std::numeric_limits::max())}; + return _streaming_concurrency_sem; // Everything else is considered a system query } else { - return query::query_class_config{_system_read_concurrency_sem, query::max_result_size(std::numeric_limits::max())}; + return _system_read_concurrency_sem; } } @@ -1351,7 +1352,7 @@ future database::do_apply_counter_update(column_family& cf, const froz // counter state for each modified cell... 
tracing::trace(trace_state, "Reading counter values from the CF"); - return counter_write_query(m_schema, cf.as_mutation_source(), make_query_class_config().semaphore.make_permit(), m.decorated_key(), slice, trace_state, timeout) + return counter_write_query(m_schema, cf.as_mutation_source(), get_reader_concurrency_semaphore().make_permit(), m.decorated_key(), slice, trace_state, timeout) .then([this, &cf, &m, m_schema, timeout, trace_state] (auto mopt) { // ...now, that we got existing state of all affected counter // cells we can look for our shard in each of them, increment @@ -1562,7 +1563,7 @@ future<> database::do_apply(schema_ptr s, const frozen_mutation& m, tracing::tra if (cf.views().empty()) { return apply_with_commitlog(std::move(s), cf, std::move(uuid), m, timeout, sync).finally([op = std::move(op)] { }); } - future f = cf.push_view_replica_updates(s, m, timeout, std::move(tr_state), make_query_class_config().semaphore); + future f = cf.push_view_replica_updates(s, m, timeout, std::move(tr_state), get_reader_concurrency_semaphore()); return f.then([this, s = std::move(s), uuid = std::move(uuid), &m, timeout, &cf, op = std::move(op), sync] (row_locker::lock_holder lock) mutable { return apply_with_commitlog(std::move(s), cf, std::move(uuid), m, timeout, sync).finally( // Hold the local lock on the base-table partition or row diff --git a/database.hh b/database.hh index 26b5e92055..c8540ff5cd 100644 --- a/database.hh +++ b/database.hh @@ -1592,7 +1592,9 @@ public: return _supports_infinite_bound_range_deletions; } - query::query_class_config make_query_class_config(); + // Get the reader concurrency semaphore, appropriate for the query class, + // which is deduced from the current scheduling group. 
+ reader_concurrency_semaphore& get_reader_concurrency_semaphore(); }; future<> start_large_data_handler(sharded& db); diff --git a/db/view/view.cc b/db/view/view.cc index 4b18d59f6d..3d0ee79134 100644 --- a/db/view/view.cc +++ b/db/view/view.cc @@ -1285,7 +1285,7 @@ view_builder::build_step& view_builder::get_or_create_build_step(utils::UUID bas void view_builder::initialize_reader_at_current_token(build_step& step) { step.pslice = make_partition_slice(*step.base->schema()); step.prange = dht::partition_range(dht::ring_position::starting_at(step.current_token()), dht::ring_position::max()); - auto permit = _db.make_query_class_config().semaphore.make_permit(); + auto permit = _db.get_reader_concurrency_semaphore().make_permit(); step.reader = make_local_shard_sstable_reader( step.base->schema(), std::move(permit), diff --git a/db/view/view_update_generator.cc b/db/view/view_update_generator.cc index 624d192e5f..0552b95d7a 100644 --- a/db/view/view_update_generator.cc +++ b/db/view/view_update_generator.cc @@ -78,7 +78,7 @@ future<> view_update_generator::start() { auto [staging_sstable_reader, staging_sstable_reader_handle] = make_manually_paused_evictable_reader( std::move(ms), s, - _db.make_query_class_config().semaphore.make_permit(), + _db.get_reader_concurrency_semaphore().make_permit(), query::full_partition_range, s->full_slice(), service::get_local_streaming_priority(), diff --git a/multishard_mutation_query.cc b/multishard_mutation_query.cc index fc10c97bc6..7e3516c32b 100644 --- a/multishard_mutation_query.cc +++ b/multishard_mutation_query.cc @@ -244,7 +244,7 @@ public: virtual reader_concurrency_semaphore& semaphore() override { const auto shard = this_shard_id(); if (!_semaphores[shard]) { - _semaphores[shard] = &_db.local().make_query_class_config().semaphore; + _semaphores[shard] = &_db.local().get_reader_concurrency_semaphore(); } return *_semaphores[shard]; } @@ -618,18 +618,17 @@ static future do_query_mutations( mutation_reader::forwarding 
fwd_mr) { return make_multishard_combining_reader(ctx, std::move(s), pr, ps, pc, std::move(trace_state), fwd_mr); }); - auto class_config = ctx->db().local().make_query_class_config(); - auto reader = make_flat_multi_range_reader(s, class_config.semaphore.make_permit(), std::move(ms), ranges, cmd.slice, - service::get_local_sstable_query_read_priority(), trace_state, mutation_reader::forwarding::no); + auto reader = make_flat_multi_range_reader(s, ctx->db().local().get_reader_concurrency_semaphore().make_permit(), std::move(ms), ranges, + cmd.slice, service::get_local_sstable_query_read_priority(), trace_state, mutation_reader::forwarding::no); auto compaction_state = make_lw_shared(*s, cmd.timestamp, cmd.slice, cmd.row_limit, cmd.partition_limit); - return do_with(std::move(reader), std::move(compaction_state), [&, class_config, accounter = std::move(accounter), timeout] ( + return do_with(std::move(reader), std::move(compaction_state), [&, accounter = std::move(accounter), timeout] ( flat_mutation_reader& reader, lw_shared_ptr& compaction_state) mutable { auto rrb = reconcilable_result_builder(*reader.schema(), cmd.slice, std::move(accounter)); return query::consume_page(reader, compaction_state, cmd.slice, std::move(rrb), cmd.row_limit, cmd.partition_limit, cmd.timestamp, - timeout, class_config.max_memory_for_unlimited_query).then([&] (consume_result&& result) mutable { + timeout, *cmd.max_result_size).then([&] (consume_result&& result) mutable { return make_ready_future(page_consume_result(std::move(result), reader.detach_buffer(), std::move(compaction_state))); }); }).then_wrapped([&ctx] (future&& result_fut) { diff --git a/test/boost/multishard_mutation_query_test.cc b/test/boost/multishard_mutation_query_test.cc index 9e43b2d166..295c14bc34 100644 --- a/test/boost/multishard_mutation_query_test.cc +++ b/test/boost/multishard_mutation_query_test.cc @@ -973,7 +973,7 @@ SEASTAR_THREAD_TEST_CASE(fuzzy_test) { const auto& partitions = pop_desc.partitions; 
smp::invoke_on_all([cfg, db = &env.db(), gs = global_schema_ptr(pop_desc.schema), &partitions] { - auto& sem = db->local().make_query_class_config().semaphore; + auto& sem = db->local().get_reader_concurrency_semaphore(); auto resources = sem.available_resources(); resources -= reader_concurrency_semaphore::resources{1, 0}; diff --git a/test/boost/querier_cache_test.cc b/test/boost/querier_cache_test.cc index 63940fadca..207ad883dd 100644 --- a/test/boost/querier_cache_test.cc +++ b/test/boost/querier_cache_test.cc @@ -677,7 +677,7 @@ SEASTAR_THREAD_TEST_CASE(test_resources_based_cache_eviction) { nullptr, db::no_timeout).get(); - auto& semaphore = db.make_query_class_config().semaphore; + auto& semaphore = db.get_reader_concurrency_semaphore(); auto permit = semaphore.make_permit(); BOOST_CHECK_EQUAL(db.get_querier_cache_stats().resource_based_evictions, 0); diff --git a/test/boost/view_build_test.cc b/test/boost/view_build_test.cc index 8c91b5217d..f4c39c9444 100644 --- a/test/boost/view_build_test.cc +++ b/test/boost/view_build_test.cc @@ -554,7 +554,7 @@ SEASTAR_THREAD_TEST_CASE(test_view_update_generator_deadlock) { t->add_sstable_and_update_cache(sst).get(); auto& sem = *with_scheduling_group(e.local_db().get_streaming_scheduling_group(), [&] () { - return &e.local_db().make_query_class_config().semaphore; + return &e.local_db().get_reader_concurrency_semaphore(); }).get0(); // consume all units except what is needed to admit a single reader. 
diff --git a/test/manual/sstable_scan_footprint_test.cc b/test/manual/sstable_scan_footprint_test.cc index 99287ef49f..34edf85145 100644 --- a/test/manual/sstable_scan_footprint_test.cc +++ b/test/manual/sstable_scan_footprint_test.cc @@ -308,7 +308,7 @@ int main(int argc, char** argv) { auto prev_occupancy = logalloc::shard_tracker().occupancy(); testlog.info("Occupancy before: {}", prev_occupancy); - auto& sem = env.local_db().make_query_class_config().semaphore; + auto& sem = env.local_db().get_reader_concurrency_semaphore(); testlog.info("Reading"); stats_collector sc(sem, stats_collector_params); From 6660a5df511a3727fa0b8cc15149f56a3af5006e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Botond=20D=C3=A9nes?= Date: Thu, 28 May 2020 15:55:04 +0300 Subject: [PATCH 20/26] result_memory_accounter: remove default constructor If somebody wants to bypass proper memory accounting they should at the very least be forced to consider if that is indeed wise and think a second about the limit they want to apply. 
--- mutation.cc | 6 ++-- mutation.hh | 2 ++ mutation_query.cc | 3 +- mutation_query.hh | 2 +- query-result-set.cc | 2 +- query-result.hh | 5 +++- table.cc | 2 +- test/boost/mutation_query_test.cc | 46 ++++++++++++++++-------------- test/boost/mutation_test.cc | 24 ++++++++++------ test/perf/memory_footprint_test.cc | 3 +- 10 files changed, 58 insertions(+), 37 deletions(-) diff --git a/mutation.cc b/mutation.cc index 0f68421695..3260ec97c8 100644 --- a/mutation.cc +++ b/mutation.cc @@ -122,20 +122,22 @@ mutation::query(query::result::builder& builder, query::result mutation::query(const query::partition_slice& slice, + query::result_memory_accounter&& accounter, query::result_options opts, gc_clock::time_point now, uint32_t row_limit) && { - query::result::builder builder(slice, opts, { }); + query::result::builder builder(slice, opts, std::move(accounter)); std::move(*this).query(builder, slice, now, row_limit); return builder.build(); } query::result mutation::query(const query::partition_slice& slice, + query::result_memory_accounter&& accounter, query::result_options opts, gc_clock::time_point now, uint32_t row_limit) const& { - return mutation(*this).query(slice, opts, now, row_limit); + return mutation(*this).query(slice, std::move(accounter), opts, now, row_limit); } size_t diff --git a/mutation.hh b/mutation.hh index 8baee800a8..e85ed56bd6 100644 --- a/mutation.hh +++ b/mutation.hh @@ -108,6 +108,7 @@ public: public: // The supplied partition_slice must be governed by this mutation's schema query::result query(const query::partition_slice&, + query::result_memory_accounter&& accounter, query::result_options opts = query::result_options::only_result(), gc_clock::time_point now = gc_clock::now(), uint32_t row_limit = query::max_rows) &&; @@ -115,6 +116,7 @@ public: // The supplied partition_slice must be governed by this mutation's schema // FIXME: Slower than the r-value version query::result query(const query::partition_slice&, + 
query::result_memory_accounter&& accounter, query::result_options opts = query::result_options::only_result(), gc_clock::time_point now = gc_clock::now(), uint32_t row_limit = query::max_rows) const&; diff --git a/mutation_query.cc b/mutation_query.cc index 8c9dffa71c..530c987978 100644 --- a/mutation_query.cc +++ b/mutation_query.cc @@ -58,7 +58,8 @@ bool reconcilable_result::operator!=(const reconcilable_result& other) const { query::result to_data_query_result(const reconcilable_result& r, schema_ptr s, const query::partition_slice& slice, uint32_t max_rows, uint32_t max_partitions, query::result_options opts) { - query::result::builder builder(slice, opts, { }); + // This result was already built with a limit, don't apply another one. + query::result::builder builder(slice, opts, query::result_memory_accounter{ query::result_memory_limiter::unlimited_result_size }); for (const partition& p : r.partitions()) { if (builder.row_count() >= max_rows || builder.partition_count() >= max_partitions) { break; diff --git a/mutation_query.hh b/mutation_query.hh index dbd0374b09..0bdb68bae5 100644 --- a/mutation_query.hh +++ b/mutation_query.hh @@ -164,7 +164,7 @@ future mutation_query( gc_clock::time_point query_time, db::timeout_clock::time_point timeout, query::query_class_config class_config, - query::result_memory_accounter&& accounter = { }, + query::result_memory_accounter&& accounter, tracing::trace_state_ptr trace_ptr = nullptr, query::querier_cache_context cache_ctx = { }); diff --git a/query-result-set.cc b/query-result-set.cc index fdd976c5e5..075c423155 100644 --- a/query-result-set.cc +++ b/query-result-set.cc @@ -225,7 +225,7 @@ result_set::from_raw_result(schema_ptr s, const partition_slice& slice, const re result_set::result_set(const mutation& m) : result_set([&m] { auto slice = partition_slice_builder(*m.schema()).build(); - auto qr = mutation(m).query(slice, result_options::only_result()); + auto qr = mutation(m).query(slice, 
query::result_memory_accounter{ query::result_memory_limiter::unlimited_result_size }, result_options::only_result()); return result_set::from_raw_result(m.schema(), slice, qr); }()) { } diff --git a/query-result.hh b/query-result.hh index 2dc70377ba..749c51cefe 100644 --- a/query-result.hh +++ b/query-result.hh @@ -152,7 +152,10 @@ private: friend class result_memory_limiter; public: - result_memory_accounter() = default; + explicit result_memory_accounter(size_t max_size) noexcept + : _blocked_bytes(0) + , _maximum_result_size(max_size) { + } result_memory_accounter(result_memory_accounter&& other) noexcept : _limiter(std::exchange(other._limiter, nullptr)) diff --git a/table.cc b/table.cc index c5323278bb..9418a820f7 100644 --- a/table.cc +++ b/table.cc @@ -2001,7 +2001,7 @@ struct query_state { const query::read_command& cmd, query::result_options opts, const dht::partition_range_vector& ranges, - query::result_memory_accounter memory_accounter = { }) + query::result_memory_accounter memory_accounter) : schema(std::move(s)) , cmd(cmd) , builder(cmd.slice, opts, std::move(memory_accounter)) diff --git a/test/boost/mutation_query_test.cc b/test/boost/mutation_query_test.cc index 1c268759a5..133bbc885b 100644 --- a/test/boost/mutation_query_test.cc +++ b/test/boost/mutation_query_test.cc @@ -78,6 +78,10 @@ static query::partition_slice make_full_slice(const schema& s) { static auto inf32 = std::numeric_limits::max(); +static query::result_memory_accounter make_accounter() { + return query::result_memory_accounter{ query::result_memory_limiter::unlimited_result_size }; +} + query::result_set to_result_set(const reconcilable_result& r, schema_ptr s, const query::partition_slice& slice) { return query::result_set::from_raw_result(s, slice, to_data_query_result(r, s, slice, inf32, inf32)); } @@ -101,7 +105,7 @@ SEASTAR_TEST_CASE(test_reading_from_single_partition) { auto slice = make_full_slice(*s); reconcilable_result result = mutation_query(s, src, - 
query::full_partition_range, slice, 2, query::max_partitions, now, db::no_timeout, tests::make_query_class_config()).get0(); + query::full_partition_range, slice, 2, query::max_partitions, now, db::no_timeout, tests::make_query_class_config(), make_accounter()).get0(); // FIXME: use mutation assertions assert_that(to_result_set(result, s, slice)) @@ -124,7 +128,7 @@ SEASTAR_TEST_CASE(test_reading_from_single_partition) { .build(); reconcilable_result result = mutation_query(s, src, - query::full_partition_range, slice, query::max_rows, query::max_partitions, now, db::no_timeout, tests::make_query_class_config()).get0(); + query::full_partition_range, slice, query::max_rows, query::max_partitions, now, db::no_timeout, tests::make_query_class_config(), make_accounter()).get0(); assert_that(to_result_set(result, s, slice)) .has_only(a_row() @@ -160,7 +164,7 @@ SEASTAR_TEST_CASE(test_cells_are_expired_according_to_query_timestamp) { auto slice = make_full_slice(*s); reconcilable_result result = mutation_query(s, src, - query::full_partition_range, slice, 1, query::max_partitions, now, db::no_timeout, tests::make_query_class_config()).get0(); + query::full_partition_range, slice, 1, query::max_partitions, now, db::no_timeout, tests::make_query_class_config(), make_accounter()).get0(); assert_that(to_result_set(result, s, slice)) .has_only(a_row() @@ -174,7 +178,7 @@ SEASTAR_TEST_CASE(test_cells_are_expired_according_to_query_timestamp) { auto slice = make_full_slice(*s); reconcilable_result result = mutation_query(s, src, - query::full_partition_range, slice, 1, query::max_partitions, now + 2s, db::no_timeout, tests::make_query_class_config()).get0(); + query::full_partition_range, slice, 1, query::max_partitions, now + 2s, db::no_timeout, tests::make_query_class_config(), make_accounter()).get0(); assert_that(to_result_set(result, s, slice)) .has_only(a_row() @@ -207,7 +211,7 @@ SEASTAR_TEST_CASE(test_reverse_ordering_is_respected) { .build(); reconcilable_result 
result = mutation_query(s, src, - query::full_partition_range, slice, 3, query::max_partitions, now, db::no_timeout, tests::make_query_class_config()).get0(); + query::full_partition_range, slice, 3, query::max_partitions, now, db::no_timeout, tests::make_query_class_config(), make_accounter()).get0(); assert_that(to_result_set(result, s, slice)) .has_size(3) @@ -237,7 +241,7 @@ SEASTAR_TEST_CASE(test_reverse_ordering_is_respected) { .build(); reconcilable_result result = mutation_query(s, src, - query::full_partition_range, slice, 3, query::max_partitions, now, db::no_timeout, tests::make_query_class_config()).get0(); + query::full_partition_range, slice, 3, query::max_partitions, now, db::no_timeout, tests::make_query_class_config(), make_accounter()).get0(); assert_that(to_result_set(result, s, slice)) .has_size(3) @@ -265,7 +269,7 @@ SEASTAR_TEST_CASE(test_reverse_ordering_is_respected) { { reconcilable_result result = mutation_query(s, src, - query::full_partition_range, slice, 10, query::max_partitions, now, db::no_timeout, tests::make_query_class_config()).get0(); + query::full_partition_range, slice, 10, query::max_partitions, now, db::no_timeout, tests::make_query_class_config(), make_accounter()).get0(); assert_that(to_result_set(result, s, slice)) .has_size(3) @@ -285,7 +289,7 @@ SEASTAR_TEST_CASE(test_reverse_ordering_is_respected) { { reconcilable_result result = mutation_query(s, src, - query::full_partition_range, slice, 1, query::max_partitions, now, db::no_timeout, tests::make_query_class_config()).get0(); + query::full_partition_range, slice, 1, query::max_partitions, now, db::no_timeout, tests::make_query_class_config(), make_accounter()).get0(); assert_that(to_result_set(result, s, slice)) .has_size(1) @@ -297,7 +301,7 @@ SEASTAR_TEST_CASE(test_reverse_ordering_is_respected) { { reconcilable_result result = mutation_query(s, src, - query::full_partition_range, slice, 2, query::max_partitions, now, db::no_timeout, 
tests::make_query_class_config()).get0(); + query::full_partition_range, slice, 2, query::max_partitions, now, db::no_timeout, tests::make_query_class_config(), make_accounter()).get0(); assert_that(to_result_set(result, s, slice)) .has_size(2) @@ -324,7 +328,7 @@ SEASTAR_TEST_CASE(test_reverse_ordering_is_respected) { .build(); reconcilable_result result = mutation_query(s, src, - query::full_partition_range, slice, 2, query::max_partitions, now, db::no_timeout, tests::make_query_class_config()).get0(); + query::full_partition_range, slice, 2, query::max_partitions, now, db::no_timeout, tests::make_query_class_config(), make_accounter()).get0(); assert_that(to_result_set(result, s, slice)) .has_size(2) @@ -348,7 +352,7 @@ SEASTAR_TEST_CASE(test_reverse_ordering_is_respected) { .build(); reconcilable_result result = mutation_query(s, src, - query::full_partition_range, slice, 3, query::max_partitions, now, db::no_timeout, tests::make_query_class_config()).get0(); + query::full_partition_range, slice, 3, query::max_partitions, now, db::no_timeout, tests::make_query_class_config(), make_accounter()).get0(); assert_that(to_result_set(result, s, slice)) .has_size(2) @@ -370,7 +374,7 @@ SEASTAR_TEST_CASE(test_reverse_ordering_is_respected) { .build(); reconcilable_result result = mutation_query(s, src, - query::full_partition_range, slice, 3, query::max_partitions, now, db::no_timeout, tests::make_query_class_config()).get0(); + query::full_partition_range, slice, 3, query::max_partitions, now, db::no_timeout, tests::make_query_class_config(), make_accounter()).get0(); assert_that(to_result_set(result, s, slice)) .has_only(a_row() @@ -396,7 +400,7 @@ SEASTAR_TEST_CASE(test_query_when_partition_tombstone_covers_live_cells) { auto slice = make_full_slice(*s); reconcilable_result result = mutation_query(s, src, - query::full_partition_range, slice, query::max_rows, query::max_partitions, now, db::no_timeout, tests::make_query_class_config()).get0(); + 
query::full_partition_range, slice, query::max_rows, query::max_partitions, now, db::no_timeout, tests::make_query_class_config(), make_accounter()).get0(); assert_that(to_result_set(result, s, slice)) .is_empty(); @@ -447,7 +451,7 @@ SEASTAR_TEST_CASE(test_partitions_with_only_expired_tombstones_are_dropped) { auto query_time = now + std::chrono::seconds(1); reconcilable_result result = mutation_query(s, src, query::full_partition_range, slice, query::max_rows, query::max_partitions, query_time, - db::no_timeout, tests::make_query_class_config()).get0(); + db::no_timeout, tests::make_query_class_config(), make_accounter()).get0(); BOOST_REQUIRE_EQUAL(result.partitions().size(), 2); BOOST_REQUIRE_EQUAL(result.row_count(), 2); @@ -466,28 +470,28 @@ SEASTAR_TEST_CASE(test_result_row_count) { auto src = make_source({m1}); auto r = to_data_query_result(mutation_query(s, make_source({m1}), query::full_partition_range, slice, 10000, query::max_partitions, now, - db::no_timeout, tests::make_query_class_config()).get0(), s, slice, inf32, inf32); + db::no_timeout, tests::make_query_class_config(), make_accounter()).get0(), s, slice, inf32, inf32); BOOST_REQUIRE_EQUAL(r.row_count().value(), 0); m1.set_static_cell("s1", data_value(bytes("S_v1")), 1); r = to_data_query_result(mutation_query(s, make_source({m1}), query::full_partition_range, slice, 10000, query::max_partitions, now, - db::no_timeout, tests::make_query_class_config()).get0(), s, slice, inf32, inf32); + db::no_timeout, tests::make_query_class_config(), make_accounter()).get0(), s, slice, inf32, inf32); BOOST_REQUIRE_EQUAL(r.row_count().value(), 1); m1.set_clustered_cell(clustering_key::from_single_value(*s, bytes("A")), "v1", data_value(bytes("A_v1")), 1); r = to_data_query_result(mutation_query(s, make_source({m1}), query::full_partition_range, slice, 10000, query::max_partitions, now, - db::no_timeout, tests::make_query_class_config()).get0(), s, slice, inf32, inf32); + db::no_timeout, 
tests::make_query_class_config(), make_accounter()).get0(), s, slice, inf32, inf32); BOOST_REQUIRE_EQUAL(r.row_count().value(), 1); m1.set_clustered_cell(clustering_key::from_single_value(*s, bytes("B")), "v1", data_value(bytes("B_v1")), 1); r = to_data_query_result(mutation_query(s, make_source({m1}), query::full_partition_range, slice, 10000, query::max_partitions, now, - db::no_timeout, tests::make_query_class_config()).get0(), s, slice, inf32, inf32); + db::no_timeout, tests::make_query_class_config(), make_accounter()).get0(), s, slice, inf32, inf32); BOOST_REQUIRE_EQUAL(r.row_count().value(), 2); mutation m2(s, partition_key::from_single_value(*s, "key2")); m2.set_static_cell("s1", data_value(bytes("S_v1")), 1); r = to_data_query_result(mutation_query(s, make_source({m1, m2}), query::full_partition_range, slice, 10000, query::max_partitions, now, - db::no_timeout, tests::make_query_class_config()).get0(), s, slice, inf32, inf32); + db::no_timeout, tests::make_query_class_config(), make_accounter()).get0(), s, slice, inf32, inf32); BOOST_REQUIRE_EQUAL(r.row_count().value(), 3); }); } @@ -510,7 +514,7 @@ SEASTAR_TEST_CASE(test_partition_limit) { { reconcilable_result result = mutation_query(s, src, - query::full_partition_range, slice, query::max_rows, 10, now, db::no_timeout, tests::make_query_class_config()).get0(); + query::full_partition_range, slice, query::max_rows, 10, now, db::no_timeout, tests::make_query_class_config(), make_accounter()).get0(); assert_that(to_result_set(result, s, slice)) .has_size(2) @@ -526,7 +530,7 @@ SEASTAR_TEST_CASE(test_partition_limit) { { reconcilable_result result = mutation_query(s, src, - query::full_partition_range, slice, query::max_rows, 1, now, db::no_timeout, tests::make_query_class_config()).get0(); + query::full_partition_range, slice, query::max_rows, 1, now, db::no_timeout, tests::make_query_class_config(), make_accounter()).get0(); assert_that(to_result_set(result, s, slice)) .has_size(1) diff --git 
a/test/boost/mutation_test.cc b/test/boost/mutation_test.cc index d29de47fc6..8d6bad425d 100644 --- a/test/boost/mutation_test.cc +++ b/test/boost/mutation_test.cc @@ -776,7 +776,8 @@ SEASTAR_TEST_CASE(test_querying_of_mutation) { auto resultify = [s] (const mutation& m) -> query::result_set { auto slice = make_full_slice(*s); - return query::result_set::from_raw_result(s, slice, m.query(slice)); + return query::result_set::from_raw_result(s, slice, + m.query(slice, query::result_memory_accounter{ query::result_memory_limiter::unlimited_result_size })); }; mutation m(s, partition_key::from_single_value(*s, "key1")); @@ -811,7 +812,8 @@ SEASTAR_TEST_CASE(test_partition_with_no_live_data_is_absent_in_data_query_resul auto slice = make_full_slice(*s); - assert_that(query::result_set::from_raw_result(s, slice, m.query(slice))) + assert_that(query::result_set::from_raw_result(s, slice, + m.query(slice, query::result_memory_accounter{ query::result_memory_limiter::unlimited_result_size }))) .is_empty(); }); } @@ -834,7 +836,8 @@ SEASTAR_TEST_CASE(test_partition_with_live_data_in_static_row_is_present_in_the_ .with_regular_column("v") .build(); - assert_that(query::result_set::from_raw_result(s, slice, m.query(slice))) + assert_that(query::result_set::from_raw_result(s, slice, + m.query(slice, query::result_memory_accounter{ query::result_memory_limiter::unlimited_result_size }))) .has_only(a_row() .with_column("pk", data_value(bytes("key1"))) .with_column("v", data_value::make_null(bytes_type))); @@ -857,7 +860,8 @@ SEASTAR_TEST_CASE(test_query_result_with_one_regular_column_missing) { auto slice = partition_slice_builder(*s).build(); - assert_that(query::result_set::from_raw_result(s, slice, m.query(slice))) + assert_that(query::result_set::from_raw_result(s, slice, + m.query(slice, query::result_memory_accounter{ query::result_memory_limiter::unlimited_result_size }))) .has_only(a_row() .with_column("pk", data_value(bytes("key1"))) .with_column("ck", 
data_value(bytes("ck:A"))) @@ -1243,8 +1247,10 @@ SEASTAR_TEST_CASE(test_query_digest) { auto check_digests_equal = [] (const mutation& m1, const mutation& m2) { auto ps1 = partition_slice_builder(*m1.schema()).build(); auto ps2 = partition_slice_builder(*m2.schema()).build(); - auto digest1 = *m1.query(ps1, query::result_options::only_digest(query::digest_algorithm::xxHash)).digest(); - auto digest2 = *m2.query(ps2, query::result_options::only_digest(query::digest_algorithm::xxHash)).digest(); + auto digest1 = *m1.query(ps1, query::result_memory_accounter{ query::result_memory_limiter::unlimited_result_size }, + query::result_options::only_digest(query::digest_algorithm::xxHash)).digest(); + auto digest2 = *m2.query(ps2, query::result_memory_accounter{ query::result_memory_limiter::unlimited_result_size }, + query::result_options::only_digest(query::digest_algorithm::xxHash)).digest(); if (digest1 != digest2) { BOOST_FAIL(format("Digest should be the same for {} and {}", m1, m2)); } @@ -1493,7 +1499,8 @@ SEASTAR_THREAD_TEST_CASE(test_querying_expired_rows) { .without_partition_key_columns() .build(); auto opts = query::result_options{query::result_request::result_and_digest, query::digest_algorithm::xxHash}; - return query::result_set::from_raw_result(s, slice, m.query(slice, opts, t)); + return query::result_set::from_raw_result(s, slice, + m.query(slice, query::result_memory_accounter{ query::result_memory_limiter::unlimited_result_size }, opts, t)); }; mutation m(s, pk); @@ -1557,7 +1564,8 @@ SEASTAR_TEST_CASE(test_querying_expired_cells) { .without_partition_key_columns() .build(); auto opts = query::result_options{query::result_request::result_and_digest, query::digest_algorithm::xxHash}; - return query::result_set::from_raw_result(s, slice, m.query(slice, opts, t)); + return query::result_set::from_raw_result(s, slice, + m.query(slice, query::result_memory_accounter{ query::result_memory_limiter::unlimited_result_size }, opts, t)); }; { diff --git 
a/test/perf/memory_footprint_test.cc b/test/perf/memory_footprint_test.cc index 104e7a5cb0..a34cbff7fc 100644 --- a/test/perf/memory_footprint_test.cc +++ b/test/perf/memory_footprint_test.cc @@ -196,7 +196,8 @@ static sizes calculate_sizes(cache_tracker& tracker, const mutation_settings& se result.cache = tracker.region().occupancy().used_space() - cache_initial_occupancy; result.frozen = freeze(m).representation().size(); result.canonical = canonical_mutation(m).representation().size(); - result.query_result = m.query(partition_slice_builder(*s).build(), query::result_options::only_result()).buf().size(); + result.query_result = m.query(partition_slice_builder(*s).build(), + query::result_memory_accounter{ query::result_memory_limiter::unlimited_result_size }, query::result_options::only_result()).buf().size(); tmpdir sstable_dir; sstables::test_env env; From d27f8321d7e4147bd18efe0a9e08fb342d954b24 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Botond=20D=C3=A9nes?= Date: Thu, 28 May 2020 15:57:40 +0300 Subject: [PATCH 21/26] partition_slice_builder: add with_option() --- partition_slice_builder.hh | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/partition_slice_builder.hh b/partition_slice_builder.hh index 2b3796bc1a..5337775377 100644 --- a/partition_slice_builder.hh +++ b/partition_slice_builder.hh @@ -54,6 +54,11 @@ public: partition_slice_builder& without_partition_key_columns(); partition_slice_builder& without_clustering_key_columns(); partition_slice_builder& reversed(); + template + partition_slice_builder& with_option() { + _options.set