From 003f5e9e54625bb1cf2c72de0a1ef93f4bd14d38 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Botond=20D=C3=A9nes?= Date: Mon, 27 Jul 2020 14:24:39 +0300 Subject: [PATCH 01/26] utils: config: add alias support Allow configuration items to also have an alias, besides the name. This allows easy replacement of configuration items, with newer names, while still supporting the old name for backward compatibility. The alias mechanism takes care of registering both the name and the alias as command line arguments, as well as parsing them from YAML. The command line documentation of the alias will just refer to the name for documentation. --- utils/config_file.cc | 13 ++++++++++++- utils/config_file.hh | 30 ++++++++++++++++++++++++------ utils/config_file_impl.hh | 8 +++++++- 3 files changed, 43 insertions(+), 8 deletions(-) diff --git a/utils/config_file.cc b/utils/config_file.cc index 843343e4ec..a366e0b487 100644 --- a/utils/config_file.cc +++ b/utils/config_file.cc @@ -210,6 +210,17 @@ utils::config_type::to_json(const void* value) const { return _to_json(value); } +bool +utils::config_file::config_src::matches(std::string_view name) const { + if (_name == name) { + return true; + } + if (!_alias.empty() && _alias == name) { + return true; + } + return false; +} + json::json_return_type utils::config_file::config_src::value_as_json() const { return _type->to_json(current_value()); @@ -291,7 +302,7 @@ void utils::config_file::read_from_yaml(const char* yaml, error_handler h) { for (auto node : doc) { auto label = node.first.as(); - auto i = std::find_if(_cfgs.begin(), _cfgs.end(), [&label](const config_src& cfg) { return cfg.name() == label; }); + auto i = std::find_if(_cfgs.begin(), _cfgs.end(), [&label](const config_src& cfg) { return cfg.matches(label); }); if (i == _cfgs.end()) { h(label, "Unknown option", std::nullopt); continue; diff --git a/utils/config_file.hh b/utils/config_file.hh index b7390f12ec..0220ed96c1 100644 --- a/utils/config_file.hh +++ 
b/utils/config_file.hh @@ -98,7 +98,7 @@ public: struct config_src { config_file* _cf; - std::string_view _name, _desc; + std::string_view _name, _alias, _desc; const config_type* _type; size_t _per_shard_values_offset; protected: @@ -110,11 +110,21 @@ public: , _desc(desc) , _type(type) {} + config_src(config_file* cf, std::string_view name, std::string_view alias, const config_type* type, std::string_view desc) + : _cf(cf) + , _name(name) + , _alias(alias) + , _desc(desc) + , _type(type) + {} virtual ~config_src() {} const std::string_view & name() const { return _name; } + std::string_view alias() const { + return _alias; + } const std::string_view & desc() const { return _desc; } @@ -124,6 +134,7 @@ public: config_file * get_config_file() const { return _cf; } + bool matches(std::string_view name) const; virtual void add_command_line_option( bpo::options_description_easy_init&, const std::string_view&, const std::string_view&) = 0; @@ -168,18 +179,25 @@ public: typedef T type; typedef named_value MyType; - named_value(config_file* file, std::string_view name, liveness liveness_, value_status vs, const T& t = T(), std::string_view desc = {}, + named_value(config_file* file, std::string_view name, std::string_view alias, liveness liveness_, value_status vs, const T& t = T(), std::string_view desc = {}, std::initializer_list allowed_values = {}) - : config_src(file, name, &config_type_for, desc) + : config_src(file, name, alias, &config_type_for, desc) , _value_status(vs) , _liveness(liveness_) - , _allowed_values(std::move(allowed_values)) - { + , _allowed_values(std::move(allowed_values)) { file->add(*this, std::make_unique(std::move(t))); } + named_value(config_file* file, std::string_view name, liveness liveness_, value_status vs, const T& t = T(), std::string_view desc = {}, + std::initializer_list allowed_values = {}) + : named_value(file, name, {}, liveness_, vs, t, desc) { + } + named_value(config_file* file, std::string_view name, std::string_view alias, 
value_status vs, const T& t = T(), std::string_view desc = {}, + std::initializer_list allowed_values = {}) + : named_value(file, name, alias, liveness::MustRestart, vs, t, desc, allowed_values) { + } named_value(config_file* file, std::string_view name, value_status vs, const T& t = T(), std::string_view desc = {}, std::initializer_list allowed_values = {}) - : named_value(file, name, liveness::MustRestart, vs, t, desc, allowed_values) { + : named_value(file, name, {}, liveness::MustRestart, vs, t, desc, allowed_values) { } value_status status() const override { return _value_status; diff --git a/utils/config_file_impl.hh b/utils/config_file_impl.hh index c898ecad40..f6b3a2670a 100644 --- a/utils/config_file_impl.hh +++ b/utils/config_file_impl.hh @@ -195,10 +195,16 @@ template void utils::config_file::named_value::add_command_line_option( boost::program_options::options_description_easy_init& init, const std::string_view& name, const std::string_view& desc) { + const auto hyphenated_name = hyphenate(name); // NOTE. We are not adding default values. We could, but must in that case manually (in some way) geenrate the textual // version, since the available ostream operators for things like pairs and collections don't match what we can deal with parser-wise. // See removed ostream operators above. 
- init(hyphenate(name).data(), value_ex()->notifier([this](T new_val) { set(std::move(new_val), config_source::CommandLine); }), desc.data()); + init(hyphenated_name.data(), value_ex()->notifier([this](T new_val) { set(std::move(new_val), config_source::CommandLine); }), desc.data()); + + if (!alias().empty()) { + const auto alias_desc = fmt::format("Alias for {}", hyphenated_name); + init(hyphenate(alias()).data(), value_ex()->notifier([this](T new_val) { set(std::move(new_val), config_source::CommandLine); }), alias_desc.data()); + } } template From dc23736d0c5d43d551663b34e99648092286dcaf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Botond=20D=C3=A9nes?= Date: Mon, 27 Jul 2020 15:35:00 +0300 Subject: [PATCH 02/26] db/config: replace ad-hoc aliases with alias mechanism We already use aliases for some configuration items, although these are created with an ad-hoc mechanism that only registers them on the command line. Replace this with the built-in alias mechanism in the previous patch, which has the benefit of conflict resolution and also working with YAML. --- db/config.cc | 17 +++-------------- db/config.hh | 3 --- 2 files changed, 3 insertions(+), 17 deletions(-) diff --git a/db/config.cc b/db/config.cc index b3af507875..6c1bfcde4e 100644 --- a/db/config.cc +++ b/db/config.cc @@ -224,7 +224,7 @@ db::config::config(std::shared_ptr exts) "The directory in which Scylla will put all its subdirectories. The location of individual subdirs can be overriden by the respective *_directory options.") , commitlog_directory(this, "commitlog_directory", value_status::Used, "", "The directory where the commit log is stored. 
For optimal write performance, it is recommended the commit log be on a separate disk partition (ideally, a separate physical device) from the data file directories.") - , data_file_directories(this, "data_file_directories", value_status::Used, { }, + , data_file_directories(this, "data_file_directories", "datadir", value_status::Used, { }, "The directory location where table data (SSTables) is stored") , hints_directory(this, "hints_directory", value_status::Used, "", "The directory where hints files are stored if hinted handoff is enabled.") @@ -517,7 +517,7 @@ db::config::config(std::shared_ptr exts) /* Native transport (CQL Binary Protocol) */ , start_native_transport(this, "start_native_transport", value_status::Used, true, "Enable or disable the native transport server. Uses the same address as the rpc_address, but the port is different from the rpc_port. See native_transport_port.") - , native_transport_port(this, "native_transport_port", value_status::Used, 9042, + , native_transport_port(this, "native_transport_port", "cql_port", value_status::Used, 9042, "Port on which the CQL native transport listens for clients.") , native_transport_port_ssl(this, "native_transport_port_ssl", value_status::Used, 9142, "Port on which the CQL TLS native transport listens for clients." @@ -536,7 +536,7 @@ db::config::config(std::shared_ptr exts) /* Settings for configuring and tuning client connections. */ , broadcast_rpc_address(this, "broadcast_rpc_address", value_status::Used, {/* unset */}, "RPC address to broadcast to drivers and other Scylla nodes. This cannot be set to 0.0.0.0. If blank, it is set to the value of the rpc_address or rpc_interface. 
If rpc_address or rpc_interfaceis set to 0.0.0.0, this property must be set.\n") - , rpc_port(this, "rpc_port", value_status::Used, 9160, + , rpc_port(this, "rpc_port", "thrift_port", value_status::Used, 9160, "Thrift port for client connections.") , start_rpc(this, "start_rpc", value_status::Used, true, "Starts the Thrift RPC server") @@ -830,17 +830,6 @@ void config_file::named_value::add_command_line_ } -boost::program_options::options_description_easy_init& -db::config::add_options(boost::program_options::options_description_easy_init& init) { - config_file::add_options(init); - - data_file_directories.add_command_line_option(init, "datadir", "alias for 'data-file-directories'"); - rpc_port.add_command_line_option(init, "thrift-port", "alias for 'rpc-port'"); - native_transport_port.add_command_line_option(init, "cql-port", "alias for 'native-transport-port'"); - - return init; -} - db::fs::path db::config::get_conf_dir() { using namespace db::fs; diff --git a/db/config.hh b/db/config.hh index d85d61ed68..86bb1d1bf6 100644 --- a/db/config.hh +++ b/db/config.hh @@ -329,9 +329,6 @@ public: seastar::logging_settings logging_settings(const boost::program_options::variables_map&) const; - boost::program_options::options_description_easy_init& - add_options(boost::program_options::options_description_easy_init&); - const db::extensions& extensions() const; static const sstring default_tls_priority; From 9faaf46d4b2df63d778091aba934e10b38195e58 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Botond=20D=C3=A9nes?= Date: Mon, 27 Jul 2020 15:42:49 +0300 Subject: [PATCH 03/26] utils: config_src::add_command_line_options(): drop name and desc args Now that there are no ad-hoc aliases needing to overwrite the name and description parameter of this method, we can drop these and have each config item just use `name()` and `desc()` to access these. 
--- db/config.cc | 11 +++++------ db/config.hh | 3 +-- utils/config_file.cc | 5 +---- utils/config_file.hh | 7 ++----- utils/config_file_impl.hh | 8 +++----- 5 files changed, 12 insertions(+), 22 deletions(-) diff --git a/db/config.cc b/db/config.cc index 6c1bfcde4e..ee27edc368 100644 --- a/db/config.cc +++ b/db/config.cc @@ -808,24 +808,23 @@ namespace utils { template<> void config_file::named_value::add_command_line_option( - boost::program_options::options_description_easy_init& init, - const std::string_view& name, const std::string_view& desc) { - init((hyphenate(name) + "-class-name").data(), + boost::program_options::options_description_easy_init& init) { + init((hyphenate(name()) + "-class-name").data(), value_ex()->notifier( [this](sstring new_class_name) { auto old_seed_provider = operator()(); old_seed_provider.class_name = new_class_name; set(std::move(old_seed_provider), config_source::CommandLine); }), - desc.data()); - init((hyphenate(name) + "-parameters").data(), + desc().data()); + init((hyphenate(name()) + "-parameters").data(), value_ex>()->notifier( [this](std::unordered_map new_parameters) { auto old_seed_provider = operator()(); old_seed_provider.parameters = new_parameters; set(std::move(old_seed_provider), config_source::CommandLine); }), - desc.data()); + desc().data()); } } diff --git a/db/config.hh b/db/config.hh index 86bb1d1bf6..3d225f987c 100644 --- a/db/config.hh +++ b/db/config.hh @@ -343,8 +343,7 @@ private: return this->is_set() ? (*this)() : t; } // do not add to boost::options. 
We only care about yaml config - void add_command_line_option(boost::program_options::options_description_easy_init&, - const std::string_view&, const std::string_view&) override {} + void add_command_line_option(boost::program_options::options_description_easy_init&) override {} }; log_legacy_value default_log_level; diff --git a/utils/config_file.cc b/utils/config_file.cc index a366e0b487..e68aaef20a 100644 --- a/utils/config_file.cc +++ b/utils/config_file.cc @@ -270,10 +270,7 @@ bpo::options_description_easy_init& utils::config_file::add_options(bpo::options_description_easy_init& init) { for (config_src& src : _cfgs) { if (src.status() == value_status::Used) { - auto&& name = src.name(); - sstring tmp(name.begin(), name.end()); - std::replace(tmp.begin(), tmp.end(), '_', '-'); - src.add_command_line_option(init, tmp, src.desc()); + src.add_command_line_option(init); } } return init; diff --git a/utils/config_file.hh b/utils/config_file.hh index 0220ed96c1..c9b8978c7f 100644 --- a/utils/config_file.hh +++ b/utils/config_file.hh @@ -135,9 +135,7 @@ public: return _cf; } bool matches(std::string_view name) const; - virtual void add_command_line_option( - bpo::options_description_easy_init&, const std::string_view&, - const std::string_view&) = 0; + virtual void add_command_line_option(bpo::options_description_easy_init&) = 0; virtual void set_value(const YAML::Node&) = 0; virtual value_status status() const = 0; virtual config_source source() const = 0; @@ -240,8 +238,7 @@ public: return the_value().observe(std::move(callback)); } - void add_command_line_option(bpo::options_description_easy_init&, - const std::string_view&, const std::string_view&) override; + void add_command_line_option(bpo::options_description_easy_init&) override; void set_value(const YAML::Node&) override; }; diff --git a/utils/config_file_impl.hh b/utils/config_file_impl.hh index f6b3a2670a..4049bfc40c 100644 --- a/utils/config_file_impl.hh +++ b/utils/config_file_impl.hh @@ -192,14 +192,12 
@@ sstring hyphenate(const std::string_view&); } template -void utils::config_file::named_value::add_command_line_option( - boost::program_options::options_description_easy_init& init, - const std::string_view& name, const std::string_view& desc) { - const auto hyphenated_name = hyphenate(name); +void utils::config_file::named_value::add_command_line_option(boost::program_options::options_description_easy_init& init) { + const auto hyphenated_name = hyphenate(name()); // NOTE. We are not adding default values. We could, but must in that case manually (in some way) geenrate the textual // version, since the available ostream operators for things like pairs and collections don't match what we can deal with parser-wise. // See removed ostream operators above. - init(hyphenated_name.data(), value_ex()->notifier([this](T new_val) { set(std::move(new_val), config_source::CommandLine); }), desc.data()); + init(hyphenated_name.data(), value_ex()->notifier([this](T new_val) { set(std::move(new_val), config_source::CommandLine); }), desc().data()); if (!alias().empty()) { const auto alias_desc = fmt::format("Alias for {}", hyphenated_name); From 46d5b651eb4ed445ac0bc3d2aca067c643181695 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Botond=20D=C3=A9nes?= Date: Mon, 27 Jul 2020 15:25:58 +0300 Subject: [PATCH 04/26] db/config: introduce max_memory_for_unlimited_query_soft_limit and max_memory_for_unlimited_query_hard_limit This pair of limits replace the old max_memory_for_unlimited_query one, which remains as an alias to the hard limit. The soft limit inherits the previous value of the limit (1MB), when this limit is reached a warning will be logged allowing the users to adjust their client codes without downtime. The hard limit starts out with a more permissive default of 100MB. When this is reached queries are aborted, the same behaviour as with the previous single limit. 
The idea is to allow clients a grace period for fixing their code, while at the same time protecting the database from the really bad queries. --- database.cc | 2 +- db/config.cc | 8 ++++++-- db/config.hh | 3 ++- test/boost/cql_query_test.cc | 3 ++- 4 files changed, 11 insertions(+), 5 deletions(-) diff --git a/database.cc b/database.cc index be1d5c9e5c..dcb19880a0 100644 --- a/database.cc +++ b/database.cc @@ -1282,7 +1282,7 @@ void database::register_connection_drop_notifier(netw::messaging_service& ms) { query_class_config database::make_query_class_config() { // Everything running in the statement group is considered a user query if (current_scheduling_group() == _dbcfg.statement_scheduling_group) { - return query_class_config{_read_concurrency_sem, _cfg.max_memory_for_unlimited_query()}; + return query_class_config{_read_concurrency_sem, _cfg.max_memory_for_unlimited_query_hard_limit()}; // Reads done on behalf of view update generation run in the streaming group } else if (current_scheduling_group() == _dbcfg.streaming_scheduling_group) { return query_class_config{_streaming_concurrency_sem, std::numeric_limits::max()}; diff --git a/db/config.cc b/db/config.cc index ee27edc368..0c98b9c4ff 100644 --- a/db/config.cc +++ b/db/config.cc @@ -722,8 +722,12 @@ db::config::config(std::shared_ptr exts) , max_clustering_key_restrictions_per_query(this, "max_clustering_key_restrictions_per_query", liveness::LiveUpdate, value_status::Used, 100, "Maximum number of distinct clustering key restrictions per query. This limit places a bound on the size of IN tuples, " "especially when multiple clustering key columns have IN restrictions. Increasing this value can result in server instability.") - , max_memory_for_unlimited_query(this, "max_memory_for_unlimited_query", liveness::LiveUpdate, value_status::Used, size_t(1) << 20, - "Maximum amount of memory a query, whose memory consumption is not naturally limited, is allowed to consume, e.g. 
non-paged and reverse queries.") + , max_memory_for_unlimited_query_soft_limit(this, "max_memory_for_unlimited_query_soft_limit", liveness::LiveUpdate, value_status::Used, uint64_t(1) << 20, + "Maximum amount of memory a query, whose memory consumption is not naturally limited, is allowed to consume, e.g. non-paged and reverse queries. " + "This is the soft limit, there will be a warning logged for queries violating this limit.") + , max_memory_for_unlimited_query_hard_limit(this, "max_memory_for_unlimited_query_hard_limit", "max_memory_for_unlimited_query", liveness::LiveUpdate, value_status::Used, (uint64_t(100) << 20), + "Maximum amount of memory a query, whose memory consumption is not naturally limited, is allowed to consume, e.g. non-paged and reverse queries. " + "This is the hard limit, queries violating this limit will be aborted.") , initial_sstable_loading_concurrency(this, "initial_sstable_loading_concurrency", value_status::Used, 4u, "Maximum amount of sstables to load in parallel during initialization. A higher number can lead to more memory consumption. 
You should not need to touch this") , enable_3_1_0_compatibility_mode(this, "enable_3_1_0_compatibility_mode", value_status::Used, false, diff --git a/db/config.hh b/db/config.hh index 3d225f987c..feadfa0dda 100644 --- a/db/config.hh +++ b/db/config.hh @@ -303,7 +303,8 @@ public: named_value abort_on_internal_error; named_value max_partition_key_restrictions_per_query; named_value max_clustering_key_restrictions_per_query; - named_value max_memory_for_unlimited_query; + named_value max_memory_for_unlimited_query_soft_limit; + named_value max_memory_for_unlimited_query_hard_limit; named_value initial_sstable_loading_concurrency; named_value enable_3_1_0_compatibility_mode; named_value enable_user_defined_functions; diff --git a/test/boost/cql_query_test.cc b/test/boost/cql_query_test.cc index 0722295bd7..7a14262168 100644 --- a/test/boost/cql_query_test.cc +++ b/test/boost/cql_query_test.cc @@ -2765,7 +2765,8 @@ SEASTAR_TEST_CASE(test_reversed_slice_with_empty_range_before_all_rows) { // See #6171 SEASTAR_TEST_CASE(test_reversed_slice_with_many_clustering_ranges) { cql_test_config cfg; - cfg.db_config->max_memory_for_unlimited_query(std::numeric_limits::max()); + cfg.db_config->max_memory_for_unlimited_query_soft_limit(std::numeric_limits::max()); + cfg.db_config->max_memory_for_unlimited_query_hard_limit(std::numeric_limits::max()); return do_with_cql_env_thread([] (cql_test_env& e) { e.execute_cql("CREATE TABLE test (pk int, ck int, v text, PRIMARY KEY (pk, ck));").get(); auto id = e.prepare("INSERT INTO test (pk, ck, v) VALUES (?, ?, ?);").get0(); From 517a941feb2dd6268840d7cfb8f09f364bec4c02 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Botond=20D=C3=A9nes?= Date: Tue, 28 Jul 2020 11:39:56 +0300 Subject: [PATCH 05/26] query_class_config: move into the query namespace It belongs there, its name even starts with "query". 
--- database.cc | 8 ++++---- database.hh | 6 +++--- mutation_partition.cc | 6 +++--- mutation_query.hh | 6 +++--- query_class_config.hh | 4 ++++ table.cc | 2 +- test/lib/reader_permit.cc | 4 ++-- test/lib/reader_permit.hh | 2 +- 8 files changed, 21 insertions(+), 17 deletions(-) diff --git a/database.cc b/database.cc index dcb19880a0..1471f6ae70 100644 --- a/database.cc +++ b/database.cc @@ -1279,16 +1279,16 @@ void database::register_connection_drop_notifier(netw::messaging_service& ms) { }); } -query_class_config database::make_query_class_config() { +query::query_class_config database::make_query_class_config() { // Everything running in the statement group is considered a user query if (current_scheduling_group() == _dbcfg.statement_scheduling_group) { - return query_class_config{_read_concurrency_sem, _cfg.max_memory_for_unlimited_query_hard_limit()}; + return query::query_class_config{_read_concurrency_sem, _cfg.max_memory_for_unlimited_query_hard_limit()}; // Reads done on behalf of view update generation run in the streaming group } else if (current_scheduling_group() == _dbcfg.streaming_scheduling_group) { - return query_class_config{_streaming_concurrency_sem, std::numeric_limits::max()}; + return query::query_class_config{_streaming_concurrency_sem, std::numeric_limits::max()}; // Everything else is considered a system query } else { - return query_class_config{_system_read_concurrency_sem, std::numeric_limits::max()}; + return query::query_class_config{_system_read_concurrency_sem, std::numeric_limits::max()}; } } diff --git a/database.hh b/database.hh index d0fb002b74..b5866183fb 100644 --- a/database.hh +++ b/database.hh @@ -748,7 +748,7 @@ public: // Returns at most "cmd.limit" rows future> query(schema_ptr, const query::read_command& cmd, - query_class_config class_config, + query::query_class_config class_config, query::result_options opts, const dht::partition_range_vector& ranges, tracing::trace_state_ptr trace_state, @@ -1294,7 +1294,7 @@ 
private: column_family*, schema_ptr, const query::read_command&, - query_class_config, + query::query_class_config, query::result_options, const dht::partition_range_vector&, tracing::trace_state_ptr, @@ -1594,7 +1594,7 @@ public: return _supports_infinite_bound_range_deletions; } - query_class_config make_query_class_config(); + query::query_class_config make_query_class_config(); }; future<> start_large_data_handler(sharded& db); diff --git a/mutation_partition.cc b/mutation_partition.cc index 59c7611028..113a8cbeba 100644 --- a/mutation_partition.cc +++ b/mutation_partition.cc @@ -2167,7 +2167,7 @@ future<> data_query( gc_clock::time_point query_time, query::result::builder& builder, db::timeout_clock::time_point timeout, - query_class_config class_config, + query::query_class_config class_config, tracing::trace_state_ptr trace_ptr, query::querier_cache_context cache_ctx) { @@ -2261,7 +2261,7 @@ static do_mutation_query(schema_ptr s, uint32_t partition_limit, gc_clock::time_point query_time, db::timeout_clock::time_point timeout, - query_class_config class_config, + query::query_class_config class_config, query::result_memory_accounter&& accounter, tracing::trace_state_ptr trace_ptr, query::querier_cache_context cache_ctx) @@ -2301,7 +2301,7 @@ mutation_query(schema_ptr s, uint32_t partition_limit, gc_clock::time_point query_time, db::timeout_clock::time_point timeout, - query_class_config class_config, + query::query_class_config class_config, query::result_memory_accounter&& accounter, tracing::trace_state_ptr trace_ptr, query::querier_cache_context cache_ctx) diff --git a/mutation_query.hh b/mutation_query.hh index ce2309a11e..dbd0374b09 100644 --- a/mutation_query.hh +++ b/mutation_query.hh @@ -163,7 +163,7 @@ future mutation_query( uint32_t partition_limit, gc_clock::time_point query_time, db::timeout_clock::time_point timeout, - query_class_config class_config, + query::query_class_config class_config, query::result_memory_accounter&& accounter = { }, 
tracing::trace_state_ptr trace_ptr = nullptr, query::querier_cache_context cache_ctx = { }); @@ -178,7 +178,7 @@ future<> data_query( gc_clock::time_point query_time, query::result::builder& builder, db::timeout_clock::time_point timeout, - query_class_config class_config, + query::query_class_config class_config, tracing::trace_state_ptr trace_ptr = nullptr, query::querier_cache_context cache_ctx = { }); @@ -193,7 +193,7 @@ class mutation_query_stage { uint32_t, gc_clock::time_point, db::timeout_clock::time_point, - query_class_config, + query::query_class_config, query::result_memory_accounter&&, tracing::trace_state_ptr, query::querier_cache_context> _execution_stage; diff --git a/query_class_config.hh b/query_class_config.hh index c1ad431904..d634bd5ad6 100644 --- a/query_class_config.hh +++ b/query_class_config.hh @@ -25,7 +25,11 @@ class reader_concurrency_semaphore; +namespace query { + struct query_class_config { reader_concurrency_semaphore& semaphore; uint64_t max_memory_for_unlimited_query; }; + +} diff --git a/table.cc b/table.cc index fc51b4967e..b7d77e674c 100644 --- a/table.cc +++ b/table.cc @@ -2032,7 +2032,7 @@ struct query_state { future> table::query(schema_ptr s, const query::read_command& cmd, - query_class_config class_config, + query::query_class_config class_config, query::result_options opts, const dht::partition_range_vector& partition_ranges, tracing::trace_state_ptr trace_state, diff --git a/test/lib/reader_permit.cc b/test/lib/reader_permit.cc index 3a9e4604c8..bf292784e7 100644 --- a/test/lib/reader_permit.cc +++ b/test/lib/reader_permit.cc @@ -33,8 +33,8 @@ reader_permit make_permit() { return the_semaphore.make_permit(); } -query_class_config make_query_class_config() { - return query_class_config{the_semaphore, std::numeric_limits::max()}; +query::query_class_config make_query_class_config() { + return query::query_class_config{the_semaphore, std::numeric_limits::max()}; } } // namespace tests diff --git a/test/lib/reader_permit.hh 
b/test/lib/reader_permit.hh index bc3d9b1a14..d6942c4813 100644 --- a/test/lib/reader_permit.hh +++ b/test/lib/reader_permit.hh @@ -30,6 +30,6 @@ reader_concurrency_semaphore& semaphore(); reader_permit make_permit(); -query_class_config make_query_class_config(); +query::query_class_config make_query_class_config(); } // namespace tests From 8aee7662a916c7f44498c1cecc63b870b03dcc61 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Botond=20D=C3=A9nes?= Date: Mon, 27 Jul 2020 16:20:12 +0300 Subject: [PATCH 06/26] query: introduce max_result_size To be used to pass around the soft/hard limit configured via `max_memory_for_unlimited_query_{soft,hard}_limit` in the codebase. --- query_class_config.hh | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/query_class_config.hh b/query_class_config.hh index d634bd5ad6..5119934e33 100644 --- a/query_class_config.hh +++ b/query_class_config.hh @@ -27,6 +27,15 @@ class reader_concurrency_semaphore; namespace query { +struct max_result_size { + uint64_t soft_limit = 0; + uint64_t hard_limit = 0; + + max_result_size() = default; + explicit max_result_size(uint64_t max_size) : soft_limit(max_size), hard_limit(max_size) { } + explicit max_result_size(uint64_t soft_limit, uint64_t hard_limit) : soft_limit(soft_limit), hard_limit(hard_limit) { } +}; + struct query_class_config { reader_concurrency_semaphore& semaphore; uint64_t max_memory_for_unlimited_query; From 92ce39f01476b1b2b9c278221dc0d1556f72d2dc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Botond=20D=C3=A9nes?= Date: Tue, 28 Jul 2020 11:56:25 +0300 Subject: [PATCH 07/26] query: query_class_config: use max_result_size for the max_memory_for_unlimited_query field We want to switch from using a single limit to a dual soft/hard limit. As a first step we switch the limit field of `query_class_config` to use the recently introduced type for this. 
As this field has a single user at the moment -- reverse queries (and not a lot of propagation) -- we update it in this same patch to use the soft/hard limit: warn on reaching the soft limit and abort on the hard limit (the previous behaviour). --- database.cc | 7 +++--- flat_mutation_reader.cc | 30 ++++++++++++++++--------- flat_mutation_reader.hh | 15 ++++++++----- querier.hh | 10 ++++----- query_class_config.hh | 2 +- test/boost/flat_mutation_reader_test.cc | 15 +++++++++---- test/boost/querier_cache_test.cc | 2 +- test/lib/reader_permit.cc | 2 +- 8 files changed, 52 insertions(+), 31 deletions(-) diff --git a/database.cc b/database.cc index 1471f6ae70..24e937896b 100644 --- a/database.cc +++ b/database.cc @@ -1282,13 +1282,14 @@ void database::register_connection_drop_notifier(netw::messaging_service& ms) { query::query_class_config database::make_query_class_config() { // Everything running in the statement group is considered a user query if (current_scheduling_group() == _dbcfg.statement_scheduling_group) { - return query::query_class_config{_read_concurrency_sem, _cfg.max_memory_for_unlimited_query_hard_limit()}; + return query::query_class_config{_read_concurrency_sem, + query::max_result_size(_cfg.max_memory_for_unlimited_query_soft_limit(), _cfg.max_memory_for_unlimited_query_hard_limit())}; // Reads done on behalf of view update generation run in the streaming group } else if (current_scheduling_group() == _dbcfg.streaming_scheduling_group) { - return query::query_class_config{_streaming_concurrency_sem, std::numeric_limits::max()}; + return query::query_class_config{_streaming_concurrency_sem, query::max_result_size(std::numeric_limits::max())}; // Everything else is considered a system query } else { - return query::query_class_config{_system_read_concurrency_sem, std::numeric_limits::max()}; + return query::query_class_config{_system_read_concurrency_sem, query::max_result_size(std::numeric_limits::max())}; } } diff --git a/flat_mutation_reader.cc 
b/flat_mutation_reader.cc index 746c09e6f8..c054a80c18 100644 --- a/flat_mutation_reader.cc +++ b/flat_mutation_reader.cc @@ -60,14 +60,15 @@ void flat_mutation_reader::impl::clear_buffer_to_next_partition() { _buffer_size = compute_buffer_size(*_schema, _buffer); } -flat_mutation_reader make_reversing_reader(flat_mutation_reader& original, size_t max_memory_consumption) { +flat_mutation_reader make_reversing_reader(flat_mutation_reader& original, query::max_result_size max_size) { class partition_reversing_mutation_reader final : public flat_mutation_reader::impl { flat_mutation_reader* _source; range_tombstone_list _range_tombstones; std::stack _mutation_fragments; mutation_fragment_opt _partition_end; size_t _stack_size = 0; - const size_t _max_stack_size; + const query::max_result_size _max_size; + bool _below_soft_limit = true; private: stop_iteration emit_partition() { auto emit_range_tombstone = [&] { @@ -119,7 +120,7 @@ flat_mutation_reader make_reversing_reader(flat_mutation_reader& original, size_ } else { _mutation_fragments.emplace(std::move(mf)); _stack_size += _mutation_fragments.top().memory_usage(*_schema); - if (_stack_size >= _max_stack_size) { + if (_stack_size > _max_size.hard_limit || (_stack_size > _max_size.soft_limit && _below_soft_limit)) { const partition_key* key = nullptr; auto it = buffer().end(); --it; @@ -129,21 +130,30 @@ flat_mutation_reader make_reversing_reader(flat_mutation_reader& original, size_ --it; key = &it->as_partition_start().key().key(); } - throw std::runtime_error(fmt::format( - "Aborting reverse partition read because partition {} is larger than the maximum safe size of {} for reversible partitions.", - key->with_schema(*_schema), - _max_stack_size)); + + if (_stack_size > _max_size.hard_limit) { + throw std::runtime_error(fmt::format( + "Memory usage of reversed read exceeds hard limit of {} (configured via max_memory_for_unlimited_query_hard_limit), while reading partition {}", + _max_size.hard_limit, + 
key->with_schema(*_schema))); + } else { + fmr_logger.warn( + "Memory usage of reversed read exceeds soft limit of {} (configured via max_memory_for_unlimited_query_soft_limit), while reading partition {}", + _max_size.soft_limit, + key->with_schema(*_schema)); + _below_soft_limit = false; + } } } } return make_ready_future(is_buffer_full()); } public: - explicit partition_reversing_mutation_reader(flat_mutation_reader& mr, size_t max_stack_size) + explicit partition_reversing_mutation_reader(flat_mutation_reader& mr, query::max_result_size max_size) : flat_mutation_reader::impl(mr.schema()) , _source(&mr) , _range_tombstones(*_schema) - , _max_stack_size(max_stack_size) + , _max_size(max_size) { } virtual future<> fill_buffer(db::timeout_clock::time_point timeout) override { @@ -185,7 +195,7 @@ flat_mutation_reader make_reversing_reader(flat_mutation_reader& original, size_ } }; - return make_flat_mutation_reader(original, max_memory_consumption); + return make_flat_mutation_reader(original, max_size); } template diff --git a/flat_mutation_reader.hh b/flat_mutation_reader.hh index 31373a44f6..a7b0a4ca23 100644 --- a/flat_mutation_reader.hh +++ b/flat_mutation_reader.hh @@ -29,6 +29,7 @@ #include "mutation_fragment.hh" #include "tracing/trace_state.hh" #include "mutation.hh" +#include "query_class_config.hh" #include #include @@ -720,15 +721,17 @@ make_generating_reader(schema_ptr s, std::function /// /// \param original the reader to be reversed, has to be kept alive while the /// reversing reader is in use. -/// \param max_memory_consumption the maximum amount of memory the reader is -/// allowed to use for reversing. The reverse reader reads entire partitions -/// into memory, before reversing them. Since partitions can be larger than -/// the available memory, we need to enforce a limit on memory consumption. -/// If the read uses more memory then this limit, the read is aborted. 
+/// \param max_size the maximum amount of memory the reader is allowed to use +/// for reversing and conversely the maximum size of the results. The +/// reverse reader reads entire partitions into memory, before reversing +/// them. Since partitions can be larger than the available memory, we need +/// to enforce a limit on memory consumption. When reaching the soft limit +/// a warning will be logged. When reaching the hard limit the read will be +/// aborted. /// /// FIXME: reversing should be done in the sstable layer, see #1413. flat_mutation_reader -make_reversing_reader(flat_mutation_reader& original, size_t max_memory_consumption); +make_reversing_reader(flat_mutation_reader& original, query::max_result_size max_size); /// Low level fragment stream validator. /// diff --git a/querier.hh b/querier.hh index 1e3f5691f5..04efbccc21 100644 --- a/querier.hh +++ b/querier.hh @@ -83,7 +83,7 @@ auto consume_page(flat_mutation_reader& reader, uint32_t partition_limit, gc_clock::time_point query_time, db::timeout_clock::time_point timeout, - size_t reverse_read_max_memory) { + query::max_result_size max_size) { return reader.peek(timeout).then([=, &reader, consumer = std::move(consumer), &slice] ( mutation_fragment* next_fragment) mutable { const auto next_fragment_kind = next_fragment ? 
next_fragment->mutation_fragment_kind() : mutation_fragment::kind::partition_end; @@ -94,9 +94,9 @@ auto consume_page(flat_mutation_reader& reader, compaction_state, clustering_position_tracker(std::move(consumer), last_ckey)); - auto consume = [&reader, &slice, reader_consumer = std::move(reader_consumer), timeout, reverse_read_max_memory] () mutable { + auto consume = [&reader, &slice, reader_consumer = std::move(reader_consumer), timeout, max_size] () mutable { if (slice.options.contains(query::partition_slice::option::reversed)) { - return do_with(make_reversing_reader(reader, reverse_read_max_memory), + return do_with(make_reversing_reader(reader, max_size), [reader_consumer = std::move(reader_consumer), timeout] (flat_mutation_reader& reversing_reader) mutable { return reversing_reader.consume(std::move(reader_consumer), timeout); }); @@ -223,9 +223,9 @@ public: uint32_t partition_limit, gc_clock::time_point query_time, db::timeout_clock::time_point timeout, - size_t reverse_read_max_memory) { + query::max_result_size max_size) { return ::query::consume_page(_reader, _compaction_state, *_slice, std::move(consumer), row_limit, partition_limit, query_time, - timeout, reverse_read_max_memory).then([this] (auto&& results) { + timeout, max_size).then([this] (auto&& results) { _last_ckey = std::get>(std::move(results)); constexpr auto size = std::tuple_size>::value; static_assert(size <= 2); diff --git a/query_class_config.hh b/query_class_config.hh index 5119934e33..168143219a 100644 --- a/query_class_config.hh +++ b/query_class_config.hh @@ -38,7 +38,7 @@ struct max_result_size { struct query_class_config { reader_concurrency_semaphore& semaphore; - uint64_t max_memory_for_unlimited_query; + max_result_size max_memory_for_unlimited_query; }; } diff --git a/test/boost/flat_mutation_reader_test.cc b/test/boost/flat_mutation_reader_test.cc index 7e2ffefe2f..f6ce5426b8 100644 --- a/test/boost/flat_mutation_reader_test.cc +++ b/test/boost/flat_mutation_reader_test.cc 
@@ -592,7 +592,7 @@ void test_flat_stream(schema_ptr s, std::vector muts, reversed_partiti return fmr.consume_in_thread(std::move(fsc), db::no_timeout); } else { if (reversed) { - auto reverse_reader = make_reversing_reader(fmr, size_t(1) << 20); + auto reverse_reader = make_reversing_reader(fmr, query::max_result_size(size_t(1) << 20)); return reverse_reader.consume(std::move(fsc), db::no_timeout).get0(); } return fmr.consume(std::move(fsc), db::no_timeout).get0(); @@ -805,7 +805,8 @@ SEASTAR_THREAD_TEST_CASE(test_reverse_reader_memory_limit) { auto test_with_partition = [&] (bool with_static_row) { testlog.info("Testing with_static_row={}", with_static_row); - auto mut = schema.new_mutation("pk1"); + const auto pk = "pk1"; + auto mut = schema.new_mutation(pk); const size_t desired_mut_size = 1 * 1024 * 1024; const size_t row_size = 10 * 1024; @@ -817,8 +818,9 @@ SEASTAR_THREAD_TEST_CASE(test_reverse_reader_memory_limit) { schema.add_row(mut, schema.make_ckey(++i), sstring(row_size, '0')); } + const uint64_t hard_limit = size_t(1) << 18; auto reader = flat_mutation_reader_from_mutations({mut}); - auto reverse_reader = make_reversing_reader(reader, size_t(1) << 10); + auto reverse_reader = make_reversing_reader(reader, query::max_result_size(size_t(1) << 10, hard_limit)); try { reverse_reader.consume(phony_consumer{}, db::no_timeout).get(); @@ -826,7 +828,12 @@ SEASTAR_THREAD_TEST_CASE(test_reverse_reader_memory_limit) { } catch (const std::runtime_error& e) { testlog.info("Got exception with message: {}", e.what()); auto str = sstring(e.what()); - BOOST_REQUIRE_EQUAL(str.find("Aborting reverse partition read because partition pk1"), 0); + const auto expected_str = format( + "Memory usage of reversed read exceeds hard limit of {} (configured via max_memory_for_unlimited_query_hard_limit), while reading partition {}", + hard_limit, + pk); + + BOOST_REQUIRE_EQUAL(str.find(expected_str), 0); } catch (...) 
{ throw; } diff --git a/test/boost/querier_cache_test.cc b/test/boost/querier_cache_test.cc index 79e6203156..a393a06f35 100644 --- a/test/boost/querier_cache_test.cc +++ b/test/boost/querier_cache_test.cc @@ -214,7 +214,7 @@ public: auto querier = make_querier(range); auto [dk, ck] = querier.consume_page(dummy_result_builder{}, row_limit, std::numeric_limits::max(), - gc_clock::now(), db::no_timeout, std::numeric_limits::max()).get0(); + gc_clock::now(), db::no_timeout, query::max_result_size(std::numeric_limits::max())).get0(); const auto memory_usage = querier.memory_usage(); _cache.insert(cache_key, std::move(querier), nullptr); diff --git a/test/lib/reader_permit.cc b/test/lib/reader_permit.cc index bf292784e7..cb0e429375 100644 --- a/test/lib/reader_permit.cc +++ b/test/lib/reader_permit.cc @@ -34,7 +34,7 @@ reader_permit make_permit() { } query::query_class_config make_query_class_config() { - return query::query_class_config{the_semaphore, std::numeric_limits::max()}; + return query::query_class_config{the_semaphore, query::max_result_size(std::numeric_limits::max())}; } } // namespace tests From d5cc932a0b40f9dcf4283b97f9f9572dec67bab5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Botond=20D=C3=A9nes?= Date: Tue, 19 May 2020 15:09:48 +0300 Subject: [PATCH 08/26] database: query_mutations(): obtain the memory accounter inside Instead of requesting callers to do it and pass it as a parameter. This is in line with data_query(). 
--- database.cc | 5 ++++- database.hh | 2 +- service/storage_proxy.cc | 4 +--- test/boost/multishard_mutation_query_test.cc | 3 +-- test/boost/querier_cache_test.cc | 6 +++--- 5 files changed, 10 insertions(+), 10 deletions(-) diff --git a/database.cc b/database.cc index 24e937896b..0340798244 100644 --- a/database.cc +++ b/database.cc @@ -1206,7 +1206,9 @@ database::query(schema_ptr s, const query::read_command& cmd, query::result_opti future> database::query_mutations(schema_ptr s, const query::read_command& cmd, const dht::partition_range& range, - query::result_memory_accounter&& accounter, tracing::trace_state_ptr trace_state, db::timeout_clock::time_point timeout) { + size_t result_max_size, tracing::trace_state_ptr trace_state, db::timeout_clock::time_point timeout) { + return get_result_memory_limiter().new_mutation_read(result_max_size).then( + [&, s = std::move(s), trace_state = std::move(trace_state), timeout] (query::result_memory_accounter accounter) { column_family& cf = find_column_family(cmd.cf_id); query::querier_cache_context cache_ctx(_querier_cache, cmd.query_uuid, cmd.is_first_page); return _mutation_query_stage(std::move(s), @@ -1231,6 +1233,7 @@ database::query_mutations(schema_ptr s, const query::read_command& cmd, const dh return make_ready_future>(std::tuple(std::move(result), hit_rate)); } }); + }); } std::unordered_set database::get_initial_tokens() { diff --git a/database.hh b/database.hh index b5866183fb..61e337c6d4 100644 --- a/database.hh +++ b/database.hh @@ -1465,7 +1465,7 @@ public: const dht::partition_range_vector& ranges, tracing::trace_state_ptr trace_state, uint64_t max_result_size, db::timeout_clock::time_point timeout); future> query_mutations(schema_ptr, const query::read_command& cmd, const dht::partition_range& range, - query::result_memory_accounter&& accounter, tracing::trace_state_ptr trace_state, + size_t max_result_size, tracing::trace_state_ptr trace_state, db::timeout_clock::time_point timeout); // Apply the 
mutation atomically. // Throws timed_out_error when timeout is reached. diff --git a/service/storage_proxy.cc b/service/storage_proxy.cc index 47678ab4b1..b4c623d9a9 100644 --- a/service/storage_proxy.cc +++ b/service/storage_proxy.cc @@ -5142,12 +5142,10 @@ storage_proxy::query_mutations_locally(schema_ptr s, lw_shared_ptrvalue().token()); get_stats().replica_cross_shard_ops += shard != this_shard_id(); return _db.invoke_on(shard, _read_smp_service_group, [max_size, cmd, &pr, gs=global_schema_ptr(s), timeout, gt = tracing::global_trace_state_ptr(std::move(trace_state))] (database& db) mutable { - return db.get_result_memory_limiter().new_mutation_read(max_size).then([&] (query::result_memory_accounter ma) { - return db.query_mutations(gs, *cmd, pr, std::move(ma), gt, timeout).then([] (std::tuple result_ht) { + return db.query_mutations(gs, *cmd, pr, max_size, gt, timeout).then([] (std::tuple result_ht) { auto&& [result, ht] = result_ht; return make_ready_future>, cache_temperature>>(rpc::tuple(make_foreign(make_lw_shared(std::move(result))), ht)); }); - }); }); } else { return query_nonsingular_mutations_locally(std::move(s), std::move(cmd), {pr}, std::move(trace_state), max_size, timeout); diff --git a/test/boost/multishard_mutation_query_test.cc b/test/boost/multishard_mutation_query_test.cc index 45d823613a..db345b123b 100644 --- a/test/boost/multishard_mutation_query_test.cc +++ b/test/boost/multishard_mutation_query_test.cc @@ -104,11 +104,10 @@ static std::vector read_all_partitions_one_by_one(distributed::max()).get0(); const auto cmd = query::read_command(s->id(), s->version(), s->full_slice(), query::max_rows); const auto range = dht::partition_range::make_singular(pkey); return make_foreign(std::make_unique( - std::get<0>(db.query_mutations(std::move(s), cmd, range, std::move(accounter), nullptr, db::no_timeout).get0()))); + std::get<0>(db.query_mutations(std::move(s), cmd, range, std::numeric_limits::max(), nullptr, db::no_timeout).get0()))); }); 
}).get0(); diff --git a/test/boost/querier_cache_test.cc b/test/boost/querier_cache_test.cc index a393a06f35..3925ad848e 100644 --- a/test/boost/querier_cache_test.cc +++ b/test/boost/querier_cache_test.cc @@ -672,7 +672,7 @@ SEASTAR_THREAD_TEST_CASE(test_resources_based_cache_eviction) { db.query_mutations(s, cmd1, query::full_partition_range, - db.get_result_memory_limiter().new_mutation_read(1024 * 1024).get0(), + 1024 * 1024, nullptr, db::no_timeout).get(); @@ -706,7 +706,7 @@ SEASTAR_THREAD_TEST_CASE(test_resources_based_cache_eviction) { db.query_mutations(s, cmd2, query::full_partition_range, - db.get_result_memory_limiter().new_mutation_read(1024 * 1024).get0(), + 1024 * 1024, nullptr, db::no_timeout).get(); @@ -723,7 +723,7 @@ SEASTAR_THREAD_TEST_CASE(test_resources_based_cache_eviction) { db.query_mutations(s, cmd2, query::full_partition_range, - db.get_result_memory_limiter().new_mutation_read(1024 * 1024 * 1024 * 1024).get0(), + 1024 * 1024 * 1024 * 1024, nullptr, db::no_timeout).get(); return make_ready_future<>(); From a64d9b8883c0fd43eef4cced55c254ba0760dd00 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Botond=20D=C3=A9nes?= Date: Thu, 28 May 2020 10:17:57 +0300 Subject: [PATCH 09/26] database: add get_statement_scheduling_group() --- database.hh | 1 + 1 file changed, 1 insertion(+) diff --git a/database.hh b/database.hh index 61e337c6d4..090d3be332 100644 --- a/database.hh +++ b/database.hh @@ -1396,6 +1396,7 @@ public: return _commitlog.get(); } + seastar::scheduling_group get_statement_scheduling_group() const { return _dbcfg.statement_scheduling_group; } seastar::scheduling_group get_streaming_scheduling_group() const { return _dbcfg.streaming_scheduling_group; } size_t get_available_memory() const { return _dbcfg.available_memory; } From c364c7c6a24c8d1c585854b9708811883e7bf891 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Botond=20D=C3=A9nes?= Date: Thu, 28 May 2020 10:31:30 +0300 Subject: [PATCH 10/26] result_memory_limiter: add unlimited_result_size 
constant To be used as the max result size for internal queries. --- query-result.hh | 1 + query.cc | 1 + test/boost/querier_cache_test.cc | 2 +- 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/query-result.hh b/query-result.hh index 7b6e1d3d29..9634dca6ee 100644 --- a/query-result.hh +++ b/query-result.hh @@ -51,6 +51,7 @@ class result_memory_limiter { public: static constexpr size_t minimum_result_size = 4 * 1024; static constexpr size_t maximum_result_size = 1 * 1024 * 1024; + static constexpr size_t unlimited_result_size = std::numeric_limits::max(); public: explicit result_memory_limiter(size_t maximum_total_result_memory) : _maximum_total_result_memory(maximum_total_result_memory) diff --git a/query.cc b/query.cc index 2d22194d06..4ba98a0c51 100644 --- a/query.cc +++ b/query.cc @@ -34,6 +34,7 @@ namespace query { constexpr size_t result_memory_limiter::minimum_result_size; constexpr size_t result_memory_limiter::maximum_result_size; +constexpr size_t result_memory_limiter::unlimited_result_size; thread_local semaphore result_memory_tracker::_dummy { 0 }; diff --git a/test/boost/querier_cache_test.cc b/test/boost/querier_cache_test.cc index 3925ad848e..39b5464269 100644 --- a/test/boost/querier_cache_test.cc +++ b/test/boost/querier_cache_test.cc @@ -723,7 +723,7 @@ SEASTAR_THREAD_TEST_CASE(test_resources_based_cache_eviction) { db.query_mutations(s, cmd2, query::full_partition_range, - 1024 * 1024 * 1024 * 1024, + query::result_memory_limiter::unlimited_result_size, nullptr, db::no_timeout).get(); return make_ready_future<>(); From 9eb6d704b235ab57f032f32fc66df800c5ca8fcb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Botond=20D=C3=A9nes?= Date: Thu, 28 May 2020 10:31:11 +0300 Subject: [PATCH 11/26] storage_proxy: add get_max_result_size() Meant to be used by the coordinator node to obtain the max result size applicable to the query-class (determined based on the current scheduling group). 
For normal paged queries the previously used `query::result_memory_limiter::maximum_result_size` is used uniformly. For reverse and unpaged queries, a query class dependent value is used. For user reads, the value of the `max_memory_for_unlimited_query_{soft,hard}_limit` configuration items is used, for other classes no limit is used (`query::result_memory_limiter::unlimited_result_size`). --- service/storage_proxy.cc | 15 +++++++++++++++ service/storage_proxy.hh | 2 ++ 2 files changed, 17 insertions(+) diff --git a/service/storage_proxy.cc b/service/storage_proxy.cc index b4c623d9a9..e1ebab1a04 100644 --- a/service/storage_proxy.cc +++ b/service/storage_proxy.cc @@ -1316,6 +1316,21 @@ endpoints_to_replica_ids(locator::token_metadata& tm, const std::vector() || slice.options.contains()) { + auto& db = _db.local(); + // We only limit user queries. + if (current_scheduling_group() == db.get_statement_scheduling_group()) { + return query::max_result_size(db.get_config().max_memory_for_unlimited_query_soft_limit(), db.get_config().max_memory_for_unlimited_query_hard_limit()); + } else { + return query::max_result_size(query::result_memory_limiter::unlimited_result_size); + } + } else { + return query::max_result_size(query::result_memory_limiter::maximum_result_size); + } +} + bool storage_proxy::need_throttle_writes() const { return get_global_stats().background_write_bytes > _background_write_throttle_threahsold || get_global_stats().queued_write_bytes > 6*1024*1024; } diff --git a/service/storage_proxy.hh b/service/storage_proxy.hh index 73f42cacec..13ed45a92e 100644 --- a/service/storage_proxy.hh +++ b/service/storage_proxy.hh @@ -247,6 +247,8 @@ public: const locator::token_metadata& get_token_metadata() const { return _token_metadata; } locator::token_metadata& get_token_metadata() { return _token_metadata; } + query::max_result_size get_max_result_size(const query::partition_slice& slice) const; + private: distributed& _db; locator::token_metadata& 
_token_metadata; From 989142464ce4466d42e034eacca725e6b353f16f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Botond=20D=C3=A9nes?= Date: Thu, 28 May 2020 10:18:49 +0300 Subject: [PATCH 12/26] result_memory_accounter: check(): use _maximum_result_size instead of hardcoded limit The use of the global `result_memory_limiter::maximum_result_size` is probably a leftover from before the `_maximum_result_size` member was introduced (aa083d3d85). --- query-result.hh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/query-result.hh b/query-result.hh index 9634dca6ee..fdff5d3740 100644 --- a/query-result.hh +++ b/query-result.hh @@ -196,7 +196,7 @@ public: // Checks whether the result can grow any more. stop_iteration check() const { - stop_iteration stop { _total_used_memory > result_memory_limiter::maximum_result_size }; + stop_iteration stop { _total_used_memory > _maximum_result_size }; if (!stop && _used_memory >= _blocked_bytes && _limiter) { return _limiter->check() && _stop_on_global_limit; } From 1615fe4c5ee41c64ca224d08bfd32c05e086dae6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Botond=20D=C3=A9nes?= Date: Wed, 10 Jun 2020 12:32:42 +0300 Subject: [PATCH 13/26] service: query_pager: set the allow_short_read flag All callers should set this already before passing the slice to the pager, however not all actually do (e.g. `cql3::indexed_table_select_statement::read_posting_list()`). Instead of auditing each call site, just make sure this is set in the pager itself. If someone is creating a pager we can be sure they mean to use paging. 
--- service/pager/query_pagers.cc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/service/pager/query_pagers.cc b/service/pager/query_pagers.cc index 30493a59fd..a2b603cb50 100644 --- a/service/pager/query_pagers.cc +++ b/service/pager/query_pagers.cc @@ -84,6 +84,10 @@ static bool has_clustering_keys(const schema& s, const query::read_command& cmd) future query_pager::do_fetch_page(uint32_t page_size, gc_clock::time_point now, db::timeout_clock::time_point timeout) { auto state = _options.get_paging_state(); + // Most callers should set this but we want to make sure, as results + // won't be paged without it. + _cmd->slice.options.set(); + if (!_last_pkey && state) { _max = state->get_remaining(); _last_pkey = state->get_partition_key(); From 2ca118b2d5262360a07869d4f6efb51ecc287b31 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Botond=20D=C3=A9nes?= Date: Tue, 9 Jun 2020 15:18:06 +0300 Subject: [PATCH 14/26] query: read_command: add separate convenience constructor query::read_command currently has a single constructor, which serves both as an idl constructor (order of parameters is fixed) and a convenience one (most parameters have default values). This makes it very error prone to add new parameters, that everyone should fill. The new parameter has to be added as last, with a default value, as the previous ones have a default value as well. This means the compiler's help cannot be enlisted to make sure all usages are updated. This patch adds a separate convenience constructor to be used by normal code. The idl constructor loses all default parameters. New parameters can be added to any position in the convenience constructor (to force users to fill in a meaningful value) while the removed default parameters from the idl constructor mean code cannot accidentally use it without noticing.
--- cql3/statements/select_statement.cc | 17 ++++++++++---- query-request.hh | 36 +++++++++++++++++++++++------ redis/query_utils.cc | 2 +- test/boost/database_test.cc | 4 ++-- test/boost/querier_cache_test.cc | 6 +++-- thrift/handler.cc | 2 +- 6 files changed, 50 insertions(+), 17 deletions(-) diff --git a/cql3/statements/select_statement.cc b/cql3/statements/select_statement.cc index 315656d22d..014ae1df57 100644 --- a/cql3/statements/select_statement.cc +++ b/cql3/statements/select_statement.cc @@ -319,8 +319,17 @@ select_statement::do_execute(service::storage_proxy& proxy, _stats.select_partition_range_scan += _range_scan; _stats.select_partition_range_scan_no_bypass_cache += _range_scan_no_bypass_cache; - auto command = ::make_lw_shared(_schema->id(), _schema->version(), - make_partition_slice(options), limit, now, tracing::make_trace_info(state.get_trace_state()), query::max_partitions, utils::UUID(), query::is_first_page::no, options.get_timestamp(state)); + auto command = ::make_lw_shared( + _schema->id(), + _schema->version(), + make_partition_slice(options), + limit, + query::max_partitions, + now, + tracing::make_trace_info(state.get_trace_state()), + utils::UUID(), + query::is_first_page::no, + options.get_timestamp(state)); int32_t page_size = options.get_page_size(); @@ -477,9 +486,9 @@ indexed_table_select_statement::prepare_command_for_base_query(const query_optio _schema->version(), make_partition_slice(options), get_limit(options), + query::max_partitions, now, tracing::make_trace_info(state.get_trace_state()), - query::max_partitions, utils::UUID(), query::is_first_page::no, options.get_timestamp(state)); @@ -1147,9 +1156,9 @@ indexed_table_select_statement::read_posting_list(service::storage_proxy& proxy, _view_schema->version(), partition_slice, limit, + query::max_partitions, now, tracing::make_trace_info(state.get_trace_state()), - query::max_partitions, utils::UUID(), query::is_first_page::no, options.get_timestamp(state)); diff --git 
a/query-request.hh b/query-request.hh index 56f965c627..609ed93448 100644 --- a/query-request.hh +++ b/query-request.hh @@ -235,16 +235,38 @@ public: query::is_first_page is_first_page; api::timestamp_type read_timestamp; // not serialized public: + // IDL constructor read_command(utils::UUID cf_id, table_schema_version schema_version, partition_slice slice, - uint32_t row_limit = max_rows, - gc_clock::time_point now = gc_clock::now(), - std::optional ti = std::nullopt, - uint32_t partition_limit = max_partitions, - utils::UUID query_uuid = utils::UUID(), - query::is_first_page is_first_page = is_first_page::no, - api::timestamp_type rt = api::new_timestamp()) + uint32_t row_limit, + gc_clock::time_point now, + std::optional ti, + uint32_t partition_limit, + utils::UUID query_uuid, + query::is_first_page is_first_page) + : cf_id(std::move(cf_id)) + , schema_version(std::move(schema_version)) + , slice(std::move(slice)) + , row_limit(row_limit) + , timestamp(now) + , trace_info(std::move(ti)) + , partition_limit(partition_limit) + , query_uuid(query_uuid) + , is_first_page(is_first_page) + , read_timestamp(api::new_timestamp()) + { } + + read_command(utils::UUID cf_id, + table_schema_version schema_version, + partition_slice slice, + uint32_t row_limit = max_rows, + uint32_t partition_limit = max_partitions, + gc_clock::time_point now = gc_clock::now(), + std::optional ti = std::nullopt, + utils::UUID query_uuid = utils::UUID(), + query::is_first_page is_first_page = query::is_first_page::no, + api::timestamp_type rt = api::new_timestamp()) : cf_id(std::move(cf_id)) , schema_version(std::move(schema_version)) , slice(std::move(slice)) diff --git a/redis/query_utils.cc b/redis/query_utils.cc index 37179dfeb5..c6fd7b73d7 100644 --- a/redis/query_utils.cc +++ b/redis/query_utils.cc @@ -75,7 +75,7 @@ public: future> read_strings(service::storage_proxy& proxy, const redis_options& options, const bytes& key, service_permit permit) { auto schema = get_schema(proxy, 
options.get_keyspace_name(), redis::STRINGs); auto ps = partition_slice_builder(*schema).build(); - query::read_command cmd(schema->id(), schema->version(), ps, 1, gc_clock::now(), std::nullopt, 1); + query::read_command cmd(schema->id(), schema->version(), ps, 1, gc_clock::now(), std::nullopt, 1, utils::UUID(), query::is_first_page::no); auto pkey = partition_key::from_single_value(*schema, key); auto partition_range = dht::partition_range::make_singular(dht::decorate_key(*schema, std::move(pkey))); dht::partition_range_vector partition_ranges; diff --git a/test/boost/database_test.cc b/test/boost/database_test.cc index 155d51ec0c..8eef7bfbe1 100644 --- a/test/boost/database_test.cc +++ b/test/boost/database_test.cc @@ -112,14 +112,14 @@ SEASTAR_TEST_CASE(test_querying_with_limits) { { auto cmd = query::read_command(s->id(), s->version(), partition_slice_builder(*s).build(), - query::max_rows, gc_clock::now(), std::nullopt, 5); + query::max_rows, 5); auto result = std::get<0>(db.query(s, cmd, query::result_options::only_result(), pranges, nullptr, max_size, db::no_timeout).get0()); assert_that(query::result_set::from_raw_result(s, cmd.slice, *result)).has_size(5); } { auto cmd = query::read_command(s->id(), s->version(), partition_slice_builder(*s).build(), - query::max_rows, gc_clock::now(), std::nullopt, 3); + query::max_rows, 3); auto result = std::get<0>(db.query(s, cmd, query::result_options::only_result(), pranges, nullptr, max_size, db::no_timeout).get0()); assert_that(query::result_set::from_raw_result(s, cmd.slice, *result)).has_size(3); } diff --git a/test/boost/querier_cache_test.cc b/test/boost/querier_cache_test.cc index 39b5464269..cc5ef71e14 100644 --- a/test/boost/querier_cache_test.cc +++ b/test/boost/querier_cache_test.cc @@ -666,7 +666,8 @@ SEASTAR_THREAD_TEST_CASE(test_resources_based_cache_eviction) { gc_clock::now(), std::nullopt, 1, - utils::make_random_uuid()); + utils::make_random_uuid(), + query::is_first_page::yes); // Should save the 
querier in cache. db.query_mutations(s, @@ -700,7 +701,8 @@ SEASTAR_THREAD_TEST_CASE(test_resources_based_cache_eviction) { gc_clock::now(), std::nullopt, 1, - utils::make_random_uuid()); + utils::make_random_uuid(), + query::is_first_page::no); // Should evict the already cached querier. db.query_mutations(s, diff --git a/thrift/handler.cc b/thrift/handler.cc index 5686a7c47c..8d28bc08d7 100644 --- a/thrift/handler.cc +++ b/thrift/handler.cc @@ -394,7 +394,7 @@ public: clustering_ranges.emplace_back(query::clustering_range::make_open_ended_both_sides()); auto slice = query::partition_slice(std::move(clustering_ranges), { }, std::move(regular_columns), opts, std::move(specific_ranges), cql_serialization_format::internal()); - return make_lw_shared(s.id(), s.version(), std::move(slice), row_limit, gc_clock::now(), std::nullopt, partition_limit); + return make_lw_shared(s.id(), s.version(), std::move(slice), row_limit, partition_limit); } static future<> do_get_paged_slice( From 8992bcd1f83f0b14b46b9fca3140271e3467ce22 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Botond=20D=C3=A9nes?= Date: Tue, 9 Jun 2020 16:15:13 +0300 Subject: [PATCH 15/26] query: read_command: use tagged ints for limit ctor params The convenience constructor of read_command now has two integer parameters next to each other. In the next patch we intend to add another one. This is a recipe for disaster, so to avoid mistakes this patch converts these parameters to tagged integers. This makes sure callers pass what they meant to pass. As a matter of fact, while fixing up call-sites, I already found several ones passing `query::max_partitions` to the `row_limit` parameter. No harm done yet, as `query::max_partitions` == `query::max_rows` but this shows just how easy it is to mix up parameters with the same type.
--- alternator/executor.cc | 8 ++++---- alternator/streams.cc | 2 +- cdc/log.cc | 2 +- cql3/statements/select_statement.cc | 12 ++++++------ db/schema_tables.cc | 2 +- db/system_keyspace.cc | 8 +++----- query-request.hh | 12 ++++++++---- service/storage_proxy.cc | 2 +- test/boost/database_test.cc | 8 ++++---- test/boost/multishard_mutation_query_test.cc | 2 +- thrift/handler.cc | 4 ++-- 11 files changed, 32 insertions(+), 30 deletions(-) diff --git a/alternator/executor.cc b/alternator/executor.cc index a9fb41f916..8b8b5dc1a0 100644 --- a/alternator/executor.cc +++ b/alternator/executor.cc @@ -1240,7 +1240,7 @@ static lw_shared_ptr previous_item_read_command(schema_ptr auto regular_columns = boost::copy_range( schema->regular_columns() | boost::adaptors::transformed([] (const column_definition& cdef) { return cdef.id; })); auto partition_slice = query::partition_slice(std::move(bounds), {}, std::move(regular_columns), selection->get_query_options()); - return ::make_lw_shared(schema->id(), schema->version(), partition_slice, query::max_partitions); + return ::make_lw_shared(schema->id(), schema->version(), partition_slice); } static dht::partition_range_vector to_partition_ranges(const schema& schema, const partition_key& pk) { @@ -2405,7 +2405,7 @@ future executor::get_item(client_state& client_st auto selection = cql3::selection::selection::wildcard(schema); auto partition_slice = query::partition_slice(std::move(bounds), {}, std::move(regular_columns), selection->get_query_options()); - auto command = ::make_lw_shared(schema->id(), schema->version(), partition_slice, query::max_partitions); + auto command = ::make_lw_shared(schema->id(), schema->version(), partition_slice); std::unordered_set used_attribute_names; auto attrs_to_get = calculate_attrs_to_get(request, used_attribute_names); @@ -2475,7 +2475,7 @@ future executor::batch_get_item(client_state& cli rs.schema->regular_columns() | boost::adaptors::transformed([] (const column_definition& cdef) { return 
cdef.id; })); auto selection = cql3::selection::selection::wildcard(rs.schema); auto partition_slice = query::partition_slice(std::move(bounds), {}, std::move(regular_columns), selection->get_query_options()); - auto command = ::make_lw_shared(rs.schema->id(), rs.schema->version(), partition_slice, query::max_partitions); + auto command = ::make_lw_shared(rs.schema->id(), rs.schema->version(), partition_slice); future>> f = _proxy.query(rs.schema, std::move(command), std::move(partition_ranges), rs.cl, service::storage_proxy::coordinator_query_options(executor::default_timeout(), permit, client_state, trace_state)).then( [schema = rs.schema, partition_slice = std::move(partition_slice), selection = std::move(selection), attrs_to_get = rs.attrs_to_get] (service::storage_proxy::coordinator_query_result qr) mutable { @@ -2762,7 +2762,7 @@ static future do_query(schema_ptr schema, query::partition_slice::option_set opts = selection->get_query_options(); opts.add(custom_opts); auto partition_slice = query::partition_slice(std::move(ck_bounds), std::move(static_columns), std::move(regular_columns), opts); - auto command = ::make_lw_shared(schema->id(), schema->version(), partition_slice, query::max_partitions); + auto command = ::make_lw_shared(schema->id(), schema->version(), partition_slice); auto query_state_ptr = std::make_unique(client_state, trace_state, std::move(permit)); diff --git a/alternator/streams.cc b/alternator/streams.cc index 8fa3e5a8b5..d00d3d93b9 100644 --- a/alternator/streams.cc +++ b/alternator/streams.cc @@ -717,7 +717,7 @@ future executor::get_records(client_state& client auto partition_slice = query::partition_slice( std::move(bounds) , {}, std::move(regular_columns), selection->get_query_options()); - auto command = ::make_lw_shared(schema->id(), schema->version(), partition_slice, limit * 4); + auto command = ::make_lw_shared(schema->id(), schema->version(), partition_slice, query::row_limit(limit * 4)); return _proxy.query(schema, 
std::move(command), std::move(partition_ranges), cl, service::storage_proxy::coordinator_query_options(default_timeout(), std::move(permit), client_state)).then( [this, schema, partition_slice = std::move(partition_slice), selection = std::move(selection), start_time = std::move(start_time), limit, key_names = std::move(key_names), attr_names = std::move(attr_names), type, iter, high_ts] (service::storage_proxy::coordinator_query_result qr) mutable { diff --git a/cdc/log.cc b/cdc/log.cc index 52040c6bd9..806dfadaf6 100644 --- a/cdc/log.cc +++ b/cdc/log.cc @@ -1356,7 +1356,7 @@ public: opts.set_if(!p.static_row().empty()); auto partition_slice = query::partition_slice(std::move(bounds), std::move(static_columns), std::move(regular_columns), std::move(opts)); - auto command = ::make_lw_shared(_schema->id(), _schema->version(), partition_slice, row_limit); + auto command = ::make_lw_shared(_schema->id(), _schema->version(), partition_slice, query::row_limit(row_limit)); const auto select_cl = adjust_cl(write_cl); diff --git a/cql3/statements/select_statement.cc b/cql3/statements/select_statement.cc index 014ae1df57..8af941c7c0 100644 --- a/cql3/statements/select_statement.cc +++ b/cql3/statements/select_statement.cc @@ -323,8 +323,8 @@ select_statement::do_execute(service::storage_proxy& proxy, _schema->id(), _schema->version(), make_partition_slice(options), - limit, - query::max_partitions, + query::row_limit(limit), + query::partition_limit(query::max_partitions), now, tracing::make_trace_info(state.get_trace_state()), utils::UUID(), @@ -485,8 +485,8 @@ indexed_table_select_statement::prepare_command_for_base_query(const query_optio _schema->id(), _schema->version(), make_partition_slice(options), - get_limit(options), - query::max_partitions, + query::row_limit(get_limit(options)), + query::partition_limit(query::max_partitions), now, tracing::make_trace_info(state.get_trace_state()), utils::UUID(), @@ -1155,8 +1155,8 @@ 
indexed_table_select_statement::read_posting_list(service::storage_proxy& proxy, _view_schema->id(), _view_schema->version(), partition_slice, - limit, - query::max_partitions, + query::row_limit(limit), + query::partition_limit(query::max_partitions), now, tracing::make_trace_info(state.get_trace_state()), utils::UUID(), diff --git a/db/schema_tables.cc b/db/schema_tables.cc index 7f65f9c7e7..68157acb46 100644 --- a/db/schema_tables.cc +++ b/db/schema_tables.cc @@ -778,7 +778,7 @@ read_schema_partition_for_table(distributed& proxy, sche auto slice = partition_slice_builder(*schema) .with_range(std::move(clustering_range)) .build(); - auto cmd = make_lw_shared(schema->id(), schema->version(), std::move(slice), query::max_rows); + auto cmd = make_lw_shared(schema->id(), schema->version(), std::move(slice), query::row_limit(query::max_rows)); return query_partition_mutation(proxy.local(), std::move(schema), std::move(cmd), std::move(keyspace_key)).then([&proxy] (mutation mut) { return redact_columns_for_missing_features(std::move(mut), proxy.local().get_db().local().features().cluster_schema_features()); }); diff --git a/db/system_keyspace.cc b/db/system_keyspace.cc index 0df4181fd6..41e443f8f2 100644 --- a/db/system_keyspace.cc +++ b/db/system_keyspace.cc @@ -1984,8 +1984,7 @@ query_mutations(distributed& proxy, const sstring& ks_na database& db = proxy.local().get_db().local(); schema_ptr schema = db.find_schema(ks_name, cf_name); auto slice = partition_slice_builder(*schema).build(); - auto cmd = make_lw_shared(schema->id(), schema->version(), - std::move(slice), std::numeric_limits::max()); + auto cmd = make_lw_shared(schema->id(), schema->version(), std::move(slice)); return proxy.local().query_mutations_locally(std::move(schema), std::move(cmd), query::full_partition_range, db::no_timeout) .then([] (rpc::tuple>, cache_temperature> rr_ht) { return std::get<0>(std::move(rr_ht)); }); } @@ -1995,8 +1994,7 @@ query(distributed& proxy, const sstring& ks_name, const 
database& db = proxy.local().get_db().local(); schema_ptr schema = db.find_schema(ks_name, cf_name); auto slice = partition_slice_builder(*schema).build(); - auto cmd = make_lw_shared(schema->id(), schema->version(), - std::move(slice), std::numeric_limits::max()); + auto cmd = make_lw_shared(schema->id(), schema->version(), std::move(slice)); return proxy.local().query(schema, cmd, {query::full_partition_range}, db::consistency_level::ONE, {db::no_timeout, empty_service_permit(), service::client_state::for_internal_calls(), nullptr}).then([schema, cmd] (auto&& qr) { return make_lw_shared(query::result_set::from_raw_result(schema, cmd->slice, *qr.query_result)); @@ -2011,7 +2009,7 @@ query(distributed& proxy, const sstring& ks_name, const auto slice = partition_slice_builder(*schema) .with_range(std::move(row_range)) .build(); - auto cmd = make_lw_shared(schema->id(), schema->version(), std::move(slice), query::max_rows); + auto cmd = make_lw_shared(schema->id(), schema->version(), std::move(slice)); return proxy.local().query(schema, cmd, {dht::partition_range::make_singular(key)}, db::consistency_level::ONE, {db::no_timeout, empty_service_permit(), service::client_state::for_internal_calls(), nullptr}).then([schema, cmd] (auto&& qr) { diff --git a/query-request.hh b/query-request.hh index 609ed93448..8cd7505180 100644 --- a/query-request.hh +++ b/query-request.hh @@ -206,6 +206,10 @@ public: constexpr auto max_partitions = std::numeric_limits::max(); +// Tagged integers to disambiguate constructor arguments. +enum class row_limit : uint32_t { max = max_rows }; +enum class partition_limit : uint32_t { max = max_partitions }; + using is_first_page = bool_class; // Full specification of a query to the database. 
@@ -260,8 +264,8 @@ public: read_command(utils::UUID cf_id, table_schema_version schema_version, partition_slice slice, - uint32_t row_limit = max_rows, - uint32_t partition_limit = max_partitions, + query::row_limit row_limit = query::row_limit::max, + query::partition_limit partition_limit = query::partition_limit::max, gc_clock::time_point now = gc_clock::now(), std::optional ti = std::nullopt, utils::UUID query_uuid = utils::UUID(), @@ -270,10 +274,10 @@ public: : cf_id(std::move(cf_id)) , schema_version(std::move(schema_version)) , slice(std::move(slice)) - , row_limit(row_limit) + , row_limit(static_cast(row_limit)) , timestamp(now) , trace_info(std::move(ti)) - , partition_limit(partition_limit) + , partition_limit(static_cast(partition_limit)) , query_uuid(query_uuid) , is_first_page(is_first_page) , read_timestamp(rt) diff --git a/service/storage_proxy.cc b/service/storage_proxy.cc index e1ebab1a04..eb4af0bb14 100644 --- a/service/storage_proxy.cc +++ b/service/storage_proxy.cc @@ -4345,7 +4345,7 @@ static lw_shared_ptr read_nothing_read_command(schema_ptr s // Note that because this read-nothing command has an empty slice, // storage_proxy::query() returns immediately - without any networking. 
auto partition_slice = query::partition_slice({}, {}, {}, query::partition_slice::option_set()); - return ::make_lw_shared(schema->id(), schema->version(), partition_slice, query::max_partitions); + return ::make_lw_shared(schema->id(), schema->version(), partition_slice); } static read_timeout_exception write_timeout_to_read(schema_ptr s, mutation_write_timeout_exception& ex) { diff --git a/test/boost/database_test.cc b/test/boost/database_test.cc index 8eef7bfbe1..faaae1b1ef 100644 --- a/test/boost/database_test.cc +++ b/test/boost/database_test.cc @@ -62,7 +62,7 @@ SEASTAR_TEST_CASE(test_safety_after_truncate) { auto assert_query_result = [&] (size_t expected_size) { auto max_size = std::numeric_limits::max(); - auto cmd = query::read_command(s->id(), s->version(), partition_slice_builder(*s).build(), 1000); + auto cmd = query::read_command(s->id(), s->version(), partition_slice_builder(*s).build(), query::row_limit(1000)); auto&& [result, cache_tempature] = db.query(s, cmd, query::result_options::only_result(), pranges, nullptr, max_size, db::no_timeout).get0(); assert_that(query::result_set::from_raw_result(s, cmd.slice, *result)).has_size(expected_size); }; @@ -105,21 +105,21 @@ SEASTAR_TEST_CASE(test_querying_with_limits) { auto max_size = std::numeric_limits::max(); { - auto cmd = query::read_command(s->id(), s->version(), partition_slice_builder(*s).build(), 3); + auto cmd = query::read_command(s->id(), s->version(), partition_slice_builder(*s).build(), query::row_limit(3)); auto result = std::get<0>(db.query(s, cmd, query::result_options::only_result(), pranges, nullptr, max_size, db::no_timeout).get0()); assert_that(query::result_set::from_raw_result(s, cmd.slice, *result)).has_size(3); } { auto cmd = query::read_command(s->id(), s->version(), partition_slice_builder(*s).build(), - query::max_rows, 5); + query::row_limit(query::max_rows), query::partition_limit(5)); auto result = std::get<0>(db.query(s, cmd, query::result_options::only_result(), pranges, 
nullptr, max_size, db::no_timeout).get0()); assert_that(query::result_set::from_raw_result(s, cmd.slice, *result)).has_size(5); } { auto cmd = query::read_command(s->id(), s->version(), partition_slice_builder(*s).build(), - query::max_rows, 3); + query::row_limit(query::max_rows), query::partition_limit(3)); auto result = std::get<0>(db.query(s, cmd, query::result_options::only_result(), pranges, nullptr, max_size, db::no_timeout).get0()); assert_that(query::result_set::from_raw_result(s, cmd.slice, *result)).has_size(3); } diff --git a/test/boost/multishard_mutation_query_test.cc b/test/boost/multishard_mutation_query_test.cc index db345b123b..768fb85ac9 100644 --- a/test/boost/multishard_mutation_query_test.cc +++ b/test/boost/multishard_mutation_query_test.cc @@ -104,7 +104,7 @@ static std::vector read_all_partitions_one_by_one(distributedid(), s->version(), s->full_slice(), query::max_rows); + const auto cmd = query::read_command(s->id(), s->version(), s->full_slice()); const auto range = dht::partition_range::make_singular(pkey); return make_foreign(std::make_unique( std::get<0>(db.query_mutations(std::move(s), cmd, range, std::numeric_limits::max(), nullptr, db::no_timeout).get0()))); diff --git a/thrift/handler.cc b/thrift/handler.cc index 8d28bc08d7..f0625010a2 100644 --- a/thrift/handler.cc +++ b/thrift/handler.cc @@ -394,7 +394,7 @@ public: clustering_ranges.emplace_back(query::clustering_range::make_open_ended_both_sides()); auto slice = query::partition_slice(std::move(clustering_ranges), { }, std::move(regular_columns), opts, std::move(specific_ranges), cql_serialization_format::internal()); - return make_lw_shared(s.id(), s.version(), std::move(slice), row_limit, partition_limit); + return make_lw_shared(s.id(), s.version(), std::move(slice), query::row_limit(row_limit), query::partition_limit(partition_limit)); } static future<> do_get_paged_slice( @@ -664,7 +664,7 @@ public: } } auto slice = query::partition_slice(std::move(clustering_ranges), {}, 
std::move(regular_columns), opts, nullptr); - auto cmd = make_lw_shared(schema->id(), schema->version(), std::move(slice), row_limit); + auto cmd = make_lw_shared(schema->id(), schema->version(), std::move(slice), query::row_limit(row_limit)); auto f = _query_state.get_client_state().has_schema_access(*schema, auth::permission::SELECT); return f.then([this, dk = std::move(dk), cmd, schema, column_limit = request.count, cl = request.consistency_level] { auto timeout = db::timeout_clock::now() + _timeout_config.read_timeout; From 92a7b16cba437501a02394e19c58f34f50ffa0be Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Botond=20D=C3=A9nes?= Date: Tue, 9 Jun 2020 10:34:59 +0300 Subject: [PATCH 16/26] query: read_command: add max_result_size This field will replace max size which is currently passed once per established rpc connection via the CLIENT_ID verb and stored as an auxiliary value on the client_info. For now it is unused, but we update all sites creating a read command to pass the correct value to it. In the next patch we will phase out the old max size and use this field to pass max size on each verb instead. 
--- alternator/executor.cc | 22 +++++----- alternator/streams.cc | 3 +- cdc/log.cc | 3 +- cql3/statements/batch_statement.cc | 2 +- cql3/statements/cas_request.cc | 4 +- cql3/statements/cas_request.hh | 2 +- cql3/statements/modification_statement.cc | 9 ++-- cql3/statements/modification_statement.hh | 2 +- cql3/statements/select_statement.cc | 30 ++++++++------ cql3/statements/select_statement.hh | 3 +- db/schema_tables.cc | 6 ++- db/system_keyspace.cc | 6 +-- idl/read_command.idl.hh | 6 +++ query-request.hh | 11 ++++- redis/query_utils.cc | 3 +- service/pager/query_pagers.cc | 7 +++- service/storage_proxy.cc | 3 +- test/boost/database_test.cc | 8 ++-- test/boost/multishard_mutation_query_test.cc | 8 ++-- test/boost/querier_cache_test.cc | 6 ++- thrift/handler.cc | 43 ++++++++++++-------- 21 files changed, 117 insertions(+), 70 deletions(-) diff --git a/alternator/executor.cc b/alternator/executor.cc index 8b8b5dc1a0..b65d0be76b 100644 --- a/alternator/executor.cc +++ b/alternator/executor.cc @@ -1225,7 +1225,8 @@ static future> get_previous_item( service_permit permit, alternator::stats& stats); -static lw_shared_ptr previous_item_read_command(schema_ptr schema, +static lw_shared_ptr previous_item_read_command(service::storage_proxy& proxy, + schema_ptr schema, const clustering_key& ck, shared_ptr selection) { std::vector bounds; @@ -1240,7 +1241,7 @@ static lw_shared_ptr previous_item_read_command(schema_ptr auto regular_columns = boost::copy_range( schema->regular_columns() | boost::adaptors::transformed([] (const column_definition& cdef) { return cdef.id; })); auto partition_slice = query::partition_slice(std::move(bounds), {}, std::move(regular_columns), selection->get_query_options()); - return ::make_lw_shared(schema->id(), schema->version(), partition_slice); + return ::make_lw_shared(schema->id(), schema->version(), partition_slice, proxy.get_max_result_size(partition_slice)); } static dht::partition_range_vector to_partition_ranges(const schema& schema, 
const partition_key& pk) { @@ -1354,7 +1355,7 @@ static future> get_previous_item( { stats.reads_before_write++; auto selection = cql3::selection::selection::wildcard(schema); - auto command = previous_item_read_command(schema, ck, selection); + auto command = previous_item_read_command(proxy, schema, ck, selection); auto cl = db::consistency_level::LOCAL_QUORUM; return proxy.query(schema, command, to_partition_ranges(*schema, pk), cl, service::storage_proxy::coordinator_query_options(executor::default_timeout(), std::move(permit), client_state)).then( @@ -1405,7 +1406,7 @@ future rmw_operation::execute(service::storage_pr auto timeout = executor::default_timeout(); auto selection = cql3::selection::selection::wildcard(schema()); auto read_command = needs_read_before_write ? - previous_item_read_command(schema(), _ck, selection) : + previous_item_read_command(proxy, schema(), _ck, selection) : nullptr; return proxy.cas(schema(), shared_from_this(), read_command, to_partition_ranges(*schema(), _pk), {timeout, std::move(permit), client_state, trace_state}, @@ -2405,7 +2406,7 @@ future executor::get_item(client_state& client_st auto selection = cql3::selection::selection::wildcard(schema); auto partition_slice = query::partition_slice(std::move(bounds), {}, std::move(regular_columns), selection->get_query_options()); - auto command = ::make_lw_shared(schema->id(), schema->version(), partition_slice); + auto command = ::make_lw_shared(schema->id(), schema->version(), partition_slice, _proxy.get_max_result_size(partition_slice)); std::unordered_set used_attribute_names; auto attrs_to_get = calculate_attrs_to_get(request, used_attribute_names); @@ -2475,7 +2476,7 @@ future executor::batch_get_item(client_state& cli rs.schema->regular_columns() | boost::adaptors::transformed([] (const column_definition& cdef) { return cdef.id; })); auto selection = cql3::selection::selection::wildcard(rs.schema); auto partition_slice = query::partition_slice(std::move(bounds), {}, 
std::move(regular_columns), selection->get_query_options()); - auto command = ::make_lw_shared(rs.schema->id(), rs.schema->version(), partition_slice); + auto command = ::make_lw_shared(rs.schema->id(), rs.schema->version(), partition_slice, _proxy.get_max_result_size(partition_slice)); future>> f = _proxy.query(rs.schema, std::move(command), std::move(partition_ranges), rs.cl, service::storage_proxy::coordinator_query_options(executor::default_timeout(), permit, client_state, trace_state)).then( [schema = rs.schema, partition_slice = std::move(partition_slice), selection = std::move(selection), attrs_to_get = rs.attrs_to_get] (service::storage_proxy::coordinator_query_result qr) mutable { @@ -2728,7 +2729,8 @@ static rjson::value encode_paging_state(const schema& schema, const service::pag return last_evaluated_key; } -static future do_query(schema_ptr schema, +static future do_query(service::storage_proxy& proxy, + schema_ptr schema, const rjson::value* exclusive_start_key, dht::partition_range_vector&& partition_ranges, std::vector&& ck_bounds, @@ -2762,7 +2764,7 @@ static future do_query(schema_ptr schema, query::partition_slice::option_set opts = selection->get_query_options(); opts.add(custom_opts); auto partition_slice = query::partition_slice(std::move(ck_bounds), std::move(static_columns), std::move(regular_columns), opts); - auto command = ::make_lw_shared(schema->id(), schema->version(), partition_slice); + auto command = ::make_lw_shared(schema->id(), schema->version(), partition_slice, proxy.get_max_result_size(partition_slice)); auto query_state_ptr = std::make_unique(client_state, trace_state, std::move(permit)); @@ -2883,7 +2885,7 @@ future executor::scan(client_state& client_state, verify_all_are_used(request, "ExpressionAttributeNames", used_attribute_names, "Scan"); verify_all_are_used(request, "ExpressionAttributeValues", used_attribute_values, "Scan"); - return do_query(schema, exclusive_start_key, std::move(partition_ranges), 
std::move(ck_bounds), std::move(attrs_to_get), limit, cl, + return do_query(_proxy, schema, exclusive_start_key, std::move(partition_ranges), std::move(ck_bounds), std::move(attrs_to_get), limit, cl, std::move(filter), query::partition_slice::option_set(), client_state, _stats.cql_stats, trace_state, std::move(permit)); } @@ -3357,7 +3359,7 @@ future executor::query(client_state& client_state verify_all_are_used(request, "ExpressionAttributeNames", used_attribute_names, "Query"); query::partition_slice::option_set opts; opts.set_if(!forward); - return do_query(schema, exclusive_start_key, std::move(partition_ranges), std::move(ck_bounds), std::move(attrs_to_get), limit, cl, + return do_query(_proxy, schema, exclusive_start_key, std::move(partition_ranges), std::move(ck_bounds), std::move(attrs_to_get), limit, cl, std::move(filter), opts, client_state, _stats.cql_stats, std::move(trace_state), std::move(permit)); } diff --git a/alternator/streams.cc b/alternator/streams.cc index d00d3d93b9..25c854421a 100644 --- a/alternator/streams.cc +++ b/alternator/streams.cc @@ -717,7 +717,8 @@ future executor::get_records(client_state& client auto partition_slice = query::partition_slice( std::move(bounds) , {}, std::move(regular_columns), selection->get_query_options()); - auto command = ::make_lw_shared(schema->id(), schema->version(), partition_slice, query::row_limit(limit * 4)); + auto command = ::make_lw_shared(schema->id(), schema->version(), partition_slice, _proxy.get_max_result_size(partition_slice), + query::row_limit(limit * 4)); return _proxy.query(schema, std::move(command), std::move(partition_ranges), cl, service::storage_proxy::coordinator_query_options(default_timeout(), std::move(permit), client_state)).then( [this, schema, partition_slice = std::move(partition_slice), selection = std::move(selection), start_time = std::move(start_time), limit, key_names = std::move(key_names), attr_names = std::move(attr_names), type, iter, high_ts] 
(service::storage_proxy::coordinator_query_result qr) mutable { diff --git a/cdc/log.cc b/cdc/log.cc index 806dfadaf6..2ae0530b52 100644 --- a/cdc/log.cc +++ b/cdc/log.cc @@ -1356,7 +1356,8 @@ public: opts.set_if(!p.static_row().empty()); auto partition_slice = query::partition_slice(std::move(bounds), std::move(static_columns), std::move(regular_columns), std::move(opts)); - auto command = ::make_lw_shared(_schema->id(), _schema->version(), partition_slice, query::row_limit(row_limit)); + const auto max_result_size = _ctx._proxy.get_max_result_size(partition_slice); + auto command = ::make_lw_shared(_schema->id(), _schema->version(), partition_slice, query::max_result_size(max_result_size), query::row_limit(row_limit)); const auto select_cl = adjust_cl(write_cl); diff --git a/cql3/statements/batch_statement.cc b/cql3/statements/batch_statement.cc index 63f7998a86..bbf6212ad1 100644 --- a/cql3/statements/batch_statement.cc +++ b/cql3/statements/batch_statement.cc @@ -385,7 +385,7 @@ future> batch_statement::exe make_shared(shard)); } - return proxy.cas(schema, request, request->read_command(), request->key(), + return proxy.cas(schema, request, request->read_command(proxy), request->key(), {read_timeout, qs.get_permit(), qs.get_client_state(), qs.get_trace_state()}, cl_for_paxos, cl_for_learn, batch_timeout, cas_timeout).then([this, request] (bool is_applied) { return modification_statement::build_cas_result_set(_metadata, _columns_of_cas_result_set, is_applied, request->rows()); diff --git a/cql3/statements/cas_request.cc b/cql3/statements/cas_request.cc index f597bad302..058b312ad4 100644 --- a/cql3/statements/cas_request.cc +++ b/cql3/statements/cas_request.cc @@ -81,7 +81,7 @@ std::optional cas_request::apply_updates(api::timestamp_type ts) const return mutation_set; } -lw_shared_ptr cas_request::read_command() const { +lw_shared_ptr cas_request::read_command(service::storage_proxy& proxy) const { column_set columns_to_read(_schema->all_columns_count()); 
std::vector ranges; @@ -116,7 +116,7 @@ lw_shared_ptr cas_request::read_command() const { options.set(query::partition_slice::option::always_return_static_content); query::partition_slice ps(std::move(ranges), *_schema, columns_to_read, options); ps.set_partition_row_limit(max_rows); - return make_lw_shared(_schema->id(), _schema->version(), std::move(ps)); + return make_lw_shared(_schema->id(), _schema->version(), std::move(ps), proxy.get_max_result_size(ps)); } bool cas_request::applies_to() const { diff --git a/cql3/statements/cas_request.hh b/cql3/statements/cas_request.hh index 48246f83e4..5f254b6d20 100644 --- a/cql3/statements/cas_request.hh +++ b/cql3/statements/cas_request.hh @@ -90,7 +90,7 @@ public: return _rows; } - lw_shared_ptr read_command() const; + lw_shared_ptr read_command(service::storage_proxy& proxy) const; void add_row_update(const modification_statement& stmt_arg, std::vector ranges_arg, modification_statement::json_cache_opt json_cache_arg, const query_options& options_arg); diff --git a/cql3/statements/modification_statement.cc b/cql3/statements/modification_statement.cc index c0a062cc5d..bdcc13f0fb 100644 --- a/cql3/statements/modification_statement.cc +++ b/cql3/statements/modification_statement.cc @@ -168,7 +168,7 @@ modification_statement::get_mutations(service::storage_proxy& proxy, const query } if (requires_read()) { - lw_shared_ptr cmd = read_command(ranges, cl); + lw_shared_ptr cmd = read_command(proxy, ranges, cl); // FIXME: ignoring "local" f = proxy.query(s, cmd, dht::partition_range_vector(keys), cl, {timeout, qs.get_permit(), qs.get_client_state(), qs.get_trace_state()}).then( @@ -246,14 +246,15 @@ std::vector modification_statement::apply_updates( } lw_shared_ptr -modification_statement::read_command(query::clustering_row_ranges ranges, db::consistency_level cl) const { +modification_statement::read_command(service::storage_proxy& proxy, query::clustering_row_ranges ranges, db::consistency_level cl) const { try { 
validate_for_read(cl); } catch (exceptions::invalid_request_exception& e) { throw exceptions::invalid_request_exception(format("Write operation require a read but consistency {} is not supported on reads", cl)); } query::partition_slice ps(std::move(ranges), *s, columns_to_read(), update_parameters::options); - return make_lw_shared(s->id(), s->version(), std::move(ps)); + const auto max_result_size = proxy.get_max_result_size(ps); + return make_lw_shared(s->id(), s->version(), std::move(ps), query::max_result_size(max_result_size)); } std::vector @@ -351,7 +352,7 @@ modification_statement::execute_with_condition(service::storage_proxy& proxy, se make_shared(shard)); } - return proxy.cas(s, request, request->read_command(), request->key(), + return proxy.cas(s, request, request->read_command(proxy), request->key(), {read_timeout, qs.get_permit(), qs.get_client_state(), qs.get_trace_state()}, cl_for_paxos, cl_for_learn, statement_timeout, cas_timeout).then([this, request] (bool is_applied) { return build_cas_result_set(_metadata, _columns_of_cas_result_set, is_applied, request->rows()); diff --git a/cql3/statements/modification_statement.hh b/cql3/statements/modification_statement.hh index de5dfeaa6e..435e9ce863 100644 --- a/cql3/statements/modification_statement.hh +++ b/cql3/statements/modification_statement.hh @@ -242,7 +242,7 @@ public: // Build a read_command instance to fetch the previous mutation from storage. The mutation is // fetched if we need to check LWT conditions or apply updates to non-frozen list elements. - lw_shared_ptr read_command(query::clustering_row_ranges ranges, db::consistency_level cl) const; + lw_shared_ptr read_command(service::storage_proxy& proxy, query::clustering_row_ranges ranges, db::consistency_level cl) const; // Create a mutation object for the update operation represented by this modification statement. 
// A single mutation object for lightweight transactions, which can only span one partition, or a vector // of mutations, one per partition key, for statements which affect multiple partition keys, diff --git a/cql3/statements/select_statement.cc b/cql3/statements/select_statement.cc index 8af941c7c0..64185cc6be 100644 --- a/cql3/statements/select_statement.cc +++ b/cql3/statements/select_statement.cc @@ -319,10 +319,12 @@ select_statement::do_execute(service::storage_proxy& proxy, _stats.select_partition_range_scan += _range_scan; _stats.select_partition_range_scan_no_bypass_cache += _range_scan_no_bypass_cache; + auto slice = make_partition_slice(options); auto command = ::make_lw_shared( _schema->id(), _schema->version(), - make_partition_slice(options), + std::move(slice), + proxy.get_max_result_size(slice), query::row_limit(limit), query::partition_limit(query::max_partitions), now, @@ -480,11 +482,21 @@ generate_base_key_from_index_pk(const partition_key& index_pk, const std::option } lw_shared_ptr -indexed_table_select_statement::prepare_command_for_base_query(const query_options& options, service::query_state& state, gc_clock::time_point now, bool use_paging) const { +indexed_table_select_statement::prepare_command_for_base_query(service::storage_proxy& proxy, const query_options& options, + service::query_state& state, gc_clock::time_point now, bool use_paging) const { + auto slice = make_partition_slice(options); + if (use_paging) { + slice.options.set(); + slice.options.set(); + if (_schema->clustering_key_size() > 0) { + slice.options.set(); + } + } lw_shared_ptr cmd = ::make_lw_shared( _schema->id(), _schema->version(), - make_partition_slice(options), + std::move(slice), + proxy.get_max_result_size(slice), query::row_limit(get_limit(options)), query::partition_limit(query::max_partitions), now, @@ -492,13 +504,6 @@ indexed_table_select_statement::prepare_command_for_base_query(const query_optio utils::UUID(), query::is_first_page::no, 
options.get_timestamp(state)); - if (use_paging) { - cmd->slice.options.set(); - cmd->slice.options.set(); - if (_schema->clustering_key_size() > 0) { - cmd->slice.options.set(); - } - } return cmd; } @@ -511,7 +516,7 @@ indexed_table_select_statement::do_execute_base_query( gc_clock::time_point now, lw_shared_ptr paging_state) const { using value_type = std::tuple>, lw_shared_ptr>; - auto cmd = prepare_command_for_base_query(options, state, now, bool(paging_state)); + auto cmd = prepare_command_for_base_query(proxy, options, state, now, bool(paging_state)); auto timeout = db::timeout_clock::now() + options.get_timeout_config().*get_timeout_config_selector(); uint32_t queried_ranges_count = partition_ranges.size(); service::query_ranges_to_vnodes_generator ranges_to_vnodes(proxy.get_token_metadata(), _schema, std::move(partition_ranges)); @@ -587,7 +592,7 @@ indexed_table_select_statement::do_execute_base_query( gc_clock::time_point now, lw_shared_ptr paging_state) const { using value_type = std::tuple>, lw_shared_ptr>; - auto cmd = prepare_command_for_base_query(options, state, now, bool(paging_state)); + auto cmd = prepare_command_for_base_query(proxy, options, state, now, bool(paging_state)); auto timeout = db::timeout_clock::now() + options.get_timeout_config().*get_timeout_config_selector(); struct base_query_state { @@ -1155,6 +1160,7 @@ indexed_table_select_statement::read_posting_list(service::storage_proxy& proxy, _view_schema->id(), _view_schema->version(), partition_slice, + proxy.get_max_result_size(partition_slice), query::row_limit(limit), query::partition_limit(query::max_partitions), now, diff --git a/cql3/statements/select_statement.hh b/cql3/statements/select_statement.hh index 800457cab1..e82ed58c99 100644 --- a/cql3/statements/select_statement.hh +++ b/cql3/statements/select_statement.hh @@ -237,7 +237,8 @@ private: lw_shared_ptr paging_state) const; lw_shared_ptr - prepare_command_for_base_query(const query_options& options, 
service::query_state& state, gc_clock::time_point now, bool use_paging) const; + prepare_command_for_base_query(service::storage_proxy& proxy, const query_options& options, service::query_state& state, gc_clock::time_point now, + bool use_paging) const; future>, lw_shared_ptr>> do_execute_base_query( diff --git a/db/schema_tables.cc b/db/schema_tables.cc index 68157acb46..cb3d5dbf03 100644 --- a/db/schema_tables.cc +++ b/db/schema_tables.cc @@ -778,7 +778,8 @@ read_schema_partition_for_table(distributed& proxy, sche auto slice = partition_slice_builder(*schema) .with_range(std::move(clustering_range)) .build(); - auto cmd = make_lw_shared(schema->id(), schema->version(), std::move(slice), query::row_limit(query::max_rows)); + auto cmd = make_lw_shared(schema->id(), schema->version(), std::move(slice), proxy.local().get_max_result_size(slice), + query::row_limit(query::max_rows)); return query_partition_mutation(proxy.local(), std::move(schema), std::move(cmd), std::move(keyspace_key)).then([&proxy] (mutation mut) { return redact_columns_for_missing_features(std::move(mut), proxy.local().get_db().local().features().cluster_schema_features()); }); @@ -788,7 +789,8 @@ future read_keyspace_mutation(distributed& proxy, const sstring& keyspace_name) { schema_ptr s = keyspaces(); auto key = partition_key::from_singular(*s, keyspace_name); - auto cmd = make_lw_shared(s->id(), s->version(), s->full_slice()); + auto slice = s->full_slice(); + auto cmd = make_lw_shared(s->id(), s->version(), std::move(slice), proxy.local().get_max_result_size(slice)); return query_partition_mutation(proxy.local(), std::move(s), std::move(cmd), std::move(key)); } diff --git a/db/system_keyspace.cc b/db/system_keyspace.cc index 41e443f8f2..026f930afd 100644 --- a/db/system_keyspace.cc +++ b/db/system_keyspace.cc @@ -1984,7 +1984,7 @@ query_mutations(distributed& proxy, const sstring& ks_na database& db = proxy.local().get_db().local(); schema_ptr schema = db.find_schema(ks_name, cf_name); auto 
slice = partition_slice_builder(*schema).build(); - auto cmd = make_lw_shared(schema->id(), schema->version(), std::move(slice)); + auto cmd = make_lw_shared(schema->id(), schema->version(), std::move(slice), proxy.local().get_max_result_size(slice)); return proxy.local().query_mutations_locally(std::move(schema), std::move(cmd), query::full_partition_range, db::no_timeout) .then([] (rpc::tuple>, cache_temperature> rr_ht) { return std::get<0>(std::move(rr_ht)); }); } @@ -1994,7 +1994,7 @@ query(distributed& proxy, const sstring& ks_name, const database& db = proxy.local().get_db().local(); schema_ptr schema = db.find_schema(ks_name, cf_name); auto slice = partition_slice_builder(*schema).build(); - auto cmd = make_lw_shared(schema->id(), schema->version(), std::move(slice)); + auto cmd = make_lw_shared(schema->id(), schema->version(), std::move(slice), proxy.local().get_max_result_size(slice)); return proxy.local().query(schema, cmd, {query::full_partition_range}, db::consistency_level::ONE, {db::no_timeout, empty_service_permit(), service::client_state::for_internal_calls(), nullptr}).then([schema, cmd] (auto&& qr) { return make_lw_shared(query::result_set::from_raw_result(schema, cmd->slice, *qr.query_result)); @@ -2009,7 +2009,7 @@ query(distributed& proxy, const sstring& ks_name, const auto slice = partition_slice_builder(*schema) .with_range(std::move(row_range)) .build(); - auto cmd = make_lw_shared(schema->id(), schema->version(), std::move(slice)); + auto cmd = make_lw_shared(schema->id(), schema->version(), std::move(slice), proxy.local().get_max_result_size(slice)); return proxy.local().query(schema, cmd, {dht::partition_range::make_singular(key)}, db::consistency_level::ONE, {db::no_timeout, empty_service_permit(), service::client_state::for_internal_calls(), nullptr}).then([schema, cmd] (auto&& qr) { diff --git a/idl/read_command.idl.hh b/idl/read_command.idl.hh index 29d8a4bcb5..50db525cd1 100644 --- a/idl/read_command.idl.hh +++ 
b/idl/read_command.idl.hh @@ -40,6 +40,11 @@ class partition_slice { uint32_t partition_row_limit() [[version 1.3]] = std::numeric_limits::max(); }; +struct max_result_size { + uint64_t soft_limit; + uint64_t hard_limit; +} + class read_command { utils::UUID cf_id; utils::UUID schema_version; @@ -50,6 +55,7 @@ class read_command { uint32_t partition_limit [[version 1.3]] = std::numeric_limits::max(); utils::UUID query_uuid [[version 2.2]] = utils::UUID(); query::is_first_page is_first_page [[version 2.2]] = query::is_first_page::no; + std::optional max_result_size [[version 4.3]] = std::nullopt; }; } diff --git a/query-request.hh b/query-request.hh index 8cd7505180..e0d45fd9a7 100644 --- a/query-request.hh +++ b/query-request.hh @@ -30,6 +30,7 @@ #include "range.hh" #include "tracing/tracing.hh" #include "utils/small_vector.hh" +#include "query_class_config.hh" class position_in_partition_view; @@ -237,6 +238,10 @@ public: // to avoid doing work normally done on paged requests, e.g. attempting to // reused suspended readers. query::is_first_page is_first_page; + // The maximum size of the query result, for all queries. + // We use the entire value range, so we need an optional for the case when + // the remote doesn't send it. 
+ std::optional max_result_size; api::timestamp_type read_timestamp; // not serialized public: // IDL constructor @@ -248,7 +253,8 @@ public: std::optional ti, uint32_t partition_limit, utils::UUID query_uuid, - query::is_first_page is_first_page) + query::is_first_page is_first_page, + std::optional max_result_size) : cf_id(std::move(cf_id)) , schema_version(std::move(schema_version)) , slice(std::move(slice)) @@ -258,12 +264,14 @@ public: , partition_limit(partition_limit) , query_uuid(query_uuid) , is_first_page(is_first_page) + , max_result_size(max_result_size) , read_timestamp(api::new_timestamp()) { } read_command(utils::UUID cf_id, table_schema_version schema_version, partition_slice slice, + query::max_result_size max_result_size, query::row_limit row_limit = query::row_limit::max, query::partition_limit partition_limit = query::partition_limit::max, gc_clock::time_point now = gc_clock::now(), @@ -280,6 +288,7 @@ public: , partition_limit(static_cast(partition_limit)) , query_uuid(query_uuid) , is_first_page(is_first_page) + , max_result_size(max_result_size) , read_timestamp(rt) { } diff --git a/redis/query_utils.cc b/redis/query_utils.cc index c6fd7b73d7..e77d21824e 100644 --- a/redis/query_utils.cc +++ b/redis/query_utils.cc @@ -75,7 +75,8 @@ public: future> read_strings(service::storage_proxy& proxy, const redis_options& options, const bytes& key, service_permit permit) { auto schema = get_schema(proxy, options.get_keyspace_name(), redis::STRINGs); auto ps = partition_slice_builder(*schema).build(); - query::read_command cmd(schema->id(), schema->version(), ps, 1, gc_clock::now(), std::nullopt, 1, utils::UUID(), query::is_first_page::no); + const auto max_result_size = proxy.get_max_result_size(ps); + query::read_command cmd(schema->id(), schema->version(), ps, 1, gc_clock::now(), std::nullopt, 1, utils::UUID(), query::is_first_page::no, max_result_size); auto pkey = partition_key::from_single_value(*schema, key); auto partition_range = 
dht::partition_range::make_singular(dht::decorate_key(*schema, std::move(pkey))); dht::partition_range_vector partition_ranges; diff --git a/service/pager/query_pagers.cc b/service/pager/query_pagers.cc index a2b603cb50..17811f934c 100644 --- a/service/pager/query_pagers.cc +++ b/service/pager/query_pagers.cc @@ -84,9 +84,14 @@ static bool has_clustering_keys(const schema& s, const query::read_command& cmd) future query_pager::do_fetch_page(uint32_t page_size, gc_clock::time_point now, db::timeout_clock::time_point timeout) { auto state = _options.get_paging_state(); + auto& proxy = get_local_storage_proxy(); + // Most callers should set this but we want to make sure, as results // won't be paged without it. _cmd->slice.options.set(); + // Override this, to make sure we use the value appropriate for paging + // (with allow_short_read set). + _cmd->max_result_size = proxy.get_max_result_size(_cmd->slice); if (!_last_pkey && state) { _max = state->get_remaining(); @@ -191,7 +196,7 @@ static bool has_clustering_keys(const schema& s, const query::read_command& cmd) auto ranges = _ranges; auto command = ::make_lw_shared(*_cmd); - return get_local_storage_proxy().query(_schema, + return proxy.query(_schema, std::move(command), std::move(ranges), _options.get_consistency(), diff --git a/service/storage_proxy.cc b/service/storage_proxy.cc index eb4af0bb14..b6554d7959 100644 --- a/service/storage_proxy.cc +++ b/service/storage_proxy.cc @@ -4345,7 +4345,8 @@ static lw_shared_ptr read_nothing_read_command(schema_ptr s // Note that because this read-nothing command has an empty slice, // storage_proxy::query() returns immediately - without any networking. 
auto partition_slice = query::partition_slice({}, {}, {}, query::partition_slice::option_set()); - return ::make_lw_shared(schema->id(), schema->version(), partition_slice); + return ::make_lw_shared(schema->id(), schema->version(), partition_slice, + query::max_result_size(query::result_memory_limiter::unlimited_result_size)); } static read_timeout_exception write_timeout_to_read(schema_ptr s, mutation_write_timeout_exception& ex) { diff --git a/test/boost/database_test.cc b/test/boost/database_test.cc index faaae1b1ef..a2ef47712e 100644 --- a/test/boost/database_test.cc +++ b/test/boost/database_test.cc @@ -62,7 +62,7 @@ SEASTAR_TEST_CASE(test_safety_after_truncate) { auto assert_query_result = [&] (size_t expected_size) { auto max_size = std::numeric_limits::max(); - auto cmd = query::read_command(s->id(), s->version(), partition_slice_builder(*s).build(), query::row_limit(1000)); + auto cmd = query::read_command(s->id(), s->version(), partition_slice_builder(*s).build(), query::max_result_size(max_size), query::row_limit(1000)); auto&& [result, cache_tempature] = db.query(s, cmd, query::result_options::only_result(), pranges, nullptr, max_size, db::no_timeout).get0(); assert_that(query::result_set::from_raw_result(s, cmd.slice, *result)).has_size(expected_size); }; @@ -105,20 +105,20 @@ SEASTAR_TEST_CASE(test_querying_with_limits) { auto max_size = std::numeric_limits::max(); { - auto cmd = query::read_command(s->id(), s->version(), partition_slice_builder(*s).build(), query::row_limit(3)); + auto cmd = query::read_command(s->id(), s->version(), partition_slice_builder(*s).build(), query::max_result_size(max_size), query::row_limit(3)); auto result = std::get<0>(db.query(s, cmd, query::result_options::only_result(), pranges, nullptr, max_size, db::no_timeout).get0()); assert_that(query::result_set::from_raw_result(s, cmd.slice, *result)).has_size(3); } { - auto cmd = query::read_command(s->id(), s->version(), partition_slice_builder(*s).build(), + auto cmd = 
query::read_command(s->id(), s->version(), partition_slice_builder(*s).build(), query::max_result_size(max_size), query::row_limit(query::max_rows), query::partition_limit(5)); auto result = std::get<0>(db.query(s, cmd, query::result_options::only_result(), pranges, nullptr, max_size, db::no_timeout).get0()); assert_that(query::result_set::from_raw_result(s, cmd.slice, *result)).has_size(5); } { - auto cmd = query::read_command(s->id(), s->version(), partition_slice_builder(*s).build(), + auto cmd = query::read_command(s->id(), s->version(), partition_slice_builder(*s).build(), query::max_result_size(max_size), query::row_limit(query::max_rows), query::partition_limit(3)); auto result = std::get<0>(db.query(s, cmd, query::result_options::only_result(), pranges, nullptr, max_size, db::no_timeout).get0()); assert_that(query::result_set::from_raw_result(s, cmd.slice, *result)).has_size(3); diff --git a/test/boost/multishard_mutation_query_test.cc b/test/boost/multishard_mutation_query_test.cc index 768fb85ac9..a4b3651797 100644 --- a/test/boost/multishard_mutation_query_test.cc +++ b/test/boost/multishard_mutation_query_test.cc @@ -82,7 +82,7 @@ SEASTAR_THREAD_TEST_CASE(test_abandoned_read) { (void)_; auto cmd = query::read_command(s->id(), s->version(), s->full_slice(), 7, gc_clock::now(), std::nullopt, query::max_partitions, - utils::make_random_uuid(), query::is_first_page::yes); + utils::make_random_uuid(), query::is_first_page::yes, query::max_result_size(query::result_memory_limiter::unlimited_result_size)); query_mutations_on_all_shards(env.db(), s, cmd, {query::full_partition_range}, nullptr, std::numeric_limits::max(), db::no_timeout).get(); @@ -104,7 +104,8 @@ static std::vector read_all_partitions_one_by_one(distributedid(), s->version(), s->full_slice()); + const auto cmd = query::read_command(s->id(), s->version(), s->full_slice(), + query::max_result_size(query::result_memory_limiter::unlimited_result_size)); const auto range = 
dht::partition_range::make_singular(pkey); return make_foreign(std::make_unique( std::get<0>(db.query_mutations(std::move(s), cmd, range, std::numeric_limits::max(), nullptr, db::no_timeout).get0()))); @@ -125,7 +126,8 @@ read_partitions_with_paged_scan(distributed& db, schema_ptr s, uint32_ const dht::partition_range& range, const query::partition_slice& slice, const std::function& page_hook = {}) { const auto query_uuid = is_stateful ? utils::make_random_uuid() : utils::UUID{}; std::vector results; - auto cmd = query::read_command(s->id(), s->version(), slice, page_size, gc_clock::now(), std::nullopt, query::max_partitions, query_uuid, query::is_first_page::yes); + auto cmd = query::read_command(s->id(), s->version(), slice, page_size, gc_clock::now(), std::nullopt, query::max_partitions, query_uuid, + query::is_first_page::yes, query::max_result_size(max_size)); bool has_more = true; diff --git a/test/boost/querier_cache_test.cc b/test/boost/querier_cache_test.cc index cc5ef71e14..93eeff994a 100644 --- a/test/boost/querier_cache_test.cc +++ b/test/boost/querier_cache_test.cc @@ -667,7 +667,8 @@ SEASTAR_THREAD_TEST_CASE(test_resources_based_cache_eviction) { std::nullopt, 1, utils::make_random_uuid(), - query::is_first_page::yes); + query::is_first_page::yes, + query::max_result_size(1024 * 1024)); // Should save the querier in cache. db.query_mutations(s, @@ -702,7 +703,8 @@ SEASTAR_THREAD_TEST_CASE(test_resources_based_cache_eviction) { std::nullopt, 1, utils::make_random_uuid(), - query::is_first_page::no); + query::is_first_page::no, + query::max_result_size(1024 * 1024)); // Should evict the already cached querier. 
db.query_mutations(s, diff --git a/thrift/handler.cc b/thrift/handler.cc index f0625010a2..2fea15f932 100644 --- a/thrift/handler.cc +++ b/thrift/handler.cc @@ -281,13 +281,14 @@ public: if (!column_parent.super_column.empty()) { fail(unimplemented::cause::SUPER); } - auto cmd = slice_pred_to_read_cmd(*schema, predicate); + auto& proxy = service::get_local_storage_proxy(); + auto cmd = slice_pred_to_read_cmd(proxy, *schema, predicate); auto cell_limit = predicate.__isset.slice_range ? static_cast(predicate.slice_range.count) : std::numeric_limits::max(); auto pranges = make_partition_ranges(*schema, keys); auto f = _query_state.get_client_state().has_schema_access(*schema, auth::permission::SELECT); - return f.then([this, schema, cmd, pranges = std::move(pranges), cell_limit, consistency_level, keys]() mutable { + return f.then([this, &proxy, schema, cmd, pranges = std::move(pranges), cell_limit, consistency_level, keys]() mutable { auto timeout = db::timeout_clock::now() + _timeout_config.read_timeout; - return service::get_local_storage_proxy().query(schema, cmd, std::move(pranges), cl_from_thrift(consistency_level), {timeout, empty_service_permit(), _query_state.get_client_state()}).then( + return proxy.query(schema, cmd, std::move(pranges), cl_from_thrift(consistency_level), {timeout, empty_service_permit(), _query_state.get_client_state()}).then( [schema, cmd, cell_limit, keys = std::move(keys)](service::storage_proxy::coordinator_query_result qr) { return query::result_view::do_with(*qr.query_result, [schema, cmd, cell_limit, keys = std::move(keys)](query::result_view v) mutable { if (schema->is_counter()) { @@ -309,13 +310,14 @@ public: if (!column_parent.super_column.empty()) { fail(unimplemented::cause::SUPER); } - auto cmd = slice_pred_to_read_cmd(*schema, predicate); + auto& proxy = service::get_local_storage_proxy(); + auto cmd = slice_pred_to_read_cmd(proxy, *schema, predicate); auto cell_limit = predicate.__isset.slice_range ? 
static_cast(predicate.slice_range.count) : std::numeric_limits::max(); auto pranges = make_partition_ranges(*schema, keys); auto f = _query_state.get_client_state().has_schema_access(*schema, auth::permission::SELECT); - return f.then([this, schema, cmd, pranges = std::move(pranges), cell_limit, consistency_level, keys]() mutable { + return f.then([this, &proxy, schema, cmd, pranges = std::move(pranges), cell_limit, consistency_level, keys]() mutable { auto timeout = db::timeout_clock::now() + _timeout_config.read_timeout; - return service::get_local_storage_proxy().query(schema, cmd, std::move(pranges), cl_from_thrift(consistency_level), {timeout, empty_service_permit(), _query_state.get_client_state()}).then( + return proxy.query(schema, cmd, std::move(pranges), cl_from_thrift(consistency_level), {timeout, empty_service_permit(), _query_state.get_client_state()}).then( [schema, cmd, cell_limit, keys = std::move(keys)](service::storage_proxy::coordinator_query_result qr) { return query::result_view::do_with(*qr.query_result, [schema, cmd, cell_limit, keys = std::move(keys)](query::result_view v) mutable { column_counter counter(*schema, cmd->slice, cell_limit, std::move(keys)); @@ -338,8 +340,9 @@ public: if (!column_parent.super_column.empty()) { fail(unimplemented::cause::SUPER); } + auto& proxy = service::get_local_storage_proxy(); auto&& prange = make_partition_range(*schema, range); - auto cmd = slice_pred_to_read_cmd(*schema, predicate); + auto cmd = slice_pred_to_read_cmd(proxy, *schema, predicate); // KeyRange::count is the number of thrift rows to return, while // SlicePredicte::slice_range::count limits the number of thrift colums. 
if (schema->thrift().is_dynamic()) { @@ -350,9 +353,9 @@ public: cmd->row_limit = range.count; } auto f = _query_state.get_client_state().has_schema_access(*schema, auth::permission::SELECT); - return f.then([this, schema, cmd, prange = std::move(prange), consistency_level] () mutable { + return f.then([this, &proxy, schema, cmd, prange = std::move(prange), consistency_level] () mutable { auto timeout = db::timeout_clock::now() + _timeout_config.range_read_timeout; - return service::get_local_storage_proxy().query(schema, cmd, std::move(prange), cl_from_thrift(consistency_level), {timeout, empty_service_permit(), _query_state.get_client_state()}).then( + return proxy.query(schema, cmd, std::move(prange), cl_from_thrift(consistency_level), {timeout, empty_service_permit(), _query_state.get_client_state()}).then( [schema, cmd](service::storage_proxy::coordinator_query_result qr) { return query::result_view::do_with(*qr.query_result, [schema, cmd](query::result_view v) { return to_key_slices(*schema, cmd->slice, v, std::numeric_limits::max()); @@ -362,7 +365,7 @@ public: }); } - static lw_shared_ptr make_paged_read_cmd(const schema& s, uint32_t column_limit, const std::string* start_column, const dht::partition_range_vector& range) { + static lw_shared_ptr make_paged_read_cmd(service::storage_proxy& proxy, const schema& s, uint32_t column_limit, const std::string* start_column, const dht::partition_range_vector& range) { auto opts = query_opts(s); std::vector clustering_ranges; query::column_id_vector regular_columns; @@ -394,7 +397,8 @@ public: clustering_ranges.emplace_back(query::clustering_range::make_open_ended_both_sides()); auto slice = query::partition_slice(std::move(clustering_ranges), { }, std::move(regular_columns), opts, std::move(specific_ranges), cql_serialization_format::internal()); - return make_lw_shared(s.id(), s.version(), std::move(slice), query::row_limit(row_limit), query::partition_limit(partition_limit)); + return make_lw_shared(s.id(), 
s.version(), std::move(slice), proxy.get_max_result_size(slice), + query::row_limit(row_limit), query::partition_limit(partition_limit)); } static future<> do_get_paged_slice( @@ -406,7 +410,8 @@ public: const ::timeout_config& timeout_config, std::vector& output, service::query_state& qs) { - auto cmd = make_paged_read_cmd(*schema, column_limit, start_column, range); + auto& proxy = service::get_local_storage_proxy(); + auto cmd = make_paged_read_cmd(proxy, *schema, column_limit, start_column, range); std::optional start_key; auto end = range[0].end(); if (start_column && !schema->thrift().is_dynamic()) { @@ -417,7 +422,7 @@ public: } auto range1 = range; // query() below accepts an rvalue, so need a copy to reuse later auto timeout = db::timeout_clock::now() + timeout_config.range_read_timeout; - return service::get_local_storage_proxy().query(schema, cmd, std::move(range), consistency_level, {timeout, empty_service_permit(), qs.get_client_state()}).then( + return proxy.query(schema, cmd, std::move(range), consistency_level, {timeout, empty_service_permit(), qs.get_client_state()}).then( [schema, cmd, column_limit](service::storage_proxy::coordinator_query_result qr) { return query::result_view::do_with(*qr.query_result, [schema, cmd, column_limit](query::result_view v) { return to_key_slices(*schema, cmd->slice, v, column_limit); @@ -664,11 +669,13 @@ public: } } auto slice = query::partition_slice(std::move(clustering_ranges), {}, std::move(regular_columns), opts, nullptr); - auto cmd = make_lw_shared(schema->id(), schema->version(), std::move(slice), query::row_limit(row_limit)); + auto& proxy = service::get_local_storage_proxy(); + auto cmd = make_lw_shared(schema->id(), schema->version(), std::move(slice), proxy.get_max_result_size(slice), + query::row_limit(row_limit)); auto f = _query_state.get_client_state().has_schema_access(*schema, auth::permission::SELECT); - return f.then([this, dk = std::move(dk), cmd, schema, column_limit = request.count, cl = 
request.consistency_level] { + return f.then([this, &proxy, dk = std::move(dk), cmd, schema, column_limit = request.count, cl = request.consistency_level] { auto timeout = db::timeout_clock::now() + _timeout_config.read_timeout; - return service::get_local_storage_proxy().query(schema, cmd, {dht::partition_range::make_singular(dk)}, cl_from_thrift(cl), {timeout, /* FIXME: pass real permit */empty_service_permit(), _query_state.get_client_state()}).then( + return proxy.query(schema, cmd, {dht::partition_range::make_singular(dk)}, cl_from_thrift(cl), {timeout, /* FIXME: pass real permit */empty_service_permit(), _query_state.get_client_state()}).then( [schema, cmd, column_limit](service::storage_proxy::coordinator_query_result qr) { return query::result_view::do_with(*qr.query_result, [schema, cmd, column_limit](query::result_view v) { column_aggregator aggregator(*schema, cmd->slice, column_limit, { }); @@ -1464,7 +1471,7 @@ private: opts.set(query::partition_slice::option::send_partition_key); return opts; } - static lw_shared_ptr slice_pred_to_read_cmd(const schema& s, const SlicePredicate& predicate) { + static lw_shared_ptr slice_pred_to_read_cmd(service::storage_proxy& proxy, const schema& s, const SlicePredicate& predicate) { auto opts = query_opts(s); std::vector clustering_ranges; query::column_id_vector regular_columns; @@ -1511,7 +1518,7 @@ private: } auto slice = query::partition_slice(std::move(clustering_ranges), {}, std::move(regular_columns), opts, nullptr, cql_serialization_format::internal(), per_partition_row_limit); - return make_lw_shared(s.id(), s.version(), std::move(slice)); + return make_lw_shared(s.id(), s.version(), std::move(slice), proxy.get_max_result_size(slice)); } static ColumnParent column_path_to_column_parent(const ColumnPath& column_path) { ColumnParent ret; From fbbbc3e05c5fc13c4aaa9ac11871648de207d954 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Botond=20D=C3=A9nes?= Date: Tue, 28 Jul 2020 09:56:22 +0300 Subject: [PATCH 17/26] 
query: result_memory_limiter: use the new max_result_size type --- database.cc | 2 +- multishard_mutation_query.cc | 2 +- query-result.hh | 24 ++++++++++++------------ table.cc | 2 +- test/boost/mutation_query_test.cc | 6 ++++-- 5 files changed, 19 insertions(+), 17 deletions(-) diff --git a/database.cc b/database.cc index 0340798244..c655096cca 100644 --- a/database.cc +++ b/database.cc @@ -1207,7 +1207,7 @@ database::query(schema_ptr s, const query::read_command& cmd, query::result_opti future> database::query_mutations(schema_ptr s, const query::read_command& cmd, const dht::partition_range& range, size_t result_max_size, tracing::trace_state_ptr trace_state, db::timeout_clock::time_point timeout) { - return get_result_memory_limiter().new_mutation_read(result_max_size).then( + return get_result_memory_limiter().new_mutation_read(query::max_result_size(result_max_size)).then( [&, s = std::move(s), trace_state = std::move(trace_state), timeout] (query::result_memory_accounter accounter) { column_family& cf = find_column_family(cmd.cf_id); query::querier_cache_context cache_ctx(_querier_cache, cmd.query_uuid, cmd.is_first_page); diff --git a/multishard_mutation_query.cc b/multishard_mutation_query.cc index f2177f1c57..d280a04a0d 100644 --- a/multishard_mutation_query.cc +++ b/multishard_mutation_query.cc @@ -668,7 +668,7 @@ future>, cache_tempera db.local().find_column_family(s).get_global_cache_hit_rate())); } - return db.local().get_result_memory_limiter().new_mutation_read(max_size).then([&, s = std::move(s), trace_state = std::move(trace_state), + return db.local().get_result_memory_limiter().new_mutation_read(query::max_result_size(max_size)).then([&, s = std::move(s), trace_state = std::move(trace_state), timeout] (query::result_memory_accounter accounter) mutable { return do_query_mutations(db, s, cmd, ranges, std::move(trace_state), timeout, std::move(accounter)).then_wrapped( [&db, s = std::move(s)] (future&& f) { diff --git a/query-result.hh 
b/query-result.hh index fdff5d3740..2dc70377ba 100644 --- a/query-result.hh +++ b/query-result.hh @@ -68,18 +68,18 @@ public: // Reserves minimum_result_size and creates new memory accounter for // mutation query. Uses the specified maximum result size and may be // stopped before reaching it due to memory pressure on shard. - future new_mutation_read(size_t max_result_size); + future new_mutation_read(query::max_result_size max_result_size); // Reserves minimum_result_size and creates new memory accounter for // data query. Uses the specified maximum result size, result will *not* // be stopped due to on shard memory pressure in order to avoid digest // mismatches. - future new_data_read(size_t max_result_size); + future new_data_read(query::max_result_size max_result_size); // Creates a memory accounter for digest reads. Such accounter doesn't // contribute to the shard memory usage, but still stops producing the // result after individual limit has been reached. - future new_digest_read(size_t max_result_size); + future new_digest_read(query::max_result_size max_result_size); // Checks whether the result can grow any more, takes into account only // the per shard limit. @@ -119,13 +119,13 @@ class result_memory_accounter { size_t _blocked_bytes = 0; size_t _used_memory = 0; size_t _total_used_memory = 0; - size_t _maximum_result_size = 0; + query::max_result_size _maximum_result_size; stop_iteration _stop_on_global_limit; private: // Mutation query accounter. Uses provided individual result size limit and // will stop when shard memory pressure grows too high. 
struct mutation_query_tag { }; - explicit result_memory_accounter(mutation_query_tag, result_memory_limiter& limiter, size_t max_size) noexcept + explicit result_memory_accounter(mutation_query_tag, result_memory_limiter& limiter, query::max_result_size max_size) noexcept : _limiter(&limiter) , _blocked_bytes(result_memory_limiter::minimum_result_size) , _maximum_result_size(max_size) @@ -135,7 +135,7 @@ private: // Data query accounter. Uses provided individual result size limit and // will *not* stop even though shard memory pressure grows too high. struct data_query_tag { }; - explicit result_memory_accounter(data_query_tag, result_memory_limiter& limiter, size_t max_size) noexcept + explicit result_memory_accounter(data_query_tag, result_memory_limiter& limiter, query::max_result_size max_size) noexcept : _limiter(&limiter) , _blocked_bytes(result_memory_limiter::minimum_result_size) , _maximum_result_size(max_size) @@ -145,7 +145,7 @@ private: // will *not* stop even though shard memory pressure grows too high. This // accounter does not contribute to the shard memory limits. struct digest_query_tag { }; - explicit result_memory_accounter(digest_query_tag, result_memory_limiter&, size_t max_size) noexcept + explicit result_memory_accounter(digest_query_tag, result_memory_limiter&, query::max_result_size max_size) noexcept : _blocked_bytes(0) , _maximum_result_size(max_size) { } @@ -185,7 +185,7 @@ public: stop_iteration update_and_check(size_t n) { _used_memory += n; _total_used_memory += n; - auto stop = stop_iteration(_total_used_memory > _maximum_result_size); + auto stop = stop_iteration(_total_used_memory > _maximum_result_size.hard_limit); if (_limiter && _used_memory > _blocked_bytes) { auto to_block = std::min(_used_memory - _blocked_bytes, n); _blocked_bytes += to_block; @@ -196,7 +196,7 @@ public: // Checks whether the result can grow any more. 
stop_iteration check() const { - stop_iteration stop { _total_used_memory > _maximum_result_size }; + stop_iteration stop { _total_used_memory > _maximum_result_size.hard_limit }; if (!stop && _used_memory >= _blocked_bytes && _limiter) { return _limiter->check() && _stop_on_global_limit; } @@ -217,19 +217,19 @@ public: } }; -inline future result_memory_limiter::new_mutation_read(size_t max_size) { +inline future result_memory_limiter::new_mutation_read(query::max_result_size max_size) { return _memory_limiter.wait(minimum_result_size).then([this, max_size] { return result_memory_accounter(result_memory_accounter::mutation_query_tag(), *this, max_size); }); } -inline future result_memory_limiter::new_data_read(size_t max_size) { +inline future result_memory_limiter::new_data_read(query::max_result_size max_size) { return _memory_limiter.wait(minimum_result_size).then([this, max_size] { return result_memory_accounter(result_memory_accounter::data_query_tag(), *this, max_size); }); } -inline future result_memory_limiter::new_digest_read(size_t max_size) { +inline future result_memory_limiter::new_digest_read(query::max_result_size max_size) { return make_ready_future(result_memory_accounter(result_memory_accounter::digest_query_tag(), *this, max_size)); } diff --git a/table.cc b/table.cc index b7d77e674c..23ccb61441 100644 --- a/table.cc +++ b/table.cc @@ -2043,7 +2043,7 @@ table::query(schema_ptr s, utils::latency_counter lc; _stats.reads.set_latency(lc); auto f = opts.request == query::result_request::only_digest - ? memory_limiter.new_digest_read(max_size) : memory_limiter.new_data_read(max_size); + ? 
memory_limiter.new_digest_read(query::max_result_size(max_size)) : memory_limiter.new_data_read(query::max_result_size(max_size)); return f.then([this, lc, s = std::move(s), &cmd, class_config, opts, &partition_ranges, trace_state = std::move(trace_state), timeout, cache_ctx = std::move(cache_ctx)] (query::result_memory_accounter accounter) mutable { auto qs_ptr = std::make_unique(std::move(s), cmd, opts, partition_ranges, std::move(accounter)); diff --git a/test/boost/mutation_query_test.cc b/test/boost/mutation_query_test.cc index 38056eb1a5..1c268759a5 100644 --- a/test/boost/mutation_query_test.cc +++ b/test/boost/mutation_query_test.cc @@ -547,11 +547,13 @@ SEASTAR_THREAD_TEST_CASE(test_result_size_calculation) { query::partition_slice slice = make_full_slice(*s); slice.options.set(); - query::result::builder digest_only_builder(slice, query::result_options{query::result_request::only_digest, query::digest_algorithm::xxHash}, l.new_digest_read(query::result_memory_limiter::maximum_result_size).get0()); + query::result::builder digest_only_builder(slice, query::result_options{query::result_request::only_digest, query::digest_algorithm::xxHash}, + l.new_digest_read(query::max_result_size(query::result_memory_limiter::maximum_result_size)).get0()); data_query(s, source, query::full_partition_range, slice, std::numeric_limits::max(), std::numeric_limits::max(), gc_clock::now(), digest_only_builder, db::no_timeout, tests::make_query_class_config()).get0(); - query::result::builder result_and_digest_builder(slice, query::result_options{query::result_request::result_and_digest, query::digest_algorithm::xxHash}, l.new_data_read(query::result_memory_limiter::maximum_result_size).get0()); + query::result::builder result_and_digest_builder(slice, query::result_options{query::result_request::result_and_digest, query::digest_algorithm::xxHash}, + l.new_data_read(query::max_result_size(query::result_memory_limiter::maximum_result_size)).get0()); data_query(s, source, 
query::full_partition_range, slice, std::numeric_limits::max(), std::numeric_limits::max(), gc_clock::now(), result_and_digest_builder, db::no_timeout, tests::make_query_class_config()).get0(); From 159d37053da18fc4aec1c3007a567d324858b8ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Botond=20D=C3=A9nes?= Date: Thu, 28 May 2020 10:33:13 +0300 Subject: [PATCH 18/26] storage_proxy: use read_command::max_result_size to pass max result size around Use the recently added `max_result_size` field of `query::read_command` to pass the max result size around, including passing it to remote nodes. This means that the max result size will be sent along each read, instead of once per connection. As we want to select the appropriate `max_result_size` based on the type of the query as well as based on the query class (user or internal) the previous method won't do anymore. If the remote doesn't fill this field, the old per-connection value is used. --- database.cc | 7 +-- database.hh | 7 +-- db/schema_tables.cc | 2 +- multishard_mutation_query.cc | 3 +- multishard_mutation_query.hh | 1 - service/paxos/paxos_state.cc | 2 +- service/storage_proxy.cc | 64 +++++++++++--------- service/storage_proxy.hh | 17 ++---- table.cc | 3 +- test/boost/database_test.cc | 8 +-- test/boost/multishard_mutation_query_test.cc | 8 +-- test/boost/querier_cache_test.cc | 4 +- 12 files changed, 60 insertions(+), 66 deletions(-) diff --git a/database.cc b/database.cc index c655096cca..5a7b79f498 100644 --- a/database.cc +++ b/database.cc @@ -1178,7 +1178,7 @@ compare_atomic_cell_for_merge(atomic_cell_view left, atomic_cell_view right) { future, cache_temperature>> database::query(schema_ptr s, const query::read_command& cmd, query::result_options opts, const dht::partition_range_vector& ranges, - tracing::trace_state_ptr trace_state, uint64_t max_result_size, db::timeout_clock::time_point timeout) { + tracing::trace_state_ptr trace_state, db::timeout_clock::time_point timeout) { column_family& cf = 
find_column_family(cmd.cf_id); query::querier_cache_context cache_ctx(_querier_cache, cmd.query_uuid, cmd.is_first_page); return _data_query_stage(&cf, @@ -1189,7 +1189,6 @@ database::query(schema_ptr s, const query::read_command& cmd, query::result_opti seastar::cref(ranges), std::move(trace_state), seastar::ref(get_result_memory_limiter()), - max_result_size, timeout, std::move(cache_ctx)).then_wrapped([this, s = _stats, hit_rate = cf.get_global_cache_hit_rate(), op = cf.read_in_progress()] (auto f) { if (f.failed()) { @@ -1206,8 +1205,8 @@ database::query(schema_ptr s, const query::read_command& cmd, query::result_opti future> database::query_mutations(schema_ptr s, const query::read_command& cmd, const dht::partition_range& range, - size_t result_max_size, tracing::trace_state_ptr trace_state, db::timeout_clock::time_point timeout) { - return get_result_memory_limiter().new_mutation_read(query::max_result_size(result_max_size)).then( + tracing::trace_state_ptr trace_state, db::timeout_clock::time_point timeout) { + return get_result_memory_limiter().new_mutation_read(*cmd.max_result_size).then( [&, s = std::move(s), trace_state = std::move(trace_state), timeout] (query::result_memory_accounter accounter) { column_family& cf = find_column_family(cmd.cf_id); query::querier_cache_context cache_ctx(_querier_cache, cmd.query_uuid, cmd.is_first_page); diff --git a/database.hh b/database.hh index 090d3be332..26b5e92055 100644 --- a/database.hh +++ b/database.hh @@ -753,7 +753,6 @@ public: const dht::partition_range_vector& ranges, tracing::trace_state_ptr trace_state, query::result_memory_limiter& memory_limiter, - uint64_t max_result_size, db::timeout_clock::time_point timeout, query::querier_cache_context cache_ctx = { }); @@ -1299,7 +1298,6 @@ private: const dht::partition_range_vector&, tracing::trace_state_ptr, query::result_memory_limiter&, - uint64_t, db::timeout_clock::time_point, query::querier_cache_context> _data_query_stage; @@ -1464,10 +1462,9 @@ public: 
unsigned shard_of(const frozen_mutation& m); future, cache_temperature>> query(schema_ptr, const query::read_command& cmd, query::result_options opts, const dht::partition_range_vector& ranges, tracing::trace_state_ptr trace_state, - uint64_t max_result_size, db::timeout_clock::time_point timeout); + db::timeout_clock::time_point timeout); future> query_mutations(schema_ptr, const query::read_command& cmd, const dht::partition_range& range, - size_t max_result_size, tracing::trace_state_ptr trace_state, - db::timeout_clock::time_point timeout); + tracing::trace_state_ptr trace_state, db::timeout_clock::time_point timeout); // Apply the mutation atomically. // Throws timed_out_error when timeout is reached. future<> apply(schema_ptr, const frozen_mutation&, tracing::trace_state_ptr tr_state, db::commitlog::force_sync sync, db::timeout_clock::time_point timeout); diff --git a/db/schema_tables.cc b/db/schema_tables.cc index cb3d5dbf03..0014cf22e3 100644 --- a/db/schema_tables.cc +++ b/db/schema_tables.cc @@ -743,7 +743,7 @@ future query_partition_mutation(service::storage_proxy& proxy, { auto dk = dht::decorate_key(*s, pkey); return do_with(dht::partition_range::make_singular(dk), [&proxy, dk, s = std::move(s), cmd = std::move(cmd)] (auto& range) { - return proxy.query_mutations_locally(s, std::move(cmd), range, db::no_timeout) + return proxy.query_mutations_locally(s, std::move(cmd), range, db::no_timeout, tracing::trace_state_ptr{}) .then([dk = std::move(dk), s](rpc::tuple>, cache_temperature> res_hit_rate) { auto&& [res, hit_rate] = res_hit_rate; auto&& partitions = res->partitions(); diff --git a/multishard_mutation_query.cc b/multishard_mutation_query.cc index d280a04a0d..fc10c97bc6 100644 --- a/multishard_mutation_query.cc +++ b/multishard_mutation_query.cc @@ -659,7 +659,6 @@ future>, cache_tempera const query::read_command& cmd, const dht::partition_range_vector& ranges, tracing::trace_state_ptr trace_state, - uint64_t max_size, db::timeout_clock::time_point 
timeout) { if (cmd.row_limit == 0 || cmd.slice.partition_row_limit() == 0 || cmd.partition_limit == 0) { return make_ready_future>, cache_temperature>>( @@ -668,7 +667,7 @@ future>, cache_tempera db.local().find_column_family(s).get_global_cache_hit_rate())); } - return db.local().get_result_memory_limiter().new_mutation_read(query::max_result_size(max_size)).then([&, s = std::move(s), trace_state = std::move(trace_state), + return db.local().get_result_memory_limiter().new_mutation_read(*cmd.max_result_size).then([&, s = std::move(s), trace_state = std::move(trace_state), timeout] (query::result_memory_accounter accounter) mutable { return do_query_mutations(db, s, cmd, ranges, std::move(trace_state), timeout, std::move(accounter)).then_wrapped( [&db, s = std::move(s)] (future&& f) { diff --git a/multishard_mutation_query.hh b/multishard_mutation_query.hh index 8d49a4dd5b..f0baed9c78 100644 --- a/multishard_mutation_query.hh +++ b/multishard_mutation_query.hh @@ -67,5 +67,4 @@ future>, cache_tempera const query::read_command& cmd, const dht::partition_range_vector& ranges, tracing::trace_state_ptr trace_state, - uint64_t max_size, db::timeout_clock::time_point timeout); diff --git a/service/paxos/paxos_state.cc b/service/paxos/paxos_state.cc index 4fabfa5c6d..a106f39b68 100644 --- a/service/paxos/paxos_state.cc +++ b/service/paxos/paxos_state.cc @@ -81,7 +81,7 @@ future paxos_state::prepare(tracing::trace_state_ptr tr_state, [tr_state, schema, &cmd, only_digest, da, timeout] (const dht::partition_range_vector& prv) { return get_local_storage_proxy().get_db().local().query(schema, cmd, {only_digest ? 
query::result_request::only_digest : query::result_request::result_and_digest, da}, - prv, tr_state, query::result_memory_limiter::maximum_result_size, timeout); + prv, tr_state, timeout); }); }); return when_all(std::move(f1), std::move(f2)).then([state = std::move(state), only_digest] (auto t) { diff --git a/service/storage_proxy.cc b/service/storage_proxy.cc index b6554d7959..bd0bc6e221 100644 --- a/service/storage_proxy.cc +++ b/service/storage_proxy.cc @@ -3872,8 +3872,8 @@ db::read_repair_decision storage_proxy::new_read_repair_decision(const schema& s } future> -storage_proxy::query_result_local_digest(schema_ptr s, lw_shared_ptr cmd, const dht::partition_range& pr, tracing::trace_state_ptr trace_state, storage_proxy::clock_type::time_point timeout, query::digest_algorithm da, uint64_t max_size) { - return query_result_local(std::move(s), std::move(cmd), pr, query::result_options::only_digest(da), std::move(trace_state), timeout, max_size).then([] (rpc::tuple>, cache_temperature> result_and_hit_rate) { +storage_proxy::query_result_local_digest(schema_ptr s, lw_shared_ptr cmd, const dht::partition_range& pr, tracing::trace_state_ptr trace_state, storage_proxy::clock_type::time_point timeout, query::digest_algorithm da) { + return query_result_local(std::move(s), std::move(cmd), pr, query::result_options::only_digest(da), std::move(trace_state), timeout).then([] (rpc::tuple>, cache_temperature> result_and_hit_rate) { auto&& [result, hit_rate] = result_and_hit_rate; return make_ready_future>(rpc::tuple(*result->digest(), result->last_modified(), hit_rate)); }); @@ -3881,15 +3881,15 @@ storage_proxy::query_result_local_digest(schema_ptr s, lw_shared_ptr>, cache_temperature>> storage_proxy::query_result_local(schema_ptr s, lw_shared_ptr cmd, const dht::partition_range& pr, query::result_options opts, - tracing::trace_state_ptr trace_state, storage_proxy::clock_type::time_point timeout, uint64_t max_size) { + tracing::trace_state_ptr trace_state, 
storage_proxy::clock_type::time_point timeout) { cmd->slice.options.set_if(opts.request != query::result_request::only_result); if (pr.is_singular()) { unsigned shard = dht::shard_of(*s, pr.start()->value().token()); get_stats().replica_cross_shard_ops += shard != this_shard_id(); - return _db.invoke_on(shard, _read_smp_service_group, [max_size, gs = global_schema_ptr(s), prv = dht::partition_range_vector({pr}) /* FIXME: pr is copied */, cmd, opts, timeout, gt = tracing::global_trace_state_ptr(std::move(trace_state))] (database& db) mutable { + return _db.invoke_on(shard, _read_smp_service_group, [gs = global_schema_ptr(s), prv = dht::partition_range_vector({pr}) /* FIXME: pr is copied */, cmd, opts, timeout, gt = tracing::global_trace_state_ptr(std::move(trace_state))] (database& db) mutable { auto trace_state = gt.get(); tracing::trace(trace_state, "Start querying the token range that starts with {}", seastar::value_of([&prv] { return prv.begin()->start()->value().token(); })); - return db.query(gs, *cmd, opts, prv, trace_state, max_size, timeout).then([trace_state](std::tuple, cache_temperature>&& f_ht) { + return db.query(gs, *cmd, opts, prv, trace_state, timeout).then([trace_state](std::tuple, cache_temperature>&& f_ht) { auto&& [f, ht] = f_ht; tracing::trace(trace_state, "Querying is done"); return make_ready_future>, cache_temperature>>(rpc::tuple(make_foreign(std::move(f)), ht)); @@ -3897,7 +3897,7 @@ storage_proxy::query_result_local(schema_ptr s, lw_shared_ptr>, cache_temperature>&& r_ht) { + return query_nonsingular_mutations_locally(s, cmd, {pr}, std::move(trace_state), timeout).then([s, cmd, opts] (rpc::tuple>, cache_temperature>&& r_ht) { auto&& [r, ht] = r_ht; return make_ready_future>, cache_temperature>>( rpc::tuple(::make_foreign(::make_lw_shared(to_data_query_result(*r, s, cmd->slice, cmd->row_limit, cmd->partition_limit, opts))), ht)); @@ -4936,11 +4936,13 @@ void storage_proxy::init_messaging_service() { tracing::trace(trace_state_ptr, 
"read_data: message received from /{}", src_addr.addr); } auto da = oda.value_or(query::digest_algorithm::MD5); - auto max_size = cinfo.retrieve_auxiliary("max_result_size"); - return do_with(std::move(pr), get_local_shared_storage_proxy(), std::move(trace_state_ptr), [&cinfo, cmd = make_lw_shared(std::move(cmd)), src_addr = std::move(src_addr), da, max_size, t] (::compat::wrapping_partition_range& pr, shared_ptr& p, tracing::trace_state_ptr& trace_state_ptr) mutable { + if (!cmd.max_result_size) { + cmd.max_result_size.emplace(cinfo.retrieve_auxiliary("max_result_size")); + } + return do_with(std::move(pr), get_local_shared_storage_proxy(), std::move(trace_state_ptr), [&cinfo, cmd = make_lw_shared(std::move(cmd)), src_addr = std::move(src_addr), da, t] (::compat::wrapping_partition_range& pr, shared_ptr& p, tracing::trace_state_ptr& trace_state_ptr) mutable { p->get_stats().replica_data_reads++; auto src_ip = src_addr.addr; - return get_schema_for_read(cmd->schema_version, std::move(src_addr)).then([cmd, da, &pr, &p, &trace_state_ptr, max_size, t] (schema_ptr s) { + return get_schema_for_read(cmd->schema_version, std::move(src_addr)).then([cmd, da, &pr, &p, &trace_state_ptr, t] (schema_ptr s) { auto pr2 = ::compat::unwrap(std::move(pr), *s); if (pr2.second) { // this function assumes singular queries but doesn't validate @@ -4950,7 +4952,7 @@ void storage_proxy::init_messaging_service() { opts.digest_algo = da; opts.request = da == query::digest_algorithm::none ? query::result_request::only_result : query::result_request::result_and_digest; auto timeout = t ? 
*t : db::no_timeout; - return p->query_result_local(std::move(s), cmd, std::move(pr2.first), opts, trace_state_ptr, timeout, max_size); + return p->query_result_local(std::move(s), cmd, std::move(pr2.first), opts, trace_state_ptr, timeout); }).finally([&trace_state_ptr, src_ip] () mutable { tracing::trace(trace_state_ptr, "read_data handling is done, sending a response to /{}", src_ip); }); @@ -4964,22 +4966,24 @@ void storage_proxy::init_messaging_service() { tracing::begin(trace_state_ptr); tracing::trace(trace_state_ptr, "read_mutation_data: message received from /{}", src_addr.addr); } - auto max_size = cinfo.retrieve_auxiliary("max_result_size"); + if (!cmd.max_result_size) { + cmd.max_result_size.emplace(cinfo.retrieve_auxiliary("max_result_size")); + } return do_with(std::move(pr), get_local_shared_storage_proxy(), std::move(trace_state_ptr), ::compat::one_or_two_partition_ranges({}), - [&cinfo, cmd = make_lw_shared(std::move(cmd)), src_addr = std::move(src_addr), max_size, t] ( + [&cinfo, cmd = make_lw_shared(std::move(cmd)), src_addr = std::move(src_addr), t] ( ::compat::wrapping_partition_range& pr, shared_ptr& p, tracing::trace_state_ptr& trace_state_ptr, ::compat::one_or_two_partition_ranges& unwrapped) mutable { p->get_stats().replica_mutation_data_reads++; auto src_ip = src_addr.addr; - return get_schema_for_read(cmd->schema_version, std::move(src_addr)).then([cmd, &pr, &p, &trace_state_ptr, max_size, &unwrapped, t] (schema_ptr s) mutable { + return get_schema_for_read(cmd->schema_version, std::move(src_addr)).then([cmd, &pr, &p, &trace_state_ptr, &unwrapped, t] (schema_ptr s) mutable { unwrapped = ::compat::unwrap(std::move(pr), *s); auto timeout = t ? 
*t : db::no_timeout; - return p->query_mutations_locally(std::move(s), std::move(cmd), unwrapped, timeout, trace_state_ptr, max_size); + return p->query_mutations_locally(std::move(s), std::move(cmd), unwrapped, timeout, trace_state_ptr); }).finally([&trace_state_ptr, src_ip] () mutable { tracing::trace(trace_state_ptr, "read_mutation_data handling is done, sending a response to /{}", src_ip); }); @@ -4994,18 +4998,20 @@ void storage_proxy::init_messaging_service() { tracing::trace(trace_state_ptr, "read_digest: message received from /{}", src_addr.addr); } auto da = oda.value_or(query::digest_algorithm::MD5); - auto max_size = cinfo.retrieve_auxiliary("max_result_size"); - return do_with(std::move(pr), get_local_shared_storage_proxy(), std::move(trace_state_ptr), [&cinfo, cmd = make_lw_shared(std::move(cmd)), src_addr = std::move(src_addr), da, max_size, t] (::compat::wrapping_partition_range& pr, shared_ptr& p, tracing::trace_state_ptr& trace_state_ptr) mutable { + if (!cmd.max_result_size) { + cmd.max_result_size.emplace(cinfo.retrieve_auxiliary("max_result_size")); + } + return do_with(std::move(pr), get_local_shared_storage_proxy(), std::move(trace_state_ptr), [&cinfo, cmd = make_lw_shared(std::move(cmd)), src_addr = std::move(src_addr), da, t] (::compat::wrapping_partition_range& pr, shared_ptr& p, tracing::trace_state_ptr& trace_state_ptr) mutable { p->get_stats().replica_digest_reads++; auto src_ip = src_addr.addr; - return get_schema_for_read(cmd->schema_version, std::move(src_addr)).then([cmd, &pr, &p, &trace_state_ptr, max_size, t, da] (schema_ptr s) { + return get_schema_for_read(cmd->schema_version, std::move(src_addr)).then([cmd, &pr, &p, &trace_state_ptr, t, da] (schema_ptr s) { auto pr2 = ::compat::unwrap(std::move(pr), *s); if (pr2.second) { // this function assumes singular queries but doesn't validate throw std::runtime_error("READ_DIGEST called with wrapping range"); } auto timeout = t ? 
*t : db::no_timeout; - return p->query_result_local_digest(std::move(s), cmd, std::move(pr2.first), trace_state_ptr, timeout, da, max_size); + return p->query_result_local_digest(std::move(s), cmd, std::move(pr2.first), trace_state_ptr, timeout, da); }).finally([&trace_state_ptr, src_ip] () mutable { tracing::trace(trace_state_ptr, "read_digest handling is done, sending a response to /{}", src_ip); }); @@ -5041,6 +5047,9 @@ void storage_proxy::init_messaging_service() { tracing::begin(tr_state); tracing::trace(tr_state, "paxos_prepare: message received from /{} ballot {}", src_ip, ballot); } + if (!cmd.max_result_size) { + cmd.max_result_size.emplace(cinfo.retrieve_auxiliary("max_result_size")); + } return get_schema_for_read(cmd.schema_version, src_addr).then([this, cmd = std::move(cmd), key = std::move(key), ballot, only_digest, da, timeout, tr_state = std::move(tr_state), src_ip] (schema_ptr schema) mutable { @@ -5153,29 +5162,29 @@ future<> storage_proxy::uninit_messaging_service() { future>, cache_temperature>> storage_proxy::query_mutations_locally(schema_ptr s, lw_shared_ptr cmd, const dht::partition_range& pr, storage_proxy::clock_type::time_point timeout, - tracing::trace_state_ptr trace_state, uint64_t max_size) { + tracing::trace_state_ptr trace_state) { if (pr.is_singular()) { unsigned shard = dht::shard_of(*s, pr.start()->value().token()); get_stats().replica_cross_shard_ops += shard != this_shard_id(); - return _db.invoke_on(shard, _read_smp_service_group, [max_size, cmd, &pr, gs=global_schema_ptr(s), timeout, gt = tracing::global_trace_state_ptr(std::move(trace_state))] (database& db) mutable { - return db.query_mutations(gs, *cmd, pr, max_size, gt, timeout).then([] (std::tuple result_ht) { + return _db.invoke_on(shard, _read_smp_service_group, [cmd, &pr, gs=global_schema_ptr(s), timeout, gt = tracing::global_trace_state_ptr(std::move(trace_state))] (database& db) mutable { + return db.query_mutations(gs, *cmd, pr, gt, timeout).then([] (std::tuple 
result_ht) { auto&& [result, ht] = result_ht; return make_ready_future>, cache_temperature>>(rpc::tuple(make_foreign(make_lw_shared(std::move(result))), ht)); }); }); } else { - return query_nonsingular_mutations_locally(std::move(s), std::move(cmd), {pr}, std::move(trace_state), max_size, timeout); + return query_nonsingular_mutations_locally(std::move(s), std::move(cmd), {pr}, std::move(trace_state), timeout); } } future>, cache_temperature>> storage_proxy::query_mutations_locally(schema_ptr s, lw_shared_ptr cmd, const ::compat::one_or_two_partition_ranges& pr, storage_proxy::clock_type::time_point timeout, - tracing::trace_state_ptr trace_state, uint64_t max_size) { + tracing::trace_state_ptr trace_state) { if (!pr.second) { - return query_mutations_locally(std::move(s), std::move(cmd), pr.first, timeout, std::move(trace_state), max_size); + return query_mutations_locally(std::move(s), std::move(cmd), pr.first, timeout, std::move(trace_state)); } else { - return query_nonsingular_mutations_locally(std::move(s), std::move(cmd), pr, std::move(trace_state), max_size, timeout); + return query_nonsingular_mutations_locally(std::move(s), std::move(cmd), pr, std::move(trace_state), timeout); } } @@ -5184,11 +5193,10 @@ storage_proxy::query_nonsingular_mutations_locally(schema_ptr s, lw_shared_ptr cmd, const dht::partition_range_vector&& prs, tracing::trace_state_ptr trace_state, - uint64_t max_size, storage_proxy::clock_type::time_point timeout) { - return do_with(cmd, std::move(prs), [this, max_size, timeout, s = std::move(s), trace_state = std::move(trace_state)] (lw_shared_ptr& cmd, + return do_with(cmd, std::move(prs), [this, timeout, s = std::move(s), trace_state = std::move(trace_state)] (lw_shared_ptr& cmd, const dht::partition_range_vector& prs) mutable { - return query_mutations_on_all_shards(_db, std::move(s), *cmd, prs, std::move(trace_state), max_size, timeout).then([] (std::tuple>, cache_temperature> t) { + return query_mutations_on_all_shards(_db, 
std::move(s), *cmd, prs, std::move(trace_state), timeout).then([] (std::tuple>, cache_temperature> t) { return make_ready_future>, cache_temperature>>(std::move(t)); }); }); diff --git a/service/storage_proxy.hh b/service/storage_proxy.hh index 13ed45a92e..6d728d5fba 100644 --- a/service/storage_proxy.hh +++ b/service/storage_proxy.hh @@ -360,13 +360,11 @@ private: future>, cache_temperature>> query_result_local(schema_ptr, lw_shared_ptr cmd, const dht::partition_range& pr, query::result_options opts, tracing::trace_state_ptr trace_state, - clock_type::time_point timeout, - uint64_t max_size = query::result_memory_limiter::maximum_result_size); + clock_type::time_point timeout); future> query_result_local_digest(schema_ptr, lw_shared_ptr cmd, const dht::partition_range& pr, tracing::trace_state_ptr trace_state, clock_type::time_point timeout, - query::digest_algorithm da, - uint64_t max_size = query::result_memory_limiter::maximum_result_size); + query::digest_algorithm da); future query_partition_key_range(lw_shared_ptr cmd, dht::partition_range_vector partition_ranges, db::consistency_level cl, @@ -408,7 +406,7 @@ private: future<> mutate_internal(Range mutations, db::consistency_level cl, bool counter_write, tracing::trace_state_ptr tr_state, service_permit permit, std::optional timeout_opt = { }, lw_shared_ptr cdc_tracker = { }); future>, cache_temperature>> query_nonsingular_mutations_locally( schema_ptr s, lw_shared_ptr cmd, const dht::partition_range_vector&& pr, tracing::trace_state_ptr trace_state, - uint64_t max_size, clock_type::time_point timeout); + clock_type::time_point timeout); future<> mutate_counters_on_leader(std::vector mutations, db::consistency_level cl, clock_type::time_point timeout, tracing::trace_state_ptr trace_state, service_permit permit); @@ -560,21 +558,18 @@ public: future>, cache_temperature>> query_mutations_locally( schema_ptr, lw_shared_ptr cmd, const dht::partition_range&, clock_type::time_point timeout, - 
tracing::trace_state_ptr trace_state = nullptr, - uint64_t max_size = query::result_memory_limiter::maximum_result_size); + tracing::trace_state_ptr trace_state = nullptr); future>, cache_temperature>> query_mutations_locally( schema_ptr, lw_shared_ptr cmd, const ::compat::one_or_two_partition_ranges&, clock_type::time_point timeout, - tracing::trace_state_ptr trace_state = nullptr, - uint64_t max_size = query::result_memory_limiter::maximum_result_size); + tracing::trace_state_ptr trace_state = nullptr); future>, cache_temperature>> query_mutations_locally( schema_ptr s, lw_shared_ptr cmd, const dht::partition_range_vector& pr, clock_type::time_point timeout, - tracing::trace_state_ptr trace_state = nullptr, - uint64_t max_size = query::result_memory_limiter::maximum_result_size); + tracing::trace_state_ptr trace_state = nullptr); future cas(schema_ptr schema, shared_ptr request, lw_shared_ptr cmd, dht::partition_range_vector&& partition_ranges, coordinator_query_options query_options, diff --git a/table.cc b/table.cc index 23ccb61441..c5323278bb 100644 --- a/table.cc +++ b/table.cc @@ -2037,13 +2037,12 @@ table::query(schema_ptr s, const dht::partition_range_vector& partition_ranges, tracing::trace_state_ptr trace_state, query::result_memory_limiter& memory_limiter, - uint64_t max_size, db::timeout_clock::time_point timeout, query::querier_cache_context cache_ctx) { utils::latency_counter lc; _stats.reads.set_latency(lc); auto f = opts.request == query::result_request::only_digest - ? memory_limiter.new_digest_read(query::max_result_size(max_size)) : memory_limiter.new_data_read(query::max_result_size(max_size)); + ? 
memory_limiter.new_digest_read(*cmd.max_result_size) : memory_limiter.new_data_read(*cmd.max_result_size); return f.then([this, lc, s = std::move(s), &cmd, class_config, opts, &partition_ranges, trace_state = std::move(trace_state), timeout, cache_ctx = std::move(cache_ctx)] (query::result_memory_accounter accounter) mutable { auto qs_ptr = std::make_unique(std::move(s), cmd, opts, partition_ranges, std::move(accounter)); diff --git a/test/boost/database_test.cc b/test/boost/database_test.cc index a2ef47712e..e36949d40c 100644 --- a/test/boost/database_test.cc +++ b/test/boost/database_test.cc @@ -63,7 +63,7 @@ SEASTAR_TEST_CASE(test_safety_after_truncate) { auto assert_query_result = [&] (size_t expected_size) { auto max_size = std::numeric_limits::max(); auto cmd = query::read_command(s->id(), s->version(), partition_slice_builder(*s).build(), query::max_result_size(max_size), query::row_limit(1000)); - auto&& [result, cache_tempature] = db.query(s, cmd, query::result_options::only_result(), pranges, nullptr, max_size, db::no_timeout).get0(); + auto&& [result, cache_tempature] = db.query(s, cmd, query::result_options::only_result(), pranges, nullptr, db::no_timeout).get0(); assert_that(query::result_set::from_raw_result(s, cmd.slice, *result)).has_size(expected_size); }; assert_query_result(1000); @@ -106,21 +106,21 @@ SEASTAR_TEST_CASE(test_querying_with_limits) { auto max_size = std::numeric_limits::max(); { auto cmd = query::read_command(s->id(), s->version(), partition_slice_builder(*s).build(), query::max_result_size(max_size), query::row_limit(3)); - auto result = std::get<0>(db.query(s, cmd, query::result_options::only_result(), pranges, nullptr, max_size, db::no_timeout).get0()); + auto result = std::get<0>(db.query(s, cmd, query::result_options::only_result(), pranges, nullptr, db::no_timeout).get0()); assert_that(query::result_set::from_raw_result(s, cmd.slice, *result)).has_size(3); } { auto cmd = query::read_command(s->id(), s->version(), 
partition_slice_builder(*s).build(), query::max_result_size(max_size), query::row_limit(query::max_rows), query::partition_limit(5)); - auto result = std::get<0>(db.query(s, cmd, query::result_options::only_result(), pranges, nullptr, max_size, db::no_timeout).get0()); + auto result = std::get<0>(db.query(s, cmd, query::result_options::only_result(), pranges, nullptr, db::no_timeout).get0()); assert_that(query::result_set::from_raw_result(s, cmd.slice, *result)).has_size(5); } { auto cmd = query::read_command(s->id(), s->version(), partition_slice_builder(*s).build(), query::max_result_size(max_size), query::row_limit(query::max_rows), query::partition_limit(3)); - auto result = std::get<0>(db.query(s, cmd, query::result_options::only_result(), pranges, nullptr, max_size, db::no_timeout).get0()); + auto result = std::get<0>(db.query(s, cmd, query::result_options::only_result(), pranges, nullptr, db::no_timeout).get0()); assert_that(query::result_set::from_raw_result(s, cmd.slice, *result)).has_size(3); } }); diff --git a/test/boost/multishard_mutation_query_test.cc b/test/boost/multishard_mutation_query_test.cc index a4b3651797..9e43b2d166 100644 --- a/test/boost/multishard_mutation_query_test.cc +++ b/test/boost/multishard_mutation_query_test.cc @@ -84,7 +84,7 @@ SEASTAR_THREAD_TEST_CASE(test_abandoned_read) { auto cmd = query::read_command(s->id(), s->version(), s->full_slice(), 7, gc_clock::now(), std::nullopt, query::max_partitions, utils::make_random_uuid(), query::is_first_page::yes, query::max_result_size(query::result_memory_limiter::unlimited_result_size)); - query_mutations_on_all_shards(env.db(), s, cmd, {query::full_partition_range}, nullptr, std::numeric_limits::max(), db::no_timeout).get(); + query_mutations_on_all_shards(env.db(), s, cmd, {query::full_partition_range}, nullptr, db::no_timeout).get(); check_cache_population(env.db(), 1); @@ -108,7 +108,7 @@ static std::vector read_all_partitions_one_by_one(distributed( - 
std::get<0>(db.query_mutations(std::move(s), cmd, range, std::numeric_limits::max(), nullptr, db::no_timeout).get0()))); + std::get<0>(db.query_mutations(std::move(s), cmd, range, nullptr, db::no_timeout).get0()))); }); }).get0(); @@ -133,7 +133,7 @@ read_partitions_with_paged_scan(distributed& db, schema_ptr s, uint32_ // First page is special, needs to have `is_first_page` set. { - auto res = std::get<0>(query_mutations_on_all_shards(db, s, cmd, {range}, nullptr, max_size, db::no_timeout).get0()); + auto res = std::get<0>(query_mutations_on_all_shards(db, s, cmd, {range}, nullptr, db::no_timeout).get0()); for (auto& part : res->partitions()) { auto mut = part.mut().unfreeze(s); results.emplace_back(std::move(mut)); @@ -177,7 +177,7 @@ read_partitions_with_paged_scan(distributed& db, schema_ptr s, uint32_ cmd.slice.set_range(*s, last_pkey.key(), std::move(ckranges)); } - auto res = std::get<0>(query_mutations_on_all_shards(db, s, cmd, {pkrange}, nullptr, max_size, db::no_timeout).get0()); + auto res = std::get<0>(query_mutations_on_all_shards(db, s, cmd, {pkrange}, nullptr, db::no_timeout).get0()); if (is_stateful) { BOOST_REQUIRE(aggregate_querier_cache_stat(db, &query::querier_cache::stats::lookups) >= npages); diff --git a/test/boost/querier_cache_test.cc b/test/boost/querier_cache_test.cc index 93eeff994a..63940fadca 100644 --- a/test/boost/querier_cache_test.cc +++ b/test/boost/querier_cache_test.cc @@ -674,7 +674,6 @@ SEASTAR_THREAD_TEST_CASE(test_resources_based_cache_eviction) { db.query_mutations(s, cmd1, query::full_partition_range, - 1024 * 1024, nullptr, db::no_timeout).get(); @@ -710,7 +709,6 @@ SEASTAR_THREAD_TEST_CASE(test_resources_based_cache_eviction) { db.query_mutations(s, cmd2, query::full_partition_range, - 1024 * 1024, nullptr, db::no_timeout).get(); @@ -724,10 +722,10 @@ SEASTAR_THREAD_TEST_CASE(test_resources_based_cache_eviction) { // of the tracked buffers. 
cmd2.row_limit = query::max_rows; cmd2.partition_limit = query::max_partitions; + cmd2.max_result_size.emplace(query::result_memory_limiter::unlimited_result_size); db.query_mutations(s, cmd2, query::full_partition_range, - query::result_memory_limiter::unlimited_result_size, nullptr, db::no_timeout).get(); return make_ready_future<>(); From 9eab5bca27be01e8d2db5ac49e7a8339a283be03 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Botond=20D=C3=A9nes?= Date: Thu, 28 May 2020 12:21:37 +0300 Subject: [PATCH 19/26] query_*(): use the coordinator specified memory limit for unlimited queries It is important that all replicas participating in a read use the same memory limits to avoid artificial differences due to different amount of results. The coordinator now passes down its own memory limit for reads, in the form of max_result_size (or max_size). For unpaged or reverse queries this has to be used now instead of the locally set max_memory_unlimited_query configuration item. To avoid the replicas accidentally using the local limit contained in the `query_class_config` returned from `database::make_query_class_config()`, we refactor the latter into `database::get_reader_concurrency_semaphore()`. Most of its callers were only interested in the semaphore only anyway and those that were interested in the limit as well should get it from the coordinator instead, so this refactoring is a win-win. 
--- database.cc | 19 ++++++++++--------- database.hh | 4 +++- db/view/view.cc | 2 +- db/view/view_update_generator.cc | 2 +- multishard_mutation_query.cc | 11 +++++------ test/boost/multishard_mutation_query_test.cc | 2 +- test/boost/querier_cache_test.cc | 2 +- test/boost/view_build_test.cc | 2 +- test/manual/sstable_scan_footprint_test.cc | 2 +- 9 files changed, 24 insertions(+), 22 deletions(-) diff --git a/database.cc b/database.cc index 5a7b79f498..1a6bdbf0b8 100644 --- a/database.cc +++ b/database.cc @@ -1180,11 +1180,12 @@ future, cache_temperature>> database::query(schema_ptr s, const query::read_command& cmd, query::result_options opts, const dht::partition_range_vector& ranges, tracing::trace_state_ptr trace_state, db::timeout_clock::time_point timeout) { column_family& cf = find_column_family(cmd.cf_id); + auto class_config = query::query_class_config{.semaphore = get_reader_concurrency_semaphore(), .max_memory_for_unlimited_query = *cmd.max_result_size}; query::querier_cache_context cache_ctx(_querier_cache, cmd.query_uuid, cmd.is_first_page); return _data_query_stage(&cf, std::move(s), seastar::cref(cmd), - make_query_class_config(), + class_config, opts, seastar::cref(ranges), std::move(trace_state), @@ -1209,6 +1210,7 @@ database::query_mutations(schema_ptr s, const query::read_command& cmd, const dh return get_result_memory_limiter().new_mutation_read(*cmd.max_result_size).then( [&, s = std::move(s), trace_state = std::move(trace_state), timeout] (query::result_memory_accounter accounter) { column_family& cf = find_column_family(cmd.cf_id); + auto class_config = query::query_class_config{.semaphore = get_reader_concurrency_semaphore(), .max_memory_for_unlimited_query = *cmd.max_result_size}; query::querier_cache_context cache_ctx(_querier_cache, cmd.query_uuid, cmd.is_first_page); return _mutation_query_stage(std::move(s), cf.as_mutation_source(), @@ -1218,7 +1220,7 @@ database::query_mutations(schema_ptr s, const query::read_command& cmd, const dh 
cmd.partition_limit, cmd.timestamp, timeout, - make_query_class_config(), + class_config, std::move(accounter), std::move(trace_state), std::move(cache_ctx)).then_wrapped([this, s = _stats, hit_rate = cf.get_global_cache_hit_rate(), op = cf.read_in_progress()] (auto f) { @@ -1281,17 +1283,16 @@ void database::register_connection_drop_notifier(netw::messaging_service& ms) { }); } -query::query_class_config database::make_query_class_config() { +reader_concurrency_semaphore& database::get_reader_concurrency_semaphore() { // Everything running in the statement group is considered a user query if (current_scheduling_group() == _dbcfg.statement_scheduling_group) { - return query::query_class_config{_read_concurrency_sem, - query::max_result_size(_cfg.max_memory_for_unlimited_query_soft_limit(), _cfg.max_memory_for_unlimited_query_hard_limit())}; + return _read_concurrency_sem; // Reads done on behalf of view update generation run in the streaming group } else if (current_scheduling_group() == _dbcfg.streaming_scheduling_group) { - return query::query_class_config{_streaming_concurrency_sem, query::max_result_size(std::numeric_limits::max())}; + return _streaming_concurrency_sem; // Everything else is considered a system query } else { - return query::query_class_config{_system_read_concurrency_sem, query::max_result_size(std::numeric_limits::max())}; + return _system_read_concurrency_sem; } } @@ -1351,7 +1352,7 @@ future database::do_apply_counter_update(column_family& cf, const froz // counter state for each modified cell... 
tracing::trace(trace_state, "Reading counter values from the CF"); - return counter_write_query(m_schema, cf.as_mutation_source(), make_query_class_config().semaphore.make_permit(), m.decorated_key(), slice, trace_state, timeout) + return counter_write_query(m_schema, cf.as_mutation_source(), get_reader_concurrency_semaphore().make_permit(), m.decorated_key(), slice, trace_state, timeout) .then([this, &cf, &m, m_schema, timeout, trace_state] (auto mopt) { // ...now, that we got existing state of all affected counter // cells we can look for our shard in each of them, increment @@ -1562,7 +1563,7 @@ future<> database::do_apply(schema_ptr s, const frozen_mutation& m, tracing::tra if (cf.views().empty()) { return apply_with_commitlog(std::move(s), cf, std::move(uuid), m, timeout, sync).finally([op = std::move(op)] { }); } - future f = cf.push_view_replica_updates(s, m, timeout, std::move(tr_state), make_query_class_config().semaphore); + future f = cf.push_view_replica_updates(s, m, timeout, std::move(tr_state), get_reader_concurrency_semaphore()); return f.then([this, s = std::move(s), uuid = std::move(uuid), &m, timeout, &cf, op = std::move(op), sync] (row_locker::lock_holder lock) mutable { return apply_with_commitlog(std::move(s), cf, std::move(uuid), m, timeout, sync).finally( // Hold the local lock on the base-table partition or row diff --git a/database.hh b/database.hh index 26b5e92055..c8540ff5cd 100644 --- a/database.hh +++ b/database.hh @@ -1592,7 +1592,9 @@ public: return _supports_infinite_bound_range_deletions; } - query::query_class_config make_query_class_config(); + // Get the reader concurrency semaphore, appropriate for the query class, + // which is deduced from the current scheduling group. 
+ reader_concurrency_semaphore& get_reader_concurrency_semaphore(); }; future<> start_large_data_handler(sharded& db); diff --git a/db/view/view.cc b/db/view/view.cc index 4b18d59f6d..3d0ee79134 100644 --- a/db/view/view.cc +++ b/db/view/view.cc @@ -1285,7 +1285,7 @@ view_builder::build_step& view_builder::get_or_create_build_step(utils::UUID bas void view_builder::initialize_reader_at_current_token(build_step& step) { step.pslice = make_partition_slice(*step.base->schema()); step.prange = dht::partition_range(dht::ring_position::starting_at(step.current_token()), dht::ring_position::max()); - auto permit = _db.make_query_class_config().semaphore.make_permit(); + auto permit = _db.get_reader_concurrency_semaphore().make_permit(); step.reader = make_local_shard_sstable_reader( step.base->schema(), std::move(permit), diff --git a/db/view/view_update_generator.cc b/db/view/view_update_generator.cc index 624d192e5f..0552b95d7a 100644 --- a/db/view/view_update_generator.cc +++ b/db/view/view_update_generator.cc @@ -78,7 +78,7 @@ future<> view_update_generator::start() { auto [staging_sstable_reader, staging_sstable_reader_handle] = make_manually_paused_evictable_reader( std::move(ms), s, - _db.make_query_class_config().semaphore.make_permit(), + _db.get_reader_concurrency_semaphore().make_permit(), query::full_partition_range, s->full_slice(), service::get_local_streaming_priority(), diff --git a/multishard_mutation_query.cc b/multishard_mutation_query.cc index fc10c97bc6..7e3516c32b 100644 --- a/multishard_mutation_query.cc +++ b/multishard_mutation_query.cc @@ -244,7 +244,7 @@ public: virtual reader_concurrency_semaphore& semaphore() override { const auto shard = this_shard_id(); if (!_semaphores[shard]) { - _semaphores[shard] = &_db.local().make_query_class_config().semaphore; + _semaphores[shard] = &_db.local().get_reader_concurrency_semaphore(); } return *_semaphores[shard]; } @@ -618,18 +618,17 @@ static future do_query_mutations( mutation_reader::forwarding 
fwd_mr) { return make_multishard_combining_reader(ctx, std::move(s), pr, ps, pc, std::move(trace_state), fwd_mr); }); - auto class_config = ctx->db().local().make_query_class_config(); - auto reader = make_flat_multi_range_reader(s, class_config.semaphore.make_permit(), std::move(ms), ranges, cmd.slice, - service::get_local_sstable_query_read_priority(), trace_state, mutation_reader::forwarding::no); + auto reader = make_flat_multi_range_reader(s, ctx->db().local().get_reader_concurrency_semaphore().make_permit(), std::move(ms), ranges, + cmd.slice, service::get_local_sstable_query_read_priority(), trace_state, mutation_reader::forwarding::no); auto compaction_state = make_lw_shared(*s, cmd.timestamp, cmd.slice, cmd.row_limit, cmd.partition_limit); - return do_with(std::move(reader), std::move(compaction_state), [&, class_config, accounter = std::move(accounter), timeout] ( + return do_with(std::move(reader), std::move(compaction_state), [&, accounter = std::move(accounter), timeout] ( flat_mutation_reader& reader, lw_shared_ptr& compaction_state) mutable { auto rrb = reconcilable_result_builder(*reader.schema(), cmd.slice, std::move(accounter)); return query::consume_page(reader, compaction_state, cmd.slice, std::move(rrb), cmd.row_limit, cmd.partition_limit, cmd.timestamp, - timeout, class_config.max_memory_for_unlimited_query).then([&] (consume_result&& result) mutable { + timeout, *cmd.max_result_size).then([&] (consume_result&& result) mutable { return make_ready_future(page_consume_result(std::move(result), reader.detach_buffer(), std::move(compaction_state))); }); }).then_wrapped([&ctx] (future&& result_fut) { diff --git a/test/boost/multishard_mutation_query_test.cc b/test/boost/multishard_mutation_query_test.cc index 9e43b2d166..295c14bc34 100644 --- a/test/boost/multishard_mutation_query_test.cc +++ b/test/boost/multishard_mutation_query_test.cc @@ -973,7 +973,7 @@ SEASTAR_THREAD_TEST_CASE(fuzzy_test) { const auto& partitions = pop_desc.partitions; 
smp::invoke_on_all([cfg, db = &env.db(), gs = global_schema_ptr(pop_desc.schema), &partitions] { - auto& sem = db->local().make_query_class_config().semaphore; + auto& sem = db->local().get_reader_concurrency_semaphore(); auto resources = sem.available_resources(); resources -= reader_concurrency_semaphore::resources{1, 0}; diff --git a/test/boost/querier_cache_test.cc b/test/boost/querier_cache_test.cc index 63940fadca..207ad883dd 100644 --- a/test/boost/querier_cache_test.cc +++ b/test/boost/querier_cache_test.cc @@ -677,7 +677,7 @@ SEASTAR_THREAD_TEST_CASE(test_resources_based_cache_eviction) { nullptr, db::no_timeout).get(); - auto& semaphore = db.make_query_class_config().semaphore; + auto& semaphore = db.get_reader_concurrency_semaphore(); auto permit = semaphore.make_permit(); BOOST_CHECK_EQUAL(db.get_querier_cache_stats().resource_based_evictions, 0); diff --git a/test/boost/view_build_test.cc b/test/boost/view_build_test.cc index 8c91b5217d..f4c39c9444 100644 --- a/test/boost/view_build_test.cc +++ b/test/boost/view_build_test.cc @@ -554,7 +554,7 @@ SEASTAR_THREAD_TEST_CASE(test_view_update_generator_deadlock) { t->add_sstable_and_update_cache(sst).get(); auto& sem = *with_scheduling_group(e.local_db().get_streaming_scheduling_group(), [&] () { - return &e.local_db().make_query_class_config().semaphore; + return &e.local_db().get_reader_concurrency_semaphore(); }).get0(); // consume all units except what is needed to admit a single reader. 
diff --git a/test/manual/sstable_scan_footprint_test.cc b/test/manual/sstable_scan_footprint_test.cc index 99287ef49f..34edf85145 100644 --- a/test/manual/sstable_scan_footprint_test.cc +++ b/test/manual/sstable_scan_footprint_test.cc @@ -308,7 +308,7 @@ int main(int argc, char** argv) { auto prev_occupancy = logalloc::shard_tracker().occupancy(); testlog.info("Occupancy before: {}", prev_occupancy); - auto& sem = env.local_db().make_query_class_config().semaphore; + auto& sem = env.local_db().get_reader_concurrency_semaphore(); testlog.info("Reading"); stats_collector sc(sem, stats_collector_params); From 6660a5df511a3727fa0b8cc15149f56a3af5006e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Botond=20D=C3=A9nes?= Date: Thu, 28 May 2020 15:55:04 +0300 Subject: [PATCH 20/26] result_memory_accounter: remove default constructor If somebody wants to bypass proper memory accounting they should at the very least be forced to consider if that is indeed wise and think a second about the limit they want to apply. 
--- mutation.cc | 6 ++-- mutation.hh | 2 ++ mutation_query.cc | 3 +- mutation_query.hh | 2 +- query-result-set.cc | 2 +- query-result.hh | 5 +++- table.cc | 2 +- test/boost/mutation_query_test.cc | 46 ++++++++++++++++-------------- test/boost/mutation_test.cc | 24 ++++++++++------ test/perf/memory_footprint_test.cc | 3 +- 10 files changed, 58 insertions(+), 37 deletions(-) diff --git a/mutation.cc b/mutation.cc index 0f68421695..3260ec97c8 100644 --- a/mutation.cc +++ b/mutation.cc @@ -122,20 +122,22 @@ mutation::query(query::result::builder& builder, query::result mutation::query(const query::partition_slice& slice, + query::result_memory_accounter&& accounter, query::result_options opts, gc_clock::time_point now, uint32_t row_limit) && { - query::result::builder builder(slice, opts, { }); + query::result::builder builder(slice, opts, std::move(accounter)); std::move(*this).query(builder, slice, now, row_limit); return builder.build(); } query::result mutation::query(const query::partition_slice& slice, + query::result_memory_accounter&& accounter, query::result_options opts, gc_clock::time_point now, uint32_t row_limit) const& { - return mutation(*this).query(slice, opts, now, row_limit); + return mutation(*this).query(slice, std::move(accounter), opts, now, row_limit); } size_t diff --git a/mutation.hh b/mutation.hh index 8baee800a8..e85ed56bd6 100644 --- a/mutation.hh +++ b/mutation.hh @@ -108,6 +108,7 @@ public: public: // The supplied partition_slice must be governed by this mutation's schema query::result query(const query::partition_slice&, + query::result_memory_accounter&& accounter, query::result_options opts = query::result_options::only_result(), gc_clock::time_point now = gc_clock::now(), uint32_t row_limit = query::max_rows) &&; @@ -115,6 +116,7 @@ public: // The supplied partition_slice must be governed by this mutation's schema // FIXME: Slower than the r-value version query::result query(const query::partition_slice&, + 
query::result_memory_accounter&& accounter, query::result_options opts = query::result_options::only_result(), gc_clock::time_point now = gc_clock::now(), uint32_t row_limit = query::max_rows) const&; diff --git a/mutation_query.cc b/mutation_query.cc index 8c9dffa71c..530c987978 100644 --- a/mutation_query.cc +++ b/mutation_query.cc @@ -58,7 +58,8 @@ bool reconcilable_result::operator!=(const reconcilable_result& other) const { query::result to_data_query_result(const reconcilable_result& r, schema_ptr s, const query::partition_slice& slice, uint32_t max_rows, uint32_t max_partitions, query::result_options opts) { - query::result::builder builder(slice, opts, { }); + // This result was already built with a limit, don't apply another one. + query::result::builder builder(slice, opts, query::result_memory_accounter{ query::result_memory_limiter::unlimited_result_size }); for (const partition& p : r.partitions()) { if (builder.row_count() >= max_rows || builder.partition_count() >= max_partitions) { break; diff --git a/mutation_query.hh b/mutation_query.hh index dbd0374b09..0bdb68bae5 100644 --- a/mutation_query.hh +++ b/mutation_query.hh @@ -164,7 +164,7 @@ future mutation_query( gc_clock::time_point query_time, db::timeout_clock::time_point timeout, query::query_class_config class_config, - query::result_memory_accounter&& accounter = { }, + query::result_memory_accounter&& accounter, tracing::trace_state_ptr trace_ptr = nullptr, query::querier_cache_context cache_ctx = { }); diff --git a/query-result-set.cc b/query-result-set.cc index fdd976c5e5..075c423155 100644 --- a/query-result-set.cc +++ b/query-result-set.cc @@ -225,7 +225,7 @@ result_set::from_raw_result(schema_ptr s, const partition_slice& slice, const re result_set::result_set(const mutation& m) : result_set([&m] { auto slice = partition_slice_builder(*m.schema()).build(); - auto qr = mutation(m).query(slice, result_options::only_result()); + auto qr = mutation(m).query(slice, 
query::result_memory_accounter{ query::result_memory_limiter::unlimited_result_size }, result_options::only_result()); return result_set::from_raw_result(m.schema(), slice, qr); }()) { } diff --git a/query-result.hh b/query-result.hh index 2dc70377ba..749c51cefe 100644 --- a/query-result.hh +++ b/query-result.hh @@ -152,7 +152,10 @@ private: friend class result_memory_limiter; public: - result_memory_accounter() = default; + explicit result_memory_accounter(size_t max_size) noexcept + : _blocked_bytes(0) + , _maximum_result_size(max_size) { + } result_memory_accounter(result_memory_accounter&& other) noexcept : _limiter(std::exchange(other._limiter, nullptr)) diff --git a/table.cc b/table.cc index c5323278bb..9418a820f7 100644 --- a/table.cc +++ b/table.cc @@ -2001,7 +2001,7 @@ struct query_state { const query::read_command& cmd, query::result_options opts, const dht::partition_range_vector& ranges, - query::result_memory_accounter memory_accounter = { }) + query::result_memory_accounter memory_accounter) : schema(std::move(s)) , cmd(cmd) , builder(cmd.slice, opts, std::move(memory_accounter)) diff --git a/test/boost/mutation_query_test.cc b/test/boost/mutation_query_test.cc index 1c268759a5..133bbc885b 100644 --- a/test/boost/mutation_query_test.cc +++ b/test/boost/mutation_query_test.cc @@ -78,6 +78,10 @@ static query::partition_slice make_full_slice(const schema& s) { static auto inf32 = std::numeric_limits::max(); +static query::result_memory_accounter make_accounter() { + return query::result_memory_accounter{ query::result_memory_limiter::unlimited_result_size }; +} + query::result_set to_result_set(const reconcilable_result& r, schema_ptr s, const query::partition_slice& slice) { return query::result_set::from_raw_result(s, slice, to_data_query_result(r, s, slice, inf32, inf32)); } @@ -101,7 +105,7 @@ SEASTAR_TEST_CASE(test_reading_from_single_partition) { auto slice = make_full_slice(*s); reconcilable_result result = mutation_query(s, src, - 
query::full_partition_range, slice, 2, query::max_partitions, now, db::no_timeout, tests::make_query_class_config()).get0(); + query::full_partition_range, slice, 2, query::max_partitions, now, db::no_timeout, tests::make_query_class_config(), make_accounter()).get0(); // FIXME: use mutation assertions assert_that(to_result_set(result, s, slice)) @@ -124,7 +128,7 @@ SEASTAR_TEST_CASE(test_reading_from_single_partition) { .build(); reconcilable_result result = mutation_query(s, src, - query::full_partition_range, slice, query::max_rows, query::max_partitions, now, db::no_timeout, tests::make_query_class_config()).get0(); + query::full_partition_range, slice, query::max_rows, query::max_partitions, now, db::no_timeout, tests::make_query_class_config(), make_accounter()).get0(); assert_that(to_result_set(result, s, slice)) .has_only(a_row() @@ -160,7 +164,7 @@ SEASTAR_TEST_CASE(test_cells_are_expired_according_to_query_timestamp) { auto slice = make_full_slice(*s); reconcilable_result result = mutation_query(s, src, - query::full_partition_range, slice, 1, query::max_partitions, now, db::no_timeout, tests::make_query_class_config()).get0(); + query::full_partition_range, slice, 1, query::max_partitions, now, db::no_timeout, tests::make_query_class_config(), make_accounter()).get0(); assert_that(to_result_set(result, s, slice)) .has_only(a_row() @@ -174,7 +178,7 @@ SEASTAR_TEST_CASE(test_cells_are_expired_according_to_query_timestamp) { auto slice = make_full_slice(*s); reconcilable_result result = mutation_query(s, src, - query::full_partition_range, slice, 1, query::max_partitions, now + 2s, db::no_timeout, tests::make_query_class_config()).get0(); + query::full_partition_range, slice, 1, query::max_partitions, now + 2s, db::no_timeout, tests::make_query_class_config(), make_accounter()).get0(); assert_that(to_result_set(result, s, slice)) .has_only(a_row() @@ -207,7 +211,7 @@ SEASTAR_TEST_CASE(test_reverse_ordering_is_respected) { .build(); reconcilable_result 
result = mutation_query(s, src, - query::full_partition_range, slice, 3, query::max_partitions, now, db::no_timeout, tests::make_query_class_config()).get0(); + query::full_partition_range, slice, 3, query::max_partitions, now, db::no_timeout, tests::make_query_class_config(), make_accounter()).get0(); assert_that(to_result_set(result, s, slice)) .has_size(3) @@ -237,7 +241,7 @@ SEASTAR_TEST_CASE(test_reverse_ordering_is_respected) { .build(); reconcilable_result result = mutation_query(s, src, - query::full_partition_range, slice, 3, query::max_partitions, now, db::no_timeout, tests::make_query_class_config()).get0(); + query::full_partition_range, slice, 3, query::max_partitions, now, db::no_timeout, tests::make_query_class_config(), make_accounter()).get0(); assert_that(to_result_set(result, s, slice)) .has_size(3) @@ -265,7 +269,7 @@ SEASTAR_TEST_CASE(test_reverse_ordering_is_respected) { { reconcilable_result result = mutation_query(s, src, - query::full_partition_range, slice, 10, query::max_partitions, now, db::no_timeout, tests::make_query_class_config()).get0(); + query::full_partition_range, slice, 10, query::max_partitions, now, db::no_timeout, tests::make_query_class_config(), make_accounter()).get0(); assert_that(to_result_set(result, s, slice)) .has_size(3) @@ -285,7 +289,7 @@ SEASTAR_TEST_CASE(test_reverse_ordering_is_respected) { { reconcilable_result result = mutation_query(s, src, - query::full_partition_range, slice, 1, query::max_partitions, now, db::no_timeout, tests::make_query_class_config()).get0(); + query::full_partition_range, slice, 1, query::max_partitions, now, db::no_timeout, tests::make_query_class_config(), make_accounter()).get0(); assert_that(to_result_set(result, s, slice)) .has_size(1) @@ -297,7 +301,7 @@ SEASTAR_TEST_CASE(test_reverse_ordering_is_respected) { { reconcilable_result result = mutation_query(s, src, - query::full_partition_range, slice, 2, query::max_partitions, now, db::no_timeout, 
tests::make_query_class_config()).get0(); + query::full_partition_range, slice, 2, query::max_partitions, now, db::no_timeout, tests::make_query_class_config(), make_accounter()).get0(); assert_that(to_result_set(result, s, slice)) .has_size(2) @@ -324,7 +328,7 @@ SEASTAR_TEST_CASE(test_reverse_ordering_is_respected) { .build(); reconcilable_result result = mutation_query(s, src, - query::full_partition_range, slice, 2, query::max_partitions, now, db::no_timeout, tests::make_query_class_config()).get0(); + query::full_partition_range, slice, 2, query::max_partitions, now, db::no_timeout, tests::make_query_class_config(), make_accounter()).get0(); assert_that(to_result_set(result, s, slice)) .has_size(2) @@ -348,7 +352,7 @@ SEASTAR_TEST_CASE(test_reverse_ordering_is_respected) { .build(); reconcilable_result result = mutation_query(s, src, - query::full_partition_range, slice, 3, query::max_partitions, now, db::no_timeout, tests::make_query_class_config()).get0(); + query::full_partition_range, slice, 3, query::max_partitions, now, db::no_timeout, tests::make_query_class_config(), make_accounter()).get0(); assert_that(to_result_set(result, s, slice)) .has_size(2) @@ -370,7 +374,7 @@ SEASTAR_TEST_CASE(test_reverse_ordering_is_respected) { .build(); reconcilable_result result = mutation_query(s, src, - query::full_partition_range, slice, 3, query::max_partitions, now, db::no_timeout, tests::make_query_class_config()).get0(); + query::full_partition_range, slice, 3, query::max_partitions, now, db::no_timeout, tests::make_query_class_config(), make_accounter()).get0(); assert_that(to_result_set(result, s, slice)) .has_only(a_row() @@ -396,7 +400,7 @@ SEASTAR_TEST_CASE(test_query_when_partition_tombstone_covers_live_cells) { auto slice = make_full_slice(*s); reconcilable_result result = mutation_query(s, src, - query::full_partition_range, slice, query::max_rows, query::max_partitions, now, db::no_timeout, tests::make_query_class_config()).get0(); + 
query::full_partition_range, slice, query::max_rows, query::max_partitions, now, db::no_timeout, tests::make_query_class_config(), make_accounter()).get0(); assert_that(to_result_set(result, s, slice)) .is_empty(); @@ -447,7 +451,7 @@ SEASTAR_TEST_CASE(test_partitions_with_only_expired_tombstones_are_dropped) { auto query_time = now + std::chrono::seconds(1); reconcilable_result result = mutation_query(s, src, query::full_partition_range, slice, query::max_rows, query::max_partitions, query_time, - db::no_timeout, tests::make_query_class_config()).get0(); + db::no_timeout, tests::make_query_class_config(), make_accounter()).get0(); BOOST_REQUIRE_EQUAL(result.partitions().size(), 2); BOOST_REQUIRE_EQUAL(result.row_count(), 2); @@ -466,28 +470,28 @@ SEASTAR_TEST_CASE(test_result_row_count) { auto src = make_source({m1}); auto r = to_data_query_result(mutation_query(s, make_source({m1}), query::full_partition_range, slice, 10000, query::max_partitions, now, - db::no_timeout, tests::make_query_class_config()).get0(), s, slice, inf32, inf32); + db::no_timeout, tests::make_query_class_config(), make_accounter()).get0(), s, slice, inf32, inf32); BOOST_REQUIRE_EQUAL(r.row_count().value(), 0); m1.set_static_cell("s1", data_value(bytes("S_v1")), 1); r = to_data_query_result(mutation_query(s, make_source({m1}), query::full_partition_range, slice, 10000, query::max_partitions, now, - db::no_timeout, tests::make_query_class_config()).get0(), s, slice, inf32, inf32); + db::no_timeout, tests::make_query_class_config(), make_accounter()).get0(), s, slice, inf32, inf32); BOOST_REQUIRE_EQUAL(r.row_count().value(), 1); m1.set_clustered_cell(clustering_key::from_single_value(*s, bytes("A")), "v1", data_value(bytes("A_v1")), 1); r = to_data_query_result(mutation_query(s, make_source({m1}), query::full_partition_range, slice, 10000, query::max_partitions, now, - db::no_timeout, tests::make_query_class_config()).get0(), s, slice, inf32, inf32); + db::no_timeout, 
tests::make_query_class_config(), make_accounter()).get0(), s, slice, inf32, inf32); BOOST_REQUIRE_EQUAL(r.row_count().value(), 1); m1.set_clustered_cell(clustering_key::from_single_value(*s, bytes("B")), "v1", data_value(bytes("B_v1")), 1); r = to_data_query_result(mutation_query(s, make_source({m1}), query::full_partition_range, slice, 10000, query::max_partitions, now, - db::no_timeout, tests::make_query_class_config()).get0(), s, slice, inf32, inf32); + db::no_timeout, tests::make_query_class_config(), make_accounter()).get0(), s, slice, inf32, inf32); BOOST_REQUIRE_EQUAL(r.row_count().value(), 2); mutation m2(s, partition_key::from_single_value(*s, "key2")); m2.set_static_cell("s1", data_value(bytes("S_v1")), 1); r = to_data_query_result(mutation_query(s, make_source({m1, m2}), query::full_partition_range, slice, 10000, query::max_partitions, now, - db::no_timeout, tests::make_query_class_config()).get0(), s, slice, inf32, inf32); + db::no_timeout, tests::make_query_class_config(), make_accounter()).get0(), s, slice, inf32, inf32); BOOST_REQUIRE_EQUAL(r.row_count().value(), 3); }); } @@ -510,7 +514,7 @@ SEASTAR_TEST_CASE(test_partition_limit) { { reconcilable_result result = mutation_query(s, src, - query::full_partition_range, slice, query::max_rows, 10, now, db::no_timeout, tests::make_query_class_config()).get0(); + query::full_partition_range, slice, query::max_rows, 10, now, db::no_timeout, tests::make_query_class_config(), make_accounter()).get0(); assert_that(to_result_set(result, s, slice)) .has_size(2) @@ -526,7 +530,7 @@ SEASTAR_TEST_CASE(test_partition_limit) { { reconcilable_result result = mutation_query(s, src, - query::full_partition_range, slice, query::max_rows, 1, now, db::no_timeout, tests::make_query_class_config()).get0(); + query::full_partition_range, slice, query::max_rows, 1, now, db::no_timeout, tests::make_query_class_config(), make_accounter()).get0(); assert_that(to_result_set(result, s, slice)) .has_size(1) diff --git 
a/test/boost/mutation_test.cc b/test/boost/mutation_test.cc index d29de47fc6..8d6bad425d 100644 --- a/test/boost/mutation_test.cc +++ b/test/boost/mutation_test.cc @@ -776,7 +776,8 @@ SEASTAR_TEST_CASE(test_querying_of_mutation) { auto resultify = [s] (const mutation& m) -> query::result_set { auto slice = make_full_slice(*s); - return query::result_set::from_raw_result(s, slice, m.query(slice)); + return query::result_set::from_raw_result(s, slice, + m.query(slice, query::result_memory_accounter{ query::result_memory_limiter::unlimited_result_size })); }; mutation m(s, partition_key::from_single_value(*s, "key1")); @@ -811,7 +812,8 @@ SEASTAR_TEST_CASE(test_partition_with_no_live_data_is_absent_in_data_query_resul auto slice = make_full_slice(*s); - assert_that(query::result_set::from_raw_result(s, slice, m.query(slice))) + assert_that(query::result_set::from_raw_result(s, slice, + m.query(slice, query::result_memory_accounter{ query::result_memory_limiter::unlimited_result_size }))) .is_empty(); }); } @@ -834,7 +836,8 @@ SEASTAR_TEST_CASE(test_partition_with_live_data_in_static_row_is_present_in_the_ .with_regular_column("v") .build(); - assert_that(query::result_set::from_raw_result(s, slice, m.query(slice))) + assert_that(query::result_set::from_raw_result(s, slice, + m.query(slice, query::result_memory_accounter{ query::result_memory_limiter::unlimited_result_size }))) .has_only(a_row() .with_column("pk", data_value(bytes("key1"))) .with_column("v", data_value::make_null(bytes_type))); @@ -857,7 +860,8 @@ SEASTAR_TEST_CASE(test_query_result_with_one_regular_column_missing) { auto slice = partition_slice_builder(*s).build(); - assert_that(query::result_set::from_raw_result(s, slice, m.query(slice))) + assert_that(query::result_set::from_raw_result(s, slice, + m.query(slice, query::result_memory_accounter{ query::result_memory_limiter::unlimited_result_size }))) .has_only(a_row() .with_column("pk", data_value(bytes("key1"))) .with_column("ck", 
data_value(bytes("ck:A"))) @@ -1243,8 +1247,10 @@ SEASTAR_TEST_CASE(test_query_digest) { auto check_digests_equal = [] (const mutation& m1, const mutation& m2) { auto ps1 = partition_slice_builder(*m1.schema()).build(); auto ps2 = partition_slice_builder(*m2.schema()).build(); - auto digest1 = *m1.query(ps1, query::result_options::only_digest(query::digest_algorithm::xxHash)).digest(); - auto digest2 = *m2.query(ps2, query::result_options::only_digest(query::digest_algorithm::xxHash)).digest(); + auto digest1 = *m1.query(ps1, query::result_memory_accounter{ query::result_memory_limiter::unlimited_result_size }, + query::result_options::only_digest(query::digest_algorithm::xxHash)).digest(); + auto digest2 = *m2.query(ps2, query::result_memory_accounter{ query::result_memory_limiter::unlimited_result_size }, + query::result_options::only_digest(query::digest_algorithm::xxHash)).digest(); if (digest1 != digest2) { BOOST_FAIL(format("Digest should be the same for {} and {}", m1, m2)); } @@ -1493,7 +1499,8 @@ SEASTAR_THREAD_TEST_CASE(test_querying_expired_rows) { .without_partition_key_columns() .build(); auto opts = query::result_options{query::result_request::result_and_digest, query::digest_algorithm::xxHash}; - return query::result_set::from_raw_result(s, slice, m.query(slice, opts, t)); + return query::result_set::from_raw_result(s, slice, + m.query(slice, query::result_memory_accounter{ query::result_memory_limiter::unlimited_result_size }, opts, t)); }; mutation m(s, pk); @@ -1557,7 +1564,8 @@ SEASTAR_TEST_CASE(test_querying_expired_cells) { .without_partition_key_columns() .build(); auto opts = query::result_options{query::result_request::result_and_digest, query::digest_algorithm::xxHash}; - return query::result_set::from_raw_result(s, slice, m.query(slice, opts, t)); + return query::result_set::from_raw_result(s, slice, + m.query(slice, query::result_memory_accounter{ query::result_memory_limiter::unlimited_result_size }, opts, t)); }; { diff --git 
a/test/perf/memory_footprint_test.cc b/test/perf/memory_footprint_test.cc index 104e7a5cb0..a34cbff7fc 100644 --- a/test/perf/memory_footprint_test.cc +++ b/test/perf/memory_footprint_test.cc @@ -196,7 +196,8 @@ static sizes calculate_sizes(cache_tracker& tracker, const mutation_settings& se result.cache = tracker.region().occupancy().used_space() - cache_initial_occupancy; result.frozen = freeze(m).representation().size(); result.canonical = canonical_mutation(m).representation().size(); - result.query_result = m.query(partition_slice_builder(*s).build(), query::result_options::only_result()).buf().size(); + result.query_result = m.query(partition_slice_builder(*s).build(), + query::result_memory_accounter{ query::result_memory_limiter::unlimited_result_size }, query::result_options::only_result()).buf().size(); tmpdir sstable_dir; sstables::test_env env; From d27f8321d7e4147bd18efe0a9e08fb342d954b24 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Botond=20D=C3=A9nes?= Date: Thu, 28 May 2020 15:57:40 +0300 Subject: [PATCH 21/26] partition_slice_builder: add with_option() --- partition_slice_builder.hh | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/partition_slice_builder.hh b/partition_slice_builder.hh index 2b3796bc1a..5337775377 100644 --- a/partition_slice_builder.hh +++ b/partition_slice_builder.hh @@ -54,6 +54,11 @@ public: partition_slice_builder& without_partition_key_columns(); partition_slice_builder& without_clustering_key_columns(); partition_slice_builder& reversed(); + template + partition_slice_builder& with_option() { + _options.set