diff --git a/alternator/CMakeLists.txt b/alternator/CMakeLists.txt index 4cbe0691e4..80edc23bc1 100644 --- a/alternator/CMakeLists.txt +++ b/alternator/CMakeLists.txt @@ -9,6 +9,7 @@ target_sources(alternator controller.cc server.cc executor.cc + executor_read.cc stats.cc serialization.cc expressions.cc diff --git a/alternator/attribute_path.hh b/alternator/attribute_path.hh new file mode 100644 index 0000000000..8ae753c6ad --- /dev/null +++ b/alternator/attribute_path.hh @@ -0,0 +1,253 @@ +/* + * Copyright 2019-present ScyllaDB + */ + +/* + * SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.1 + */ + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "utils/rjson.hh" +#include "utils/overloaded_functor.hh" +#include "alternator/error.hh" +#include "alternator/expressions_types.hh" + +namespace alternator { + +// An attribute_path_map object is used to hold data for various attributes +// paths (parsed::path) in a hierarchy of attribute paths. Each attribute path +// has a root attribute, and then modified by member and index operators - +// for example in "a.b[2].c" we have "a" as the root, then ".b" member, then +// "[2]" index, and finally ".c" member. +// Data can be added to an attribute_path_map using the add() function, but +// requires that attributes with data not be *overlapping* or *conflicting*: +// +// 1. Two attribute paths which are identical or an ancestor of one another +// are considered *overlapping* and not allowed. If a.b.c has data, +// we can't add more data in a.b.c or any of its descendants like a.b.c.d. +// +// 2. Two attribute paths which need the same parent to have both a member and +// an index are considered *conflicting* and not allowed. E.g., if a.b has +// data, you can't add a[1]. The meaning of adding both would be that the +// attribute a is both a map and an array, which isn't sensible. +// +// These two requirements are common to the two places where Alternator uses +// this abstraction to describe how a hierarchical item is to be transformed: +// +// 1. In ProjectExpression: for filtering from a full top-level attribute +// only the parts for which user asked in ProjectionExpression. +// +// 2. In UpdateExpression: for taking the previous value of a top-level +// attribute, and modifying it based on the instructions in the user +// wrote in UpdateExpression. + +template +class attribute_path_map_node { +public: + using data_t = T; + // We need the extra unique_ptr<> here because libstdc++ unordered_map + // doesn't work with incomplete types :-( + using members_t = std::unordered_map>>; + // The indexes list is sorted because DynamoDB requires handling writes + // beyond the end of a list in index order. + using indexes_t = std::map>>; + // The prohibition on "overlap" and "conflict" explained above means + // That only one of data, members or indexes is non-empty. + std::optional> _content; + + bool is_empty() const { return !_content; } + bool has_value() const { return _content && std::holds_alternative(*_content); } + bool has_members() const { return _content && std::holds_alternative(*_content); } + bool has_indexes() const { return _content && std::holds_alternative(*_content); } + // get_members() assumes that has_members() is true + members_t& get_members() { return std::get(*_content); } + const members_t& get_members() const { return std::get(*_content); } + indexes_t& get_indexes() { return std::get(*_content); } + const indexes_t& get_indexes() const { return std::get(*_content); } + T& get_value() { return std::get(*_content); } + const T& get_value() const { return std::get(*_content); } +}; + +template +using attribute_path_map = std::unordered_map>; + +using attrs_to_get_node = attribute_path_map_node; +// attrs_to_get lists which top-level attribute are needed, and possibly also +// which part of the top-level attribute is really needed (when nested +// attribute paths appeared in the query). +// Most code actually uses optional. There, a disengaged +// optional means we should get all attributes, not specific ones. +using attrs_to_get = attribute_path_map; + +// takes a given JSON value and drops its parts which weren't asked to be +// kept. It modifies the given JSON value, or returns false to signify that +// the entire object should be dropped. +// Note that The JSON value is assumed to be encoded using the DynamoDB +// conventions - i.e., it is really a map whose key has a type string, +// and the value is the real object. +template +bool hierarchy_filter(rjson::value& val, const attribute_path_map_node& h) { + if (!val.IsObject() || val.MemberCount() != 1) { + // This shouldn't happen. We shouldn't have stored malformed objects. + // But today Alternator does not validate the structure of nested + // documents before storing them, so this can happen on read. + throw api_error::internal(format("Malformed value object read: {}", val)); + } + const char* type = val.MemberBegin()->name.GetString(); + rjson::value& v = val.MemberBegin()->value; + if (h.has_members()) { + const auto& members = h.get_members(); + if (type[0] != 'M' || !v.IsObject()) { + // If v is not an object (dictionary, map), none of the members + // can match. + return false; + } + rjson::value newv = rjson::empty_object(); + for (auto it = v.MemberBegin(); it != v.MemberEnd(); ++it) { + std::string attr = rjson::to_string(it->name); + auto x = members.find(attr); + if (x != members.end()) { + if (x->second) { + // Only a part of this attribute is to be filtered, do it. + if (hierarchy_filter(it->value, *x->second)) { + // because newv started empty and attr are unique + // (keys of v), we can use add() here + rjson::add_with_string_name(newv, attr, std::move(it->value)); + } + } else { + // The entire attribute is to be kept + rjson::add_with_string_name(newv, attr, std::move(it->value)); + } + } + } + if (newv.MemberCount() == 0) { + return false; + } + v = newv; + } else if (h.has_indexes()) { + const auto& indexes = h.get_indexes(); + if (type[0] != 'L' || !v.IsArray()) { + return false; + } + rjson::value newv = rjson::empty_array(); + const auto& a = v.GetArray(); + for (unsigned i = 0; i < v.Size(); i++) { + auto x = indexes.find(i); + if (x != indexes.end()) { + if (x->second) { + if (hierarchy_filter(a[i], *x->second)) { + rjson::push_back(newv, std::move(a[i])); + } + } else { + // The entire attribute is to be kept + rjson::push_back(newv, std::move(a[i])); + } + } + } + if (newv.Size() == 0) { + return false; + } + v = newv; + } + return true; +} + +// Add a path to an attribute_path_map. Throws a validation error if the path +// "overlaps" with one already in the filter (one is a sub-path of the other) +// or "conflicts" with it (both a member and index is requested). +template +void attribute_path_map_add(const char* source, attribute_path_map& map, const parsed::path& p, T value = {}) { + using node = attribute_path_map_node; + // The first step is to look for the top-level attribute (p.root()): + auto it = map.find(p.root()); + if (it == map.end()) { + if (p.has_operators()) { + it = map.emplace(p.root(), node {std::nullopt}).first; + } else { + (void) map.emplace(p.root(), node {std::move(value)}).first; + // Value inserted for top-level node. We're done. + return; + } + } else if(!p.has_operators()) { + // If p is top-level and we already have it or a part of it + // in map, it's a forbidden overlapping path. + throw api_error::validation(fmt::format( + "Invalid {}: two document paths overlap at {}", source, p.root())); + } else if (it->second.has_value()) { + // If we're here, it != map.end() && p.has_operators && it->second.has_value(). + // This means the top-level attribute already has a value, and we're + // trying to add a non-top-level value. It's an overlap. + throw api_error::validation(fmt::format("Invalid {}: two document paths overlap at {}", source, p.root())); + } + node* h = &it->second; + // The second step is to walk h from the top-level node to the inner node + // where we're supposed to insert the value: + for (const auto& op : p.operators()) { + std::visit(overloaded_functor { + [&] (const std::string& member) { + if (h->is_empty()) { + *h = node {typename node::members_t()}; + } else if (h->has_indexes()) { + throw api_error::validation(format("Invalid {}: two document paths conflict at {}", source, p)); + } else if (h->has_value()) { + throw api_error::validation(format("Invalid {}: two document paths overlap at {}", source, p)); + } + typename node::members_t& members = h->get_members(); + auto it = members.find(member); + if (it == members.end()) { + it = members.insert({member, std::make_unique()}).first; + } + h = it->second.get(); + }, + [&] (unsigned index) { + if (h->is_empty()) { + *h = node {typename node::indexes_t()}; + } else if (h->has_members()) { + throw api_error::validation(format("Invalid {}: two document paths conflict at {}", source, p)); + } else if (h->has_value()) { + throw api_error::validation(format("Invalid {}: two document paths overlap at {}", source, p)); + } + typename node::indexes_t& indexes = h->get_indexes(); + auto it = indexes.find(index); + if (it == indexes.end()) { + it = indexes.insert({index, std::make_unique()}).first; + } + h = it->second.get(); + } + }, op); + } + // Finally, insert the value in the node h. + if (h->is_empty()) { + *h = node {std::move(value)}; + } else { + throw api_error::validation(format("Invalid {}: two document paths overlap at {}", source, p)); + } +} + +// A very simplified version of the above function for the special case of +// adding only top-level attribute. It's not only simpler, we also use a +// different error message, referring to a "duplicate attribute" instead of +// "overlapping paths". DynamoDB also has this distinction (errors in +// AttributesToGet refer to duplicates, not overlaps, but errors in +// ProjectionExpression refer to overlap - even if it's an exact duplicate). +template +void attribute_path_map_add(const char* source, attribute_path_map& map, const std::string& attr, T value = {}) { + using node = attribute_path_map_node; + auto it = map.find(attr); + if (it == map.end()) { + map.emplace(attr, node {std::move(value)}); + } else { + throw api_error::validation(fmt::format( + "Invalid {}: Duplicate attribute: {}", source, attr)); + } +} + +} // namespace alternator diff --git a/alternator/executor.cc b/alternator/executor.cc index 95de4b7232..b4c151b13f 100644 --- a/alternator/executor.cc +++ b/alternator/executor.cc @@ -71,8 +71,6 @@ using namespace std::chrono_literals; -logging::logger elogger("alternator-executor"); - namespace std { template <> struct hash> { size_t operator () (const std::pair& p) const { @@ -83,6 +81,8 @@ namespace std { namespace alternator { +logging::logger elogger("alternator-executor"); + // Alternator-specific table properties stored as hidden table tags: // // Alternator doesn't keep its own records of which Alternator tables exist @@ -180,7 +180,7 @@ void executor::maybe_audit( static lw_shared_ptr create_keyspace_metadata(std::string_view keyspace_name, service::storage_proxy& sp, gms::gossiper& gossiper, api::timestamp_type, const std::map& tags_map, const gms::feature_service& feat, const db::tablets_mode_t::mode tablets_mode); -static map_type attrs_type() { +map_type attrs_type() { static thread_local auto t = map_type_impl::get_instance(utf8_type, bytes_type, true); return t; } @@ -192,7 +192,7 @@ static const column_definition& attrs_column(const schema& schema) { } -static lw_shared_ptr get_stats_from_schema(service::storage_proxy& sp, const schema& schema) { +lw_shared_ptr get_stats_from_schema(service::storage_proxy& sp, const schema& schema) { try { replica::table& table = sp.local_db().find_column_family(schema.id()); if (!table.get_stats().alternator_stats) { @@ -223,35 +223,6 @@ executor::body_writer make_streamed(rjson::value&& value) { }; } -// make_streamed_with_extra_array() is variant of make_streamed() above, which -// builds a streaming response (a function writing to an output stream) from a -// JSON object (rjson::value) but adds to it at the end an additional array. -// The extra array is given a separate chunked_vector to avoid putting it -// inside the rjson::value - because RapidJSON does contiguous allocations for -// arrays which we want to avoid for potentially long arrays in Query/Scan -// responses (see #23535). -// If we ever fix RapidJSON to avoid contiguous allocations for arrays, or -// replace it entirely (#24458), we can remove this function and the function -// rjson::print_with_extra_array() which it calls. -executor::body_writer make_streamed_with_extra_array(rjson::value&& value, - std::string array_name, utils::chunked_vector&& array) { - return [value = std::move(value), array_name = std::move(array_name), array = std::move(array)](output_stream&& _out) mutable -> future<> { - auto out = std::move(_out); - std::exception_ptr ex; - try { - co_await rjson::print_with_extra_array(value, array_name, array, out); - } catch (...) { - ex = std::current_exception(); - } - co_await out.close(); - co_await rjson::destroy_gently(std::move(value)); - // TODO: can/should we also destroy the array gently? - if (ex) { - co_await coroutine::return_exception_ptr(std::move(ex)); - } - }; -} - // This function throws api_error::validation if input value is not an object. static void validate_is_object(const rjson::value& value, const char* caller) { if (!value.IsObject()) { @@ -473,7 +444,7 @@ static void validate_cdc_log_name_length(std::string_view table_name) { // instead of each component individually as DynamoDB does. // The view_name() function assumes the table_name has already been validated // but validates the legality of index_name and the combination of both. -static std::string view_name(std::string_view table_name, std::string_view index_name, const std::string& delim = ":", bool validate_len = true) { +std::string view_name(std::string_view table_name, std::string_view index_name, const std::string& delim, bool validate_len) { if (index_name.length() < 3) { throw api_error::validation("IndexName must be at least 3 characters long"); } @@ -490,11 +461,11 @@ static std::string view_name(std::string_view table_name, std::string_view index return ret; } -static std::string gsi_name(std::string_view table_name, std::string_view index_name, bool validate_len = true) { +std::string gsi_name(std::string_view table_name, std::string_view index_name, bool validate_len) { return view_name(table_name, index_name, ":", validate_len); } -static std::string lsi_name(std::string_view table_name, std::string_view index_name, bool validate_len = true) { +std::string lsi_name(std::string_view table_name, std::string_view index_name, bool validate_len) { return view_name(table_name, index_name, "!:", validate_len); } @@ -515,7 +486,7 @@ static std::optional find_table_name(const rjson::value& request) { return table_name; } -static std::string get_table_name(const rjson::value& request) { +std::string get_table_name(const rjson::value& request) { auto name = find_table_name(request); if (!name) { throw api_error::validation("Missing TableName field in request"); @@ -568,7 +539,7 @@ schema_ptr get_table(service::storage_proxy& proxy, const rjson::value& request) // or an exception if it doesn't exist. Otherwise, if table_name does not // start with INTERNAL_TABLE_PREFIX, this function returns an empty schema_ptr // and the caller should look for a normal Alternator table with that name. -static schema_ptr try_get_internal_table(data_dictionary::database db, std::string_view table_name) { +schema_ptr try_get_internal_table(data_dictionary::database db, std::string_view table_name) { size_t it = table_name.find(executor::INTERNAL_TABLE_PREFIX); if (it != 0) { return schema_ptr{}; @@ -596,64 +567,11 @@ static schema_ptr try_get_internal_table(data_dictionary::database db, std::stri } } -// get_table_or_view() is similar to to get_table(), except it returns either -// a table or a materialized view from which to read, based on the TableName -// and optional IndexName in the request. Only requests like Query and Scan -// which allow IndexName should use this function. -enum class table_or_view_type { base, lsi, gsi, vector_index }; -static std::pair -get_table_or_view(service::storage_proxy& proxy, const rjson::value& request) { - table_or_view_type type = table_or_view_type::base; - std::string table_name = get_table_name(request); - - if (schema_ptr s = try_get_internal_table(proxy.data_dictionary(), table_name)) { - return {s, type}; - } - - std::string keyspace_name = executor::KEYSPACE_NAME_PREFIX + table_name; - const rjson::value* index_name = rjson::find(request, "IndexName"); - std::string orig_table_name; - if (index_name) { - if (index_name->IsString()) { - orig_table_name = std::move(table_name); - table_name = view_name(orig_table_name, rjson::to_string_view(*index_name)); - type = table_or_view_type::gsi; - } else { - throw api_error::validation( - fmt::format("Non-string IndexName '{}'", rjson::to_string_view(*index_name))); - } - // If no tables for global indexes were found, the index may be local - if (!proxy.data_dictionary().has_schema(keyspace_name, table_name)) { - type = table_or_view_type::lsi; - table_name = lsi_name(orig_table_name, rjson::to_string_view(*index_name)); - } - } - - try { - return { proxy.data_dictionary().find_schema(keyspace_name, table_name), type }; - } catch(data_dictionary::no_such_column_family&) { - if (index_name) { - // DynamoDB returns a different error depending on whether the - // base table doesn't exist (ResourceNotFoundException) or it - // does exist but the index does not (ValidationException). - if (proxy.data_dictionary().has_schema(keyspace_name, orig_table_name)) { - throw api_error::validation( - fmt::format("Requested resource not found: Index '{}' for table '{}'", rjson::to_string_view(*index_name), orig_table_name)); - } else { - throw api_error::resource_not_found( - fmt::format("Requested resource not found: Table: {} not found", orig_table_name)); - } - } else { - throw api_error::resource_not_found( - fmt::format("Requested resource not found: Table: {} not found", table_name)); - } - } -} - // get_table_for_write() is similar to get_table(), but additionally, if the // configuration allows this, may also allow writing to system table with -// prefix INTERNAL_TABLE_PREFIX. This is analogous to the function -// get_table_or_view() above which allows *reading* internal tables. +// prefix INTERNAL_TABLE_PREFIX. See also get_table_or_view() in +// executor_read.cc which allows *reading* internal tables by the Query +// operation. static schema_ptr get_table_for_write(service::storage_proxy& proxy, const rjson::value& request) { std::string table_name = get_table_name(request); if (schema_ptr s = try_get_internal_table(proxy.data_dictionary(), table_name)) { @@ -685,7 +603,7 @@ static std::string get_string_attribute(const rjson::value& value, std::string_v // Convenience function for getting the value of a boolean attribute, or a // default value if it is missing. If the attribute exists, but is not a // bool, a descriptive api_error is thrown. -static bool get_bool_attribute(const rjson::value& value, std::string_view attribute_name, bool default_return) { +bool get_bool_attribute(const rjson::value& value, std::string_view attribute_name, bool default_return) { const rjson::value* attribute_value = rjson::find(value, attribute_name); if (!attribute_value) { return default_return; @@ -700,7 +618,7 @@ static bool get_bool_attribute(const rjson::value& value, std::string_view attri // Convenience function for getting the value of an integer attribute, or // an empty optional if it is missing. If the attribute exists, but is not // an integer, a descriptive api_error is thrown. -static std::optional get_int_attribute(const rjson::value& value, std::string_view attribute_name) { +std::optional get_int_attribute(const rjson::value& value, std::string_view attribute_name) { const rjson::value* attribute_value = rjson::find(value, attribute_name); if (!attribute_value) return {}; @@ -2730,7 +2648,7 @@ public: // After calling pk_from_json() and ck_from_json() to extract the pk and ck // components of a key, and if that succeeded, call check_key() to further // check that the key doesn't have any spurious components. -static void check_key(const rjson::value& key, const schema_ptr& schema) { +void check_key(const rjson::value& key, const schema_ptr& schema) { if (key.MemberCount() != (schema->clustering_key_size() == 0 ? 1 : 2)) { throw api_error::validation("Given key attribute not in schema"); } @@ -3397,7 +3315,7 @@ static bool check_needs_read_before_write(const parsed::condition_expression& co // Fail the expression if it has unused attribute names or values. This is // how DynamoDB behaves, so we do too. -static void verify_all_are_used(const rjson::value* field, +void verify_all_are_used(const rjson::value* field, const std::unordered_set& used, const char* field_name, const char* operation) { if (!field) { return; @@ -3639,7 +3557,7 @@ future executor::delete_item(client_state& client co_return res; } -static schema_ptr get_table_from_batch_request(const service::storage_proxy& proxy, const rjson::value::ConstMemberIterator& batch_request) { +schema_ptr get_table_from_batch_request(const service::storage_proxy& proxy, const rjson::value::ConstMemberIterator& batch_request) { sstring table_name = rjson::to_sstring(batch_request->name); // JSON keys are always strings try { return proxy.data_dictionary().find_schema(sstring(executor::KEYSPACE_NAME_PREFIX) + table_name, table_name); @@ -4024,280 +3942,6 @@ static const std::string_view get_item_type_string(const rjson::value& v) { return rjson::to_string_view(mem.name); } -// attrs_to_get saves for each top-level attribute an attrs_to_get_node, -// a hierarchy of subparts that need to be kept. The following function -// takes a given JSON value and drops its parts which weren't asked to be -// kept. It modifies the given JSON value, or returns false to signify that -// the entire object should be dropped. -// Note that The JSON value is assumed to be encoded using the DynamoDB -// conventions - i.e., it is really a map whose key has a type string, -// and the value is the real object. -template -static bool hierarchy_filter(rjson::value& val, const attribute_path_map_node& h) { - if (!val.IsObject() || val.MemberCount() != 1) { - // This shouldn't happen. We shouldn't have stored malformed objects. - // But today Alternator does not validate the structure of nested - // documents before storing them, so this can happen on read. - throw api_error::internal(format("Malformed value object read: {}", val)); - } - const char* type = val.MemberBegin()->name.GetString(); - rjson::value& v = val.MemberBegin()->value; - if (h.has_members()) { - const auto& members = h.get_members(); - if (type[0] != 'M' || !v.IsObject()) { - // If v is not an object (dictionary, map), none of the members - // can match. - return false; - } - rjson::value newv = rjson::empty_object(); - for (auto it = v.MemberBegin(); it != v.MemberEnd(); ++it) { - std::string attr = rjson::to_string(it->name); - auto x = members.find(attr); - if (x != members.end()) { - if (x->second) { - // Only a part of this attribute is to be filtered, do it. - if (hierarchy_filter(it->value, *x->second)) { - // because newv started empty and attr are unique - // (keys of v), we can use add() here - rjson::add_with_string_name(newv, attr, std::move(it->value)); - } - } else { - // The entire attribute is to be kept - rjson::add_with_string_name(newv, attr, std::move(it->value)); - } - } - } - if (newv.MemberCount() == 0) { - return false; - } - v = newv; - } else if (h.has_indexes()) { - const auto& indexes = h.get_indexes(); - if (type[0] != 'L' || !v.IsArray()) { - return false; - } - rjson::value newv = rjson::empty_array(); - const auto& a = v.GetArray(); - for (unsigned i = 0; i < v.Size(); i++) { - auto x = indexes.find(i); - if (x != indexes.end()) { - if (x->second) { - if (hierarchy_filter(a[i], *x->second)) { - rjson::push_back(newv, std::move(a[i])); - } - } else { - // The entire attribute is to be kept - rjson::push_back(newv, std::move(a[i])); - } - } - } - if (newv.Size() == 0) { - return false; - } - v = newv; - } - return true; -} - -// Add a path to an attribute_path_map. Throws a validation error if the path -// "overlaps" with one already in the filter (one is a sub-path of the other) -// or "conflicts" with it (both a member and index is requested). -template -void attribute_path_map_add(const char* source, attribute_path_map& map, const parsed::path& p, T value = {}) { - using node = attribute_path_map_node; - // The first step is to look for the top-level attribute (p.root()): - auto it = map.find(p.root()); - if (it == map.end()) { - if (p.has_operators()) { - it = map.emplace(p.root(), node {std::nullopt}).first; - } else { - (void) map.emplace(p.root(), node {std::move(value)}).first; - // Value inserted for top-level node. We're done. - return; - } - } else if(!p.has_operators()) { - // If p is top-level and we already have it or a part of it - // in map, it's a forbidden overlapping path. - throw api_error::validation(fmt::format( - "Invalid {}: two document paths overlap at {}", source, p.root())); - } else if (it->second.has_value()) { - // If we're here, it != map.end() && p.has_operators && it->second.has_value(). - // This means the top-level attribute already has a value, and we're - // trying to add a non-top-level value. It's an overlap. - throw api_error::validation(fmt::format("Invalid {}: two document paths overlap at {}", source, p.root())); - } - node* h = &it->second; - // The second step is to walk h from the top-level node to the inner node - // where we're supposed to insert the value: - for (const auto& op : p.operators()) { - std::visit(overloaded_functor { - [&] (const std::string& member) { - if (h->is_empty()) { - *h = node {typename node::members_t()}; - } else if (h->has_indexes()) { - throw api_error::validation(format("Invalid {}: two document paths conflict at {}", source, p)); - } else if (h->has_value()) { - throw api_error::validation(format("Invalid {}: two document paths overlap at {}", source, p)); - } - typename node::members_t& members = h->get_members(); - auto it = members.find(member); - if (it == members.end()) { - it = members.insert({member, std::make_unique()}).first; - } - h = it->second.get(); - }, - [&] (unsigned index) { - if (h->is_empty()) { - *h = node {typename node::indexes_t()}; - } else if (h->has_members()) { - throw api_error::validation(format("Invalid {}: two document paths conflict at {}", source, p)); - } else if (h->has_value()) { - throw api_error::validation(format("Invalid {}: two document paths overlap at {}", source, p)); - } - typename node::indexes_t& indexes = h->get_indexes(); - auto it = indexes.find(index); - if (it == indexes.end()) { - it = indexes.insert({index, std::make_unique()}).first; - } - h = it->second.get(); - } - }, op); - } - // Finally, insert the value in the node h. - if (h->is_empty()) { - *h = node {std::move(value)}; - } else { - throw api_error::validation(format("Invalid {}: two document paths overlap at {}", source, p)); - } -} - -// A very simplified version of the above function for the special case of -// adding only top-level attribute. It's not only simpler, we also use a -// different error message, referring to a "duplicate attribute"instead of -// "overlapping paths". DynamoDB also has this distinction (errors in -// AttributesToGet refer to duplicates, not overlaps, but errors in -// ProjectionExpression refer to overlap - even if it's an exact duplicate). -template -void attribute_path_map_add(const char* source, attribute_path_map& map, const std::string& attr, T value = {}) { - using node = attribute_path_map_node; - auto it = map.find(attr); - if (it == map.end()) { - map.emplace(attr, node {std::move(value)}); - } else { - throw api_error::validation(fmt::format( - "Invalid {}: Duplicate attribute: {}", source, attr)); - } -} - -// Parse the "Select" parameter of a Scan or Query operation, throwing a -// ValidationException in various forbidden combinations of options and -// finally returning one of three options: -// 1. regular - the default scan behavior of returning all or specific -// attributes ("ALL_ATTRIBUTES" or "SPECIFIC_ATTRIBUTES"). -// 2. count - just count the items ("COUNT") -// 3. projection - return projected attributes ("ALL_PROJECTED_ATTRIBUTES") -// An ValidationException is thrown when recognizing an invalid combination -// of options - such as ALL_PROJECTED_ATTRIBUTES for a base table, or -// SPECIFIC_ATTRIBUTES without ProjectionExpression or AttributesToGet. -enum class select_type { regular, count, projection }; -static select_type parse_select(const rjson::value& request, table_or_view_type table_type) { - const rjson::value* select_value = rjson::find(request, "Select"); - if (!select_value) { - // If "Select" is not specified, it defaults to ALL_ATTRIBUTES - // on a base table or vector index, or ALL_PROJECTED_ATTRIBUTES on GSI/LSI. - return (table_type == table_or_view_type::base || table_type == table_or_view_type::vector_index) ? - select_type::regular : select_type::projection; - } - if (!select_value->IsString()) { - throw api_error::validation("Select parameter must be a string"); - } - std::string_view select = rjson::to_string_view(*select_value); - const bool has_attributes_to_get = request.HasMember("AttributesToGet"); - const bool has_projection_expression = request.HasMember("ProjectionExpression"); - if (select == "SPECIFIC_ATTRIBUTES") { - if (has_projection_expression || has_attributes_to_get) { - return select_type::regular; - } - throw api_error::validation("Select=SPECIFIC_ATTRIBUTES requires AttributesToGet or ProjectionExpression"); - } - if (has_projection_expression || has_attributes_to_get) { - throw api_error::validation("AttributesToGet or ProjectionExpression require Select to be either SPECIFIC_ATTRIBUTES or missing"); - } - if (select == "COUNT") { - return select_type::count; - } - if (select == "ALL_ATTRIBUTES") { - // FIXME: when we support projections (#5036), if this is a GSI and - // not all attributes are projected to it, we should throw. - return select_type::regular; - } - if (select == "ALL_PROJECTED_ATTRIBUTES") { - if (table_type == table_or_view_type::base) { - throw api_error::validation("ALL_PROJECTED_ATTRIBUTES only allowed for indexes"); - } - return select_type::projection; - } - throw api_error::validation(fmt::format("Unknown Select value '{}'. Allowed choices: ALL_ATTRIBUTES, SPECIFIC_ATTRIBUTES, ALL_PROJECTED_ATTRIBUTES, COUNT", - select)); -} - -// calculate_attrs_to_get() takes either AttributesToGet or -// ProjectionExpression parameters (having both is *not* allowed), -// and returns the list of cells we need to read, or a disengaged optional -// when *all* attributes are to be returned. -// However, in our current implementation, only top-level attributes are -// stored as separate cells - a nested document is stored serialized together -// (as JSON) in the same cell. So this function return a map - each key is the -// top-level attribute we will need need to read, and the value for each -// top-level attribute is the partial hierarchy (struct hierarchy_filter) -// that we will need to extract from that serialized JSON. -// For example, if ProjectionExpression lists a.b and a.c[2], we -// return one top-level attribute name, "a", with the value "{b, c[2]}". - -static std::optional calculate_attrs_to_get(const rjson::value& req, parsed::expression_cache& parsed_expression_cache, std::unordered_set& used_attribute_names, select_type select = select_type::regular) { - if (select == select_type::count) { - // An empty map asks to retrieve no attributes. Note that this is - // different from a disengaged optional which means retrieve all. - return attrs_to_get(); - } - // FIXME: also need to handle select_type::projection - const bool has_attributes_to_get = req.HasMember("AttributesToGet"); - const bool has_projection_expression = req.HasMember("ProjectionExpression"); - if (has_attributes_to_get && has_projection_expression) { - throw api_error::validation( - format("GetItem does not allow both ProjectionExpression and AttributesToGet to be given together")); - } - if (has_attributes_to_get) { - const rjson::value& attributes_to_get = req["AttributesToGet"]; - attrs_to_get ret; - for (auto it = attributes_to_get.Begin(); it != attributes_to_get.End(); ++it) { - attribute_path_map_add("AttributesToGet", ret, rjson::to_string(*it)); - validate_attr_name_length("AttributesToGet", it->GetStringLength(), false); - } - if (ret.empty()) { - throw api_error::validation("Empty AttributesToGet is not allowed. Consider using Select=COUNT instead."); - } - return ret; - } else if (has_projection_expression) { - const rjson::value& projection_expression = req["ProjectionExpression"]; - const rjson::value* expression_attribute_names = rjson::find(req, "ExpressionAttributeNames"); - std::vector paths_to_get; - try { - paths_to_get = parsed_expression_cache.parse_projection_expression(rjson::to_string_view(projection_expression)); - } catch(expressions_syntax_error& e) { - throw api_error::validation(e.what()); - } - resolve_projection_expression(paths_to_get, expression_attribute_names, used_attribute_names); - attrs_to_get ret; - for (const parsed::path& p : paths_to_get) { - attribute_path_map_add("ProjectionExpression", ret, p); - } - return ret; - } - // An disengaged optional asks to read everything - return std::nullopt; -} - /** * Helper routine to extract data when we already have * row, etc etc. @@ -4409,29 +4053,6 @@ std::optional executor::describe_single_item(schema_ptr schema, return item; } -future> executor::describe_multi_item(schema_ptr schema, - const query::partition_slice&& slice, - shared_ptr selection, - foreign_ptr> query_result, - shared_ptr> attrs_to_get, - noncopyable_function item_callback) { - cql3::selection::result_set_builder builder(*selection, gc_clock::now()); - query::result_view::consume(*query_result, slice, cql3::selection::result_set_builder::visitor(builder, *schema, *selection)); - auto result_set = builder.build(); - std::vector ret; - for (auto& result_row : result_set->rows()) { - rjson::value item = rjson::empty_object(); - uint64_t item_length_in_bytes = 0; - describe_single_item(*selection, result_row, *attrs_to_get, item, &item_length_in_bytes); - if (item_callback) { - item_callback(item_length_in_bytes); - } - ret.push_back(std::move(item)); - co_await coroutine::maybe_yield(); - } - co_return ret; -} - static bool check_needs_read_before_write(const parsed::value& v) { return std::visit(overloaded_functor { [&] (const parsed::constant& c) -> bool { @@ -5219,106 +4840,6 @@ future executor::update_item(client_state& client co_return res; } -// Check according to the request's "ConsistentRead" field, which consistency -// level we need to use for the read. The field can be True for strongly -// consistent reads, or False for eventually consistent reads, or if this -// field is absence, we default to eventually consistent reads. -// In Scylla, eventually-consistent reads are implemented as consistency -// level LOCAL_ONE, and strongly-consistent reads as LOCAL_QUORUM. -static db::consistency_level get_read_consistency(const rjson::value& request) { - const rjson::value* consistent_read_value = rjson::find(request, "ConsistentRead"); - bool consistent_read = false; - if (consistent_read_value && !consistent_read_value->IsNull()) { - if (consistent_read_value->IsBool()) { - consistent_read = consistent_read_value->GetBool(); - } else { - throw api_error::validation("ConsistentRead flag must be a boolean"); - } - } - return consistent_read ? db::consistency_level::LOCAL_QUORUM : db::consistency_level::LOCAL_ONE; -} - -// describe_item() wraps the result of describe_single_item() by a map -// as needed by the GetItem request. It should not be used for other purposes, -// use describe_single_item() instead. -static rjson::value describe_item(schema_ptr schema, - const query::partition_slice& slice, - const cql3::selection::selection& selection, - const query::result& query_result, - const std::optional& attrs_to_get, - consumed_capacity_counter& consumed_capacity, - uint64_t& metric) { - std::optional opt_item = executor::describe_single_item(std::move(schema), slice, selection, std::move(query_result), attrs_to_get, &consumed_capacity._total_bytes); - rjson::value item_descr = rjson::empty_object(); - if (opt_item) { - rjson::add(item_descr, "Item", std::move(*opt_item)); - } - consumed_capacity.add_consumed_capacity_to_response_if_needed(item_descr); - metric += consumed_capacity.get_half_units(); - return item_descr; -} - -future executor::get_item(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request, std::unique_ptr& audit_info) { - _stats.api_operations.get_item++; - auto start_time = std::chrono::steady_clock::now(); - elogger.trace("Getting item {}", request); - - schema_ptr schema = get_table(_proxy, request); - lw_shared_ptr per_table_stats = get_stats_from_schema(_proxy, *schema); - per_table_stats->api_operations.get_item++; - tracing::add_alternator_table_name(trace_state, schema->cf_name()); - - rjson::value& query_key = request["Key"]; - db::consistency_level cl = get_read_consistency(request); - - maybe_audit(audit_info, audit::statement_category::QUERY, schema->ks_name(), schema->cf_name(), "GetItem", request, cl); - - partition_key pk = pk_from_json(query_key, schema); - dht::partition_range_vector partition_ranges{dht::partition_range(dht::decorate_key(*schema, pk))}; - - std::vector bounds; - if (schema->clustering_key_size() == 0) { - bounds.push_back(query::clustering_range::make_open_ended_both_sides()); - } else { - clustering_key ck = ck_from_json(query_key, schema); - bounds.push_back(query::clustering_range::make_singular(std::move(ck))); - } - check_key(query_key, schema); - - //TODO(sarna): It would be better to fetch only some attributes of the map, not all - auto regular_columns = - schema->regular_columns() | std::views::transform(&column_definition::id) - | std::ranges::to(); - - auto selection = cql3::selection::selection::wildcard(schema); - - auto partition_slice = query::partition_slice(std::move(bounds), {}, std::move(regular_columns), selection->get_query_options()); - auto command = ::make_lw_shared(schema->id(), schema->version(), partition_slice, _proxy.get_max_result_size(partition_slice), - query::tombstone_limit(_proxy.get_tombstone_limit())); - - std::unordered_set used_attribute_names; - auto attrs_to_get = calculate_attrs_to_get(request, *_parsed_expression_cache, used_attribute_names); - const rjson::value* expression_attribute_names = rjson::find(request, "ExpressionAttributeNames"); - verify_all_are_used(expression_attribute_names, used_attribute_names, "ExpressionAttributeNames", "GetItem"); - rcu_consumed_capacity_counter add_capacity(request, cl == db::consistency_level::LOCAL_QUORUM); - co_await verify_permission(_enforce_authorization, _warn_authorization, client_state, schema, auth::permission::SELECT, _stats); - service::storage_proxy::coordinator_query_result qr = - co_await _proxy.query( - schema, std::move(command), std::move(partition_ranges), cl, - service::storage_proxy::coordinator_query_options(executor::default_timeout(), std::move(permit), client_state, trace_state)); - per_table_stats->api_operations.get_item_latency.mark(std::chrono::steady_clock::now() - start_time); - _stats.api_operations.get_item_latency.mark(std::chrono::steady_clock::now() - start_time); - uint64_t rcu_half_units = 0; - rjson::value res = describe_item(schema, partition_slice, *selection, *qr.query_result, std::move(attrs_to_get), add_capacity, rcu_half_units); - per_table_stats->rcu_half_units_total += rcu_half_units; - _stats.rcu_half_units_total += rcu_half_units; - // Update item size metrics only if we found an item. - if (qr.query_result->row_count().value_or(0) > 0) { - per_table_stats->operation_sizes.get_item_op_size_kb.add(bytes_to_kb_ceil(add_capacity._total_bytes)); - } - co_return rjson::print(std::move(res)); -} - static void check_big_object(const rjson::value& val, int& size_left); static void check_big_array(const rjson::value& val, int& size_left); @@ -5372,1630 +4893,6 @@ static void check_big_object(const rjson::value& val, int& size_left) { } } -future executor::batch_get_item(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request, std::unique_ptr& audit_info) { - // FIXME: In this implementation, an unbounded batch size can cause - // unbounded response JSON object to be buffered in memory, unbounded - // parallelism of the requests, and unbounded amount of non-preemptable - // work in the following loops. So we should limit the batch size, and/or - // the response size, as DynamoDB does. - _stats.api_operations.batch_get_item++; - rjson::value& request_items = request["RequestItems"]; - auto start_time = std::chrono::steady_clock::now(); - // We need to validate all the parameters before starting any asynchronous - // query, and fail the entire request on any parse error. So we parse all - // the input into our own vector "requests", each element a table_requests - // listing all the request aimed at a single table. For efficiency, inside - // each table_requests we further group together all reads going to the - // same partition, so we can later send them together. - bool should_add_rcu = rcu_consumed_capacity_counter::should_add_capacity(request); - struct table_requests { - schema_ptr schema; - db::consistency_level cl; - ::shared_ptr> attrs_to_get; - // clustering_keys keeps a sorted set of clustering keys. It must - // be sorted for the read below (see #10827). Additionally each - // clustering key is mapped to the original rjson::value "Key". - using clustering_keys = std::map; - std::unordered_map requests; - table_requests(schema_ptr s) - : schema(std::move(s)) - , requests(8, partition_key::hashing(*schema), partition_key::equality(*schema)) - {} - void add(rjson::value& key) { - auto pk = pk_from_json(key, schema); - auto it = requests.find(pk); - if (it == requests.end()) { - it = requests.emplace(pk, clustering_key::less_compare(*schema)).first; - } - auto ck = ck_from_json(key, schema); - if (auto [_, inserted] = it->second.emplace(ck, &key); !inserted) { - throw api_error::validation("Provided list of item keys contains duplicates"); - } - } - }; - std::vector requests; - uint batch_size = 0; - for (auto it = request_items.MemberBegin(); it != request_items.MemberEnd(); ++it) { - table_requests rs(get_table_from_batch_request(_proxy, it)); - tracing::add_alternator_table_name(trace_state, rs.schema->cf_name()); - rs.cl = get_read_consistency(it->value); - std::unordered_set used_attribute_names; - rs.attrs_to_get = ::make_shared>(calculate_attrs_to_get(it->value, *_parsed_expression_cache, used_attribute_names)); - const rjson::value* expression_attribute_names = rjson::find(it->value, "ExpressionAttributeNames"); - verify_all_are_used(expression_attribute_names, used_attribute_names,"ExpressionAttributeNames", "GetItem"); - auto& keys = (it->value)["Keys"]; - for (rjson::value& key : keys.GetArray()) { - rs.add(key); - check_key(key, rs.schema); - } - batch_size += rs.requests.size(); - requests.emplace_back(std::move(rs)); - } - - for (const table_requests& tr : requests) { - co_await verify_permission(_enforce_authorization, _warn_authorization, client_state, tr.schema, auth::permission::SELECT, _stats); - } - - _stats.api_operations.batch_get_item_batch_total += batch_size; - _stats.api_operations.batch_get_item_histogram.add(batch_size); - // If we got here, all "requests" are valid, so let's start the - // requests for the different partitions all in parallel. - std::vector>> response_futures; - std::vector consumed_rcu_half_units_per_table(requests.size()); - for (size_t i = 0; i < requests.size(); i++) { - const table_requests& rs = requests[i]; - bool is_quorum = rs.cl == db::consistency_level::LOCAL_QUORUM; - lw_shared_ptr per_table_stats = get_stats_from_schema(_proxy, *rs.schema); - per_table_stats->api_operations.batch_get_item_histogram.add(rs.requests.size()); - for (const auto& [pk, cks] : rs.requests) { - dht::partition_range_vector partition_ranges{dht::partition_range(dht::decorate_key(*rs.schema, pk))}; - std::vector bounds; - if (rs.schema->clustering_key_size() == 0) { - bounds.push_back(query::clustering_range::make_open_ended_both_sides()); - } else { - for (auto& ck : cks) { - bounds.push_back(query::clustering_range::make_singular(ck.first)); - } - } - auto regular_columns = - rs.schema->regular_columns() | std::views::transform(&column_definition::id) - | std::ranges::to(); - auto selection = cql3::selection::selection::wildcard(rs.schema); - auto partition_slice = query::partition_slice(std::move(bounds), {}, std::move(regular_columns), selection->get_query_options()); - auto command = ::make_lw_shared(rs.schema->id(), rs.schema->version(), partition_slice, _proxy.get_max_result_size(partition_slice), - query::tombstone_limit(_proxy.get_tombstone_limit())); - command->allow_limit = db::allow_per_partition_rate_limit::yes; - const auto item_callback = [is_quorum, per_table_stats, &rcus_per_table = consumed_rcu_half_units_per_table[i]](uint64_t size) { - rcus_per_table += rcu_consumed_capacity_counter::get_half_units(size, is_quorum); - // Update item size only if the item exists. - if (size > 0) { - per_table_stats->operation_sizes.batch_get_item_op_size_kb.add(bytes_to_kb_ceil(size)); - } - }; - future> f = _proxy.query(rs.schema, std::move(command), std::move(partition_ranges), rs.cl, - service::storage_proxy::coordinator_query_options(executor::default_timeout(), permit, client_state, trace_state)).then( - [schema = rs.schema, partition_slice = std::move(partition_slice), selection = std::move(selection), attrs_to_get = rs.attrs_to_get, item_callback = std::move(item_callback)] (service::storage_proxy::coordinator_query_result qr) mutable { - utils::get_local_injector().inject("alternator_batch_get_item", [] { throw std::runtime_error("batch_get_item injection"); }); - return describe_multi_item(std::move(schema), std::move(partition_slice), std::move(selection), std::move(qr.query_result), std::move(attrs_to_get), std::move(item_callback)); - }); - response_futures.push_back(std::move(f)); - } - } - - // Wait for all requests to complete, and then return the response. - // In case of full failure (no reads succeeded), an arbitrary error - // from one of the operations will be returned. - bool some_succeeded = false; - std::exception_ptr eptr; - std::set table_names; // for auditing - // FIXME: will_log() here doesn't pass keyspace/table, so keyspace-level audit - // filtering is bypassed — a batch spanning multiple tables is audited as a whole. - bool should_audit = _audit.local_is_initialized() && _audit.local().will_log(audit::statement_category::QUERY); - rjson::value response = rjson::empty_object(); - rjson::add(response, "Responses", rjson::empty_object()); - rjson::add(response, "UnprocessedKeys", rjson::empty_object()); - auto fut_it = response_futures.begin(); - rjson::value consumed_capacity = rjson::empty_array(); - for (size_t i = 0; i < requests.size(); i++) { - const table_requests& rs = requests[i]; - std::string table = table_name(*rs.schema); - if (should_audit) { - table_names.insert(table); - } - for (const auto& [_, cks] : rs.requests) { - auto& fut = *fut_it; - ++fut_it; - try { - std::vector results = co_await std::move(fut); - some_succeeded = true; - if (!response["Responses"].HasMember(table)) { - rjson::add_with_string_name(response["Responses"], table, rjson::empty_array()); - } - for (rjson::value& json : results) { - rjson::push_back(response["Responses"][table], std::move(json)); - } - } catch(...) { - eptr = std::current_exception(); - // This read of potentially several rows in one partition, - // failed. We need to add the row key(s) to UnprocessedKeys. - if (!response["UnprocessedKeys"].HasMember(table)) { - // Add the table's entry in UnprocessedKeys. Need to copy - // all the table's parameters from the request except the - // Keys field, which we start empty and then build below. - rjson::add_with_string_name(response["UnprocessedKeys"], table, rjson::empty_object()); - rjson::value& unprocessed_item = response["UnprocessedKeys"][table]; - rjson::value& request_item = request_items[table]; - for (auto it = request_item.MemberBegin(); it != request_item.MemberEnd(); ++it) { - if (it->name != "Keys") { - rjson::add_with_string_name(unprocessed_item, - rjson::to_string_view(it->name), rjson::copy(it->value)); - } - } - rjson::add_with_string_name(unprocessed_item, "Keys", rjson::empty_array()); - } - for (auto& ck : cks) { - rjson::push_back(response["UnprocessedKeys"][table]["Keys"], std::move(*ck.second)); - } - } - } - uint64_t rcu_half_units = consumed_rcu_half_units_per_table[i]; - _stats.rcu_half_units_total += rcu_half_units; - lw_shared_ptr per_table_stats = get_stats_from_schema(_proxy, *rs.schema); - per_table_stats->rcu_half_units_total += rcu_half_units; - if (should_add_rcu) { - rjson::value entry = rjson::empty_object(); - rjson::add(entry, "TableName", table); - rjson::add(entry, "CapacityUnits", rcu_half_units*0.5); - rjson::push_back(consumed_capacity, std::move(entry)); - } - } - - if (should_add_rcu) { - rjson::add(response, "ConsumedCapacity", std::move(consumed_capacity)); - } - elogger.trace("Unprocessed keys: {}", response["UnprocessedKeys"]); - // NOTE: Each table in the batch has its own CL (set by get_read_consistency()), - // but the audit entry records a single CL for the whole batch. We use ANY as a - // placeholder to indicate "mixed / not applicable". - // FIXME: Auditing is executed only for a complete success - maybe_audit(audit_info, audit::statement_category::QUERY, "", - print_names_for_audit(table_names), "BatchGetItem", request, db::consistency_level::ANY); - if (!some_succeeded && eptr) { - co_await coroutine::return_exception_ptr(std::move(eptr)); - } - auto duration = std::chrono::steady_clock::now() - start_time; - _stats.api_operations.batch_get_item_latency.mark(duration); - for (const table_requests& rs : requests) { - lw_shared_ptr per_table_stats = get_stats_from_schema(_proxy, *rs.schema); - per_table_stats->api_operations.batch_get_item_latency.mark(duration); - } - if (is_big(response)) { - co_return make_streamed(std::move(response)); - } else { - co_return rjson::print(std::move(response)); - } -} - -// "filter" represents a condition that can be applied to individual items -// read by a Query or Scan operation, to decide whether to keep the item. -// A filter is constructed from a Query or Scan request. This uses the -// relevant fields in the query (FilterExpression or QueryFilter/ScanFilter + -// ConditionalOperator). These fields are pre-checked and pre-parsed as much -// as possible, to ensure that later checking of many items is efficient. -class filter { -private: - // Holding QueryFilter/ScanFilter + ConditionalOperator: - struct conditions_filter { - bool require_all; - rjson::value conditions; - }; - // Holding a parsed FilterExpression: - struct expression_filter { - parsed::condition_expression expression; - }; - std::optional> _imp; -public: - // Filtering for Scan and Query are very similar, but there are some - // small differences, especially the names of the request attributes. - enum class request_type { SCAN, QUERY }; - // Note that a filter does not store pointers to the query used to - // construct it. - filter(parsed::expression_cache& parsed_expression_cache, const rjson::value& request, request_type rt, - std::unordered_set& used_attribute_names, - std::unordered_set& used_attribute_values); - bool check(const rjson::value& item) const; - bool filters_on(std::string_view attribute) const; - // for_filters_on() runs the given function on the attributes that the - // filter works on. It may run for the same attribute more than once if - // used more than once in the filter. - void for_filters_on(const noncopyable_function& func) const; - operator bool() const { return bool(_imp); } -}; - -filter::filter(parsed::expression_cache& parsed_expression_cache, const rjson::value& request, request_type rt, - std::unordered_set& used_attribute_names, - std::unordered_set& used_attribute_values) { - const rjson::value* expression = rjson::find(request, "FilterExpression"); - const char* conditions_attribute = (rt == request_type::SCAN) ? "ScanFilter" : "QueryFilter"; - const rjson::value* conditions = rjson::find(request, conditions_attribute); - auto conditional_operator = get_conditional_operator(request); - if (conditional_operator != conditional_operator_type::MISSING && - (!conditions || (conditions->IsObject() && conditions->GetObject().ObjectEmpty()))) { - throw api_error::validation( - format("'ConditionalOperator' parameter cannot be specified for missing or empty {}", - conditions_attribute)); - } - if (expression && conditions) { - throw api_error::validation( - format("FilterExpression and {} are not allowed together", conditions_attribute)); - } - if (expression) { - if (!expression->IsString()) { - throw api_error::validation("FilterExpression must be a string"); - } - if (expression->GetStringLength() == 0) { - throw api_error::validation("FilterExpression must not be empty"); - } - if (rjson::find(request, "AttributesToGet")) { - throw api_error::validation("Cannot use both old-style and new-style parameters in same request: FilterExpression and AttributesToGet"); - } - try { - auto parsed = parsed_expression_cache.parse_condition_expression(rjson::to_string_view(*expression), "FilterExpression"); - const rjson::value* expression_attribute_names = rjson::find(request, "ExpressionAttributeNames"); - const rjson::value* expression_attribute_values = rjson::find(request, "ExpressionAttributeValues"); - resolve_condition_expression(parsed, - expression_attribute_names, expression_attribute_values, - used_attribute_names, used_attribute_values); - _imp = expression_filter { std::move(parsed) }; - } catch(expressions_syntax_error& e) { - throw api_error::validation(e.what()); - } - } - if (conditions) { - if (rjson::find(request, "ProjectionExpression")) { - throw api_error::validation(format("Cannot use both old-style and new-style parameters in same request: {} and ProjectionExpression", conditions_attribute)); - } - bool require_all = conditional_operator != conditional_operator_type::OR; - _imp = conditions_filter { require_all, rjson::copy(*conditions) }; - } -} - -bool filter::check(const rjson::value& item) const { - if (!_imp) { - return true; - } - return std::visit(overloaded_functor { - [&] (const conditions_filter& f) -> bool { - return verify_condition(f.conditions, f.require_all, &item); - }, - [&] (const expression_filter& f) -> bool { - return verify_condition_expression(f.expression, &item); - } - }, *_imp); -} - -bool filter::filters_on(std::string_view attribute) const { - if (!_imp) { - return false; - } - return std::visit(overloaded_functor { - [&] (const conditions_filter& f) -> bool { - for (auto it = f.conditions.MemberBegin(); it != f.conditions.MemberEnd(); ++it) { - if (rjson::to_string_view(it->name) == attribute) { - return true; - } - } - return false; - }, - [&] (const expression_filter& f) -> bool { - return condition_expression_on(f.expression, attribute); - } - }, *_imp); -} - -void filter::for_filters_on(const noncopyable_function& func) const { - if (_imp) { - std::visit(overloaded_functor { - [&] (const conditions_filter& f) -> void { - for (auto it = f.conditions.MemberBegin(); it != f.conditions.MemberEnd(); ++it) { - func(rjson::to_string_view(it->name)); - } - }, - [&] (const expression_filter& f) -> void { - return for_condition_expression_on(f.expression, func); - } - }, *_imp); - } -} - -class describe_items_visitor { - typedef std::vector columns_t; - const columns_t& _columns; - const std::optional& _attrs_to_get; - std::unordered_set _extra_filter_attrs; - const filter& _filter; - typename columns_t::const_iterator _column_it; - rjson::value _item; - // _items is a chunked_vector instead of a RapidJson array - // (rjson::value) because unfortunately RapidJson arrays are stored - // contiguously in memory, and cause large allocations when a Query/Scan - // returns a long list of short items (issue #23535). - utils::chunked_vector _items; - size_t _scanned_count; - -public: - describe_items_visitor(const columns_t& columns, const std::optional& attrs_to_get, filter& filter) - : _columns(columns) - , _attrs_to_get(attrs_to_get) - , _filter(filter) - , _column_it(columns.begin()) - , _item(rjson::empty_object()) - , _scanned_count(0) - { - // _filter.check() may need additional attributes not listed in - // _attrs_to_get (i.e., not requested as part of the output). - // We list those in _extra_filter_attrs. We will include them in - // the JSON but take them out before finally returning the JSON. - if (_attrs_to_get) { - _filter.for_filters_on([&] (std::string_view attr) { - std::string a(attr); // no heterogeneous maps searches :-( - if (!_attrs_to_get->contains(a)) { - _extra_filter_attrs.emplace(std::move(a)); - } - }); - } - } - - void start_row() { - _column_it = _columns.begin(); - } - - void accept_value(managed_bytes_view_opt result_bytes_view) { - if (!result_bytes_view) { - ++_column_it; - return; - } - result_bytes_view->with_linearized([this] (bytes_view bv) { - std::string column_name = (*_column_it)->name_as_text(); - if (column_name != executor::ATTRS_COLUMN_NAME) { - if (!_attrs_to_get || _attrs_to_get->contains(column_name) || _extra_filter_attrs.contains(column_name)) { - if (!_item.HasMember(column_name.c_str())) { - rjson::add_with_string_name(_item, column_name, rjson::empty_object()); - } - rjson::value& field = _item[column_name.c_str()]; - rjson::add_with_string_name(field, type_to_string((*_column_it)->type), json_key_column_value(bv, **_column_it)); - } - } else { - auto deserialized = attrs_type()->deserialize(bv); - auto keys_and_values = value_cast(deserialized); - for (auto entry : keys_and_values) { - std::string attr_name = value_cast(entry.first); - if (!_attrs_to_get || _attrs_to_get->contains(attr_name) || _extra_filter_attrs.contains(attr_name)) { - bytes value = value_cast(entry.second); - // Even if _attrs_to_get asked to keep only a part of a - // top-level attribute, we keep the entire attribute - // at this stage, because the item filter might still - // need the other parts (it was easier for us to keep - // extra_filter_attrs at top-level granularity). We'll - // filter the unneeded parts after item filtering. - rjson::add_with_string_name(_item, attr_name, deserialize_item(value)); - } - } - } - }); - ++_column_it; - } - - void end_row() { - if (_filter.check(_item)) { - // As noted above, we kept entire top-level attributes listed in - // _attrs_to_get. We may need to only keep parts of them. - if (_attrs_to_get) { - for (const auto& attr: *_attrs_to_get) { - // If !attr.has_value() it means we were asked not to keep - // attr entirely, but just parts of it. - if (!attr.second.has_value()) { - rjson::value* toplevel= rjson::find(_item, attr.first); - if (toplevel && !hierarchy_filter(*toplevel, attr.second)) { - rjson::remove_member(_item, attr.first); - } - } - } - } - // Remove the extra attributes _extra_filter_attrs which we had - // to add just for the filter, and not requested to be returned: - for (const auto& attr : _extra_filter_attrs) { - rjson::remove_member(_item, attr); - } - - _items.push_back(std::move(_item)); - } - _item = rjson::empty_object(); - ++_scanned_count; - } - - utils::chunked_vector get_items() && { - return std::move(_items); - } - - size_t get_scanned_count() { - return _scanned_count; - } -}; - -// describe_items() returns a JSON object that includes members "Count" -// and "ScannedCount", but *not* "Items" - that is returned separately -// as a chunked_vector to avoid large contiguous allocations which -// RapidJSON does of its array. The caller should add "Items" to the -// returned JSON object if needed, or print it separately. -// The returned chunked_vector (the items) is std::optional<>, because -// the user may have requested only to count items, and not return any -// items - which is different from returning an empty list of items. -static future>, size_t>> describe_items( - const cql3::selection::selection& selection, - std::unique_ptr result_set, - std::optional&& attrs_to_get, - filter&& filter) { - describe_items_visitor visitor(selection.get_columns(), attrs_to_get, filter); - co_await result_set->visit_gently(visitor); - auto scanned_count = visitor.get_scanned_count(); - utils::chunked_vector items = std::move(visitor).get_items(); - rjson::value items_descr = rjson::empty_object(); - auto size = items.size(); - rjson::add(items_descr, "Count", rjson::value(size)); - rjson::add(items_descr, "ScannedCount", rjson::value(scanned_count)); - // If attrs_to_get && attrs_to_get->empty(), this means the user asked not - // to get any attributes (i.e., a Scan or Query with Select=COUNT) and we - // shouldn't return "Items" at all. - // TODO: consider optimizing the case of Select=COUNT without a filter. - // In that case, we currently build a list of empty items and here drop - // it. We could just count the items and not bother with the empty items. - // (However, remember that when we do have a filter, we need the items). - std::optional> opt_items; - if (!attrs_to_get || !attrs_to_get->empty()) { - opt_items = std::move(items); - } - co_return std::tuple(std::move(items_descr), std::move(opt_items), size); -} - -static rjson::value encode_paging_state(const schema& schema, const service::pager::paging_state& paging_state) { - rjson::value last_evaluated_key = rjson::empty_object(); - std::vector exploded_pk = paging_state.get_partition_key().explode(); - auto exploded_pk_it = exploded_pk.begin(); - for (const column_definition& cdef : schema.partition_key_columns()) { - rjson::add_with_string_name(last_evaluated_key, std::string_view(cdef.name_as_text()), rjson::empty_object()); - rjson::value& key_entry = last_evaluated_key[cdef.name_as_text()]; - rjson::add_with_string_name(key_entry, type_to_string(cdef.type), json_key_column_value(*exploded_pk_it, cdef)); - ++exploded_pk_it; - } - auto pos = paging_state.get_position_in_partition(); - if (pos.has_key()) { - // Alternator itself allows at most one column in clustering key, but - // user can use Alternator api to access system tables which might have - // multiple clustering key columns. So we need to handle that case here. - auto cdef_it = schema.clustering_key_columns().begin(); - for(const auto &exploded_ck : pos.key().explode()) { - rjson::add_with_string_name(last_evaluated_key, std::string_view(cdef_it->name_as_text()), rjson::empty_object()); - rjson::value& key_entry = last_evaluated_key[cdef_it->name_as_text()]; - rjson::add_with_string_name(key_entry, type_to_string(cdef_it->type), json_key_column_value(exploded_ck, *cdef_it)); - ++cdef_it; - } - } - // To avoid possible conflicts (and thus having to reserve these names) we - // avoid adding the weight and region fields of the position to the paging - // state. Alternator will never need these as it doesn't have range - // tombstones (the only thing that can generate a position other than at(row)). - // We conditionally include these fields when reading CQL tables through alternator. - if (!is_alternator_keyspace(schema.ks_name()) && (!pos.has_key() || pos.get_bound_weight() != bound_weight::equal)) { - rjson::add_with_string_name(last_evaluated_key, scylla_paging_region, rjson::empty_object()); - rjson::add(last_evaluated_key[scylla_paging_region.data()], "S", rjson::from_string(fmt::to_string(pos.region()))); - rjson::add_with_string_name(last_evaluated_key, scylla_paging_weight, rjson::empty_object()); - rjson::add(last_evaluated_key[scylla_paging_weight.data()], "N", static_cast(pos.get_bound_weight())); - } - return last_evaluated_key; -} - -// RapidJSON allocates arrays contiguously in memory, so we want to avoid -// returning a large number of items as a single rapidjson array, and use -// a chunked_vector instead. The following constant is an arbitrary cutoff -// point for when to switch from a rapidjson array to a chunked_vector. -static constexpr int max_items_for_rapidjson_array = 256; - -static future do_query(service::storage_proxy& proxy, - schema_ptr table_schema, - const rjson::value* exclusive_start_key, - dht::partition_range_vector partition_ranges, - std::vector ck_bounds, - std::optional attrs_to_get, - uint32_t limit, - db::consistency_level cl, - filter filter, - query::partition_slice::option_set custom_opts, - service::client_state& client_state, - alternator::stats& stats, - tracing::trace_state_ptr trace_state, - service_permit permit, - bool enforce_authorization, - bool warn_authorization) { - lw_shared_ptr old_paging_state = nullptr; - - tracing::trace(trace_state, "Performing a database query"); - - // Reverse the schema and the clustering bounds as the underlying code expects - // reversed queries in the native reversed format. - auto query_schema = table_schema; - const bool reversed = custom_opts.contains(); - if (reversed) { - query_schema = table_schema->get_reversed(); - - std::reverse(ck_bounds.begin(), ck_bounds.end()); - for (auto& bound : ck_bounds) { - bound = query::reverse(bound); - } - } - - if (exclusive_start_key) { - partition_key pk = pk_from_json(*exclusive_start_key, table_schema); - auto pos = position_in_partition::for_partition_start(); - if (table_schema->clustering_key_size() > 0) { - pos = pos_from_json(*exclusive_start_key, table_schema); - } - old_paging_state = make_lw_shared(pk, pos, query::max_partitions, query_id::create_null_id(), service::pager::paging_state::replicas_per_token_range{}, std::nullopt, 0); - } - - co_await verify_permission(enforce_authorization, warn_authorization, client_state, table_schema, auth::permission::SELECT, stats); - - auto regular_columns = - table_schema->regular_columns() | std::views::transform(&column_definition::id) - | std::ranges::to(); - auto static_columns = - table_schema->static_columns() | std::views::transform(&column_definition::id) - | std::ranges::to(); - auto selection = cql3::selection::selection::wildcard(table_schema); - query::partition_slice::option_set opts = selection->get_query_options(); - opts.add(custom_opts); - auto partition_slice = query::partition_slice(std::move(ck_bounds), std::move(static_columns), std::move(regular_columns), opts); - auto command = ::make_lw_shared(query_schema->id(), query_schema->version(), partition_slice, proxy.get_max_result_size(partition_slice), - query::tombstone_limit(proxy.get_tombstone_limit())); - - elogger.trace("Executing read query (reversed {}): table schema {}, query schema {}", partition_slice.is_reversed(), table_schema->version(), query_schema->version()); - - auto query_state_ptr = std::make_unique(client_state, trace_state, std::move(permit)); - - // FIXME: should be moved above, set on opts, so get_max_result_size knows it? - command->slice.options.set(); - auto query_options = std::make_unique(cl, std::vector{}); - query_options = std::make_unique(std::move(query_options), std::move(old_paging_state)); - auto p = service::pager::query_pagers::pager(proxy, query_schema, selection, *query_state_ptr, *query_options, command, std::move(partition_ranges), nullptr); - - std::unique_ptr rs = co_await p->fetch_page(limit, gc_clock::now(), executor::default_timeout()); - if (!p->is_exhausted()) { - rs->get_metadata().set_paging_state(p->state()); - } - auto paging_state = rs->get_metadata().paging_state(); - bool has_filter = filter; - auto [items_descr, opt_items, size] = co_await describe_items(*selection, std::move(rs), std::move(attrs_to_get), std::move(filter)); - if (paging_state) { - rjson::add(items_descr, "LastEvaluatedKey", encode_paging_state(*table_schema, *paging_state)); - } - if (has_filter) { - stats.cql_stats.filtered_rows_read_total += p->stats().rows_read_total; - // update our "filtered_row_matched_total" for all the rows matched, despited the filter - stats.cql_stats.filtered_rows_matched_total += size; - } - if (opt_items) { - if (opt_items->size() >= max_items_for_rapidjson_array) { - // There are many items, better print the JSON and the array of - // items (opt_items) separately to avoid RapidJSON's contiguous - // allocation of arrays. - co_return make_streamed_with_extra_array(std::move(items_descr), "Items", std::move(*opt_items)); - } - // There aren't many items in the chunked vector opt_items, - // let's just insert them into the JSON object and print the - // full JSON normally. - rjson::value items_json = rjson::empty_array(); - for (auto& item : *opt_items) { - rjson::push_back(items_json, std::move(item)); - } - rjson::add(items_descr, "Items", std::move(items_json)); - } - if (is_big(items_descr)) { - co_return make_streamed(std::move(items_descr)); - } - co_return rjson::print(std::move(items_descr)); -} - -static dht::token token_for_segment(int segment, int total_segments) { - throwing_assert(total_segments > 1 && segment >= 0 && segment < total_segments); - uint64_t delta = std::numeric_limits::max() / total_segments; - return dht::token::from_int64(std::numeric_limits::min() + delta * segment); -} - -static dht::partition_range get_range_for_segment(int segment, int total_segments) { - if (total_segments == 1) { - return dht::partition_range::make_open_ended_both_sides(); - } - if (segment == 0) { - dht::token ending_token = token_for_segment(1, total_segments); - return dht::partition_range::make_ending_with( - dht::partition_range::bound(dht::ring_position::ending_at(ending_token), false)); - } else if (segment == total_segments - 1) { - dht::token starting_token = token_for_segment(segment, total_segments); - return dht::partition_range::make_starting_with( - dht::partition_range::bound(dht::ring_position::starting_at(starting_token))); - } else { - dht::token starting_token = token_for_segment(segment, total_segments); - dht::token ending_token = token_for_segment(segment + 1, total_segments); - return dht::partition_range::make( - dht::partition_range::bound(dht::ring_position::starting_at(starting_token)), - dht::partition_range::bound(dht::ring_position::ending_at(ending_token), false) - ); - } -} - -future executor::scan(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request, std::unique_ptr& audit_info) { - _stats.api_operations.scan++; - elogger.trace("Scanning {}", request); - - auto [schema, table_type] = get_table_or_view(_proxy, request); - db::consistency_level cl = get_read_consistency(request); - - maybe_audit(audit_info, audit::statement_category::QUERY, schema->ks_name(), schema->cf_name(), "Scan", request, cl); - - tracing::add_alternator_table_name(trace_state, schema->cf_name()); - get_stats_from_schema(_proxy, *schema)->api_operations.scan++; - auto segment = get_int_attribute(request, "Segment"); - auto total_segments = get_int_attribute(request, "TotalSegments"); - if (segment || total_segments) { - if (!segment || !total_segments) { - return make_ready_future(api_error::validation( - "Both Segment and TotalSegments attributes need to be present for a parallel scan")); - } - if (*segment < 0 || *segment >= *total_segments) { - return make_ready_future(api_error::validation( - "Segment must be non-negative and less than TotalSegments")); - } - if (*total_segments < 0 || *total_segments > 1000000) { - return make_ready_future(api_error::validation( - "TotalSegments must be non-negative and less or equal to 1000000")); - } - } - - rjson::value* exclusive_start_key = rjson::find(request, "ExclusiveStartKey"); - - if (table_type == table_or_view_type::gsi && cl != db::consistency_level::LOCAL_ONE) { - return make_ready_future(api_error::validation( - "Consistent reads are not allowed on global indexes (GSI)")); - } - rjson::value* limit_json = rjson::find(request, "Limit"); - uint32_t limit = limit_json ? limit_json->GetUint64() : std::numeric_limits::max(); - if (limit <= 0) { - return make_ready_future(api_error::validation("Limit must be greater than 0")); - } - - select_type select = parse_select(request, table_type); - - std::unordered_set used_attribute_names; - std::unordered_set used_attribute_values; - auto attrs_to_get = calculate_attrs_to_get(request, *_parsed_expression_cache, used_attribute_names, select); - - dht::partition_range_vector partition_ranges; - if (segment) { - auto range = get_range_for_segment(*segment, *total_segments); - if (exclusive_start_key) { - auto ring_pos = dht::ring_position{dht::decorate_key(*schema, pk_from_json(*exclusive_start_key, schema))}; - if (!range.contains(ring_pos, dht::ring_position_comparator(*schema))) { - return make_ready_future(api_error::validation( - format("The provided starting key is invalid: Invalid ExclusiveStartKey. Please use ExclusiveStartKey " - "with correct Segment. TotalSegments: {} Segment: {}", *total_segments, *segment))); - } - } - partition_ranges.push_back(range); - } else { - partition_ranges.push_back(dht::partition_range::make_open_ended_both_sides()); - } - std::vector ck_bounds{query::clustering_range::make_open_ended_both_sides()}; - - filter filter(*_parsed_expression_cache, request, filter::request_type::SCAN, used_attribute_names, used_attribute_values); - // Note: Unlike Query, Scan does allow a filter on the key attributes. - // For some *specific* cases of key filtering, such an equality test on - // partition key or comparison operator for the sort key, we could have - // optimized the filtering by modifying partition_ranges and/or - // ck_bounds. We haven't done this optimization yet. - - const rjson::value* expression_attribute_names = rjson::find(request, "ExpressionAttributeNames"); - const rjson::value* expression_attribute_values = rjson::find(request, "ExpressionAttributeValues"); - verify_all_are_used(expression_attribute_names, used_attribute_names, "ExpressionAttributeNames", "Scan"); - verify_all_are_used(expression_attribute_values, used_attribute_values, "ExpressionAttributeValues", "Scan"); - - return do_query(_proxy, schema, exclusive_start_key, std::move(partition_ranges), std::move(ck_bounds), std::move(attrs_to_get), limit, cl, - std::move(filter), query::partition_slice::option_set(), client_state, _stats, trace_state, std::move(permit), _enforce_authorization, _warn_authorization); -} - -static dht::partition_range calculate_pk_bound(schema_ptr schema, const column_definition& pk_cdef, const rjson::value& comp_definition, const rjson::value& attrs) { - auto op = get_comparison_operator(comp_definition); - if (op != comparison_operator_type::EQ) { - throw api_error::validation(format("Hash key can only be restricted with equality operator (EQ). {} not supported.", comp_definition)); - } - if (attrs.Size() != 1) { - throw api_error::validation(format("A single attribute is required for a hash key EQ restriction: {}", attrs)); - } - bytes raw_value = get_key_from_typed_value(attrs[0], pk_cdef); - partition_key pk = partition_key::from_singular_bytes(*schema, std::move(raw_value)); - auto decorated_key = dht::decorate_key(*schema, pk); - return dht::partition_range(decorated_key); -} - -static query::clustering_range get_clustering_range_for_begins_with(bytes&& target, const clustering_key& ck, schema_ptr schema, data_type t) { - auto it = boost::range::find_end(target, bytes("\xFF"), std::not_equal_to()); - if (it != target.end()) { - ++*it; - target.resize(std::distance(target.begin(), it) + 1); - clustering_key upper_limit = clustering_key::from_single_value(*schema, target); - return query::clustering_range::make(query::clustering_range::bound(ck), query::clustering_range::bound(upper_limit, false)); - } - return query::clustering_range::make_starting_with(query::clustering_range::bound(ck)); -} - -static query::clustering_range calculate_ck_bound(schema_ptr schema, const column_definition& ck_cdef, const rjson::value& comp_definition, const rjson::value& attrs) { - auto op = get_comparison_operator(comp_definition); - const size_t expected_attrs_size = (op == comparison_operator_type::BETWEEN) ? 2 : 1; - if (attrs.Size() != expected_attrs_size) { - throw api_error::validation(format("{} arguments expected for a sort key restriction: {}", expected_attrs_size, attrs)); - } - bytes raw_value = get_key_from_typed_value(attrs[0], ck_cdef); - clustering_key ck = clustering_key::from_single_value(*schema, raw_value); - switch (op) { - case comparison_operator_type::EQ: - return query::clustering_range(ck); - case comparison_operator_type::LE: - return query::clustering_range::make_ending_with(query::clustering_range::bound(ck)); - case comparison_operator_type::LT: - return query::clustering_range::make_ending_with(query::clustering_range::bound(ck, false)); - case comparison_operator_type::GE: - return query::clustering_range::make_starting_with(query::clustering_range::bound(ck)); - case comparison_operator_type::GT: - return query::clustering_range::make_starting_with(query::clustering_range::bound(ck, false)); - case comparison_operator_type::BETWEEN: { - bytes raw_upper_limit = get_key_from_typed_value(attrs[1], ck_cdef); - clustering_key upper_limit = clustering_key::from_single_value(*schema, raw_upper_limit); - return query::clustering_range::make(query::clustering_range::bound(ck), query::clustering_range::bound(upper_limit)); - } - case comparison_operator_type::BEGINS_WITH: { - if (raw_value.empty()) { - return query::clustering_range::make_open_ended_both_sides(); - } - // NOTICE(sarna): A range starting with given prefix and ending (non-inclusively) with a string "incremented" by a single - // character at the end. Throws for NUMBER instances. - if (!ck_cdef.type->is_compatible_with(*utf8_type)) { - throw api_error::validation(fmt::format("BEGINS_WITH operator cannot be applied to type {}", type_to_string(ck_cdef.type))); - } - return get_clustering_range_for_begins_with(std::move(raw_value), ck, schema, ck_cdef.type); - } - default: - throw api_error::validation(format("Operator {} not supported for sort key", comp_definition)); - } -} - -// Calculates primary key bounds from KeyConditions -static std::pair> -calculate_bounds_conditions(schema_ptr schema, const rjson::value& conditions) { - dht::partition_range_vector partition_ranges; - std::vector ck_bounds; - - for (auto it = conditions.MemberBegin(); it != conditions.MemberEnd(); ++it) { - sstring key = rjson::to_sstring(it->name); - const rjson::value& condition = it->value; - - const rjson::value& comp_definition = rjson::get(condition, "ComparisonOperator"); - const rjson::value& attr_list = rjson::get(condition, "AttributeValueList"); - - const column_definition& pk_cdef = schema->partition_key_columns().front(); - const column_definition* ck_cdef = schema->clustering_key_size() > 0 ? &schema->clustering_key_columns().front() : nullptr; - if (key == pk_cdef.name_as_text()) { - if (!partition_ranges.empty()) { - throw api_error::validation("Currently only a single restriction per key is allowed"); - } - partition_ranges.push_back(calculate_pk_bound(schema, pk_cdef, comp_definition, attr_list)); - } - if (ck_cdef && key == ck_cdef->name_as_text()) { - if (!ck_bounds.empty()) { - throw api_error::validation("Currently only a single restriction per key is allowed"); - } - ck_bounds.push_back(calculate_ck_bound(schema, *ck_cdef, comp_definition, attr_list)); - } - } - - // Validate that a query's conditions must be on the hash key, and - // optionally also on the sort key if it exists. - if (partition_ranges.empty()) { - throw api_error::validation(format("Query missing condition on hash key '{}'", schema->partition_key_columns().front().name_as_text())); - } - if (schema->clustering_key_size() == 0) { - if (conditions.MemberCount() != 1) { - throw api_error::validation("Only one condition allowed in table with only hash key"); - } - } else { - if (conditions.MemberCount() == 2 && ck_bounds.empty()) { - throw api_error::validation(format("Query missing condition on sort key '{}'", schema->clustering_key_columns().front().name_as_text())); - } else if (conditions.MemberCount() > 2) { - throw api_error::validation("Only one or two conditions allowed in table with hash key and sort key"); - } - } - - if (ck_bounds.empty()) { - ck_bounds.push_back(query::clustering_range::make_open_ended_both_sides()); - } - - return {std::move(partition_ranges), std::move(ck_bounds)}; -} - -// Extract the top-level column name specified in a KeyConditionExpression. -// If a nested attribute path is given, a ValidationException is generated. -// If the column name is a #reference to ExpressionAttributeNames, the -// reference is resolved. -// Note this function returns a string_view, which may refer to data in the -// given parsed::value or expression_attribute_names. -static std::string_view get_toplevel(const parsed::value& v, - const rjson::value* expression_attribute_names, - std::unordered_set& used_attribute_names) -{ - const parsed::path& path = std::get(v._value); - if (path.has_operators()) { - throw api_error::validation("KeyConditionExpression does not support nested attributes"); - } - std::string_view column_name = path.root(); - if (column_name.size() > 0 && column_name[0] == '#') { - used_attribute_names.emplace(column_name); - if (!expression_attribute_names) { - throw api_error::validation( - fmt::format("ExpressionAttributeNames missing, entry '{}' required by KeyConditionExpression", - column_name)); - } - const rjson::value* value = rjson::find(*expression_attribute_names, column_name); - if (!value || !value->IsString()) { - throw api_error::validation( - fmt::format("ExpressionAttributeNames missing entry '{}' required by KeyConditionExpression", - column_name)); - } - column_name = rjson::to_string_view(*value); - } - return column_name; -} - -// Extract a constant value specified in a KeyConditionExpression. -// This constant was originally parsed as a reference (:name) to a member of -// ExpressionAttributeValues, but at this point, after resolve_value(), it -// was already converted into a JSON value. -// This function decodes the value (using its given expected type) into bytes -// which Scylla uses as the actual key value. If the value has the wrong type, -// or the input had other problems, a ValidationException is thrown. -static bytes get_constant_value(const parsed::value& v, - const column_definition& column) -{ - const parsed::constant& constant = std::get(v._value); - const parsed::constant::literal& lit = std::get(constant._value); - return get_key_from_typed_value(*lit, column); -} - -// condition_expression_and_list extracts a list of ANDed primitive conditions -// from a condition_expression. This is useful for KeyConditionExpression, -// which may not use OR or NOT. If the given condition_expression does use -// OR or NOT, this function throws a ValidationException. -static void condition_expression_and_list( - const parsed::condition_expression& condition_expression, - std::vector& conditions) -{ - if (condition_expression._negated) { - throw api_error::validation("KeyConditionExpression cannot use NOT"); - } - std::visit(overloaded_functor { - [&] (const parsed::primitive_condition& cond) { - conditions.push_back(&cond); - }, - [&] (const parsed::condition_expression::condition_list& list) { - if (list.op == '|' && list.conditions.size() > 1) { - throw api_error::validation("KeyConditionExpression cannot use OR"); - } - for (const parsed::condition_expression& cond : list.conditions) { - condition_expression_and_list(cond, conditions); - } - } - }, condition_expression._expression); -} - -// Calculates primary key bounds from KeyConditionExpression -static std::pair> -calculate_bounds_condition_expression(schema_ptr schema, - const rjson::value& expression, - const rjson::value* expression_attribute_values, - std::unordered_set& used_attribute_values, - const rjson::value* expression_attribute_names, - std::unordered_set& used_attribute_names, - parsed::expression_cache& parsed_expression_cache) -{ - if (!expression.IsString()) { - throw api_error::validation("KeyConditionExpression must be a string"); - } - if (expression.GetStringLength() == 0) { - throw api_error::validation("KeyConditionExpression must not be empty"); - } - // We parse the KeyConditionExpression with the same parser we use for - // ConditionExpression. But KeyConditionExpression only supports a subset - // of the ConditionExpression features, so we have many additional - // verifications below that the key condition is legal. Briefly, a valid - // key condition must contain a single partition key and a single - // sort-key range. - parsed::condition_expression p; - try { - p = parsed_expression_cache.parse_condition_expression(rjson::to_string_view(expression), "KeyConditionExpression"); - } catch(expressions_syntax_error& e) { - throw api_error::validation(e.what()); - } - resolve_condition_expression(p, - expression_attribute_names, expression_attribute_values, - used_attribute_names, used_attribute_values); - std::vector conditions; - condition_expression_and_list(p, conditions); - - if (conditions.size() < 1 || conditions.size() > 2) { - throw api_error::validation( - "KeyConditionExpression syntax error: must have 1 or 2 conditions"); - } - // Scylla allows us to have an (equality) constraint on the partition key - // pk_cdef, and a range constraint on the *first* clustering key ck_cdef. - // Note that this is also good enough for our GSI implementation - the - // GSI's user-specified sort key will be the first clustering key. - // FIXME: In the case described in issue #5320 (base and GSI both have - // just hash key - but different ones), this may allow the user to Query - // using the base key which isn't officially part of the GSI. - const column_definition& pk_cdef = schema->partition_key_columns().front(); - const column_definition* ck_cdef = schema->clustering_key_size() > 0 ? - &schema->clustering_key_columns().front() : nullptr; - - dht::partition_range_vector partition_ranges; - std::vector ck_bounds; - for (const parsed::primitive_condition* condp : conditions) { - const parsed::primitive_condition& cond = *condp; - // In all comparison operators, one operand must be a column name, - // the other is a constant (value reference). We remember which is - // which in toplevel_ind, and also the column name in key (not just - // for comparison operators). - std::string_view key; - int toplevel_ind; - switch (cond._values.size()) { - case 1: { - // The only legal single-value condition is a begin_with() function, - // and it must have two parameters - a top-level attribute and a - // value reference.. - const parsed::value::function_call *f = std::get_if(&cond._values[0]._value); - if (!f) { - throw api_error::validation("KeyConditionExpression cannot be just a value"); - } - if (f->_function_name != "begins_with") { - throw api_error::validation( - fmt::format("KeyConditionExpression function '{}' not supported",f->_function_name)); - } - if (f->_parameters.size() != 2 || !f->_parameters[0].is_path() || - !f->_parameters[1].is_constant()) { - throw api_error::validation( - "KeyConditionExpression begins_with() takes attribute and value"); - } - key = get_toplevel(f->_parameters[0], expression_attribute_names, used_attribute_names); - toplevel_ind = -1; - break; - } - case 2: - if (cond._values[0].is_path() && cond._values[1].is_constant()) { - toplevel_ind = 0; - } else if (cond._values[1].is_path() && cond._values[0].is_constant()) { - toplevel_ind = 1; - } else { - throw api_error::validation("KeyConditionExpression must compare attribute with constant"); - } - key = get_toplevel(cond._values[toplevel_ind], expression_attribute_names, used_attribute_names); - break; - case 3: - // Only BETWEEN has three operands. First must be a column name, - // two other must be value references (constants): - if (cond._op != parsed::primitive_condition::type::BETWEEN) { - // Shouldn't happen unless we have a bug in the parser - throw std::logic_error(format("Wrong number of values {} in primitive_condition", cond._values.size())); - } - if (cond._values[0].is_path() && cond._values[1].is_constant() && cond._values[2].is_constant()) { - toplevel_ind = 0; - key = get_toplevel(cond._values[0], expression_attribute_names, used_attribute_names); - } else { - throw api_error::validation("KeyConditionExpression must compare attribute with constants"); - } - break; - default: - // Shouldn't happen unless we have a bug in the parser - throw std::logic_error(format("Wrong number of values {} in primitive_condition", cond._values.size())); - } - if (cond._op == parsed::primitive_condition::type::IN) { - throw api_error::validation("KeyConditionExpression does not support IN operator"); - } else if (cond._op == parsed::primitive_condition::type::NE) { - throw api_error::validation("KeyConditionExpression does not support NE operator"); - } else if (cond._op == parsed::primitive_condition::type::EQ) { - // the EQ operator (=) is the only one which can be used for both - // the partition key and sort key: - if (sstring(key) == pk_cdef.name_as_text()) { - if (!partition_ranges.empty()) { - throw api_error::validation( - "KeyConditionExpression allows only one condition for each key"); - } - bytes raw_value = get_constant_value(cond._values[!toplevel_ind], pk_cdef); - partition_key pk = partition_key::from_singular_bytes(*schema, std::move(raw_value)); - auto decorated_key = dht::decorate_key(*schema, pk); - partition_ranges.push_back(dht::partition_range(decorated_key)); - } else if (ck_cdef && sstring(key) == ck_cdef->name_as_text()) { - if (!ck_bounds.empty()) { - throw api_error::validation( - "KeyConditionExpression allows only one condition for each key"); - } - bytes raw_value = get_constant_value(cond._values[!toplevel_ind], *ck_cdef); - clustering_key ck = clustering_key::from_single_value(*schema, raw_value); - ck_bounds.push_back(query::clustering_range(ck)); - } else { - throw api_error::validation( - fmt::format("KeyConditionExpression condition on non-key attribute {}", key)); - } - continue; - } - // If we're still here, it's any other operator besides EQ, and these - // are allowed *only* on the clustering key: - if (sstring(key) == pk_cdef.name_as_text()) { - throw api_error::validation( - fmt::format("KeyConditionExpression only '=' condition is supported on partition key {}", key)); - } else if (!ck_cdef || sstring(key) != ck_cdef->name_as_text()) { - throw api_error::validation( - fmt::format("KeyConditionExpression condition on non-key attribute {}", key)); - } - if (!ck_bounds.empty()) { - throw api_error::validation( - "KeyConditionExpression allows only one condition for each key"); - } - if (cond._op == parsed::primitive_condition::type::BETWEEN) { - clustering_key ck1 = clustering_key::from_single_value(*schema, - get_constant_value(cond._values[1], *ck_cdef)); - clustering_key ck2 = clustering_key::from_single_value(*schema, - get_constant_value(cond._values[2], *ck_cdef)); - ck_bounds.push_back(query::clustering_range::make( - query::clustering_range::bound(ck1), query::clustering_range::bound(ck2))); - continue; - } else if (cond._values.size() == 1) { - // We already verified above, that this case this can only be a - // function call to begins_with(), with the first parameter the - // key, the second the value reference. - bytes raw_value = get_constant_value( - std::get(cond._values[0]._value)._parameters[1], *ck_cdef); - if (!ck_cdef->type->is_compatible_with(*utf8_type)) { - // begins_with() supported on bytes and strings (both stored - // in the database as strings) but not on numbers. - throw api_error::validation( - fmt::format("KeyConditionExpression begins_with() not supported on type {}", - type_to_string(ck_cdef->type))); - } else if (raw_value.empty()) { - ck_bounds.push_back(query::clustering_range::make_open_ended_both_sides()); - } else { - clustering_key ck = clustering_key::from_single_value(*schema, raw_value); - ck_bounds.push_back(get_clustering_range_for_begins_with(std::move(raw_value), ck, schema, ck_cdef->type)); - } - continue; - } - - // All remaining operator have one value reference parameter in index - // !toplevel_ind. Note how toplevel_ind==1 reverses the direction of - // an inequality. - bytes raw_value = get_constant_value(cond._values[!toplevel_ind], *ck_cdef); - clustering_key ck = clustering_key::from_single_value(*schema, raw_value); - if ((cond._op == parsed::primitive_condition::type::LT && toplevel_ind == 0) || - (cond._op == parsed::primitive_condition::type::GT && toplevel_ind == 1)) { - ck_bounds.push_back(query::clustering_range::make_ending_with(query::clustering_range::bound(ck, false))); - } else if ((cond._op == parsed::primitive_condition::type::GT && toplevel_ind == 0) || - (cond._op == parsed::primitive_condition::type::LT && toplevel_ind == 1)) { - ck_bounds.push_back(query::clustering_range::make_starting_with(query::clustering_range::bound(ck, false))); - } else if ((cond._op == parsed::primitive_condition::type::LE && toplevel_ind == 0) || - (cond._op == parsed::primitive_condition::type::GE && toplevel_ind == 1)) { - ck_bounds.push_back(query::clustering_range::make_ending_with(query::clustering_range::bound(ck))); - } else if ((cond._op == parsed::primitive_condition::type::GE && toplevel_ind == 0) || - (cond._op == parsed::primitive_condition::type::LE && toplevel_ind == 1)) { - ck_bounds.push_back(query::clustering_range::make_starting_with(query::clustering_range::bound(ck))); - } - } - - if (partition_ranges.empty()) { - throw api_error::validation( - format("KeyConditionExpression requires a condition on partition key {}", pk_cdef.name_as_text())); - } - if (ck_bounds.empty()) { - ck_bounds.push_back(query::clustering_range::make_open_ended_both_sides()); - } - return {std::move(partition_ranges), std::move(ck_bounds)}; -} - -static future query_vector( - service::storage_proxy& proxy, - vector_search::vector_store_client& vsc, - rjson::value request, - service::client_state& client_state, - tracing::trace_state_ptr trace_state, - service_permit permit, - bool enforce_authorization, - bool warn_authorization, - alternator::stats& stats, - parsed::expression_cache& parsed_expr_cache) { - // If vector search is requested, IndexName must be given and must - // refer to a vector index - not to a GSI or LSI. - const rjson::value* index_name_v = rjson::find(request, "IndexName"); - if (!index_name_v || !index_name_v->IsString()) { - co_return api_error::validation( - "VectorSearch requires IndexName referring to a vector index"); - } - std::string_view index_name = rjson::to_string_view(*index_name_v); - schema_ptr base_schema = get_table(proxy, request); - bool is_vector = std::ranges::any_of(base_schema->indices(), [&](const index_metadata& im) { - const auto& opts = im.options(); - auto it = opts.find(db::index::secondary_index::custom_class_option_name); - return im.name() == index_name && it != opts.end() && it->second == "vector_index"; - }); - if (!is_vector) { - co_return api_error::validation( - format("VectorSearch IndexName '{}' is not a vector index.", index_name)); - } - // QueryVector is required inside VectorSearch. - const rjson::value* vector_search = rjson::find(request, "VectorSearch"); - if (!vector_search || !vector_search->IsObject()) { - co_return api_error::validation( - "VectorSearch requires a VectorSearch parameter"); - } - const rjson::value* query_vector = rjson::find(*vector_search, "QueryVector"); - if (!query_vector || !query_vector->IsObject()) { - co_return api_error::validation( - "VectorSearch requires a QueryVector parameter"); - } - // QueryVector should be is a DynamoDB value, which must be of type "L" - // (a list), containing only elements of type "N" (numbers). The number - // of these elements must be exactly the "dimensions" defined for this - // vector index. We'll now validate all these assumptions and parse - // all the numbers in the vector into an std::vector query_vec - - // the type that ann() wants. - int dimensions = 0; - for (const index_metadata& im : base_schema->indices()) { - if (im.name() == index_name) { - auto dims_it = im.options().find("dimensions"); - if (dims_it != im.options().end()) { - try { - dimensions = std::stoi(dims_it->second); - } catch (...) {} - } - break; - } - } - throwing_assert(dimensions > 0); - const rjson::value* qv_list = rjson::find(*query_vector, "L"); - if (!qv_list || !qv_list->IsArray()) { - co_return api_error::validation( - "VectorSearch QueryVector must be a list of numbers"); - } - const auto& arr = qv_list->GetArray(); - if ((int)arr.Size() != dimensions) { - co_return api_error::validation( - format("VectorSearch QueryVector length {} does not match index Dimensions {}", - arr.Size(), dimensions)); - } - std::vector query_vec; - query_vec.reserve(arr.Size()); - for (const rjson::value& elem : arr) { - if (!elem.IsObject()) { - co_return api_error::validation( - "VectorSearch QueryVector must contain only numbers"); - } - const rjson::value* n_val = rjson::find(elem, "N"); - if (!n_val || !n_val->IsString()) { - co_return api_error::validation( - "VectorSearch QueryVector must contain only numbers"); - } - std::string_view num_str = rjson::to_string_view(*n_val); - float f; - auto [ptr, ec] = std::from_chars(num_str.data(), num_str.data() + num_str.size(), f); - if (ec != std::errc{} || ptr != num_str.data() + num_str.size()) { - co_return api_error::validation( - format("VectorSearch QueryVector element '{}' is not a valid number", num_str)); - } - query_vec.push_back(f); - } - - // Limit is mandatory for vector search: it defines k, the number of - // nearest neighbors to return. - const rjson::value* limit_json = rjson::find(request, "Limit"); - if (!limit_json || !limit_json->IsUint()) { - co_return api_error::validation("VectorSearch requires a positive integer Limit parameter"); - } - uint32_t limit = limit_json->GetUint(); - if (limit == 0) { - co_return api_error::validation("Limit must be greater than 0"); - } - - // Consistent reads are not supported for vector search, just like GSI. - if (get_read_consistency(request) != db::consistency_level::LOCAL_ONE) { - co_return api_error::validation( - "Consistent reads are not allowed on vector indexes"); - } - - // Pagination (ExclusiveStartKey) is not supported for vector search. - if (rjson::find(request, "ExclusiveStartKey")) { - co_return api_error::validation( - "VectorSearch does not support pagination (ExclusiveStartKey)"); - } - - // ScanIndexForward is not supported for vector search: the ordering of - // results is determined by vector distance, not by the sort key. - if (rjson::find(request, "ScanIndexForward")) { - co_return api_error::validation( - "VectorSearch does not support ScanIndexForward"); - } - - std::unordered_set used_attribute_names; - std::unordered_set used_attribute_values; - // Parse the Select parameter and determine which attributes to return. - // For a vector index, the default Select is ALL_ATTRIBUTES (full items). - // ALL_PROJECTED_ATTRIBUTES is significantly more efficent because it - // returns what the vector store returned without looking up additional - // base-table data. Currently only the primary key attributes are projected - // but in the future we'll implement projecting additional attributes into - // the vector index - these additional attributes will also be usable for - // filtering). COUNT returns only the count without items. - select_type select = parse_select(request, table_or_view_type::vector_index); - std::optional attrs_to_get_opt; - if (select == select_type::projection) { - // ALL_PROJECTED_ATTRIBUTES for a vector index: return only key attributes. - alternator::attrs_to_get key_attrs; - for (const column_definition& cdef : base_schema->partition_key_columns()) { - attribute_path_map_add("Select", key_attrs, cdef.name_as_text()); - } - for (const column_definition& cdef : base_schema->clustering_key_columns()) { - attribute_path_map_add("Select", key_attrs, cdef.name_as_text()); - } - attrs_to_get_opt = std::move(key_attrs); - } else { - attrs_to_get_opt = calculate_attrs_to_get(request, parsed_expr_cache, used_attribute_names, select); - } - // QueryFilter (the old-style API) is not supported for vector search Queries. - if (rjson::find(request, "QueryFilter")) { - co_return api_error::validation( - "VectorSearch does not support QueryFilter; use FilterExpression instead"); - } - // FilterExpression: post-filter the vector search results by any attribute. - filter flt(parsed_expr_cache, request, filter::request_type::QUERY, - used_attribute_names, used_attribute_values); - const rjson::value* expression_attribute_names = rjson::find(request, "ExpressionAttributeNames"); - verify_all_are_used(expression_attribute_names, used_attribute_names, "ExpressionAttributeNames", "Query"); - const rjson::value* expression_attribute_values = rjson::find(request, "ExpressionAttributeValues"); - verify_all_are_used(expression_attribute_values, used_attribute_values, "ExpressionAttributeValues", "Query"); - - // Verify the user has SELECT permission on the base table, as we - // do for every type of read operation after validating the input - // parameters. - co_await verify_permission(enforce_authorization, warn_authorization, - client_state, base_schema, auth::permission::SELECT, stats); - - // Query the vector store for the approximate nearest neighbors. - auto timeout = executor::default_timeout(); - abort_on_expiry aoe(timeout); - rjson::value pre_filter = rjson::empty_object(); // TODO, implement - auto pkeys_result = co_await vsc.ann( - base_schema->ks_name(), std::string(index_name), base_schema, - std::move(query_vec), limit, pre_filter, aoe.abort_source()); - if (!pkeys_result.has_value()) { - const sstring error_msg = std::visit(vector_search::error_visitor{}, pkeys_result.error()); - co_return api_error::validation(error_msg); - } - const std::vector& pkeys = pkeys_result.value(); - - // For SELECT=COUNT with no filter: skip fetching from the base table and - // just return the count of candidates returned by the vector store. - // If a filter is present, fall through to the base-table fetch to apply it. - if (select == select_type::count && !flt) { - rjson::value response = rjson::empty_object(); - rjson::add(response, "Count", rjson::value(static_cast(pkeys.size()))); - rjson::add(response, "ScannedCount", rjson::value(static_cast(pkeys.size()))); - co_return rjson::print(std::move(response)); - } - - // For SELECT=ALL_PROJECTED_ATTRIBUTES with no filter: skip fetching from - // the base table and build items directly from the key columns returned by - // the vector store. If a filter is present, fall through to the base-table - // fetch to apply it. - if (select == select_type::projection && !flt) { - rjson::value items_json = rjson::empty_array(); - for (const auto& pkey : pkeys) { - rjson::value item = rjson::empty_object(); - std::vector exploded_pk = pkey.partition.key().explode(); - auto exploded_pk_it = exploded_pk.begin(); - for (const column_definition& cdef : base_schema->partition_key_columns()) { - rjson::value key_val = rjson::empty_object(); - rjson::add_with_string_name(key_val, type_to_string(cdef.type), json_key_column_value(*exploded_pk_it, cdef)); - rjson::add_with_string_name(item, std::string_view(cdef.name_as_text()), std::move(key_val)); - ++exploded_pk_it; - } - if (base_schema->clustering_key_size() > 0) { - std::vector exploded_ck = pkey.clustering.explode(); - auto exploded_ck_it = exploded_ck.begin(); - for (const column_definition& cdef : base_schema->clustering_key_columns()) { - rjson::value key_val = rjson::empty_object(); - rjson::add_with_string_name(key_val, type_to_string(cdef.type), json_key_column_value(*exploded_ck_it, cdef)); - rjson::add_with_string_name(item, std::string_view(cdef.name_as_text()), std::move(key_val)); - ++exploded_ck_it; - } - } - rjson::push_back(items_json, std::move(item)); - } - rjson::value response = rjson::empty_object(); - rjson::add(response, "Count", rjson::value(static_cast(items_json.Size()))); - rjson::add(response, "ScannedCount", rjson::value(static_cast(pkeys.size()))); - rjson::add(response, "Items", std::move(items_json)); - co_return rjson::print(std::move(response)); - } - - // TODO: For SELECT=SPECIFIC_ATTRIBUTES, if they are part of the projected - // attributes, we should use the above optimized code path - not fall through - // to the read from the base table as below as we need to do if the specific - // attributes contain non-projected columns. - - // Fetch the matching items from the base table and build the response. - // When a filter is present, we always fetch the full item so that all - // attributes are available for filter evaluation, regardless of the - // projection required for the final response. - auto selection = cql3::selection::selection::wildcard(base_schema); - auto regular_columns = base_schema->regular_columns() - | std::views::transform(&column_definition::id) - | std::ranges::to(); - auto attrs_to_get = ::make_shared>( - flt ? std::nullopt : std::move(attrs_to_get_opt)); - - rjson::value items_json = rjson::empty_array(); - int matched_count = 0; - - if (base_schema->clustering_key_size() == 0) { - // Hash-only table: query each partition individually, in the order - // returned by the vector store, to preserve vector-distance ordering - // in the response. A multi-partition batch read would return items in - // token order instead, which would be wrong. - // FIXME: do this more efficiently with a batched read that preserves - // ordering. - for (const auto& pkey : pkeys) { - std::vector bounds{ - query::clustering_range::make_open_ended_both_sides()}; - auto partition_slice = query::partition_slice(std::move(bounds), {}, - regular_columns, selection->get_query_options()); - auto command = ::make_lw_shared( - base_schema->id(), base_schema->version(), partition_slice, - proxy.get_max_result_size(partition_slice), - query::tombstone_limit(proxy.get_tombstone_limit())); - service::storage_proxy::coordinator_query_result qr = - co_await proxy.query(base_schema, command, - {dht::partition_range(pkey.partition)}, - db::consistency_level::LOCAL_ONE, - service::storage_proxy::coordinator_query_options( - timeout, permit, client_state, trace_state)); - auto opt_item = executor::describe_single_item(base_schema, partition_slice, - *selection, *qr.query_result, *attrs_to_get); - if (opt_item && (!flt || flt.check(*opt_item))) { - ++matched_count; - if (select != select_type::count) { - if (select == select_type::projection) { - // A filter caused us to fall through here instead of - // taking the projection early-exit above. Reconstruct - // the key-only item from the full item we fetched. - rjson::value key_item = rjson::empty_object(); - for (const column_definition& cdef : base_schema->partition_key_columns()) { - if (const rjson::value* v = rjson::find(*opt_item, cdef.name_as_text())) { - rjson::add_with_string_name(key_item, cdef.name_as_text(), rjson::copy(*v)); - } - } - rjson::push_back(items_json, std::move(key_item)); - } else { - // When a filter caused us to fetch the full item, apply the - // requested projection (attrs_to_get_opt) before returning it. - // This mirrors describe_items_visitor::end_row() which removes - // extra filter attributes from the returned item. - if (flt && attrs_to_get_opt) { - for (const auto& [attr_name, subpath] : *attrs_to_get_opt) { - if (!subpath.has_value()) { - if (rjson::value* toplevel = rjson::find(*opt_item, attr_name)) { - if (!hierarchy_filter(*toplevel, subpath)) { - rjson::remove_member(*opt_item, attr_name); - } - } - } - } - std::vector to_remove; - for (auto it = opt_item->MemberBegin(); it != opt_item->MemberEnd(); ++it) { - std::string key(it->name.GetString(), it->name.GetStringLength()); - if (!attrs_to_get_opt->contains(key)) { - to_remove.push_back(std::move(key)); - } - } - for (const auto& key : to_remove) { - rjson::remove_member(*opt_item, key); - } - } - rjson::push_back(items_json, std::move(*opt_item)); - } - } - } - } - } else { - // Hash+range table: query each (partition, clustering) pair individually. - // FIXME: do this more efficiently!!! - for (const auto& pkey : pkeys) { - std::vector bounds{ - query::clustering_range::make_singular(pkey.clustering)}; - auto partition_slice = query::partition_slice(std::move(bounds), {}, - regular_columns, selection->get_query_options()); - auto command = ::make_lw_shared( - base_schema->id(), base_schema->version(), partition_slice, - proxy.get_max_result_size(partition_slice), - query::tombstone_limit(proxy.get_tombstone_limit())); - service::storage_proxy::coordinator_query_result qr = - co_await proxy.query(base_schema, command, - {dht::partition_range(pkey.partition)}, - db::consistency_level::LOCAL_ONE, - service::storage_proxy::coordinator_query_options( - timeout, permit, client_state, trace_state)); - auto opt_item = executor::describe_single_item(base_schema, partition_slice, - *selection, *qr.query_result, *attrs_to_get); - if (opt_item && (!flt || flt.check(*opt_item))) { - ++matched_count; - if (select != select_type::count) { - if (select == select_type::projection) { - // A filter caused us to fall through here; project to keys. - rjson::value key_item = rjson::empty_object(); - for (const column_definition& cdef : base_schema->partition_key_columns()) { - if (const rjson::value* v = rjson::find(*opt_item, cdef.name_as_text())) { - rjson::add_with_string_name(key_item, cdef.name_as_text(), rjson::copy(*v)); - } - } - for (const column_definition& cdef : base_schema->clustering_key_columns()) { - if (const rjson::value* v = rjson::find(*opt_item, cdef.name_as_text())) { - rjson::add_with_string_name(key_item, cdef.name_as_text(), rjson::copy(*v)); - } - } - rjson::push_back(items_json, std::move(key_item)); - } else { - // When a filter caused us to fetch the full item, apply the - // requested projection (attrs_to_get_opt) before returning it. - // This mirrors describe_items_visitor::end_row() which removes - // extra filter attributes from the returned item. - if (flt && attrs_to_get_opt) { - for (const auto& [attr_name, subpath] : *attrs_to_get_opt) { - if (!subpath.has_value()) { - if (rjson::value* toplevel = rjson::find(*opt_item, attr_name)) { - if (!hierarchy_filter(*toplevel, subpath)) { - rjson::remove_member(*opt_item, attr_name); - } - } - } - } - std::vector to_remove; - for (auto it = opt_item->MemberBegin(); it != opt_item->MemberEnd(); ++it) { - std::string key(it->name.GetString(), it->name.GetStringLength()); - if (!attrs_to_get_opt->contains(key)) { - to_remove.push_back(std::move(key)); - } - } - for (const auto& key : to_remove) { - rjson::remove_member(*opt_item, key); - } - } - rjson::push_back(items_json, std::move(*opt_item)); - } - } - } - } - } - - rjson::value response = rjson::empty_object(); - if (select == select_type::count) { - rjson::add(response, "Count", rjson::value(matched_count)); - } else { - rjson::add(response, "Count", rjson::value(static_cast(items_json.Size()))); - rjson::add(response, "Items", std::move(items_json)); - } - rjson::add(response, "ScannedCount", rjson::value(static_cast(pkeys.size()))); - co_return rjson::print(std::move(response)); -} - -future executor::query(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request, std::unique_ptr& audit_info) { - _stats.api_operations.query++; - elogger.trace("Querying {}", request); - - if (rjson::find(request, "VectorSearch")) { - // If vector search is requested, we have a separate code path. - // IndexName must be given and must refer to a vector index - not - // to a GSI or LSI as the code below assumes. - return query_vector(_proxy, _vsc, std::move(request), client_state, trace_state, std::move(permit), - _enforce_authorization, _warn_authorization, _stats, *_parsed_expression_cache); - } - - auto [schema, table_type] = get_table_or_view(_proxy, request); - db::consistency_level cl = get_read_consistency(request); - - maybe_audit(audit_info, audit::statement_category::QUERY, schema->ks_name(), schema->cf_name(), "Query", request, cl); - - get_stats_from_schema(_proxy, *schema)->api_operations.query++; - tracing::add_alternator_table_name(trace_state, schema->cf_name()); - - rjson::value* exclusive_start_key = rjson::find(request, "ExclusiveStartKey"); - if (table_type == table_or_view_type::gsi && cl != db::consistency_level::LOCAL_ONE) { - return make_ready_future(api_error::validation( - "Consistent reads are not allowed on global indexes (GSI)")); - } - rjson::value* limit_json = rjson::find(request, "Limit"); - uint32_t limit = limit_json ? limit_json->GetUint64() : std::numeric_limits::max(); - if (limit <= 0) { - return make_ready_future(api_error::validation("Limit must be greater than 0")); - } - - const bool forward = get_bool_attribute(request, "ScanIndexForward", true); - - rjson::value* key_conditions = rjson::find(request, "KeyConditions"); - rjson::value* key_condition_expression = rjson::find(request, "KeyConditionExpression"); - std::unordered_set used_attribute_values; - std::unordered_set used_attribute_names; - if (key_conditions && key_condition_expression) { - throw api_error::validation("Query does not allow both " - "KeyConditions and KeyConditionExpression to be given together"); - } else if (!key_conditions && !key_condition_expression) { - throw api_error::validation("Query must have one of " - "KeyConditions or KeyConditionExpression"); - } - - const rjson::value* expression_attribute_names = rjson::find(request, "ExpressionAttributeNames"); - const rjson::value* expression_attribute_values = rjson::find(request, "ExpressionAttributeValues"); - - // exactly one of key_conditions or key_condition_expression - auto [partition_ranges, ck_bounds] = key_conditions - ? calculate_bounds_conditions(schema, *key_conditions) - : calculate_bounds_condition_expression(schema, *key_condition_expression, - expression_attribute_values, - used_attribute_values, - expression_attribute_names, - used_attribute_names, *_parsed_expression_cache); - - filter filter(*_parsed_expression_cache, request, filter::request_type::QUERY, - used_attribute_names, used_attribute_values); - - // A query is not allowed to filter on the partition key or the sort key. - for (const column_definition& cdef : schema->partition_key_columns()) { // just one - if (filter.filters_on(cdef.name_as_text())) { - return make_ready_future(api_error::validation( - format("QueryFilter can only contain non-primary key attributes: Partition key attribute: {}", cdef.name_as_text()))); - } - } - for (const column_definition& cdef : schema->clustering_key_columns()) { - if (filter.filters_on(cdef.name_as_text())) { - return make_ready_future(api_error::validation( - format("QueryFilter can only contain non-primary key attributes: Sort key attribute: {}", cdef.name_as_text()))); - } - // FIXME: this "break" can avoid listing some clustering key columns - // we added for GSIs just because they existed in the base table - - // but not in all cases. We still have issue #5320. - break; - } - - select_type select = parse_select(request, table_type); - - auto attrs_to_get = calculate_attrs_to_get(request, *_parsed_expression_cache, used_attribute_names, select); - verify_all_are_used(expression_attribute_names, used_attribute_names, "ExpressionAttributeNames", "Query"); - verify_all_are_used(expression_attribute_values, used_attribute_values, "ExpressionAttributeValues", "Query"); - query::partition_slice::option_set opts; - opts.set_if(!forward); - return do_query(_proxy, schema, exclusive_start_key, std::move(partition_ranges), std::move(ck_bounds), std::move(attrs_to_get), limit, cl, - std::move(filter), opts, client_state, _stats, std::move(trace_state), std::move(permit), _enforce_authorization, _warn_authorization); -} future executor::list_tables(client_state& client_state, service_permit permit, rjson::value request, std::unique_ptr& audit_info) { _stats.api_operations.list_tables++; diff --git a/alternator/executor.hh b/alternator/executor.hh index ad880418e7..524354d1f8 100644 --- a/alternator/executor.hh +++ b/alternator/executor.hh @@ -67,6 +67,8 @@ class gossiper; class schema_builder; +#include "alternator/attribute_path.hh" + namespace alternator { enum class table_status; @@ -78,71 +80,6 @@ bool is_alternator_keyspace(const sstring& ks_name); // Wraps the db::get_tags_of_table and throws if the table is missing the tags extension. const std::map& get_tags_of_table_or_throw(schema_ptr schema); -// An attribute_path_map object is used to hold data for various attributes -// paths (parsed::path) in a hierarchy of attribute paths. Each attribute path -// has a root attribute, and then modified by member and index operators - -// for example in "a.b[2].c" we have "a" as the root, then ".b" member, then -// "[2]" index, and finally ".c" member. -// Data can be added to an attribute_path_map using the add() function, but -// requires that attributes with data not be *overlapping* or *conflicting*: -// -// 1. Two attribute paths which are identical or an ancestor of one another -// are considered *overlapping* and not allowed. If a.b.c has data, -// we can't add more data in a.b.c or any of its descendants like a.b.c.d. -// -// 2. Two attribute paths which need the same parent to have both a member and -// an index are considered *conflicting* and not allowed. E.g., if a.b has -// data, you can't add a[1]. The meaning of adding both would be that the -// attribute a is both a map and an array, which isn't sensible. -// -// These two requirements are common to the two places where Alternator uses -// this abstraction to describe how a hierarchical item is to be transformed: -// -// 1. In ProjectExpression: for filtering from a full top-level attribute -// only the parts for which user asked in ProjectionExpression. -// -// 2. In UpdateExpression: for taking the previous value of a top-level -// attribute, and modifying it based on the instructions in the user -// wrote in UpdateExpression. - -template -class attribute_path_map_node { -public: - using data_t = T; - // We need the extra unique_ptr<> here because libstdc++ unordered_map - // doesn't work with incomplete types :-( - using members_t = std::unordered_map>>; - // The indexes list is sorted because DynamoDB requires handling writes - // beyond the end of a list in index order. - using indexes_t = std::map>>; - // The prohibition on "overlap" and "conflict" explained above means - // That only one of data, members or indexes is non-empty. - std::optional> _content; - - bool is_empty() const { return !_content; } - bool has_value() const { return _content && std::holds_alternative(*_content); } - bool has_members() const { return _content && std::holds_alternative(*_content); } - bool has_indexes() const { return _content && std::holds_alternative(*_content); } - // get_members() assumes that has_members() is true - members_t& get_members() { return std::get(*_content); } - const members_t& get_members() const { return std::get(*_content); } - indexes_t& get_indexes() { return std::get(*_content); } - const indexes_t& get_indexes() const { return std::get(*_content); } - T& get_value() { return std::get(*_content); } - const T& get_value() const { return std::get(*_content); } -}; - -template -using attribute_path_map = std::unordered_map>; - -using attrs_to_get_node = attribute_path_map_node; -// attrs_to_get lists which top-level attribute are needed, and possibly also -// which part of the top-level attribute is really needed (when nested -// attribute paths appeared in the query). -// Most code actually uses optional. There, a disengaged -// optional means we should get all attributes, not specific ones. -using attrs_to_get = attribute_path_map; - namespace parsed { class expression_cache; } @@ -351,4 +288,24 @@ arn_parts parse_arn(std::string_view arn, std::string_view arn_field_name, std:: // The format is ks1|ks2|ks3... and table1|table2|table3... sstring print_names_for_audit(const std::set& names); +map_type attrs_type(); +lw_shared_ptr get_stats_from_schema(service::storage_proxy& sp, const schema& schema); +std::string view_name(std::string_view table_name, std::string_view index_name, + const std::string& delim = ":", bool validate_len = true); +std::string gsi_name(std::string_view table_name, std::string_view index_name, + bool validate_len = true); +std::string lsi_name(std::string_view table_name, std::string_view index_name, + bool validate_len = true); +std::string get_table_name(const rjson::value& request); +schema_ptr try_get_internal_table(data_dictionary::database db, std::string_view table_name); +std::optional get_int_attribute(const rjson::value& value, std::string_view attribute_name); +bool get_bool_attribute(const rjson::value& value, std::string_view attribute_name, bool default_return); +void check_key(const rjson::value& key, const schema_ptr& schema); +schema_ptr get_table_from_batch_request(const service::storage_proxy& proxy, const rjson::value::ConstMemberIterator& batch_request); +void verify_all_are_used( + const rjson::value* field, + const std::unordered_set& used, + const char* field_name, + const char* operation); + } diff --git a/alternator/executor_read.cc b/alternator/executor_read.cc new file mode 100644 index 0000000000..05bc63a2b7 --- /dev/null +++ b/alternator/executor_read.cc @@ -0,0 +1,1997 @@ +/* + * Copyright 2019-present ScyllaDB + */ + +/* + * SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.1 + */ + +// This file implements the Alternator read operations: GetItem, BatchGetItem, +// Query (including vector search) and Scan. +// Public entry points: +// * executor::get_item() +// * executor::batch_get_item() +// * executor::scan() +// * executor::query() +// Major internal functions: +// * do_query(): the common code for Query and Scan, except vector search. +// * query_vector(): the vector-search code path for Query with VectorSearch. +// and a number of helper functions for parsing common parameters of read +// requests such as TableName, IndexName, Select, FilterExpression, +// ConsistentRead, ProjectionExpression, and more. + +#include "alternator/executor.hh" +#include "alternator/conditions.hh" +#include "alternator/expressions.hh" +#include "alternator/consumed_capacity.hh" +#include "alternator/serialization.hh" +#include "alternator/attribute_path.hh" +#include "auth/permission.hh" +#include "cql3/selection/selection.hh" +#include "cql3/result_set.hh" +#include "query/query-request.hh" +#include "schema/schema.hh" +#include "service/client_state.hh" +#include "service/pager/query_pagers.hh" +#include "service/storage_proxy.hh" +#include "index/secondary_index.hh" +#include "utils/assert.hh" +#include "utils/overloaded_functor.hh" +#include "utils/error_injection.hh" +#include "vector_search/vector_store_client.hh" +#include +#include +#include +#include +#include + +using namespace std::chrono_literals; + +namespace alternator { + +extern logging::logger elogger; // from executor.cc + +// make_streamed_with_extra_array() is variant of make_streamed() above, which +// builds a streaming response (a function writing to an output stream) from a +// JSON object (rjson::value) but adds to it at the end an additional array. +// The extra array is given a separate chunked_vector to avoid putting it +// inside the rjson::value - because RapidJSON does contiguous allocations for +// arrays which we want to avoid for potentially long arrays in Query/Scan +// responses (see #23535). +// If we ever fix RapidJSON to avoid contiguous allocations for arrays, or +// replace it entirely (#24458), we can remove this function and the function +// rjson::print_with_extra_array() which it calls. +static executor::body_writer make_streamed_with_extra_array(rjson::value&& value, + std::string array_name, utils::chunked_vector&& array) { + return [value = std::move(value), array_name = std::move(array_name), array = std::move(array)](output_stream&& _out) mutable -> future<> { + auto out = std::move(_out); + std::exception_ptr ex; + try { + co_await rjson::print_with_extra_array(value, array_name, array, out); + } catch (...) { + ex = std::current_exception(); + } + co_await out.close(); + co_await rjson::destroy_gently(std::move(value)); + // TODO: can/should we also destroy the array gently? + if (ex) { + co_await coroutine::return_exception_ptr(std::move(ex)); + } + }; +} + +// select_type represents how the Select parameter of Query/Scan selects what +// to return. It is also used by calculate_attrs_to_get() to know whether to +// return no attributes (count), or specific attributes. +enum class select_type { regular, count, projection }; + +// Check according to the request's "ConsistentRead" field, which consistency +// level we need to use for the read. The field can be True for strongly +// consistent reads, or False for eventually consistent reads, or if this +// field is absence, we default to eventually consistent reads. +// In Scylla, eventually-consistent reads are implemented as consistency +// level LOCAL_ONE, and strongly-consistent reads as LOCAL_QUORUM. +static db::consistency_level get_read_consistency(const rjson::value& request) { + const rjson::value* consistent_read_value = rjson::find(request, "ConsistentRead"); + bool consistent_read = false; + if (consistent_read_value && !consistent_read_value->IsNull()) { + if (consistent_read_value->IsBool()) { + consistent_read = consistent_read_value->GetBool(); + } else { + throw api_error::validation("ConsistentRead flag must be a boolean"); + } + } + return consistent_read ? db::consistency_level::LOCAL_QUORUM : db::consistency_level::LOCAL_ONE; +} + +// attrs_to_get saves for each top-level attribute an attrs_to_get_node, +// a hierarchy of subparts that need to be kept. The following function +// calculate_attrs_to_get() takes either AttributesToGet or +// ProjectionExpression parameters (having both is *not* allowed), +// and returns the list of cells we need to read, or a disengaged optional +// when *all* attributes are to be returned. +// However, in our current implementation, only top-level attributes are +// stored as separate cells - a nested document is stored serialized together +// (as JSON) in the same cell. So this function return a map - each key is the +// top-level attribute we will need need to read, and the value for each +// top-level attribute is the partial hierarchy (struct hierarchy_filter) +// that we will need to extract from that serialized JSON. +// For example, if ProjectionExpression lists a.b and a.c[2], we +// return one top-level attribute name, "a", with the value "{b, c[2]}". +static std::optional calculate_attrs_to_get(const rjson::value& req, parsed::expression_cache& parsed_expression_cache, std::unordered_set& used_attribute_names, select_type select = select_type::regular) { + if (select == select_type::count) { + // An empty map asks to retrieve no attributes. Note that this is + // different from a disengaged optional which means retrieve all. + return attrs_to_get(); + } + // FIXME: also need to handle select_type::projection + const bool has_attributes_to_get = req.HasMember("AttributesToGet"); + const bool has_projection_expression = req.HasMember("ProjectionExpression"); + if (has_attributes_to_get && has_projection_expression) { + throw api_error::validation( + format("GetItem does not allow both ProjectionExpression and AttributesToGet to be given together")); + } + if (has_attributes_to_get) { + const rjson::value& attributes_to_get = req["AttributesToGet"]; + attrs_to_get ret; + for (auto it = attributes_to_get.Begin(); it != attributes_to_get.End(); ++it) { + attribute_path_map_add("AttributesToGet", ret, rjson::to_string(*it)); + validate_attr_name_length("AttributesToGet", it->GetStringLength(), false); + } + if (ret.empty()) { + throw api_error::validation("Empty AttributesToGet is not allowed. Consider using Select=COUNT instead."); + } + return ret; + } else if (has_projection_expression) { + const rjson::value& projection_expression = req["ProjectionExpression"]; + const rjson::value* expression_attribute_names = rjson::find(req, "ExpressionAttributeNames"); + std::vector paths_to_get; + try { + paths_to_get = parsed_expression_cache.parse_projection_expression(rjson::to_string_view(projection_expression)); + } catch(expressions_syntax_error& e) { + throw api_error::validation(e.what()); + } + resolve_projection_expression(paths_to_get, expression_attribute_names, used_attribute_names); + attrs_to_get ret; + for (const parsed::path& p : paths_to_get) { + attribute_path_map_add("ProjectionExpression", ret, p); + } + return ret; + } + // A disengaged optional asks to read everything + return std::nullopt; +} + +// get_table_or_view() is similar to to get_table(), except it returns either +// a table or a materialized view from which to read, based on the TableName +// and optional IndexName in the request. Only requests like Query and Scan +// which allow IndexName should use this function. +enum class table_or_view_type { base, lsi, gsi, vector_index }; +static std::pair +get_table_or_view(service::storage_proxy& proxy, const rjson::value& request) { + table_or_view_type type = table_or_view_type::base; + std::string table_name = get_table_name(request); + + if (schema_ptr s = try_get_internal_table(proxy.data_dictionary(), table_name)) { + return {s, type}; + } + + std::string keyspace_name = executor::KEYSPACE_NAME_PREFIX + table_name; + const rjson::value* index_name = rjson::find(request, "IndexName"); + std::string orig_table_name; + if (index_name) { + if (index_name->IsString()) { + orig_table_name = std::move(table_name); + table_name = view_name(orig_table_name, rjson::to_string_view(*index_name)); + type = table_or_view_type::gsi; + } else { + throw api_error::validation( + fmt::format("Non-string IndexName '{}'", rjson::to_string_view(*index_name))); + } + // If no tables for global indexes were found, the index may be local + if (!proxy.data_dictionary().has_schema(keyspace_name, table_name)) { + type = table_or_view_type::lsi; + table_name = lsi_name(orig_table_name, rjson::to_string_view(*index_name)); + } + } + + try { + return { proxy.data_dictionary().find_schema(keyspace_name, table_name), type }; + } catch(data_dictionary::no_such_column_family&) { + if (index_name) { + // DynamoDB returns a different error depending on whether the + // base table doesn't exist (ResourceNotFoundException) or it + // does exist but the index does not (ValidationException). + if (proxy.data_dictionary().has_schema(keyspace_name, orig_table_name)) { + throw api_error::validation( + fmt::format("Requested resource not found: Index '{}' for table '{}'", rjson::to_string_view(*index_name), orig_table_name)); + } else { + throw api_error::resource_not_found( + fmt::format("Requested resource not found: Table: {} not found", orig_table_name)); + } + } else { + throw api_error::resource_not_found( + fmt::format("Requested resource not found: Table: {} not found", table_name)); + } + } +} + + +// Parse the "Select" parameter of a Scan or Query operation, throwing a +// ValidationException in various forbidden combinations of options and +// finally returning one of three options: +// 1. regular - the default scan behavior of returning all or specific +// attributes ("ALL_ATTRIBUTES" or "SPECIFIC_ATTRIBUTES"). +// 2. count - just count the items ("COUNT") +// 3. projection - return projected attributes ("ALL_PROJECTED_ATTRIBUTES") +// An ValidationException is thrown when recognizing an invalid combination +// of options - such as ALL_PROJECTED_ATTRIBUTES for a base table, or +// SPECIFIC_ATTRIBUTES without ProjectionExpression or AttributesToGet. +static select_type parse_select(const rjson::value& request, table_or_view_type table_type) { + const rjson::value* select_value = rjson::find(request, "Select"); + if (!select_value) { + // If "Select" is not specified, it defaults to ALL_ATTRIBUTES + // on a base table or vector index, or ALL_PROJECTED_ATTRIBUTES on GSI/LSI. + return (table_type == table_or_view_type::base || table_type == table_or_view_type::vector_index) ? + select_type::regular : select_type::projection; + } + if (!select_value->IsString()) { + throw api_error::validation("Select parameter must be a string"); + } + std::string_view select = rjson::to_string_view(*select_value); + const bool has_attributes_to_get = request.HasMember("AttributesToGet"); + const bool has_projection_expression = request.HasMember("ProjectionExpression"); + if (select == "SPECIFIC_ATTRIBUTES") { + if (has_projection_expression || has_attributes_to_get) { + return select_type::regular; + } + throw api_error::validation("Select=SPECIFIC_ATTRIBUTES requires AttributesToGet or ProjectionExpression"); + } + if (has_projection_expression || has_attributes_to_get) { + throw api_error::validation("AttributesToGet or ProjectionExpression require Select to be either SPECIFIC_ATTRIBUTES or missing"); + } + if (select == "COUNT") { + return select_type::count; + } + if (select == "ALL_ATTRIBUTES") { + // FIXME: when we support projections (#5036), if this is a GSI and + // not all attributes are projected to it, we should throw. + return select_type::regular; + } + if (select == "ALL_PROJECTED_ATTRIBUTES") { + if (table_type == table_or_view_type::base) { + throw api_error::validation("ALL_PROJECTED_ATTRIBUTES only allowed for indexes"); + } + return select_type::projection; + } + throw api_error::validation(fmt::format("Unknown Select value '{}'. Allowed choices: ALL_ATTRIBUTES, SPECIFIC_ATTRIBUTES, ALL_PROJECTED_ATTRIBUTES, COUNT", + select)); +} + +// "filter" represents a condition that can be applied to individual items +// read by a Query or Scan operation, to decide whether to keep the item. +// A filter is constructed from a Query or Scan request. This uses the +// relevant fields in the query (FilterExpression or QueryFilter/ScanFilter + +// ConditionalOperator). These fields are pre-checked and pre-parsed as much +// as possible, to ensure that later checking of many items is efficient. +class filter { +private: + // Holding QueryFilter/ScanFilter + ConditionalOperator: + struct conditions_filter { + bool require_all; + rjson::value conditions; + }; + // Holding a parsed FilterExpression: + struct expression_filter { + parsed::condition_expression expression; + }; + std::optional> _imp; +public: + // Filtering for Scan and Query are very similar, but there are some + // small differences, especially the names of the request attributes. + enum class request_type { SCAN, QUERY }; + // Note that a filter does not store pointers to the query used to + // construct it. + filter(parsed::expression_cache& parsed_expression_cache, const rjson::value& request, request_type rt, + std::unordered_set& used_attribute_names, + std::unordered_set& used_attribute_values); + bool check(const rjson::value& item) const; + bool filters_on(std::string_view attribute) const; + // for_filters_on() runs the given function on the attributes that the + // filter works on. It may run for the same attribute more than once if + // used more than once in the filter. + void for_filters_on(const noncopyable_function& func) const; + operator bool() const { return bool(_imp); } +}; + +filter::filter(parsed::expression_cache& parsed_expression_cache, const rjson::value& request, request_type rt, + std::unordered_set& used_attribute_names, + std::unordered_set& used_attribute_values) { + const rjson::value* expression = rjson::find(request, "FilterExpression"); + const char* conditions_attribute = (rt == request_type::SCAN) ? "ScanFilter" : "QueryFilter"; + const rjson::value* conditions = rjson::find(request, conditions_attribute); + auto conditional_operator = get_conditional_operator(request); + if (conditional_operator != conditional_operator_type::MISSING && + (!conditions || (conditions->IsObject() && conditions->GetObject().ObjectEmpty()))) { + throw api_error::validation( + format("'ConditionalOperator' parameter cannot be specified for missing or empty {}", + conditions_attribute)); + } + if (expression && conditions) { + throw api_error::validation( + format("FilterExpression and {} are not allowed together", conditions_attribute)); + } + if (expression) { + if (!expression->IsString()) { + throw api_error::validation("FilterExpression must be a string"); + } + if (expression->GetStringLength() == 0) { + throw api_error::validation("FilterExpression must not be empty"); + } + if (rjson::find(request, "AttributesToGet")) { + throw api_error::validation("Cannot use both old-style and new-style parameters in same request: FilterExpression and AttributesToGet"); + } + try { + auto parsed = parsed_expression_cache.parse_condition_expression(rjson::to_string_view(*expression), "FilterExpression"); + const rjson::value* expression_attribute_names = rjson::find(request, "ExpressionAttributeNames"); + const rjson::value* expression_attribute_values = rjson::find(request, "ExpressionAttributeValues"); + resolve_condition_expression(parsed, + expression_attribute_names, expression_attribute_values, + used_attribute_names, used_attribute_values); + _imp = expression_filter { std::move(parsed) }; + } catch(expressions_syntax_error& e) { + throw api_error::validation(e.what()); + } + } + if (conditions) { + if (rjson::find(request, "ProjectionExpression")) { + throw api_error::validation(format("Cannot use both old-style and new-style parameters in same request: {} and ProjectionExpression", conditions_attribute)); + } + bool require_all = conditional_operator != conditional_operator_type::OR; + _imp = conditions_filter { require_all, rjson::copy(*conditions) }; + } +} + +bool filter::check(const rjson::value& item) const { + if (!_imp) { + return true; + } + return std::visit(overloaded_functor { + [&] (const conditions_filter& f) -> bool { + return verify_condition(f.conditions, f.require_all, &item); + }, + [&] (const expression_filter& f) -> bool { + return verify_condition_expression(f.expression, &item); + } + }, *_imp); +} + +bool filter::filters_on(std::string_view attribute) const { + if (!_imp) { + return false; + } + return std::visit(overloaded_functor { + [&] (const conditions_filter& f) -> bool { + for (auto it = f.conditions.MemberBegin(); it != f.conditions.MemberEnd(); ++it) { + if (rjson::to_string_view(it->name) == attribute) { + return true; + } + } + return false; + }, + [&] (const expression_filter& f) -> bool { + return condition_expression_on(f.expression, attribute); + } + }, *_imp); +} + +void filter::for_filters_on(const noncopyable_function& func) const { + if (_imp) { + std::visit(overloaded_functor { + [&] (const conditions_filter& f) -> void { + for (auto it = f.conditions.MemberBegin(); it != f.conditions.MemberEnd(); ++it) { + func(rjson::to_string_view(it->name)); + } + }, + [&] (const expression_filter& f) -> void { + return for_condition_expression_on(f.expression, func); + } + }, *_imp); + } +} + +class describe_items_visitor { + typedef std::vector columns_t; + const columns_t& _columns; + const std::optional& _attrs_to_get; + std::unordered_set _extra_filter_attrs; + const filter& _filter; + typename columns_t::const_iterator _column_it; + rjson::value _item; + // _items is a chunked_vector instead of a RapidJson array + // (rjson::value) because unfortunately RapidJson arrays are stored + // contiguously in memory, and cause large allocations when a Query/Scan + // returns a long list of short items (issue #23535). + utils::chunked_vector _items; + size_t _scanned_count; + +public: + describe_items_visitor(const columns_t& columns, const std::optional& attrs_to_get, filter& filter) + : _columns(columns) + , _attrs_to_get(attrs_to_get) + , _filter(filter) + , _column_it(columns.begin()) + , _item(rjson::empty_object()) + , _scanned_count(0) + { + // _filter.check() may need additional attributes not listed in + // _attrs_to_get (i.e., not requested as part of the output). + // We list those in _extra_filter_attrs. We will include them in + // the JSON but take them out before finally returning the JSON. + if (_attrs_to_get) { + _filter.for_filters_on([&] (std::string_view attr) { + std::string a(attr); // no heterogeneous maps searches :-( + if (!_attrs_to_get->contains(a)) { + _extra_filter_attrs.emplace(std::move(a)); + } + }); + } + } + + void start_row() { + _column_it = _columns.begin(); + } + + void accept_value(managed_bytes_view_opt result_bytes_view) { + if (!result_bytes_view) { + ++_column_it; + return; + } + result_bytes_view->with_linearized([this] (bytes_view bv) { + std::string column_name = (*_column_it)->name_as_text(); + if (column_name != executor::ATTRS_COLUMN_NAME) { + if (!_attrs_to_get || _attrs_to_get->contains(column_name) || _extra_filter_attrs.contains(column_name)) { + if (!_item.HasMember(column_name.c_str())) { + rjson::add_with_string_name(_item, column_name, rjson::empty_object()); + } + rjson::value& field = _item[column_name.c_str()]; + rjson::add_with_string_name(field, type_to_string((*_column_it)->type), json_key_column_value(bv, **_column_it)); + } + } else { + auto deserialized = attrs_type()->deserialize(bv); + auto keys_and_values = value_cast(deserialized); + for (auto entry : keys_and_values) { + std::string attr_name = value_cast(entry.first); + if (!_attrs_to_get || _attrs_to_get->contains(attr_name) || _extra_filter_attrs.contains(attr_name)) { + bytes value = value_cast(entry.second); + // Even if _attrs_to_get asked to keep only a part of a + // top-level attribute, we keep the entire attribute + // at this stage, because the item filter might still + // need the other parts (it was easier for us to keep + // extra_filter_attrs at top-level granularity). We'll + // filter the unneeded parts after item filtering. + rjson::add_with_string_name(_item, attr_name, deserialize_item(value)); + } + } + } + }); + ++_column_it; + } + + void end_row() { + if (_filter.check(_item)) { + // As noted above, we kept entire top-level attributes listed in + // _attrs_to_get. We may need to only keep parts of them. + if (_attrs_to_get) { + for (const auto& attr: *_attrs_to_get) { + // If !attr.has_value() it means we were asked not to keep + // attr entirely, but just parts of it. + if (!attr.second.has_value()) { + rjson::value* toplevel= rjson::find(_item, attr.first); + if (toplevel && !hierarchy_filter(*toplevel, attr.second)) { + rjson::remove_member(_item, attr.first); + } + } + } + } + // Remove the extra attributes _extra_filter_attrs which we had + // to add just for the filter, and not requested to be returned: + for (const auto& attr : _extra_filter_attrs) { + rjson::remove_member(_item, attr); + } + + _items.push_back(std::move(_item)); + } + _item = rjson::empty_object(); + ++_scanned_count; + } + + utils::chunked_vector get_items() && { + return std::move(_items); + } + + size_t get_scanned_count() { + return _scanned_count; + } +}; + +// describe_items() returns a JSON object that includes members "Count" +// and "ScannedCount", but *not* "Items" - that is returned separately +// as a chunked_vector to avoid large contiguous allocations which +// RapidJSON does of its array. The caller should add "Items" to the +// returned JSON object if needed, or print it separately. +// The returned chunked_vector (the items) is std::optional<>, because +// the user may have requested only to count items, and not return any +// items - which is different from returning an empty list of items. +static future>, size_t>> describe_items( + const cql3::selection::selection& selection, + std::unique_ptr result_set, + std::optional&& attrs_to_get, + filter&& filter) { + describe_items_visitor visitor(selection.get_columns(), attrs_to_get, filter); + co_await result_set->visit_gently(visitor); + auto scanned_count = visitor.get_scanned_count(); + utils::chunked_vector items = std::move(visitor).get_items(); + rjson::value items_descr = rjson::empty_object(); + auto size = items.size(); + rjson::add(items_descr, "Count", rjson::value(size)); + rjson::add(items_descr, "ScannedCount", rjson::value(scanned_count)); + // If attrs_to_get && attrs_to_get->empty(), this means the user asked not + // to get any attributes (i.e., a Scan or Query with Select=COUNT) and we + // shouldn't return "Items" at all. + // TODO: consider optimizing the case of Select=COUNT without a filter. + // In that case, we currently build a list of empty items and here drop + // it. We could just count the items and not bother with the empty items. + // (However, remember that when we do have a filter, we need the items). + std::optional> opt_items; + if (!attrs_to_get || !attrs_to_get->empty()) { + opt_items = std::move(items); + } + co_return std::tuple(std::move(items_descr), std::move(opt_items), size); +} + +static rjson::value encode_paging_state(const schema& schema, const service::pager::paging_state& paging_state) { + rjson::value last_evaluated_key = rjson::empty_object(); + std::vector exploded_pk = paging_state.get_partition_key().explode(); + auto exploded_pk_it = exploded_pk.begin(); + for (const column_definition& cdef : schema.partition_key_columns()) { + rjson::add_with_string_name(last_evaluated_key, std::string_view(cdef.name_as_text()), rjson::empty_object()); + rjson::value& key_entry = last_evaluated_key[cdef.name_as_text()]; + rjson::add_with_string_name(key_entry, type_to_string(cdef.type), json_key_column_value(*exploded_pk_it, cdef)); + ++exploded_pk_it; + } + auto pos = paging_state.get_position_in_partition(); + if (pos.has_key()) { + // Alternator itself allows at most one column in clustering key, but + // user can use Alternator api to access system tables which might have + // multiple clustering key columns. So we need to handle that case here. + auto cdef_it = schema.clustering_key_columns().begin(); + for(const auto &exploded_ck : pos.key().explode()) { + rjson::add_with_string_name(last_evaluated_key, std::string_view(cdef_it->name_as_text()), rjson::empty_object()); + rjson::value& key_entry = last_evaluated_key[cdef_it->name_as_text()]; + rjson::add_with_string_name(key_entry, type_to_string(cdef_it->type), json_key_column_value(exploded_ck, *cdef_it)); + ++cdef_it; + } + } + // To avoid possible conflicts (and thus having to reserve these names) we + // avoid adding the weight and region fields of the position to the paging + // state. Alternator will never need these as it doesn't have range + // tombstones (the only thing that can generate a position other than at(row)). + // We conditionally include these fields when reading CQL tables through alternator. + if (!is_alternator_keyspace(schema.ks_name()) && (!pos.has_key() || pos.get_bound_weight() != bound_weight::equal)) { + rjson::add_with_string_name(last_evaluated_key, scylla_paging_region, rjson::empty_object()); + rjson::add(last_evaluated_key[scylla_paging_region.data()], "S", rjson::from_string(fmt::to_string(pos.region()))); + rjson::add_with_string_name(last_evaluated_key, scylla_paging_weight, rjson::empty_object()); + rjson::add(last_evaluated_key[scylla_paging_weight.data()], "N", static_cast(pos.get_bound_weight())); + } + return last_evaluated_key; +} + +// RapidJSON allocates arrays contiguously in memory, so we want to avoid +// returning a large number of items as a single rapidjson array, and use +// a chunked_vector instead. The following constant is an arbitrary cutoff +// point for when to switch from a rapidjson array to a chunked_vector. +static constexpr int max_items_for_rapidjson_array = 256; + +static future do_query(service::storage_proxy& proxy, + schema_ptr table_schema, + const rjson::value* exclusive_start_key, + dht::partition_range_vector partition_ranges, + std::vector ck_bounds, + std::optional attrs_to_get, + uint32_t limit, + db::consistency_level cl, + filter filter, + query::partition_slice::option_set custom_opts, + service::client_state& client_state, + alternator::stats& stats, + tracing::trace_state_ptr trace_state, + service_permit permit, + bool enforce_authorization, + bool warn_authorization) { + lw_shared_ptr old_paging_state = nullptr; + + tracing::trace(trace_state, "Performing a database query"); + + // Reverse the schema and the clustering bounds as the underlying code expects + // reversed queries in the native reversed format. + auto query_schema = table_schema; + const bool reversed = custom_opts.contains(); + if (reversed) { + query_schema = table_schema->get_reversed(); + + std::reverse(ck_bounds.begin(), ck_bounds.end()); + for (auto& bound : ck_bounds) { + bound = query::reverse(bound); + } + } + + if (exclusive_start_key) { + partition_key pk = pk_from_json(*exclusive_start_key, table_schema); + auto pos = position_in_partition::for_partition_start(); + if (table_schema->clustering_key_size() > 0) { + pos = pos_from_json(*exclusive_start_key, table_schema); + } + old_paging_state = make_lw_shared(pk, pos, query::max_partitions, query_id::create_null_id(), service::pager::paging_state::replicas_per_token_range{}, std::nullopt, 0); + } + + co_await verify_permission(enforce_authorization, warn_authorization, client_state, table_schema, auth::permission::SELECT, stats); + + auto regular_columns = + table_schema->regular_columns() | std::views::transform(&column_definition::id) + | std::ranges::to(); + auto static_columns = + table_schema->static_columns() | std::views::transform(&column_definition::id) + | std::ranges::to(); + auto selection = cql3::selection::selection::wildcard(table_schema); + query::partition_slice::option_set opts = selection->get_query_options(); + opts.add(custom_opts); + auto partition_slice = query::partition_slice(std::move(ck_bounds), std::move(static_columns), std::move(regular_columns), opts); + auto command = ::make_lw_shared(query_schema->id(), query_schema->version(), partition_slice, proxy.get_max_result_size(partition_slice), + query::tombstone_limit(proxy.get_tombstone_limit())); + + elogger.trace("Executing read query (reversed {}): table schema {}, query schema {}", partition_slice.is_reversed(), table_schema->version(), query_schema->version()); + + auto query_state_ptr = std::make_unique(client_state, trace_state, std::move(permit)); + + // FIXME: should be moved above, set on opts, so get_max_result_size knows it? + command->slice.options.set(); + auto query_options = std::make_unique(cl, std::vector{}); + query_options = std::make_unique(std::move(query_options), std::move(old_paging_state)); + auto p = service::pager::query_pagers::pager(proxy, query_schema, selection, *query_state_ptr, *query_options, command, std::move(partition_ranges), nullptr); + + std::unique_ptr rs = co_await p->fetch_page(limit, gc_clock::now(), executor::default_timeout()); + if (!p->is_exhausted()) { + rs->get_metadata().set_paging_state(p->state()); + } + auto paging_state = rs->get_metadata().paging_state(); + bool has_filter = filter; + auto [items_descr, opt_items, size] = co_await describe_items(*selection, std::move(rs), std::move(attrs_to_get), std::move(filter)); + if (paging_state) { + rjson::add(items_descr, "LastEvaluatedKey", encode_paging_state(*table_schema, *paging_state)); + } + if (has_filter) { + stats.cql_stats.filtered_rows_read_total += p->stats().rows_read_total; + // update our "filtered_row_matched_total" for all the rows matched, despited the filter + stats.cql_stats.filtered_rows_matched_total += size; + } + if (opt_items) { + if (opt_items->size() >= max_items_for_rapidjson_array) { + // There are many items, better print the JSON and the array of + // items (opt_items) separately to avoid RapidJSON's contiguous + // allocation of arrays. + co_return make_streamed_with_extra_array(std::move(items_descr), "Items", std::move(*opt_items)); + } + // There aren't many items in the chunked vector opt_items, + // let's just insert them into the JSON object and print the + // full JSON normally. + rjson::value items_json = rjson::empty_array(); + for (auto& item : *opt_items) { + rjson::push_back(items_json, std::move(item)); + } + rjson::add(items_descr, "Items", std::move(items_json)); + } + if (is_big(items_descr)) { + co_return make_streamed(std::move(items_descr)); + } + co_return rjson::print(std::move(items_descr)); +} + +static dht::token token_for_segment(int segment, int total_segments) { + throwing_assert(total_segments > 1 && segment >= 0 && segment < total_segments); + uint64_t delta = std::numeric_limits::max() / total_segments; + return dht::token::from_int64(std::numeric_limits::min() + delta * segment); +} + +static dht::partition_range get_range_for_segment(int segment, int total_segments) { + if (total_segments == 1) { + return dht::partition_range::make_open_ended_both_sides(); + } + if (segment == 0) { + dht::token ending_token = token_for_segment(1, total_segments); + return dht::partition_range::make_ending_with( + dht::partition_range::bound(dht::ring_position::ending_at(ending_token), false)); + } else if (segment == total_segments - 1) { + dht::token starting_token = token_for_segment(segment, total_segments); + return dht::partition_range::make_starting_with( + dht::partition_range::bound(dht::ring_position::starting_at(starting_token))); + } else { + dht::token starting_token = token_for_segment(segment, total_segments); + dht::token ending_token = token_for_segment(segment + 1, total_segments); + return dht::partition_range::make( + dht::partition_range::bound(dht::ring_position::starting_at(starting_token)), + dht::partition_range::bound(dht::ring_position::ending_at(ending_token), false) + ); + } +} + +future executor::scan(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request, std::unique_ptr& audit_info) { + _stats.api_operations.scan++; + elogger.trace("Scanning {}", request); + + auto [schema, table_type] = get_table_or_view(_proxy, request); + db::consistency_level cl = get_read_consistency(request); + maybe_audit(audit_info, audit::statement_category::QUERY, schema->ks_name(), schema->cf_name(), "Scan", request, cl); + tracing::add_alternator_table_name(trace_state, schema->cf_name()); + get_stats_from_schema(_proxy, *schema)->api_operations.scan++; + auto segment = get_int_attribute(request, "Segment"); + auto total_segments = get_int_attribute(request, "TotalSegments"); + if (segment || total_segments) { + if (!segment || !total_segments) { + return make_ready_future(api_error::validation( + "Both Segment and TotalSegments attributes need to be present for a parallel scan")); + } + if (*segment < 0 || *segment >= *total_segments) { + return make_ready_future(api_error::validation( + "Segment must be non-negative and less than TotalSegments")); + } + if (*total_segments < 0 || *total_segments > 1000000) { + return make_ready_future(api_error::validation( + "TotalSegments must be non-negative and less or equal to 1000000")); + } + } + + rjson::value* exclusive_start_key = rjson::find(request, "ExclusiveStartKey"); + + if (table_type == table_or_view_type::gsi && cl != db::consistency_level::LOCAL_ONE) { + return make_ready_future(api_error::validation( + "Consistent reads are not allowed on global indexes (GSI)")); + } + rjson::value* limit_json = rjson::find(request, "Limit"); + uint32_t limit = limit_json ? limit_json->GetUint64() : std::numeric_limits::max(); + if (limit <= 0) { + return make_ready_future(api_error::validation("Limit must be greater than 0")); + } + + select_type select = parse_select(request, table_type); + + std::unordered_set used_attribute_names; + std::unordered_set used_attribute_values; + auto attrs_to_get = calculate_attrs_to_get(request, *_parsed_expression_cache, used_attribute_names, select); + + dht::partition_range_vector partition_ranges; + if (segment) { + auto range = get_range_for_segment(*segment, *total_segments); + if (exclusive_start_key) { + auto ring_pos = dht::ring_position{dht::decorate_key(*schema, pk_from_json(*exclusive_start_key, schema))}; + if (!range.contains(ring_pos, dht::ring_position_comparator(*schema))) { + return make_ready_future(api_error::validation( + format("The provided starting key is invalid: Invalid ExclusiveStartKey. Please use ExclusiveStartKey " + "with correct Segment. TotalSegments: {} Segment: {}", *total_segments, *segment))); + } + } + partition_ranges.push_back(range); + } else { + partition_ranges.push_back(dht::partition_range::make_open_ended_both_sides()); + } + std::vector ck_bounds{query::clustering_range::make_open_ended_both_sides()}; + + filter filter(*_parsed_expression_cache, request, filter::request_type::SCAN, used_attribute_names, used_attribute_values); + // Note: Unlike Query, Scan does allow a filter on the key attributes. + // For some *specific* cases of key filtering, such an equality test on + // partition key or comparison operator for the sort key, we could have + // optimized the filtering by modifying partition_ranges and/or + // ck_bounds. We haven't done this optimization yet. + + const rjson::value* expression_attribute_names = rjson::find(request, "ExpressionAttributeNames"); + const rjson::value* expression_attribute_values = rjson::find(request, "ExpressionAttributeValues"); + verify_all_are_used(expression_attribute_names, used_attribute_names, "ExpressionAttributeNames", "Scan"); + verify_all_are_used(expression_attribute_values, used_attribute_values, "ExpressionAttributeValues", "Scan"); + + return do_query(_proxy, schema, exclusive_start_key, std::move(partition_ranges), std::move(ck_bounds), std::move(attrs_to_get), limit, cl, + std::move(filter), query::partition_slice::option_set(), client_state, _stats, trace_state, std::move(permit), _enforce_authorization, _warn_authorization); +} + +static dht::partition_range calculate_pk_bound(schema_ptr schema, const column_definition& pk_cdef, const rjson::value& comp_definition, const rjson::value& attrs) { + auto op = get_comparison_operator(comp_definition); + if (op != comparison_operator_type::EQ) { + throw api_error::validation(format("Hash key can only be restricted with equality operator (EQ). {} not supported.", comp_definition)); + } + if (attrs.Size() != 1) { + throw api_error::validation(format("A single attribute is required for a hash key EQ restriction: {}", attrs)); + } + bytes raw_value = get_key_from_typed_value(attrs[0], pk_cdef); + partition_key pk = partition_key::from_singular_bytes(*schema, std::move(raw_value)); + auto decorated_key = dht::decorate_key(*schema, pk); + return dht::partition_range(decorated_key); +} + +static query::clustering_range get_clustering_range_for_begins_with(bytes&& target, const clustering_key& ck, schema_ptr schema, data_type t) { + auto it = boost::range::find_end(target, bytes("\xFF"), std::not_equal_to()); + if (it != target.end()) { + ++*it; + target.resize(std::distance(target.begin(), it) + 1); + clustering_key upper_limit = clustering_key::from_single_value(*schema, target); + return query::clustering_range::make(query::clustering_range::bound(ck), query::clustering_range::bound(upper_limit, false)); + } + return query::clustering_range::make_starting_with(query::clustering_range::bound(ck)); +} + +static query::clustering_range calculate_ck_bound(schema_ptr schema, const column_definition& ck_cdef, const rjson::value& comp_definition, const rjson::value& attrs) { + auto op = get_comparison_operator(comp_definition); + const size_t expected_attrs_size = (op == comparison_operator_type::BETWEEN) ? 2 : 1; + if (attrs.Size() != expected_attrs_size) { + throw api_error::validation(format("{} arguments expected for a sort key restriction: {}", expected_attrs_size, attrs)); + } + bytes raw_value = get_key_from_typed_value(attrs[0], ck_cdef); + clustering_key ck = clustering_key::from_single_value(*schema, raw_value); + switch (op) { + case comparison_operator_type::EQ: + return query::clustering_range(ck); + case comparison_operator_type::LE: + return query::clustering_range::make_ending_with(query::clustering_range::bound(ck)); + case comparison_operator_type::LT: + return query::clustering_range::make_ending_with(query::clustering_range::bound(ck, false)); + case comparison_operator_type::GE: + return query::clustering_range::make_starting_with(query::clustering_range::bound(ck)); + case comparison_operator_type::GT: + return query::clustering_range::make_starting_with(query::clustering_range::bound(ck, false)); + case comparison_operator_type::BETWEEN: { + bytes raw_upper_limit = get_key_from_typed_value(attrs[1], ck_cdef); + clustering_key upper_limit = clustering_key::from_single_value(*schema, raw_upper_limit); + return query::clustering_range::make(query::clustering_range::bound(ck), query::clustering_range::bound(upper_limit)); + } + case comparison_operator_type::BEGINS_WITH: { + if (raw_value.empty()) { + return query::clustering_range::make_open_ended_both_sides(); + } + // NOTICE(sarna): A range starting with given prefix and ending (non-inclusively) with a string "incremented" by a single + // character at the end. Throws for NUMBER instances. + if (!ck_cdef.type->is_compatible_with(*utf8_type)) { + throw api_error::validation(fmt::format("BEGINS_WITH operator cannot be applied to type {}", type_to_string(ck_cdef.type))); + } + return get_clustering_range_for_begins_with(std::move(raw_value), ck, schema, ck_cdef.type); + } + default: + throw api_error::validation(format("Operator {} not supported for sort key", comp_definition)); + } +} + +// Calculates primary key bounds from KeyConditions +static std::pair> +calculate_bounds_conditions(schema_ptr schema, const rjson::value& conditions) { + dht::partition_range_vector partition_ranges; + std::vector ck_bounds; + + for (auto it = conditions.MemberBegin(); it != conditions.MemberEnd(); ++it) { + sstring key = rjson::to_sstring(it->name); + const rjson::value& condition = it->value; + + const rjson::value& comp_definition = rjson::get(condition, "ComparisonOperator"); + const rjson::value& attr_list = rjson::get(condition, "AttributeValueList"); + + const column_definition& pk_cdef = schema->partition_key_columns().front(); + const column_definition* ck_cdef = schema->clustering_key_size() > 0 ? &schema->clustering_key_columns().front() : nullptr; + if (key == pk_cdef.name_as_text()) { + if (!partition_ranges.empty()) { + throw api_error::validation("Currently only a single restriction per key is allowed"); + } + partition_ranges.push_back(calculate_pk_bound(schema, pk_cdef, comp_definition, attr_list)); + } + if (ck_cdef && key == ck_cdef->name_as_text()) { + if (!ck_bounds.empty()) { + throw api_error::validation("Currently only a single restriction per key is allowed"); + } + ck_bounds.push_back(calculate_ck_bound(schema, *ck_cdef, comp_definition, attr_list)); + } + } + + // Validate that a query's conditions must be on the hash key, and + // optionally also on the sort key if it exists. + if (partition_ranges.empty()) { + throw api_error::validation(format("Query missing condition on hash key '{}'", schema->partition_key_columns().front().name_as_text())); + } + if (schema->clustering_key_size() == 0) { + if (conditions.MemberCount() != 1) { + throw api_error::validation("Only one condition allowed in table with only hash key"); + } + } else { + if (conditions.MemberCount() == 2 && ck_bounds.empty()) { + throw api_error::validation(format("Query missing condition on sort key '{}'", schema->clustering_key_columns().front().name_as_text())); + } else if (conditions.MemberCount() > 2) { + throw api_error::validation("Only one or two conditions allowed in table with hash key and sort key"); + } + } + + if (ck_bounds.empty()) { + ck_bounds.push_back(query::clustering_range::make_open_ended_both_sides()); + } + + return {std::move(partition_ranges), std::move(ck_bounds)}; +} + +// Extract the top-level column name specified in a KeyConditionExpression. +// If a nested attribute path is given, a ValidationException is generated. +// If the column name is a #reference to ExpressionAttributeNames, the +// reference is resolved. +// Note this function returns a string_view, which may refer to data in the +// given parsed::value or expression_attribute_names. +static std::string_view get_toplevel(const parsed::value& v, + const rjson::value* expression_attribute_names, + std::unordered_set& used_attribute_names) +{ + const parsed::path& path = std::get(v._value); + if (path.has_operators()) { + throw api_error::validation("KeyConditionExpression does not support nested attributes"); + } + std::string_view column_name = path.root(); + if (column_name.size() > 0 && column_name[0] == '#') { + used_attribute_names.emplace(column_name); + if (!expression_attribute_names) { + throw api_error::validation( + fmt::format("ExpressionAttributeNames missing, entry '{}' required by KeyConditionExpression", + column_name)); + } + const rjson::value* value = rjson::find(*expression_attribute_names, column_name); + if (!value || !value->IsString()) { + throw api_error::validation( + fmt::format("ExpressionAttributeNames missing entry '{}' required by KeyConditionExpression", + column_name)); + } + column_name = rjson::to_string_view(*value); + } + return column_name; +} + +// Extract a constant value specified in a KeyConditionExpression. +// This constant was originally parsed as a reference (:name) to a member of +// ExpressionAttributeValues, but at this point, after resolve_value(), it +// was already converted into a JSON value. +// This function decodes the value (using its given expected type) into bytes +// which Scylla uses as the actual key value. If the value has the wrong type, +// or the input had other problems, a ValidationException is thrown. +static bytes get_constant_value(const parsed::value& v, + const column_definition& column) +{ + const parsed::constant& constant = std::get(v._value); + const parsed::constant::literal& lit = std::get(constant._value); + return get_key_from_typed_value(*lit, column); +} + +// condition_expression_and_list extracts a list of ANDed primitive conditions +// from a condition_expression. This is useful for KeyConditionExpression, +// which may not use OR or NOT. If the given condition_expression does use +// OR or NOT, this function throws a ValidationException. +static void condition_expression_and_list( + const parsed::condition_expression& condition_expression, + std::vector& conditions) +{ + if (condition_expression._negated) { + throw api_error::validation("KeyConditionExpression cannot use NOT"); + } + std::visit(overloaded_functor { + [&] (const parsed::primitive_condition& cond) { + conditions.push_back(&cond); + }, + [&] (const parsed::condition_expression::condition_list& list) { + if (list.op == '|' && list.conditions.size() > 1) { + throw api_error::validation("KeyConditionExpression cannot use OR"); + } + for (const parsed::condition_expression& cond : list.conditions) { + condition_expression_and_list(cond, conditions); + } + } + }, condition_expression._expression); +} + +// Calculates primary key bounds from KeyConditionExpression +static std::pair> +calculate_bounds_condition_expression(schema_ptr schema, + const rjson::value& expression, + const rjson::value* expression_attribute_values, + std::unordered_set& used_attribute_values, + const rjson::value* expression_attribute_names, + std::unordered_set& used_attribute_names, + parsed::expression_cache& parsed_expression_cache) +{ + if (!expression.IsString()) { + throw api_error::validation("KeyConditionExpression must be a string"); + } + if (expression.GetStringLength() == 0) { + throw api_error::validation("KeyConditionExpression must not be empty"); + } + // We parse the KeyConditionExpression with the same parser we use for + // ConditionExpression. But KeyConditionExpression only supports a subset + // of the ConditionExpression features, so we have many additional + // verifications below that the key condition is legal. Briefly, a valid + // key condition must contain a single partition key and a single + // sort-key range. + parsed::condition_expression p; + try { + p = parsed_expression_cache.parse_condition_expression(rjson::to_string_view(expression), "KeyConditionExpression"); + } catch(expressions_syntax_error& e) { + throw api_error::validation(e.what()); + } + resolve_condition_expression(p, + expression_attribute_names, expression_attribute_values, + used_attribute_names, used_attribute_values); + std::vector conditions; + condition_expression_and_list(p, conditions); + + if (conditions.size() < 1 || conditions.size() > 2) { + throw api_error::validation( + "KeyConditionExpression syntax error: must have 1 or 2 conditions"); + } + // Scylla allows us to have an (equality) constraint on the partition key + // pk_cdef, and a range constraint on the *first* clustering key ck_cdef. + // Note that this is also good enough for our GSI implementation - the + // GSI's user-specified sort key will be the first clustering key. + // FIXME: In the case described in issue #5320 (base and GSI both have + // just hash key - but different ones), this may allow the user to Query + // using the base key which isn't officially part of the GSI. + const column_definition& pk_cdef = schema->partition_key_columns().front(); + const column_definition* ck_cdef = schema->clustering_key_size() > 0 ? + &schema->clustering_key_columns().front() : nullptr; + + dht::partition_range_vector partition_ranges; + std::vector ck_bounds; + for (const parsed::primitive_condition* condp : conditions) { + const parsed::primitive_condition& cond = *condp; + // In all comparison operators, one operand must be a column name, + // the other is a constant (value reference). We remember which is + // which in toplevel_ind, and also the column name in key (not just + // for comparison operators). + std::string_view key; + int toplevel_ind; + switch (cond._values.size()) { + case 1: { + // The only legal single-value condition is a begin_with() function, + // and it must have two parameters - a top-level attribute and a + // value reference.. + const parsed::value::function_call *f = std::get_if(&cond._values[0]._value); + if (!f) { + throw api_error::validation("KeyConditionExpression cannot be just a value"); + } + if (f->_function_name != "begins_with") { + throw api_error::validation( + fmt::format("KeyConditionExpression function '{}' not supported",f->_function_name)); + } + if (f->_parameters.size() != 2 || !f->_parameters[0].is_path() || + !f->_parameters[1].is_constant()) { + throw api_error::validation( + "KeyConditionExpression begins_with() takes attribute and value"); + } + key = get_toplevel(f->_parameters[0], expression_attribute_names, used_attribute_names); + toplevel_ind = -1; + break; + } + case 2: + if (cond._values[0].is_path() && cond._values[1].is_constant()) { + toplevel_ind = 0; + } else if (cond._values[1].is_path() && cond._values[0].is_constant()) { + toplevel_ind = 1; + } else { + throw api_error::validation("KeyConditionExpression must compare attribute with constant"); + } + key = get_toplevel(cond._values[toplevel_ind], expression_attribute_names, used_attribute_names); + break; + case 3: + // Only BETWEEN has three operands. First must be a column name, + // two other must be value references (constants): + if (cond._op != parsed::primitive_condition::type::BETWEEN) { + // Shouldn't happen unless we have a bug in the parser + throw std::logic_error(format("Wrong number of values {} in primitive_condition", cond._values.size())); + } + if (cond._values[0].is_path() && cond._values[1].is_constant() && cond._values[2].is_constant()) { + toplevel_ind = 0; + key = get_toplevel(cond._values[0], expression_attribute_names, used_attribute_names); + } else { + throw api_error::validation("KeyConditionExpression must compare attribute with constants"); + } + break; + default: + // Shouldn't happen unless we have a bug in the parser + throw std::logic_error(format("Wrong number of values {} in primitive_condition", cond._values.size())); + } + if (cond._op == parsed::primitive_condition::type::IN) { + throw api_error::validation("KeyConditionExpression does not support IN operator"); + } else if (cond._op == parsed::primitive_condition::type::NE) { + throw api_error::validation("KeyConditionExpression does not support NE operator"); + } else if (cond._op == parsed::primitive_condition::type::EQ) { + // the EQ operator (=) is the only one which can be used for both + // the partition key and sort key: + if (sstring(key) == pk_cdef.name_as_text()) { + if (!partition_ranges.empty()) { + throw api_error::validation( + "KeyConditionExpression allows only one condition for each key"); + } + bytes raw_value = get_constant_value(cond._values[!toplevel_ind], pk_cdef); + partition_key pk = partition_key::from_singular_bytes(*schema, std::move(raw_value)); + auto decorated_key = dht::decorate_key(*schema, pk); + partition_ranges.push_back(dht::partition_range(decorated_key)); + } else if (ck_cdef && sstring(key) == ck_cdef->name_as_text()) { + if (!ck_bounds.empty()) { + throw api_error::validation( + "KeyConditionExpression allows only one condition for each key"); + } + bytes raw_value = get_constant_value(cond._values[!toplevel_ind], *ck_cdef); + clustering_key ck = clustering_key::from_single_value(*schema, raw_value); + ck_bounds.push_back(query::clustering_range(ck)); + } else { + throw api_error::validation( + fmt::format("KeyConditionExpression condition on non-key attribute {}", key)); + } + continue; + } + // If we're still here, it's any other operator besides EQ, and these + // are allowed *only* on the clustering key: + if (sstring(key) == pk_cdef.name_as_text()) { + throw api_error::validation( + fmt::format("KeyConditionExpression only '=' condition is supported on partition key {}", key)); + } else if (!ck_cdef || sstring(key) != ck_cdef->name_as_text()) { + throw api_error::validation( + fmt::format("KeyConditionExpression condition on non-key attribute {}", key)); + } + if (!ck_bounds.empty()) { + throw api_error::validation( + "KeyConditionExpression allows only one condition for each key"); + } + if (cond._op == parsed::primitive_condition::type::BETWEEN) { + clustering_key ck1 = clustering_key::from_single_value(*schema, + get_constant_value(cond._values[1], *ck_cdef)); + clustering_key ck2 = clustering_key::from_single_value(*schema, + get_constant_value(cond._values[2], *ck_cdef)); + ck_bounds.push_back(query::clustering_range::make( + query::clustering_range::bound(ck1), query::clustering_range::bound(ck2))); + continue; + } else if (cond._values.size() == 1) { + // We already verified above, that this case this can only be a + // function call to begins_with(), with the first parameter the + // key, the second the value reference. + bytes raw_value = get_constant_value( + std::get(cond._values[0]._value)._parameters[1], *ck_cdef); + if (!ck_cdef->type->is_compatible_with(*utf8_type)) { + // begins_with() supported on bytes and strings (both stored + // in the database as strings) but not on numbers. + throw api_error::validation( + fmt::format("KeyConditionExpression begins_with() not supported on type {}", + type_to_string(ck_cdef->type))); + } else if (raw_value.empty()) { + ck_bounds.push_back(query::clustering_range::make_open_ended_both_sides()); + } else { + clustering_key ck = clustering_key::from_single_value(*schema, raw_value); + ck_bounds.push_back(get_clustering_range_for_begins_with(std::move(raw_value), ck, schema, ck_cdef->type)); + } + continue; + } + + // All remaining operator have one value reference parameter in index + // !toplevel_ind. Note how toplevel_ind==1 reverses the direction of + // an inequality. + bytes raw_value = get_constant_value(cond._values[!toplevel_ind], *ck_cdef); + clustering_key ck = clustering_key::from_single_value(*schema, raw_value); + if ((cond._op == parsed::primitive_condition::type::LT && toplevel_ind == 0) || + (cond._op == parsed::primitive_condition::type::GT && toplevel_ind == 1)) { + ck_bounds.push_back(query::clustering_range::make_ending_with(query::clustering_range::bound(ck, false))); + } else if ((cond._op == parsed::primitive_condition::type::GT && toplevel_ind == 0) || + (cond._op == parsed::primitive_condition::type::LT && toplevel_ind == 1)) { + ck_bounds.push_back(query::clustering_range::make_starting_with(query::clustering_range::bound(ck, false))); + } else if ((cond._op == parsed::primitive_condition::type::LE && toplevel_ind == 0) || + (cond._op == parsed::primitive_condition::type::GE && toplevel_ind == 1)) { + ck_bounds.push_back(query::clustering_range::make_ending_with(query::clustering_range::bound(ck))); + } else if ((cond._op == parsed::primitive_condition::type::GE && toplevel_ind == 0) || + (cond._op == parsed::primitive_condition::type::LE && toplevel_ind == 1)) { + ck_bounds.push_back(query::clustering_range::make_starting_with(query::clustering_range::bound(ck))); + } + } + + if (partition_ranges.empty()) { + throw api_error::validation( + format("KeyConditionExpression requires a condition on partition key {}", pk_cdef.name_as_text())); + } + if (ck_bounds.empty()) { + ck_bounds.push_back(query::clustering_range::make_open_ended_both_sides()); + } + return {std::move(partition_ranges), std::move(ck_bounds)}; +} + +static future query_vector( + service::storage_proxy& proxy, + vector_search::vector_store_client& vsc, + rjson::value request, + service::client_state& client_state, + tracing::trace_state_ptr trace_state, + service_permit permit, + bool enforce_authorization, + bool warn_authorization, + alternator::stats& stats, + parsed::expression_cache& parsed_expr_cache) { + // If vector search is requested, IndexName must be given and must + // refer to a vector index - not to a GSI or LSI. + const rjson::value* index_name_v = rjson::find(request, "IndexName"); + if (!index_name_v || !index_name_v->IsString()) { + co_return api_error::validation( + "VectorSearch requires IndexName referring to a vector index"); + } + std::string_view index_name = rjson::to_string_view(*index_name_v); + schema_ptr base_schema = get_table(proxy, request); + bool is_vector = std::ranges::any_of(base_schema->indices(), [&](const index_metadata& im) { + const auto& opts = im.options(); + auto it = opts.find(db::index::secondary_index::custom_class_option_name); + return im.name() == index_name && it != opts.end() && it->second == "vector_index"; + }); + if (!is_vector) { + co_return api_error::validation( + format("VectorSearch IndexName '{}' is not a vector index.", index_name)); + } + // QueryVector is required inside VectorSearch. + const rjson::value* vector_search = rjson::find(request, "VectorSearch"); + if (!vector_search || !vector_search->IsObject()) { + co_return api_error::validation( + "VectorSearch requires a VectorSearch parameter"); + } + const rjson::value* query_vector = rjson::find(*vector_search, "QueryVector"); + if (!query_vector || !query_vector->IsObject()) { + co_return api_error::validation( + "VectorSearch requires a QueryVector parameter"); + } + // QueryVector should be is a DynamoDB value, which must be of type "L" + // (a list), containing only elements of type "N" (numbers). The number + // of these elements must be exactly the "dimensions" defined for this + // vector index. We'll now validate all these assumptions and parse + // all the numbers in the vector into an std::vector query_vec - + // the type that ann() wants. + int dimensions = 0; + for (const index_metadata& im : base_schema->indices()) { + if (im.name() == index_name) { + auto dims_it = im.options().find("dimensions"); + if (dims_it != im.options().end()) { + try { + dimensions = std::stoi(dims_it->second); + } catch (...) {} + } + break; + } + } + throwing_assert(dimensions > 0); + const rjson::value* qv_list = rjson::find(*query_vector, "L"); + if (!qv_list || !qv_list->IsArray()) { + co_return api_error::validation( + "VectorSearch QueryVector must be a list of numbers"); + } + const auto& arr = qv_list->GetArray(); + if ((int)arr.Size() != dimensions) { + co_return api_error::validation( + format("VectorSearch QueryVector length {} does not match index Dimensions {}", + arr.Size(), dimensions)); + } + std::vector query_vec; + query_vec.reserve(arr.Size()); + for (const rjson::value& elem : arr) { + if (!elem.IsObject()) { + co_return api_error::validation( + "VectorSearch QueryVector must contain only numbers"); + } + const rjson::value* n_val = rjson::find(elem, "N"); + if (!n_val || !n_val->IsString()) { + co_return api_error::validation( + "VectorSearch QueryVector must contain only numbers"); + } + std::string_view num_str = rjson::to_string_view(*n_val); + float f; + auto [ptr, ec] = std::from_chars(num_str.data(), num_str.data() + num_str.size(), f); + if (ec != std::errc{} || ptr != num_str.data() + num_str.size()) { + co_return api_error::validation( + format("VectorSearch QueryVector element '{}' is not a valid number", num_str)); + } + query_vec.push_back(f); + } + + // Limit is mandatory for vector search: it defines k, the number of + // nearest neighbors to return. + const rjson::value* limit_json = rjson::find(request, "Limit"); + if (!limit_json || !limit_json->IsUint()) { + co_return api_error::validation("VectorSearch requires a positive integer Limit parameter"); + } + uint32_t limit = limit_json->GetUint(); + if (limit == 0) { + co_return api_error::validation("Limit must be greater than 0"); + } + + // Consistent reads are not supported for vector search, just like GSI. + if (get_read_consistency(request) != db::consistency_level::LOCAL_ONE) { + co_return api_error::validation( + "Consistent reads are not allowed on vector indexes"); + } + + // Pagination (ExclusiveStartKey) is not supported for vector search. + if (rjson::find(request, "ExclusiveStartKey")) { + co_return api_error::validation( + "VectorSearch does not support pagination (ExclusiveStartKey)"); + } + + // ScanIndexForward is not supported for vector search: the ordering of + // results is determined by vector distance, not by the sort key. + if (rjson::find(request, "ScanIndexForward")) { + co_return api_error::validation( + "VectorSearch does not support ScanIndexForward"); + } + + std::unordered_set used_attribute_names; + std::unordered_set used_attribute_values; + // Parse the Select parameter and determine which attributes to return. + // For a vector index, the default Select is ALL_ATTRIBUTES (full items). + // ALL_PROJECTED_ATTRIBUTES is significantly more efficent because it + // returns what the vector store returned without looking up additional + // base-table data. Currently only the primary key attributes are projected + // but in the future we'll implement projecting additional attributes into + // the vector index - these additional attributes will also be usable for + // filtering). COUNT returns only the count without items. + select_type select = parse_select(request, table_or_view_type::vector_index); + std::optional attrs_to_get_opt; + if (select == select_type::projection) { + // ALL_PROJECTED_ATTRIBUTES for a vector index: return only key attributes. + alternator::attrs_to_get key_attrs; + for (const column_definition& cdef : base_schema->partition_key_columns()) { + attribute_path_map_add("Select", key_attrs, cdef.name_as_text()); + } + for (const column_definition& cdef : base_schema->clustering_key_columns()) { + attribute_path_map_add("Select", key_attrs, cdef.name_as_text()); + } + attrs_to_get_opt = std::move(key_attrs); + } else { + attrs_to_get_opt = calculate_attrs_to_get(request, parsed_expr_cache, used_attribute_names, select); + } + // QueryFilter (the old-style API) is not supported for vector search Queries. + if (rjson::find(request, "QueryFilter")) { + co_return api_error::validation( + "VectorSearch does not support QueryFilter; use FilterExpression instead"); + } + // FilterExpression: post-filter the vector search results by any attribute. + filter flt(parsed_expr_cache, request, filter::request_type::QUERY, + used_attribute_names, used_attribute_values); + const rjson::value* expression_attribute_names = rjson::find(request, "ExpressionAttributeNames"); + verify_all_are_used(expression_attribute_names, used_attribute_names, "ExpressionAttributeNames", "Query"); + const rjson::value* expression_attribute_values = rjson::find(request, "ExpressionAttributeValues"); + verify_all_are_used(expression_attribute_values, used_attribute_values, "ExpressionAttributeValues", "Query"); + + // Verify the user has SELECT permission on the base table, as we + // do for every type of read operation after validating the input + // parameters. + co_await verify_permission(enforce_authorization, warn_authorization, + client_state, base_schema, auth::permission::SELECT, stats); + + // Query the vector store for the approximate nearest neighbors. + auto timeout = executor::default_timeout(); + abort_on_expiry aoe(timeout); + rjson::value pre_filter = rjson::empty_object(); // TODO, implement + auto pkeys_result = co_await vsc.ann( + base_schema->ks_name(), std::string(index_name), base_schema, + std::move(query_vec), limit, pre_filter, aoe.abort_source()); + if (!pkeys_result.has_value()) { + const sstring error_msg = std::visit(vector_search::error_visitor{}, pkeys_result.error()); + co_return api_error::validation(error_msg); + } + const std::vector& pkeys = pkeys_result.value(); + + // For SELECT=COUNT with no filter: skip fetching from the base table and + // just return the count of candidates returned by the vector store. + // If a filter is present, fall through to the base-table fetch to apply it. + if (select == select_type::count && !flt) { + rjson::value response = rjson::empty_object(); + rjson::add(response, "Count", rjson::value(static_cast(pkeys.size()))); + rjson::add(response, "ScannedCount", rjson::value(static_cast(pkeys.size()))); + co_return rjson::print(std::move(response)); + } + + // For SELECT=ALL_PROJECTED_ATTRIBUTES with no filter: skip fetching from + // the base table and build items directly from the key columns returned by + // the vector store. If a filter is present, fall through to the base-table + // fetch to apply it. + if (select == select_type::projection && !flt) { + rjson::value items_json = rjson::empty_array(); + for (const auto& pkey : pkeys) { + rjson::value item = rjson::empty_object(); + std::vector exploded_pk = pkey.partition.key().explode(); + auto exploded_pk_it = exploded_pk.begin(); + for (const column_definition& cdef : base_schema->partition_key_columns()) { + rjson::value key_val = rjson::empty_object(); + rjson::add_with_string_name(key_val, type_to_string(cdef.type), json_key_column_value(*exploded_pk_it, cdef)); + rjson::add_with_string_name(item, std::string_view(cdef.name_as_text()), std::move(key_val)); + ++exploded_pk_it; + } + if (base_schema->clustering_key_size() > 0) { + std::vector exploded_ck = pkey.clustering.explode(); + auto exploded_ck_it = exploded_ck.begin(); + for (const column_definition& cdef : base_schema->clustering_key_columns()) { + rjson::value key_val = rjson::empty_object(); + rjson::add_with_string_name(key_val, type_to_string(cdef.type), json_key_column_value(*exploded_ck_it, cdef)); + rjson::add_with_string_name(item, std::string_view(cdef.name_as_text()), std::move(key_val)); + ++exploded_ck_it; + } + } + rjson::push_back(items_json, std::move(item)); + } + rjson::value response = rjson::empty_object(); + rjson::add(response, "Count", rjson::value(static_cast(items_json.Size()))); + rjson::add(response, "ScannedCount", rjson::value(static_cast(pkeys.size()))); + rjson::add(response, "Items", std::move(items_json)); + co_return rjson::print(std::move(response)); + } + + // TODO: For SELECT=SPECIFIC_ATTRIBUTES, if they are part of the projected + // attributes, we should use the above optimized code path - not fall through + // to the read from the base table as below as we need to do if the specific + // attributes contain non-projected columns. + + // Fetch the matching items from the base table and build the response. + // When a filter is present, we always fetch the full item so that all + // attributes are available for filter evaluation, regardless of the + // projection required for the final response. + auto selection = cql3::selection::selection::wildcard(base_schema); + auto regular_columns = base_schema->regular_columns() + | std::views::transform(&column_definition::id) + | std::ranges::to(); + auto attrs_to_get = ::make_shared>( + flt ? std::nullopt : std::move(attrs_to_get_opt)); + + rjson::value items_json = rjson::empty_array(); + int matched_count = 0; + + if (base_schema->clustering_key_size() == 0) { + // Hash-only table: query each partition individually, in the order + // returned by the vector store, to preserve vector-distance ordering + // in the response. A multi-partition batch read would return items in + // token order instead, which would be wrong. + // FIXME: do this more efficiently with a batched read that preserves + // ordering. + for (const auto& pkey : pkeys) { + std::vector bounds{ + query::clustering_range::make_open_ended_both_sides()}; + auto partition_slice = query::partition_slice(std::move(bounds), {}, + regular_columns, selection->get_query_options()); + auto command = ::make_lw_shared( + base_schema->id(), base_schema->version(), partition_slice, + proxy.get_max_result_size(partition_slice), + query::tombstone_limit(proxy.get_tombstone_limit())); + service::storage_proxy::coordinator_query_result qr = + co_await proxy.query(base_schema, command, + {dht::partition_range(pkey.partition)}, + db::consistency_level::LOCAL_ONE, + service::storage_proxy::coordinator_query_options( + timeout, permit, client_state, trace_state)); + auto opt_item = executor::describe_single_item(base_schema, partition_slice, + *selection, *qr.query_result, *attrs_to_get); + if (opt_item && (!flt || flt.check(*opt_item))) { + ++matched_count; + if (select != select_type::count) { + if (select == select_type::projection) { + // A filter caused us to fall through here instead of + // taking the projection early-exit above. Reconstruct + // the key-only item from the full item we fetched. + rjson::value key_item = rjson::empty_object(); + for (const column_definition& cdef : base_schema->partition_key_columns()) { + if (const rjson::value* v = rjson::find(*opt_item, cdef.name_as_text())) { + rjson::add_with_string_name(key_item, cdef.name_as_text(), rjson::copy(*v)); + } + } + rjson::push_back(items_json, std::move(key_item)); + } else { + // When a filter caused us to fetch the full item, apply the + // requested projection (attrs_to_get_opt) before returning it. + // This mirrors describe_items_visitor::end_row() which removes + // extra filter attributes from the returned item. + if (flt && attrs_to_get_opt) { + for (const auto& [attr_name, subpath] : *attrs_to_get_opt) { + if (!subpath.has_value()) { + if (rjson::value* toplevel = rjson::find(*opt_item, attr_name)) { + if (!hierarchy_filter(*toplevel, subpath)) { + rjson::remove_member(*opt_item, attr_name); + } + } + } + } + std::vector to_remove; + for (auto it = opt_item->MemberBegin(); it != opt_item->MemberEnd(); ++it) { + std::string key(it->name.GetString(), it->name.GetStringLength()); + if (!attrs_to_get_opt->contains(key)) { + to_remove.push_back(std::move(key)); + } + } + for (const auto& key : to_remove) { + rjson::remove_member(*opt_item, key); + } + } + rjson::push_back(items_json, std::move(*opt_item)); + } + } + } + } + } else { + // Hash+range table: query each (partition, clustering) pair individually. + // FIXME: do this more efficiently!!! + for (const auto& pkey : pkeys) { + std::vector bounds{ + query::clustering_range::make_singular(pkey.clustering)}; + auto partition_slice = query::partition_slice(std::move(bounds), {}, + regular_columns, selection->get_query_options()); + auto command = ::make_lw_shared( + base_schema->id(), base_schema->version(), partition_slice, + proxy.get_max_result_size(partition_slice), + query::tombstone_limit(proxy.get_tombstone_limit())); + service::storage_proxy::coordinator_query_result qr = + co_await proxy.query(base_schema, command, + {dht::partition_range(pkey.partition)}, + db::consistency_level::LOCAL_ONE, + service::storage_proxy::coordinator_query_options( + timeout, permit, client_state, trace_state)); + auto opt_item = executor::describe_single_item(base_schema, partition_slice, + *selection, *qr.query_result, *attrs_to_get); + if (opt_item && (!flt || flt.check(*opt_item))) { + ++matched_count; + if (select != select_type::count) { + if (select == select_type::projection) { + // A filter caused us to fall through here; project to keys. + rjson::value key_item = rjson::empty_object(); + for (const column_definition& cdef : base_schema->partition_key_columns()) { + if (const rjson::value* v = rjson::find(*opt_item, cdef.name_as_text())) { + rjson::add_with_string_name(key_item, cdef.name_as_text(), rjson::copy(*v)); + } + } + for (const column_definition& cdef : base_schema->clustering_key_columns()) { + if (const rjson::value* v = rjson::find(*opt_item, cdef.name_as_text())) { + rjson::add_with_string_name(key_item, cdef.name_as_text(), rjson::copy(*v)); + } + } + rjson::push_back(items_json, std::move(key_item)); + } else { + // When a filter caused us to fetch the full item, apply the + // requested projection (attrs_to_get_opt) before returning it. + // This mirrors describe_items_visitor::end_row() which removes + // extra filter attributes from the returned item. + if (flt && attrs_to_get_opt) { + for (const auto& [attr_name, subpath] : *attrs_to_get_opt) { + if (!subpath.has_value()) { + if (rjson::value* toplevel = rjson::find(*opt_item, attr_name)) { + if (!hierarchy_filter(*toplevel, subpath)) { + rjson::remove_member(*opt_item, attr_name); + } + } + } + } + std::vector to_remove; + for (auto it = opt_item->MemberBegin(); it != opt_item->MemberEnd(); ++it) { + std::string key(it->name.GetString(), it->name.GetStringLength()); + if (!attrs_to_get_opt->contains(key)) { + to_remove.push_back(std::move(key)); + } + } + for (const auto& key : to_remove) { + rjson::remove_member(*opt_item, key); + } + } + rjson::push_back(items_json, std::move(*opt_item)); + } + } + } + } + } + + rjson::value response = rjson::empty_object(); + if (select == select_type::count) { + rjson::add(response, "Count", rjson::value(matched_count)); + } else { + rjson::add(response, "Count", rjson::value(static_cast(items_json.Size()))); + rjson::add(response, "Items", std::move(items_json)); + } + rjson::add(response, "ScannedCount", rjson::value(static_cast(pkeys.size()))); + co_return rjson::print(std::move(response)); +} + +future executor::query(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request, std::unique_ptr& audit_info) { + _stats.api_operations.query++; + elogger.trace("Querying {}", request); + + if (rjson::find(request, "VectorSearch")) { + // If vector search is requested, we have a separate code path. + // IndexName must be given and must refer to a vector index - not + // to a GSI or LSI as the code below assumes. + return query_vector(_proxy, _vsc, std::move(request), client_state, trace_state, std::move(permit), + _enforce_authorization, _warn_authorization, _stats, *_parsed_expression_cache); + } + + auto [schema, table_type] = get_table_or_view(_proxy, request); + db::consistency_level cl = get_read_consistency(request); + maybe_audit(audit_info, audit::statement_category::QUERY, schema->ks_name(), schema->cf_name(), "Query", request, cl); + + get_stats_from_schema(_proxy, *schema)->api_operations.query++; + tracing::add_alternator_table_name(trace_state, schema->cf_name()); + + rjson::value* exclusive_start_key = rjson::find(request, "ExclusiveStartKey"); + if (table_type == table_or_view_type::gsi && cl != db::consistency_level::LOCAL_ONE) { + return make_ready_future(api_error::validation( + "Consistent reads are not allowed on global indexes (GSI)")); + } + rjson::value* limit_json = rjson::find(request, "Limit"); + uint32_t limit = limit_json ? limit_json->GetUint64() : std::numeric_limits::max(); + if (limit <= 0) { + return make_ready_future(api_error::validation("Limit must be greater than 0")); + } + + const bool forward = get_bool_attribute(request, "ScanIndexForward", true); + + rjson::value* key_conditions = rjson::find(request, "KeyConditions"); + rjson::value* key_condition_expression = rjson::find(request, "KeyConditionExpression"); + std::unordered_set used_attribute_values; + std::unordered_set used_attribute_names; + if (key_conditions && key_condition_expression) { + throw api_error::validation("Query does not allow both " + "KeyConditions and KeyConditionExpression to be given together"); + } else if (!key_conditions && !key_condition_expression) { + throw api_error::validation("Query must have one of " + "KeyConditions or KeyConditionExpression"); + } + + const rjson::value* expression_attribute_names = rjson::find(request, "ExpressionAttributeNames"); + const rjson::value* expression_attribute_values = rjson::find(request, "ExpressionAttributeValues"); + + // exactly one of key_conditions or key_condition_expression + auto [partition_ranges, ck_bounds] = key_conditions + ? calculate_bounds_conditions(schema, *key_conditions) + : calculate_bounds_condition_expression(schema, *key_condition_expression, + expression_attribute_values, + used_attribute_values, + expression_attribute_names, + used_attribute_names, *_parsed_expression_cache); + + filter filter(*_parsed_expression_cache, request, filter::request_type::QUERY, + used_attribute_names, used_attribute_values); + + // A query is not allowed to filter on the partition key or the sort key. + for (const column_definition& cdef : schema->partition_key_columns()) { // just one + if (filter.filters_on(cdef.name_as_text())) { + return make_ready_future(api_error::validation( + format("QueryFilter can only contain non-primary key attributes: Partition key attribute: {}", cdef.name_as_text()))); + } + } + for (const column_definition& cdef : schema->clustering_key_columns()) { + if (filter.filters_on(cdef.name_as_text())) { + return make_ready_future(api_error::validation( + format("QueryFilter can only contain non-primary key attributes: Sort key attribute: {}", cdef.name_as_text()))); + } + // FIXME: this "break" can avoid listing some clustering key columns + // we added for GSIs just because they existed in the base table - + // but not in all cases. We still have issue #5320. + break; + } + + select_type select = parse_select(request, table_type); + + auto attrs_to_get = calculate_attrs_to_get(request, *_parsed_expression_cache, used_attribute_names, select); + verify_all_are_used(expression_attribute_names, used_attribute_names, "ExpressionAttributeNames", "Query"); + verify_all_are_used(expression_attribute_values, used_attribute_values, "ExpressionAttributeValues", "Query"); + query::partition_slice::option_set opts; + opts.set_if(!forward); + return do_query(_proxy, schema, exclusive_start_key, std::move(partition_ranges), std::move(ck_bounds), std::move(attrs_to_get), limit, cl, + std::move(filter), opts, client_state, _stats, std::move(trace_state), std::move(permit), _enforce_authorization, _warn_authorization); +} + +future> executor::describe_multi_item(schema_ptr schema, + const query::partition_slice&& slice, + shared_ptr selection, + foreign_ptr> query_result, + shared_ptr> attrs_to_get, + noncopyable_function item_callback) { + cql3::selection::result_set_builder builder(*selection, gc_clock::now()); + query::result_view::consume(*query_result, slice, cql3::selection::result_set_builder::visitor(builder, *schema, *selection)); + auto result_set = builder.build(); + std::vector ret; + for (auto& result_row : result_set->rows()) { + rjson::value item = rjson::empty_object(); + uint64_t item_length_in_bytes = 0; + describe_single_item(*selection, result_row, *attrs_to_get, item, &item_length_in_bytes); + if (item_callback) { + item_callback(item_length_in_bytes); + } + ret.push_back(std::move(item)); + co_await coroutine::maybe_yield(); + } + co_return ret; +} + +// describe_item() wraps the result of describe_single_item() by a map +// as needed by the GetItem request. It should not be used for other purposes, +// use describe_single_item() instead. +static rjson::value describe_item(schema_ptr schema, + const query::partition_slice& slice, + const cql3::selection::selection& selection, + const query::result& query_result, + const std::optional& attrs_to_get, + consumed_capacity_counter& consumed_capacity, + uint64_t& metric) { + std::optional opt_item = executor::describe_single_item(std::move(schema), slice, selection, std::move(query_result), attrs_to_get, &consumed_capacity._total_bytes); + rjson::value item_descr = rjson::empty_object(); + if (opt_item) { + rjson::add(item_descr, "Item", std::move(*opt_item)); + } + consumed_capacity.add_consumed_capacity_to_response_if_needed(item_descr); + metric += consumed_capacity.get_half_units(); + return item_descr; +} + +future executor::get_item(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request, std::unique_ptr& audit_info) { + _stats.api_operations.get_item++; + auto start_time = std::chrono::steady_clock::now(); + elogger.trace("Getting item {}", request); + + schema_ptr schema = get_table(_proxy, request); + lw_shared_ptr per_table_stats = get_stats_from_schema(_proxy, *schema); + per_table_stats->api_operations.get_item++; + tracing::add_alternator_table_name(trace_state, schema->cf_name()); + + rjson::value& query_key = request["Key"]; + db::consistency_level cl = get_read_consistency(request); + + maybe_audit(audit_info, audit::statement_category::QUERY, schema->ks_name(), schema->cf_name(), "GetItem", request, cl); + + partition_key pk = pk_from_json(query_key, schema); + dht::partition_range_vector partition_ranges{dht::partition_range(dht::decorate_key(*schema, pk))}; + + std::vector bounds; + if (schema->clustering_key_size() == 0) { + bounds.push_back(query::clustering_range::make_open_ended_both_sides()); + } else { + clustering_key ck = ck_from_json(query_key, schema); + bounds.push_back(query::clustering_range::make_singular(std::move(ck))); + } + check_key(query_key, schema); + + //TODO(sarna): It would be better to fetch only some attributes of the map, not all + auto regular_columns = + schema->regular_columns() | std::views::transform(&column_definition::id) + | std::ranges::to(); + + auto selection = cql3::selection::selection::wildcard(schema); + + auto partition_slice = query::partition_slice(std::move(bounds), {}, std::move(regular_columns), selection->get_query_options()); + auto command = ::make_lw_shared(schema->id(), schema->version(), partition_slice, _proxy.get_max_result_size(partition_slice), + query::tombstone_limit(_proxy.get_tombstone_limit())); + + std::unordered_set used_attribute_names; + auto attrs_to_get = calculate_attrs_to_get(request, *_parsed_expression_cache, used_attribute_names); + const rjson::value* expression_attribute_names = rjson::find(request, "ExpressionAttributeNames"); + verify_all_are_used(expression_attribute_names, used_attribute_names, "ExpressionAttributeNames", "GetItem"); + rcu_consumed_capacity_counter add_capacity(request, cl == db::consistency_level::LOCAL_QUORUM); + co_await verify_permission(_enforce_authorization, _warn_authorization, client_state, schema, auth::permission::SELECT, _stats); + service::storage_proxy::coordinator_query_result qr = + co_await _proxy.query( + schema, std::move(command), std::move(partition_ranges), cl, + service::storage_proxy::coordinator_query_options(executor::default_timeout(), std::move(permit), client_state, trace_state)); + per_table_stats->api_operations.get_item_latency.mark(std::chrono::steady_clock::now() - start_time); + _stats.api_operations.get_item_latency.mark(std::chrono::steady_clock::now() - start_time); + uint64_t rcu_half_units = 0; + rjson::value res = describe_item(schema, partition_slice, *selection, *qr.query_result, std::move(attrs_to_get), add_capacity, rcu_half_units); + per_table_stats->rcu_half_units_total += rcu_half_units; + _stats.rcu_half_units_total += rcu_half_units; + // Update item size metrics only if we found an item. + if (qr.query_result->row_count().value_or(0) > 0) { + per_table_stats->operation_sizes.get_item_op_size_kb.add(bytes_to_kb_ceil(add_capacity._total_bytes)); + } + co_return rjson::print(std::move(res)); +} + +future executor::batch_get_item(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request, std::unique_ptr& audit_info) { + // FIXME: In this implementation, an unbounded batch size can cause + // unbounded response JSON object to be buffered in memory, unbounded + // parallelism of the requests, and unbounded amount of non-preemptable + // work in the following loops. So we should limit the batch size, and/or + // the response size, as DynamoDB does. + _stats.api_operations.batch_get_item++; + rjson::value& request_items = request["RequestItems"]; + auto start_time = std::chrono::steady_clock::now(); + // We need to validate all the parameters before starting any asynchronous + // query, and fail the entire request on any parse error. So we parse all + // the input into our own vector "requests", each element a table_requests + // listing all the request aimed at a single table. For efficiency, inside + // each table_requests we further group together all reads going to the + // same partition, so we can later send them together. + bool should_add_rcu = rcu_consumed_capacity_counter::should_add_capacity(request); + struct table_requests { + schema_ptr schema; + db::consistency_level cl; + ::shared_ptr> attrs_to_get; + // clustering_keys keeps a sorted set of clustering keys. It must + // be sorted for the read below (see #10827). Additionally each + // clustering key is mapped to the original rjson::value "Key". + using clustering_keys = std::map; + std::unordered_map requests; + table_requests(schema_ptr s) + : schema(std::move(s)) + , requests(8, partition_key::hashing(*schema), partition_key::equality(*schema)) + {} + void add(rjson::value& key) { + auto pk = pk_from_json(key, schema); + auto it = requests.find(pk); + if (it == requests.end()) { + it = requests.emplace(pk, clustering_key::less_compare(*schema)).first; + } + auto ck = ck_from_json(key, schema); + if (auto [_, inserted] = it->second.emplace(ck, &key); !inserted) { + throw api_error::validation("Provided list of item keys contains duplicates"); + } + } + }; + std::vector requests; + uint batch_size = 0; + for (auto it = request_items.MemberBegin(); it != request_items.MemberEnd(); ++it) { + table_requests rs(get_table_from_batch_request(_proxy, it)); + tracing::add_alternator_table_name(trace_state, rs.schema->cf_name()); + rs.cl = get_read_consistency(it->value); + std::unordered_set used_attribute_names; + rs.attrs_to_get = ::make_shared>(calculate_attrs_to_get(it->value, *_parsed_expression_cache, used_attribute_names)); + const rjson::value* expression_attribute_names = rjson::find(it->value, "ExpressionAttributeNames"); + verify_all_are_used(expression_attribute_names, used_attribute_names,"ExpressionAttributeNames", "GetItem"); + auto& keys = (it->value)["Keys"]; + for (rjson::value& key : keys.GetArray()) { + rs.add(key); + check_key(key, rs.schema); + } + batch_size += rs.requests.size(); + requests.emplace_back(std::move(rs)); + } + + for (const table_requests& tr : requests) { + co_await verify_permission(_enforce_authorization, _warn_authorization, client_state, tr.schema, auth::permission::SELECT, _stats); + } + + _stats.api_operations.batch_get_item_batch_total += batch_size; + _stats.api_operations.batch_get_item_histogram.add(batch_size); + // If we got here, all "requests" are valid, so let's start the + // requests for the different partitions all in parallel. + std::vector>> response_futures; + std::vector consumed_rcu_half_units_per_table(requests.size()); + for (size_t i = 0; i < requests.size(); i++) { + const table_requests& rs = requests[i]; + bool is_quorum = rs.cl == db::consistency_level::LOCAL_QUORUM; + lw_shared_ptr per_table_stats = get_stats_from_schema(_proxy, *rs.schema); + per_table_stats->api_operations.batch_get_item_histogram.add(rs.requests.size()); + for (const auto& [pk, cks] : rs.requests) { + dht::partition_range_vector partition_ranges{dht::partition_range(dht::decorate_key(*rs.schema, pk))}; + std::vector bounds; + if (rs.schema->clustering_key_size() == 0) { + bounds.push_back(query::clustering_range::make_open_ended_both_sides()); + } else { + for (auto& ck : cks) { + bounds.push_back(query::clustering_range::make_singular(ck.first)); + } + } + auto regular_columns = + rs.schema->regular_columns() | std::views::transform(&column_definition::id) + | std::ranges::to(); + auto selection = cql3::selection::selection::wildcard(rs.schema); + auto partition_slice = query::partition_slice(std::move(bounds), {}, std::move(regular_columns), selection->get_query_options()); + auto command = ::make_lw_shared(rs.schema->id(), rs.schema->version(), partition_slice, _proxy.get_max_result_size(partition_slice), + query::tombstone_limit(_proxy.get_tombstone_limit())); + command->allow_limit = db::allow_per_partition_rate_limit::yes; + const auto item_callback = [is_quorum, per_table_stats, &rcus_per_table = consumed_rcu_half_units_per_table[i]](uint64_t size) { + rcus_per_table += rcu_consumed_capacity_counter::get_half_units(size, is_quorum); + // Update item size only if the item exists. + if (size > 0) { + per_table_stats->operation_sizes.batch_get_item_op_size_kb.add(bytes_to_kb_ceil(size)); + } + }; + future> f = _proxy.query(rs.schema, std::move(command), std::move(partition_ranges), rs.cl, + service::storage_proxy::coordinator_query_options(executor::default_timeout(), permit, client_state, trace_state)).then( + [schema = rs.schema, partition_slice = std::move(partition_slice), selection = std::move(selection), attrs_to_get = rs.attrs_to_get, item_callback = std::move(item_callback)] (service::storage_proxy::coordinator_query_result qr) mutable { + utils::get_local_injector().inject("alternator_batch_get_item", [] { throw std::runtime_error("batch_get_item injection"); }); + return describe_multi_item(std::move(schema), std::move(partition_slice), std::move(selection), std::move(qr.query_result), std::move(attrs_to_get), std::move(item_callback)); + }); + response_futures.push_back(std::move(f)); + } + } + + // Wait for all requests to complete, and then return the response. + // In case of full failure (no reads succeeded), an arbitrary error + // from one of the operations will be returned. + bool some_succeeded = false; + std::exception_ptr eptr; + std::set table_names; // for auditing + // FIXME: will_log() here doesn't pass keyspace/table, so keyspace-level audit + // filtering is bypassed — a batch spanning multiple tables is audited as a whole. + bool should_audit = _audit.local_is_initialized() && _audit.local().will_log(audit::statement_category::QUERY); + rjson::value response = rjson::empty_object(); + rjson::add(response, "Responses", rjson::empty_object()); + rjson::add(response, "UnprocessedKeys", rjson::empty_object()); + auto fut_it = response_futures.begin(); + rjson::value consumed_capacity = rjson::empty_array(); + for (size_t i = 0; i < requests.size(); i++) { + const table_requests& rs = requests[i]; + std::string table = table_name(*rs.schema); + if (should_audit) { + table_names.insert(table); + } + for (const auto& [_, cks] : rs.requests) { + auto& fut = *fut_it; + ++fut_it; + try { + std::vector results = co_await std::move(fut); + some_succeeded = true; + if (!response["Responses"].HasMember(table)) { + rjson::add_with_string_name(response["Responses"], table, rjson::empty_array()); + } + for (rjson::value& json : results) { + rjson::push_back(response["Responses"][table], std::move(json)); + } + } catch(...) { + eptr = std::current_exception(); + // This read of potentially several rows in one partition, + // failed. We need to add the row key(s) to UnprocessedKeys. + if (!response["UnprocessedKeys"].HasMember(table)) { + // Add the table's entry in UnprocessedKeys. Need to copy + // all the table's parameters from the request except the + // Keys field, which we start empty and then build below. + rjson::add_with_string_name(response["UnprocessedKeys"], table, rjson::empty_object()); + rjson::value& unprocessed_item = response["UnprocessedKeys"][table]; + rjson::value& request_item = request_items[table]; + for (auto it = request_item.MemberBegin(); it != request_item.MemberEnd(); ++it) { + if (it->name != "Keys") { + rjson::add_with_string_name(unprocessed_item, + rjson::to_string_view(it->name), rjson::copy(it->value)); + } + } + rjson::add_with_string_name(unprocessed_item, "Keys", rjson::empty_array()); + } + for (auto& ck : cks) { + rjson::push_back(response["UnprocessedKeys"][table]["Keys"], std::move(*ck.second)); + } + } + } + uint64_t rcu_half_units = consumed_rcu_half_units_per_table[i]; + _stats.rcu_half_units_total += rcu_half_units; + lw_shared_ptr per_table_stats = get_stats_from_schema(_proxy, *rs.schema); + per_table_stats->rcu_half_units_total += rcu_half_units; + if (should_add_rcu) { + rjson::value entry = rjson::empty_object(); + rjson::add(entry, "TableName", table); + rjson::add(entry, "CapacityUnits", rcu_half_units*0.5); + rjson::push_back(consumed_capacity, std::move(entry)); + } + } + + if (should_add_rcu) { + rjson::add(response, "ConsumedCapacity", std::move(consumed_capacity)); + } + elogger.trace("Unprocessed keys: {}", response["UnprocessedKeys"]); + // NOTE: Each table in the batch has its own CL (set by get_read_consistency()), + // but the audit entry records a single CL for the whole batch. We use ANY as a + // placeholder to indicate "mixed / not applicable". + // FIXME: Auditing is executed only for a complete success + maybe_audit(audit_info, audit::statement_category::QUERY, "", + print_names_for_audit(table_names), "BatchGetItem", request, db::consistency_level::ANY); + if (!some_succeeded && eptr) { + co_await coroutine::return_exception_ptr(std::move(eptr)); + } + auto duration = std::chrono::steady_clock::now() - start_time; + _stats.api_operations.batch_get_item_latency.mark(duration); + for (const table_requests& rs : requests) { + lw_shared_ptr per_table_stats = get_stats_from_schema(_proxy, *rs.schema); + per_table_stats->api_operations.batch_get_item_latency.mark(duration); + } + if (is_big(response)) { + co_return make_streamed(std::move(response)); + } else { + co_return rjson::print(std::move(response)); + } +} + +} // namespace alternator diff --git a/configure.py b/configure.py index 23b1822388..d1ce4ef2f8 100755 --- a/configure.py +++ b/configure.py @@ -1438,6 +1438,7 @@ alternator = [ 'alternator/controller.cc', 'alternator/server.cc', 'alternator/executor.cc', + 'alternator/executor_read.cc', 'alternator/stats.cc', 'alternator/serialization.cc', 'alternator/expressions.cc',