diff --git a/alternator/CMakeLists.txt b/alternator/CMakeLists.txt index 4cbe0691e4..515e598b09 100644 --- a/alternator/CMakeLists.txt +++ b/alternator/CMakeLists.txt @@ -9,6 +9,8 @@ target_sources(alternator controller.cc server.cc executor.cc + executor_read.cc + executor_util.cc stats.cc serialization.cc expressions.cc diff --git a/alternator/attribute_path.hh b/alternator/attribute_path.hh new file mode 100644 index 0000000000..8ae753c6ad --- /dev/null +++ b/alternator/attribute_path.hh @@ -0,0 +1,253 @@ +/* + * Copyright 2019-present ScyllaDB + */ + +/* + * SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.1 + */ + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "utils/rjson.hh" +#include "utils/overloaded_functor.hh" +#include "alternator/error.hh" +#include "alternator/expressions_types.hh" + +namespace alternator { + +// An attribute_path_map object is used to hold data for various attributes +// paths (parsed::path) in a hierarchy of attribute paths. Each attribute path +// has a root attribute, and then modified by member and index operators - +// for example in "a.b[2].c" we have "a" as the root, then ".b" member, then +// "[2]" index, and finally ".c" member. +// Data can be added to an attribute_path_map using the add() function, but +// requires that attributes with data not be *overlapping* or *conflicting*: +// +// 1. Two attribute paths which are identical or an ancestor of one another +// are considered *overlapping* and not allowed. If a.b.c has data, +// we can't add more data in a.b.c or any of its descendants like a.b.c.d. +// +// 2. Two attribute paths which need the same parent to have both a member and +// an index are considered *conflicting* and not allowed. E.g., if a.b has +// data, you can't add a[1]. The meaning of adding both would be that the +// attribute a is both a map and an array, which isn't sensible. 
+// +// These two requirements are common to the two places where Alternator uses +// this abstraction to describe how a hierarchical item is to be transformed: +// +// 1. In ProjectExpression: for filtering from a full top-level attribute +// only the parts for which user asked in ProjectionExpression. +// +// 2. In UpdateExpression: for taking the previous value of a top-level +// attribute, and modifying it based on the instructions in the user +// wrote in UpdateExpression. + +template +class attribute_path_map_node { +public: + using data_t = T; + // We need the extra unique_ptr<> here because libstdc++ unordered_map + // doesn't work with incomplete types :-( + using members_t = std::unordered_map>>; + // The indexes list is sorted because DynamoDB requires handling writes + // beyond the end of a list in index order. + using indexes_t = std::map>>; + // The prohibition on "overlap" and "conflict" explained above means + // That only one of data, members or indexes is non-empty. + std::optional> _content; + + bool is_empty() const { return !_content; } + bool has_value() const { return _content && std::holds_alternative(*_content); } + bool has_members() const { return _content && std::holds_alternative(*_content); } + bool has_indexes() const { return _content && std::holds_alternative(*_content); } + // get_members() assumes that has_members() is true + members_t& get_members() { return std::get(*_content); } + const members_t& get_members() const { return std::get(*_content); } + indexes_t& get_indexes() { return std::get(*_content); } + const indexes_t& get_indexes() const { return std::get(*_content); } + T& get_value() { return std::get(*_content); } + const T& get_value() const { return std::get(*_content); } +}; + +template +using attribute_path_map = std::unordered_map>; + +using attrs_to_get_node = attribute_path_map_node; +// attrs_to_get lists which top-level attribute are needed, and possibly also +// which part of the top-level attribute is 
really needed (when nested +// attribute paths appeared in the query). +// Most code actually uses optional. There, a disengaged +// optional means we should get all attributes, not specific ones. +using attrs_to_get = attribute_path_map; + +// takes a given JSON value and drops its parts which weren't asked to be +// kept. It modifies the given JSON value, or returns false to signify that +// the entire object should be dropped. +// Note that The JSON value is assumed to be encoded using the DynamoDB +// conventions - i.e., it is really a map whose key has a type string, +// and the value is the real object. +template +bool hierarchy_filter(rjson::value& val, const attribute_path_map_node& h) { + if (!val.IsObject() || val.MemberCount() != 1) { + // This shouldn't happen. We shouldn't have stored malformed objects. + // But today Alternator does not validate the structure of nested + // documents before storing them, so this can happen on read. + throw api_error::internal(format("Malformed value object read: {}", val)); + } + const char* type = val.MemberBegin()->name.GetString(); + rjson::value& v = val.MemberBegin()->value; + if (h.has_members()) { + const auto& members = h.get_members(); + if (type[0] != 'M' || !v.IsObject()) { + // If v is not an object (dictionary, map), none of the members + // can match. + return false; + } + rjson::value newv = rjson::empty_object(); + for (auto it = v.MemberBegin(); it != v.MemberEnd(); ++it) { + std::string attr = rjson::to_string(it->name); + auto x = members.find(attr); + if (x != members.end()) { + if (x->second) { + // Only a part of this attribute is to be filtered, do it. 
+ if (hierarchy_filter(it->value, *x->second)) { + // because newv started empty and attr are unique + // (keys of v), we can use add() here + rjson::add_with_string_name(newv, attr, std::move(it->value)); + } + } else { + // The entire attribute is to be kept + rjson::add_with_string_name(newv, attr, std::move(it->value)); + } + } + } + if (newv.MemberCount() == 0) { + return false; + } + v = newv; + } else if (h.has_indexes()) { + const auto& indexes = h.get_indexes(); + if (type[0] != 'L' || !v.IsArray()) { + return false; + } + rjson::value newv = rjson::empty_array(); + const auto& a = v.GetArray(); + for (unsigned i = 0; i < v.Size(); i++) { + auto x = indexes.find(i); + if (x != indexes.end()) { + if (x->second) { + if (hierarchy_filter(a[i], *x->second)) { + rjson::push_back(newv, std::move(a[i])); + } + } else { + // The entire attribute is to be kept + rjson::push_back(newv, std::move(a[i])); + } + } + } + if (newv.Size() == 0) { + return false; + } + v = newv; + } + return true; +} + +// Add a path to an attribute_path_map. Throws a validation error if the path +// "overlaps" with one already in the filter (one is a sub-path of the other) +// or "conflicts" with it (both a member and index is requested). +template +void attribute_path_map_add(const char* source, attribute_path_map& map, const parsed::path& p, T value = {}) { + using node = attribute_path_map_node; + // The first step is to look for the top-level attribute (p.root()): + auto it = map.find(p.root()); + if (it == map.end()) { + if (p.has_operators()) { + it = map.emplace(p.root(), node {std::nullopt}).first; + } else { + (void) map.emplace(p.root(), node {std::move(value)}).first; + // Value inserted for top-level node. We're done. + return; + } + } else if(!p.has_operators()) { + // If p is top-level and we already have it or a part of it + // in map, it's a forbidden overlapping path. 
+ throw api_error::validation(fmt::format( + "Invalid {}: two document paths overlap at {}", source, p.root())); + } else if (it->second.has_value()) { + // If we're here, it != map.end() && p.has_operators && it->second.has_value(). + // This means the top-level attribute already has a value, and we're + // trying to add a non-top-level value. It's an overlap. + throw api_error::validation(fmt::format("Invalid {}: two document paths overlap at {}", source, p.root())); + } + node* h = &it->second; + // The second step is to walk h from the top-level node to the inner node + // where we're supposed to insert the value: + for (const auto& op : p.operators()) { + std::visit(overloaded_functor { + [&] (const std::string& member) { + if (h->is_empty()) { + *h = node {typename node::members_t()}; + } else if (h->has_indexes()) { + throw api_error::validation(format("Invalid {}: two document paths conflict at {}", source, p)); + } else if (h->has_value()) { + throw api_error::validation(format("Invalid {}: two document paths overlap at {}", source, p)); + } + typename node::members_t& members = h->get_members(); + auto it = members.find(member); + if (it == members.end()) { + it = members.insert({member, std::make_unique()}).first; + } + h = it->second.get(); + }, + [&] (unsigned index) { + if (h->is_empty()) { + *h = node {typename node::indexes_t()}; + } else if (h->has_members()) { + throw api_error::validation(format("Invalid {}: two document paths conflict at {}", source, p)); + } else if (h->has_value()) { + throw api_error::validation(format("Invalid {}: two document paths overlap at {}", source, p)); + } + typename node::indexes_t& indexes = h->get_indexes(); + auto it = indexes.find(index); + if (it == indexes.end()) { + it = indexes.insert({index, std::make_unique()}).first; + } + h = it->second.get(); + } + }, op); + } + // Finally, insert the value in the node h. 
+ if (h->is_empty()) { + *h = node {std::move(value)}; + } else { + throw api_error::validation(format("Invalid {}: two document paths overlap at {}", source, p)); + } +} + +// A very simplified version of the above function for the special case of +// adding only top-level attribute. It's not only simpler, we also use a +// different error message, referring to a "duplicate attribute" instead of +// "overlapping paths". DynamoDB also has this distinction (errors in +// AttributesToGet refer to duplicates, not overlaps, but errors in +// ProjectionExpression refer to overlap - even if it's an exact duplicate). +template +void attribute_path_map_add(const char* source, attribute_path_map& map, const std::string& attr, T value = {}) { + using node = attribute_path_map_node; + auto it = map.find(attr); + if (it == map.end()) { + map.emplace(attr, node {std::move(value)}); + } else { + throw api_error::validation(fmt::format( + "Invalid {}: Duplicate attribute: {}", source, attr)); + } +} + +} // namespace alternator diff --git a/alternator/controller.cc b/alternator/controller.cc index f142292267..e55c3d1d6d 100644 --- a/alternator/controller.cc +++ b/alternator/controller.cc @@ -18,6 +18,7 @@ #include "service/memory_limiter.hh" #include "auth/service.hh" #include "service/qos/service_level_controller.hh" +#include "vector_search/vector_store_client.hh" using namespace seastar; @@ -35,6 +36,7 @@ controller::controller( sharded& memory_limiter, sharded& auth_service, sharded& sl_controller, + sharded& vsc, const db::config& config, seastar::scheduling_group sg) : protocol_server(sg) @@ -47,6 +49,7 @@ controller::controller( , _memory_limiter(memory_limiter) , _auth_service(auth_service) , _sl_controller(sl_controller) + , _vsc(vsc) , _config(config) { } @@ -92,7 +95,7 @@ future<> controller::start_server() { return cfg.alternator_timeout_in_ms; }; _executor.start(std::ref(_gossiper), std::ref(_proxy), std::ref(_ss), std::ref(_mm), std::ref(_sys_dist_ks), - 
sharded_parameter(get_cdc_metadata, std::ref(_cdc_gen_svc)), _ssg.value(), + sharded_parameter(get_cdc_metadata, std::ref(_cdc_gen_svc)), std::ref(_vsc), _ssg.value(), sharded_parameter(get_timeout_in_ms, std::ref(_config))).get(); _server.start(std::ref(_executor), std::ref(_proxy), std::ref(_gossiper), std::ref(_auth_service), std::ref(_sl_controller)).get(); // Note: from this point on, if start_server() throws for any reason, diff --git a/alternator/controller.hh b/alternator/controller.hh index 947ba55090..d7a73b8f69 100644 --- a/alternator/controller.hh +++ b/alternator/controller.hh @@ -43,6 +43,10 @@ namespace qos { class service_level_controller; } +namespace vector_search { +class vector_store_client; +} + namespace alternator { // This is the official DynamoDB API version. @@ -65,6 +69,7 @@ class controller : public protocol_server { sharded& _memory_limiter; sharded& _auth_service; sharded& _sl_controller; + sharded& _vsc; const db::config& _config; std::vector _listen_addresses; @@ -83,6 +88,7 @@ public: sharded& memory_limiter, sharded& auth_service, sharded& sl_controller, + sharded& vsc, const db::config& config, seastar::scheduling_group sg); diff --git a/alternator/executor.cc b/alternator/executor.cc index e2900faa2a..fa7e58c5ea 100644 --- a/alternator/executor.cc +++ b/alternator/executor.cc @@ -9,12 +9,14 @@ #include #include #include "alternator/executor.hh" +#include "alternator/executor_util.hh" #include "alternator/consumed_capacity.hh" #include "auth/permission.hh" #include "auth/resource.hh" #include "cdc/log.hh" #include "cdc/cdc_options.hh" #include "auth/service.hh" +#include "cql3/cql3_type.hh" #include "db/config.hh" #include "db/view/view_build_status.hh" #include "locator/tablets.hh" @@ -63,12 +65,14 @@ #include "types/types.hh" #include "db/system_keyspace.hh" #include "cql3/statements/ks_prop_defs.hh" +#include "cql3/statements/index_target.hh" +#include "index/secondary_index.hh" #include "alternator/ttl_tag.hh" +#include 
"vector_search/vector_store_client.hh" +#include "utils/simple_value_with_expiry.hh" using namespace std::chrono_literals; -static logging::logger elogger("alternator-executor"); - namespace std { template <> struct hash> { size_t operator () (const std::pair& p) const { @@ -79,6 +83,8 @@ namespace std { namespace alternator { +logging::logger elogger("alternator-executor"); + // Alternator-specific table properties stored as hidden table tags: // // Alternator doesn't keep its own records of which Alternator tables exist @@ -117,7 +123,7 @@ extern const sstring TTL_TAG_KEY("system:ttl_attribute"); // the SPURIOUS_RANGE_KEY_ADDED_TO_GSI_AND_USER_DIDNT_SPECIFY_RANGE_KEY_TAG_KEY is set the user didn't specify any key and // base table's keys were added as range keys. In all other cases either the first key is the user specified key, // following ones are base table's keys added as needed or range key list will be empty. -static const sstring SPURIOUS_RANGE_KEY_ADDED_TO_GSI_AND_USER_DIDNT_SPECIFY_RANGE_KEY_TAG_KEY("system:spurious_range_key_added_to_gsi_and_user_didnt_specify_range_key"); +extern const sstring SPURIOUS_RANGE_KEY_ADDED_TO_GSI_AND_USER_DIDNT_SPECIFY_RANGE_KEY_TAG_KEY("system:spurious_range_key_added_to_gsi_and_user_didnt_specify_range_key"); // The following tags also have the "system:" prefix but are NOT used // by Alternator to store table properties - only the user ever writes to @@ -176,78 +182,12 @@ void executor::maybe_audit( static lw_shared_ptr create_keyspace_metadata(std::string_view keyspace_name, service::storage_proxy& sp, gms::gossiper& gossiper, api::timestamp_type, const std::map& tags_map, const gms::feature_service& feat, const db::tablets_mode_t::mode tablets_mode); -static map_type attrs_type() { - static thread_local auto t = map_type_impl::get_instance(utf8_type, bytes_type, true); - return t; -} - static const column_definition& attrs_column(const schema& schema) { const column_definition* cdef = 
schema.get_column_definition(bytes(executor::ATTRS_COLUMN_NAME)); throwing_assert(cdef); return *cdef; } - -static lw_shared_ptr get_stats_from_schema(service::storage_proxy& sp, const schema& schema) { - try { - replica::table& table = sp.local_db().find_column_family(schema.id()); - if (!table.get_stats().alternator_stats) { - table.get_stats().alternator_stats = seastar::make_shared(schema.ks_name(), schema.cf_name()); - } - return table.get_stats().alternator_stats->_stats; - } catch (std::runtime_error&) { - // If we're here it means that a table we are currently working on was deleted before the - // operation completed, returning a temporary object is fine, if the table get deleted so will its metrics - return make_lw_shared(); - } -} - -executor::body_writer make_streamed(rjson::value&& value) { - return [value = std::move(value)](output_stream&& _out) mutable -> future<> { - auto out = std::move(_out); - std::exception_ptr ex; - try { - co_await rjson::print(value, out); - } catch (...) { - ex = std::current_exception(); - } - co_await out.close(); - co_await rjson::destroy_gently(std::move(value)); - if (ex) { - co_await coroutine::return_exception_ptr(std::move(ex)); - } - }; -} - -// make_streamed_with_extra_array() is variant of make_streamed() above, which -// builds a streaming response (a function writing to an output stream) from a -// JSON object (rjson::value) but adds to it at the end an additional array. -// The extra array is given a separate chunked_vector to avoid putting it -// inside the rjson::value - because RapidJSON does contiguous allocations for -// arrays which we want to avoid for potentially long arrays in Query/Scan -// responses (see #23535). -// If we ever fix RapidJSON to avoid contiguous allocations for arrays, or -// replace it entirely (#24458), we can remove this function and the function -// rjson::print_with_extra_array() which it calls. 
-executor::body_writer make_streamed_with_extra_array(rjson::value&& value, - std::string array_name, utils::chunked_vector&& array) { - return [value = std::move(value), array_name = std::move(array_name), array = std::move(array)](output_stream&& _out) mutable -> future<> { - auto out = std::move(_out); - std::exception_ptr ex; - try { - co_await rjson::print_with_extra_array(value, array_name, array, out); - } catch (...) { - ex = std::current_exception(); - } - co_await out.close(); - co_await rjson::destroy_gently(std::move(value)); - // TODO: can/should we also destroy the array gently? - if (ex) { - co_await coroutine::return_exception_ptr(std::move(ex)); - } - }; -} - // This function throws api_error::validation if input value is not an object. static void validate_is_object(const rjson::value& value, const char* caller) { if (!value.IsObject()) { @@ -332,6 +272,7 @@ executor::executor(gms::gossiper& gossiper, service::migration_manager& mm, db::system_distributed_keyspace& sdks, cdc::metadata& cdc_metadata, + vector_search::vector_store_client& vsc, smp_service_group ssg, utils::updateable_value default_timeout_in_ms) : _gossiper(gossiper), @@ -340,6 +281,7 @@ executor::executor(gms::gossiper& gossiper, _mm(mm), _sdks(sdks), _cdc_metadata(cdc_metadata), + _vsc(vsc), _enforce_authorization(_proxy.data_dictionary().get_config().alternator_enforce_authorization), _warn_authorization(_proxy.data_dictionary().get_config().alternator_warn_authorization), _audit(audit::audit::audit_instance()), @@ -384,266 +326,11 @@ void executor::supplement_table_info(rjson::value& descr, const schema& schema, executor::supplement_table_stream_info(descr, schema, sp); } -// We would have liked to support table names up to 255 bytes, like DynamoDB. -// But Scylla creates a directory whose name is the table's name plus 33 -// bytes (dash and UUID), and since directory names are limited to 255 bytes, -// we need to limit table names to 222 bytes, instead of 255. 
-// See https://github.com/scylladb/scylla/issues/4480 -// We actually have two limits here, -// * max_table_name_length is the limit that Alternator will impose on names -// of new Alternator tables. -// * max_auxiliary_table_name_length is the potentially higher absolute limit -// that Scylla imposes on the names of auxiliary tables that Alternator -// wants to create internally - i.e. materialized views or CDC log tables. -// The second limit might mean that it is not possible to add a GSI to an -// existing table, because the name of the new auxiliary table may go over -// the limit. The second limit is also one of the reasons why the first limit -// is set lower than 222 - to have room to enable streams which add the extra -// suffix "_scylla_cdc_log" to the table name. -static constexpr int max_table_name_length = 192; -static constexpr int max_auxiliary_table_name_length = 222; - -static bool valid_table_name_chars(std::string_view name) { - for (auto c : name) { - if ((c < 'a' || c > 'z') && - (c < 'A' || c > 'Z') && - (c < '0' || c > '9') && - c != '_' && - c != '-' && - c != '.') { - return false; - } - } - return true; -} - -// validate_table_name() validates the TableName parameter in a request - it -// should only be called in CreateTable or when a request looking for an -// existing table failed to find it. validate_table_name() throws the -// appropriate api_error if this validation fails. -// The DynamoDB developer guide, https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/HowItWorks.NamingRulesDataTypes.html#HowItWorks.NamingRules -// specifies that table "names must be between 3 and 255 characters long and -// can contain only the following characters: a-z, A-Z, 0-9, _ (underscore), -// - (dash), . (dot)". However, Alternator only allows max_table_name_length -// characters (see above) - not 255. 
-static void validate_table_name(std::string_view name) { - if (name.length() < 3 || name.length() > max_table_name_length) { - throw api_error::validation( - format("TableName must be at least 3 characters long and at most {} characters long", max_table_name_length)); - } - if (!valid_table_name_chars(name)) { - throw api_error::validation( - "TableName must satisfy regular expression pattern: [a-zA-Z0-9_.-]+"); - } -} - -// Validate that a CDC log table could be created for the base table with a -// given table_name, and if not, throw a user-visible api_error::validation. -// It is not possible to create a CDC log table if the table name is so long -// that adding the 15-character suffix "_scylla_cdc_log" (cdc_log_suffix) -// makes it go over max_auxiliary_table_name_length. -// Note that if max_table_name_length is set to less than 207 (which is -// max_auxiliary_table_name_length-15), then this function will never -// fail. However, it's still important to call it in UpdateTable, in case -// we have pre-existing tables with names longer than this to avoid #24598. -static void validate_cdc_log_name_length(std::string_view table_name) { - if (cdc::log_name(table_name).length() > max_auxiliary_table_name_length) { - // CDC will add cdc_log_suffix ("_scylla_cdc_log") to the table name - // to create its log table, and this will exceed the maximum allowed - // length. To provide a more helpful error message, we assume that - // cdc::log_name() always adds a suffix of the same length. - int suffix_len = cdc::log_name(table_name).length() - table_name.length(); - throw api_error::validation(fmt::format("Streams cannot be enabled to a table whose name is longer than {} characters: {}", - max_auxiliary_table_name_length - suffix_len, table_name)); - } -} - -// In DynamoDB index names are local to a table, while in Scylla, materialized -// view names are global (in a keyspace). 
So we need to compose a unique name -// for the view taking into account both the table's name and the index name. -// We concatenate the table and index name separated by a delim character -// (a character not allowed by DynamoDB in ordinary table names, default: ":"). -// The downside of this approach is that it limits the sum of the lengths, -// instead of each component individually as DynamoDB does. -// The view_name() function assumes the table_name has already been validated -// but validates the legality of index_name and the combination of both. -static std::string view_name(std::string_view table_name, std::string_view index_name, const std::string& delim = ":", bool validate_len = true) { - if (index_name.length() < 3) { - throw api_error::validation("IndexName must be at least 3 characters long"); - } - if (!valid_table_name_chars(index_name)) { - throw api_error::validation( - fmt::format("IndexName '{}' must satisfy regular expression pattern: [a-zA-Z0-9_.-]+", index_name)); - } - std::string ret = std::string(table_name) + delim + std::string(index_name); - if (ret.length() > max_auxiliary_table_name_length && validate_len) { - throw api_error::validation( - fmt::format("The total length of TableName ('{}') and IndexName ('{}') cannot exceed {} characters", - table_name, index_name, max_auxiliary_table_name_length - delim.size())); - } - return ret; -} - -static std::string lsi_name(std::string_view table_name, std::string_view index_name, bool validate_len = true) { - return view_name(table_name, index_name, "!:", validate_len); -} - -/** Extract table name from a request. - * Most requests expect the table's name to be listed in a "TableName" field. - * This convenience function returns the name or api_error in case the - * table name is missing or not a string. 
- */ -static std::optional find_table_name(const rjson::value& request) { - const rjson::value* table_name_value = rjson::find(request, "TableName"); - if (!table_name_value) { - return std::nullopt; - } - if (!table_name_value->IsString()) { - throw api_error::validation("Non-string TableName field in request"); - } - std::string table_name = rjson::to_string(*table_name_value); - return table_name; -} - -static std::string get_table_name(const rjson::value& request) { - auto name = find_table_name(request); - if (!name) { - throw api_error::validation("Missing TableName field in request"); - } - return *name; -} - -/** Extract table schema from a request. - * Many requests expect the table's name to be listed in a "TableName" field - * and need to look it up as an existing table. This convenience function - * does this, with the appropriate validation and api_error in case the table - * name is missing, invalid or the table doesn't exist. If everything is - * successful, it returns the table's schema. - */ -schema_ptr executor::find_table(service::storage_proxy& proxy, const rjson::value& request) { - auto table_name = find_table_name(request); - if (!table_name) { - return nullptr; - } - return find_table(proxy, *table_name); -} - -schema_ptr executor::find_table(service::storage_proxy& proxy, std::string_view table_name) { - try { - return proxy.data_dictionary().find_schema(sstring(executor::KEYSPACE_NAME_PREFIX) + sstring(table_name), table_name); - } catch(data_dictionary::no_such_column_family&) { - // DynamoDB returns validation error even when table does not exist - // and the table name is invalid. 
- validate_table_name(table_name); - - throw api_error::resource_not_found( - fmt::format("Requested resource not found: Table: {} not found", table_name)); - } -} - -schema_ptr get_table(service::storage_proxy& proxy, const rjson::value& request) { - auto schema = executor::find_table(proxy, request); - if (!schema) { - // if we get here then the name was missing, since syntax or missing actual CF - // checks throw. Slow path, but just call get_table_name to generate exception. - get_table_name(request); - } - return schema; -} - -// try_get_internal_table() handles the special case that the given table_name -// begins with INTERNAL_TABLE_PREFIX (".scylla.alternator."). In that case, -// this function assumes that the rest of the name refers to an internal -// Scylla table (e.g., system table) and returns the schema of that table - -// or an exception if it doesn't exist. Otherwise, if table_name does not -// start with INTERNAL_TABLE_PREFIX, this function returns an empty schema_ptr -// and the caller should look for a normal Alternator table with that name. -static schema_ptr try_get_internal_table(data_dictionary::database db, std::string_view table_name) { - size_t it = table_name.find(executor::INTERNAL_TABLE_PREFIX); - if (it != 0) { - return schema_ptr{}; - } - table_name.remove_prefix(executor::INTERNAL_TABLE_PREFIX.size()); - size_t delim = table_name.find_first_of('.'); - if (delim == std::string_view::npos) { - return schema_ptr{}; - } - std::string_view ks_name = table_name.substr(0, delim); - table_name.remove_prefix(ks_name.size() + 1); - // Only internal keyspaces can be accessed to avoid leakage - auto ks = db.try_find_keyspace(ks_name); - if (!ks || !ks->is_internal()) { - return schema_ptr{}; - } - try { - return db.find_schema(ks_name, table_name); - } catch (data_dictionary::no_such_column_family&) { - // DynamoDB returns validation error even when table does not exist - // and the table name is invalid. 
- validate_table_name(table_name); - throw api_error::resource_not_found( - fmt::format("Requested resource not found: Internal table: {}.{} not found", ks_name, table_name)); - } -} - -// get_table_or_view() is similar to to get_table(), except it returns either -// a table or a materialized view from which to read, based on the TableName -// and optional IndexName in the request. Only requests like Query and Scan -// which allow IndexName should use this function. -enum class table_or_view_type { base, lsi, gsi }; -static std::pair -get_table_or_view(service::storage_proxy& proxy, const rjson::value& request) { - table_or_view_type type = table_or_view_type::base; - std::string table_name = get_table_name(request); - - if (schema_ptr s = try_get_internal_table(proxy.data_dictionary(), table_name)) { - return {s, type}; - } - - std::string keyspace_name = executor::KEYSPACE_NAME_PREFIX + table_name; - const rjson::value* index_name = rjson::find(request, "IndexName"); - std::string orig_table_name; - if (index_name) { - if (index_name->IsString()) { - orig_table_name = std::move(table_name); - table_name = view_name(orig_table_name, rjson::to_string_view(*index_name)); - type = table_or_view_type::gsi; - } else { - throw api_error::validation( - fmt::format("Non-string IndexName '{}'", rjson::to_string_view(*index_name))); - } - // If no tables for global indexes were found, the index may be local - if (!proxy.data_dictionary().has_schema(keyspace_name, table_name)) { - type = table_or_view_type::lsi; - table_name = lsi_name(orig_table_name, rjson::to_string_view(*index_name)); - } - } - - try { - return { proxy.data_dictionary().find_schema(keyspace_name, table_name), type }; - } catch(data_dictionary::no_such_column_family&) { - if (index_name) { - // DynamoDB returns a different error depending on whether the - // base table doesn't exist (ResourceNotFoundException) or it - // does exist but the index does not (ValidationException). 
- if (proxy.data_dictionary().has_schema(keyspace_name, orig_table_name)) { - throw api_error::validation( - fmt::format("Requested resource not found: Index '{}' for table '{}'", rjson::to_string_view(*index_name), orig_table_name)); - } else { - throw api_error::resource_not_found( - fmt::format("Requested resource not found: Table: {} not found", orig_table_name)); - } - } else { - throw api_error::resource_not_found( - fmt::format("Requested resource not found: Table: {} not found", table_name)); - } - } -} - // get_table_for_write() is similar to get_table(), but additionally, if the // configuration allows this, may also allow writing to system table with -// prefix INTERNAL_TABLE_PREFIX. This is analogous to the function -// get_table_or_view() above which allows *reading* internal tables. +// prefix INTERNAL_TABLE_PREFIX. See also get_table_or_view() in +// executor_read.cc which allows *reading* internal tables by the Query +// operation. static schema_ptr get_table_for_write(service::storage_proxy& proxy, const rjson::value& request) { std::string table_name = get_table_name(request); if (schema_ptr s = try_get_internal_table(proxy.data_dictionary(), table_name)) { @@ -655,90 +342,7 @@ static schema_ptr get_table_for_write(service::storage_proxy& proxy, const rjson } return s; } - return executor::find_table(proxy, table_name); -} - -// Convenience function for getting the value of a string attribute, or a -// default value if it is missing. If the attribute exists, but is not a -// string, a descriptive api_error is thrown. 
-static std::string get_string_attribute(const rjson::value& value, std::string_view attribute_name, const char* default_return) { - const rjson::value* attribute_value = rjson::find(value, attribute_name); - if (!attribute_value) - return default_return; - if (!attribute_value->IsString()) { - throw api_error::validation(fmt::format("Expected string value for attribute {}, got: {}", - attribute_name, value)); - } - return rjson::to_string(*attribute_value); -} - -// Convenience function for getting the value of a boolean attribute, or a -// default value if it is missing. If the attribute exists, but is not a -// bool, a descriptive api_error is thrown. -static bool get_bool_attribute(const rjson::value& value, std::string_view attribute_name, bool default_return) { - const rjson::value* attribute_value = rjson::find(value, attribute_name); - if (!attribute_value) { - return default_return; - } - if (!attribute_value->IsBool()) { - throw api_error::validation(fmt::format("Expected boolean value for attribute {}, got: {}", - attribute_name, value)); - } - return attribute_value->GetBool(); -} - -// Convenience function for getting the value of an integer attribute, or -// an empty optional if it is missing. If the attribute exists, but is not -// an integer, a descriptive api_error is thrown. -static std::optional get_int_attribute(const rjson::value& value, std::string_view attribute_name) { - const rjson::value* attribute_value = rjson::find(value, attribute_name); - if (!attribute_value) - return {}; - if (!attribute_value->IsInt()) { - throw api_error::validation(fmt::format("Expected integer value for attribute {}, got: {}", - attribute_name, value)); - } - return attribute_value->GetInt(); -} - -// Sets a KeySchema object inside the given JSON parent describing the key -// attributes of the given schema as being either HASH or RANGE keys. 
-// Additionally, adds to a given map mappings between the key attribute -// names and their type (as a DynamoDB type string). -void executor::describe_key_schema(rjson::value& parent, const schema& schema, std::unordered_map* attribute_types, const std::map *tags) { - rjson::value key_schema = rjson::empty_array(); - const bool ignore_range_keys_as_spurious = tags != nullptr && tags->contains(SPURIOUS_RANGE_KEY_ADDED_TO_GSI_AND_USER_DIDNT_SPECIFY_RANGE_KEY_TAG_KEY); - - for (const column_definition& cdef : schema.partition_key_columns()) { - rjson::value key = rjson::empty_object(); - rjson::add(key, "AttributeName", rjson::from_string(cdef.name_as_text())); - rjson::add(key, "KeyType", "HASH"); - rjson::push_back(key_schema, std::move(key)); - if (attribute_types) { - (*attribute_types)[cdef.name_as_text()] = type_to_string(cdef.type); - } - } - if (!ignore_range_keys_as_spurious) { - // NOTE: user requested key (there can be at most one) will always come first. - // There might be more keys following it, which were added, but those were - // not requested by the user, so we ignore them. 
- for (const column_definition& cdef : schema.clustering_key_columns()) { - rjson::value key = rjson::empty_object(); - rjson::add(key, "AttributeName", rjson::from_string(cdef.name_as_text())); - rjson::add(key, "KeyType", "RANGE"); - rjson::push_back(key_schema, std::move(key)); - if (attribute_types) { - (*attribute_types)[cdef.name_as_text()] = type_to_string(cdef.type); - } - break; - } - } - rjson::add(parent, "KeySchema", std::move(key_schema)); - -} - -void executor::describe_key_schema(rjson::value& parent, const schema& schema, std::unordered_map& attribute_types, const std::map *tags) { - describe_key_schema(parent, schema, &attribute_types, tags); + return find_table(proxy, table_name); } static rjson::value generate_arn_for_table(const schema& schema) { @@ -921,7 +525,7 @@ future executor::fill_table_description(schema_ptr schema, table_s rjson::add(table_description, "CreationDateTime", rjson::value(creation_timestamp)); std::unordered_map key_attribute_types; // Add base table's KeySchema and collect types for AttributeDefinitions: - executor::describe_key_schema(table_description, *schema, key_attribute_types, tags_ptr); + describe_key_schema(table_description, *schema, &key_attribute_types, tags_ptr); if (!t.views().empty()) { rjson::value gsi_array = rjson::empty_array(); rjson::value lsi_array = rjson::empty_array(); @@ -937,7 +541,7 @@ future executor::fill_table_description(schema_ptr schema, table_s rjson::add(view_entry, "IndexName", rjson::from_string(index_name)); rjson::add(view_entry, "IndexArn", generate_arn_for_index(*schema, index_name)); // Add index's KeySchema and collect types for AttributeDefinitions: - executor::describe_key_schema(view_entry, *vptr, key_attribute_types, db::get_tags_of_table(vptr)); + describe_key_schema(view_entry, *vptr, &key_attribute_types, db::get_tags_of_table(vptr)); // Add projection type rjson::value projection = rjson::empty_object(); rjson::add(projection, "ProjectionType", "ALL"); @@ -969,6 +573,58 
@@ future executor::fill_table_description(schema_ptr schema, table_s rjson::add(table_description, "GlobalSecondaryIndexes", std::move(gsi_array)); } } + // List vector indexes, if this table has any: + rjson::value vector_index_array = rjson::empty_array(); + abort_on_expiry vector_index_status_aoe(executor::default_timeout()); + for (const index_metadata& im : schema->indices()) { + const auto& opts = im.options(); + auto class_it = opts.find(db::index::secondary_index::custom_class_option_name); + if (class_it == opts.end() || class_it->second != "vector_index") { + continue; + } + rjson::value entry = rjson::empty_object(); + rjson::add(entry, "IndexName", rjson::from_string(im.name())); + rjson::value vector_attribute = rjson::empty_object(); + auto target_it = opts.find(cql3::statements::index_target::target_option_name); + if (target_it != opts.end()) { + rjson::add(vector_attribute, "AttributeName", rjson::from_string(target_it->second)); + } + auto dims_it = opts.find("dimensions"); + if (dims_it != opts.end()) { + try { + rjson::add(vector_attribute, "Dimensions", std::stoi(dims_it->second)); + } catch (const std::logic_error&) { + // This should never happen, because the dimensions option + // is validated on index creation + on_internal_error(elogger, fmt::format("Unexpected non-integer dimensions value '{}' for vector index '{}'", dims_it->second, im.name())); + } + } + rjson::add(entry, "VectorAttribute", std::move(vector_attribute)); + // Always return a Projection. Currently only KEYS_ONLY is + // supported, so we always return that. + rjson::value projection = rjson::empty_object(); + rjson::add(projection, "ProjectionType", "KEYS_ONLY"); + rjson::add(entry, "Projection", std::move(projection)); + // Report IndexStatus and Backfilling based on the vector store's + // reported state: SERVING -> ACTIVE, BOOTSTRAPPING -> CREATING+Backfilling, + // anything else (INITIALIZING, unreachable, etc.) -> CREATING. 
+ auto vstatus = co_await _vsc.get_index_status( + schema->ks_name(), im.name(), vector_index_status_aoe.abort_source()); + using index_status = vector_search::vector_store_client::index_status; + if (vstatus == index_status::serving) { + rjson::add(entry, "IndexStatus", "ACTIVE"); + } else { + rjson::add(entry, "IndexStatus", "CREATING"); + if (vstatus == index_status::backfilling) { + rjson::add(entry, "Backfilling", rjson::value(true)); + } + } + rjson::push_back(vector_index_array, std::move(entry)); + } + if (!vector_index_array.Empty()) { + rjson::add(table_description, "VectorIndexes", std::move(vector_index_array)); + } + // Use map built by describe_key_schema() for base and indexes to produce // AttributeDefinitions for all key columns: rjson::value attribute_definitions = rjson::empty_array(); @@ -985,14 +641,6 @@ future executor::fill_table_description(schema_ptr schema, table_s co_return table_description; } -bool is_alternator_keyspace(const sstring& ks_name) { - return ks_name.find(executor::KEYSPACE_NAME_PREFIX) == 0; -} - -sstring executor::table_name(const schema& s) { - return s.cf_name(); -} - future executor::describe_table(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request, std::unique_ptr& audit_info) { _stats.api_operations.describe_table++; elogger.trace("Describing table {}", request); @@ -1011,91 +659,6 @@ future executor::describe_table(client_state& cli co_return rjson::print(std::move(response)); } -// This function increments the authorization_failures counter, and may also -// log a warn-level message and/or throw an access_denied exception, depending -// on what enforce_authorization and warn_authorization are set to. -// Note that if enforce_authorization is false, this function will return -// without throwing. So a caller that doesn't want to continue after an -// authorization_error must explicitly return after calling this function. 
-static void authorization_error(alternator::stats& stats, bool enforce_authorization, bool warn_authorization, std::string msg) { - stats.authorization_failures++; - if (enforce_authorization) { - if (warn_authorization) { - elogger.warn("alternator_warn_authorization=true: {}", msg); - } - throw api_error::access_denied(std::move(msg)); - } else { - if (warn_authorization) { - elogger.warn("If you set alternator_enforce_authorization=true the following will be enforced: {}", msg); - } - } -} - -// Check CQL's Role-Based Access Control (RBAC) permission_to_check (MODIFY, -// SELECT, DROP, etc.) on the given table. When permission is denied an -// appropriate user-readable api_error::access_denied is thrown. -future<> verify_permission( - bool enforce_authorization, - bool warn_authorization, - const service::client_state& client_state, - const schema_ptr& schema, - auth::permission permission_to_check, - alternator::stats& stats) { - if (!enforce_authorization && !warn_authorization) { - co_return; - } - // Unfortunately, the fix for issue #23218 did not modify the function - // that we use here - check_has_permissions(). So if we want to allow - // writes to internal tables (from try_get_internal_table()) only to a - // superuser, we need to explicitly check it here. 
- if (permission_to_check == auth::permission::MODIFY && is_internal_keyspace(schema->ks_name())) { - if (!client_state.user() || - !client_state.user()->name || - !co_await client_state.get_auth_service()->underlying_role_manager().is_superuser(*client_state.user()->name)) { - sstring username = ""; - if (client_state.user() && client_state.user()->name) { - username = client_state.user()->name.value(); - } - authorization_error(stats, enforce_authorization, warn_authorization, fmt::format( - "Write access denied on internal table {}.{} to role {} because it is not a superuser", - schema->ks_name(), schema->cf_name(), username)); - co_return; - } - } - auto resource = auth::make_data_resource(schema->ks_name(), schema->cf_name()); - if (!client_state.user() || !client_state.user()->name || - !co_await client_state.check_has_permission(auth::command_desc(permission_to_check, resource))) { - sstring username = ""; - if (client_state.user() && client_state.user()->name) { - username = client_state.user()->name.value(); - } - // Using exceptions for errors makes this function faster in the - // success path (when the operation is allowed). - authorization_error(stats, enforce_authorization, warn_authorization, fmt::format( - "{} access on table {}.{} is denied to role {}, client address {}", - auth::permissions::to_string(permission_to_check), - schema->ks_name(), schema->cf_name(), username, client_state.get_client_address())); - } -} - -// Similar to verify_permission() above, but just for CREATE operations. -// Those do not operate on any specific table, so require permissions on -// ALL KEYSPACES instead of any specific table. 
-static future<> verify_create_permission(bool enforce_authorization, bool warn_authorization, const service::client_state& client_state, alternator::stats& stats) { - if (!enforce_authorization && !warn_authorization) { - co_return; - } - auto resource = auth::resource(auth::resource_kind::data); - if (!co_await client_state.check_has_permission(auth::command_desc(auth::permission::CREATE, resource))) { - sstring username = ""; - if (client_state.user() && client_state.user()->name) { - username = client_state.user()->name.value(); - } - authorization_error(stats, enforce_authorization, warn_authorization, fmt::format( - "CREATE access on ALL KEYSPACES is denied to role {}", username)); - } -} - future executor::delete_table(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request, std::unique_ptr& audit_info) { _stats.api_operations.delete_table++; elogger.trace("Deleting table {}", request); @@ -1496,15 +1059,6 @@ static void update_tags_map(const rjson::value& tags, std::map validate_tags(tags_map); } -const std::map& get_tags_of_table_or_throw(schema_ptr schema) { - auto tags_ptr = db::get_tags_of_table(schema); - if (tags_ptr) { - return *tags_ptr; - } else { - throw api_error::validation(format("Table {} does not have valid tagging information", schema->ks_name())); - } -} - future executor::tag_resource(client_state& client_state, service_permit permit, rjson::value request, std::unique_ptr& audit_info) { _stats.api_operations.tag_resource++; @@ -1754,6 +1308,34 @@ static future<> mark_view_schemas_as_built(utils::chunked_vector& out, } } +// Returns true if the given attribute name is already the target of any vector +// index on the schema. Analogous to schema::has_index(), but looks up by the +// indexed attribute name rather than the index name. 
+static bool has_vector_index_on_attribute(const schema& s, std::string_view attribute_name) { + for (const index_metadata& im : s.indices()) { + // No need to check if the secondary index is a vector index, because + // Alternator doesn't use secondary indexes for anything else (GSI and + // LSI are implemented as materialized views, not secondary indexes). + const auto& opts = im.options(); + auto target_it = opts.find(cql3::statements::index_target::target_option_name); + if (target_it != opts.end() && target_it->second == attribute_name) { + return true; + } + } + return false; +} + +// Returns the validated "Dimensions" value from a VectorAttribute JSON object +// or throws api_error::validation if invalid. The "source" parameter is used +// in error messages (e.g., "VectorIndexes" or "VectorIndexUpdates"). +static int get_dimensions(const rjson::value& vector_attribute, std::string_view source) { + const rjson::value* dimensions_v = rjson::find(vector_attribute, "Dimensions"); + if (!dimensions_v || !dimensions_v->IsInt() || dimensions_v->GetInt() <= 0 || (vector_dimension_t)dimensions_v->GetInt() > cql3::cql3_type::MAX_VECTOR_DIMENSION) { + throw api_error::validation(fmt::format("{} Dimensions must be an integer between 1 and {}.", source, cql3::cql3_type::MAX_VECTOR_DIMENSION)); + } + return dimensions_v->GetInt(); +} + future executor::create_table_on_shard0(service::client_state&& client_state, tracing::trace_state_ptr trace_state, rjson::value request, bool enforce_authorization, bool warn_authorization, const db::tablets_mode_t::mode tablets_mode, std::unique_ptr& audit_info) { throwing_assert(this_shard_id() == 0); @@ -1928,6 +1510,80 @@ future executor::create_table_on_shard0(service:: unused_attribute_definitions)); } + // Parse VectorIndexes parameters and apply them to "builder". This all + // happens before we actually create the table, so if we have a parse + // errors we can still fail without creating any table. 
+ const rjson::value* vector_indexes = rjson::find(request, "VectorIndexes"); + if (vector_indexes) { + if (!vector_indexes->IsArray()) { + co_return api_error::validation("VectorIndexes must be an array."); + } + std::unordered_set seen_attribute_names; + for (const rjson::value& v : vector_indexes->GetArray()) { + const rjson::value* index_name_v = rjson::find(v, "IndexName"); + if (!index_name_v || !index_name_v->IsString()) { + co_return api_error::validation("VectorIndexes IndexName must be a string."); + } + std::string_view index_name = rjson::to_string_view(*index_name_v); + // Limit the length and character choice of a vector index's + // name to the same rules as table names. This is slightly + // different from GSI/LSI names, where we limit not the length + // of the index name but its sum with the base table name. + validate_table_name(index_name, "VectorIndexes IndexName"); + if (!index_names.emplace(index_name).second) { + co_return api_error::validation(fmt::format("Duplicate IndexName '{}', ", index_name)); + } + const rjson::value* vector_attribute_v = rjson::find(v, "VectorAttribute"); + if (!vector_attribute_v || !vector_attribute_v->IsObject()) { + co_return api_error::validation("VectorIndexes VectorAttribute must be an object."); + } + const rjson::value* attribute_name_v = rjson::find(*vector_attribute_v, "AttributeName"); + if (!attribute_name_v || !attribute_name_v->IsString()) { + co_return api_error::validation("VectorIndexes AttributeName must be a string."); + } + std::string_view attribute_name = rjson::to_string_view(*attribute_name_v); + validate_attr_name_length("VectorIndexes", attribute_name.size(), /*is_key=*/false, "AttributeName "); + if (!seen_attribute_names.emplace(attribute_name).second) { + co_return api_error::validation(fmt::format("Duplicate vector index on the same AttributeName '{}'", attribute_name)); + } + // attribute_name must not be one of the key columns of the base + // or GSIs or LSIs, because those have 
mandatory types (defined in + // AttributeDefinitions) which will never be a vector. + for (auto it = attribute_definitions->Begin(); it != attribute_definitions->End(); ++it) { + if (rjson::to_string_view((*it)["AttributeName"]) == attribute_name) { + co_return api_error::validation(fmt::format( + "VectorIndexes AttributeName '{}' is a key column of type {} so cannot be used as a vector index target.", attribute_name, rjson::to_string_view((*it)["AttributeType"]))); + } + } + int dimensions = get_dimensions(*vector_attribute_v, "VectorIndexes"); + // The optional Projection parameter is only supported with + // ProjectionType=KEYS_ONLY. Other values are not yet supported. + const rjson::value* projection_v = rjson::find(v, "Projection"); + if (projection_v) { + if (!projection_v->IsObject()) { + co_return api_error::validation("VectorIndexes Projection must be an object."); + } + const rjson::value* projection_type_v = rjson::find(*projection_v, "ProjectionType"); + if (!projection_type_v || !projection_type_v->IsString() || + rjson::to_string_view(*projection_type_v) != "KEYS_ONLY") { + co_return api_error::validation("VectorIndexes Projection: only ProjectionType=KEYS_ONLY is currently supported."); + } + } + // Add a vector index metadata entry to the base table schema. + index_options_map index_options; + index_options[db::index::secondary_index::custom_class_option_name] = "vector_index"; + index_options[cql3::statements::index_target::target_option_name] = sstring(attribute_name); + index_options["dimensions"] = std::to_string(dimensions); + builder.with_index(index_metadata{sstring(index_name), index_options, + index_metadata_kind::custom, index_metadata::is_local_index(false)}); + } + // If we have any vector indexes, we will use CDC and the CDC log + // name will need to fit our length limits, so validate it now. 
+ if (vector_indexes->Size() > 0) { + validate_cdc_log_name_length(builder.cf_name()); + } + } + // We don't yet support configuring server-side encryption (SSE) via the // SSESpecifiction attribute, but an SSESpecification with Enabled=false // is simply the default, and should be accepted: @@ -1999,6 +1655,13 @@ future executor::create_table_on_shard0(service:: } } } + // Vector indexes is a new feature that we decided to only support + // on tablets. + if (vector_indexes && vector_indexes->Size() > 0) { + if (!rs->uses_tablets()) { + co_return api_error::validation("Vector indexes are not supported on tables using vnodes."); + } + } // Creating an index in tablets mode requires the keyspace to be RF-rack-valid. // GSI and LSI indexes are based on materialized views which require RF-rack-validity to avoid consistency issues. if (!view_builders.empty() || _proxy.data_dictionary().get_config().rf_rack_valid_keyspaces()) { @@ -2098,17 +1761,20 @@ future executor::create_table(client_state& clien // columns of the base table or any of its prior GSIs or LSIs, the type // given in AttributeDefinitions must match the type of the existing key - // otherwise Alternator will not know which type to enforce in new writes. +// Also, if the table already has vector indexes, their key attributes cannot +// be redefined in AttributeDefinitions with a non-vector type. // This function checks for such conflicts. It assumes that the structure of // the given attribute_definitions was already validated (with // validate_attribute_definitions()). // This function should be called multiple times - once for the base schema // and once for each of its views (existing GSIs and LSIs on this table). 
static void check_attribute_definitions_conflicts(const rjson::value& attribute_definitions, const schema& schema) { - for (auto& def : schema.primary_key_columns()) { - std::string def_type = type_to_string(def.type); - for (auto it = attribute_definitions.Begin(); it != attribute_definitions.End(); ++it) { - const rjson::value& attribute_info = *it; - if (rjson::to_string_view(attribute_info["AttributeName"]) == def.name_as_text()) { + for (auto it = attribute_definitions.Begin(); it != attribute_definitions.End(); ++it) { + const rjson::value& attribute_info = *it; + std::string_view attribute_name = rjson::to_string_view(attribute_info["AttributeName"]); + for (auto& def : schema.primary_key_columns()) { + if (attribute_name == def.name_as_text()) { + auto def_type = type_to_string(def.type); std::string_view type = rjson::to_string_view(attribute_info["AttributeType"]); if (type != def_type) { throw api_error::validation(fmt::format("AttributeDefinitions redefined {} to {} already a key attribute of type {} in this table", def.name_as_text(), type, def_type)); @@ -2116,6 +1782,12 @@ future executor::create_table(client_state& clien break; } } + // Additionally, if we have a vector index, its key attribute is + // required to have a vector type, and cannot be listed in + // AttributeDefinitions with a non-vector key type. + if (has_vector_index_on_attribute(schema, attribute_name)) { + throw api_error::validation(fmt::format("AttributeDefinitions redefines {} but already a key of a vector index in this table", attribute_name)); + } } } @@ -2192,6 +1864,129 @@ future executor::update_table(client_state& clien } } + // Support VectorIndexUpdates to add or delete a vector index, + // similar to GlobalSecondaryIndexUpdates above. We handle this + // before builder.build() so we can use builder directly. 
+ rjson::value* vector_index_updates = rjson::find(request, "VectorIndexUpdates"); + if (vector_index_updates) { + if (!vector_index_updates->IsArray()) { + co_return api_error::validation("VectorIndexUpdates must be an array"); + } + if (vector_index_updates->Size() > 1) { + // VectorIndexUpdates mirrors GlobalSecondaryIndexUpdates. + // Since DynamoDB artifically limits the latter to just a + // single operation (one Create or one Delete), we also + // place the same artificial limit on VectorIndexUpdates, + // and throw the same LimitExceeded error if the client + // tries to pass more than one operation. + co_return api_error::limit_exceeded("VectorIndexUpdates only allows one index creation or deletion"); + } + } + if (vector_index_updates && vector_index_updates->Size() == 1) { + empty_request = false; + if (!(*vector_index_updates)[0].IsObject() || (*vector_index_updates)[0].MemberCount() != 1) { + co_return api_error::validation("VectorIndexUpdates array must contain one object with a Create or Delete operation"); + } + auto it = (*vector_index_updates)[0].MemberBegin(); + const std::string_view op = rjson::to_string_view(it->name); + if (!it->value.IsObject()) { + co_return api_error::validation("VectorIndexUpdates entries must be objects"); + } + const rjson::value* index_name_v = rjson::find(it->value, "IndexName"); + if (!index_name_v || !index_name_v->IsString()) { + co_return api_error::validation("VectorIndexUpdates operation must have IndexName"); + } + sstring index_name = rjson::to_sstring(*index_name_v); + if (op == "Create") { + if (!p.local().local_db().find_keyspace(tab->ks_name()).get_replication_strategy().uses_tablets()) { + co_return api_error::validation("Vector indexes are not supported on tables using vnodes."); + } + validate_table_name(index_name, "VectorIndexUpdates IndexName"); + // Check for duplicate index name against existing vector indexes, GSIs and LSIs. 
+ if (tab->has_index(index_name)) { + // Alternator only uses a secondary index for vector + // search (GSI and LSI are implemented as materialized + // views, not secondary indexes), so the error message + // can refer to a "Vector index". + co_return api_error::validation(fmt::format( + "Vector index {} already exists in table {}", index_name, tab->cf_name())); + } + if (p.local().data_dictionary().has_schema(tab->ks_name(), gsi_name(tab->cf_name(), index_name, false)) || + p.local().data_dictionary().has_schema(tab->ks_name(), lsi_name(tab->cf_name(), index_name, false))) { + co_return api_error::validation(fmt::format( + "GSI or LSI {} already exists in table {}, cannot reuse the name for a vector index", index_name, tab->cf_name())); + } + const rjson::value* vector_attribute_v = rjson::find(it->value, "VectorAttribute"); + if (!vector_attribute_v || !vector_attribute_v->IsObject()) { + co_return api_error::validation("VectorIndexUpdates Create VectorAttribute must be an object."); + } + const rjson::value* attribute_name_v = rjson::find(*vector_attribute_v, "AttributeName"); + if (!attribute_name_v || !attribute_name_v->IsString()) { + co_return api_error::validation("VectorIndexUpdates Create AttributeName must be a string."); + } + std::string_view attribute_name = rjson::to_string_view(*attribute_name_v); + validate_attr_name_length("VectorIndexUpdates", attribute_name.size(), /*is_key=*/false, "AttributeName "); + // attribute_name must not be a key column of the base + // table or any of its GSIs or LSIs, because those have + // mandatory types (defined in AttributeDefinitions) which + // will never be a vector. 
+ for (const column_definition& cdef : tab->primary_key_columns()) { + if (cdef.name_as_text() == attribute_name) { + co_return api_error::validation(fmt::format( + "VectorIndexUpdates AttributeName '{}' is a key column and cannot be used as a vector index target.", attribute_name)); + } + } + for (const auto& view : p.local().data_dictionary().find_column_family(tab).views()) { + for (const column_definition& cdef : view->primary_key_columns()) { + if (cdef.name_as_text() == attribute_name) { + co_return api_error::validation(fmt::format( + "VectorIndexUpdates AttributeName '{}' is a key column of a GSI or LSI and cannot be used as a vector index target.", attribute_name)); + } + } + } + // attribute_name must not already be the target of an + // existing vector index. + if (has_vector_index_on_attribute(*tab, attribute_name)) { + co_return api_error::validation(fmt::format( + "VectorIndexUpdates AttributeName '{}' is already the target of an existing vector index.", attribute_name)); + } + int dimensions = get_dimensions(*vector_attribute_v, "VectorIndexUpdates"); + // The optional Projection parameter is only supported with + // ProjectionType=KEYS_ONLY. Other values are not yet supported. 
+ const rjson::value* projection_v = rjson::find(it->value, "Projection"); + if (projection_v) { + if (!projection_v->IsObject()) { + co_return api_error::validation("VectorIndexUpdates Projection must be an object."); + } + const rjson::value* projection_type_v = rjson::find(*projection_v, "ProjectionType"); + if (!projection_type_v || !projection_type_v->IsString() || + rjson::to_string_view(*projection_type_v) != "KEYS_ONLY") { + co_return api_error::validation("VectorIndexUpdates Projection: only ProjectionType=KEYS_ONLY is currently supported."); + } + } + // A vector index will use CDC on this table, so the CDC + // log table name will need to fit our length limits + validate_cdc_log_name_length(builder.cf_name()); + index_options_map index_options; + index_options[db::index::secondary_index::custom_class_option_name] = "vector_index"; + index_options[cql3::statements::index_target::target_option_name] = sstring(attribute_name); + index_options["dimensions"] = std::to_string(dimensions); + builder.with_index(index_metadata{index_name, index_options, + index_metadata_kind::custom, index_metadata::is_local_index(false)}); + } else if (op == "Delete") { + if (!tab->has_index(index_name)) { + co_return api_error::resource_not_found(fmt::format( + "No vector index {} in table {}", index_name, tab->cf_name())); + } + builder.without_index(index_name); + } else { + // Update operation not yet supported, as we don't yet + // have any updatable properties of vector indexes. + co_return api_error::validation(fmt::format( + "VectorIndexUpdates supports a Create or Delete operation, saw '{}'", op)); + } + } + schema = builder.build(); std::vector new_views; std::vector dropped_views; @@ -2208,6 +2003,10 @@ future executor::update_table(client_state& clien // a LimitExceededException if this is attempted. 
co_return api_error::limit_exceeded("GlobalSecondaryIndexUpdates only allows one index creation or deletion"); } + if (vector_index_updates && vector_index_updates->IsArray() && + vector_index_updates->Size() && gsi_updates->Size()) { + co_return api_error::limit_exceeded("UpdateTable cannot have both VectorIndexUpdates and GlobalSecondaryIndexUpdates in the same request"); + } if (gsi_updates->Size() == 1) { empty_request = false; if (!(*gsi_updates)[0].IsObject() || (*gsi_updates)[0].MemberCount() != 1) { @@ -2247,6 +2046,10 @@ future executor::update_table(client_state& clien co_return api_error::validation(fmt::format( "LSI {} already exists in table {}, can't use same name for GSI", index_name, table_name)); } + if (tab->has_index(sstring(index_name))) { + co_return api_error::validation(fmt::format( + "Vector index {} already exists in table {}, cannot reuse the name for a GSI", index_name, table_name)); + } try { locator::assert_rf_rack_valid_keyspace(keyspace_name, p.local().local_db().get_token_metadata_ptr(), p.local().local_db().find_keyspace(keyspace_name).get_replication_strategy()); @@ -2324,7 +2127,7 @@ future executor::update_table(client_state& clien } if (empty_request) { - co_return api_error::validation("UpdateTable requires one of GlobalSecondaryIndexUpdates, StreamSpecification or BillingMode to be specified"); + co_return api_error::validation("UpdateTable requires one of GlobalSecondaryIndexUpdates, VectorIndexUpdates, StreamSpecification or BillingMode to be specified"); } co_await verify_permission(enforce_authorization, warn_authorization, client_state_other_shard.get(), schema, auth::permission::ALTER, e.local()._stats); @@ -2416,15 +2219,6 @@ public: } }; -// After calling pk_from_json() and ck_from_json() to extract the pk and ck -// components of a key, and if that succeeded, call check_key() to further -// check that the key doesn't have any spurious components. 
-static void check_key(const rjson::value& key, const schema_ptr& schema) { - if (key.MemberCount() != (schema->clustering_key_size() == 0 ? 1 : 2)) { - throw api_error::validation("Given key attribute not in schema"); - } -} - // Verify that a value parsed from the user input is legal. In particular, // we check that the value is not an empty set, string or bytes - which is // (somewhat artificially) forbidden by DynamoDB. @@ -2542,6 +2336,29 @@ std::unordered_map si_key_attributes(data_dictionary::table return ret; } +// Get a map from attribute name (bytes) to dimensions (int) for all vector +// indexes defined on this table's schema. Used to validate written values +// against the vector index constraints. +static std::unordered_map vector_index_attributes(const schema& s) { + std::unordered_map ret; + for (const index_metadata& im : s.indices()) { + const auto& opts = im.options(); + auto class_it = opts.find(db::index::secondary_index::custom_class_option_name); + if (class_it == opts.end() || class_it->second != "vector_index") { + continue; + } + auto target_it = opts.find(cql3::statements::index_target::target_option_name); + auto dims_it = opts.find("dimensions"); + if (target_it == opts.end() || dims_it == opts.end()) { + continue; + } + try { + ret[to_bytes(target_it->second)] = std::stoi(dims_it->second); + } catch (...) {} + } + return ret; +} + // When an attribute is a key (hash or sort) of one of the GSIs or LSIs on a // table, DynamoDB refuses an update to that attribute with an unsuitable // value. Unsuitable values are: @@ -2562,13 +2379,10 @@ std::unordered_map si_key_attributes(data_dictionary::table // // validate_value_if_index_key() should only be called after validate_value() // already validated that the value itself has a valid form. 
-static inline void validate_value_if_index_key( +static void validate_value_if_index_key( std::unordered_map key_attributes, const bytes& attribute, const rjson::value& value) { - if (key_attributes.empty()) { - return; - } auto it = key_attributes.find(attribute); if (it == key_attributes.end()) { // Given attribute is not a key column with a fixed type, so no @@ -2592,10 +2406,59 @@ static inline void validate_value_if_index_key( } } +// When an attribute is the target of a vector index on the table, a write +// to that attribute is rejected unless the value is a DynamoDB list (type +// "L") of exactly the declared number of numeric (type "N") elements, where +// each number can be represented as a 32-bit float. +// +// validate_value_if_vector_index_attribute() should only be called after +// validate_value() already confirmed the value has a valid DynamoDB form. +static void validate_value_if_vector_index_attribute( + const std::unordered_map& vector_attrs, + const bytes& attribute, + const rjson::value& value) { + auto it = vector_attrs.find(attribute); + if (it == vector_attrs.end()) { + return; + } + int dimensions = it->second; + std::string_view attr_name = to_string_view(attribute); + // value is a DynamoDB typed value: an object with one member whose key + // is the type tag. validate_value() already checked the overall shape. + std::string_view value_type = rjson::to_string_view(value.MemberBegin()->name); + if (value_type != "L") { + throw api_error::validation(fmt::format( + "Vector index attribute '{}' must be a list of {} numbers, got type {}", + attr_name, dimensions, value_type)); + } + const rjson::value& list = value.MemberBegin()->value; + if (!list.IsArray() || (int)list.Size() != dimensions) { + throw api_error::validation(fmt::format( + "Vector index attribute '{}' must be a list of exactly {} numbers, got {} elements", + attr_name, dimensions, list.IsArray() ? 
(int)list.Size() : -1)); + } + for (const rjson::value& elem : list.GetArray()) { + if (!elem.IsObject() || elem.MemberCount() != 1 || + rjson::to_string_view(elem.MemberBegin()->name) != "N") { + throw api_error::validation(fmt::format( + "Vector index attribute '{}' must contain only numbers", attr_name)); + } + std::string_view num_str = rjson::to_string_view(elem.MemberBegin()->value); + float f; + auto [ptr, ec] = std::from_chars(num_str.data(), num_str.data() + num_str.size(), f); + if (ec != std::errc{} || ptr != num_str.data() + num_str.size() || !std::isfinite(f)) { + throw api_error::validation(fmt::format( + "Vector index attribute '{}' element '{}' cannot be represented as a 32-bit float", + attr_name, num_str)); + } + } +} + put_or_delete_item::put_or_delete_item(const rjson::value& item, schema_ptr schema, put_item, std::unordered_map key_attributes) : _pk(pk_from_json(item, schema)), _ck(ck_from_json(item, schema)) { _cells = std::vector(); _cells->reserve(item.MemberCount()); + auto vec_attrs = vector_index_attributes(*schema); for (auto it = item.MemberBegin(); it != item.MemberEnd(); ++it) { bytes column_name = to_bytes(rjson::to_string_view(it->name)); validate_value(it->value, "PutItem"); @@ -2605,7 +2468,14 @@ put_or_delete_item::put_or_delete_item(const rjson::value& item, schema_ptr sche if (!cdef) { // This attribute may be a key column of one of the GSI or LSI, // in which case there are some limitations on the value. - validate_value_if_index_key(key_attributes, column_name, it->value); + if (!key_attributes.empty()) { + validate_value_if_index_key(key_attributes, column_name, it->value); + } + // This attribute may also be a vector index target column, + // in which case it must be a list of the right number of floats. 
+ if (!vec_attrs.empty()) { + validate_value_if_vector_index_attribute(vec_attrs, column_name, it->value); + } bytes value = serialize_item(it->value); if (value.size()) { // ScyllaDB uses one extra byte compared to DynamoDB for the bytes length @@ -2798,7 +2668,7 @@ std::optional rmw_operation::apply(foreign_ptrrow_count()) { auto selection = cql3::selection::selection::wildcard(_schema); uint64_t item_length = 0; - auto previous_item = executor::describe_single_item(_schema, slice, *selection, *qr, {}, &item_length); + auto previous_item = describe_single_item(_schema, slice, *selection, *qr, {}, &item_length); if (_consumed_capacity._total_bytes < item_length) { _consumed_capacity._total_bytes = item_length; } @@ -2884,7 +2754,7 @@ static future> get_previous_item( command->allow_limit = db::allow_per_partition_rate_limit::yes; return proxy.query(schema, command, to_partition_ranges(*schema, pk), cl, service::storage_proxy::coordinator_query_options(executor::default_timeout(), std::move(permit), client_state)).then( [schema, command, selection = std::move(selection), &item_length] (service::storage_proxy::coordinator_query_result qr) { - auto previous_item = executor::describe_single_item(schema, command->slice, *selection, *qr.query_result, {}, &item_length); + auto previous_item = describe_single_item(schema, command->slice, *selection, *qr.query_result, {}, &item_length); if (previous_item) { return make_ready_future>(std::make_unique(std::move(*previous_item))); } else { @@ -3008,22 +2878,6 @@ static bool check_needs_read_before_write(const parsed::condition_expression& co return !condition_expression.empty(); } -// Fail the expression if it has unused attribute names or values. This is -// how DynamoDB behaves, so we do too. 
-static void verify_all_are_used(const rjson::value* field, - const std::unordered_set& used, const char* field_name, const char* operation) { - if (!field) { - return; - } - for (auto it = field->MemberBegin(); it != field->MemberEnd(); ++it) { - if (!used.contains(rjson::to_string(it->name))) { - throw api_error::validation( - format("{} has spurious '{}', not used in {}", - field_name, rjson::to_string_view(it->name), operation)); - } - } -} - class put_item_operation : public rmw_operation { private: put_or_delete_item _mutation_builder; @@ -3252,18 +3106,6 @@ future executor::delete_item(client_state& client co_return res; } -static schema_ptr get_table_from_batch_request(const service::storage_proxy& proxy, const rjson::value::ConstMemberIterator& batch_request) { - sstring table_name = rjson::to_sstring(batch_request->name); // JSON keys are always strings - try { - return proxy.data_dictionary().find_schema(sstring(executor::KEYSPACE_NAME_PREFIX) + table_name, table_name); - } catch(data_dictionary::no_such_column_family&) { - // DynamoDB returns validation error even when table does not exist - // and the table name is invalid. - validate_table_name(table_name); - throw api_error::resource_not_found(format("Requested resource not found: Table: {} not found", table_name)); - } -} - using primary_key = std::pair; struct primary_key_hash { schema_ptr _s; @@ -3637,412 +3479,6 @@ static const std::string_view get_item_type_string(const rjson::value& v) { return rjson::to_string_view(mem.name); } -// attrs_to_get saves for each top-level attribute an attrs_to_get_node, -// a hierarchy of subparts that need to be kept. The following function -// takes a given JSON value and drops its parts which weren't asked to be -// kept. It modifies the given JSON value, or returns false to signify that -// the entire object should be dropped. 
-// Note that The JSON value is assumed to be encoded using the DynamoDB -// conventions - i.e., it is really a map whose key has a type string, -// and the value is the real object. -template -static bool hierarchy_filter(rjson::value& val, const attribute_path_map_node& h) { - if (!val.IsObject() || val.MemberCount() != 1) { - // This shouldn't happen. We shouldn't have stored malformed objects. - // But today Alternator does not validate the structure of nested - // documents before storing them, so this can happen on read. - throw api_error::internal(format("Malformed value object read: {}", val)); - } - const char* type = val.MemberBegin()->name.GetString(); - rjson::value& v = val.MemberBegin()->value; - if (h.has_members()) { - const auto& members = h.get_members(); - if (type[0] != 'M' || !v.IsObject()) { - // If v is not an object (dictionary, map), none of the members - // can match. - return false; - } - rjson::value newv = rjson::empty_object(); - for (auto it = v.MemberBegin(); it != v.MemberEnd(); ++it) { - std::string attr = rjson::to_string(it->name); - auto x = members.find(attr); - if (x != members.end()) { - if (x->second) { - // Only a part of this attribute is to be filtered, do it. 
- if (hierarchy_filter(it->value, *x->second)) { - // because newv started empty and attr are unique - // (keys of v), we can use add() here - rjson::add_with_string_name(newv, attr, std::move(it->value)); - } - } else { - // The entire attribute is to be kept - rjson::add_with_string_name(newv, attr, std::move(it->value)); - } - } - } - if (newv.MemberCount() == 0) { - return false; - } - v = newv; - } else if (h.has_indexes()) { - const auto& indexes = h.get_indexes(); - if (type[0] != 'L' || !v.IsArray()) { - return false; - } - rjson::value newv = rjson::empty_array(); - const auto& a = v.GetArray(); - for (unsigned i = 0; i < v.Size(); i++) { - auto x = indexes.find(i); - if (x != indexes.end()) { - if (x->second) { - if (hierarchy_filter(a[i], *x->second)) { - rjson::push_back(newv, std::move(a[i])); - } - } else { - // The entire attribute is to be kept - rjson::push_back(newv, std::move(a[i])); - } - } - } - if (newv.Size() == 0) { - return false; - } - v = newv; - } - return true; -} - -// Add a path to an attribute_path_map. Throws a validation error if the path -// "overlaps" with one already in the filter (one is a sub-path of the other) -// or "conflicts" with it (both a member and index is requested). -template -void attribute_path_map_add(const char* source, attribute_path_map& map, const parsed::path& p, T value = {}) { - using node = attribute_path_map_node; - // The first step is to look for the top-level attribute (p.root()): - auto it = map.find(p.root()); - if (it == map.end()) { - if (p.has_operators()) { - it = map.emplace(p.root(), node {std::nullopt}).first; - } else { - (void) map.emplace(p.root(), node {std::move(value)}).first; - // Value inserted for top-level node. We're done. - return; - } - } else if(!p.has_operators()) { - // If p is top-level and we already have it or a part of it - // in map, it's a forbidden overlapping path. 
- throw api_error::validation(fmt::format( - "Invalid {}: two document paths overlap at {}", source, p.root())); - } else if (it->second.has_value()) { - // If we're here, it != map.end() && p.has_operators && it->second.has_value(). - // This means the top-level attribute already has a value, and we're - // trying to add a non-top-level value. It's an overlap. - throw api_error::validation(fmt::format("Invalid {}: two document paths overlap at {}", source, p.root())); - } - node* h = &it->second; - // The second step is to walk h from the top-level node to the inner node - // where we're supposed to insert the value: - for (const auto& op : p.operators()) { - std::visit(overloaded_functor { - [&] (const std::string& member) { - if (h->is_empty()) { - *h = node {typename node::members_t()}; - } else if (h->has_indexes()) { - throw api_error::validation(format("Invalid {}: two document paths conflict at {}", source, p)); - } else if (h->has_value()) { - throw api_error::validation(format("Invalid {}: two document paths overlap at {}", source, p)); - } - typename node::members_t& members = h->get_members(); - auto it = members.find(member); - if (it == members.end()) { - it = members.insert({member, std::make_unique()}).first; - } - h = it->second.get(); - }, - [&] (unsigned index) { - if (h->is_empty()) { - *h = node {typename node::indexes_t()}; - } else if (h->has_members()) { - throw api_error::validation(format("Invalid {}: two document paths conflict at {}", source, p)); - } else if (h->has_value()) { - throw api_error::validation(format("Invalid {}: two document paths overlap at {}", source, p)); - } - typename node::indexes_t& indexes = h->get_indexes(); - auto it = indexes.find(index); - if (it == indexes.end()) { - it = indexes.insert({index, std::make_unique()}).first; - } - h = it->second.get(); - } - }, op); - } - // Finally, insert the value in the node h. 
- if (h->is_empty()) { - *h = node {std::move(value)}; - } else { - throw api_error::validation(format("Invalid {}: two document paths overlap at {}", source, p)); - } -} - -// A very simplified version of the above function for the special case of -// adding only top-level attribute. It's not only simpler, we also use a -// different error message, referring to a "duplicate attribute"instead of -// "overlapping paths". DynamoDB also has this distinction (errors in -// AttributesToGet refer to duplicates, not overlaps, but errors in -// ProjectionExpression refer to overlap - even if it's an exact duplicate). -template -void attribute_path_map_add(const char* source, attribute_path_map& map, const std::string& attr, T value = {}) { - using node = attribute_path_map_node; - auto it = map.find(attr); - if (it == map.end()) { - map.emplace(attr, node {std::move(value)}); - } else { - throw api_error::validation(fmt::format( - "Invalid {}: Duplicate attribute: {}", source, attr)); - } -} - -// Parse the "Select" parameter of a Scan or Query operation, throwing a -// ValidationException in various forbidden combinations of options and -// finally returning one of three options: -// 1. regular - the default scan behavior of returning all or specific -// attributes ("ALL_ATTRIBUTES" or "SPECIFIC_ATTRIBUTES"). -// 2. count - just count the items ("COUNT") -// 3. projection - return projected attributes ("ALL_PROJECTED_ATTRIBUTES") -// An ValidationException is thrown when recognizing an invalid combination -// of options - such as ALL_PROJECTED_ATTRIBUTES for a base table, or -// SPECIFIC_ATTRIBUTES without ProjectionExpression or AttributesToGet. 
-enum class select_type { regular, count, projection }; -static select_type parse_select(const rjson::value& request, table_or_view_type table_type) { - const rjson::value* select_value = rjson::find(request, "Select"); - if (!select_value) { - // If "Select" is not specified, it defaults to ALL_ATTRIBUTES - // on a base table, or ALL_PROJECTED_ATTRIBUTES on an index - return table_type == table_or_view_type::base ? - select_type::regular : select_type::projection; - } - if (!select_value->IsString()) { - throw api_error::validation("Select parameter must be a string"); - } - std::string_view select = rjson::to_string_view(*select_value); - const bool has_attributes_to_get = request.HasMember("AttributesToGet"); - const bool has_projection_expression = request.HasMember("ProjectionExpression"); - if (select == "SPECIFIC_ATTRIBUTES") { - if (has_projection_expression || has_attributes_to_get) { - return select_type::regular; - } - throw api_error::validation("Select=SPECIFIC_ATTRIBUTES requires AttributesToGet or ProjectionExpression"); - } - if (has_projection_expression || has_attributes_to_get) { - throw api_error::validation("AttributesToGet or ProjectionExpression require Select to be either SPECIFIC_ATTRIBUTES or missing"); - } - if (select == "COUNT") { - return select_type::count; - } - if (select == "ALL_ATTRIBUTES") { - // FIXME: when we support projections (#5036), if this is a GSI and - // not all attributes are projected to it, we should throw. - return select_type::regular; - } - if (select == "ALL_PROJECTED_ATTRIBUTES") { - if (table_type == table_or_view_type::base) { - throw api_error::validation("ALL_PROJECTED_ATTRIBUTES only allowed for indexes"); - } - return select_type::projection; - } - throw api_error::validation(fmt::format("Unknown Select value '{}'. 
Allowed choices: ALL_ATTRIBUTES, SPECIFIC_ATTRIBUTES, ALL_PROJECTED_ATTRIBUTES, COUNT", - select)); -} - -// calculate_attrs_to_get() takes either AttributesToGet or -// ProjectionExpression parameters (having both is *not* allowed), -// and returns the list of cells we need to read, or a disengaged optional -// when *all* attributes are to be returned. -// However, in our current implementation, only top-level attributes are -// stored as separate cells - a nested document is stored serialized together -// (as JSON) in the same cell. So this function return a map - each key is the -// top-level attribute we will need need to read, and the value for each -// top-level attribute is the partial hierarchy (struct hierarchy_filter) -// that we will need to extract from that serialized JSON. -// For example, if ProjectionExpression lists a.b and a.c[2], we -// return one top-level attribute name, "a", with the value "{b, c[2]}". - -static std::optional calculate_attrs_to_get(const rjson::value& req, parsed::expression_cache& parsed_expression_cache, std::unordered_set& used_attribute_names, select_type select = select_type::regular) { - if (select == select_type::count) { - // An empty map asks to retrieve no attributes. Note that this is - // different from a disengaged optional which means retrieve all. 
- return attrs_to_get(); - } - // FIXME: also need to handle select_type::projection - const bool has_attributes_to_get = req.HasMember("AttributesToGet"); - const bool has_projection_expression = req.HasMember("ProjectionExpression"); - if (has_attributes_to_get && has_projection_expression) { - throw api_error::validation( - format("GetItem does not allow both ProjectionExpression and AttributesToGet to be given together")); - } - if (has_attributes_to_get) { - const rjson::value& attributes_to_get = req["AttributesToGet"]; - attrs_to_get ret; - for (auto it = attributes_to_get.Begin(); it != attributes_to_get.End(); ++it) { - attribute_path_map_add("AttributesToGet", ret, rjson::to_string(*it)); - validate_attr_name_length("AttributesToGet", it->GetStringLength(), false); - } - if (ret.empty()) { - throw api_error::validation("Empty AttributesToGet is not allowed. Consider using Select=COUNT instead."); - } - return ret; - } else if (has_projection_expression) { - const rjson::value& projection_expression = req["ProjectionExpression"]; - const rjson::value* expression_attribute_names = rjson::find(req, "ExpressionAttributeNames"); - std::vector paths_to_get; - try { - paths_to_get = parsed_expression_cache.parse_projection_expression(rjson::to_string_view(projection_expression)); - } catch(expressions_syntax_error& e) { - throw api_error::validation(e.what()); - } - resolve_projection_expression(paths_to_get, expression_attribute_names, used_attribute_names); - attrs_to_get ret; - for (const parsed::path& p : paths_to_get) { - attribute_path_map_add("ProjectionExpression", ret, p); - } - return ret; - } - // An disengaged optional asks to read everything - return std::nullopt; -} - -/** - * Helper routine to extract data when we already have - * row, etc etc. - * - * Note: include_all_embedded_attributes means we should - * include all values in the `ATTRS_COLUMN_NAME` map column. 
- * - * We could change the behaviour to simply include all values - * from this column if the `ATTRS_COLUMN_NAME` is explicit in - * `attrs_to_get`, but I am scared to do that now in case - * there is some corner case in existing code. - * - * Explicit bool means we can be sure all previous calls are - * as before. - */ -void executor::describe_single_item(const cql3::selection::selection& selection, - const std::vector& result_row, - const std::optional& attrs_to_get, - rjson::value& item, - uint64_t* item_length_in_bytes, - bool include_all_embedded_attributes) -{ - const auto& columns = selection.get_columns(); - auto column_it = columns.begin(); - for (const managed_bytes_opt& cell : result_row) { - if (!cell) { - ++column_it; - continue; - } - std::string column_name = (*column_it)->name_as_text(); - if (column_name != executor::ATTRS_COLUMN_NAME) { - if (item_length_in_bytes) { - (*item_length_in_bytes) += column_name.length() + cell->size(); - } - if (!attrs_to_get || attrs_to_get->contains(column_name)) { - // item is expected to start empty, and column_name are unique - // so add() makes sense - rjson::add_with_string_name(item, column_name, rjson::empty_object()); - rjson::value& field = item[column_name.c_str()]; - cell->with_linearized([&] (bytes_view linearized_cell) { - rjson::add_with_string_name(field, type_to_string((*column_it)->type), json_key_column_value(linearized_cell, **column_it)); - }); - } - } else { - auto deserialized = attrs_type()->deserialize(*cell); - auto keys_and_values = value_cast(deserialized); - for (auto entry : keys_and_values) { - std::string attr_name = value_cast(entry.first); - if (item_length_in_bytes) { - (*item_length_in_bytes) += attr_name.length(); - } - if (include_all_embedded_attributes || !attrs_to_get || attrs_to_get->contains(attr_name)) { - bytes value = value_cast(entry.second); - if (item_length_in_bytes && value.length()) { - // ScyllaDB uses one extra byte compared to DynamoDB for the bytes length - 
(*item_length_in_bytes) += value.length() - 1; - } - rjson::value v = deserialize_item(value); - if (attrs_to_get) { - auto it = attrs_to_get->find(attr_name); - if (it != attrs_to_get->end()) { - // attrs_to_get may have asked for only part of - // this attribute. hierarchy_filter() modifies v, - // and returns false when nothing is to be kept. - if (!hierarchy_filter(v, it->second)) { - continue; - } - } - } - // item is expected to start empty, and attribute - // names are unique so add() makes sense - rjson::add_with_string_name(item, attr_name, std::move(v)); - } else if (item_length_in_bytes) { - (*item_length_in_bytes) += value_cast(entry.second).length() - 1; - } - } - } - ++column_it; - } -} - -std::optional executor::describe_single_item(schema_ptr schema, - const query::partition_slice& slice, - const cql3::selection::selection& selection, - const query::result& query_result, - const std::optional& attrs_to_get, - uint64_t* item_length_in_bytes) { - rjson::value item = rjson::empty_object(); - - cql3::selection::result_set_builder builder(selection, gc_clock::now()); - query::result_view::consume(query_result, slice, cql3::selection::result_set_builder::visitor(builder, *schema, selection)); - - auto result_set = builder.build(); - if (result_set->empty()) { - if (item_length_in_bytes) { - // empty results is counted as having a minimal length (e.g. 1 byte). - (*item_length_in_bytes) += 1; - } - // If there is no matching item, we're supposed to return an empty - // object without an Item member - not one with an empty Item member - return {}; - } - if (result_set->size() > 1) { - // If the result set contains multiple rows, the code should have - // called describe_multi_item(), not this function. 
- throw std::logic_error("describe_single_item() asked to describe multiple items"); - } - describe_single_item(selection, *result_set->rows().begin(), attrs_to_get, item, item_length_in_bytes); - return item; -} - -future> executor::describe_multi_item(schema_ptr schema, - const query::partition_slice&& slice, - shared_ptr selection, - foreign_ptr> query_result, - shared_ptr> attrs_to_get, - noncopyable_function item_callback) { - cql3::selection::result_set_builder builder(*selection, gc_clock::now()); - query::result_view::consume(*query_result, slice, cql3::selection::result_set_builder::visitor(builder, *schema, *selection)); - auto result_set = builder.build(); - std::vector ret; - for (auto& result_row : result_set->rows()) { - rjson::value item = rjson::empty_object(); - uint64_t item_length_in_bytes = 0; - describe_single_item(*selection, result_row, *attrs_to_get, item, &item_length_in_bytes); - item_callback(item_length_in_bytes); - ret.push_back(std::move(item)); - co_await coroutine::maybe_yield(); - } - co_return ret; -} - static bool check_needs_read_before_write(const parsed::value& v) { return std::visit(overloaded_functor { [&] (const parsed::constant& c) -> bool { @@ -4140,6 +3576,9 @@ public: // Saved list of GSI keys in the table being updated, used for // validate_value_if_index_key() std::unordered_map _key_attributes; + // Saved map of vector index target attributes to their dimensions, used + // for validate_value_if_vector_index_attribute() + std::unordered_map _vector_index_attributes; parsed::condition_expression _condition_expression; @@ -4245,6 +3684,7 @@ update_item_operation::update_item_operation(parsed::expression_cache& parsed_ex _key_attributes = si_key_attributes(proxy.data_dictionary().find_table( _schema->ks_name(), _schema->cf_name())); + _vector_index_attributes = vector_index_attributes(*_schema); } // These are the cases where update_item_operation::apply() needs to use @@ -4541,7 +3981,14 @@ void 
update_item_operation::update_attribute(bytes&& column_name, const rjson::v } else { // This attribute may be a key column of one of the GSIs or LSIs, // in which case there are some limitations on the value. - validate_value_if_index_key(_key_attributes, column_name, json_value); + if (!_key_attributes.empty()) { + validate_value_if_index_key(_key_attributes, column_name, json_value); + } + // This attribute may also be a vector index target column, + // in which case it must be a list of the right number of floats. + if (!_vector_index_attributes.empty()) { + validate_value_if_vector_index_attribute(_vector_index_attributes, column_name, json_value); + } modified_attrs.put(std::move(column_name), serialize_item(json_value), ts); } } @@ -4819,1383 +4266,6 @@ future executor::update_item(client_state& client co_return res; } -// Check according to the request's "ConsistentRead" field, which consistency -// level we need to use for the read. The field can be True for strongly -// consistent reads, or False for eventually consistent reads, or if this -// field is absence, we default to eventually consistent reads. -// In Scylla, eventually-consistent reads are implemented as consistency -// level LOCAL_ONE, and strongly-consistent reads as LOCAL_QUORUM. -static db::consistency_level get_read_consistency(const rjson::value& request) { - const rjson::value* consistent_read_value = rjson::find(request, "ConsistentRead"); - bool consistent_read = false; - if (consistent_read_value && !consistent_read_value->IsNull()) { - if (consistent_read_value->IsBool()) { - consistent_read = consistent_read_value->GetBool(); - } else { - throw api_error::validation("ConsistentRead flag must be a boolean"); - } - } - return consistent_read ? db::consistency_level::LOCAL_QUORUM : db::consistency_level::LOCAL_ONE; -} - -// describe_item() wraps the result of describe_single_item() by a map -// as needed by the GetItem request. 
It should not be used for other purposes, -// use describe_single_item() instead. -static rjson::value describe_item(schema_ptr schema, - const query::partition_slice& slice, - const cql3::selection::selection& selection, - const query::result& query_result, - const std::optional& attrs_to_get, - consumed_capacity_counter& consumed_capacity, - uint64_t& metric) { - std::optional opt_item = executor::describe_single_item(std::move(schema), slice, selection, std::move(query_result), attrs_to_get, &consumed_capacity._total_bytes); - rjson::value item_descr = rjson::empty_object(); - if (opt_item) { - rjson::add(item_descr, "Item", std::move(*opt_item)); - } - consumed_capacity.add_consumed_capacity_to_response_if_needed(item_descr); - metric += consumed_capacity.get_half_units(); - return item_descr; -} - -future executor::get_item(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request, std::unique_ptr& audit_info) { - _stats.api_operations.get_item++; - auto start_time = std::chrono::steady_clock::now(); - elogger.trace("Getting item {}", request); - - schema_ptr schema = get_table(_proxy, request); - lw_shared_ptr per_table_stats = get_stats_from_schema(_proxy, *schema); - per_table_stats->api_operations.get_item++; - tracing::add_alternator_table_name(trace_state, schema->cf_name()); - - rjson::value& query_key = request["Key"]; - db::consistency_level cl = get_read_consistency(request); - - maybe_audit(audit_info, audit::statement_category::QUERY, schema->ks_name(), schema->cf_name(), "GetItem", request, cl); - - partition_key pk = pk_from_json(query_key, schema); - dht::partition_range_vector partition_ranges{dht::partition_range(dht::decorate_key(*schema, pk))}; - - std::vector bounds; - if (schema->clustering_key_size() == 0) { - bounds.push_back(query::clustering_range::make_open_ended_both_sides()); - } else { - clustering_key ck = ck_from_json(query_key, schema); - 
bounds.push_back(query::clustering_range::make_singular(std::move(ck))); - } - check_key(query_key, schema); - - //TODO(sarna): It would be better to fetch only some attributes of the map, not all - auto regular_columns = - schema->regular_columns() | std::views::transform(&column_definition::id) - | std::ranges::to(); - - auto selection = cql3::selection::selection::wildcard(schema); - - auto partition_slice = query::partition_slice(std::move(bounds), {}, std::move(regular_columns), selection->get_query_options()); - auto command = ::make_lw_shared(schema->id(), schema->version(), partition_slice, _proxy.get_max_result_size(partition_slice), - query::tombstone_limit(_proxy.get_tombstone_limit())); - - std::unordered_set used_attribute_names; - auto attrs_to_get = calculate_attrs_to_get(request, *_parsed_expression_cache, used_attribute_names); - const rjson::value* expression_attribute_names = rjson::find(request, "ExpressionAttributeNames"); - verify_all_are_used(expression_attribute_names, used_attribute_names, "ExpressionAttributeNames", "GetItem"); - rcu_consumed_capacity_counter add_capacity(request, cl == db::consistency_level::LOCAL_QUORUM); - co_await verify_permission(_enforce_authorization, _warn_authorization, client_state, schema, auth::permission::SELECT, _stats); - service::storage_proxy::coordinator_query_result qr = - co_await _proxy.query( - schema, std::move(command), std::move(partition_ranges), cl, - service::storage_proxy::coordinator_query_options(executor::default_timeout(), std::move(permit), client_state, trace_state)); - per_table_stats->api_operations.get_item_latency.mark(std::chrono::steady_clock::now() - start_time); - _stats.api_operations.get_item_latency.mark(std::chrono::steady_clock::now() - start_time); - uint64_t rcu_half_units = 0; - rjson::value res = describe_item(schema, partition_slice, *selection, *qr.query_result, std::move(attrs_to_get), add_capacity, rcu_half_units); - per_table_stats->rcu_half_units_total += 
rcu_half_units; - _stats.rcu_half_units_total += rcu_half_units; - // Update item size metrics only if we found an item. - if (qr.query_result->row_count().value_or(0) > 0) { - per_table_stats->operation_sizes.get_item_op_size_kb.add(bytes_to_kb_ceil(add_capacity._total_bytes)); - } - co_return rjson::print(std::move(res)); -} - -static void check_big_object(const rjson::value& val, int& size_left); -static void check_big_array(const rjson::value& val, int& size_left); - -bool is_big(const rjson::value& val, int big_size) { - if (val.IsString()) { - return ssize_t(val.GetStringLength()) > big_size; - } else if (val.IsObject()) { - check_big_object(val, big_size); - return big_size < 0; - } else if (val.IsArray()) { - check_big_array(val, big_size); - return big_size < 0; - } - return false; -} - -static void check_big_array(const rjson::value& val, int& size_left) { - // Assume a fixed size of 10 bytes for each number, boolean, etc., or - // beginning of a sub-object. This doesn't have to be accurate. - size_left -= 10 * val.Size(); - for (const auto& v : val.GetArray()) { - if (size_left < 0) { - return; - } - // Note that we avoid recursive calls for the leaves (anything except - // array or object) because usually those greatly outnumber the trunk. 
- if (v.IsString()) { - size_left -= v.GetStringLength(); - } else if (v.IsObject()) { - check_big_object(v, size_left); - } else if (v.IsArray()) { - check_big_array(v, size_left); - } - } -} - -static void check_big_object(const rjson::value& val, int& size_left) { - size_left -= 10 * val.MemberCount(); - for (const auto& m : val.GetObject()) { - if (size_left < 0) { - return; - } - size_left -= m.name.GetStringLength(); - if (m.value.IsString()) { - size_left -= m.value.GetStringLength(); - } else if (m.value.IsObject()) { - check_big_object(m.value, size_left); - } else if (m.value.IsArray()) { - check_big_array(m.value, size_left); - } - } -} - -future executor::batch_get_item(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request, std::unique_ptr& audit_info) { - // FIXME: In this implementation, an unbounded batch size can cause - // unbounded response JSON object to be buffered in memory, unbounded - // parallelism of the requests, and unbounded amount of non-preemptable - // work in the following loops. So we should limit the batch size, and/or - // the response size, as DynamoDB does. - _stats.api_operations.batch_get_item++; - rjson::value& request_items = request["RequestItems"]; - auto start_time = std::chrono::steady_clock::now(); - // We need to validate all the parameters before starting any asynchronous - // query, and fail the entire request on any parse error. So we parse all - // the input into our own vector "requests", each element a table_requests - // listing all the request aimed at a single table. For efficiency, inside - // each table_requests we further group together all reads going to the - // same partition, so we can later send them together. 
- bool should_add_rcu = rcu_consumed_capacity_counter::should_add_capacity(request); - struct table_requests { - schema_ptr schema; - db::consistency_level cl; - ::shared_ptr> attrs_to_get; - // clustering_keys keeps a sorted set of clustering keys. It must - // be sorted for the read below (see #10827). Additionally each - // clustering key is mapped to the original rjson::value "Key". - using clustering_keys = std::map; - std::unordered_map requests; - table_requests(schema_ptr s) - : schema(std::move(s)) - , requests(8, partition_key::hashing(*schema), partition_key::equality(*schema)) - {} - void add(rjson::value& key) { - auto pk = pk_from_json(key, schema); - auto it = requests.find(pk); - if (it == requests.end()) { - it = requests.emplace(pk, clustering_key::less_compare(*schema)).first; - } - auto ck = ck_from_json(key, schema); - if (auto [_, inserted] = it->second.emplace(ck, &key); !inserted) { - throw api_error::validation("Provided list of item keys contains duplicates"); - } - } - }; - std::vector requests; - uint batch_size = 0; - for (auto it = request_items.MemberBegin(); it != request_items.MemberEnd(); ++it) { - table_requests rs(get_table_from_batch_request(_proxy, it)); - tracing::add_alternator_table_name(trace_state, rs.schema->cf_name()); - rs.cl = get_read_consistency(it->value); - std::unordered_set used_attribute_names; - rs.attrs_to_get = ::make_shared>(calculate_attrs_to_get(it->value, *_parsed_expression_cache, used_attribute_names)); - const rjson::value* expression_attribute_names = rjson::find(it->value, "ExpressionAttributeNames"); - verify_all_are_used(expression_attribute_names, used_attribute_names,"ExpressionAttributeNames", "GetItem"); - auto& keys = (it->value)["Keys"]; - for (rjson::value& key : keys.GetArray()) { - rs.add(key); - check_key(key, rs.schema); - } - batch_size += rs.requests.size(); - requests.emplace_back(std::move(rs)); - } - - for (const table_requests& tr : requests) { - co_await 
verify_permission(_enforce_authorization, _warn_authorization, client_state, tr.schema, auth::permission::SELECT, _stats); - } - - _stats.api_operations.batch_get_item_batch_total += batch_size; - _stats.api_operations.batch_get_item_histogram.add(batch_size); - // If we got here, all "requests" are valid, so let's start the - // requests for the different partitions all in parallel. - std::vector>> response_futures; - std::vector consumed_rcu_half_units_per_table(requests.size()); - for (size_t i = 0; i < requests.size(); i++) { - const table_requests& rs = requests[i]; - bool is_quorum = rs.cl == db::consistency_level::LOCAL_QUORUM; - lw_shared_ptr per_table_stats = get_stats_from_schema(_proxy, *rs.schema); - per_table_stats->api_operations.batch_get_item_histogram.add(rs.requests.size()); - for (const auto& [pk, cks] : rs.requests) { - dht::partition_range_vector partition_ranges{dht::partition_range(dht::decorate_key(*rs.schema, pk))}; - std::vector bounds; - if (rs.schema->clustering_key_size() == 0) { - bounds.push_back(query::clustering_range::make_open_ended_both_sides()); - } else { - for (auto& ck : cks) { - bounds.push_back(query::clustering_range::make_singular(ck.first)); - } - } - auto regular_columns = - rs.schema->regular_columns() | std::views::transform(&column_definition::id) - | std::ranges::to(); - auto selection = cql3::selection::selection::wildcard(rs.schema); - auto partition_slice = query::partition_slice(std::move(bounds), {}, std::move(regular_columns), selection->get_query_options()); - auto command = ::make_lw_shared(rs.schema->id(), rs.schema->version(), partition_slice, _proxy.get_max_result_size(partition_slice), - query::tombstone_limit(_proxy.get_tombstone_limit())); - command->allow_limit = db::allow_per_partition_rate_limit::yes; - const auto item_callback = [is_quorum, per_table_stats, &rcus_per_table = consumed_rcu_half_units_per_table[i]](uint64_t size) { - rcus_per_table += 
rcu_consumed_capacity_counter::get_half_units(size, is_quorum); - // Update item size only if the item exists. - if (size > 0) { - per_table_stats->operation_sizes.batch_get_item_op_size_kb.add(bytes_to_kb_ceil(size)); - } - }; - future> f = _proxy.query(rs.schema, std::move(command), std::move(partition_ranges), rs.cl, - service::storage_proxy::coordinator_query_options(executor::default_timeout(), permit, client_state, trace_state)).then( - [schema = rs.schema, partition_slice = std::move(partition_slice), selection = std::move(selection), attrs_to_get = rs.attrs_to_get, item_callback = std::move(item_callback)] (service::storage_proxy::coordinator_query_result qr) mutable { - utils::get_local_injector().inject("alternator_batch_get_item", [] { throw std::runtime_error("batch_get_item injection"); }); - return describe_multi_item(std::move(schema), std::move(partition_slice), std::move(selection), std::move(qr.query_result), std::move(attrs_to_get), std::move(item_callback)); - }); - response_futures.push_back(std::move(f)); - } - } - - // Wait for all requests to complete, and then return the response. - // In case of full failure (no reads succeeded), an arbitrary error - // from one of the operations will be returned. - bool some_succeeded = false; - std::exception_ptr eptr; - std::set table_names; // for auditing - // FIXME: will_log() here doesn't pass keyspace/table, so keyspace-level audit - // filtering is bypassed — a batch spanning multiple tables is audited as a whole. 
- bool should_audit = _audit.local_is_initialized() && _audit.local().will_log(audit::statement_category::QUERY); - rjson::value response = rjson::empty_object(); - rjson::add(response, "Responses", rjson::empty_object()); - rjson::add(response, "UnprocessedKeys", rjson::empty_object()); - auto fut_it = response_futures.begin(); - rjson::value consumed_capacity = rjson::empty_array(); - for (size_t i = 0; i < requests.size(); i++) { - const table_requests& rs = requests[i]; - std::string table = table_name(*rs.schema); - if (should_audit) { - table_names.insert(table); - } - for (const auto& [_, cks] : rs.requests) { - auto& fut = *fut_it; - ++fut_it; - try { - std::vector results = co_await std::move(fut); - some_succeeded = true; - if (!response["Responses"].HasMember(table)) { - rjson::add_with_string_name(response["Responses"], table, rjson::empty_array()); - } - for (rjson::value& json : results) { - rjson::push_back(response["Responses"][table], std::move(json)); - } - } catch(...) { - eptr = std::current_exception(); - // This read of potentially several rows in one partition, - // failed. We need to add the row key(s) to UnprocessedKeys. - if (!response["UnprocessedKeys"].HasMember(table)) { - // Add the table's entry in UnprocessedKeys. Need to copy - // all the table's parameters from the request except the - // Keys field, which we start empty and then build below. 
- rjson::add_with_string_name(response["UnprocessedKeys"], table, rjson::empty_object()); - rjson::value& unprocessed_item = response["UnprocessedKeys"][table]; - rjson::value& request_item = request_items[table]; - for (auto it = request_item.MemberBegin(); it != request_item.MemberEnd(); ++it) { - if (it->name != "Keys") { - rjson::add_with_string_name(unprocessed_item, - rjson::to_string_view(it->name), rjson::copy(it->value)); - } - } - rjson::add_with_string_name(unprocessed_item, "Keys", rjson::empty_array()); - } - for (auto& ck : cks) { - rjson::push_back(response["UnprocessedKeys"][table]["Keys"], std::move(*ck.second)); - } - } - } - uint64_t rcu_half_units = consumed_rcu_half_units_per_table[i]; - _stats.rcu_half_units_total += rcu_half_units; - lw_shared_ptr per_table_stats = get_stats_from_schema(_proxy, *rs.schema); - per_table_stats->rcu_half_units_total += rcu_half_units; - if (should_add_rcu) { - rjson::value entry = rjson::empty_object(); - rjson::add(entry, "TableName", table); - rjson::add(entry, "CapacityUnits", rcu_half_units*0.5); - rjson::push_back(consumed_capacity, std::move(entry)); - } - } - - if (should_add_rcu) { - rjson::add(response, "ConsumedCapacity", std::move(consumed_capacity)); - } - elogger.trace("Unprocessed keys: {}", response["UnprocessedKeys"]); - // NOTE: Each table in the batch has its own CL (set by get_read_consistency()), - // but the audit entry records a single CL for the whole batch. We use ANY as a - // placeholder to indicate "mixed / not applicable". 
- // FIXME: Auditing is executed only for a complete success - maybe_audit(audit_info, audit::statement_category::QUERY, "", - print_names_for_audit(table_names), "BatchGetItem", request, db::consistency_level::ANY); - if (!some_succeeded && eptr) { - co_await coroutine::return_exception_ptr(std::move(eptr)); - } - auto duration = std::chrono::steady_clock::now() - start_time; - _stats.api_operations.batch_get_item_latency.mark(duration); - for (const table_requests& rs : requests) { - lw_shared_ptr per_table_stats = get_stats_from_schema(_proxy, *rs.schema); - per_table_stats->api_operations.batch_get_item_latency.mark(duration); - } - if (is_big(response)) { - co_return make_streamed(std::move(response)); - } else { - co_return rjson::print(std::move(response)); - } -} - -// "filter" represents a condition that can be applied to individual items -// read by a Query or Scan operation, to decide whether to keep the item. -// A filter is constructed from a Query or Scan request. This uses the -// relevant fields in the query (FilterExpression or QueryFilter/ScanFilter + -// ConditionalOperator). These fields are pre-checked and pre-parsed as much -// as possible, to ensure that later checking of many items is efficient. -class filter { -private: - // Holding QueryFilter/ScanFilter + ConditionalOperator: - struct conditions_filter { - bool require_all; - rjson::value conditions; - }; - // Holding a parsed FilterExpression: - struct expression_filter { - parsed::condition_expression expression; - }; - std::optional> _imp; -public: - // Filtering for Scan and Query are very similar, but there are some - // small differences, especially the names of the request attributes. - enum class request_type { SCAN, QUERY }; - // Note that a filter does not store pointers to the query used to - // construct it. 
- filter(parsed::expression_cache& parsed_expression_cache, const rjson::value& request, request_type rt, - std::unordered_set& used_attribute_names, - std::unordered_set& used_attribute_values); - bool check(const rjson::value& item) const; - bool filters_on(std::string_view attribute) const; - // for_filters_on() runs the given function on the attributes that the - // filter works on. It may run for the same attribute more than once if - // used more than once in the filter. - void for_filters_on(const noncopyable_function& func) const; - operator bool() const { return bool(_imp); } -}; - -filter::filter(parsed::expression_cache& parsed_expression_cache, const rjson::value& request, request_type rt, - std::unordered_set& used_attribute_names, - std::unordered_set& used_attribute_values) { - const rjson::value* expression = rjson::find(request, "FilterExpression"); - const char* conditions_attribute = (rt == request_type::SCAN) ? "ScanFilter" : "QueryFilter"; - const rjson::value* conditions = rjson::find(request, conditions_attribute); - auto conditional_operator = get_conditional_operator(request); - if (conditional_operator != conditional_operator_type::MISSING && - (!conditions || (conditions->IsObject() && conditions->GetObject().ObjectEmpty()))) { - throw api_error::validation( - format("'ConditionalOperator' parameter cannot be specified for missing or empty {}", - conditions_attribute)); - } - if (expression && conditions) { - throw api_error::validation( - format("FilterExpression and {} are not allowed together", conditions_attribute)); - } - if (expression) { - if (!expression->IsString()) { - throw api_error::validation("FilterExpression must be a string"); - } - if (expression->GetStringLength() == 0) { - throw api_error::validation("FilterExpression must not be empty"); - } - if (rjson::find(request, "AttributesToGet")) { - throw api_error::validation("Cannot use both old-style and new-style parameters in same request: FilterExpression and 
AttributesToGet"); - } - try { - auto parsed = parsed_expression_cache.parse_condition_expression(rjson::to_string_view(*expression), "FilterExpression"); - const rjson::value* expression_attribute_names = rjson::find(request, "ExpressionAttributeNames"); - const rjson::value* expression_attribute_values = rjson::find(request, "ExpressionAttributeValues"); - resolve_condition_expression(parsed, - expression_attribute_names, expression_attribute_values, - used_attribute_names, used_attribute_values); - _imp = expression_filter { std::move(parsed) }; - } catch(expressions_syntax_error& e) { - throw api_error::validation(e.what()); - } - } - if (conditions) { - if (rjson::find(request, "ProjectionExpression")) { - throw api_error::validation(format("Cannot use both old-style and new-style parameters in same request: {} and ProjectionExpression", conditions_attribute)); - } - bool require_all = conditional_operator != conditional_operator_type::OR; - _imp = conditions_filter { require_all, rjson::copy(*conditions) }; - } -} - -bool filter::check(const rjson::value& item) const { - if (!_imp) { - return true; - } - return std::visit(overloaded_functor { - [&] (const conditions_filter& f) -> bool { - return verify_condition(f.conditions, f.require_all, &item); - }, - [&] (const expression_filter& f) -> bool { - return verify_condition_expression(f.expression, &item); - } - }, *_imp); -} - -bool filter::filters_on(std::string_view attribute) const { - if (!_imp) { - return false; - } - return std::visit(overloaded_functor { - [&] (const conditions_filter& f) -> bool { - for (auto it = f.conditions.MemberBegin(); it != f.conditions.MemberEnd(); ++it) { - if (rjson::to_string_view(it->name) == attribute) { - return true; - } - } - return false; - }, - [&] (const expression_filter& f) -> bool { - return condition_expression_on(f.expression, attribute); - } - }, *_imp); -} - -void filter::for_filters_on(const noncopyable_function& func) const { - if (_imp) { - 
std::visit(overloaded_functor { - [&] (const conditions_filter& f) -> void { - for (auto it = f.conditions.MemberBegin(); it != f.conditions.MemberEnd(); ++it) { - func(rjson::to_string_view(it->name)); - } - }, - [&] (const expression_filter& f) -> void { - return for_condition_expression_on(f.expression, func); - } - }, *_imp); - } -} - -class describe_items_visitor { - typedef std::vector columns_t; - const columns_t& _columns; - const std::optional& _attrs_to_get; - std::unordered_set _extra_filter_attrs; - const filter& _filter; - typename columns_t::const_iterator _column_it; - rjson::value _item; - // _items is a chunked_vector instead of a RapidJson array - // (rjson::value) because unfortunately RapidJson arrays are stored - // contiguously in memory, and cause large allocations when a Query/Scan - // returns a long list of short items (issue #23535). - utils::chunked_vector _items; - size_t _scanned_count; - -public: - describe_items_visitor(const columns_t& columns, const std::optional& attrs_to_get, filter& filter) - : _columns(columns) - , _attrs_to_get(attrs_to_get) - , _filter(filter) - , _column_it(columns.begin()) - , _item(rjson::empty_object()) - , _scanned_count(0) - { - // _filter.check() may need additional attributes not listed in - // _attrs_to_get (i.e., not requested as part of the output). - // We list those in _extra_filter_attrs. We will include them in - // the JSON but take them out before finally returning the JSON. 
- if (_attrs_to_get) { - _filter.for_filters_on([&] (std::string_view attr) { - std::string a(attr); // no heterogeneous maps searches :-( - if (!_attrs_to_get->contains(a)) { - _extra_filter_attrs.emplace(std::move(a)); - } - }); - } - } - - void start_row() { - _column_it = _columns.begin(); - } - - void accept_value(managed_bytes_view_opt result_bytes_view) { - if (!result_bytes_view) { - ++_column_it; - return; - } - result_bytes_view->with_linearized([this] (bytes_view bv) { - std::string column_name = (*_column_it)->name_as_text(); - if (column_name != executor::ATTRS_COLUMN_NAME) { - if (!_attrs_to_get || _attrs_to_get->contains(column_name) || _extra_filter_attrs.contains(column_name)) { - if (!_item.HasMember(column_name.c_str())) { - rjson::add_with_string_name(_item, column_name, rjson::empty_object()); - } - rjson::value& field = _item[column_name.c_str()]; - rjson::add_with_string_name(field, type_to_string((*_column_it)->type), json_key_column_value(bv, **_column_it)); - } - } else { - auto deserialized = attrs_type()->deserialize(bv); - auto keys_and_values = value_cast(deserialized); - for (auto entry : keys_and_values) { - std::string attr_name = value_cast(entry.first); - if (!_attrs_to_get || _attrs_to_get->contains(attr_name) || _extra_filter_attrs.contains(attr_name)) { - bytes value = value_cast(entry.second); - // Even if _attrs_to_get asked to keep only a part of a - // top-level attribute, we keep the entire attribute - // at this stage, because the item filter might still - // need the other parts (it was easier for us to keep - // extra_filter_attrs at top-level granularity). We'll - // filter the unneeded parts after item filtering. - rjson::add_with_string_name(_item, attr_name, deserialize_item(value)); - } - } - } - }); - ++_column_it; - } - - void end_row() { - if (_filter.check(_item)) { - // As noted above, we kept entire top-level attributes listed in - // _attrs_to_get. We may need to only keep parts of them. 
- if (_attrs_to_get) { - for (const auto& attr: *_attrs_to_get) { - // If !attr.has_value() it means we were asked not to keep - // attr entirely, but just parts of it. - if (!attr.second.has_value()) { - rjson::value* toplevel= rjson::find(_item, attr.first); - if (toplevel && !hierarchy_filter(*toplevel, attr.second)) { - rjson::remove_member(_item, attr.first); - } - } - } - } - // Remove the extra attributes _extra_filter_attrs which we had - // to add just for the filter, and not requested to be returned: - for (const auto& attr : _extra_filter_attrs) { - rjson::remove_member(_item, attr); - } - - _items.push_back(std::move(_item)); - } - _item = rjson::empty_object(); - ++_scanned_count; - } - - utils::chunked_vector get_items() && { - return std::move(_items); - } - - size_t get_scanned_count() { - return _scanned_count; - } -}; - -// describe_items() returns a JSON object that includes members "Count" -// and "ScannedCount", but *not* "Items" - that is returned separately -// as a chunked_vector to avoid large contiguous allocations which -// RapidJSON does of its array. The caller should add "Items" to the -// returned JSON object if needed, or print it separately. -// The returned chunked_vector (the items) is std::optional<>, because -// the user may have requested only to count items, and not return any -// items - which is different from returning an empty list of items. 
-static future>, size_t>> describe_items( - const cql3::selection::selection& selection, - std::unique_ptr result_set, - std::optional&& attrs_to_get, - filter&& filter) { - describe_items_visitor visitor(selection.get_columns(), attrs_to_get, filter); - co_await result_set->visit_gently(visitor); - auto scanned_count = visitor.get_scanned_count(); - utils::chunked_vector items = std::move(visitor).get_items(); - rjson::value items_descr = rjson::empty_object(); - auto size = items.size(); - rjson::add(items_descr, "Count", rjson::value(size)); - rjson::add(items_descr, "ScannedCount", rjson::value(scanned_count)); - // If attrs_to_get && attrs_to_get->empty(), this means the user asked not - // to get any attributes (i.e., a Scan or Query with Select=COUNT) and we - // shouldn't return "Items" at all. - // TODO: consider optimizing the case of Select=COUNT without a filter. - // In that case, we currently build a list of empty items and here drop - // it. We could just count the items and not bother with the empty items. - // (However, remember that when we do have a filter, we need the items). 
- std::optional> opt_items; - if (!attrs_to_get || !attrs_to_get->empty()) { - opt_items = std::move(items); - } - co_return std::tuple(std::move(items_descr), std::move(opt_items), size); -} - -static rjson::value encode_paging_state(const schema& schema, const service::pager::paging_state& paging_state) { - rjson::value last_evaluated_key = rjson::empty_object(); - std::vector exploded_pk = paging_state.get_partition_key().explode(); - auto exploded_pk_it = exploded_pk.begin(); - for (const column_definition& cdef : schema.partition_key_columns()) { - rjson::add_with_string_name(last_evaluated_key, std::string_view(cdef.name_as_text()), rjson::empty_object()); - rjson::value& key_entry = last_evaluated_key[cdef.name_as_text()]; - rjson::add_with_string_name(key_entry, type_to_string(cdef.type), json_key_column_value(*exploded_pk_it, cdef)); - ++exploded_pk_it; - } - auto pos = paging_state.get_position_in_partition(); - if (pos.has_key()) { - // Alternator itself allows at most one column in clustering key, but - // user can use Alternator api to access system tables which might have - // multiple clustering key columns. So we need to handle that case here. - auto cdef_it = schema.clustering_key_columns().begin(); - for(const auto &exploded_ck : pos.key().explode()) { - rjson::add_with_string_name(last_evaluated_key, std::string_view(cdef_it->name_as_text()), rjson::empty_object()); - rjson::value& key_entry = last_evaluated_key[cdef_it->name_as_text()]; - rjson::add_with_string_name(key_entry, type_to_string(cdef_it->type), json_key_column_value(exploded_ck, *cdef_it)); - ++cdef_it; - } - } - // To avoid possible conflicts (and thus having to reserve these names) we - // avoid adding the weight and region fields of the position to the paging - // state. Alternator will never need these as it doesn't have range - // tombstones (the only thing that can generate a position other than at(row)). 
- // We conditionally include these fields when reading CQL tables through alternator. - if (!is_alternator_keyspace(schema.ks_name()) && (!pos.has_key() || pos.get_bound_weight() != bound_weight::equal)) { - rjson::add_with_string_name(last_evaluated_key, scylla_paging_region, rjson::empty_object()); - rjson::add(last_evaluated_key[scylla_paging_region.data()], "S", rjson::from_string(fmt::to_string(pos.region()))); - rjson::add_with_string_name(last_evaluated_key, scylla_paging_weight, rjson::empty_object()); - rjson::add(last_evaluated_key[scylla_paging_weight.data()], "N", static_cast(pos.get_bound_weight())); - } - return last_evaluated_key; -} - -// RapidJSON allocates arrays contiguously in memory, so we want to avoid -// returning a large number of items as a single rapidjson array, and use -// a chunked_vector instead. The following constant is an arbitrary cutoff -// point for when to switch from a rapidjson array to a chunked_vector. -static constexpr int max_items_for_rapidjson_array = 256; - -static future do_query(service::storage_proxy& proxy, - schema_ptr table_schema, - const rjson::value* exclusive_start_key, - dht::partition_range_vector partition_ranges, - std::vector ck_bounds, - std::optional attrs_to_get, - uint32_t limit, - db::consistency_level cl, - filter filter, - query::partition_slice::option_set custom_opts, - service::client_state& client_state, - alternator::stats& stats, - tracing::trace_state_ptr trace_state, - service_permit permit, - bool enforce_authorization, - bool warn_authorization) { - lw_shared_ptr old_paging_state = nullptr; - - tracing::trace(trace_state, "Performing a database query"); - - // Reverse the schema and the clustering bounds as the underlying code expects - // reversed queries in the native reversed format. 
- auto query_schema = table_schema; - const bool reversed = custom_opts.contains(); - if (reversed) { - query_schema = table_schema->get_reversed(); - - std::reverse(ck_bounds.begin(), ck_bounds.end()); - for (auto& bound : ck_bounds) { - bound = query::reverse(bound); - } - } - - if (exclusive_start_key) { - partition_key pk = pk_from_json(*exclusive_start_key, table_schema); - auto pos = position_in_partition::for_partition_start(); - if (table_schema->clustering_key_size() > 0) { - pos = pos_from_json(*exclusive_start_key, table_schema); - } - old_paging_state = make_lw_shared(pk, pos, query::max_partitions, query_id::create_null_id(), service::pager::paging_state::replicas_per_token_range{}, std::nullopt, 0); - } - - co_await verify_permission(enforce_authorization, warn_authorization, client_state, table_schema, auth::permission::SELECT, stats); - - auto regular_columns = - table_schema->regular_columns() | std::views::transform(&column_definition::id) - | std::ranges::to(); - auto static_columns = - table_schema->static_columns() | std::views::transform(&column_definition::id) - | std::ranges::to(); - auto selection = cql3::selection::selection::wildcard(table_schema); - query::partition_slice::option_set opts = selection->get_query_options(); - opts.add(custom_opts); - auto partition_slice = query::partition_slice(std::move(ck_bounds), std::move(static_columns), std::move(regular_columns), opts); - auto command = ::make_lw_shared(query_schema->id(), query_schema->version(), partition_slice, proxy.get_max_result_size(partition_slice), - query::tombstone_limit(proxy.get_tombstone_limit())); - - elogger.trace("Executing read query (reversed {}): table schema {}, query schema {}", partition_slice.is_reversed(), table_schema->version(), query_schema->version()); - - auto query_state_ptr = std::make_unique(client_state, trace_state, std::move(permit)); - - // FIXME: should be moved above, set on opts, so get_max_result_size knows it? 
- command->slice.options.set(); - auto query_options = std::make_unique(cl, std::vector{}); - query_options = std::make_unique(std::move(query_options), std::move(old_paging_state)); - auto p = service::pager::query_pagers::pager(proxy, query_schema, selection, *query_state_ptr, *query_options, command, std::move(partition_ranges), nullptr); - - std::unique_ptr rs = co_await p->fetch_page(limit, gc_clock::now(), executor::default_timeout()); - if (!p->is_exhausted()) { - rs->get_metadata().set_paging_state(p->state()); - } - auto paging_state = rs->get_metadata().paging_state(); - bool has_filter = filter; - auto [items_descr, opt_items, size] = co_await describe_items(*selection, std::move(rs), std::move(attrs_to_get), std::move(filter)); - if (paging_state) { - rjson::add(items_descr, "LastEvaluatedKey", encode_paging_state(*table_schema, *paging_state)); - } - if (has_filter) { - stats.cql_stats.filtered_rows_read_total += p->stats().rows_read_total; - // update our "filtered_row_matched_total" for all the rows matched, despited the filter - stats.cql_stats.filtered_rows_matched_total += size; - } - if (opt_items) { - if (opt_items->size() >= max_items_for_rapidjson_array) { - // There are many items, better print the JSON and the array of - // items (opt_items) separately to avoid RapidJSON's contiguous - // allocation of arrays. - co_return make_streamed_with_extra_array(std::move(items_descr), "Items", std::move(*opt_items)); - } - // There aren't many items in the chunked vector opt_items, - // let's just insert them into the JSON object and print the - // full JSON normally. 
- rjson::value items_json = rjson::empty_array(); - for (auto& item : *opt_items) { - rjson::push_back(items_json, std::move(item)); - } - rjson::add(items_descr, "Items", std::move(items_json)); - } - if (is_big(items_descr)) { - co_return make_streamed(std::move(items_descr)); - } - co_return rjson::print(std::move(items_descr)); -} - -static dht::token token_for_segment(int segment, int total_segments) { - throwing_assert(total_segments > 1 && segment >= 0 && segment < total_segments); - uint64_t delta = std::numeric_limits::max() / total_segments; - return dht::token::from_int64(std::numeric_limits::min() + delta * segment); -} - -static dht::partition_range get_range_for_segment(int segment, int total_segments) { - if (total_segments == 1) { - return dht::partition_range::make_open_ended_both_sides(); - } - if (segment == 0) { - dht::token ending_token = token_for_segment(1, total_segments); - return dht::partition_range::make_ending_with( - dht::partition_range::bound(dht::ring_position::ending_at(ending_token), false)); - } else if (segment == total_segments - 1) { - dht::token starting_token = token_for_segment(segment, total_segments); - return dht::partition_range::make_starting_with( - dht::partition_range::bound(dht::ring_position::starting_at(starting_token))); - } else { - dht::token starting_token = token_for_segment(segment, total_segments); - dht::token ending_token = token_for_segment(segment + 1, total_segments); - return dht::partition_range::make( - dht::partition_range::bound(dht::ring_position::starting_at(starting_token)), - dht::partition_range::bound(dht::ring_position::ending_at(ending_token), false) - ); - } -} - -future executor::scan(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request, std::unique_ptr& audit_info) { - _stats.api_operations.scan++; - elogger.trace("Scanning {}", request); - - auto [schema, table_type] = get_table_or_view(_proxy, request); - db::consistency_level 
cl = get_read_consistency(request); - - maybe_audit(audit_info, audit::statement_category::QUERY, schema->ks_name(), schema->cf_name(), "Scan", request, cl); - - tracing::add_alternator_table_name(trace_state, schema->cf_name()); - get_stats_from_schema(_proxy, *schema)->api_operations.scan++; - auto segment = get_int_attribute(request, "Segment"); - auto total_segments = get_int_attribute(request, "TotalSegments"); - if (segment || total_segments) { - if (!segment || !total_segments) { - return make_ready_future(api_error::validation( - "Both Segment and TotalSegments attributes need to be present for a parallel scan")); - } - if (*segment < 0 || *segment >= *total_segments) { - return make_ready_future(api_error::validation( - "Segment must be non-negative and less than TotalSegments")); - } - if (*total_segments < 0 || *total_segments > 1000000) { - return make_ready_future(api_error::validation( - "TotalSegments must be non-negative and less or equal to 1000000")); - } - } - - rjson::value* exclusive_start_key = rjson::find(request, "ExclusiveStartKey"); - - if (table_type == table_or_view_type::gsi && cl != db::consistency_level::LOCAL_ONE) { - return make_ready_future(api_error::validation( - "Consistent reads are not allowed on global indexes (GSI)")); - } - rjson::value* limit_json = rjson::find(request, "Limit"); - uint32_t limit = limit_json ? 
limit_json->GetUint64() : std::numeric_limits::max(); - if (limit <= 0) { - return make_ready_future(api_error::validation("Limit must be greater than 0")); - } - - select_type select = parse_select(request, table_type); - - std::unordered_set used_attribute_names; - std::unordered_set used_attribute_values; - auto attrs_to_get = calculate_attrs_to_get(request, *_parsed_expression_cache, used_attribute_names, select); - - dht::partition_range_vector partition_ranges; - if (segment) { - auto range = get_range_for_segment(*segment, *total_segments); - if (exclusive_start_key) { - auto ring_pos = dht::ring_position{dht::decorate_key(*schema, pk_from_json(*exclusive_start_key, schema))}; - if (!range.contains(ring_pos, dht::ring_position_comparator(*schema))) { - return make_ready_future(api_error::validation( - format("The provided starting key is invalid: Invalid ExclusiveStartKey. Please use ExclusiveStartKey " - "with correct Segment. TotalSegments: {} Segment: {}", *total_segments, *segment))); - } - } - partition_ranges.push_back(range); - } else { - partition_ranges.push_back(dht::partition_range::make_open_ended_both_sides()); - } - std::vector ck_bounds{query::clustering_range::make_open_ended_both_sides()}; - - filter filter(*_parsed_expression_cache, request, filter::request_type::SCAN, used_attribute_names, used_attribute_values); - // Note: Unlike Query, Scan does allow a filter on the key attributes. - // For some *specific* cases of key filtering, such an equality test on - // partition key or comparison operator for the sort key, we could have - // optimized the filtering by modifying partition_ranges and/or - // ck_bounds. We haven't done this optimization yet. 
- - const rjson::value* expression_attribute_names = rjson::find(request, "ExpressionAttributeNames"); - const rjson::value* expression_attribute_values = rjson::find(request, "ExpressionAttributeValues"); - verify_all_are_used(expression_attribute_names, used_attribute_names, "ExpressionAttributeNames", "Scan"); - verify_all_are_used(expression_attribute_values, used_attribute_values, "ExpressionAttributeValues", "Scan"); - - return do_query(_proxy, schema, exclusive_start_key, std::move(partition_ranges), std::move(ck_bounds), std::move(attrs_to_get), limit, cl, - std::move(filter), query::partition_slice::option_set(), client_state, _stats, trace_state, std::move(permit), _enforce_authorization, _warn_authorization); -} - -static dht::partition_range calculate_pk_bound(schema_ptr schema, const column_definition& pk_cdef, const rjson::value& comp_definition, const rjson::value& attrs) { - auto op = get_comparison_operator(comp_definition); - if (op != comparison_operator_type::EQ) { - throw api_error::validation(format("Hash key can only be restricted with equality operator (EQ). 
{} not supported.", comp_definition)); - } - if (attrs.Size() != 1) { - throw api_error::validation(format("A single attribute is required for a hash key EQ restriction: {}", attrs)); - } - bytes raw_value = get_key_from_typed_value(attrs[0], pk_cdef); - partition_key pk = partition_key::from_singular_bytes(*schema, std::move(raw_value)); - auto decorated_key = dht::decorate_key(*schema, pk); - return dht::partition_range(decorated_key); -} - -static query::clustering_range get_clustering_range_for_begins_with(bytes&& target, const clustering_key& ck, schema_ptr schema, data_type t) { - auto it = boost::range::find_end(target, bytes("\xFF"), std::not_equal_to()); - if (it != target.end()) { - ++*it; - target.resize(std::distance(target.begin(), it) + 1); - clustering_key upper_limit = clustering_key::from_single_value(*schema, target); - return query::clustering_range::make(query::clustering_range::bound(ck), query::clustering_range::bound(upper_limit, false)); - } - return query::clustering_range::make_starting_with(query::clustering_range::bound(ck)); -} - -static query::clustering_range calculate_ck_bound(schema_ptr schema, const column_definition& ck_cdef, const rjson::value& comp_definition, const rjson::value& attrs) { - auto op = get_comparison_operator(comp_definition); - const size_t expected_attrs_size = (op == comparison_operator_type::BETWEEN) ? 
2 : 1; - if (attrs.Size() != expected_attrs_size) { - throw api_error::validation(format("{} arguments expected for a sort key restriction: {}", expected_attrs_size, attrs)); - } - bytes raw_value = get_key_from_typed_value(attrs[0], ck_cdef); - clustering_key ck = clustering_key::from_single_value(*schema, raw_value); - switch (op) { - case comparison_operator_type::EQ: - return query::clustering_range(ck); - case comparison_operator_type::LE: - return query::clustering_range::make_ending_with(query::clustering_range::bound(ck)); - case comparison_operator_type::LT: - return query::clustering_range::make_ending_with(query::clustering_range::bound(ck, false)); - case comparison_operator_type::GE: - return query::clustering_range::make_starting_with(query::clustering_range::bound(ck)); - case comparison_operator_type::GT: - return query::clustering_range::make_starting_with(query::clustering_range::bound(ck, false)); - case comparison_operator_type::BETWEEN: { - bytes raw_upper_limit = get_key_from_typed_value(attrs[1], ck_cdef); - clustering_key upper_limit = clustering_key::from_single_value(*schema, raw_upper_limit); - return query::clustering_range::make(query::clustering_range::bound(ck), query::clustering_range::bound(upper_limit)); - } - case comparison_operator_type::BEGINS_WITH: { - if (raw_value.empty()) { - return query::clustering_range::make_open_ended_both_sides(); - } - // NOTICE(sarna): A range starting with given prefix and ending (non-inclusively) with a string "incremented" by a single - // character at the end. Throws for NUMBER instances. 
- if (!ck_cdef.type->is_compatible_with(*utf8_type)) { - throw api_error::validation(fmt::format("BEGINS_WITH operator cannot be applied to type {}", type_to_string(ck_cdef.type))); - } - return get_clustering_range_for_begins_with(std::move(raw_value), ck, schema, ck_cdef.type); - } - default: - throw api_error::validation(format("Operator {} not supported for sort key", comp_definition)); - } -} - -// Calculates primary key bounds from KeyConditions -static std::pair> -calculate_bounds_conditions(schema_ptr schema, const rjson::value& conditions) { - dht::partition_range_vector partition_ranges; - std::vector ck_bounds; - - for (auto it = conditions.MemberBegin(); it != conditions.MemberEnd(); ++it) { - sstring key = rjson::to_sstring(it->name); - const rjson::value& condition = it->value; - - const rjson::value& comp_definition = rjson::get(condition, "ComparisonOperator"); - const rjson::value& attr_list = rjson::get(condition, "AttributeValueList"); - - const column_definition& pk_cdef = schema->partition_key_columns().front(); - const column_definition* ck_cdef = schema->clustering_key_size() > 0 ? &schema->clustering_key_columns().front() : nullptr; - if (key == pk_cdef.name_as_text()) { - if (!partition_ranges.empty()) { - throw api_error::validation("Currently only a single restriction per key is allowed"); - } - partition_ranges.push_back(calculate_pk_bound(schema, pk_cdef, comp_definition, attr_list)); - } - if (ck_cdef && key == ck_cdef->name_as_text()) { - if (!ck_bounds.empty()) { - throw api_error::validation("Currently only a single restriction per key is allowed"); - } - ck_bounds.push_back(calculate_ck_bound(schema, *ck_cdef, comp_definition, attr_list)); - } - } - - // Validate that a query's conditions must be on the hash key, and - // optionally also on the sort key if it exists. 
- if (partition_ranges.empty()) { - throw api_error::validation(format("Query missing condition on hash key '{}'", schema->partition_key_columns().front().name_as_text())); - } - if (schema->clustering_key_size() == 0) { - if (conditions.MemberCount() != 1) { - throw api_error::validation("Only one condition allowed in table with only hash key"); - } - } else { - if (conditions.MemberCount() == 2 && ck_bounds.empty()) { - throw api_error::validation(format("Query missing condition on sort key '{}'", schema->clustering_key_columns().front().name_as_text())); - } else if (conditions.MemberCount() > 2) { - throw api_error::validation("Only one or two conditions allowed in table with hash key and sort key"); - } - } - - if (ck_bounds.empty()) { - ck_bounds.push_back(query::clustering_range::make_open_ended_both_sides()); - } - - return {std::move(partition_ranges), std::move(ck_bounds)}; -} - -// Extract the top-level column name specified in a KeyConditionExpression. -// If a nested attribute path is given, a ValidationException is generated. -// If the column name is a #reference to ExpressionAttributeNames, the -// reference is resolved. -// Note this function returns a string_view, which may refer to data in the -// given parsed::value or expression_attribute_names. 
-static std::string_view get_toplevel(const parsed::value& v, - const rjson::value* expression_attribute_names, - std::unordered_set& used_attribute_names) -{ - const parsed::path& path = std::get(v._value); - if (path.has_operators()) { - throw api_error::validation("KeyConditionExpression does not support nested attributes"); - } - std::string_view column_name = path.root(); - if (column_name.size() > 0 && column_name[0] == '#') { - used_attribute_names.emplace(column_name); - if (!expression_attribute_names) { - throw api_error::validation( - fmt::format("ExpressionAttributeNames missing, entry '{}' required by KeyConditionExpression", - column_name)); - } - const rjson::value* value = rjson::find(*expression_attribute_names, column_name); - if (!value || !value->IsString()) { - throw api_error::validation( - fmt::format("ExpressionAttributeNames missing entry '{}' required by KeyConditionExpression", - column_name)); - } - column_name = rjson::to_string_view(*value); - } - return column_name; -} - -// Extract a constant value specified in a KeyConditionExpression. -// This constant was originally parsed as a reference (:name) to a member of -// ExpressionAttributeValues, but at this point, after resolve_value(), it -// was already converted into a JSON value. -// This function decodes the value (using its given expected type) into bytes -// which Scylla uses as the actual key value. If the value has the wrong type, -// or the input had other problems, a ValidationException is thrown. -static bytes get_constant_value(const parsed::value& v, - const column_definition& column) -{ - const parsed::constant& constant = std::get(v._value); - const parsed::constant::literal& lit = std::get(constant._value); - return get_key_from_typed_value(*lit, column); -} - -// condition_expression_and_list extracts a list of ANDed primitive conditions -// from a condition_expression. This is useful for KeyConditionExpression, -// which may not use OR or NOT. 
If the given condition_expression does use -// OR or NOT, this function throws a ValidationException. -static void condition_expression_and_list( - const parsed::condition_expression& condition_expression, - std::vector& conditions) -{ - if (condition_expression._negated) { - throw api_error::validation("KeyConditionExpression cannot use NOT"); - } - std::visit(overloaded_functor { - [&] (const parsed::primitive_condition& cond) { - conditions.push_back(&cond); - }, - [&] (const parsed::condition_expression::condition_list& list) { - if (list.op == '|' && list.conditions.size() > 1) { - throw api_error::validation("KeyConditionExpression cannot use OR"); - } - for (const parsed::condition_expression& cond : list.conditions) { - condition_expression_and_list(cond, conditions); - } - } - }, condition_expression._expression); -} - -// Calculates primary key bounds from KeyConditionExpression -static std::pair> -calculate_bounds_condition_expression(schema_ptr schema, - const rjson::value& expression, - const rjson::value* expression_attribute_values, - std::unordered_set& used_attribute_values, - const rjson::value* expression_attribute_names, - std::unordered_set& used_attribute_names, - parsed::expression_cache& parsed_expression_cache) -{ - if (!expression.IsString()) { - throw api_error::validation("KeyConditionExpression must be a string"); - } - if (expression.GetStringLength() == 0) { - throw api_error::validation("KeyConditionExpression must not be empty"); - } - // We parse the KeyConditionExpression with the same parser we use for - // ConditionExpression. But KeyConditionExpression only supports a subset - // of the ConditionExpression features, so we have many additional - // verifications below that the key condition is legal. Briefly, a valid - // key condition must contain a single partition key and a single - // sort-key range. 
- parsed::condition_expression p; - try { - p = parsed_expression_cache.parse_condition_expression(rjson::to_string_view(expression), "KeyConditionExpression"); - } catch(expressions_syntax_error& e) { - throw api_error::validation(e.what()); - } - resolve_condition_expression(p, - expression_attribute_names, expression_attribute_values, - used_attribute_names, used_attribute_values); - std::vector conditions; - condition_expression_and_list(p, conditions); - - if (conditions.size() < 1 || conditions.size() > 2) { - throw api_error::validation( - "KeyConditionExpression syntax error: must have 1 or 2 conditions"); - } - // Scylla allows us to have an (equality) constraint on the partition key - // pk_cdef, and a range constraint on the *first* clustering key ck_cdef. - // Note that this is also good enough for our GSI implementation - the - // GSI's user-specified sort key will be the first clustering key. - // FIXME: In the case described in issue #5320 (base and GSI both have - // just hash key - but different ones), this may allow the user to Query - // using the base key which isn't officially part of the GSI. - const column_definition& pk_cdef = schema->partition_key_columns().front(); - const column_definition* ck_cdef = schema->clustering_key_size() > 0 ? - &schema->clustering_key_columns().front() : nullptr; - - dht::partition_range_vector partition_ranges; - std::vector ck_bounds; - for (const parsed::primitive_condition* condp : conditions) { - const parsed::primitive_condition& cond = *condp; - // In all comparison operators, one operand must be a column name, - // the other is a constant (value reference). We remember which is - // which in toplevel_ind, and also the column name in key (not just - // for comparison operators). 
- std::string_view key; - int toplevel_ind; - switch (cond._values.size()) { - case 1: { - // The only legal single-value condition is a begin_with() function, - // and it must have two parameters - a top-level attribute and a - // value reference.. - const parsed::value::function_call *f = std::get_if(&cond._values[0]._value); - if (!f) { - throw api_error::validation("KeyConditionExpression cannot be just a value"); - } - if (f->_function_name != "begins_with") { - throw api_error::validation( - fmt::format("KeyConditionExpression function '{}' not supported",f->_function_name)); - } - if (f->_parameters.size() != 2 || !f->_parameters[0].is_path() || - !f->_parameters[1].is_constant()) { - throw api_error::validation( - "KeyConditionExpression begins_with() takes attribute and value"); - } - key = get_toplevel(f->_parameters[0], expression_attribute_names, used_attribute_names); - toplevel_ind = -1; - break; - } - case 2: - if (cond._values[0].is_path() && cond._values[1].is_constant()) { - toplevel_ind = 0; - } else if (cond._values[1].is_path() && cond._values[0].is_constant()) { - toplevel_ind = 1; - } else { - throw api_error::validation("KeyConditionExpression must compare attribute with constant"); - } - key = get_toplevel(cond._values[toplevel_ind], expression_attribute_names, used_attribute_names); - break; - case 3: - // Only BETWEEN has three operands. 
First must be a column name, - // two other must be value references (constants): - if (cond._op != parsed::primitive_condition::type::BETWEEN) { - // Shouldn't happen unless we have a bug in the parser - throw std::logic_error(format("Wrong number of values {} in primitive_condition", cond._values.size())); - } - if (cond._values[0].is_path() && cond._values[1].is_constant() && cond._values[2].is_constant()) { - toplevel_ind = 0; - key = get_toplevel(cond._values[0], expression_attribute_names, used_attribute_names); - } else { - throw api_error::validation("KeyConditionExpression must compare attribute with constants"); - } - break; - default: - // Shouldn't happen unless we have a bug in the parser - throw std::logic_error(format("Wrong number of values {} in primitive_condition", cond._values.size())); - } - if (cond._op == parsed::primitive_condition::type::IN) { - throw api_error::validation("KeyConditionExpression does not support IN operator"); - } else if (cond._op == parsed::primitive_condition::type::NE) { - throw api_error::validation("KeyConditionExpression does not support NE operator"); - } else if (cond._op == parsed::primitive_condition::type::EQ) { - // the EQ operator (=) is the only one which can be used for both - // the partition key and sort key: - if (sstring(key) == pk_cdef.name_as_text()) { - if (!partition_ranges.empty()) { - throw api_error::validation( - "KeyConditionExpression allows only one condition for each key"); - } - bytes raw_value = get_constant_value(cond._values[!toplevel_ind], pk_cdef); - partition_key pk = partition_key::from_singular_bytes(*schema, std::move(raw_value)); - auto decorated_key = dht::decorate_key(*schema, pk); - partition_ranges.push_back(dht::partition_range(decorated_key)); - } else if (ck_cdef && sstring(key) == ck_cdef->name_as_text()) { - if (!ck_bounds.empty()) { - throw api_error::validation( - "KeyConditionExpression allows only one condition for each key"); - } - bytes raw_value = 
get_constant_value(cond._values[!toplevel_ind], *ck_cdef); - clustering_key ck = clustering_key::from_single_value(*schema, raw_value); - ck_bounds.push_back(query::clustering_range(ck)); - } else { - throw api_error::validation( - fmt::format("KeyConditionExpression condition on non-key attribute {}", key)); - } - continue; - } - // If we're still here, it's any other operator besides EQ, and these - // are allowed *only* on the clustering key: - if (sstring(key) == pk_cdef.name_as_text()) { - throw api_error::validation( - fmt::format("KeyConditionExpression only '=' condition is supported on partition key {}", key)); - } else if (!ck_cdef || sstring(key) != ck_cdef->name_as_text()) { - throw api_error::validation( - fmt::format("KeyConditionExpression condition on non-key attribute {}", key)); - } - if (!ck_bounds.empty()) { - throw api_error::validation( - "KeyConditionExpression allows only one condition for each key"); - } - if (cond._op == parsed::primitive_condition::type::BETWEEN) { - clustering_key ck1 = clustering_key::from_single_value(*schema, - get_constant_value(cond._values[1], *ck_cdef)); - clustering_key ck2 = clustering_key::from_single_value(*schema, - get_constant_value(cond._values[2], *ck_cdef)); - ck_bounds.push_back(query::clustering_range::make( - query::clustering_range::bound(ck1), query::clustering_range::bound(ck2))); - continue; - } else if (cond._values.size() == 1) { - // We already verified above, that this case this can only be a - // function call to begins_with(), with the first parameter the - // key, the second the value reference. - bytes raw_value = get_constant_value( - std::get(cond._values[0]._value)._parameters[1], *ck_cdef); - if (!ck_cdef->type->is_compatible_with(*utf8_type)) { - // begins_with() supported on bytes and strings (both stored - // in the database as strings) but not on numbers. 
- throw api_error::validation( - fmt::format("KeyConditionExpression begins_with() not supported on type {}", - type_to_string(ck_cdef->type))); - } else if (raw_value.empty()) { - ck_bounds.push_back(query::clustering_range::make_open_ended_both_sides()); - } else { - clustering_key ck = clustering_key::from_single_value(*schema, raw_value); - ck_bounds.push_back(get_clustering_range_for_begins_with(std::move(raw_value), ck, schema, ck_cdef->type)); - } - continue; - } - - // All remaining operator have one value reference parameter in index - // !toplevel_ind. Note how toplevel_ind==1 reverses the direction of - // an inequality. - bytes raw_value = get_constant_value(cond._values[!toplevel_ind], *ck_cdef); - clustering_key ck = clustering_key::from_single_value(*schema, raw_value); - if ((cond._op == parsed::primitive_condition::type::LT && toplevel_ind == 0) || - (cond._op == parsed::primitive_condition::type::GT && toplevel_ind == 1)) { - ck_bounds.push_back(query::clustering_range::make_ending_with(query::clustering_range::bound(ck, false))); - } else if ((cond._op == parsed::primitive_condition::type::GT && toplevel_ind == 0) || - (cond._op == parsed::primitive_condition::type::LT && toplevel_ind == 1)) { - ck_bounds.push_back(query::clustering_range::make_starting_with(query::clustering_range::bound(ck, false))); - } else if ((cond._op == parsed::primitive_condition::type::LE && toplevel_ind == 0) || - (cond._op == parsed::primitive_condition::type::GE && toplevel_ind == 1)) { - ck_bounds.push_back(query::clustering_range::make_ending_with(query::clustering_range::bound(ck))); - } else if ((cond._op == parsed::primitive_condition::type::GE && toplevel_ind == 0) || - (cond._op == parsed::primitive_condition::type::LE && toplevel_ind == 1)) { - ck_bounds.push_back(query::clustering_range::make_starting_with(query::clustering_range::bound(ck))); - } - } - - if (partition_ranges.empty()) { - throw api_error::validation( - format("KeyConditionExpression requires 
a condition on partition key {}", pk_cdef.name_as_text())); - } - if (ck_bounds.empty()) { - ck_bounds.push_back(query::clustering_range::make_open_ended_both_sides()); - } - return {std::move(partition_ranges), std::move(ck_bounds)}; -} - -future executor::query(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request, std::unique_ptr& audit_info) { - _stats.api_operations.query++; - elogger.trace("Querying {}", request); - - auto [schema, table_type] = get_table_or_view(_proxy, request); - db::consistency_level cl = get_read_consistency(request); - - maybe_audit(audit_info, audit::statement_category::QUERY, schema->ks_name(), schema->cf_name(), "Query", request, cl); - - get_stats_from_schema(_proxy, *schema)->api_operations.query++; - tracing::add_alternator_table_name(trace_state, schema->cf_name()); - - rjson::value* exclusive_start_key = rjson::find(request, "ExclusiveStartKey"); - if (table_type == table_or_view_type::gsi && cl != db::consistency_level::LOCAL_ONE) { - return make_ready_future(api_error::validation( - "Consistent reads are not allowed on global indexes (GSI)")); - } - rjson::value* limit_json = rjson::find(request, "Limit"); - uint32_t limit = limit_json ? 
limit_json->GetUint64() : std::numeric_limits::max(); - if (limit <= 0) { - return make_ready_future(api_error::validation("Limit must be greater than 0")); - } - - const bool forward = get_bool_attribute(request, "ScanIndexForward", true); - - rjson::value* key_conditions = rjson::find(request, "KeyConditions"); - rjson::value* key_condition_expression = rjson::find(request, "KeyConditionExpression"); - std::unordered_set used_attribute_values; - std::unordered_set used_attribute_names; - if (key_conditions && key_condition_expression) { - throw api_error::validation("Query does not allow both " - "KeyConditions and KeyConditionExpression to be given together"); - } else if (!key_conditions && !key_condition_expression) { - throw api_error::validation("Query must have one of " - "KeyConditions or KeyConditionExpression"); - } - - const rjson::value* expression_attribute_names = rjson::find(request, "ExpressionAttributeNames"); - const rjson::value* expression_attribute_values = rjson::find(request, "ExpressionAttributeValues"); - - // exactly one of key_conditions or key_condition_expression - auto [partition_ranges, ck_bounds] = key_conditions - ? calculate_bounds_conditions(schema, *key_conditions) - : calculate_bounds_condition_expression(schema, *key_condition_expression, - expression_attribute_values, - used_attribute_values, - expression_attribute_names, - used_attribute_names, *_parsed_expression_cache); - - filter filter(*_parsed_expression_cache, request, filter::request_type::QUERY, - used_attribute_names, used_attribute_values); - - // A query is not allowed to filter on the partition key or the sort key. 
- for (const column_definition& cdef : schema->partition_key_columns()) { // just one - if (filter.filters_on(cdef.name_as_text())) { - return make_ready_future(api_error::validation( - format("QueryFilter can only contain non-primary key attributes: Partition key attribute: {}", cdef.name_as_text()))); - } - } - for (const column_definition& cdef : schema->clustering_key_columns()) { - if (filter.filters_on(cdef.name_as_text())) { - return make_ready_future(api_error::validation( - format("QueryFilter can only contain non-primary key attributes: Sort key attribute: {}", cdef.name_as_text()))); - } - // FIXME: this "break" can avoid listing some clustering key columns - // we added for GSIs just because they existed in the base table - - // but not in all cases. We still have issue #5320. - break; - } - - select_type select = parse_select(request, table_type); - - auto attrs_to_get = calculate_attrs_to_get(request, *_parsed_expression_cache, used_attribute_names, select); - verify_all_are_used(expression_attribute_names, used_attribute_names, "ExpressionAttributeNames", "Query"); - verify_all_are_used(expression_attribute_values, used_attribute_values, "ExpressionAttributeValues", "Query"); - query::partition_slice::option_set opts; - opts.set_if(!forward); - return do_query(_proxy, schema, exclusive_start_key, std::move(partition_ranges), std::move(ck_bounds), std::move(attrs_to_get), limit, cl, - std::move(filter), opts, client_state, _stats, std::move(trace_state), std::move(permit), _enforce_authorization, _warn_authorization); -} - future executor::list_tables(client_state& client_state, service_permit permit, rjson::value request, std::unique_ptr& audit_info) { _stats.api_operations.list_tables++; elogger.trace("Listing tables {}", request); diff --git a/alternator/executor.hh b/alternator/executor.hh index 9f372d9c4b..3de07fd7b0 100644 --- a/alternator/executor.hh +++ b/alternator/executor.hh @@ -11,6 +11,7 @@ #include #include "audit/audit.hh" #include 
"seastarx.hh" +#include #include #include @@ -21,13 +22,16 @@ #include "db/config.hh" #include "alternator/error.hh" -#include "stats.hh" +#include "alternator/attribute_path.hh" +#include "alternator/stats.hh" +#include "alternator/executor_util.hh" + #include "utils/rjson.hh" #include "utils/updateable_value.hh" -#include "utils/simple_value_with_expiry.hh" #include "tracing/trace_state.hh" + namespace db { class system_distributed_keyspace; } @@ -51,6 +55,10 @@ namespace service { class storage_service; } +namespace vector_search { + class vector_store_client; +} + namespace cdc { class metadata; } @@ -63,82 +71,13 @@ class gossiper; class schema_builder; + namespace alternator { enum class table_status; class rmw_operation; class put_or_delete_item; -schema_ptr get_table(service::storage_proxy& proxy, const rjson::value& request); -bool is_alternator_keyspace(const sstring& ks_name); -// Wraps the db::get_tags_of_table and throws if the table is missing the tags extension. -const std::map& get_tags_of_table_or_throw(schema_ptr schema); - -// An attribute_path_map object is used to hold data for various attributes -// paths (parsed::path) in a hierarchy of attribute paths. Each attribute path -// has a root attribute, and then modified by member and index operators - -// for example in "a.b[2].c" we have "a" as the root, then ".b" member, then -// "[2]" index, and finally ".c" member. -// Data can be added to an attribute_path_map using the add() function, but -// requires that attributes with data not be *overlapping* or *conflicting*: -// -// 1. Two attribute paths which are identical or an ancestor of one another -// are considered *overlapping* and not allowed. If a.b.c has data, -// we can't add more data in a.b.c or any of its descendants like a.b.c.d. -// -// 2. Two attribute paths which need the same parent to have both a member and -// an index are considered *conflicting* and not allowed. E.g., if a.b has -// data, you can't add a[1]. 
The meaning of adding both would be that the -// attribute a is both a map and an array, which isn't sensible. -// -// These two requirements are common to the two places where Alternator uses -// this abstraction to describe how a hierarchical item is to be transformed: -// -// 1. In ProjectExpression: for filtering from a full top-level attribute -// only the parts for which user asked in ProjectionExpression. -// -// 2. In UpdateExpression: for taking the previous value of a top-level -// attribute, and modifying it based on the instructions in the user -// wrote in UpdateExpression. - -template -class attribute_path_map_node { -public: - using data_t = T; - // We need the extra unique_ptr<> here because libstdc++ unordered_map - // doesn't work with incomplete types :-( - using members_t = std::unordered_map>>; - // The indexes list is sorted because DynamoDB requires handling writes - // beyond the end of a list in index order. - using indexes_t = std::map>>; - // The prohibition on "overlap" and "conflict" explained above means - // That only one of data, members or indexes is non-empty. 
- std::optional> _content; - - bool is_empty() const { return !_content; } - bool has_value() const { return _content && std::holds_alternative(*_content); } - bool has_members() const { return _content && std::holds_alternative(*_content); } - bool has_indexes() const { return _content && std::holds_alternative(*_content); } - // get_members() assumes that has_members() is true - members_t& get_members() { return std::get(*_content); } - const members_t& get_members() const { return std::get(*_content); } - indexes_t& get_indexes() { return std::get(*_content); } - const indexes_t& get_indexes() const { return std::get(*_content); } - T& get_value() { return std::get(*_content); } - const T& get_value() const { return std::get(*_content); } -}; - -template -using attribute_path_map = std::unordered_map>; - -using attrs_to_get_node = attribute_path_map_node; -// attrs_to_get lists which top-level attribute are needed, and possibly also -// which part of the top-level attribute is really needed (when nested -// attribute paths appeared in the query). -// Most code actually uses optional. There, a disengaged -// optional means we should get all attributes, not specific ones. -using attrs_to_get = attribute_path_map; - namespace parsed { class expression_cache; } @@ -150,6 +89,7 @@ class executor : public peering_sharded_service { service::migration_manager& _mm; db::system_distributed_keyspace& _sdks; cdc::metadata& _cdc_metadata; + vector_search::vector_store_client& _vsc; utils::updateable_value _enforce_authorization; utils::updateable_value _warn_authorization; seastar::sharded& _audit; @@ -177,7 +117,6 @@ public: // is written in chunks to the output_stream. This allows for efficient // handling of large responses without needing to allocate a large buffer // in memory. 
- using body_writer = noncopyable_function(output_stream&&)>; using request_return_type = std::variant; stats _stats; // The metric_groups object holds this stat object's metrics registered @@ -193,6 +132,7 @@ public: service::migration_manager& mm, db::system_distributed_keyspace& sdks, cdc::metadata& cdc_metadata, + vector_search::vector_store_client& vsc, smp_service_group ssg, utils::updateable_value default_timeout_in_ms); ~executor(); @@ -225,15 +165,9 @@ public: future<> start(); future<> stop(); - static sstring table_name(const schema&); static db::timeout_clock::time_point default_timeout(); private: static thread_local utils::updateable_value s_default_timeout_in_ms; -public: - static schema_ptr find_table(service::storage_proxy&, std::string_view table_name); - static schema_ptr find_table(service::storage_proxy&, const rjson::value& request); - -private: friend class rmw_operation; // Helper to set up auditing for an Alternator operation. Checks whether @@ -247,7 +181,6 @@ private: const rjson::value& request, std::optional cl = std::nullopt); - static void describe_key_schema(rjson::value& parent, const schema&, std::unordered_map * = nullptr, const std::map *tags = nullptr); future fill_table_description(schema_ptr schema, table_status tbl_status, service::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit); future create_table_on_shard0(service::client_state&& client_state, tracing::trace_state_ptr trace_state, rjson::value request, bool enforce_authorization, bool warn_authorization, const db::tablets_mode_t::mode tablets_mode, std::unique_ptr& audit_info); @@ -263,62 +196,11 @@ private: tracing::trace_state_ptr trace_state, service_permit permit); public: - static void describe_key_schema(rjson::value& parent, const schema& schema, std::unordered_map&, const std::map *tags = nullptr); - - static std::optional describe_single_item(schema_ptr, - const query::partition_slice&, - const cql3::selection::selection&, 
- const query::result&, - const std::optional&, - uint64_t* = nullptr); - - // Converts a multi-row selection result to JSON compatible with DynamoDB. - // For each row, this method calls item_callback, which takes the size of - // the item as the parameter. - static future> describe_multi_item(schema_ptr schema, - const query::partition_slice&& slice, - shared_ptr selection, - foreign_ptr> query_result, - shared_ptr> attrs_to_get, - noncopyable_function item_callback = {}); - - static void describe_single_item(const cql3::selection::selection&, - const std::vector&, - const std::optional&, - rjson::value&, - uint64_t* item_length_in_bytes = nullptr, - bool = false); - static bool add_stream_options(const rjson::value& stream_spec, schema_builder&, service::storage_proxy& sp); static void supplement_table_info(rjson::value& descr, const schema& schema, service::storage_proxy& sp); static void supplement_table_stream_info(rjson::value& descr, const schema& schema, const service::storage_proxy& sp); }; -// is_big() checks approximately if the given JSON value is "bigger" than -// the given big_size number of bytes. The goal is to *quickly* detect -// oversized JSON that, for example, is too large to be serialized to a -// contiguous string - we don't need an accurate size for that. Moreover, -// as soon as we detect that the JSON is indeed "big", we can return true -// and don't need to continue calculating its exact size. -// For simplicity, we use a recursive implementation. This is fine because -// Alternator limits the depth of JSONs it reads from inputs, and doesn't -// add more than a couple of levels in its own output construction. -bool is_big(const rjson::value& val, int big_size = 100'000); - -// Check CQL's Role-Based Access Control (RBAC) permission (MODIFY, -// SELECT, DROP, etc.) on the given table. When permission is denied an -// appropriate user-readable api_error::access_denied is thrown. 
-future<> verify_permission(bool enforce_authorization, bool warn_authorization, const service::client_state&, const schema_ptr&, auth::permission, alternator::stats& stats); - -/** - * Make return type for serializing the object "streamed", - * i.e. direct to HTTP output stream. Note: only useful for - * (very) large objects as there are overhead issues with this - * as well, but for massive lists of return objects this can - * help avoid large allocations/many re-allocs - */ -executor::body_writer make_streamed(rjson::value&&); - // returns table creation time in seconds since epoch for `db_clock` double get_table_creation_time(const schema &schema); @@ -344,5 +226,4 @@ arn_parts parse_arn(std::string_view arn, std::string_view arn_field_name, std:: // The format is ks1|ks2|ks3... and table1|table2|table3... sstring print_names_for_audit(const std::set& names); - } diff --git a/alternator/executor_read.cc b/alternator/executor_read.cc new file mode 100644 index 0000000000..a35960c49c --- /dev/null +++ b/alternator/executor_read.cc @@ -0,0 +1,1957 @@ +/* + * Copyright 2019-present ScyllaDB + */ + +/* + * SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.1 + */ + +// This file implements the Alternator read operations: GetItem, BatchGetItem, +// Query (including vector search) and Scan. +// Public entry points: +// * executor::get_item() +// * executor::batch_get_item() +// * executor::scan() +// * executor::query() +// Major internal functions: +// * do_query(): the common code for Query and Scan, except vector search. +// * query_vector(): the vector-search code path for Query with VectorSearch. +// and a number of helper functions for parsing common parameters of read +// requests such as TableName, IndexName, Select, FilterExpression, +// ConsistentRead, ProjectionExpression, and more. 
+ +#include "alternator/executor.hh" +#include "alternator/executor_util.hh" +#include "alternator/conditions.hh" +#include "alternator/expressions.hh" +#include "alternator/consumed_capacity.hh" +#include "alternator/serialization.hh" +#include "alternator/attribute_path.hh" +#include "auth/permission.hh" +#include "cql3/selection/selection.hh" +#include "cql3/result_set.hh" +#include "query/query-request.hh" +#include "schema/schema.hh" +#include "service/client_state.hh" +#include "service/pager/query_pagers.hh" +#include "service/storage_proxy.hh" +#include "index/secondary_index.hh" +#include "utils/assert.hh" +#include "utils/overloaded_functor.hh" +#include "utils/error_injection.hh" +#include "vector_search/vector_store_client.hh" +#include +#include +#include +#include +#include +#include + +using namespace std::chrono_literals; + +namespace alternator { + +extern logging::logger elogger; // from executor.cc + +// make_streamed_with_extra_array() is variant of make_streamed() above, which +// builds a streaming response (a function writing to an output stream) from a +// JSON object (rjson::value) but adds to it at the end an additional array. +// The extra array is given a separate chunked_vector to avoid putting it +// inside the rjson::value - because RapidJSON does contiguous allocations for +// arrays which we want to avoid for potentially long arrays in Query/Scan +// responses (see #23535). +// If we ever fix RapidJSON to avoid contiguous allocations for arrays, or +// replace it entirely (#24458), we can remove this function and the function +// rjson::print_with_extra_array() which it calls. 
+static body_writer make_streamed_with_extra_array(rjson::value&& value, + std::string array_name, utils::chunked_vector&& array) { + return [value = std::move(value), array_name = std::move(array_name), array = std::move(array)](output_stream&& _out) mutable -> future<> { + auto out = std::move(_out); + std::exception_ptr ex; + try { + co_await rjson::print_with_extra_array(value, array_name, array, out); + } catch (...) { + ex = std::current_exception(); + } + co_await out.close(); + co_await rjson::destroy_gently(std::move(value)); + // TODO: can/should we also destroy the array gently? + if (ex) { + co_await coroutine::return_exception_ptr(std::move(ex)); + } + }; +} + +// select_type represents how the Select parameter of Query/Scan selects what +// to return. It is also used by calculate_attrs_to_get() to know whether to +// return no attributes (count), or specific attributes. +enum class select_type { regular, count, projection }; + +// Check according to the request's "ConsistentRead" field, which consistency +// level we need to use for the read. The field can be True for strongly +// consistent reads, or False for eventually consistent reads, or if this +// field is absent, we default to eventually consistent reads. +// In Scylla, eventually-consistent reads are implemented as consistency +// level LOCAL_ONE, and strongly-consistent reads as LOCAL_QUORUM. +static db::consistency_level get_read_consistency(const rjson::value& request) { + const rjson::value* consistent_read_value = rjson::find(request, "ConsistentRead"); + bool consistent_read = false; + if (consistent_read_value && !consistent_read_value->IsNull()) { + if (consistent_read_value->IsBool()) { + consistent_read = consistent_read_value->GetBool(); + } else { + throw api_error::validation("ConsistentRead flag must be a boolean"); + } + } + return consistent_read ? 
db::consistency_level::LOCAL_QUORUM : db::consistency_level::LOCAL_ONE; +} + +// attrs_to_get saves for each top-level attribute an attrs_to_get_node, +// a hierarchy of subparts that need to be kept. The following function +// calculate_attrs_to_get() takes either AttributesToGet or +// ProjectionExpression parameters (having both is *not* allowed), +// and returns the list of cells we need to read, or a disengaged optional +// when *all* attributes are to be returned. +// However, in our current implementation, only top-level attributes are +// stored as separate cells - a nested document is stored serialized together +// (as JSON) in the same cell. So this function returns a map - each key is the +// top-level attribute we will need to read, and the value for each +// top-level attribute is the partial hierarchy (struct hierarchy_filter) +// that we will need to extract from that serialized JSON. +// For example, if ProjectionExpression lists a.b and a.c[2], we +// return one top-level attribute name, "a", with the value "{b, c[2]}". +static std::optional calculate_attrs_to_get(const rjson::value& req, parsed::expression_cache& parsed_expression_cache, std::unordered_set& used_attribute_names, select_type select = select_type::regular) { + if (select == select_type::count) { + // An empty map asks to retrieve no attributes. Note that this is + // different from a disengaged optional which means retrieve all. 
+ return attrs_to_get(); + } + // FIXME: also need to handle select_type::projection + const bool has_attributes_to_get = req.HasMember("AttributesToGet"); + const bool has_projection_expression = req.HasMember("ProjectionExpression"); + if (has_attributes_to_get && has_projection_expression) { + throw api_error::validation( + format("GetItem does not allow both ProjectionExpression and AttributesToGet to be given together")); + } + if (has_attributes_to_get) { + const rjson::value& attributes_to_get = req["AttributesToGet"]; + attrs_to_get ret; + for (auto it = attributes_to_get.Begin(); it != attributes_to_get.End(); ++it) { + attribute_path_map_add("AttributesToGet", ret, rjson::to_string(*it)); + validate_attr_name_length("AttributesToGet", it->GetStringLength(), false); + } + if (ret.empty()) { + throw api_error::validation("Empty AttributesToGet is not allowed. Consider using Select=COUNT instead."); + } + return ret; + } else if (has_projection_expression) { + const rjson::value& projection_expression = req["ProjectionExpression"]; + const rjson::value* expression_attribute_names = rjson::find(req, "ExpressionAttributeNames"); + std::vector paths_to_get; + try { + paths_to_get = parsed_expression_cache.parse_projection_expression(rjson::to_string_view(projection_expression)); + } catch(expressions_syntax_error& e) { + throw api_error::validation(e.what()); + } + resolve_projection_expression(paths_to_get, expression_attribute_names, used_attribute_names); + attrs_to_get ret; + for (const parsed::path& p : paths_to_get) { + attribute_path_map_add("ProjectionExpression", ret, p); + } + return ret; + } + // A disengaged optional asks to read everything + return std::nullopt; +} + +// get_table_or_view() is similar to get_table(), except it returns either +// a table or a materialized view from which to read, based on the TableName +// and optional IndexName in the request. Only requests like Query and Scan +// which allow IndexName should use this function. 
+enum class table_or_view_type { base, lsi, gsi, vector_index }; +static std::pair +get_table_or_view(service::storage_proxy& proxy, const rjson::value& request) { + table_or_view_type type = table_or_view_type::base; + std::string table_name = get_table_name(request); + + if (schema_ptr s = try_get_internal_table(proxy.data_dictionary(), table_name)) { + return {s, type}; + } + + std::string keyspace_name = executor::KEYSPACE_NAME_PREFIX + table_name; + const rjson::value* index_name = rjson::find(request, "IndexName"); + std::string orig_table_name; + if (index_name) { + if (index_name->IsString()) { + orig_table_name = std::move(table_name); + table_name = view_name(orig_table_name, rjson::to_string_view(*index_name)); + type = table_or_view_type::gsi; + } else { + throw api_error::validation( + fmt::format("Non-string IndexName '{}'", rjson::to_string_view(*index_name))); + } + // If no tables for global indexes were found, the index may be local + if (!proxy.data_dictionary().has_schema(keyspace_name, table_name)) { + type = table_or_view_type::lsi; + table_name = lsi_name(orig_table_name, rjson::to_string_view(*index_name)); + } + } + + try { + return { proxy.data_dictionary().find_schema(keyspace_name, table_name), type }; + } catch(data_dictionary::no_such_column_family&) { + if (index_name) { + // DynamoDB returns a different error depending on whether the + // base table doesn't exist (ResourceNotFoundException) or it + // does exist but the index does not (ValidationException). + auto base_table = proxy.data_dictionary().try_find_table(keyspace_name, orig_table_name); + if (base_table) { + // If the given IndexName is a vector index, not a GSI or LSI, + // give a more helpful message than just an index not found. 
+ if (base_table->schema()->has_index(rjson::to_sstring(*index_name))) { + throw api_error::validation( + fmt::format("IndexName '{}' is a vector index for table '{}, so VectorSearch is mandatory in Query.", rjson::to_string_view(*index_name), orig_table_name)); + } + throw api_error::validation( + fmt::format("Requested resource not found: Index '{}' for table '{}'", rjson::to_string_view(*index_name), orig_table_name)); + } else { + throw api_error::resource_not_found( + fmt::format("Requested resource not found: Table: {} not found", orig_table_name)); + } + } else { + throw api_error::resource_not_found( + fmt::format("Requested resource not found: Table: {} not found", table_name)); + } + } +} + + +// Parse the "Select" parameter of a Scan or Query operation, throwing a +// ValidationException in various forbidden combinations of options and +// finally returning one of three options: +// 1. regular - the default scan behavior of returning all or specific +// attributes ("ALL_ATTRIBUTES" or "SPECIFIC_ATTRIBUTES"). +// 2. count - just count the items ("COUNT") +// 3. projection - return projected attributes ("ALL_PROJECTED_ATTRIBUTES") +// An ValidationException is thrown when recognizing an invalid combination +// of options - such as ALL_PROJECTED_ATTRIBUTES for a base table, or +// SPECIFIC_ATTRIBUTES without ProjectionExpression or AttributesToGet. 
+static select_type parse_select(const rjson::value& request, table_or_view_type table_type) { + const rjson::value* select_value = rjson::find(request, "Select"); + const bool has_attributes_to_get = request.HasMember("AttributesToGet"); + const bool has_projection_expression = request.HasMember("ProjectionExpression"); + if (!select_value) { + // "Select" is not specified: + // If ProjectionExpression or AttributesToGet are present, + // then Select defaults to SPECIFIC_ATTRIBUTES: + if (has_projection_expression || has_attributes_to_get) { + return select_type::regular; + } + // Otherwise, "Select" defaults to ALL_ATTRIBUTES on a base table, + // or ALL_PROJECTED_ATTRIBUTES on an index. This is explicitly + // documented in the DynamoDB API reference. + return table_type == table_or_view_type::base ? + select_type::regular : select_type::projection; + } + if (!select_value->IsString()) { + throw api_error::validation("Select parameter must be a string"); + } + std::string_view select = rjson::to_string_view(*select_value); + if (select == "SPECIFIC_ATTRIBUTES") { + if (has_projection_expression || has_attributes_to_get) { + return select_type::regular; + } + throw api_error::validation("Select=SPECIFIC_ATTRIBUTES requires AttributesToGet or ProjectionExpression"); + } + if (has_projection_expression || has_attributes_to_get) { + throw api_error::validation("AttributesToGet or ProjectionExpression require Select to be either SPECIFIC_ATTRIBUTES or missing"); + } + if (select == "COUNT") { + return select_type::count; + } + if (select == "ALL_ATTRIBUTES") { + // FIXME: when we support projections (#5036), if this is a GSI and + // not all attributes are projected to it, we should throw. 
+ return select_type::regular; + } + if (select == "ALL_PROJECTED_ATTRIBUTES") { + if (table_type == table_or_view_type::base) { + throw api_error::validation("ALL_PROJECTED_ATTRIBUTES only allowed for indexes"); + } + return select_type::projection; + } + throw api_error::validation(fmt::format("Unknown Select value '{}'. Allowed choices: ALL_ATTRIBUTES, SPECIFIC_ATTRIBUTES, ALL_PROJECTED_ATTRIBUTES, COUNT", + select)); +} + +// "filter" represents a condition that can be applied to individual items +// read by a Query or Scan operation, to decide whether to keep the item. +// A filter is constructed from a Query or Scan request. This uses the +// relevant fields in the query (FilterExpression or QueryFilter/ScanFilter + +// ConditionalOperator). These fields are pre-checked and pre-parsed as much +// as possible, to ensure that later checking of many items is efficient. +class filter { +private: + // Holding QueryFilter/ScanFilter + ConditionalOperator: + struct conditions_filter { + bool require_all; + rjson::value conditions; + }; + // Holding a parsed FilterExpression: + struct expression_filter { + parsed::condition_expression expression; + }; + std::optional> _imp; +public: + // Filtering for Scan and Query are very similar, but there are some + // small differences, especially the names of the request attributes. + enum class request_type { SCAN, QUERY }; + // Note that a filter does not store pointers to the query used to + // construct it. + filter(parsed::expression_cache& parsed_expression_cache, const rjson::value& request, request_type rt, + std::unordered_set& used_attribute_names, + std::unordered_set& used_attribute_values); + bool check(const rjson::value& item) const; + bool filters_on(std::string_view attribute) const; + // for_filters_on() runs the given function on the attributes that the + // filter works on. It may run for the same attribute more than once if + // used more than once in the filter. 
+ void for_filters_on(const noncopyable_function& func) const; + operator bool() const { return bool(_imp); } +}; + +filter::filter(parsed::expression_cache& parsed_expression_cache, const rjson::value& request, request_type rt, + std::unordered_set& used_attribute_names, + std::unordered_set& used_attribute_values) { + const rjson::value* expression = rjson::find(request, "FilterExpression"); + const char* conditions_attribute = (rt == request_type::SCAN) ? "ScanFilter" : "QueryFilter"; + const rjson::value* conditions = rjson::find(request, conditions_attribute); + auto conditional_operator = get_conditional_operator(request); + if (conditional_operator != conditional_operator_type::MISSING && + (!conditions || (conditions->IsObject() && conditions->GetObject().ObjectEmpty()))) { + throw api_error::validation( + format("'ConditionalOperator' parameter cannot be specified for missing or empty {}", + conditions_attribute)); + } + if (expression && conditions) { + throw api_error::validation( + format("FilterExpression and {} are not allowed together", conditions_attribute)); + } + if (expression) { + if (!expression->IsString()) { + throw api_error::validation("FilterExpression must be a string"); + } + if (expression->GetStringLength() == 0) { + throw api_error::validation("FilterExpression must not be empty"); + } + if (rjson::find(request, "AttributesToGet")) { + throw api_error::validation("Cannot use both old-style and new-style parameters in same request: FilterExpression and AttributesToGet"); + } + try { + auto parsed = parsed_expression_cache.parse_condition_expression(rjson::to_string_view(*expression), "FilterExpression"); + const rjson::value* expression_attribute_names = rjson::find(request, "ExpressionAttributeNames"); + const rjson::value* expression_attribute_values = rjson::find(request, "ExpressionAttributeValues"); + resolve_condition_expression(parsed, + expression_attribute_names, expression_attribute_values, + used_attribute_names, 
used_attribute_values); + _imp = expression_filter { std::move(parsed) }; + } catch(expressions_syntax_error& e) { + throw api_error::validation(e.what()); + } + } + if (conditions) { + if (rjson::find(request, "ProjectionExpression")) { + throw api_error::validation(format("Cannot use both old-style and new-style parameters in same request: {} and ProjectionExpression", conditions_attribute)); + } + bool require_all = conditional_operator != conditional_operator_type::OR; + _imp = conditions_filter { require_all, rjson::copy(*conditions) }; + } +} + +bool filter::check(const rjson::value& item) const { + if (!_imp) { + return true; + } + return std::visit(overloaded_functor { + [&] (const conditions_filter& f) -> bool { + return verify_condition(f.conditions, f.require_all, &item); + }, + [&] (const expression_filter& f) -> bool { + return verify_condition_expression(f.expression, &item); + } + }, *_imp); +} + +bool filter::filters_on(std::string_view attribute) const { + if (!_imp) { + return false; + } + return std::visit(overloaded_functor { + [&] (const conditions_filter& f) -> bool { + for (auto it = f.conditions.MemberBegin(); it != f.conditions.MemberEnd(); ++it) { + if (rjson::to_string_view(it->name) == attribute) { + return true; + } + } + return false; + }, + [&] (const expression_filter& f) -> bool { + return condition_expression_on(f.expression, attribute); + } + }, *_imp); +} + +void filter::for_filters_on(const noncopyable_function& func) const { + if (_imp) { + std::visit(overloaded_functor { + [&] (const conditions_filter& f) -> void { + for (auto it = f.conditions.MemberBegin(); it != f.conditions.MemberEnd(); ++it) { + func(rjson::to_string_view(it->name)); + } + }, + [&] (const expression_filter& f) -> void { + return for_condition_expression_on(f.expression, func); + } + }, *_imp); + } +} + +class describe_items_visitor { + typedef std::vector columns_t; + const columns_t& _columns; + const std::optional& _attrs_to_get; + std::unordered_set 
_extra_filter_attrs; + const filter& _filter; + typename columns_t::const_iterator _column_it; + rjson::value _item; + // _items is a chunked_vector instead of a RapidJson array + // (rjson::value) because unfortunately RapidJson arrays are stored + // contiguously in memory, and cause large allocations when a Query/Scan + // returns a long list of short items (issue #23535). + utils::chunked_vector _items; + size_t _scanned_count; + +public: + describe_items_visitor(const columns_t& columns, const std::optional& attrs_to_get, filter& filter) + : _columns(columns) + , _attrs_to_get(attrs_to_get) + , _filter(filter) + , _column_it(columns.begin()) + , _item(rjson::empty_object()) + , _scanned_count(0) + { + // _filter.check() may need additional attributes not listed in + // _attrs_to_get (i.e., not requested as part of the output). + // We list those in _extra_filter_attrs. We will include them in + // the JSON but take them out before finally returning the JSON. + if (_attrs_to_get) { + _filter.for_filters_on([&] (std::string_view attr) { + std::string a(attr); // no heterogeneous maps searches :-( + if (!_attrs_to_get->contains(a)) { + _extra_filter_attrs.emplace(std::move(a)); + } + }); + } + } + + void start_row() { + _column_it = _columns.begin(); + } + + void accept_value(managed_bytes_view_opt result_bytes_view) { + if (!result_bytes_view) { + ++_column_it; + return; + } + result_bytes_view->with_linearized([this] (bytes_view bv) { + std::string column_name = (*_column_it)->name_as_text(); + if (column_name != executor::ATTRS_COLUMN_NAME) { + if (!_attrs_to_get || _attrs_to_get->contains(column_name) || _extra_filter_attrs.contains(column_name)) { + if (!_item.HasMember(column_name.c_str())) { + rjson::add_with_string_name(_item, column_name, rjson::empty_object()); + } + rjson::value& field = _item[column_name.c_str()]; + rjson::add_with_string_name(field, type_to_string((*_column_it)->type), json_key_column_value(bv, **_column_it)); + } + } else { + auto 
deserialized = attrs_type()->deserialize(bv); + auto keys_and_values = value_cast(deserialized); + for (auto entry : keys_and_values) { + std::string attr_name = value_cast(entry.first); + if (!_attrs_to_get || _attrs_to_get->contains(attr_name) || _extra_filter_attrs.contains(attr_name)) { + bytes value = value_cast(entry.second); + // Even if _attrs_to_get asked to keep only a part of a + // top-level attribute, we keep the entire attribute + // at this stage, because the item filter might still + // need the other parts (it was easier for us to keep + // extra_filter_attrs at top-level granularity). We'll + // filter the unneeded parts after item filtering. + rjson::add_with_string_name(_item, attr_name, deserialize_item(value)); + } + } + } + }); + ++_column_it; + } + + void end_row() { + if (_filter.check(_item)) { + // As noted above, we kept entire top-level attributes listed in + // _attrs_to_get. We may need to only keep parts of them. + if (_attrs_to_get) { + for (const auto& attr: *_attrs_to_get) { + // If !attr.has_value() it means we were asked not to keep + // attr entirely, but just parts of it. 
+ if (!attr.second.has_value()) { + rjson::value* toplevel= rjson::find(_item, attr.first); + if (toplevel && !hierarchy_filter(*toplevel, attr.second)) { + rjson::remove_member(_item, attr.first); + } + } + } + } + // Remove the extra attributes _extra_filter_attrs which we had + // to add just for the filter, and not requested to be returned: + for (const auto& attr : _extra_filter_attrs) { + rjson::remove_member(_item, attr); + } + + _items.push_back(std::move(_item)); + } + _item = rjson::empty_object(); + ++_scanned_count; + } + + utils::chunked_vector get_items() && { + return std::move(_items); + } + + size_t get_scanned_count() { + return _scanned_count; + } +}; + +// describe_items() returns a JSON object that includes members "Count" +// and "ScannedCount", but *not* "Items" - that is returned separately +// as a chunked_vector to avoid large contiguous allocations which +// RapidJSON does of its array. The caller should add "Items" to the +// returned JSON object if needed, or print it separately. +// The returned chunked_vector (the items) is std::optional<>, because +// the user may have requested only to count items, and not return any +// items - which is different from returning an empty list of items. 
+static future>, size_t>> describe_items( + const cql3::selection::selection& selection, + std::unique_ptr result_set, + std::optional&& attrs_to_get, + filter&& filter) { + describe_items_visitor visitor(selection.get_columns(), attrs_to_get, filter); + co_await result_set->visit_gently(visitor); + auto scanned_count = visitor.get_scanned_count(); + utils::chunked_vector items = std::move(visitor).get_items(); + rjson::value items_descr = rjson::empty_object(); + auto size = items.size(); + rjson::add(items_descr, "Count", rjson::value(size)); + rjson::add(items_descr, "ScannedCount", rjson::value(scanned_count)); + // If attrs_to_get && attrs_to_get->empty(), this means the user asked not + // to get any attributes (i.e., a Scan or Query with Select=COUNT) and we + // shouldn't return "Items" at all. + // TODO: consider optimizing the case of Select=COUNT without a filter. + // In that case, we currently build a list of empty items and here drop + // it. We could just count the items and not bother with the empty items. + // (However, remember that when we do have a filter, we need the items). 
+ std::optional> opt_items; + if (!attrs_to_get || !attrs_to_get->empty()) { + opt_items = std::move(items); + } + co_return std::tuple(std::move(items_descr), std::move(opt_items), size); +} + +static rjson::value encode_paging_state(const schema& schema, const service::pager::paging_state& paging_state) { + rjson::value last_evaluated_key = rjson::empty_object(); + std::vector exploded_pk = paging_state.get_partition_key().explode(); + auto exploded_pk_it = exploded_pk.begin(); + for (const column_definition& cdef : schema.partition_key_columns()) { + rjson::add_with_string_name(last_evaluated_key, std::string_view(cdef.name_as_text()), rjson::empty_object()); + rjson::value& key_entry = last_evaluated_key[cdef.name_as_text()]; + rjson::add_with_string_name(key_entry, type_to_string(cdef.type), json_key_column_value(*exploded_pk_it, cdef)); + ++exploded_pk_it; + } + auto pos = paging_state.get_position_in_partition(); + if (pos.has_key()) { + // Alternator itself allows at most one column in clustering key, but + // user can use Alternator api to access system tables which might have + // multiple clustering key columns. So we need to handle that case here. + auto cdef_it = schema.clustering_key_columns().begin(); + for(const auto &exploded_ck : pos.key().explode()) { + rjson::add_with_string_name(last_evaluated_key, std::string_view(cdef_it->name_as_text()), rjson::empty_object()); + rjson::value& key_entry = last_evaluated_key[cdef_it->name_as_text()]; + rjson::add_with_string_name(key_entry, type_to_string(cdef_it->type), json_key_column_value(exploded_ck, *cdef_it)); + ++cdef_it; + } + } + // To avoid possible conflicts (and thus having to reserve these names) we + // avoid adding the weight and region fields of the position to the paging + // state. Alternator will never need these as it doesn't have range + // tombstones (the only thing that can generate a position other than at(row)). 
+ // We conditionally include these fields when reading CQL tables through alternator. + if (!is_alternator_keyspace(schema.ks_name()) && (!pos.has_key() || pos.get_bound_weight() != bound_weight::equal)) { + rjson::add_with_string_name(last_evaluated_key, scylla_paging_region, rjson::empty_object()); + rjson::add(last_evaluated_key[scylla_paging_region.data()], "S", rjson::from_string(fmt::to_string(pos.region()))); + rjson::add_with_string_name(last_evaluated_key, scylla_paging_weight, rjson::empty_object()); + rjson::add(last_evaluated_key[scylla_paging_weight.data()], "N", static_cast(pos.get_bound_weight())); + } + return last_evaluated_key; +} + +// RapidJSON allocates arrays contiguously in memory, so we want to avoid +// returning a large number of items as a single rapidjson array, and use +// a chunked_vector instead. The following constant is an arbitrary cutoff +// point for when to switch from a rapidjson array to a chunked_vector. +static constexpr int max_items_for_rapidjson_array = 256; + +static future do_query(service::storage_proxy& proxy, + schema_ptr table_schema, + const rjson::value* exclusive_start_key, + dht::partition_range_vector partition_ranges, + std::vector ck_bounds, + std::optional attrs_to_get, + uint32_t limit, + db::consistency_level cl, + filter filter, + query::partition_slice::option_set custom_opts, + service::client_state& client_state, + alternator::stats& stats, + tracing::trace_state_ptr trace_state, + service_permit permit, + bool enforce_authorization, + bool warn_authorization) { + lw_shared_ptr old_paging_state = nullptr; + + tracing::trace(trace_state, "Performing a database query"); + + // Reverse the schema and the clustering bounds as the underlying code expects + // reversed queries in the native reversed format. 
+ auto query_schema = table_schema; + const bool reversed = custom_opts.contains(); + if (reversed) { + query_schema = table_schema->get_reversed(); + + std::reverse(ck_bounds.begin(), ck_bounds.end()); + for (auto& bound : ck_bounds) { + bound = query::reverse(bound); + } + } + + if (exclusive_start_key) { + partition_key pk = pk_from_json(*exclusive_start_key, table_schema); + auto pos = position_in_partition::for_partition_start(); + if (table_schema->clustering_key_size() > 0) { + pos = pos_from_json(*exclusive_start_key, table_schema); + } + old_paging_state = make_lw_shared(pk, pos, query::max_partitions, query_id::create_null_id(), service::pager::paging_state::replicas_per_token_range{}, std::nullopt, 0); + } + + co_await verify_permission(enforce_authorization, warn_authorization, client_state, table_schema, auth::permission::SELECT, stats); + + auto regular_columns = + table_schema->regular_columns() | std::views::transform(&column_definition::id) + | std::ranges::to(); + auto static_columns = + table_schema->static_columns() | std::views::transform(&column_definition::id) + | std::ranges::to(); + auto selection = cql3::selection::selection::wildcard(table_schema); + query::partition_slice::option_set opts = selection->get_query_options(); + opts.add(custom_opts); + auto partition_slice = query::partition_slice(std::move(ck_bounds), std::move(static_columns), std::move(regular_columns), opts); + auto command = ::make_lw_shared(query_schema->id(), query_schema->version(), partition_slice, proxy.get_max_result_size(partition_slice), + query::tombstone_limit(proxy.get_tombstone_limit())); + + elogger.trace("Executing read query (reversed {}): table schema {}, query schema {}", partition_slice.is_reversed(), table_schema->version(), query_schema->version()); + + auto query_state_ptr = std::make_unique(client_state, trace_state, std::move(permit)); + + // FIXME: should be moved above, set on opts, so get_max_result_size knows it? 
+    command->slice.options.set(); + auto query_options = std::make_unique(cl, std::vector{}); + query_options = std::make_unique(std::move(query_options), std::move(old_paging_state)); + auto p = service::pager::query_pagers::pager(proxy, query_schema, selection, *query_state_ptr, *query_options, command, std::move(partition_ranges), nullptr); + + std::unique_ptr rs = co_await p->fetch_page(limit, gc_clock::now(), executor::default_timeout()); + if (!p->is_exhausted()) { + rs->get_metadata().set_paging_state(p->state()); + } + auto paging_state = rs->get_metadata().paging_state(); + bool has_filter = filter; + auto [items_descr, opt_items, size] = co_await describe_items(*selection, std::move(rs), std::move(attrs_to_get), std::move(filter)); + if (paging_state) { + rjson::add(items_descr, "LastEvaluatedKey", encode_paging_state(*table_schema, *paging_state)); + } + if (has_filter) { + stats.cql_stats.filtered_rows_read_total += p->stats().rows_read_total; + // update our "filtered_row_matched_total" for all the rows matched, despite the filter + stats.cql_stats.filtered_rows_matched_total += size; + } + if (opt_items) { + if (opt_items->size() >= max_items_for_rapidjson_array) { + // There are many items, better print the JSON and the array of + // items (opt_items) separately to avoid RapidJSON's contiguous + // allocation of arrays. + co_return make_streamed_with_extra_array(std::move(items_descr), "Items", std::move(*opt_items)); + } + // There aren't many items in the chunked vector opt_items, + // let's just insert them into the JSON object and print the + // full JSON normally. 
+        // NOTE(review): this chunk begins mid-function — the enclosing
+        // coroutine (which builds a response object "items_descr" from
+        // *opt_items) starts before this hunk and is not visible here.
+        rjson::value items_json = rjson::empty_array();
+        for (auto& item : *opt_items) {
+            rjson::push_back(items_json, std::move(item));
+        }
+        rjson::add(items_descr, "Items", std::move(items_json));
+    }
+    // Large responses are streamed instead of being materialized as one
+    // string (is_big() decides the threshold elsewhere in this file).
+    if (is_big(items_descr)) {
+        co_return make_streamed(std::move(items_descr));
+    }
+    co_return rjson::print(std::move(items_descr));
+}
+
+// Return the token at which the given parallel-scan segment begins.
+// Precondition (enforced by throwing_assert): total_segments > 1 and
+// 0 <= segment < total_segments. Segments split the full signed-64-bit
+// token ring into total_segments equal-width slices.
+// NOTE(review): template arguments were stripped by the extraction of this
+// patch — "std::numeric_limits::max()" is presumably
+// std::numeric_limits<uint64_t>::max() and the ::min() below presumably
+// std::numeric_limits<int64_t>::min(); verify against the original diff.
+// The same angle-bracket stripping affects the whole patch.
+static dht::token token_for_segment(int segment, int total_segments) {
+    throwing_assert(total_segments > 1 && segment >= 0 && segment < total_segments);
+    uint64_t delta = std::numeric_limits::max() / total_segments;
+    return dht::token::from_int64(std::numeric_limits::min() + delta * segment);
+}
+
+// Map a parallel-scan (segment, total_segments) pair to a partition range.
+// The first segment is open on the left, the last is open on the right,
+// middle segments are [start, end); with total_segments == 1 the whole
+// ring is returned.
+static dht::partition_range get_range_for_segment(int segment, int total_segments) {
+    if (total_segments == 1) {
+        return dht::partition_range::make_open_ended_both_sides();
+    }
+    if (segment == 0) {
+        dht::token ending_token = token_for_segment(1, total_segments);
+        return dht::partition_range::make_ending_with(
+            dht::partition_range::bound(dht::ring_position::ending_at(ending_token), false));
+    } else if (segment == total_segments - 1) {
+        dht::token starting_token = token_for_segment(segment, total_segments);
+        return dht::partition_range::make_starting_with(
+            dht::partition_range::bound(dht::ring_position::starting_at(starting_token)));
+    } else {
+        dht::token starting_token = token_for_segment(segment, total_segments);
+        dht::token ending_token = token_for_segment(segment + 1, total_segments);
+        return dht::partition_range::make(
+            dht::partition_range::bound(dht::ring_position::starting_at(starting_token)),
+            dht::partition_range::bound(dht::ring_position::ending_at(ending_token), false)
+        );
+    }
+}
+
+// Handler for the DynamoDB Scan operation: validates Segment/TotalSegments
+// (parallel scan), ExclusiveStartKey, Limit and Select, builds partition
+// ranges, attaches an optional ScanFilter/FilterExpression, and delegates
+// the actual paged read to do_query().
+// NOTE(review): limit_json->GetUint64() is stored into a uint32_t, silently
+// truncating values above 2^32-1; and since limit is unsigned, the
+// "limit <= 0" test below is equivalent to "limit == 0". Presumably
+// harmless for real DynamoDB limits, but worth confirming upstream.
+future executor::scan(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request, std::unique_ptr& audit_info) {
+    _stats.api_operations.scan++;
+    elogger.trace("Scanning {}", request);
+
+    auto [schema, table_type] = get_table_or_view(_proxy, request);
+    db::consistency_level cl = get_read_consistency(request);
+    maybe_audit(audit_info, audit::statement_category::QUERY, schema->ks_name(), schema->cf_name(), "Scan", request, cl);
+    tracing::add_alternator_table_name(trace_state, schema->cf_name());
+    get_stats_from_schema(_proxy, *schema)->api_operations.scan++;
+    auto segment = get_int_attribute(request, "Segment");
+    auto total_segments = get_int_attribute(request, "TotalSegments");
+    // Parallel scan: Segment and TotalSegments must be given together,
+    // and Segment must index a valid slice of TotalSegments.
+    if (segment || total_segments) {
+        if (!segment || !total_segments) {
+            return make_ready_future(api_error::validation(
+                "Both Segment and TotalSegments attributes need to be present for a parallel scan"));
+        }
+        if (*segment < 0 || *segment >= *total_segments) {
+            return make_ready_future(api_error::validation(
+                "Segment must be non-negative and less than TotalSegments"));
+        }
+        if (*total_segments < 0 || *total_segments > 1000000) {
+            return make_ready_future(api_error::validation(
+                "TotalSegments must be non-negative and less or equal to 1000000"));
+        }
+    }
+
+    rjson::value* exclusive_start_key = rjson::find(request, "ExclusiveStartKey");
+
+    if (table_type == table_or_view_type::gsi && cl != db::consistency_level::LOCAL_ONE) {
+        return make_ready_future(api_error::validation(
+            "Consistent reads are not allowed on global indexes (GSI)"));
+    }
+    rjson::value* limit_json = rjson::find(request, "Limit");
+    uint32_t limit = limit_json ? limit_json->GetUint64() : std::numeric_limits::max();
+    if (limit <= 0) {
+        return make_ready_future(api_error::validation("Limit must be greater than 0"));
+    }
+
+    select_type select = parse_select(request, table_type);
+
+    std::unordered_set used_attribute_names;
+    std::unordered_set used_attribute_values;
+    auto attrs_to_get = calculate_attrs_to_get(request, *_parsed_expression_cache, used_attribute_names, select);
+
+    dht::partition_range_vector partition_ranges;
+    if (segment) {
+        auto range = get_range_for_segment(*segment, *total_segments);
+        // A paging cursor must fall inside the requested segment's range,
+        // otherwise the client mixed up cursors between segments.
+        if (exclusive_start_key) {
+            auto ring_pos = dht::ring_position{dht::decorate_key(*schema, pk_from_json(*exclusive_start_key, schema))};
+            if (!range.contains(ring_pos, dht::ring_position_comparator(*schema))) {
+                return make_ready_future(api_error::validation(
+                    format("The provided starting key is invalid: Invalid ExclusiveStartKey. Please use ExclusiveStartKey "
+                           "with correct Segment. TotalSegments: {} Segment: {}", *total_segments, *segment)));
+            }
+        }
+        partition_ranges.push_back(range);
+    } else {
+        partition_ranges.push_back(dht::partition_range::make_open_ended_both_sides());
+    }
+    std::vector ck_bounds{query::clustering_range::make_open_ended_both_sides()};
+
+    filter filter(*_parsed_expression_cache, request, filter::request_type::SCAN, used_attribute_names, used_attribute_values);
+    // Note: Unlike Query, Scan does allow a filter on the key attributes.
+    // For some *specific* cases of key filtering, such an equality test on
+    // partition key or comparison operator for the sort key, we could have
+    // optimized the filtering by modifying partition_ranges and/or
+    // ck_bounds. We haven't done this optimization yet.
+
+    // Reject requests that declare ExpressionAttributeNames/Values entries
+    // which no expression actually referenced.
+    const rjson::value* expression_attribute_names = rjson::find(request, "ExpressionAttributeNames");
+    const rjson::value* expression_attribute_values = rjson::find(request, "ExpressionAttributeValues");
+    verify_all_are_used(expression_attribute_names, used_attribute_names, "ExpressionAttributeNames", "Scan");
+    verify_all_are_used(expression_attribute_values, used_attribute_values, "ExpressionAttributeValues", "Scan");
+
+    return do_query(_proxy, schema, exclusive_start_key, std::move(partition_ranges), std::move(ck_bounds), std::move(attrs_to_get), limit, cl,
+            std::move(filter), query::partition_slice::option_set(), client_state, _stats, trace_state, std::move(permit), _enforce_authorization, _warn_authorization);
+}
+
+// Translate a KeyConditions restriction on the hash (partition) key into a
+// single-partition range. Only the EQ comparison with exactly one attribute
+// value is legal for a hash key.
+static dht::partition_range calculate_pk_bound(schema_ptr schema, const column_definition& pk_cdef, const rjson::value& comp_definition, const rjson::value& attrs) {
+    auto op = get_comparison_operator(comp_definition);
+    if (op != comparison_operator_type::EQ) {
+        throw api_error::validation(format("Hash key can only be restricted with equality operator (EQ). 
{} not supported.", comp_definition));
+    }
+    if (attrs.Size() != 1) {
+        throw api_error::validation(format("A single attribute is required for a hash key EQ restriction: {}", attrs));
+    }
+    bytes raw_value = get_key_from_typed_value(attrs[0], pk_cdef);
+    partition_key pk = partition_key::from_singular_bytes(*schema, std::move(raw_value));
+    auto decorated_key = dht::decorate_key(*schema, pk);
+    return dht::partition_range(decorated_key);
+}
+
+// Build the clustering range matching all keys that begin with the prefix
+// "target": [prefix, prefix-with-last-non-0xFF-byte-incremented). If every
+// byte of the prefix is 0xFF there is no representable upper bound, so the
+// range is open-ended on the right.
+// NOTE(review): parameter "t" (data_type) is never used in this body —
+// presumably a leftover; confirm against the original patch (template
+// arguments, e.g. on std::not_equal_to, were stripped by extraction).
+static query::clustering_range get_clustering_range_for_begins_with(bytes&& target, const clustering_key& ck, schema_ptr schema, data_type t) {
+    // Find the last byte that is not 0xFF; incrementing it (and truncating
+    // after it) yields the smallest string greater than every string with
+    // this prefix.
+    auto it = boost::range::find_end(target, bytes("\xFF"), std::not_equal_to());
+    if (it != target.end()) {
+        ++*it;
+        target.resize(std::distance(target.begin(), it) + 1);
+        clustering_key upper_limit = clustering_key::from_single_value(*schema, target);
+        return query::clustering_range::make(query::clustering_range::bound(ck), query::clustering_range::bound(upper_limit, false));
+    }
+    return query::clustering_range::make_starting_with(query::clustering_range::bound(ck));
+}
+
+// Translate a KeyConditions restriction on the sort (clustering) key into a
+// clustering range. Supports EQ/LE/LT/GE/GT (one attribute value), BETWEEN
+// (two values) and BEGINS_WITH (string-compatible types only).
+static query::clustering_range calculate_ck_bound(schema_ptr schema, const column_definition& ck_cdef, const rjson::value& comp_definition, const rjson::value& attrs) {
+    auto op = get_comparison_operator(comp_definition);
+    const size_t expected_attrs_size = (op == comparison_operator_type::BETWEEN) ? 2 : 1;
+    if (attrs.Size() != expected_attrs_size) {
+        throw api_error::validation(format("{} arguments expected for a sort key restriction: {}", expected_attrs_size, attrs));
+    }
+    bytes raw_value = get_key_from_typed_value(attrs[0], ck_cdef);
+    clustering_key ck = clustering_key::from_single_value(*schema, raw_value);
+    switch (op) {
+    case comparison_operator_type::EQ:
+        return query::clustering_range(ck);
+    case comparison_operator_type::LE:
+        return query::clustering_range::make_ending_with(query::clustering_range::bound(ck));
+    case comparison_operator_type::LT:
+        return query::clustering_range::make_ending_with(query::clustering_range::bound(ck, false));
+    case comparison_operator_type::GE:
+        return query::clustering_range::make_starting_with(query::clustering_range::bound(ck));
+    case comparison_operator_type::GT:
+        return query::clustering_range::make_starting_with(query::clustering_range::bound(ck, false));
+    case comparison_operator_type::BETWEEN: {
+        bytes raw_upper_limit = get_key_from_typed_value(attrs[1], ck_cdef);
+        clustering_key upper_limit = clustering_key::from_single_value(*schema, raw_upper_limit);
+        return query::clustering_range::make(query::clustering_range::bound(ck), query::clustering_range::bound(upper_limit));
+    }
+    case comparison_operator_type::BEGINS_WITH: {
+        // An empty prefix matches everything.
+        if (raw_value.empty()) {
+            return query::clustering_range::make_open_ended_both_sides();
+        }
+        // NOTICE(sarna): A range starting with given prefix and ending (non-inclusively) with a string "incremented" by a single
+        // character at the end. Throws for NUMBER instances.
+        if (!ck_cdef.type->is_compatible_with(*utf8_type)) {
+            throw api_error::validation(fmt::format("BEGINS_WITH operator cannot be applied to type {}", type_to_string(ck_cdef.type)));
+        }
+        return get_clustering_range_for_begins_with(std::move(raw_value), ck, schema, ck_cdef.type);
+    }
+    default:
+        throw api_error::validation(format("Operator {} not supported for sort key", comp_definition));
+    }
+}
+
+// Calculates primary key bounds from KeyConditions
+static std::pair>
+calculate_bounds_conditions(schema_ptr schema, const rjson::value& conditions) {
+    dht::partition_range_vector partition_ranges;
+    std::vector ck_bounds;
+
+    // Each member of "conditions" maps an attribute name to a
+    // {ComparisonOperator, AttributeValueList} object; only conditions on
+    // the partition key and the first clustering key are accepted.
+    for (auto it = conditions.MemberBegin(); it != conditions.MemberEnd(); ++it) {
+        sstring key = rjson::to_sstring(it->name);
+        const rjson::value& condition = it->value;
+
+        const rjson::value& comp_definition = rjson::get(condition, "ComparisonOperator");
+        const rjson::value& attr_list = rjson::get(condition, "AttributeValueList");
+
+        const column_definition& pk_cdef = schema->partition_key_columns().front();
+        const column_definition* ck_cdef = schema->clustering_key_size() > 0 ? &schema->clustering_key_columns().front() : nullptr;
+        if (key == pk_cdef.name_as_text()) {
+            if (!partition_ranges.empty()) {
+                throw api_error::validation("Currently only a single restriction per key is allowed");
+            }
+            partition_ranges.push_back(calculate_pk_bound(schema, pk_cdef, comp_definition, attr_list));
+        }
+        if (ck_cdef && key == ck_cdef->name_as_text()) {
+            if (!ck_bounds.empty()) {
+                throw api_error::validation("Currently only a single restriction per key is allowed");
+            }
+            ck_bounds.push_back(calculate_ck_bound(schema, *ck_cdef, comp_definition, attr_list));
+        }
+    }
+
+    // Validate that a query's conditions must be on the hash key, and
+    // optionally also on the sort key if it exists. 
+    if (partition_ranges.empty()) {
+        throw api_error::validation(format("Query missing condition on hash key '{}'", schema->partition_key_columns().front().name_as_text()));
+    }
+    // MemberCount() is compared against the number of keys the schema has:
+    // a hash-only table takes exactly one condition; a hash+sort table
+    // takes one or two (extra conditions on non-key attributes would have
+    // been silently skipped by the loop above, so count them here).
+    if (schema->clustering_key_size() == 0) {
+        if (conditions.MemberCount() != 1) {
+            throw api_error::validation("Only one condition allowed in table with only hash key");
+        }
+    } else {
+        if (conditions.MemberCount() == 2 && ck_bounds.empty()) {
+            throw api_error::validation(format("Query missing condition on sort key '{}'", schema->clustering_key_columns().front().name_as_text()));
+        } else if (conditions.MemberCount() > 2) {
+            throw api_error::validation("Only one or two conditions allowed in table with hash key and sort key");
+        }
+    }
+
+    if (ck_bounds.empty()) {
+        ck_bounds.push_back(query::clustering_range::make_open_ended_both_sides());
+    }
+
+    return {std::move(partition_ranges), std::move(ck_bounds)};
+}
+
+// Extract the top-level column name specified in a KeyConditionExpression.
+// If a nested attribute path is given, a ValidationException is generated.
+// If the column name is a #reference to ExpressionAttributeNames, the
+// reference is resolved.
+// Note this function returns a string_view, which may refer to data in the
+// given parsed::value or expression_attribute_names.
+static std::string_view get_toplevel(const parsed::value& v,
+        const rjson::value* expression_attribute_names,
+        std::unordered_set& used_attribute_names)
+{
+    const parsed::path& path = std::get(v._value);
+    if (path.has_operators()) {
+        throw api_error::validation("KeyConditionExpression does not support nested attributes");
+    }
+    std::string_view column_name = path.root();
+    if (column_name.size() > 0 && column_name[0] == '#') {
+        // Record the reference (including the leading '#') so the caller
+        // can later verify all ExpressionAttributeNames entries were used.
+        used_attribute_names.emplace(column_name);
+        if (!expression_attribute_names) {
+            throw api_error::validation(
+                fmt::format("ExpressionAttributeNames missing, entry '{}' required by KeyConditionExpression",
+                    column_name));
+        }
+        const rjson::value* value = rjson::find(*expression_attribute_names, column_name);
+        if (!value || !value->IsString()) {
+            throw api_error::validation(
+                fmt::format("ExpressionAttributeNames missing entry '{}' required by KeyConditionExpression",
+                    column_name));
+        }
+        column_name = rjson::to_string_view(*value);
+    }
+    return column_name;
+}
+
+// Extract a constant value specified in a KeyConditionExpression.
+// This constant was originally parsed as a reference (:name) to a member of
+// ExpressionAttributeValues, but at this point, after resolve_value(), it
+// was already converted into a JSON value.
+// This function decodes the value (using its given expected type) into bytes
+// which Scylla uses as the actual key value. If the value has the wrong type,
+// or the input had other problems, a ValidationException is thrown.
+static bytes get_constant_value(const parsed::value& v,
+        const column_definition& column)
+{
+    const parsed::constant& constant = std::get(v._value);
+    const parsed::constant::literal& lit = std::get(constant._value);
+    return get_key_from_typed_value(*lit, column);
+}
+
+// condition_expression_and_list extracts a list of ANDed primitive conditions
+// from a condition_expression. This is useful for KeyConditionExpression,
+// which may not use OR or NOT. If the given condition_expression does use
+// OR or NOT, this function throws a ValidationException.
+static void condition_expression_and_list(
+        const parsed::condition_expression& condition_expression,
+        std::vector& conditions)
+{
+    if (condition_expression._negated) {
+        throw api_error::validation("KeyConditionExpression cannot use NOT");
+    }
+    std::visit(overloaded_functor {
+        [&] (const parsed::primitive_condition& cond) {
+            conditions.push_back(&cond);
+        },
+        [&] (const parsed::condition_expression::condition_list& list) {
+            // A one-element '|' list is not really an OR, so only reject
+            // genuine disjunctions; recurse to flatten nested ANDs.
+            if (list.op == '|' && list.conditions.size() > 1) {
+                throw api_error::validation("KeyConditionExpression cannot use OR");
+            }
+            for (const parsed::condition_expression& cond : list.conditions) {
+                condition_expression_and_list(cond, conditions);
+            }
+        }
+    }, condition_expression._expression);
+}
+
+// Calculates primary key bounds from KeyConditionExpression
+static std::pair>
+calculate_bounds_condition_expression(schema_ptr schema,
+        const rjson::value& expression,
+        const rjson::value* expression_attribute_values,
+        std::unordered_set& used_attribute_values,
+        const rjson::value* expression_attribute_names,
+        std::unordered_set& used_attribute_names,
+        parsed::expression_cache& parsed_expression_cache)
+{
+    if (!expression.IsString()) {
+        throw api_error::validation("KeyConditionExpression must be a string");
+    }
+    if (expression.GetStringLength() == 0) {
+        throw api_error::validation("KeyConditionExpression must not be empty");
+    }
+    // We parse the KeyConditionExpression with the same parser we use for
+    // ConditionExpression. But KeyConditionExpression only supports a subset
+    // of the ConditionExpression features, so we have many additional
+    // verifications below that the key condition is legal. Briefly, a valid
+    // key condition must contain a single partition key and a single
+    // sort-key range. 
+    parsed::condition_expression p;
+    try {
+        p = parsed_expression_cache.parse_condition_expression(rjson::to_string_view(expression), "KeyConditionExpression");
+    } catch(expressions_syntax_error& e) {
+        throw api_error::validation(e.what());
+    }
+    resolve_condition_expression(p,
+        expression_attribute_names, expression_attribute_values,
+        used_attribute_names, used_attribute_values);
+    std::vector conditions;
+    condition_expression_and_list(p, conditions);
+
+    if (conditions.size() < 1 || conditions.size() > 2) {
+        throw api_error::validation(
+            "KeyConditionExpression syntax error: must have 1 or 2 conditions");
+    }
+    // Scylla allows us to have an (equality) constraint on the partition key
+    // pk_cdef, and a range constraint on the *first* clustering key ck_cdef.
+    // Note that this is also good enough for our GSI implementation - the
+    // GSI's user-specified sort key will be the first clustering key.
+    // FIXME: In the case described in issue #5320 (base and GSI both have
+    // just hash key - but different ones), this may allow the user to Query
+    // using the base key which isn't officially part of the GSI.
+    const column_definition& pk_cdef = schema->partition_key_columns().front();
+    const column_definition* ck_cdef = schema->clustering_key_size() > 0 ?
+        &schema->clustering_key_columns().front() : nullptr;
+
+    dht::partition_range_vector partition_ranges;
+    std::vector ck_bounds;
+    for (const parsed::primitive_condition* condp : conditions) {
+        const parsed::primitive_condition& cond = *condp;
+        // In all comparison operators, one operand must be a column name,
+        // the other is a constant (value reference). We remember which is
+        // which in toplevel_ind, and also the column name in key (not just
+        // for comparison operators).
+        // NOTE(review): toplevel_ind is set to -1 in the begins_with case;
+        // presumably the single-value branch below always handles that case
+        // before "!toplevel_ind" is evaluated - verify against the original.
+        std::string_view key;
+        int toplevel_ind;
+        switch (cond._values.size()) {
+        case 1: {
+            // The only legal single-value condition is a begin_with() function,
+            // and it must have two parameters - a top-level attribute and a
+            // value reference..
+            const parsed::value::function_call *f = std::get_if(&cond._values[0]._value);
+            if (!f) {
+                throw api_error::validation("KeyConditionExpression cannot be just a value");
+            }
+            if (f->_function_name != "begins_with") {
+                throw api_error::validation(
+                    fmt::format("KeyConditionExpression function '{}' not supported",f->_function_name));
+            }
+            if (f->_parameters.size() != 2 || !f->_parameters[0].is_path() ||
+                !f->_parameters[1].is_constant()) {
+                throw api_error::validation(
+                    "KeyConditionExpression begins_with() takes attribute and value");
+            }
+            key = get_toplevel(f->_parameters[0], expression_attribute_names, used_attribute_names);
+            toplevel_ind = -1;
+            break;
+        }
+        case 2:
+            if (cond._values[0].is_path() && cond._values[1].is_constant()) {
+                toplevel_ind = 0;
+            } else if (cond._values[1].is_path() && cond._values[0].is_constant()) {
+                toplevel_ind = 1;
+            } else {
+                throw api_error::validation("KeyConditionExpression must compare attribute with constant");
+            }
+            key = get_toplevel(cond._values[toplevel_ind], expression_attribute_names, used_attribute_names);
+            break;
+        case 3:
+            // Only BETWEEN has three operands. First must be a column name,
+            // two other must be value references (constants):
+            if (cond._op != parsed::primitive_condition::type::BETWEEN) {
+                // Shouldn't happen unless we have a bug in the parser
+                throw std::logic_error(format("Wrong number of values {} in primitive_condition", cond._values.size()));
+            }
+            if (cond._values[0].is_path() && cond._values[1].is_constant() && cond._values[2].is_constant()) {
+                toplevel_ind = 0;
+                key = get_toplevel(cond._values[0], expression_attribute_names, used_attribute_names);
+            } else {
+                throw api_error::validation("KeyConditionExpression must compare attribute with constants");
+            }
+            break;
+        default:
+            // Shouldn't happen unless we have a bug in the parser
+            throw std::logic_error(format("Wrong number of values {} in primitive_condition", cond._values.size()));
+        }
+        if (cond._op == parsed::primitive_condition::type::IN) {
+            throw api_error::validation("KeyConditionExpression does not support IN operator");
+        } else if (cond._op == parsed::primitive_condition::type::NE) {
+            throw api_error::validation("KeyConditionExpression does not support NE operator");
+        } else if (cond._op == parsed::primitive_condition::type::EQ) {
+            // the EQ operator (=) is the only one which can be used for both
+            // the partition key and sort key:
+            if (sstring(key) == pk_cdef.name_as_text()) {
+                if (!partition_ranges.empty()) {
+                    throw api_error::validation(
+                        "KeyConditionExpression allows only one condition for each key");
+                }
+                bytes raw_value = get_constant_value(cond._values[!toplevel_ind], pk_cdef);
+                partition_key pk = partition_key::from_singular_bytes(*schema, std::move(raw_value));
+                auto decorated_key = dht::decorate_key(*schema, pk);
+                partition_ranges.push_back(dht::partition_range(decorated_key));
+            } else if (ck_cdef && sstring(key) == ck_cdef->name_as_text()) {
+                if (!ck_bounds.empty()) {
+                    throw api_error::validation(
+                        "KeyConditionExpression allows only one condition for each key");
+                }
+                bytes raw_value = get_constant_value(cond._values[!toplevel_ind], *ck_cdef);
+                clustering_key ck = clustering_key::from_single_value(*schema, raw_value);
+                ck_bounds.push_back(query::clustering_range(ck));
+            } else {
+                throw api_error::validation(
+                    fmt::format("KeyConditionExpression condition on non-key attribute {}", key));
+            }
+            continue;
+        }
+        // If we're still here, it's any other operator besides EQ, and these
+        // are allowed *only* on the clustering key:
+        if (sstring(key) == pk_cdef.name_as_text()) {
+            throw api_error::validation(
+                fmt::format("KeyConditionExpression only '=' condition is supported on partition key {}", key));
+        } else if (!ck_cdef || sstring(key) != ck_cdef->name_as_text()) {
+            throw api_error::validation(
+                fmt::format("KeyConditionExpression condition on non-key attribute {}", key));
+        }
+        if (!ck_bounds.empty()) {
+            throw api_error::validation(
+                "KeyConditionExpression allows only one condition for each key");
+        }
+        if (cond._op == parsed::primitive_condition::type::BETWEEN) {
+            clustering_key ck1 = clustering_key::from_single_value(*schema,
+                get_constant_value(cond._values[1], *ck_cdef));
+            clustering_key ck2 = clustering_key::from_single_value(*schema,
+                get_constant_value(cond._values[2], *ck_cdef));
+            ck_bounds.push_back(query::clustering_range::make(
+                query::clustering_range::bound(ck1), query::clustering_range::bound(ck2)));
+            continue;
+        } else if (cond._values.size() == 1) {
+            // We already verified above, that this case this can only be a
+            // function call to begins_with(), with the first parameter the
+            // key, the second the value reference.
+            bytes raw_value = get_constant_value(
+                std::get(cond._values[0]._value)._parameters[1], *ck_cdef);
+            if (!ck_cdef->type->is_compatible_with(*utf8_type)) {
+                // begins_with() supported on bytes and strings (both stored
+                // in the database as strings) but not on numbers.
+                throw api_error::validation(
+                    fmt::format("KeyConditionExpression begins_with() not supported on type {}",
+                        type_to_string(ck_cdef->type)));
+            } else if (raw_value.empty()) {
+                ck_bounds.push_back(query::clustering_range::make_open_ended_both_sides());
+            } else {
+                clustering_key ck = clustering_key::from_single_value(*schema, raw_value);
+                ck_bounds.push_back(get_clustering_range_for_begins_with(std::move(raw_value), ck, schema, ck_cdef->type));
+            }
+            continue;
+        }
+
+        // All remaining operator have one value reference parameter in index
+        // !toplevel_ind. Note how toplevel_ind==1 reverses the direction of
+        // an inequality.
+        bytes raw_value = get_constant_value(cond._values[!toplevel_ind], *ck_cdef);
+        clustering_key ck = clustering_key::from_single_value(*schema, raw_value);
+        if ((cond._op == parsed::primitive_condition::type::LT && toplevel_ind == 0) ||
+            (cond._op == parsed::primitive_condition::type::GT && toplevel_ind == 1)) {
+            ck_bounds.push_back(query::clustering_range::make_ending_with(query::clustering_range::bound(ck, false)));
+        } else if ((cond._op == parsed::primitive_condition::type::GT && toplevel_ind == 0) ||
+            (cond._op == parsed::primitive_condition::type::LT && toplevel_ind == 1)) {
+            ck_bounds.push_back(query::clustering_range::make_starting_with(query::clustering_range::bound(ck, false)));
+        } else if ((cond._op == parsed::primitive_condition::type::LE && toplevel_ind == 0) ||
+            (cond._op == parsed::primitive_condition::type::GE && toplevel_ind == 1)) {
+            ck_bounds.push_back(query::clustering_range::make_ending_with(query::clustering_range::bound(ck)));
+        } else if ((cond._op == parsed::primitive_condition::type::GE && toplevel_ind == 0) ||
+            (cond._op == parsed::primitive_condition::type::LE && toplevel_ind == 1)) {
+            ck_bounds.push_back(query::clustering_range::make_starting_with(query::clustering_range::bound(ck)));
+        }
+    }
+
+    if (partition_ranges.empty()) {
+        throw api_error::validation(
+            format("KeyConditionExpression requires 
a condition on partition key {}", pk_cdef.name_as_text()));
+    }
+    if (ck_bounds.empty()) {
+        ck_bounds.push_back(query::clustering_range::make_open_ended_both_sides());
+    }
+    return {std::move(partition_ranges), std::move(ck_bounds)};
+}
+
+// Handler for a Query request that performs a vector (ANN) search through a
+// vector index: validates IndexName/VectorSearch/QueryVector/Limit, asks the
+// vector store for the approximate nearest neighbors, then either answers
+// directly from the returned keys or fetches full items from the base table.
+// NOTE(review): this function continues beyond the end of this hunk - the
+// tail (response assembly after the per-key fetch loop) is not visible here.
+static future query_vector(
+        service::storage_proxy& proxy,
+        vector_search::vector_store_client& vsc,
+        rjson::value request,
+        service::client_state& client_state,
+        tracing::trace_state_ptr trace_state,
+        service_permit permit,
+        bool enforce_authorization,
+        bool warn_authorization,
+        alternator::stats& stats,
+        parsed::expression_cache& parsed_expr_cache) {
+    schema_ptr base_schema = get_table(proxy, request);
+    get_stats_from_schema(proxy, *base_schema)->api_operations.query++;
+    tracing::add_alternator_table_name(trace_state, base_schema->cf_name());
+
+    // If vector search is requested, IndexName must be given and must
+    // refer to a vector index - not to a GSI or LSI.
+    const rjson::value* index_name_v = rjson::find(request, "IndexName");
+    if (!index_name_v || !index_name_v->IsString()) {
+        co_return api_error::validation(
+            "VectorSearch requires IndexName referring to a vector index");
+    }
+    std::string_view index_name = rjson::to_string_view(*index_name_v);
+    int dimensions = 0;
+    bool is_vector = false;
+    for (const index_metadata& im : base_schema->indices()) {
+        if (im.name() == index_name) {
+            const auto& opts = im.options();
+            // The only secondary index we expect to see is a vector index.
+            // We also expect it to have a valid "dimensions".
+            auto it = opts.find(db::index::secondary_index::custom_class_option_name);
+            if (it == opts.end() || it->second != "vector_index") {
+                on_internal_error(elogger, fmt::format("IndexName '{}' is a secondary index but not a vector index.", index_name));
+            }
+            it = opts.find("dimensions");
+            if (it != opts.end()) {
+                try {
+                    dimensions = std::stoi(it->second);
+                } catch (std::logic_error&) {}
+            }
+            // NOTE(review): an unparsable or missing "dimensions" option
+            // leaves dimensions == 0 and trips this assert rather than
+            // returning a user-visible error - presumably intentional since
+            // the option is written by Scylla itself; confirm.
+            throwing_assert(dimensions > 0);
+            is_vector = true;
+            break;
+        }
+    }
+    if (!is_vector) {
+        co_return api_error::validation(
+            format("VectorSearch IndexName '{}' is not a vector index.", index_name));
+    }
+    // QueryVector is required inside VectorSearch.
+    const rjson::value* vector_search = rjson::find(request, "VectorSearch");
+    if (!vector_search || !vector_search->IsObject()) {
+        co_return api_error::validation(
+            "VectorSearch requires a VectorSearch parameter");
+    }
+    const rjson::value* query_vector = rjson::find(*vector_search, "QueryVector");
+    if (!query_vector || !query_vector->IsObject()) {
+        co_return api_error::validation(
+            "VectorSearch requires a QueryVector parameter");
+    }
+    // QueryVector should be is a DynamoDB value, which must be of type "L"
+    // (a list), containing only elements of type "N" (numbers). The number
+    // of these elements must be exactly the "dimensions" defined for this
+    // vector index. We'll now validate all these assumptions and parse
+    // all the numbers in the vector into an std::vector query_vec -
+    // the type that ann() wants.
+    const rjson::value* qv_list = rjson::find(*query_vector, "L");
+    if (!qv_list || !qv_list->IsArray()) {
+        co_return api_error::validation(
+            "VectorSearch QueryVector must be a list of numbers");
+    }
+    const auto& arr = qv_list->GetArray();
+    if ((int)arr.Size() != dimensions) {
+        co_return api_error::validation(
+            format("VectorSearch QueryVector length {} does not match index Dimensions {}",
+                arr.Size(), dimensions));
+    }
+    std::vector query_vec;
+    query_vec.reserve(arr.Size());
+    for (const rjson::value& elem : arr) {
+        if (!elem.IsObject()) {
+            co_return api_error::validation(
+                "VectorSearch QueryVector must contain only numbers");
+        }
+        const rjson::value* n_val = rjson::find(elem, "N");
+        if (!n_val || !n_val->IsString()) {
+            co_return api_error::validation(
+                "VectorSearch QueryVector must contain only numbers");
+        }
+        // DynamoDB numbers arrive as strings; parse with from_chars and
+        // insist the whole string was consumed and the result is finite.
+        std::string_view num_str = rjson::to_string_view(*n_val);
+        float f;
+        auto [ptr, ec] = std::from_chars(num_str.data(), num_str.data() + num_str.size(), f);
+        if (ec != std::errc{} || ptr != num_str.data() + num_str.size() || !std::isfinite(f)) {
+            co_return api_error::validation(
+                format("VectorSearch QueryVector element '{}' is not a valid number", num_str));
+        }
+        query_vec.push_back(f);
+    }
+
+    // Limit is mandatory for vector search: it defines k, the number of
+    // nearest neighbors to return.
+    const rjson::value* limit_json = rjson::find(request, "Limit");
+    if (!limit_json || !limit_json->IsUint()) {
+        co_return api_error::validation("VectorSearch requires a positive integer Limit parameter");
+    }
+    uint32_t limit = limit_json->GetUint();
+    if (limit == 0) {
+        co_return api_error::validation("Limit must be greater than 0");
+    }
+
+    // Consistent reads are not supported for vector search, just like GSI.
+    if (get_read_consistency(request) != db::consistency_level::LOCAL_ONE) {
+        co_return api_error::validation(
+            "Consistent reads are not allowed on vector indexes");
+    }
+
+    // Pagination (ExclusiveStartKey) is not supported for vector search.
+    if (rjson::find(request, "ExclusiveStartKey")) {
+        co_return api_error::validation(
+            "VectorSearch does not support pagination (ExclusiveStartKey)");
+    }
+
+    // ScanIndexForward is not supported for vector search: the ordering of
+    // results is determined by vector distance, not by the sort key.
+    if (rjson::find(request, "ScanIndexForward")) {
+        co_return api_error::validation(
+            "VectorSearch does not support ScanIndexForward");
+    }
+
+    std::unordered_set used_attribute_names;
+    std::unordered_set used_attribute_values;
+    // Parse the Select parameter and determine which attributes to return.
+    // For a vector index, the default Select is ALL_ATTRIBUTES (full items).
+    // ALL_PROJECTED_ATTRIBUTES is significantly more efficent because it
+    // returns what the vector store returned without looking up additional
+    // base-table data. Currently only the primary key attributes are projected
+    // but in the future we'll implement projecting additional attributes into
+    // the vector index - these additional attributes will also be usable for
+    // filtering). COUNT returns only the count without items.
+    select_type select = parse_select(request, table_or_view_type::vector_index);
+    std::optional attrs_to_get_opt;
+    if (select == select_type::projection) {
+        // ALL_PROJECTED_ATTRIBUTES for a vector index: return only key attributes.
+        alternator::attrs_to_get key_attrs;
+        for (const column_definition& cdef : base_schema->partition_key_columns()) {
+            attribute_path_map_add("Select", key_attrs, cdef.name_as_text());
+        }
+        for (const column_definition& cdef : base_schema->clustering_key_columns()) {
+            attribute_path_map_add("Select", key_attrs, cdef.name_as_text());
+        }
+        attrs_to_get_opt = std::move(key_attrs);
+    } else {
+        attrs_to_get_opt = calculate_attrs_to_get(request, parsed_expr_cache, used_attribute_names, select);
+    }
+    // QueryFilter (the old-style API) is not supported for vector search Queries.
+    if (rjson::find(request, "QueryFilter")) {
+        co_return api_error::validation(
+            "VectorSearch does not support QueryFilter; use FilterExpression instead");
+    }
+    // FilterExpression: post-filter the vector search results by any attribute.
+    filter flt(parsed_expr_cache, request, filter::request_type::QUERY,
+        used_attribute_names, used_attribute_values);
+    const rjson::value* expression_attribute_names = rjson::find(request, "ExpressionAttributeNames");
+    verify_all_are_used(expression_attribute_names, used_attribute_names, "ExpressionAttributeNames", "Query");
+    const rjson::value* expression_attribute_values = rjson::find(request, "ExpressionAttributeValues");
+    verify_all_are_used(expression_attribute_values, used_attribute_values, "ExpressionAttributeValues", "Query");
+
+    // Verify the user has SELECT permission on the base table, as we
+    // do for every type of read operation after validating the input
+    // parameters.
+    co_await verify_permission(enforce_authorization, warn_authorization,
+        client_state, base_schema, auth::permission::SELECT, stats);
+
+    // Query the vector store for the approximate nearest neighbors.
+    auto timeout = executor::default_timeout();
+    abort_on_expiry aoe(timeout);
+    rjson::value pre_filter = rjson::empty_object(); // TODO, implement
+    auto pkeys_result = co_await vsc.ann(
+        base_schema->ks_name(), std::string(index_name), base_schema,
+        std::move(query_vec), limit, pre_filter, aoe.abort_source());
+    if (!pkeys_result.has_value()) {
+        const sstring error_msg = std::visit(vector_search::error_visitor{}, pkeys_result.error());
+        co_return api_error::validation(error_msg);
+    }
+    const std::vector& pkeys = pkeys_result.value();
+
+    // For SELECT=COUNT with no filter: skip fetching from the base table and
+    // just return the count of candidates returned by the vector store.
+    // If a filter is present, fall through to the base-table fetch to apply it.
+    if (select == select_type::count && !flt) {
+        rjson::value response = rjson::empty_object();
+        rjson::add(response, "Count", rjson::value(static_cast(pkeys.size())));
+        rjson::add(response, "ScannedCount", rjson::value(static_cast(pkeys.size())));
+        co_return rjson::print(std::move(response));
+    }
+
+    // For SELECT=ALL_PROJECTED_ATTRIBUTES with no filter: skip fetching from
+    // the base table and build items directly from the key columns returned by
+    // the vector store. If a filter is present, fall through to the base-table
+    // fetch to apply it.
+    if (select == select_type::projection && !flt) {
+        rjson::value items_json = rjson::empty_array();
+        for (const auto& pkey : pkeys) {
+            // Rebuild each item's key attributes from the exploded partition
+            // (and, if present, clustering) key components.
+            rjson::value item = rjson::empty_object();
+            std::vector exploded_pk = pkey.partition.key().explode();
+            auto exploded_pk_it = exploded_pk.begin();
+            for (const column_definition& cdef : base_schema->partition_key_columns()) {
+                rjson::value key_val = rjson::empty_object();
+                rjson::add_with_string_name(key_val, type_to_string(cdef.type), json_key_column_value(*exploded_pk_it, cdef));
+                rjson::add_with_string_name(item, std::string_view(cdef.name_as_text()), std::move(key_val));
+                ++exploded_pk_it;
+            }
+            if (base_schema->clustering_key_size() > 0) {
+                std::vector exploded_ck = pkey.clustering.explode();
+                auto exploded_ck_it = exploded_ck.begin();
+                for (const column_definition& cdef : base_schema->clustering_key_columns()) {
+                    rjson::value key_val = rjson::empty_object();
+                    rjson::add_with_string_name(key_val, type_to_string(cdef.type), json_key_column_value(*exploded_ck_it, cdef));
+                    rjson::add_with_string_name(item, std::string_view(cdef.name_as_text()), std::move(key_val));
+                    ++exploded_ck_it;
+                }
+            }
+            rjson::push_back(items_json, std::move(item));
+        }
+        rjson::value response = rjson::empty_object();
+        rjson::add(response, "Count", rjson::value(static_cast(items_json.Size())));
+        rjson::add(response, "ScannedCount", rjson::value(static_cast(pkeys.size())));
+        rjson::add(response, "Items", std::move(items_json));
+        co_return rjson::print(std::move(response));
+    }
+
+    // TODO: For SELECT=SPECIFIC_ATTRIBUTES, if they are part of the projected
+    // attributes, we should use the above optimized code path - not fall through
+    // to the read from the base table as below as we need to do if the specific
+    // attributes contain non-projected columns.
+
+    // Fetch the matching items from the base table and build the response.
+    // When a filter is present, we always fetch the full item so that all
+    // attributes are available for filter evaluation, regardless of the
+    // projection required for the final response.
+    auto selection = cql3::selection::selection::wildcard(base_schema);
+    auto regular_columns = base_schema->regular_columns()
+        | std::views::transform(&column_definition::id)
+        | std::ranges::to();
+    auto attrs_to_get = ::make_shared>(
+        flt ? std::nullopt : std::move(attrs_to_get_opt));
+
+    rjson::value items_json = rjson::empty_array();
+    int matched_count = 0;
+
+    // Query each primary key individually, in the order returned by the
+    // vector store, to preserve vector-distance ordering in the response.
+    // FIXME: do this more efficiently with a batched read that preserves ordering.
+    for (const auto& pkey : pkeys) {
+        std::vector bounds{
+            base_schema->clustering_key_size() > 0
+                ? query::clustering_range::make_singular(pkey.clustering)
+                : query::clustering_range::make_open_ended_both_sides()};
+        auto partition_slice = query::partition_slice(std::move(bounds), {},
+            regular_columns, selection->get_query_options());
+        auto command = ::make_lw_shared(
+            base_schema->id(), base_schema->version(), partition_slice,
+            proxy.get_max_result_size(partition_slice),
+            query::tombstone_limit(proxy.get_tombstone_limit()));
+        service::storage_proxy::coordinator_query_result qr =
+            co_await proxy.query(base_schema, command,
+                {dht::partition_range(pkey.partition)},
+                db::consistency_level::LOCAL_ONE,
+                service::storage_proxy::coordinator_query_options(
+                    timeout, permit, client_state, trace_state));
+        auto opt_item = describe_single_item(base_schema, partition_slice,
+            *selection, *qr.query_result, *attrs_to_get);
+        if (opt_item && (!flt || flt.check(*opt_item))) {
+            ++matched_count;
+            if (select != select_type::count) {
+                if (select == select_type::projection) {
+                    // A filter caused us to fall through here instead of
+                    // taking the projection early-exit above. 
Reconstruct + // the key-only item from the full item we fetched. + rjson::value key_item = rjson::empty_object(); + for (const column_definition& cdef : base_schema->partition_key_columns()) { + if (const rjson::value* v = rjson::find(*opt_item, cdef.name_as_text())) { + rjson::add_with_string_name(key_item, cdef.name_as_text(), rjson::copy(*v)); + } + } + for (const column_definition& cdef : base_schema->clustering_key_columns()) { + if (const rjson::value* v = rjson::find(*opt_item, cdef.name_as_text())) { + rjson::add_with_string_name(key_item, cdef.name_as_text(), rjson::copy(*v)); + } + } + rjson::push_back(items_json, std::move(key_item)); + } else { + // When a filter caused us to fetch the full item, apply the + // requested projection (attrs_to_get_opt) before returning it. + // This mirrors describe_items_visitor::end_row() which removes + // extra filter attributes from the returned item. + if (flt && attrs_to_get_opt) { + for (const auto& [attr_name, subpath] : *attrs_to_get_opt) { + if (!subpath.has_value()) { + if (rjson::value* toplevel = rjson::find(*opt_item, attr_name)) { + if (!hierarchy_filter(*toplevel, subpath)) { + rjson::remove_member(*opt_item, attr_name); + } + } + } + } + std::vector to_remove; + for (auto it = opt_item->MemberBegin(); it != opt_item->MemberEnd(); ++it) { + std::string key(it->name.GetString(), it->name.GetStringLength()); + if (!attrs_to_get_opt->contains(key)) { + to_remove.push_back(std::move(key)); + } + } + for (const auto& key : to_remove) { + rjson::remove_member(*opt_item, key); + } + } + rjson::push_back(items_json, std::move(*opt_item)); + } + } + } + } + + rjson::value response = rjson::empty_object(); + if (select == select_type::count) { + rjson::add(response, "Count", rjson::value(matched_count)); + } else { + rjson::add(response, "Count", rjson::value(static_cast(items_json.Size()))); + rjson::add(response, "Items", std::move(items_json)); + } + rjson::add(response, "ScannedCount", 
 rjson::value(static_cast(pkeys.size())));
+    co_return rjson::print(std::move(response));
+}
+
+// Handle the Query request (the non-vector-search path). Validates the
+// request parameters, computes key bounds from KeyConditions or
+// KeyConditionExpression, and delegates the actual read to do_query().
+// NOTE(review): template argument lists appear to have been stripped from
+// this extract (e.g. "future", "std::unordered_set", "set_if") - restore
+// them from the upstream source before building.
+future executor::query(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request, std::unique_ptr& audit_info) {
+    _stats.api_operations.query++;
+    elogger.trace("Querying {}", request);
+
+    if (rjson::find(request, "VectorSearch")) {
+        // If vector search is requested, we have a separate code path.
+        // IndexName must be given and must refer to a vector index - not
+        // to a GSI or LSI as the code below assumes.
+        return query_vector(_proxy, _vsc, std::move(request), client_state, trace_state, std::move(permit),
+            _enforce_authorization, _warn_authorization, _stats, *_parsed_expression_cache);
+    }
+
+    auto [schema, table_type] = get_table_or_view(_proxy, request);
+    db::consistency_level cl = get_read_consistency(request);
+    maybe_audit(audit_info, audit::statement_category::QUERY, schema->ks_name(), schema->cf_name(), "Query", request, cl);
+
+    get_stats_from_schema(_proxy, *schema)->api_operations.query++;
+    tracing::add_alternator_table_name(trace_state, schema->cf_name());
+
+    rjson::value* exclusive_start_key = rjson::find(request, "ExclusiveStartKey");
+    if (table_type == table_or_view_type::gsi && cl != db::consistency_level::LOCAL_ONE) {
+        return make_ready_future(api_error::validation(
+            "Consistent reads are not allowed on global indexes (GSI)"));
+    }
+    rjson::value* limit_json = rjson::find(request, "Limit");
+    // NOTE(review): GetUint64() is called without an IsUint64() check, so a
+    // non-numeric or negative "Limit" would trip RapidJSON's assertion -
+    // confirm the value is validated earlier. Also, "limit" is unsigned, so
+    // the "limit <= 0" test below is effectively "limit == 0".
+    uint32_t limit = limit_json ? limit_json->GetUint64() : std::numeric_limits::max();
+    if (limit <= 0) {
+        return make_ready_future(api_error::validation("Limit must be greater than 0"));
+    }
+
+    const bool forward = get_bool_attribute(request, "ScanIndexForward", true);
+
+    rjson::value* key_conditions = rjson::find(request, "KeyConditions");
+    rjson::value* key_condition_expression = rjson::find(request, "KeyConditionExpression");
+    std::unordered_set used_attribute_values;
+    std::unordered_set used_attribute_names;
+    if (key_conditions && key_condition_expression) {
+        throw api_error::validation("Query does not allow both "
+                "KeyConditions and KeyConditionExpression to be given together");
+    } else if (!key_conditions && !key_condition_expression) {
+        throw api_error::validation("Query must have one of "
+                "KeyConditions or KeyConditionExpression");
+    }
+
+    const rjson::value* expression_attribute_names = rjson::find(request, "ExpressionAttributeNames");
+    const rjson::value* expression_attribute_values = rjson::find(request, "ExpressionAttributeValues");
+
+    // exactly one of key_conditions or key_condition_expression
+    auto [partition_ranges, ck_bounds] = key_conditions
+        ? calculate_bounds_conditions(schema, *key_conditions)
+        : calculate_bounds_condition_expression(schema, *key_condition_expression,
+            expression_attribute_values,
+            used_attribute_values,
+            expression_attribute_names,
+            used_attribute_names, *_parsed_expression_cache);
+
+    filter filter(*_parsed_expression_cache, request, filter::request_type::QUERY,
+            used_attribute_names, used_attribute_values);
+
+    // A query is not allowed to filter on the partition key or the sort key.
+    for (const column_definition& cdef : schema->partition_key_columns()) { // just one
+        if (filter.filters_on(cdef.name_as_text())) {
+            return make_ready_future(api_error::validation(
+                format("QueryFilter can only contain non-primary key attributes: Partition key attribute: {}", cdef.name_as_text())));
+        }
+    }
+    for (const column_definition& cdef : schema->clustering_key_columns()) {
+        if (filter.filters_on(cdef.name_as_text())) {
+            return make_ready_future(api_error::validation(
+                format("QueryFilter can only contain non-primary key attributes: Sort key attribute: {}", cdef.name_as_text())));
+        }
+        // FIXME: this "break" can avoid listing some clustering key columns
+        // we added for GSIs just because they existed in the base table -
+        // but not in all cases. We still have issue #5320.
+        break;
+    }
+
+    select_type select = parse_select(request, table_type);
+
+    auto attrs_to_get = calculate_attrs_to_get(request, *_parsed_expression_cache, used_attribute_names, select);
+    verify_all_are_used(expression_attribute_names, used_attribute_names, "ExpressionAttributeNames", "Query");
+    verify_all_are_used(expression_attribute_values, used_attribute_values, "ExpressionAttributeValues", "Query");
+    query::partition_slice::option_set opts;
+    opts.set_if(!forward);
+    return do_query(_proxy, schema, exclusive_start_key, std::move(partition_ranges), std::move(ck_bounds), std::move(attrs_to_get), limit, cl,
+            std::move(filter), opts, client_state, _stats, std::move(trace_state), std::move(permit), _enforce_authorization, _warn_authorization);
+}
+
+// Converts a multi-row selection result to JSON compatible with DynamoDB.
+// For each row, this method calls item_callback, which takes the size of
+// the item as the parameter.
+static future> describe_multi_item(schema_ptr schema,
+        const query::partition_slice&& slice,
+        shared_ptr selection,
+        foreign_ptr> query_result,
+        shared_ptr> attrs_to_get,
+        noncopyable_function item_callback) {
+    // NOTE(review): "const query::partition_slice&&" is an odd parameter
+    // type - a const rvalue reference cannot be moved from; presumably a
+    // plain "const ...&" or "...&&" was intended. Confirm against upstream.
+    cql3::selection::result_set_builder builder(*selection, gc_clock::now());
+    query::result_view::consume(*query_result, slice, cql3::selection::result_set_builder::visitor(builder, *schema, *selection));
+    auto result_set = builder.build();
+    std::vector ret;
+    for (auto& result_row : result_set->rows()) {
+        rjson::value item = rjson::empty_object();
+        uint64_t item_length_in_bytes = 0;
+        describe_single_item(*selection, result_row, *attrs_to_get, item, &item_length_in_bytes);
+        // Report each produced item's size to the caller - used below for
+        // RCU accounting and per-item size metrics.
+        if (item_callback) {
+            item_callback(item_length_in_bytes);
+        }
+        ret.push_back(std::move(item));
+        // Yield between rows so a large result set does not monopolize the
+        // reactor.
+        co_await coroutine::maybe_yield();
+    }
+    co_return ret;
+}
+
+// describe_item() wraps the result of describe_single_item() by a map
+// as needed by the GetItem request. It should not be used for other purposes,
+// use describe_single_item() instead.
+static rjson::value describe_item(schema_ptr schema,
+        const query::partition_slice& slice,
+        const cql3::selection::selection& selection,
+        const query::result& query_result,
+        const std::optional& attrs_to_get,
+        consumed_capacity_counter& consumed_capacity,
+        uint64_t& metric) {
+    // "metric" accumulates consumed-capacity half-units for the caller's
+    // statistics counters.
+    std::optional opt_item = describe_single_item(std::move(schema), slice, selection, std::move(query_result), attrs_to_get, &consumed_capacity._total_bytes);
+    rjson::value item_descr = rjson::empty_object();
+    if (opt_item) {
+        rjson::add(item_descr, "Item", std::move(*opt_item));
+    }
+    consumed_capacity.add_consumed_capacity_to_response_if_needed(item_descr);
+    metric += consumed_capacity.get_half_units();
+    return item_descr;
+}
+
+// Handle the GetItem request: read a single item identified by its full
+// primary key, applying any requested attribute projection.
+future executor::get_item(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request, std::unique_ptr& audit_info) {
+    _stats.api_operations.get_item++;
+    auto start_time = std::chrono::steady_clock::now();
+    elogger.trace("Getting item {}", request);
+
+    schema_ptr schema = get_table(_proxy, request);
+    lw_shared_ptr per_table_stats = get_stats_from_schema(_proxy, *schema);
+    per_table_stats->api_operations.get_item++;
+    tracing::add_alternator_table_name(trace_state, schema->cf_name());
+
+    rjson::value& query_key = request["Key"];
+    db::consistency_level cl = get_read_consistency(request);
+
+    maybe_audit(audit_info, audit::statement_category::QUERY, schema->ks_name(), schema->cf_name(), "GetItem", request, cl);
+
+    partition_key pk = pk_from_json(query_key, schema);
+    dht::partition_range_vector partition_ranges{dht::partition_range(dht::decorate_key(*schema, pk))};
+
+    // Without a clustering key the whole (single-row) partition is read;
+    // with one, read exactly the requested row.
+    std::vector bounds;
+    if (schema->clustering_key_size() == 0) {
+        bounds.push_back(query::clustering_range::make_open_ended_both_sides());
+    } else {
+        clustering_key ck = ck_from_json(query_key, schema);
+        bounds.push_back(query::clustering_range::make_singular(std::move(ck)));
+    }
+    check_key(query_key, schema);
+
+    //TODO(sarna): It would be better to fetch only some attributes of the map, not all
+    auto regular_columns =
+            schema->regular_columns() | std::views::transform(&column_definition::id)
+            | std::ranges::to();
+
+    auto selection = cql3::selection::selection::wildcard(schema);
+
+    auto partition_slice = query::partition_slice(std::move(bounds), {}, std::move(regular_columns), selection->get_query_options());
+    auto command = ::make_lw_shared(schema->id(), schema->version(), partition_slice, _proxy.get_max_result_size(partition_slice),
+            query::tombstone_limit(_proxy.get_tombstone_limit()));
+
+    std::unordered_set used_attribute_names;
+    auto attrs_to_get = calculate_attrs_to_get(request, *_parsed_expression_cache, used_attribute_names);
+    const rjson::value* expression_attribute_names = rjson::find(request, "ExpressionAttributeNames");
+    verify_all_are_used(expression_attribute_names, used_attribute_names, "ExpressionAttributeNames", "GetItem");
+    rcu_consumed_capacity_counter add_capacity(request, cl == db::consistency_level::LOCAL_QUORUM);
+    co_await verify_permission(_enforce_authorization, _warn_authorization, client_state, schema, auth::permission::SELECT, _stats);
+    service::storage_proxy::coordinator_query_result qr =
+        co_await _proxy.query(
+            schema, std::move(command), std::move(partition_ranges), cl,
+            service::storage_proxy::coordinator_query_options(executor::default_timeout(), std::move(permit), client_state, trace_state));
+    per_table_stats->api_operations.get_item_latency.mark(std::chrono::steady_clock::now() - start_time);
+    _stats.api_operations.get_item_latency.mark(std::chrono::steady_clock::now() - start_time);
+    uint64_t rcu_half_units = 0;
+    rjson::value res = describe_item(schema, partition_slice, *selection, *qr.query_result, std::move(attrs_to_get), add_capacity, rcu_half_units);
+    per_table_stats->rcu_half_units_total += rcu_half_units;
+    _stats.rcu_half_units_total += rcu_half_units;
+    // Update item size metrics only if we found an item.
+    if (qr.query_result->row_count().value_or(0) > 0) {
+        per_table_stats->operation_sizes.get_item_op_size_kb.add(bytes_to_kb_ceil(add_capacity._total_bytes));
+    }
+    co_return rjson::print(std::move(res));
+}
+
+// Handle the BatchGetItem request: read multiple items, possibly from
+// multiple tables. Requests are first fully validated and grouped per
+// table and per partition before any read is issued.
+future executor::batch_get_item(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request, std::unique_ptr& audit_info) {
+    // FIXME: In this implementation, an unbounded batch size can cause
+    // unbounded response JSON object to be buffered in memory, unbounded
+    // parallelism of the requests, and unbounded amount of non-preemptable
+    // work in the following loops. So we should limit the batch size, and/or
+    // the response size, as DynamoDB does.
+    _stats.api_operations.batch_get_item++;
+    rjson::value& request_items = request["RequestItems"];
+    auto start_time = std::chrono::steady_clock::now();
+    // We need to validate all the parameters before starting any asynchronous
+    // query, and fail the entire request on any parse error. So we parse all
+    // the input into our own vector "requests", each element a table_requests
+    // listing all the request aimed at a single table. For efficiency, inside
+    // each table_requests we further group together all reads going to the
+    // same partition, so we can later send them together.
+    bool should_add_rcu = rcu_consumed_capacity_counter::should_add_capacity(request);
+    struct table_requests {
+        schema_ptr schema;
+        db::consistency_level cl;
+        ::shared_ptr> attrs_to_get;
+        // clustering_keys keeps a sorted set of clustering keys. It must
+        // be sorted for the read below (see #10827). Additionally each
+        // clustering key is mapped to the original rjson::value "Key".
+        using clustering_keys = std::map;
+        std::unordered_map requests;
+        table_requests(schema_ptr s)
+            : schema(std::move(s))
+            , requests(8, partition_key::hashing(*schema), partition_key::equality(*schema))
+        {}
+        // Register one "Key" from the request: group it under its partition
+        // key, rejecting duplicate full keys.
+        void add(rjson::value& key) {
+            auto pk = pk_from_json(key, schema);
+            auto it = requests.find(pk);
+            if (it == requests.end()) {
+                it = requests.emplace(pk, clustering_key::less_compare(*schema)).first;
+            }
+            auto ck = ck_from_json(key, schema);
+            if (auto [_, inserted] = it->second.emplace(ck, &key); !inserted) {
+                throw api_error::validation("Provided list of item keys contains duplicates");
+            }
+        }
+    };
+    std::vector requests;
+    uint batch_size = 0;
+    for (auto it = request_items.MemberBegin(); it != request_items.MemberEnd(); ++it) {
+        table_requests rs(get_table_from_batch_request(_proxy, it));
+        tracing::add_alternator_table_name(trace_state, rs.schema->cf_name());
+        rs.cl = get_read_consistency(it->value);
+        std::unordered_set used_attribute_names;
+        rs.attrs_to_get = ::make_shared>(calculate_attrs_to_get(it->value, *_parsed_expression_cache, used_attribute_names));
+        const rjson::value* expression_attribute_names = rjson::find(it->value, "ExpressionAttributeNames");
+        verify_all_are_used(expression_attribute_names, used_attribute_names,"ExpressionAttributeNames", "GetItem");
+        auto& keys = (it->value)["Keys"];
+        for (rjson::value& key : keys.GetArray()) {
+            rs.add(key);
+            check_key(key, rs.schema);
+        }
+        batch_size += rs.requests.size();
+        requests.emplace_back(std::move(rs));
+    }
+
+    for (const table_requests& tr : requests) {
+        co_await verify_permission(_enforce_authorization, _warn_authorization, client_state, tr.schema, auth::permission::SELECT, _stats);
+    }
+
+    _stats.api_operations.batch_get_item_batch_total += batch_size;
+    _stats.api_operations.batch_get_item_histogram.add(batch_size);
+    // If we got here, all "requests" are valid, so let's start the
+    // requests for the different partitions all in parallel.
+    // One future per (table, partition) group; consumed_rcu_half_units_per_table
+    // is indexed in lockstep with "requests".
+    std::vector>> response_futures;
+    std::vector consumed_rcu_half_units_per_table(requests.size());
+    for (size_t i = 0; i < requests.size(); i++) {
+        const table_requests& rs = requests[i];
+        bool is_quorum = rs.cl == db::consistency_level::LOCAL_QUORUM;
+        lw_shared_ptr per_table_stats = get_stats_from_schema(_proxy, *rs.schema);
+        per_table_stats->api_operations.batch_get_item_histogram.add(rs.requests.size());
+        for (const auto& [pk, cks] : rs.requests) {
+            dht::partition_range_vector partition_ranges{dht::partition_range(dht::decorate_key(*rs.schema, pk))};
+            std::vector bounds;
+            if (rs.schema->clustering_key_size() == 0) {
+                bounds.push_back(query::clustering_range::make_open_ended_both_sides());
+            } else {
+                for (auto& ck : cks) {
+                    bounds.push_back(query::clustering_range::make_singular(ck.first));
+                }
+            }
+            auto regular_columns =
+                    rs.schema->regular_columns() | std::views::transform(&column_definition::id)
+                    | std::ranges::to();
+            auto selection = cql3::selection::selection::wildcard(rs.schema);
+            auto partition_slice = query::partition_slice(std::move(bounds), {}, std::move(regular_columns), selection->get_query_options());
+            auto command = ::make_lw_shared(rs.schema->id(), rs.schema->version(), partition_slice, _proxy.get_max_result_size(partition_slice),
+                    query::tombstone_limit(_proxy.get_tombstone_limit()));
+            command->allow_limit = db::allow_per_partition_rate_limit::yes;
+            // NOTE(review): the lambda captures rcus_per_table by reference
+            // into consumed_rcu_half_units_per_table; this relies on the
+            // coroutine frame keeping that vector alive until all futures
+            // below complete - which the co_await loop further down ensures.
+            const auto item_callback = [is_quorum, per_table_stats, &rcus_per_table = consumed_rcu_half_units_per_table[i]](uint64_t size) {
+                rcus_per_table += rcu_consumed_capacity_counter::get_half_units(size, is_quorum);
+                // Update item size only if the item exists.
+                if (size > 0) {
+                    per_table_stats->operation_sizes.batch_get_item_op_size_kb.add(bytes_to_kb_ceil(size));
+                }
+            };
+            future> f = _proxy.query(rs.schema, std::move(command), std::move(partition_ranges), rs.cl,
+                    service::storage_proxy::coordinator_query_options(executor::default_timeout(), permit, client_state, trace_state)).then(
+                    [schema = rs.schema, partition_slice = std::move(partition_slice), selection = std::move(selection), attrs_to_get = rs.attrs_to_get, item_callback = std::move(item_callback)] (service::storage_proxy::coordinator_query_result qr) mutable {
+                utils::get_local_injector().inject("alternator_batch_get_item", [] { throw std::runtime_error("batch_get_item injection"); });
+                return describe_multi_item(std::move(schema), std::move(partition_slice), std::move(selection), std::move(qr.query_result), std::move(attrs_to_get), std::move(item_callback));
+            });
+            response_futures.push_back(std::move(f));
+        }
+    }
+
+    // Wait for all requests to complete, and then return the response.
+    // In case of full failure (no reads succeeded), an arbitrary error
+    // from one of the operations will be returned.
+    bool some_succeeded = false;
+    std::exception_ptr eptr;
+    std::set table_names; // for auditing
+    // FIXME: will_log() here doesn't pass keyspace/table, so keyspace-level audit
+    // filtering is bypassed — a batch spanning multiple tables is audited as a whole.
+    bool should_audit = _audit.local_is_initialized() && _audit.local().will_log(audit::statement_category::QUERY);
+    rjson::value response = rjson::empty_object();
+    rjson::add(response, "Responses", rjson::empty_object());
+    rjson::add(response, "UnprocessedKeys", rjson::empty_object());
+    auto fut_it = response_futures.begin();
+    rjson::value consumed_capacity = rjson::empty_array();
+    // Iterate "requests" and "response_futures" in lockstep - the futures
+    // were pushed in exactly this (table, partition) order above.
+    for (size_t i = 0; i < requests.size(); i++) {
+        const table_requests& rs = requests[i];
+        std::string table = rs.schema->cf_name();
+        if (should_audit) {
+            table_names.insert(table);
+        }
+        for (const auto& [_, cks] : rs.requests) {
+            auto& fut = *fut_it;
+            ++fut_it;
+            try {
+                std::vector results = co_await std::move(fut);
+                some_succeeded = true;
+                if (!response["Responses"].HasMember(table)) {
+                    rjson::add_with_string_name(response["Responses"], table, rjson::empty_array());
+                }
+                for (rjson::value& json : results) {
+                    rjson::push_back(response["Responses"][table], std::move(json));
+                }
+            } catch(...) {
+                eptr = std::current_exception();
+                // This read of potentially several rows in one partition,
+                // failed. We need to add the row key(s) to UnprocessedKeys.
+                if (!response["UnprocessedKeys"].HasMember(table)) {
+                    // Add the table's entry in UnprocessedKeys. Need to copy
+                    // all the table's parameters from the request except the
+                    // Keys field, which we start empty and then build below.
+                    rjson::add_with_string_name(response["UnprocessedKeys"], table, rjson::empty_object());
+                    rjson::value& unprocessed_item = response["UnprocessedKeys"][table];
+                    rjson::value& request_item = request_items[table];
+                    for (auto it = request_item.MemberBegin(); it != request_item.MemberEnd(); ++it) {
+                        if (it->name != "Keys") {
+                            rjson::add_with_string_name(unprocessed_item,
+                                    rjson::to_string_view(it->name), rjson::copy(it->value));
+                        }
+                    }
+                    rjson::add_with_string_name(unprocessed_item, "Keys", rjson::empty_array());
+                }
+                for (auto& ck : cks) {
+                    rjson::push_back(response["UnprocessedKeys"][table]["Keys"], std::move(*ck.second));
+                }
+            }
+        }
+        uint64_t rcu_half_units = consumed_rcu_half_units_per_table[i];
+        _stats.rcu_half_units_total += rcu_half_units;
+        lw_shared_ptr per_table_stats = get_stats_from_schema(_proxy, *rs.schema);
+        per_table_stats->rcu_half_units_total += rcu_half_units;
+        if (should_add_rcu) {
+            rjson::value entry = rjson::empty_object();
+            rjson::add(entry, "TableName", table);
+            rjson::add(entry, "CapacityUnits", rcu_half_units*0.5);
+            rjson::push_back(consumed_capacity, std::move(entry));
+        }
+    }
+
+    if (should_add_rcu) {
+        rjson::add(response, "ConsumedCapacity", std::move(consumed_capacity));
+    }
+    elogger.trace("Unprocessed keys: {}", response["UnprocessedKeys"]);
+    // NOTE: Each table in the batch has its own CL (set by get_read_consistency()),
+    // but the audit entry records a single CL for the whole batch. We use ANY as a
+    // placeholder to indicate "mixed / not applicable".
+    // FIXME: Auditing is executed only for a complete success
+    maybe_audit(audit_info, audit::statement_category::QUERY, "",
+            print_names_for_audit(table_names), "BatchGetItem", request, db::consistency_level::ANY);
+    if (!some_succeeded && eptr) {
+        co_await coroutine::return_exception_ptr(std::move(eptr));
+    }
+    auto duration = std::chrono::steady_clock::now() - start_time;
+    _stats.api_operations.batch_get_item_latency.mark(duration);
+    for (const table_requests& rs : requests) {
+        lw_shared_ptr per_table_stats = get_stats_from_schema(_proxy, *rs.schema);
+        per_table_stats->api_operations.batch_get_item_latency.mark(duration);
+    }
+    // Large responses are streamed instead of buffered in one string.
+    if (is_big(response)) {
+        co_return make_streamed(std::move(response));
+    } else {
+        co_return rjson::print(std::move(response));
+    }
+}
+
+} // namespace alternator
diff --git a/alternator/executor_util.cc b/alternator/executor_util.cc
new file mode 100644
index 0000000000..f7d9897d32
+/*
+ * Copyright 2019-present ScyllaDB
+ */
+
+/*
+ * SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.1
+ */
+
+#include "alternator/executor_util.hh"
+#include "alternator/executor.hh"
+#include "alternator/error.hh"
+#include "auth/resource.hh"
+#include "auth/service.hh"
+#include "cdc/log.hh"
+#include "data_dictionary/data_dictionary.hh"
+#include "db/tags/utils.hh"
+#include "replica/database.hh"
+#include "cql3/selection/selection.hh"
+#include "cql3/result_set.hh"
+#include "serialization.hh"
+#include "service/storage_proxy.hh"
+#include "types/map.hh"
+#include
+
+// NOTE(review): the angle-bracket header name of the #include directly above
+// was lost in extraction - restore it from upstream.
+namespace alternator {
+
+extern logging::logger elogger; // from executor.cc
+
+// Return the integer value of the named attribute in a JSON object, or
+// nullopt if the attribute is missing. Throws a validation error if the
+// attribute exists but is not an integer.
+// NOTE(review): the error messages in these three getters format "value"
+// (the whole containing object), not the offending attribute - confirm
+// this is intended.
+std::optional get_int_attribute(const rjson::value& value, std::string_view attribute_name) {
+    const rjson::value* attribute_value = rjson::find(value, attribute_name);
+    if (!attribute_value)
+        return {};
+    if (!attribute_value->IsInt()) {
+        throw api_error::validation(fmt::format("Expected integer value for attribute {}, got: {}",
+                attribute_name, value));
+    }
+    return attribute_value->GetInt();
+}
+
+// Return the string value of the named attribute, or default_return if the
+// attribute is missing. Throws a validation error on a non-string value.
+std::string get_string_attribute(const rjson::value& value, std::string_view attribute_name, const char* default_return) {
+    const rjson::value* attribute_value = rjson::find(value, attribute_name);
+    if (!attribute_value)
+        return default_return;
+    if (!attribute_value->IsString()) {
+        throw api_error::validation(fmt::format("Expected string value for attribute {}, got: {}",
+                attribute_name, value));
+    }
+    return rjson::to_string(*attribute_value);
+}
+
+// Return the boolean value of the named attribute, or default_return if the
+// attribute is missing. Throws a validation error on a non-boolean value.
+bool get_bool_attribute(const rjson::value& value, std::string_view attribute_name, bool default_return) {
+    const rjson::value* attribute_value = rjson::find(value, attribute_name);
+    if (!attribute_value) {
+        return default_return;
+    }
+    if (!attribute_value->IsBool()) {
+        throw api_error::validation(fmt::format("Expected boolean value for attribute {}, got: {}",
+                attribute_name, value));
+    }
+    return attribute_value->GetBool();
+}
+
+// Extract the "TableName" field from a request, or nullopt if absent.
+// Throws a validation error if the field exists but is not a string.
+std::optional find_table_name(const rjson::value& request) {
+    const rjson::value* table_name_value = rjson::find(request, "TableName");
+    if (!table_name_value) {
+        return std::nullopt;
+    }
+    if (!table_name_value->IsString()) {
+        throw api_error::validation("Non-string TableName field in request");
+    }
+    std::string table_name = rjson::to_string(*table_name_value);
+    return table_name;
+}
+
+// Like find_table_name(), but a missing TableName is a validation error.
+std::string get_table_name(const rjson::value& request) {
+    auto name = find_table_name(request);
+    if (!name) {
+        throw api_error::validation("Missing TableName field in request");
+    }
+    return *name;
+}
+
+// Look up the schema named by the request's TableName; nullptr if the
+// request has no TableName at all.
+schema_ptr find_table(service::storage_proxy& proxy, const rjson::value& request) {
+    auto table_name = find_table_name(request);
+    if (!table_name) {
+        return nullptr;
+    }
+    return find_table(proxy, *table_name);
+}
+
+schema_ptr find_table(service::storage_proxy& proxy, std::string_view table_name) {
+    try {
+        return proxy.data_dictionary().find_schema(sstring(executor::KEYSPACE_NAME_PREFIX) + sstring(table_name), table_name);
+    } catch(data_dictionary::no_such_column_family&) {
+        // DynamoDB returns validation error even when table does not exist
+        // and the table name is invalid.
+        validate_table_name(table_name);
+
+        throw api_error::resource_not_found(
+            fmt::format("Requested resource not found: Table: {} not found", table_name));
+    }
+}
+
+// Like find_table(request) but a missing TableName throws instead of
+// returning nullptr.
+schema_ptr get_table(service::storage_proxy& proxy, const rjson::value& request) {
+    auto schema = find_table(proxy, request);
+    if (!schema) {
+        // if we get here then the name was missing, since syntax or missing actual CF
+        // checks throw. Slow path, but just call get_table_name to generate exception.
+        get_table_name(request);
+    }
+    return schema;
+}
+
+// The CQL type used for Alternator's catch-all attributes column.
+map_type attrs_type() {
+    static thread_local auto t = map_type_impl::get_instance(utf8_type, bytes_type, true);
+    return t;
+}
+
+const std::map& get_tags_of_table_or_throw(schema_ptr schema) {
+    auto tags_ptr = db::get_tags_of_table(schema);
+    if (tags_ptr) {
+        return *tags_ptr;
+    } else {
+        throw api_error::validation(format("Table {} does not have valid tagging information", schema->ks_name()));
+    }
+}
+
+bool is_alternator_keyspace(std::string_view ks_name) {
+    return ks_name.starts_with(executor::KEYSPACE_NAME_PREFIX);
+}
+
+// This tag is set on a GSI when the user did not specify a range key, causing
+// Alternator to add the base table's range key as a spurious range key. It is
+// used by describe_key_schema() to suppress reporting that key.
+extern const sstring SPURIOUS_RANGE_KEY_ADDED_TO_GSI_AND_USER_DIDNT_SPECIFY_RANGE_KEY_TAG_KEY;
+
+// Fill "parent" with a "KeySchema" JSON array describing the given schema's
+// HASH and RANGE key columns. If attribute_types is non-null, each reported
+// key column's type is also recorded there. If "tags" mark the range key as
+// one Alternator added spuriously (see the tag above), range keys are not
+// reported at all.
+void describe_key_schema(rjson::value& parent, const schema& schema, std::unordered_map* attribute_types, const std::map* tags) {
+    rjson::value key_schema = rjson::empty_array();
+    const bool ignore_range_keys_as_spurious = tags != nullptr && tags->contains(SPURIOUS_RANGE_KEY_ADDED_TO_GSI_AND_USER_DIDNT_SPECIFY_RANGE_KEY_TAG_KEY);
+
+    for (const column_definition& cdef : schema.partition_key_columns()) {
+        rjson::value key = rjson::empty_object();
+        rjson::add(key, "AttributeName", rjson::from_string(cdef.name_as_text()));
+        rjson::add(key, "KeyType", "HASH");
+        rjson::push_back(key_schema, std::move(key));
+        if (attribute_types) {
+            (*attribute_types)[cdef.name_as_text()] = type_to_string(cdef.type);
+        }
+    }
+    if (!ignore_range_keys_as_spurious) {
+        // NOTE: user requested key (there can be at most one) will always come first.
+        // There might be more keys following it, which were added, but those were
+        // not requested by the user, so we ignore them.
+        for (const column_definition& cdef : schema.clustering_key_columns()) {
+            rjson::value key = rjson::empty_object();
+            rjson::add(key, "AttributeName", rjson::from_string(cdef.name_as_text()));
+            rjson::add(key, "KeyType", "RANGE");
+            rjson::push_back(key_schema, std::move(key));
+            if (attribute_types) {
+                (*attribute_types)[cdef.name_as_text()] = type_to_string(cdef.type);
+            }
+            break;
+        }
+    }
+    rjson::add(parent, "KeySchema", std::move(key_schema));
+}
+
+// Check if the given string has valid characters for a table name, i.e. only
+// a-z, A-Z, 0-9, _ (underscore), - (dash), . (dot). Note that this function
+// does not check the length of the name - instead, use validate_table_name()
+// to validate both the characters and the length.
+static bool valid_table_name_chars(std::string_view name) {
+    for (auto c : name) {
+        if ((c < 'a' || c > 'z') &&
+            (c < 'A' || c > 'Z') &&
+            (c < '0' || c > '9') &&
+            c != '_' &&
+            c != '-' &&
+            c != '.') {
+            return false;
+        }
+    }
+    return true;
+}
+
+// Build the internal name of the materialized view backing an index:
+// table name + delimiter + index name. Validates the index name's length
+// and characters, and (optionally) the combined name's length.
+std::string view_name(std::string_view table_name, std::string_view index_name, const std::string& delim, bool validate_len) {
+    if (index_name.length() < 3) {
+        throw api_error::validation("IndexName must be at least 3 characters long");
+    }
+    if (!valid_table_name_chars(index_name)) {
+        throw api_error::validation(
+            fmt::format("IndexName '{}' must satisfy regular expression pattern: [a-zA-Z0-9_.-]+", index_name));
+    }
+    std::string ret = std::string(table_name) + delim + std::string(index_name);
+    if (ret.length() > max_auxiliary_table_name_length && validate_len) {
+        throw api_error::validation(
+            fmt::format("The total length of TableName ('{}') and IndexName ('{}') cannot exceed {} characters",
+                table_name, index_name, max_auxiliary_table_name_length - delim.size()));
+    }
+    return ret;
+}
+
+// GSI-backing view name: table and index joined by ":".
+std::string gsi_name(std::string_view table_name, std::string_view index_name, bool validate_len) {
+    return view_name(table_name, index_name, ":", validate_len);
+}
+
+// LSI-backing view name: table and index joined by "!:".
+std::string lsi_name(std::string_view table_name, std::string_view index_name, bool validate_len) {
+    return view_name(table_name, index_name, "!:", validate_len);
+}
+
+// Verify that a request's "Key" object has exactly as many attributes as the
+// schema's primary key (1 without a sort key, 2 with one). The individual
+// attribute names/types are checked by pk_from_json()/ck_from_json().
+void check_key(const rjson::value& key, const schema_ptr& schema) {
+    if (key.MemberCount() != (schema->clustering_key_size() == 0 ? 1 : 2)) {
+        throw api_error::validation("Given key attribute not in schema");
+    }
+}
+
+// Throw a validation error if "field" (e.g. ExpressionAttributeNames) holds
+// any member that was never consumed while parsing the request's
+// expressions; "used" collects the consumed names.
+void verify_all_are_used(const rjson::value* field,
+        const std::unordered_set& used, const char* field_name, const char* operation) {
+    if (!field) {
+        return;
+    }
+    for (auto it = field->MemberBegin(); it != field->MemberEnd(); ++it) {
+        if (!used.contains(rjson::to_string(it->name))) {
+            throw api_error::validation(
+                format("{} has spurious '{}', not used in {}",
+                       field_name, rjson::to_string_view(it->name), operation));
+        }
+    }
+}
+
+// This function increments the authorization_failures counter, and may also
+// log a warn-level message and/or throw an access_denied exception, depending
+// on what enforce_authorization and warn_authorization are set to.
+// Note that if enforce_authorization is false, this function will return
+// without throwing. So a caller that doesn't want to continue after an
+// authorization_error must explicitly return after calling this function.
+static void authorization_error(stats& stats, bool enforce_authorization, bool warn_authorization, std::string msg) {
+    stats.authorization_failures++;
+    if (enforce_authorization) {
+        if (warn_authorization) {
+            elogger.warn("alternator_warn_authorization=true: {}", msg);
+        }
+        throw api_error::access_denied(std::move(msg));
+    } else {
+        if (warn_authorization) {
+            elogger.warn("If you set alternator_enforce_authorization=true the following will be enforced: {}", msg);
+        }
+    }
+}
+
+future<> verify_permission(
+        bool enforce_authorization,
+        bool warn_authorization,
+        const service::client_state& client_state,
+        const schema_ptr& schema,
+        auth::permission permission_to_check,
+        stats& stats) {
+    if (!enforce_authorization && !warn_authorization) {
+        co_return;
+    }
+    // Unfortunately, the fix for issue #23218 did not modify the function
+    // that we use here - check_has_permissions().
So if we want to allow + // writes to internal tables (from try_get_internal_table()) only to a + // superuser, we need to explicitly check it here. + if (permission_to_check == auth::permission::MODIFY && is_internal_keyspace(schema->ks_name())) { + if (!client_state.user() || + !client_state.user()->name || + !co_await client_state.get_auth_service()->underlying_role_manager().is_superuser(*client_state.user()->name)) { + sstring username = ""; + if (client_state.user() && client_state.user()->name) { + username = client_state.user()->name.value(); + } + authorization_error(stats, enforce_authorization, warn_authorization, fmt::format( + "Write access denied on internal table {}.{} to role {} because it is not a superuser", + schema->ks_name(), schema->cf_name(), username)); + co_return; + } + } + auto resource = auth::make_data_resource(schema->ks_name(), schema->cf_name()); + if (!client_state.user() || !client_state.user()->name || + !co_await client_state.check_has_permission(auth::command_desc(permission_to_check, resource))) { + sstring username = ""; + if (client_state.user() && client_state.user()->name) { + username = client_state.user()->name.value(); + } + // Using exceptions for errors makes this function faster in the + // success path (when the operation is allowed). + authorization_error(stats, enforce_authorization, warn_authorization, fmt::format( + "{} access on table {}.{} is denied to role {}, client address {}", + auth::permissions::to_string(permission_to_check), + schema->ks_name(), schema->cf_name(), username, client_state.get_client_address())); + } +} + +// Similar to verify_permission() above, but just for CREATE operations. +// Those do not operate on any specific table, so require permissions on +// ALL KEYSPACES instead of any specific table. 
+future<> verify_create_permission(bool enforce_authorization, bool warn_authorization, const service::client_state& client_state, stats& stats) { + if (!enforce_authorization && !warn_authorization) { + co_return; + } + auto resource = auth::resource(auth::resource_kind::data); + if (!co_await client_state.check_has_permission(auth::command_desc(auth::permission::CREATE, resource))) { + sstring username = ""; + if (client_state.user() && client_state.user()->name) { + username = client_state.user()->name.value(); + } + authorization_error(stats, enforce_authorization, warn_authorization, fmt::format( + "CREATE access on ALL KEYSPACES is denied to role {}", username)); + } +} + +schema_ptr try_get_internal_table(const data_dictionary::database& db, std::string_view table_name) { + size_t it = table_name.find(executor::INTERNAL_TABLE_PREFIX); + if (it != 0) { + return schema_ptr{}; + } + table_name.remove_prefix(executor::INTERNAL_TABLE_PREFIX.size()); + size_t delim = table_name.find_first_of('.'); + if (delim == std::string_view::npos) { + return schema_ptr{}; + } + std::string_view ks_name = table_name.substr(0, delim); + table_name.remove_prefix(ks_name.size() + 1); + // Only internal keyspaces can be accessed to avoid leakage + auto ks = db.try_find_keyspace(ks_name); + if (!ks || !ks->is_internal()) { + return schema_ptr{}; + } + try { + return db.find_schema(ks_name, table_name); + } catch (data_dictionary::no_such_column_family&) { + // DynamoDB returns validation error even when table does not exist + // and the table name is invalid. 
+ validate_table_name(table_name); + throw api_error::resource_not_found( + fmt::format("Requested resource not found: Internal table: {}.{} not found", ks_name, table_name)); + } +} + +schema_ptr get_table_from_batch_request(const service::storage_proxy& proxy, const rjson::value::ConstMemberIterator& batch_request) { + sstring table_name = rjson::to_sstring(batch_request->name); // JSON keys are always strings + try { + return proxy.data_dictionary().find_schema(sstring(executor::KEYSPACE_NAME_PREFIX) + table_name, table_name); + } catch(data_dictionary::no_such_column_family&) { + // DynamoDB returns validation error even when table does not exist + // and the table name is invalid. + validate_table_name(table_name); + throw api_error::resource_not_found(format("Requested resource not found: Table: {} not found", table_name)); + } +} + +lw_shared_ptr get_stats_from_schema(service::storage_proxy& sp, const schema& schema) { + try { + replica::table& table = sp.local_db().find_column_family(schema.id()); + if (!table.get_stats().alternator_stats) { + table.get_stats().alternator_stats = seastar::make_shared(schema.ks_name(), schema.cf_name()); + } + return table.get_stats().alternator_stats->_stats; + } catch (std::runtime_error&) { + // If we're here it means that a table we are currently working on was deleted before the + // operation completed, returning a temporary object is fine, if the table get deleted so will its metrics + return make_lw_shared(); + } +} + +void describe_single_item(const cql3::selection::selection& selection, + const std::vector& result_row, + const std::optional& attrs_to_get, + rjson::value& item, + uint64_t* item_length_in_bytes, + bool include_all_embedded_attributes) +{ + const auto& columns = selection.get_columns(); + auto column_it = columns.begin(); + for (const managed_bytes_opt& cell : result_row) { + if (!cell) { + ++column_it; + continue; + } + std::string column_name = (*column_it)->name_as_text(); + if (column_name != 
executor::ATTRS_COLUMN_NAME) { + if (item_length_in_bytes) { + (*item_length_in_bytes) += column_name.length() + cell->size(); + } + if (!attrs_to_get || attrs_to_get->contains(column_name)) { + // item is expected to start empty, and column_name are unique + // so add() makes sense + rjson::add_with_string_name(item, column_name, rjson::empty_object()); + rjson::value& field = item[column_name.c_str()]; + cell->with_linearized([&] (bytes_view linearized_cell) { + rjson::add_with_string_name(field, type_to_string((*column_it)->type), json_key_column_value(linearized_cell, **column_it)); + }); + } + } else { + auto deserialized = attrs_type()->deserialize(*cell); + auto keys_and_values = value_cast(deserialized); + for (auto entry : keys_and_values) { + std::string attr_name = value_cast(entry.first); + if (item_length_in_bytes) { + (*item_length_in_bytes) += attr_name.length(); + } + if (include_all_embedded_attributes || !attrs_to_get || attrs_to_get->contains(attr_name)) { + bytes value = value_cast(entry.second); + if (item_length_in_bytes && value.length()) { + // ScyllaDB uses one extra byte compared to DynamoDB for the bytes length + (*item_length_in_bytes) += value.length() - 1; + } + rjson::value v = deserialize_item(value); + if (attrs_to_get) { + auto it = attrs_to_get->find(attr_name); + if (it != attrs_to_get->end()) { + // attrs_to_get may have asked for only part of + // this attribute. hierarchy_filter() modifies v, + // and returns false when nothing is to be kept. 
+ if (!hierarchy_filter(v, it->second)) { + continue; + } + } + } + // item is expected to start empty, and attribute + // names are unique so add() makes sense + rjson::add_with_string_name(item, attr_name, std::move(v)); + } else if (item_length_in_bytes) { + (*item_length_in_bytes) += value_cast(entry.second).length() - 1; + } + } + } + ++column_it; + } +} + +std::optional describe_single_item(schema_ptr schema, + const query::partition_slice& slice, + const cql3::selection::selection& selection, + const query::result& query_result, + const std::optional& attrs_to_get, + uint64_t* item_length_in_bytes) { + rjson::value item = rjson::empty_object(); + + cql3::selection::result_set_builder builder(selection, gc_clock::now()); + query::result_view::consume(query_result, slice, cql3::selection::result_set_builder::visitor(builder, *schema, selection)); + + auto result_set = builder.build(); + if (result_set->empty()) { + if (item_length_in_bytes) { + // empty results is counted as having a minimal length (e.g. 1 byte). + (*item_length_in_bytes) += 1; + } + // If there is no matching item, we're supposed to return an empty + // object without an Item member - not one with an empty Item member + return {}; + } + if (result_set->size() > 1) { + // If the result set contains multiple rows, the code should have + // called describe_multi_item(), not this function. + throw std::logic_error("describe_single_item() asked to describe multiple items"); + } + describe_single_item(selection, *result_set->rows().begin(), attrs_to_get, item, item_length_in_bytes); + return item; +} + +static void check_big_array(const rjson::value& val, int& size_left); +static void check_big_object(const rjson::value& val, int& size_left); + +// For simplicity, we use a recursive implementation. This is fine because +// Alternator limits the depth of JSONs it reads from inputs, and doesn't +// add more than a couple of levels in its own output construction. 
+bool is_big(const rjson::value& val, int big_size) { + if (val.IsString()) { + return ssize_t(val.GetStringLength()) > big_size; + } else if (val.IsObject()) { + check_big_object(val, big_size); + return big_size < 0; + } else if (val.IsArray()) { + check_big_array(val, big_size); + return big_size < 0; + } + return false; +} + +static void check_big_array(const rjson::value& val, int& size_left) { + // Assume a fixed size of 10 bytes for each number, boolean, etc., or + // beginning of a sub-object. This doesn't have to be accurate. + size_left -= 10 * val.Size(); + for (const auto& v : val.GetArray()) { + if (size_left < 0) { + return; + } + // Note that we avoid recursive calls for the leaves (anything except + // array or object) because usually those greatly outnumber the trunk. + if (v.IsString()) { + size_left -= v.GetStringLength(); + } else if (v.IsObject()) { + check_big_object(v, size_left); + } else if (v.IsArray()) { + check_big_array(v, size_left); + } + } +} + +static void check_big_object(const rjson::value& val, int& size_left) { + size_left -= 10 * val.MemberCount(); + for (const auto& m : val.GetObject()) { + if (size_left < 0) { + return; + } + size_left -= m.name.GetStringLength(); + if (m.value.IsString()) { + size_left -= m.value.GetStringLength(); + } else if (m.value.IsObject()) { + check_big_object(m.value, size_left); + } else if (m.value.IsArray()) { + check_big_array(m.value, size_left); + } + } +} + +void validate_table_name(std::string_view name, const char* source) { + if (name.length() < 3 || name.length() > max_table_name_length) { + throw api_error::validation( + format("{} must be at least 3 characters long and at most {} characters long", source, max_table_name_length)); + } + if (!valid_table_name_chars(name)) { + throw api_error::validation( + format("{} must satisfy regular expression pattern: [a-zA-Z0-9_.-]+", source)); + } +} + +void validate_cdc_log_name_length(std::string_view table_name) { + if 
(cdc::log_name(table_name).length() > max_auxiliary_table_name_length) { + // CDC will add cdc_log_suffix ("_scylla_cdc_log") to the table name + // to create its log table, and this will exceed the maximum allowed + // length. To provide a more helpful error message, we assume that + // cdc::log_name() always adds a suffix of the same length. + int suffix_len = cdc::log_name(table_name).length() - table_name.length(); + throw api_error::validation(fmt::format("Streams or vector search cannot be enabled on a table whose name is longer than {} characters: {}", + max_auxiliary_table_name_length - suffix_len, table_name)); + } +} + +body_writer make_streamed(rjson::value&& value) { + return [value = std::move(value)](output_stream&& _out) mutable -> future<> { + auto out = std::move(_out); + std::exception_ptr ex; + try { + co_await rjson::print(value, out); + } catch (...) { + ex = std::current_exception(); + } + co_await out.close(); + co_await rjson::destroy_gently(std::move(value)); + if (ex) { + co_await coroutine::return_exception_ptr(std::move(ex)); + } + }; +} + +} // namespace alternator diff --git a/alternator/executor_util.hh b/alternator/executor_util.hh new file mode 100644 index 0000000000..c60b1b05ad --- /dev/null +++ b/alternator/executor_util.hh @@ -0,0 +1,247 @@ +/* + * Copyright 2019-present ScyllaDB + */ + +/* + * SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.1 + */ + +// This header file, and the implementation file executor_util.cc, contain +// various utility functions that are reused in many different operations +// (API requests) across Alternator's code - in files such as executor.cc, +// executor_read.cc, streams.cc, ttl.cc, and more. These utility functions +// include things like extracting and validating pieces from a JSON request, +// checking permissions, constructing auxiliary table names, and more. 
+ +#pragma once + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "utils/rjson.hh" +#include "schema/schema_fwd.hh" +#include "types/types.hh" +#include "auth/permission.hh" +#include "alternator/stats.hh" +#include "alternator/attribute_path.hh" +#include "utils/managed_bytes.hh" + +namespace query { class partition_slice; class result; } +namespace cql3::selection { class selection; } +namespace data_dictionary { class database; } +namespace service { class storage_proxy; class client_state; } + +namespace alternator { + +/// The body_writer is used for streaming responses - where the response body +/// is written in chunks to the output_stream. This allows for efficient +/// handling of large responses without needing to allocate a large buffer in +/// memory. It is one of the variants of executor::request_return_type. +using body_writer = noncopyable_function(output_stream&&)>; + +/// Get the value of an integer attribute, or an empty optional if it is +/// missing. If the attribute exists, but is not an integer, a descriptive +/// api_error is thrown. +std::optional get_int_attribute(const rjson::value& value, std::string_view attribute_name); + +/// Get the value of a string attribute, or a default value if it is missing. +/// If the attribute exists, but is not a string, a descriptive api_error is +/// thrown. +std::string get_string_attribute(const rjson::value& value, std::string_view attribute_name, const char* default_return); + +/// Get the value of a boolean attribute, or a default value if it is missing. +/// If the attribute exists, but is not a bool, a descriptive api_error is +/// thrown. +bool get_bool_attribute(const rjson::value& value, std::string_view attribute_name, bool default_return); + +/// Extract table name from a request. +/// Most requests expect the table's name to be listed in a "TableName" field. 
+/// get_table_name() returns the name or api_error in case the table name is +/// missing or not a string. +std::string get_table_name(const rjson::value& request); + +/// find_table_name() is like get_table_name() except that it returns an +/// optional table name - it returns an empty optional when the TableName +/// is missing from the request, instead of throwing as get_table_name() +/// does. However, find_table_name() still throws if a TableName exists but +/// is not a string. +std::optional find_table_name(const rjson::value& request); + +/// Extract table schema from a request. +/// Many requests expect the table's name to be listed in a "TableName" field +/// and need to look it up as an existing table. The get_table() function +/// does this, with the appropriate validation and api_error in case the table +/// name is missing, invalid or the table doesn't exist. If everything is +/// successful, it returns the table's schema. +schema_ptr get_table(service::storage_proxy& proxy, const rjson::value& request); + +/// This find_table() variant is like get_table() excepts that it returns a +/// nullptr instead of throwing if the request does not mention a TableName. +/// In other cases of errors (i.e., a table is mentioned but doesn't exist) +/// this function throws too. +schema_ptr find_table(service::storage_proxy& proxy, const rjson::value& request); + +/// This find_table() variant is like the previous one except that it takes +/// the table name directly instead of a request object. It is used in cases +/// where we already have the table name extracted from the request. +schema_ptr find_table(service::storage_proxy& proxy, std::string_view table_name); + +// We would have liked to support table names up to 255 bytes, like DynamoDB. +// But Scylla creates a directory whose name is the table's name plus 33 +// bytes (dash and UUID), and since directory names are limited to 255 bytes, +// we need to limit table names to 222 bytes, instead of 255. 
See issue #4480. +// We actually have two limits here, +// * max_table_name_length is the limit that Alternator will impose on names +// of new Alternator tables. +// * max_auxiliary_table_name_length is the potentially higher absolute limit +// that Scylla imposes on the names of auxiliary tables that Alternator +// wants to create internally - i.e. materialized views or CDC log tables. +// The second limit might mean that it is not possible to add a GSI to an +// existing table, because the name of the new auxiliary table may go over +// the limit. The second limit is also one of the reasons why the first limit +// is set lower than 222 - to have room to enable streams which add the extra +// suffix "_scylla_cdc_log" to the table name. +inline constexpr int max_table_name_length = 192; +inline constexpr int max_auxiliary_table_name_length = 222; + +/// validate_table_name() validates the TableName parameter in a request - it +/// should be called in CreateTable, and in other requests only when noticing +/// that the named table doesn't exist. +/// The DynamoDB developer guide, https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/HowItWorks.NamingRulesDataTypes.html#HowItWorks.NamingRules +/// specifies that table "names must be between 3 and 255 characters long and +/// can contain only the following characters: a-z, A-Z, 0-9, _ (underscore), +/// - (dash), . (dot)". However, Alternator only allows max_table_name_length +/// characters (see above) - not 255. +/// validate_table_name() throws the appropriate api_error if this validation +/// fails. +void validate_table_name(std::string_view name, const char* source = "TableName"); + +/// Validate that a CDC log table could be created for the base table with a +/// given table_name, and if not, throw a user-visible api_error::validation. 
+/// It is not possible to create a CDC log table if the table name is so long +/// that adding the 15-character suffix "_scylla_cdc_log" (cdc_log_suffix) +/// makes it go over max_auxiliary_table_name_length. +/// Note that if max_table_name_length is set to less than 207 (which is +/// max_auxiliary_table_name_length-15), then this function will never +/// fail. However, it's still important to call it in UpdateTable, in case +/// we have pre-existing tables with names longer than this to avoid #24598. +void validate_cdc_log_name_length(std::string_view table_name); + +/// Checks if a keyspace, given by its name, is an Alternator keyspace. +/// This just checks if the name begins in executor::KEYSPACE_NAME_PREFIX, +/// a prefix that all keyspaces created by Alternator's CreateTable use. +bool is_alternator_keyspace(std::string_view ks_name); + +/// Wraps db::get_tags_of_table() and throws api_error::validation if the +/// table is missing the tags extension. +const std::map& get_tags_of_table_or_throw(schema_ptr schema); + +/// Returns a type object representing the type of the ":attrs" column used +/// by Alternator to store all non-key attribute. This type is a map from +/// string (attribute name) to bytes (serialized attribute value). +map_type attrs_type(); + +// In DynamoDB index names are local to a table, while in Scylla, materialized +// view names are global (in a keyspace). So we need to compose a unique name +// for the view taking into account both the table's name and the index name. +// We concatenate the table and index name separated by a delim character +// (a character not allowed by DynamoDB in ordinary table names, default: ":"). +// The downside of this approach is that it limits the sum of the lengths, +// instead of each component individually as DynamoDB does. +// The view_name() function assumes the table_name has already been validated +// but validates the legality of index_name and the combination of both. 
+std::string view_name(std::string_view table_name, std::string_view index_name, + const std::string& delim = ":", bool validate_len = true); +std::string gsi_name(std::string_view table_name, std::string_view index_name, + bool validate_len = true); +std::string lsi_name(std::string_view table_name, std::string_view index_name, + bool validate_len = true); + +/// After calling pk_from_json() and ck_from_json() to extract the pk and ck +/// components of a key, and if that succeeded, call check_key() to further +/// check that the key doesn't have any spurious components. +void check_key(const rjson::value& key, const schema_ptr& schema); + +/// Fail with api_error::validation if the expression if has unused attribute +/// names or values. This is how DynamoDB behaves, so we do too. +void verify_all_are_used(const rjson::value* field, + const std::unordered_set& used, + const char* field_name, + const char* operation); + +/// Check CQL's Role-Based Access Control (RBAC) permission (MODIFY, +/// SELECT, DROP, etc.) on the given table. When permission is denied an +/// appropriate user-readable api_error::access_denied is thrown. +future<> verify_permission(bool enforce_authorization, bool warn_authorization, const service::client_state&, const schema_ptr&, auth::permission, stats& stats); + +/// Similar to verify_permission() above, but just for CREATE operations. +/// Those do not operate on any specific table, so require permissions on +/// ALL KEYSPACES instead of any specific table. +future<> verify_create_permission(bool enforce_authorization, bool warn_authorization, const service::client_state&, stats& stats); + +// Sets a KeySchema JSON array inside the given parent object describing the +// key attributes of the given schema as HASH or RANGE keys. Additionally, +// adds mappings from key attribute names to their DynamoDB type string into +// attribute_types. 
+void describe_key_schema(rjson::value& parent, const schema&, std::unordered_map* attribute_types = nullptr, const std::map* tags = nullptr); + +/// is_big() checks approximately if the given JSON value is "bigger" than +/// the given big_size number of bytes. The goal is to *quickly* detect +/// oversized JSON that, for example, is too large to be serialized to a +/// contiguous string - we don't need an accurate size for that. Moreover, +/// as soon as we detect that the JSON is indeed "big", we can return true +/// and don't need to continue calculating its exact size. +bool is_big(const rjson::value& val, int big_size = 100'000); + +/// try_get_internal_table() handles the special case that the given table_name +/// begins with INTERNAL_TABLE_PREFIX (".scylla.alternator."). In that case, +/// this function assumes that the rest of the name refers to an internal +/// Scylla table (e.g., system table) and returns the schema of that table - +/// or an exception if it doesn't exist. Otherwise, if table_name does not +/// start with INTERNAL_TABLE_PREFIX, this function returns an empty schema_ptr +/// and the caller should look for a normal Alternator table with that name. +schema_ptr try_get_internal_table(const data_dictionary::database& db, std::string_view table_name); + +/// get_table_from_batch_request() is used by batch write/read operations to +/// look up the schema for a table named in a batch request, by the JSON member +/// name (which is the table name in a BatchWriteItem or BatchGetItem request). +schema_ptr get_table_from_batch_request(const service::storage_proxy& proxy, const rjson::value::ConstMemberIterator& batch_request); + +/// Returns (or lazily creates) the per-table stats object for the given schema. +/// If the table has been deleted, returns a temporary stats object. 
+lw_shared_ptr get_stats_from_schema(service::storage_proxy& sp, const schema& schema); + +/// Writes one item's attributes into `item` from the given selection result +/// row. If include_all_embedded_attributes is true, all attributes from the +/// ATTRS_COLUMN map column are included regardless of attrs_to_get. +void describe_single_item(const cql3::selection::selection&, + const std::vector&, + const std::optional&, + rjson::value&, + uint64_t* item_length_in_bytes = nullptr, + bool include_all_embedded_attributes = false); + +/// Converts a single result row to a JSON item, or returns an empty optional +/// if the result is empty. +std::optional describe_single_item(schema_ptr, + const query::partition_slice&, + const cql3::selection::selection&, + const query::result&, + const std::optional&, + uint64_t* item_length_in_bytes = nullptr); + +/// Make a body_writer (function that can write output incrementally to the +/// HTTP stream) from the given JSON object. +/// Note: only useful for (very) large objects as there are overhead issues +/// with this as well, but for massive lists of return objects this can +/// help avoid large allocations/many re-allocs. +body_writer make_streamed(rjson::value&&); + +} // namespace alternator diff --git a/alternator/expressions.cc b/alternator/expressions.cc index f06e5c3e5b..8537a88c9a 100644 --- a/alternator/expressions.cc +++ b/alternator/expressions.cc @@ -744,7 +744,7 @@ void validate_attr_name_length(std::string_view supplementary_context, size_t at constexpr const size_t DYNAMODB_NONKEY_ATTR_NAME_SIZE_MAX = 65535; const size_t max_length = is_key ? 
DYNAMODB_KEY_ATTR_NAME_SIZE_MAX : DYNAMODB_NONKEY_ATTR_NAME_SIZE_MAX; - if (attr_name_length > max_length) { + if (attr_name_length > max_length || attr_name_length == 0) { std::string error_msg; if (!error_msg_prefix.empty()) { error_msg += error_msg_prefix; @@ -754,7 +754,11 @@ void validate_attr_name_length(std::string_view supplementary_context, size_t at error_msg += supplementary_context; error_msg += " - "; } - error_msg += fmt::format("Attribute name is too large, must be less than {} bytes", std::to_string(max_length + 1)); + if (attr_name_length == 0) { + error_msg += "Empty attribute name"; + } else { + error_msg += fmt::format("Attribute name is too large, must be less than {} bytes", std::to_string(max_length + 1)); + } throw api_error::validation(error_msg); } } diff --git a/alternator/http_compression.cc b/alternator/http_compression.cc index fd1315b111..88541248ba 100644 --- a/alternator/http_compression.cc +++ b/alternator/http_compression.cc @@ -264,7 +264,7 @@ private: } }; -executor::body_writer compress(response_compressor::compression_type ct, const db::config& cfg, executor::body_writer&& bw) { +body_writer compress(response_compressor::compression_type ct, const db::config& cfg, body_writer&& bw) { return [bw = std::move(bw), ct, level = cfg.alternator_response_gzip_compression_level()](output_stream&& out) mutable -> future<> { output_stream_options opts; opts.trim_to_size = true; @@ -287,7 +287,7 @@ executor::body_writer compress(response_compressor::compression_type ct, const d }; } -future> response_compressor::generate_reply(std::unique_ptr rep, sstring accept_encoding, const char* content_type, executor::body_writer&& body_writer) { +future> response_compressor::generate_reply(std::unique_ptr rep, sstring accept_encoding, const char* content_type, body_writer&& body_writer) { response_compressor::compression_type ct = find_compression(accept_encoding, std::numeric_limits::max()); if (ct != response_compressor::compression_type::none) { 
rep->add_header("Content-Encoding", get_encoding_name(ct)); diff --git a/alternator/http_compression.hh b/alternator/http_compression.hh index c5bbb7720f..8124151be3 100644 --- a/alternator/http_compression.hh +++ b/alternator/http_compression.hh @@ -85,7 +85,7 @@ public: future> generate_reply(std::unique_ptr rep, sstring accept_encoding, const char* content_type, std::string&& response_body); future> generate_reply(std::unique_ptr rep, - sstring accept_encoding, const char* content_type, executor::body_writer&& body_writer); + sstring accept_encoding, const char* content_type, body_writer&& body_writer); }; } diff --git a/alternator/serialization.cc b/alternator/serialization.cc index ff16564b12..850b6dcfa3 100644 --- a/alternator/serialization.cc +++ b/alternator/serialization.cc @@ -14,12 +14,12 @@ #include "types/concrete_types.hh" #include "types/json_utils.hh" #include "mutation/position_in_partition.hh" +#include "alternator/executor_util.hh" static logging::logger slogger("alternator-serialization"); namespace alternator { -bool is_alternator_keyspace(const sstring& ks_name); type_info type_info_from_string(std::string_view type) { static thread_local const std::unordered_map type_infos = { diff --git a/alternator/server.cc b/alternator/server.cc index 762b0234bb..68f7cacf7a 100644 --- a/alternator/server.cc +++ b/alternator/server.cc @@ -8,6 +8,7 @@ #include "alternator/server.hh" #include "audit/audit.hh" +#include "alternator/executor_util.hh" #include "gms/application_state.hh" #include "utils/log.hh" #include @@ -143,7 +144,7 @@ public: return _response_compressor.generate_reply(std::move(rep), std::move(accept_encoding), REPLY_CONTENT_TYPE, std::move(str)); }, - [&] (executor::body_writer&& body_writer) { + [&] (body_writer&& body_writer) { return _response_compressor.generate_reply(std::move(rep), std::move(accept_encoding), REPLY_CONTENT_TYPE, std::move(body_writer)); }, diff --git a/alternator/streams.cc b/alternator/streams.cc index 
f44eec2c85..1f77229e6d 100644 --- a/alternator/streams.cc +++ b/alternator/streams.cc @@ -34,6 +34,7 @@ #include "executor.hh" #include "streams.hh" +#include "alternator/executor_util.hh" #include "data_dictionary/data_dictionary.hh" #include "utils/rjson.hh" @@ -282,7 +283,7 @@ future alternator::executor::list_str auto arn = stream_arn{ i->schema(), cdc::get_base_table(db.real_database(), *i->schema()) }; rjson::add(new_entry, "StreamArn", arn); rjson::add(new_entry, "StreamLabel", rjson::from_string(stream_label(*s))); - rjson::add(new_entry, "TableName", rjson::from_string(cdc::base_name(table_name(*s)))); + rjson::add(new_entry, "TableName", rjson::from_string(cdc::base_name(s->cf_name()))); rjson::push_back(streams, std::move(new_entry)); --limit; } @@ -883,7 +884,7 @@ future executor::describe_stream(client_state& cl rjson::add(stream_desc, "StreamArn", stream_arn); rjson::add(stream_desc, "StreamViewType", type); - rjson::add(stream_desc, "TableName", rjson::from_string(table_name(*bs))); + rjson::add(stream_desc, "TableName", rjson::from_string(bs->cf_name())); describe_key_schema(stream_desc, *bs); diff --git a/alternator/ttl.cc b/alternator/ttl.cc index f61b57f5f7..3f3c57c00b 100644 --- a/alternator/ttl.cc +++ b/alternator/ttl.cc @@ -44,6 +44,7 @@ #include "cql3/query_options.hh" #include "cql3/column_identifier.hh" #include "alternator/executor.hh" +#include "alternator/executor_util.hh" #include "alternator/controller.hh" #include "alternator/serialization.hh" #include "alternator/ttl_tag.hh" diff --git a/cdc/log.cc b/cdc/log.cc index 955f1f290f..89fbcabe6c 100644 --- a/cdc/log.cc +++ b/cdc/log.cc @@ -195,7 +195,7 @@ public: for (auto sp : cfms) { const auto& schema = *sp; - if (!schema.cdc_options().enabled()) { + if (!cdc_enabled(schema)) { continue; } diff --git a/configure.py b/configure.py index 39f18e334a..8f23fb688b 100755 --- a/configure.py +++ b/configure.py @@ -1438,6 +1438,8 @@ alternator = [ 'alternator/controller.cc', 
'alternator/server.cc', 'alternator/executor.cc', + 'alternator/executor_read.cc', + 'alternator/executor_util.cc', 'alternator/stats.cc', 'alternator/serialization.cc', 'alternator/expressions.cc', diff --git a/docs/alternator/alternator.md b/docs/alternator/alternator.md index 661dcffeb8..b932e1230c 100644 --- a/docs/alternator/alternator.md +++ b/docs/alternator/alternator.md @@ -151,4 +151,5 @@ attribute a, modifying only a.b[3].c, and then writing back a. compatibility new-apis network + vector-search ``` diff --git a/docs/alternator/vector-search.md b/docs/alternator/vector-search.md new file mode 100644 index 0000000000..7038e41d62 --- /dev/null +++ b/docs/alternator/vector-search.md @@ -0,0 +1,322 @@ +# Alternator Vector Search + +## Introduction + +Alternator vector search is a ScyllaDB extension to the DynamoDB-compatible +API that enables _approximate nearest neighbor_ (ANN) search on numeric +vectors stored as item attributes. + +In a typical use case, each item in a table contains a high-dimensional +embedding vector (e.g., produced by a machine-learning model), and a query +asks for the _k_ items whose stored vectors are closest to a given query +vector. This kind of similarity search is a building block for +recommendation engines, semantic text search, image retrieval, and other +AI/ML workloads. + +Because this feature does not exist in Amazon DynamoDB, all applications +that use it must be written specifically for Alternator. + +For a broader introduction to Vector Search concepts and terminology, see the +[Vector Search Concepts](https://cloud.docs.scylladb.com/stable/vector-search/vector-search-concepts.html) +and +[Vector Search Glossary](https://cloud.docs.scylladb.com/stable/vector-search/vector-search-glossary.html) +sections of the ScyllaDB Cloud documentation. + +## Overview + +The workflow has three steps: + +1. **Create** a table (or update an existing one) with one or more + _vector indexes_. +2. 
**Write** items that include the indexed vector attribute, just like any + other list attribute. +3. **Query** using the `VectorSearch` parameter to retrieve the _k_ nearest + neighbors. + +## API extensions + +### CreateTable — VectorIndexes parameter + +A new optional parameter `VectorIndexes` can be passed to `CreateTable`. +It is a list of vector index definitions, each specifying: + +| Field | Type | Description | +|-------|------|-------------| +| `IndexName` | String | Unique name for this vector index. Follows the same naming rules as table names: 3–192 characters, matching the regex `[a-zA-Z0-9._-]+`. | +| `VectorAttribute` | Structure | Describes the attribute to index (see below). | +| `Projection` | Structure | Optional. Specifies which attributes are projected into the vector index (see below). | + +**VectorAttribute fields:** + +| Field | Type | Description | +|-------|------|-------------| +| `AttributeName` | String | The item attribute that holds the vector. It must not be a key column. | +| `Dimensions` | Integer | The fixed size of the vector (number of elements). | + +**Projection fields:** + +The optional `Projection` parameter is identical to the one used for DynamoDB +GSI and LSI, and specifies which attributes are stored in the vector index and +returned when `Select=ALL_PROJECTED_ATTRIBUTES` is used in a vector search query: + +| `ProjectionType` | Description | +|-----------------|-------------| +| `KEYS_ONLY` | Only the primary key attributes of the base table (hash key and range key if present) are projected into the index. This is the default when `Projection` is omitted. | +| `ALL` | All table attributes are projected into the index. *(Not yet supported.)* | +| `INCLUDE` | The primary key attributes plus the additional non-key attributes listed in `NonKeyAttributes` are projected. *(Not yet supported.)* | + +> **Note:** Currently only `ProjectionType=KEYS_ONLY` is implemented. 
Specifying +> `ProjectionType=ALL` or `ProjectionType=INCLUDE` returns a `ValidationException`. +> Since `KEYS_ONLY` is also the default, omitting `Projection` entirely is +> equivalent to specifying `{'ProjectionType': 'KEYS_ONLY'}`. + +Example (using boto3): +```python +table = dynamodb.create_table( + TableName='my-table', + KeySchema=[{'AttributeName': 'id', 'KeyType': 'HASH'}], + AttributeDefinitions=[{'AttributeName': 'id', 'AttributeType': 'S'}], + BillingMode='PAY_PER_REQUEST', + VectorIndexes=[ + { + 'IndexName': 'embedding-index', + 'VectorAttribute': {'AttributeName': 'embedding', 'Dimensions': 1536}, + } + ], +) +``` + +**Constraints:** +- A vector index may not share a name with another vector index, a GSI, or an LSI on the same table. +- The target attribute must not be a key column or an index key column. +- `Dimensions` must be a positive integer up to the implementation maximum. +- Vector indexes require ScyllaDB to operate with tablets (not vnodes). +- Multiple vector indexes can be created on the same table in a single `CreateTable` call. + +--- + +### UpdateTable — VectorIndexUpdates parameter + +A new optional parameter `VectorIndexUpdates` can be passed to `UpdateTable` +to add or remove a vector index after the table is created. At most one +index operation (Create or Delete) may be requested per call. + +Each element of the list is an object with exactly one of the following keys: + +**Create:** +```json +{ + "Create": { + "IndexName": "my-vector-index", + "VectorAttribute": {"AttributeName": "embedding", "Dimensions": 1536}, + "Projection": {"ProjectionType": "KEYS_ONLY"} + } +} +``` + +The `Projection` field in the `Create` action is optional and accepts the same +values as the `Projection` field in `CreateTable`'s `VectorIndexes` (see above). +Currently only `ProjectionType=KEYS_ONLY` is supported; it is also the default +when `Projection` is omitted. 
+ +**Delete:** +```json +{ + "Delete": { + "IndexName": "my-vector-index" + } +} +``` + +The same constraints as `CreateTable`'s `VectorIndexes` apply. + +--- + +### DescribeTable — VectorIndexes in the response + +`DescribeTable` (and `CreateTable`'s response) returns a `VectorIndexes` +field in the `TableDescription` object when the table has at least one +vector index. The structure mirrors the `CreateTable` input: a list of +objects each containing `IndexName`, `VectorAttribute` +(`AttributeName` + `Dimensions`), and `Projection` (`ProjectionType`). +Currently `Projection` always contains `{"ProjectionType": "KEYS_ONLY"}` +because that is the only supported projection type. + +Each vector index entry also includes status fields that mirror the standard +behavior of `GlobalSecondaryIndexes` in DynamoDB: + +| Field | Type | Description | +|-------|------|-------------| +| `IndexStatus` | String | `"ACTIVE"` when the vector store has finished building the index and it is fully operational. `"CREATING"` while the index is still being built (the vector store service is still initializing or has not yet been discovered). | +| `Backfilling` | Boolean | Present and `true` only when `IndexStatus` is `"CREATING"` and the vector store is actively performing the initial scan of the base table (bootstrapping). When the initial scan is not yet started, or has already completed, this field is absent. | + +When creating a vector index on a non-empty table (via `UpdateTable`), data +already in the table is picked up by the vector store through a full-table +scan (backfill). During this period `IndexStatus` will be `"CREATING"` and +`Backfilling` will be `true`. Once the scan completes the vector store +transitions to monitoring CDC for ongoing changes, and `IndexStatus` becomes +`"ACTIVE"`. 
+ +**Type enforcement before and after index creation differs:** + +- **Pre-existing items:** when the backfill scan encounters an item whose + indexed attribute is not a list of exactly the right number of numbers + (e.g., it is a string, a list of the wrong length, or contains + non-numeric elements), that item is silently skipped and not indexed. + The item remains in the base table unchanged. + +- **New writes once the index exists:** any attempt to write a value to the + indexed attribute that is not a list of exactly `Dimensions` numbers — + where each number is representable as a 32-bit float — is rejected with a + `ValidationException`. This applies to `PutItem`, `UpdateItem`, and + `BatchWriteItem`. A missing value for the indexed attribute is always + allowed; such items simply are not indexed. + +> **Important:** Applications must wait until `IndexStatus` is `"ACTIVE"` before +> issuing `Query` requests against a vector index. Queries on a vector index +> whose `IndexStatus` is still `"CREATING"` may fail. This applies both when +> adding a vector index to an existing table via `UpdateTable` **and** when +> creating a new table with a `VectorIndexes` parameter in `CreateTable` — even +> though the new table starts empty, the vector store still needs a short +> initialization period before it can serve queries. + +--- + +### Query — VectorSearch parameter + +To perform a nearest-neighbor search, pass the `VectorSearch` parameter +to `Query`. When this parameter is present the request is interpreted as a +vector search rather than a standard key-condition query. + +**VectorSearch fields:** + +| Field | Type | Description | +|-------|------|-------------| +| `QueryVector` | AttributeValue (list `L`) | The query vector as a DynamoDB `AttributeValue` of type `L`. Every element must be of type `N` (number). 
| + +Example: +```python +response = table.query( + IndexName='embedding-index', + Limit=10, + VectorSearch={ + 'QueryVector': {'L': [{'N': '0.1'}, {'N': '-0.3'}, {'N': '0.7'}, ...]}, + }, +) +``` + +**Requirements:** + +| Parameter | Details | +|-----------|---------| +| `IndexName` | Required. Must name a vector index on this table (not a GSI or LSI). | +| `VectorSearch.QueryVector` | Required. A DynamoDB `AttributeValue` of type `L`; all elements must be of type `N` (number). | +| QueryVector length | Must match the `Dimensions` configured for the named vector index. | +| `Limit` | Required. Defines _k_ — how many nearest neighbors to return. Must be a positive integer. | + +**Differences from standard Query:** + +Vector search reinterprets several standard `Query` parameters in a fundamentally +different way, and explicitly rejects others that have no meaningful interpretation: + +- **`Limit` means top-k, not page size.** In a standard Query, `Limit` caps + the number of items examined per page, and you page through results with + `ExclusiveStartKey`. In vector search, `Limit` defines _k_: the ANN + algorithm runs once and returns exactly the _k_ nearest neighbors. There + is no natural "next page" — each page would require a full re-run of the + search — so **`ExclusiveStartKey` is rejected**. + +- **Results are ordered by vector distance, not by sort key.** A standard + Query returns rows in sort-key order; `ScanIndexForward=false` reverses + that order. Vector search always returns results ordered by their distance + to `QueryVector` (nearest first). Having `ScanIndexForward` specify sort-key + direction has no meaning here, so **`ScanIndexForward` is rejected**. + +- **Eventual consistency only.** The vector store is an external service fed + asynchronously from ScyllaDB via CDC. Like GSIs, vector indexes can never + reflect writes instantly, so strongly-consistent reads are impossible. 
+ **`ConsistentRead=true` is rejected.** + +- **No key condition.** A standard Query requires a `KeyConditionExpression` + to select which partition to read. Vector search queries the vector store + globally across all partitions of the table. `KeyConditions` and + `KeyConditionExpression` are therefore not applicable and are silently + ignored. (Local vector indexes, which would scope the search to a single + partition and use `KeyConditionExpression`, are not yet supported.) + +**Select parameter:** + +The standard DynamoDB `Select` parameter is supported for vector search queries +and controls which attributes are returned for each matching item: + +| `Select` value | Behavior | +|----------------|----------| +| `ALL_PROJECTED_ATTRIBUTES` (default) | Return only the attributes projected to the vector index. Currently, only the primary key attributes (hash key, and range key if present) are projected; support for configuring additional projected attributes is not yet implemented. Note that the vector attribute itself is **not** included: the vector store may not retain the original floating-point values (e.g., it may quantize them), so the authoritative copy lives only in the base table. This is the most efficient option because Scylla can return the results directly from the vector store without an additional fetch from the base table. | +| `ALL_ATTRIBUTES` | Return all attributes of each matching item, fetched from the base table. | +| `SPECIFIC_ATTRIBUTES` | Return only the attributes named in `ProjectionExpression` or `AttributesToGet`. | +| `COUNT` | Return only the count of matching items; no `Items` list is included in the response. | + +When neither `Select` nor `ProjectionExpression`/`AttributesToGet` is specified, +`Select` defaults to `ALL_PROJECTED_ATTRIBUTES`. When `ProjectionExpression` or +`AttributesToGet` is present without an explicit `Select`, it implies +`SPECIFIC_ATTRIBUTES`. 
Using `ProjectionExpression` or `AttributesToGet` +together with an explicit `Select` other than `SPECIFIC_ATTRIBUTES` is an error. + +**Note on performance:** Unlike a DynamoDB LSI, a vector index allows you to +read non-projected attributes (e.g., with `ALL_ATTRIBUTES` or +`SPECIFIC_ATTRIBUTES` requesting a non-key column). However, doing so requires +an additional read from the base table for each result — similar to reading +through a secondary index (rather than a materialized view) in CQL — and is +therefore significantly slower than returning only projected attributes with +`ALL_PROJECTED_ATTRIBUTES`. For latency-sensitive applications, prefer +`ALL_PROJECTED_ATTRIBUTES` or limiting `SPECIFIC_ATTRIBUTES` to key columns. + +**FilterExpression:** + +Vector search supports `FilterExpression` for post-filtering results. This +works the same way as `FilterExpression` on a standard DynamoDB `Query`: after +the ANN search, the filter is applied to each candidate item and only matching +items are returned. + +**Important:** filtering happens _after_ the `Limit` nearest neighbors have +already been selected by the vector index. If the filter discards some of +those candidates, the response may contain **fewer than `Limit` items**. The +server does not automatically fetch additional neighbors to replace filtered-out +items. This is identical to how `FilterExpression` interacts with `Limit` in a +standard DynamoDB `Query`. + +The response always includes two count fields: + +| Field | Description | +|-------|-------------| +| `ScannedCount` | The number of candidates returned by the vector index (always equal to `Limit`, unless the table contains fewer than `Limit` items). | +| `Count` | The number of items that passed the `FilterExpression` (or equal to `ScannedCount` when no filter is present). 
| + +**Interaction with `Select`:** + +- `Select=ALL_ATTRIBUTES`: Each candidate item is fetched from the + base table, the filter is evaluated against all its attributes, and only + matching items are returned. `Count` reflects the number of items that passed + the filter. + +- `Select=SPECIFIC_ATTRIBUTES`: Each candidate item is fetched from the base + table — including any attributes needed by the filter expression, even if + those attributes are not listed in `ProjectionExpression` — and the filter is + applied. Only the projected attributes are returned in the response; filter + attributes that were not requested are not included in the returned items. + +- `Select=COUNT`: The candidate items are still fetched from the base table and + the filter is evaluated for each one, but no `Items` list is returned. `Count` + reflects the number of items that passed the filter; `ScannedCount` is the + total number of candidates examined. This is useful for counting matches + without transferring item data to the client. + +- `Select=ALL_PROJECTED_ATTRIBUTES` (default): When no filter is present this is the most + efficient mode — results are returned directly from the vector store without + any base-table reads. When a `FilterExpression` is present, however, the full + item must be fetched from the base table to evaluate the filter, and only the + projected (key) attributes are returned for items that pass. + +> **Note:** `QueryFilter` (the legacy non-expression filter API) is **not** +> supported for vector search queries and will be rejected with a +> `ValidationException`. Use `FilterExpression` instead. 
diff --git a/main.cc b/main.cc index d7bf653d35..9e5bb50c7b 100644 --- a/main.cc +++ b/main.cc @@ -2609,7 +2609,7 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl api::set_server_service_levels(ctx, cql_server_ctl, qp).get(); - alternator::controller alternator_ctl(gossiper, proxy, ss, mm, sys_dist_ks, cdc_generation_service, service_memory_limiter, auth_service, sl_controller, *cfg, dbcfg.statement_scheduling_group); + alternator::controller alternator_ctl(gossiper, proxy, ss, mm, sys_dist_ks, cdc_generation_service, service_memory_limiter, auth_service, sl_controller, vector_store_client, *cfg, dbcfg.statement_scheduling_group); // Register at_exit last, so that storage_service::drain_on_shutdown will be called first auto do_drain = defer_verbose_shutdown("local storage", [&ss] { diff --git a/test/alternator/conftest.py b/test/alternator/conftest.py index 620afa60ec..ff2d8302ef 100644 --- a/test/alternator/conftest.py +++ b/test/alternator/conftest.py @@ -154,8 +154,7 @@ def new_dynamodb_session(request, dynamodb, get_valid_alternator_role): conf = botocore.client.Config(parameter_validation=False) if request.config.getoption('aws'): return boto3.resource('dynamodb', config=conf) - if host.hostname == 'localhost': - conf = conf.merge(botocore.client.Config(retries={"max_attempts": 0}, read_timeout=300)) + conf = conf.merge(botocore.client.Config(retries={"max_attempts": 0}, read_timeout=300)) user, secret = get_valid_alternator_role(dynamodb.meta.client._endpoint.host, role=user) region_name = dynamodb.meta.client.meta.region_name return ses.resource('dynamodb', endpoint_url=dynamodb.meta.client._endpoint.host, verify=host.scheme != 'http', diff --git a/test/alternator/run b/test/alternator/run index 9a88a75cb6..8c96fd0b82 100755 --- a/test/alternator/run +++ b/test/alternator/run @@ -6,6 +6,7 @@ import run import os import requests +import glob # When tests are to be run against AWS (the "--aws" option), it is not # necessary 
to start Scylla at all. All we need to do is to run pytest. @@ -83,6 +84,8 @@ def run_alternator_cmd(pid, dir): ] else: cmd += ['--alternator-port', '8000'] + if '--vs' in sys.argv: + cmd += ['--vector-store-primary-uri', f'http://{ip}:6080'] cmd += extra_scylla_options for i in remove_scylla_options: @@ -111,6 +114,49 @@ if '--https' in sys.argv: else: alternator_url=f"http://{ip}:8000" +# If the "--vs" option is given, also run the vector store process. +# The vector store is run in its own temporary directory, but runs +# on the same IP address as Scylla (otherwise, the first of the two +# which we will run will not know where to find the second). +def run_vector_store_cmd(pid, dir): + global ip # same IP as Scylla, see comment above + print('Booting Vector Store on ' + ip + ' in ' + dir + '...') + with open(f'{dir}/.password', 'w') as f: + print('cassandra', file=f) + env = { + 'VECTOR_STORE_URI': f'{ip}:6080', + 'VECTOR_STORE_SCYLLADB_URI': f'{ip}:9042', + 'VECTOR_STORE_SCYLLADB_USERNAME': 'cassandra', + 'VECTOR_STORE_SCYLLADB_PASSWORD_FILE': f'{dir}/.password', + } + global vector_store_executable # set by the code below + cmd = [ + vector_store_executable + ] + return (cmd, env) +if '--vs' in sys.argv: + sys.argv.remove('--vs') + # Find the vector-store executable. 
We look for it in the vector-store/ + directory next to the Scylla working directory, taking the newest built + executable, but it can also be specified by the user by setting the + VECTOR_STORE environment variable to the path of the executable. if os.getenv('SCYLLA'): + global vector_store_executable + if os.getenv('VECTOR_STORE'): + vector_store_executable = os.path.abspath(os.getenv('VECTOR_STORE')) + else: + vector_store_dir = os.path.join(os.path.dirname(run.source_path), 'vector-store') + vector_stores = glob.glob(os.path.join(vector_store_dir, 'target/*/vector-store')) + if not vector_stores: + print(f"Can't find a compiled Vector Store in {vector_store_dir}.\nPlease build Vector Store or set VECTOR_STORE to the path of a Vector Store executable.") + exit(1) + vector_store_executable = max(vector_stores, key=os.path.getmtime) + if not os.access(vector_store_executable, os.X_OK): + print(f"Cannot execute '{vector_store_executable}'.\nPlease set VECTOR_STORE to the path of a Vector Store executable.") + exit(1) + print(f"Vector Store is: {vector_store_executable}.") + run.run_with_temporary_dir(run_vector_store_cmd) + + # Wait for both CQL and Alternator APIs to become responsive. We obviously # need the Alternator API to test Alternator, but currently we also need # CQL for setting up authentication. 
diff --git a/test/alternator/test_gsi.py b/test/alternator/test_gsi.py index 1b5ac0855f..95bebd7b27 100644 --- a/test/alternator/test_gsi.py +++ b/test/alternator/test_gsi.py @@ -1672,6 +1672,22 @@ def test_gsi_query_select_1(test_table_gsi_1): Select='SPECIFIC_ATTRIBUTES', AttributesToGet=['y'], KeyConditions={'c': {'AttributeValueList': [c], 'ComparisonOperator': 'EQ'}}) + assert_index_query(test_table_gsi_1, 'hello', expected_items, + Select='SPECIFIC_ATTRIBUTES', + ProjectionExpression='y', + KeyConditions={'c': {'AttributeValueList': [c], 'ComparisonOperator': 'EQ'}}) + # If AttributesToGet or ProjectionExpression are used, SPECIFIC_ATTRIBUTES + # is implied, and can be omitted. + assert_index_query(test_table_gsi_1, 'hello', expected_items, + AttributesToGet=['y'], + KeyConditions={'c': {'AttributeValueList': [c], 'ComparisonOperator': 'EQ'}}) + assert_index_query(test_table_gsi_1, 'hello', expected_items, + ProjectionExpression='y', + KeyConditions={'c': {'AttributeValueList': [c], 'ComparisonOperator': 'EQ'}}) + assert_index_query(test_table_gsi_1, 'hello', expected_items, + ProjectionExpression='#name', + ExpressionAttributeNames={'#name': 'y'}, + KeyConditions={'c': {'AttributeValueList': [c], 'ComparisonOperator': 'EQ'}}) assert not 'Items' in test_table_gsi_1.query(ConsistentRead=False, IndexName='hello', Select='COUNT', @@ -1714,13 +1730,24 @@ def test_gsi_query_select_2(dynamodb): Select='ALL_ATTRIBUTES', KeyConditions={'x': {'AttributeValueList': [x], 'ComparisonOperator': 'EQ'}}) # SPECIFIC_ATTRIBUTES (with AttributesToGet / ProjectionExpression) - # is allowed for the projected attributes, but not for unprojected - # attributes. 
+ # is allowed for the projected attributes, but not allowed for + # unprojected attributes: expected_items = [{'a': z['a']} for z in items if z['x'] == x] assert_index_query(table, 'hello', expected_items, Select='SPECIFIC_ATTRIBUTES', AttributesToGet=['a'], KeyConditions={'x': {'AttributeValueList': [x], 'ComparisonOperator': 'EQ'}}) + # Requesting an unprojected attribute 'b' via AttributesToGet or + # ProjectionExpression returns an explicit error, not silent nothing. + with pytest.raises(ClientError, match='ValidationException.*project'): + table.query(IndexName='hello', + Select='SPECIFIC_ATTRIBUTES', + AttributesToGet=['b'], + KeyConditions={'x': {'AttributeValueList': [x], 'ComparisonOperator': 'EQ'}}) + with pytest.raises(ClientError, match='ValidationException.*project'): + table.query(IndexName='hello', + ProjectionExpression='b', + KeyConditions={'x': {'AttributeValueList': [x], 'ComparisonOperator': 'EQ'}}) # Select=COUNT is also allowed, and doesn't return item content assert not 'Items' in table.query(ConsistentRead=False, IndexName='hello', diff --git a/test/alternator/test_item.py b/test/alternator/test_item.py index 7cfb89f7fa..2ace2ce175 100644 --- a/test/alternator/test_item.py +++ b/test/alternator/test_item.py @@ -878,3 +878,13 @@ def test_many_attributes(test_table_s): AttributeUpdates={key: {'Value': more_attributes[key], 'Action': 'PUT'} for key in more_attributes.keys()}) item = {**item, **more_attributes} assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item'] == item + +# Test that attribute names can contain basically any character - even things +# like backslashes, quotes, spaces, newlines, and even null (!). 
+def test_attribute_allowed_chars(test_table_s): + p = random_string() + s = bytes(range(256)).decode('latin-1') + for chars in ['abc', ' ', "-\\\"'_.:/#&", s]: + test_table_s.update_item(Key={'p': p}, + AttributeUpdates={chars: {'Value': chars, 'Action': 'PUT'}}) + assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item'][chars] == chars \ No newline at end of file diff --git a/test/alternator/test_limits.py b/test/alternator/test_limits.py index cf1898b913..fb8ba7df3b 100644 --- a/test/alternator/test_limits.py +++ b/test/alternator/test_limits.py @@ -84,6 +84,29 @@ def test_limit_attribute_length_nonkey_bad(test_table_s): ExpressionAttributeNames={'#name': too_long_name}, ExpressionAttributeValues={':val': 1}) +# Empty attribute name is also not allowed. Reproduces SCYLLADB-1069. +# We have similar tests for empty keys in test_item.py::test_{put,update}_item_empty_key. +def test_limit_attribute_length_nonkey_empty(test_table_s): + p = random_string() + with pytest.raises(ClientError, match='ValidationException.*Empty attribute name'): + test_table_s.put_item(Item={'p': p, '': 1}) + with pytest.raises(ClientError, match='ValidationException.*Empty attribute name'): + test_table_s.get_item(Key={'p': p}, ProjectionExpression='#name', + ExpressionAttributeNames={'#name': ''}) + with pytest.raises(ClientError, match='ValidationException.*Empty attribute name'): + test_table_s.get_item(Key={'p': p}, AttributesToGet=['']) + with pytest.raises(ClientError, match='ValidationException.*Empty attribute name'): + test_table_s.update_item(Key={'p': p}, AttributeUpdates={'': {'Value': 2, 'Action': 'PUT'}}) + with pytest.raises(ClientError, match='ValidationException.*Empty attribute name'): + test_table_s.update_item(Key={'p': p}, UpdateExpression='SET #name = :val', + ExpressionAttributeNames={'#name': ''}, + ExpressionAttributeValues={':val': 3}) + with pytest.raises(ClientError, match='ValidationException.*Empty attribute name'): + 
test_table_s.update_item(Key={'p': p}, UpdateExpression='SET a = :val', + ConditionExpression='#name = :val', + ExpressionAttributeNames={'#name': ''}, + ExpressionAttributeValues={':val': 1}) + # Attribute length test 3: Test that *key* (hash and range) attribute names # up to 255 characters are allowed. In the test below we'll see that larger # sizes aren't allowed. diff --git a/test/alternator/test_metrics.py b/test/alternator/test_metrics.py index b09a994b3e..2ab1e5fb6a 100644 --- a/test/alternator/test_metrics.py +++ b/test/alternator/test_metrics.py @@ -35,6 +35,7 @@ from botocore.exceptions import ClientError from test.alternator.test_cql_rbac import new_dynamodb, new_role from test.alternator.util import random_string, new_test_table, is_aws, scylla_config_read, scylla_config_temporary, get_signed_request +from test.alternator.test_vector import vs # Fixture for checking if we are able to test Scylla metrics. Scylla metrics # are not available on AWS (of course), but may also not be available for @@ -396,6 +397,29 @@ def test_scan_operations(test_table_s, metrics): test_table_s.query(Limit=1, KeyConditionExpression='p=:p', ExpressionAttributeValues={':p': 'dog'}) +def test_table_scan_operations(test_table_s, metrics): + with check_table_increases_operation(metrics, ['Query', 'Scan'], test_table_s.name): + test_table_s.scan(Limit=1) + test_table_s.query(Limit=1, KeyConditionExpression='p=:p', + ExpressionAttributeValues={':p': 'dog'}) + +# Test counter for Query with VectorSearch: both global and per-table. 
+def test_query_vector_operations(vs, metrics): + with new_test_table(vs, + KeySchema=[{'AttributeName': 'p', 'KeyType': 'HASH'}], + AttributeDefinitions=[{'AttributeName': 'p', 'AttributeType': 'S'}], + VectorIndexes=[{'IndexName': 'vind', + 'VectorAttribute': {'AttributeName': 'v', 'Dimensions': 3}}]) as table: + with check_increases_operation(metrics, ['Query']): + with check_table_increases_operation(metrics, ['Query'], table.name): + # The vector store may or may not be configured; either way + # the Query counter must be incremented. + try: + table.query(IndexName='vind', + VectorSearch={'QueryVector': [1, 2, 3]}, Limit=1) + except ClientError: + pass + # Test counters for DescribeEndpoints: def test_describe_endpoints_operations(dynamodb, metrics): with check_increases_operation(metrics, ['DescribeEndpoints']): diff --git a/test/alternator/test_vector.py b/test/alternator/test_vector.py new file mode 100644 index 0000000000..8b643f7e14 --- /dev/null +++ b/test/alternator/test_vector.py @@ -0,0 +1,2208 @@ +# Copyright 2026-present ScyllaDB +# +# SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.1 + +# Tests for the vector search feature. This is an Alternator extension +# that does not exist on DynamoDB, so most tests in this file are skipped +# when running against DynamoDB (a few tests that don't use the "vs" fixture +# can still run on DynamoDB). + +import pytest +import time +import decimal +from decimal import Decimal +from contextlib import contextmanager +from functools import cache + +from botocore.exceptions import ClientError +import boto3.dynamodb.types + +from .util import random_string, new_test_table, unique_table_name, scylla_config_read, scylla_config_write, client_no_transform, is_aws + +# Monkey-patch the boto3 library to stop doing its own error-checking on +# numbers. 
This works around a bug https://github.com/boto/boto3/issues/2500 +# of incorrect checking of responses, and we also need to get boto3 to not do +# its own error checking of requests, to allow us to check the server's +# handling of such errors. +# This is needed at least for test_numeric_list_precision_range(). +boto3.dynamodb.types.DYNAMODB_CONTEXT = decimal.Context(prec=100) + +# We want to be able to run these tests using an unmodified boto3 library - +# which doesn't understand the new parameters that Alternator added to +# CreateTable, Query, and so on, and moreover will strip unexpected fields +# in Alternator's responses. +# So the following fixture "vs" is a DynamoDB API connection, similar to our +# usual "dynamodb" fixture, but modified to allow our new vector-search +# parameters in the requests and responses. +# +# Users can use exactly the same code to get vector search support in boto3, +# but the more "official" way would be to modify botocore's JSON configuration +# file, botocore/data/dynamodb/2012-08-10/service-2.json. +@pytest.fixture(scope="module") +def vs(new_dynamodb_session, dynamodb): + if is_aws(dynamodb): + pytest.skip('Scylla-only: vector search extensions not available on DynamoDB') + resource = new_dynamodb_session() + client = resource.meta.client + # Patch the client to support the new APIs: + # All the new parameter "shapes" that we will use below for the + # new parameters of the different operations: + new_shapes = { + # For CreateTable (and also DescribeTable's output) + 'VectorIndexes': { + 'type': 'list', + 'member': {'shape': 'VectorIndex'}, + }, + 'VectorIndex': { + 'type': 'structure', + 'members': { + 'IndexName': {'shape': 'String'}, + 'VectorAttribute': {'shape': 'VectorAttribute'}, + 'Projection': {'shape': 'Projection'}, + # The following two fields are only returned in DescribeTable's + # output, not accepted in CreateTable's input. 
+ 'IndexStatus': {'shape': 'String'}, + 'Backfilling': {'shape': 'BooleanObject'}, + }, + 'required': ['IndexName', 'VectorAttribute'], + }, + 'VectorAttribute': { + 'type': 'structure', + 'members': { + 'AttributeName': {'shape': 'String'}, + 'Dimensions': {'shape': 'Integer'}, + }, + 'required': ['AttributeName', 'Dimensions'], + }, + # For UpdateTable: + 'VectorIndexUpdates': { + 'type': 'list', + 'member': {'shape': 'VectorIndexUpdate'}, + }, + 'VectorIndexUpdate': { + 'type': 'structure', + 'members': { + 'Create': {'shape': 'CreateVectorIndexAction'}, + 'Delete': {'shape': 'DeleteVectorIndexAction'}, + } + }, + 'CreateVectorIndexAction': { + 'type': 'structure', + 'members': { + 'IndexName': {'shape': 'String'}, + 'VectorAttribute': {'shape': 'VectorAttribute'}, + 'Projection': {'shape': 'Projection'}, + }, + 'required': ['IndexName', 'VectorAttribute'], + }, + 'DeleteVectorIndexAction': { + 'type': 'structure', + 'members': { + 'IndexName': {'shape': 'String'}, + }, + 'required': ['IndexName'], + }, + # For Query: + 'VectorSearch': { + 'type': 'structure', + 'members': { + 'QueryVector': {'shape': 'AttributeValue'}, + }, + 'required': ['QueryVector'], + }, + } + # Register the new shapes: + service_model = client.meta.service_model + shape_resolver = service_model._shape_resolver + for shape_name, shape_def in new_shapes.items(): + shape_resolver._shape_map[shape_name] = shape_def + # Evict any cached shapes for these names + shape_resolver._shape_cache.pop(shape_name, None) + + # Add a VectorIndexes parameter to CreateTable + create_table_op = service_model.operation_model('CreateTable') + input_shape = create_table_op.input_shape + input_shape._shape_model['members']['VectorIndexes'] = { + 'shape': 'VectorIndexes' + } + input_shape._cache.pop('members', None) + + # Add VectorIndexUpdates parameter to UpdateTable + update_table_op = service_model.operation_model('UpdateTable') + input_shape = update_table_op.input_shape + 
input_shape._shape_model['members']['VectorIndexUpdates'] = { + 'shape': 'VectorIndexUpdates' + } + input_shape._cache.pop('members', None) + + # Add a VectorSearch parameter to Query + query_op = service_model.operation_model('Query') + input_shape = query_op.input_shape + input_shape._shape_model['members']['VectorSearch'] = { + 'shape': 'VectorSearch' + } + input_shape._cache.pop('members', None) + + # Add a VectorIndexes field to "TableDescription", the shape returned + # by DescribeTable and also CreateTable + output_shape = shape_resolver.get_shape_by_name('TableDescription') + output_shape._shape_model['members']['VectorIndexes'] = { + 'shape': 'VectorIndexes' + } + output_shape._cache.pop('members', None) + shape_resolver._shape_cache.pop('TableDescription', None) + + yield resource + +# A simple test for the vector type. In vector search, a vector is simply +# an array of known size that contains only numbers. In the DynamoDB API, +# there is no special "vector" type, it's just an regular "list" type, +# and the indexing code may later require that it contain only numbers or +# have a specific length. When this test was written, this vector is stored +# inefficiently as a JSON string with ASCII representation of numbers, but +# in the future, we may decide to recognize such numeric-only lists and +# store them on disk in an optimized way - and still this test will need +# to continue passing. +def test_vector_value(dynamodb, test_table_s): + p = random_string() + v = [Decimal("0"), Decimal("1.2"), Decimal("-2.3"), Decimal("1.2e10")] + test_table_s.put_item(Item={'p': p, 'v': v}) + assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item']['v'] == v + +# Even if we will have an optimized storage format for a numeric-only list, +# we will still need to use some form of "decimal" type (variable precision +# based on decimal digits) to support DynamoDB's full numeric precision and +# range (38 decimal digits, exponent up to 125) even in a list. 
This test +# confirms that Alternator indeed allows that full precision and range +# (which is different from any hardware floating-point type) inside lists. +# See similar tests but for a single number in test_number.py. +def test_numeric_list_precision_range(test_table_s): + p = random_string() + v = [Decimal("3.1415926535897932384626433832795028841"), + Decimal("9.99999999e125")] + test_table_s.put_item(Item={'p': p, 'v': v}) + assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item']['v'] == v + +# Test CreateTable creating a new table with a basic vector index. This test +# doesn't check that the vector index actually works - we'll do this in +# separate tests below. It just tests that the new Alternator-only +# CreateTable parameter "VectorIndexes" isn't rejected or otherwise fails. +# This test also doesn't cover all the different parameters inside +# VectorIndexes. +def test_createtable_vectorindexes(vs): + with new_test_table(vs, + KeySchema=[ { 'AttributeName': 'p', 'KeyType': 'HASH' }], + AttributeDefinitions=[{ 'AttributeName': 'p', 'AttributeType': 'S' }], + VectorIndexes=[ + { 'IndexName': 'hello', + 'VectorAttribute': {'AttributeName': 'v', 'Dimensions': 4} + }]) as table: + pass + +# Test that in CreateTable's VectorIndexes, an IndexName and a VectorAttribute +# are required. Inside the VectorAttribute, an AttributeName and Dimensions +# are required. With any of those fields missing we get a ValidationException. +def test_createtable_vectorindexes_missing_fields(vs): + # Note: in new_dynamodb_session in conftest.py, we used + # parameter_validation=False by default, so boto3 doesn't do the + # validation of missing parameters for us, which is good, because + # it allows us to send requests with missing fields and see the server + # catch that error. 
+    for bad in bad_vector_indexes: + with pytest.raises(ClientError, match='ValidationException'): + with new_test_table(vs, + KeySchema=[ { 'AttributeName': 'p', 'KeyType': 'HASH' }], + AttributeDefinitions=[{ 'AttributeName': 'p', 'AttributeType': 'S' }], + VectorIndexes=[bad]) as table: + pass + +bad_vector_indexes = [ + # everything missing: + {}, + # VectorAttribute missing: + {'IndexName': 'hello'}, + # IndexName missing: + {'VectorAttribute': {'AttributeName': 'v', 'Dimensions': 4}}, + # VectorAttribute missing parts: + {'IndexName': 'hello', 'VectorAttribute': {'Dimensions': 4}}, + {'IndexName': 'hello', 'VectorAttribute': {'AttributeName': 'v'}}, +] + +# Check that we are not allowed to create two VectorIndexes with the same +# name. +def test_createtable_vectorindexes_same_name(vs): + with pytest.raises(ClientError, match='ValidationException.*Duplicate.*hello'): + with new_test_table(vs, + KeySchema=[ { 'AttributeName': 'p', 'KeyType': 'HASH' }], + AttributeDefinitions=[{ 'AttributeName': 'p', 'AttributeType': 'S' }], + VectorIndexes=[ + { 'IndexName': 'hello', + 'VectorAttribute': {'AttributeName': 'v', 'Dimensions': 4} + }, + { 'IndexName': 'hello', + 'VectorAttribute': {'AttributeName': 'x', 'Dimensions': 7} + } + ]) as table: + pass + +# Check that we are not allowed to create a VectorIndex with the same name as +# the name of another type of index - GSI or an LSI. 
+def test_createtable_vectorindexes_same_name_gsi(vs): + with pytest.raises(ClientError, match='ValidationException.*Duplicate.*hello'): + with new_test_table(vs, + KeySchema=[ { 'AttributeName': 'p', 'KeyType': 'HASH' }], + AttributeDefinitions=[{ 'AttributeName': 'p', 'AttributeType': 'S' }], + GlobalSecondaryIndexes=[ + { 'IndexName': 'hello', + 'KeySchema': [{ 'AttributeName': 'p', 'KeyType': 'HASH' }], + 'Projection': { 'ProjectionType': 'ALL' } + }], + VectorIndexes=[ + { 'IndexName': 'hello', + 'VectorAttribute': {'AttributeName': 'x', 'Dimensions': 7} + }] + ) as table: + pass + +def test_createtable_vectorindexes_same_name_lsi(vs): + with pytest.raises(ClientError, match='ValidationException.*Duplicate.*hello'): + with new_test_table(vs, + KeySchema=[ { 'AttributeName': 'p', 'KeyType': 'HASH' }, + { 'AttributeName': 'c', 'KeyType': 'RANGE' }], + AttributeDefinitions=[{ 'AttributeName': 'p', 'AttributeType': 'S' }, + { 'AttributeName': 'c', 'AttributeType': 'S' }, + { 'AttributeName': 'x', 'AttributeType': 'S' }], + LocalSecondaryIndexes=[ + { 'IndexName': 'hello', + 'KeySchema': [{ 'AttributeName': 'p', 'KeyType': 'HASH' }, + { 'AttributeName': 'x', 'KeyType': 'RANGE' }], + 'Projection': { 'ProjectionType': 'ALL' } + }], + VectorIndexes=[ + { 'IndexName': 'hello', + 'VectorAttribute': {'AttributeName': 'x', 'Dimensions': 7} + }] + ) as table: + pass + +# Test that if a table is created to use vnodes instead of the modern default +# of tablets, then it can't use a vector index because vector index is +# officially supported only with tablets. +# When we finally remove vnode support from the code, this test should be +# deleted. 
+def test_createtable_vectorindexes_vnodes_forbidden(vs): + with pytest.raises(ClientError, match='ValidationException.*vnodes'): + with new_test_table(vs, + # set system:initial_tablets to a non-number to disable tablets: + Tags=[{'Key': 'system:initial_tablets', 'Value': 'none'}], + KeySchema=[ { 'AttributeName': 'p', 'KeyType': 'HASH' }], + AttributeDefinitions=[{ 'AttributeName': 'p', 'AttributeType': 'S' }], + VectorIndexes=[ + { 'IndexName': 'hello', + 'VectorAttribute': {'AttributeName': 'x', 'Dimensions': 7} + }] + ) as table: + pass + +# Verify that a vector index's IndexName follows the same naming rules as +# table names - name length from 3 up to 192 (max_table_name_length) and +# match the regex [a-zA-Z0-9._-]+. +# Note that these rules are similar, but not identical, to the rules for +# IndexName for GSI/LSI (tested in test_gsi.py and test_lsi.py) - there, +# Alternator doesn't put the limit on the length of the GSI/LSI's IndexName, +# but puts a limit (222) on the sum of the table's name and GSI/LSI's name. +def test_createtable_vectorindexes_indexname_rules(vs): + # Forbidden names: shorter than 3 characters, longer than 192 + # characters, or containing characters outside [a-zA-Z0-9._-]. + # These names should be rejected + for bad_name in ['xy', 'x'*193, 'hello$world', 'hello world']: + with pytest.raises(ClientError, match='ValidationException.*IndexName'): + with new_test_table(vs, + KeySchema=[ { 'AttributeName': 'p', 'KeyType': 'HASH' }], + AttributeDefinitions=[{ 'AttributeName': 'p', 'AttributeType': 'S' }], + VectorIndexes=[ + { 'IndexName': bad_name, + 'VectorAttribute': {'AttributeName': 'v', 'Dimensions': 74 } + }] + ) as table: + pass + # Allowed names: exactly 3 characters, 192 characters, and using + # all characters from [a-zA-Z0-9._-]. + # This test is slightly slower than usual, because three tables and + # indexes will be successfully created and then immediately deleted. 
+ for good_name in ['xyz', 'x'*192, + 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789._-']: + with new_test_table(vs, + KeySchema=[ { 'AttributeName': 'p', 'KeyType': 'HASH' }], + AttributeDefinitions=[{ 'AttributeName': 'p', 'AttributeType': 'S' }], + VectorIndexes=[ + { 'IndexName': good_name, + 'VectorAttribute': {'AttributeName': 'v', 'Dimensions': 74 } + }] + ) as table: + pass + +# Check that the "Dimensions" property in CreateTable's VectorIndexes's +# VectorAttribute must be an integer between 1 and 16000 (MAX_VECTOR_DIMENSION +# in the code). +MAX_VECTOR_DIMENSION = 16000 +def test_createtable_vectorindexes_dimensions_rules(vs): + # Forbidden dimensions: non-integer, negative, zero, and above + # MAX_VECTOR_DIMENSION. These dimensions should be rejected. + for bad_dimensions in ['hello', 1.2, -17, 0, MAX_VECTOR_DIMENSION+1]: + with pytest.raises(ClientError, match='ValidationException.*Dimensions'): + with new_test_table(vs, + KeySchema=[ { 'AttributeName': 'p', 'KeyType': 'HASH' }], + AttributeDefinitions=[{ 'AttributeName': 'p', 'AttributeType': 'S' }], + VectorIndexes=[ + { 'IndexName': 'vector_index', + 'VectorAttribute': {'AttributeName': 'v', 'Dimensions': bad_dimensions } + }] + ) as table: + pass + # Allowed dimensions: 1, MAX_VECTOR_DIMENSION: + for good_dimensions in [1, MAX_VECTOR_DIMENSION]: + with new_test_table(vs, + KeySchema=[ { 'AttributeName': 'p', 'KeyType': 'HASH' }], + AttributeDefinitions=[{ 'AttributeName': 'p', 'AttributeType': 'S' }], + VectorIndexes=[ + { 'IndexName': 'vector_index', + 'VectorAttribute': {'AttributeName': 'v', 'Dimensions': good_dimensions } + }] + ) as table: + pass + +# Check that the "AttributeName" property in CreateTable's VectorIndexes's +# VectorAttribute must not be a key column (of the base table or any of its +# GSIs or LSIs). This is because key columns have a declared type, which +# can't be a vector (a list), so making such a column the key of a vector +# index makes no sense. 
+def test_createtable_vectorindexes_attributename_key(vs): + # Forbidden AttributeName: base-table keys (hash and range), GSI keys, + # LSI keys: + for bad_attr in ['p', 'c', 'x', 'y', 'z']: + with pytest.raises(ClientError, match='ValidationException.*AttributeName'): + with new_test_table(vs, + KeySchema=[ { 'AttributeName': 'p', 'KeyType': 'HASH' }, + { 'AttributeName': 'c', 'KeyType': 'RANGE' }], + AttributeDefinitions=[{ 'AttributeName': 'p', 'AttributeType': 'S' }, + { 'AttributeName': 'c', 'AttributeType': 'S' }, + { 'AttributeName': 'x', 'AttributeType': 'S' }, + { 'AttributeName': 'y', 'AttributeType': 'S' }, + { 'AttributeName': 'z', 'AttributeType': 'S' }, + ], + VectorIndexes=[ + { 'IndexName': 'vector_index', + 'VectorAttribute': {'AttributeName': bad_attr, 'Dimensions': 42 } + }], + GlobalSecondaryIndexes=[ + { 'IndexName': 'gsi', + 'KeySchema': [{ 'AttributeName': 'x', 'KeyType': 'HASH' }, + { 'AttributeName': 'y', 'KeyType': 'RANGE' }], + 'Projection': { 'ProjectionType': 'ALL' } + }], + LocalSecondaryIndexes=[ + { 'IndexName': 'lsi', + 'KeySchema': [{ 'AttributeName': 'p', 'KeyType': 'HASH' }, + { 'AttributeName': 'z', 'KeyType': 'RANGE' }], + 'Projection': { 'ProjectionType': 'ALL' } + }], + ) as table: + pass + +# Check that the "AttributeName" property in CreateTable's VectorIndexes's +# VectorAttribute is an attribute name, limited exactly like ordinary (non- +# key) attributes to 65535 (DYNAMODB_NONKEY_ATTR_NAME_SIZE_MAX) bytes. +# Note that there is no limitation on which characters are allowed, so we +# don't check that. 
+def test_createtable_vectorindexes_attributename_len(vs): + # Forbidden AttributeName: empty string, string over 65535 + for bad_attr in ['', 'x'*65536]: + with pytest.raises(ClientError, match='ValidationException.*AttributeName'): + with new_test_table(vs, + KeySchema=[ { 'AttributeName': 'p', 'KeyType': 'HASH' }], + AttributeDefinitions=[{ 'AttributeName': 'p', 'AttributeType': 'S' }], + VectorIndexes=[ + { 'IndexName': 'vector_index', + 'VectorAttribute': {'AttributeName': bad_attr, 'Dimensions': 42 } + }] + ) as table: + pass + +# Test that we can add two different vector indexes on the same table +# in CreateTable, but they must be on different attributes. +def test_createtable_vectorindexes_multiple(vs): + # Can create two vector indexes on two different attributes: + with new_test_table(vs, + KeySchema=[ { 'AttributeName': 'p', 'KeyType': 'HASH' }], + AttributeDefinitions=[{ 'AttributeName': 'p', 'AttributeType': 'S' }], + VectorIndexes=[ + { 'IndexName': 'ind1', + 'VectorAttribute': {'AttributeName': 'x', 'Dimensions': 42 } + }, + { 'IndexName': 'ind2', + 'VectorAttribute': {'AttributeName': 'y', 'Dimensions': 17 } + }, + ]) as table: + pass + # But can't create two vector indexes on the same attribute. + # Why don't we allow that? If the two indexes were to request different + # "dimensions", Alternator would not know which vector length to enforce + # when inserting values. But for simplicity (and avoiding wasted space + # and work) we decided not to allow two indexes on the same attribute, + # in any case. 
+ with pytest.raises(ClientError, match='ValidationException.*Duplicate'): + with new_test_table(vs, + KeySchema=[ { 'AttributeName': 'p', 'KeyType': 'HASH' }], + AttributeDefinitions=[{ 'AttributeName': 'p', 'AttributeType': 'S' }], + VectorIndexes=[ + { 'IndexName': 'ind1', + 'VectorAttribute': {'AttributeName': 'x', 'Dimensions': 42 } + }, + { 'IndexName': 'ind2', + 'VectorAttribute': {'AttributeName': 'x', 'Dimensions': 42 } + }, + ]) as table: + pass + +# Test that vector indexes are correctly listed in DescribeTable: +def test_describetable_vectorindexes(vs): + with new_test_table(vs, + KeySchema=[ { 'AttributeName': 'p', 'KeyType': 'HASH' }], + AttributeDefinitions=[{ 'AttributeName': 'p', 'AttributeType': 'S' }], + VectorIndexes=[ + { 'IndexName': 'ind1', + 'VectorAttribute': {'AttributeName': 'x', 'Dimensions': 42 } + }, + { 'IndexName': 'ind2', + 'VectorAttribute': {'AttributeName': 'y', 'Dimensions': 17 } + }, + ]) as table: + desc = table.meta.client.describe_table(TableName=table.name) + assert 'Table' in desc + assert 'VectorIndexes' in desc['Table'] + vector_indexes = desc['Table']['VectorIndexes'] + assert len(vector_indexes) == 2 + for vec in vector_indexes: + assert vec['IndexName'] == 'ind1' or vec['IndexName'] == 'ind2' + if vec['IndexName'] == 'ind1': + assert vec['VectorAttribute'] == {'AttributeName': 'x', 'Dimensions': 42} + else: # vec['IndexName'] == 'ind2': + assert vec['VectorAttribute'] == {'AttributeName': 'y', 'Dimensions': 17} + assert vec['Projection'] == {'ProjectionType': 'KEYS_ONLY'} + +# Test that like DescribeTable, CreateTable also returns the VectorIndexes +# definition its response +def test_createtable_vectorindexes_returned(vs): + # To look at the response of CreateTable, we need to use the "client" + # interface, not the usual higher-level "resource" interface that we + # usually use in tests - because that doesn't return the actual response. 
+ client = vs.meta.client + table_name = unique_table_name() + resp = client.create_table( + TableName=table_name, + BillingMode='PAY_PER_REQUEST', + KeySchema=[{ 'AttributeName': 'p', 'KeyType': 'HASH' }], + AttributeDefinitions=[{ 'AttributeName': 'p', 'AttributeType': 'S' }], + VectorIndexes=[{ + 'IndexName': 'ind', + 'VectorAttribute': {'AttributeName': 'x', 'Dimensions': 42 }, + }]) + try: + assert 'TableDescription' in resp + assert 'VectorIndexes' in resp['TableDescription'] + vector_indexes = resp['TableDescription']['VectorIndexes'] + assert len(vector_indexes) == 1 + vec = vector_indexes[0] + assert vec['IndexName'] == 'ind' + assert vec['VectorAttribute'] == {'AttributeName': 'x', 'Dimensions': 42} + # Note that today, CreateTable just echoes back the parameters it got + # doesn't add default parameters, so we don't expect to see, for + # example, a "Projection" field in the response because we didn't send + # one. We may change this decision in the future. + #assert vec['Projection'] == {'ProjectionType': 'KEYS_ONLY'} + finally: + # In principle, we need to wait for the table to become ACTIVE before + # deleting it. But this test only runs on Alternator, where + # CreateTable is synchronous anyway, so we don't bother to add a + # waiting loop. 
+ client.delete_table(TableName=table_name) + +# Basic test for UpdateTable successfully adding a vector index +def test_updatetable_vectorindex_create(vs): + with new_test_table(vs, + KeySchema=[ { 'AttributeName': 'p', 'KeyType': 'HASH' }], + AttributeDefinitions=[{ 'AttributeName': 'p', 'AttributeType': 'S' }]) as table: + # There are no vector indexes yet: + desc = table.meta.client.describe_table(TableName=table.name) + assert 'Table' in desc + assert 'VectorIndexes' not in desc['Table'] + # Add a vector index with UpdateTable + table.update(VectorIndexUpdates=[{'Create': + { 'IndexName': 'hello', + 'VectorAttribute': {'AttributeName': 'x', 'Dimensions': 17 } + }}]) + # Now describe_table should see the new vector index: + desc = table.meta.client.describe_table(TableName=table.name) + assert 'Table' in desc + assert 'VectorIndexes' in desc['Table'] + vector_indexes = desc['Table']['VectorIndexes'] + assert len(vector_indexes) == 1 + vec = vector_indexes[0] + assert vec['IndexName'] == 'hello' + assert vec['VectorAttribute'] == {'AttributeName': 'x', 'Dimensions': 17} + +# Basic test for UpdateTable successfully removing a vector index +def test_updatetable_vectorindex_delete(vs): + with new_test_table(vs, + KeySchema=[ { 'AttributeName': 'p', 'KeyType': 'HASH' }], + AttributeDefinitions=[{ 'AttributeName': 'p', 'AttributeType': 'S' }], + VectorIndexes=[{ + 'IndexName': 'hello', + 'VectorAttribute': {'AttributeName': 'x', 'Dimensions': 42 } + }]) as table: + # There should be one vector index now: + desc = table.meta.client.describe_table(TableName=table.name) + assert 'Table' in desc + assert 'VectorIndexes' in desc['Table'] + assert len(desc['Table']['VectorIndexes']) == 1 + # Delete the vector index with UpdateTable + table.update(VectorIndexUpdates=[ + {'Delete': { 'IndexName': 'hello' }}]) + # Now describe_table should see no vector index: + desc = table.meta.client.describe_table(TableName=table.name) + assert 'Table' in desc + assert 'VectorIndexes' not 
in desc['Table'] + +# UpdateTable can't remove a vector index that doesn't exist. We get a +# ResourceNotFoundException. +def test_updatetable_vectorindex_delete_nonexistent(vs): + with pytest.raises(ClientError, match='ResourceNotFoundException'): + with new_test_table(vs, + KeySchema=[ { 'AttributeName': 'p', 'KeyType': 'HASH' }], + AttributeDefinitions=[{ 'AttributeName': 'p', 'AttributeType': 'S' }]) as table: + table.update(VectorIndexUpdates=[ + {'Delete': { 'IndexName': 'nonexistent' }}]) + +# Test that in UpdateTable's Create operation, a IndexName and VectorAttribute +# are required. Inside the VectorAttribute, a AttributeName and Dimensions +# are required. With any of those fields missing we get a ValidationException. +def test_updatetable_vectorindex_missing_fields(vs): + with new_test_table(vs, + KeySchema=[ { 'AttributeName': 'p', 'KeyType': 'HASH' }], + AttributeDefinitions=[{ 'AttributeName': 'p', 'AttributeType': 'S' }]) as table: + # Note: in new_dynamodb_session in conftest.py, we used + # parameter_validation=False by default, so boto3 doesn't do the + # validation of missing parameters for us, which is good, because + # it allows us to send requests with missing fields and see the server + # catch that error. + for bad in bad_vector_indexes: + with pytest.raises(ClientError, match='ValidationException.*VectorIndexUpdates'): + table.update(VectorIndexUpdates=[{'Create': bad}]) + +# Test that when adding a vector index with UpdateTable, +# 1. Its name cannot be the same as an existing vector index or GSI or LSI +# 2. Its attribute cannot be a key column (of base, GSI or LSI) or the +# attribute on an existing vector index +def test_updatetable_vectorindex_taken_name_or_attribute(vs): + # We create a table with vector index, GSI and LSI, so we can check + # all the desired cases on a single table. 
+ with new_test_table(vs, + KeySchema=[ + { 'AttributeName': 'p', 'KeyType': 'HASH' }, + { 'AttributeName': 'c', 'KeyType': 'RANGE' }], + AttributeDefinitions=[ + { 'AttributeName': 'p', 'AttributeType': 'S' }, + { 'AttributeName': 'c', 'AttributeType': 'S' }, + { 'AttributeName': 'x', 'AttributeType': 'S' }, + { 'AttributeName': 'y', 'AttributeType': 'S' }, + { 'AttributeName': 'z', 'AttributeType': 'S' }], + VectorIndexes=[ + { 'IndexName': 'vec', + 'VectorAttribute': {'AttributeName': 'v', 'Dimensions': 13 }}], + GlobalSecondaryIndexes=[ + { 'IndexName': 'gsi', + 'KeySchema': [ + { 'AttributeName': 'x', 'KeyType': 'HASH' }, + { 'AttributeName': 'y', 'KeyType': 'RANGE' }], + 'Projection': { 'ProjectionType': 'ALL' }}], + LocalSecondaryIndexes=[ + { 'IndexName': 'lsi', + 'KeySchema': [ + { 'AttributeName': 'p', 'KeyType': 'HASH' }, + { 'AttributeName': 'z', 'KeyType': 'RANGE' }], + 'Projection': { 'ProjectionType': 'ALL' } + }], + ) as table: + # IndexName already in use: + for bad_name in ['vec', 'gsi', 'lsi']: + with pytest.raises(ClientError, match='ValidationException.*already exists'): + table.update(VectorIndexUpdates=[{'Create': + { 'IndexName': bad_name, + 'VectorAttribute': {'AttributeName': 'xyz', 'Dimensions': 17 } + }}]) + # AttributeName already in use: + for bad_attr in ['p', 'c', 'x', 'y', 'z', 'v']: + with pytest.raises(ClientError, match='ValidationException.*AttributeName'): + table.update(VectorIndexUpdates=[{'Create': + { 'IndexName': 'newind', + 'VectorAttribute': {'AttributeName': bad_attr, 'Dimensions': 17 } + }}]) + +# In test_updatetable_vectorindex_taken_name_or_attribute() above we tested +# that we can't add a vector index with the same name as an existing GSI or +# LSI. Here we check that the reverse also holds - we can't add a GSI with +# the same name as an existing vector index. 
+def test_updatetable_gsi_same_name_as_vector_index(vs): + with new_test_table(vs, + KeySchema=[{'AttributeName': 'p', 'KeyType': 'HASH'}], + AttributeDefinitions=[{'AttributeName': 'p', 'AttributeType': 'S'}], + VectorIndexes=[ + {'IndexName': 'vec', + 'VectorAttribute': {'AttributeName': 'v', 'Dimensions': 3}} + ]) as table: + with pytest.raises(ClientError, match='ValidationException.*already exists'): + table.meta.client.update_table( + TableName=table.name, + AttributeDefinitions=[{'AttributeName': 'p', 'AttributeType': 'S'}], + GlobalSecondaryIndexUpdates=[{'Create': { + 'IndexName': 'vec', + 'KeySchema': [{'AttributeName': 'p', 'KeyType': 'HASH'}], + 'Projection': {'ProjectionType': 'ALL'} + }}]) + +# Similarly, we can't add a GSI on an attribute that's already used as a +# vector index attribute. +def test_updatetable_gsi_key_is_vector_attribute(vs): + with new_test_table(vs, + KeySchema=[{'AttributeName': 'p', 'KeyType': 'HASH'}], + AttributeDefinitions=[{'AttributeName': 'p', 'AttributeType': 'S'}], + VectorIndexes=[ + {'IndexName': 'vec', + 'VectorAttribute': {'AttributeName': 'v', 'Dimensions': 3}} + ]) as table: + # The attribute 'v' is already a vector index target - it cannot + # become the hash key of a new GSI. + with pytest.raises(ClientError, match='ValidationException.*AttributeDefinitions'): + table.meta.client.update_table( + TableName=table.name, + AttributeDefinitions=[{'AttributeName': 'v', 'AttributeType': 'S'}], + GlobalSecondaryIndexUpdates=[{'Create': { + 'IndexName': 'gsi', + 'KeySchema': [{'AttributeName': 'v', 'KeyType': 'HASH'}], + 'Projection': {'ProjectionType': 'ALL'} + }}]) + +# Test that if a table is created to use vnodes instead of the modern default +# of tablets, then one can't add to it a vector index because vector index is +# officially supported only with tablets. This is the UpdateTable version +# of a similar test for CreateTable above. 
+# When we finally remove vnode support from the code, this test should be +# deleted. +def test_updatetable_vectorindex_vnodes_forbidden(vs): + with new_test_table(vs, + # set system:initial_tablets to a non-number to disable tablets: + Tags=[{'Key': 'system:initial_tablets', 'Value': 'none'}], + KeySchema=[ { 'AttributeName': 'p', 'KeyType': 'HASH' }], + AttributeDefinitions=[{ 'AttributeName': 'p', 'AttributeType': 'S' }]) as table: + with pytest.raises(ClientError, match='ValidationException.*vnodes'): + table.update(VectorIndexUpdates=[{'Create': + { 'IndexName': 'ind', + 'VectorAttribute': {'AttributeName': 'x', 'Dimensions': 17 } + }}]) + +# Similar to an above test for the CreateTable case, verify that also when +# UpdateTable creates a new vector index, a vector index's IndexName must +# have length from 3 up to 192 (max_table_name_length) and match the regex +# [a-zA-Z0-9._-]+. +def test_updatetable_vectorindex_indexname_bad(vs): + with new_test_table(vs, + KeySchema=[ { 'AttributeName': 'p', 'KeyType': 'HASH' }], + AttributeDefinitions=[{ 'AttributeName': 'p', 'AttributeType': 'S' }]) as table: + # Forbidden names: shorter than 3 characters, longer than 192 + # characters, or containing characters outside [a-zA-Z0-9._-]. 
+    # These names should be rejected + for bad_name in ['xy', 'x'*193, 'hello$world', 'hello world']: + with pytest.raises(ClientError, match='ValidationException.*IndexName'): + table.update(VectorIndexUpdates=[{'Create': + { 'IndexName': bad_name, + 'VectorAttribute': {'AttributeName': 'x', 'Dimensions': 17 } + }}]) + +# Similar to an above test for the CreateTable case, verify that also when +# UpdateTable creates a new vector index, a vector index's Dimensions must +# be an integer between 1 and MAX_VECTOR_DIMENSION +def test_updatetable_vectorindex_dimensions_bad(vs): + with new_test_table(vs, + KeySchema=[ { 'AttributeName': 'p', 'KeyType': 'HASH' }], + AttributeDefinitions=[{ 'AttributeName': 'p', 'AttributeType': 'S' }]) as table: + for bad_dimensions in ['hello', 1.2, -17, 0, MAX_VECTOR_DIMENSION+1]: + with pytest.raises(ClientError, match='ValidationException.*Dimensions'): + table.update(VectorIndexUpdates=[{'Create': + { 'IndexName': 'ind', + 'VectorAttribute': {'AttributeName': 'x', 'Dimensions': bad_dimensions } + }}]) + +# Similar to an above test for the CreateTable case, verify that also when +# UpdateTable creates a new vector index, a vector index's attribute must +# have between 1 and 65535 bytes. +# Note that we also checked above that it can't be one of the existing keys +# (of base table, GSI or LSI), or an already indexed vector column. Here we +# only test the allowed length limits. 
+def test_updatetable_vectorindex_attributename_bad_len(vs): + with new_test_table(vs, + KeySchema=[ { 'AttributeName': 'p', 'KeyType': 'HASH' }], + AttributeDefinitions=[{ 'AttributeName': 'p', 'AttributeType': 'S' }]) as table: + for bad_attr in ['', 'x'*65536]: + with pytest.raises(ClientError, match='ValidationException.*AttributeName'): + table.update(VectorIndexUpdates=[{'Create': + { 'IndexName': 'ind', + 'VectorAttribute': {'AttributeName': bad_attr, 'Dimensions': 17 } + }}]) + +# DynamoDB currently limits UpdateTable to only one GSI operation (Create +# or Delete), so we placed the same limit on VectorIndexUpdates - even though +# it's an array, it must have exactly one element. Let's validate this +# limitation is enforced - but if one day we decide to lift it, we can +# and delete this test. +def test_updatetable_vectorindex_just_one_update(vs): + with new_test_table(vs, + KeySchema=[ { 'AttributeName': 'p', 'KeyType': 'HASH' }], + AttributeDefinitions=[{ 'AttributeName': 'p', 'AttributeType': 'S' }]) as table: + # Zero operations aren't allowed - it's treated just like a missing + # VectorIndexUpdates, and therefore a do-nothing UpdateTable which + # is not allowed. + with pytest.raises(ClientError, match='ValidationException.*requires one'): + table.update(VectorIndexUpdates=[]) + # Two "Create" aren't allowed. + # Again following DynamoDB's lead on GSI, interestingly in this case + # the error is LimitExceededException, not ValidationException. + with pytest.raises(ClientError, match='LimitExceededException.*allows one'): + table.update(VectorIndexUpdates=[ + {'Create': {'IndexName': 'ind1', 'VectorAttribute': {'AttributeName': 'x', 'Dimensions': 17 }}}, + {'Create': {'IndexName': 'ind2', 'VectorAttribute': {'AttributeName': 'y', 'Dimensions': 17 }}}]) + # Two "Delete" aren't allowed (they are rejected even before noticing + # that the indexes we ask to delete don't exist). 
+ with pytest.raises(ClientError, match='LimitExceededException.*allows one'): + table.update(VectorIndexUpdates=[ + {'Delete': {'IndexName': 'ind1'}}, + {'Delete': {'IndexName': 'ind2'}}]) + # Also one "Delete" and one "Create" isn't allowed + with pytest.raises(ClientError, match='LimitExceededException.*allows one'): + table.update(VectorIndexUpdates=[ + {'Create': {'IndexName': 'ind1', 'VectorAttribute': {'AttributeName': 'x', 'Dimensions': 17 }}}, + {'Delete': {'IndexName': 'ind2'}}]) + +# Also, it's not allowed to have in one UpdateTable request both a +# VectorIndexUpdates and a GlobalSecondaryIndexUpdates. There is no real +# reason why we can't support this, but since we already don't allow adding +# (or deleting) more than one GSI or more than one vector index in the same +# operation, it makes sense to disallow having both. If one day we decide to +# allow both in the same request, we can delete this test. +def test_updatetable_vector_and_gsi_same_request(vs): + with new_test_table(vs, + KeySchema=[{'AttributeName': 'p', 'KeyType': 'HASH'}], + AttributeDefinitions=[{'AttributeName': 'p', 'AttributeType': 'S'}]) as table: + with pytest.raises(ClientError, match='LimitExceededException'): + table.meta.client.update_table( + TableName=table.name, + AttributeDefinitions=[{'AttributeName': 'p', 'AttributeType': 'S'}], + VectorIndexUpdates=[{'Create': { + 'IndexName': 'vec', + 'VectorAttribute': {'AttributeName': 'v', 'Dimensions': 3} + }}], + GlobalSecondaryIndexUpdates=[{'Create': { + 'IndexName': 'gsi', + 'KeySchema': [{'AttributeName': 'p', 'KeyType': 'HASH'}], + 'Projection': {'ProjectionType': 'ALL'} + }}]) + +# Test that PutItem still works as expected on a table with a vector index +# created by CreateTable or UpdateTable. It might not work if we set up CDC +# in a broken way that breaks writes. 
+def test_putitem_vectorindex_createtable(vs): + with new_test_table(vs, + KeySchema=[ { 'AttributeName': 'p', 'KeyType': 'HASH' }], + AttributeDefinitions=[{ 'AttributeName': 'p', 'AttributeType': 'S' }], + VectorIndexes=[ + { 'IndexName': 'vec', + 'VectorAttribute': {'AttributeName': 'v', 'Dimensions': 3 }}] + ) as table: + p = random_string() + item = {'p': p, 'v': [1,2,3]} + table.put_item(Item=item) + assert item == table.get_item(Key={'p': p}, ConsistentRead=True)['Item'] + +def test_putitem_vectorindex_updatetable(vs): + # Create the table without a vector index, and add it later: + with new_test_table(vs, + KeySchema=[ { 'AttributeName': 'p', 'KeyType': 'HASH' }], + AttributeDefinitions=[{ 'AttributeName': 'p', 'AttributeType': 'S' }]) as table: + table.update(VectorIndexUpdates=[ + {'Create': {'IndexName': 'ind', 'VectorAttribute': {'AttributeName': 'v', 'Dimensions': 3 }}}]) + # In general we may need to wait here until the vector index is + # ACTIVE, but currently in Alternator we don't need to wait. + p = random_string() + item = {'p': p, 'v': [1,2,3]} + table.put_item(Item=item) + assert item == table.get_item(Key={'p': p}, ConsistentRead=True)['Item'] + +# Simple test table with a vector index on a 3-dimensional vector column v +# Please note that because this is a shared table, tests that perform +# global queries on it, not filtering to a specific partition, may get +# results from other tests - so such tests will need to create their own +# table instead of using this shared one. +@pytest.fixture(scope="module") +def table_vs(vs): + with new_test_table(vs, + KeySchema=[{'AttributeName': 'p', 'KeyType': 'HASH'}], + AttributeDefinitions=[{'AttributeName': 'p', 'AttributeType': 'S'}], + VectorIndexes=[ + {'IndexName': 'vind', + 'VectorAttribute': {'AttributeName': 'v', 'Dimensions': 3}} + ]) as table: + yield table + +# Test that a Query with a VectorSearch parameter without an IndexName +# is rejected with a ValidationException. 
+def test_query_vectorsearch_missing_indexname(table_vs): + with pytest.raises(ClientError, match='ValidationException.*IndexName'): + table_vs.query(VectorSearch={'QueryVector': [1, 2, 3]}) + +# Test that a Query with a VectorSearch parameter with an IndexName +# which does not refer to a valid vector index is rejected with a +# ValidationException. Note that it doesn't really matter if IndexName +# refers to a garbage name or to a real GSI/LSI - the code just checks +# if it's a known vector index name. +def test_query_vectorsearch_wrong_indexname(table_vs): + with pytest.raises(ClientError, match='ValidationException.*is not a vector index'): + table_vs.query(IndexName='nonexistent', + VectorSearch={'QueryVector': [1, 2, 3]}) + +# Test that a Query on a vector index without a VectorSearch parameter is +# rejected. When VectorSearch isn't specified, the code expects a base table, +# LSI or GSI - which it won't find. But rather than reporting unhelpfully +# that an index by that name doesn't exist, we want to report that this index +# does exist - and is a vector index - so VectorSearch must be specified. +def test_query_vectorindex_no_vectorsearch(table_vs): + with pytest.raises(ClientError, match='ValidationException.*VectorSearch'): + table_vs.query( + IndexName='vind', + KeyConditionExpression='p = :p', + ExpressionAttributeValues={':p': 'x'}, + ) + +# Test that a Query with a VectorSearch parameter that is missing the +# required QueryVector field is rejected with a ValidationException. +def test_query_vectorsearch_missing_queryvector(table_vs): + with pytest.raises(ClientError, match='ValidationException.*QueryVector'): + table_vs.query( + IndexName='vind', + VectorSearch={}, + ) + +# Test that QueryVector must be a list of numbers, automatically (thanks to +# boto3) using the DynamoDB encoding {"L": [{"N": "1"}, ...]}, and must +# have the exact length defined as Dimensions of the vector index - which +# in table_vs is 3. 
+def test_query_vectorsearch_queryvector_bad(table_vs): + # A non-list QueryVector is rejected: + with pytest.raises(ClientError, match='ValidationException.*list of numbers'): + table_vs.query(IndexName='vind', + VectorSearch={'QueryVector': 'not a list'}, + ) + # A list of the right length but with non-numeric elements + # should be rejected: + with pytest.raises(ClientError, match='ValidationException.*only numbers'): + table_vs.query(IndexName='vind', + VectorSearch={'QueryVector': [1, 'b', 3]} + ) + # A numeric list but with the wrong length is rejected: + with pytest.raises(ClientError, match='ValidationException.*length'): + table_vs.query(IndexName='vind', + VectorSearch={'QueryVector': [1, 2]} + ) + with pytest.raises(ClientError, match='ValidationException.*length'): + table_vs.query(IndexName='vind', + VectorSearch={'QueryVector': [1, 2, 3, 4]} + ) + +# Test that a Query with VectorSearch requires a Limit parameter, which +# determines how many nearest neighbors to return. This is somewhat +# different from the usual meaning of "Limit" in a query which is +# optional, and used for pagination of results. Vector search does +# not currently support pagination. +def test_query_vectorsearch_limit_bad(table_vs): + # Limit cannot be missing: + with pytest.raises(ClientError, match='ValidationException.*Limit'): + table_vs.query( + IndexName='vind', + VectorSearch={'QueryVector': [1, 2, 3]}, + ) + # Limit must be a positive integer: + for bad_limit in ['hello', 1.5, 0, -3]: + with pytest.raises(ClientError, match='ValidationException.*Limit'): + table_vs.query( + IndexName='vind', + VectorSearch={'QueryVector': [1, 2, 3]}, + Limit=bad_limit + ) + +# Test that a Query with VectorSearch does not support ConsistentRead=True, +# just like queries on a GSI. 
+def test_query_vectorsearch_consistent_read(table_vs): + with pytest.raises(ClientError, match='ValidationException.*Consistent'): + table_vs.query( + IndexName='vind', + VectorSearch={'QueryVector': [1, 2, 3]}, + Limit=10, + ConsistentRead=True) + +# Test that a Query with VectorSearch does not support pagination via +# ExclusiveStartKey. +def test_query_vectorsearch_exclusive_start_key(table_vs): + with pytest.raises(ClientError, match='ValidationException.*ExclusiveStartKey'): + table_vs.query( + IndexName='vind', + VectorSearch={'QueryVector': [1, 2, 3]}, + Limit=10, + ExclusiveStartKey={'p': 'somekey'}, + ) + +# Test that a Query with VectorSearch does not support ScanIndexForward. +# The ordering of vector search results is determined by vector distance, +# not by the sort key, so ScanIndexForward makes no sense and is rejected. +def test_query_vectorsearch_scan_index_forward(table_vs): + with pytest.raises(ClientError, match='ValidationException.*ScanIndexForward'): + table_vs.query( + IndexName='vind', + VectorSearch={'QueryVector': [1, 2, 3]}, + Limit=10, + ScanIndexForward=True, + ) + with pytest.raises(ClientError, match='ValidationException.*ScanIndexForward'): + table_vs.query( + IndexName='vind', + VectorSearch={'QueryVector': [1, 2, 3]}, + Limit=10, + ScanIndexForward=False, + ) + +# Test that a Query with VectorSearch and an unused element in +# ExpressionAttributeValues is rejected. +def test_query_vectorsearch_unused_expression_attribute_values(table_vs): + with pytest.raises(ClientError, match='ValidationException.*val2'): + table_vs.query( + IndexName='vind', + VectorSearch={'QueryVector': [1, 2, 3]}, + Limit=1, + ExpressionAttributeValues={':val2': 'a'}, + ) + +# Test that a Query with VectorSearch and an unused element in +# ExpressionAttributeNames is rejected. 
+def test_query_vectorsearch_unused_expression_attribute_names(table_vs): + with pytest.raises(ClientError, match='ValidationException.*name2'): + table_vs.query( + IndexName='vind', + VectorSearch={'QueryVector': [1, 2, 3]}, + Limit=1, + ProjectionExpression='#name1', + ExpressionAttributeNames={'#name1': 'x', '#name2': 'y'}, + ) + +# Helper function to check a vector store is configured in Scylla +# with the --vector-store-primary-uri option. This can be done, for +# example, by running test/alternator/run with the option "--vs". +# This function needs some table as a parameter; calling it again +# for the same table will use a cached result. +@cache +def vector_store_configured(table_vs): + # Issue a trial query to detect whether Scylla was started with a vector + # store URI. If we get an error message "Vector Store is disabled", it + # means the vector store is not configured. If we get any other error or + # success - it means the vector store is configured (but might not be + # ready yet - individual tests will use their own retry loops). + try: + table_vs.query(IndexName='vind', + VectorSearch={'QueryVector': [0, 0, 0]}, + Limit=1) + except ClientError as e: + if 'Vector Store is disabled' in e.response['Error']['Message']: + return False + return True + +# Fixture to skip a test if the vector store is not configured. +# It is assumed that if Scylla is configured to use the vector store, then +# the reverse is also true - the vector store is configured to use Scylla, +# so we can check the end-to-end functionality. +@pytest.fixture(scope="module") +def needs_vector_store(table_vs): + if not vector_store_configured(table_vs): + pytest.skip('Vector Store is not configured (run with --vs)') + +# The context manager unconfigured_vector_store() temporarily (for the +# duration of the "with" block) un-configures the vector store in Scylla - +# the vector_store_primary_uri configuration option. 
This allows testing the +# behavior when the vector store is not configured, even if we are testing +# on a setup where it is configured. +@contextmanager +def unconfigured_vector_store(vs): + # As mentioned in issue #28225, we can't write an empty string to the + # configuration due to a bug. But luckily, we can write any garbage which + # isn't a valid URI, and this will be considered unconfigured. + # We also can't restore an empty configuration due to #28225. + # When #28225 is fixed, this entire function can be simplified to just: + # with scylla_config_temporary_string(vs, 'vector_store_primary_uri', ''): + # yield + # Instead we need to use the following mess: + original_value = scylla_config_read(vs, 'vector_store_primary_uri') + if original_value == '""': + # nothing to do, or to restore + yield + return + assert original_value.startswith('"') and original_value.endswith('"') + original_value = original_value[1:-1] + scylla_config_write(vs, 'vector_store_primary_uri', 'garbage') + try: + yield + finally: + scylla_config_write(vs, 'vector_store_primary_uri', original_value) + +# If the vector store is not configured, then Query with VectorSearch is +# rejected with a ValidationException saying "Vector Store is disabled". +def test_query_vector_store_disabled(vs, table_vs): + with unconfigured_vector_store(vs): + with pytest.raises(ClientError, match='ValidationException.*Vector Store is disabled'): + table_vs.query(IndexName='vind', VectorSearch={'QueryVector': [0, 0, 0]}, + Limit=1) + +# Test that even if the vector store is not configured, it is possible to +# create a vector index on the table - but DescribeTable will always show +# that it is CREATING, not ACTIVE. +# I'm not convinced it is a good idea to allow create vector indexes if +# the vector store isn't even configured in Scylla, but currently we do +# allow it. 
+def test_vectorindex_status_without_vector_store(vs): + with unconfigured_vector_store(vs): + with new_test_table(vs, + KeySchema=[{'AttributeName': 'p', 'KeyType': 'HASH'}], + AttributeDefinitions=[{'AttributeName': 'p', 'AttributeType': 'S'}], + VectorIndexes=[ + {'IndexName': 'vind', + 'VectorAttribute': {'AttributeName': 'v', 'Dimensions': 3}} + ]) as table: + desc = table.meta.client.describe_table(TableName=table.name) + vector_indexes = desc['Table']['VectorIndexes'] + assert len(vector_indexes) == 1 + assert vector_indexes[0]['IndexName'] == 'vind' + assert vector_indexes[0]['IndexStatus'] == 'CREATING' + +# Timeout (in seconds) used by the retry loops in tests that wait for the +# vector store to index data. Centralized here so it can be adjusted easily. +VECTOR_STORE_TIMEOUT = 20 + +# Test that a vector search Query returns the nearest-neighbour item. +# The vector store is eventually consistent: after put_item the ANN index +# takes time to reflect the new item, so we retry until it appears. +# A private table is used to avoid other tests' data interfering with the +# Limit=1 result. Data is inserted before the index is created so the +# vector store picks it up faster, by prefill scan rather than CDC. 
+def test_query_vector_prefill(vs, needs_vector_store): + with new_test_table(vs, + KeySchema=[{'AttributeName': 'p', 'KeyType': 'HASH'}], + AttributeDefinitions=[{'AttributeName': 'p', 'AttributeType': 'S'}]) as table: + p = random_string() + table.put_item(Item={'p': p, 'v': [1, 0, 0]}) + table.update(VectorIndexUpdates=[{'Create': + {'IndexName': 'vind', + 'VectorAttribute': {'AttributeName': 'v', 'Dimensions': 3}}}]) + deadline = time.monotonic() + VECTOR_STORE_TIMEOUT + while True: + try: + result = table.query( + IndexName='vind', + VectorSearch={'QueryVector': [1, 0, 0]}, + Limit=1 + ) + if result.get('Items') and result['Items'][0]['p'] == p: + break + except ClientError: + pass + if time.monotonic() > deadline: + pytest.fail('Timed out waiting for vector store to return the expected item') + time.sleep(0.1) + +# Same as test_query_vector_prefill but for a table with a clustering key, which +# exercises the separate code path in query_vector() for hash+range tables. +def test_query_vector_with_ck_prefill(vs, needs_vector_store): + with new_test_table(vs, + KeySchema=[ + {'AttributeName': 'p', 'KeyType': 'HASH'}, + {'AttributeName': 'c', 'KeyType': 'RANGE'}], + AttributeDefinitions=[ + {'AttributeName': 'p', 'AttributeType': 'S'}, + {'AttributeName': 'c', 'AttributeType': 'S'}]) as table: + p = random_string() + c = random_string() + table.put_item(Item={'p': p, 'c': c, 'v': [1, 0, 0]}) + table.update(VectorIndexUpdates=[{'Create': + {'IndexName': 'vind', + 'VectorAttribute': {'AttributeName': 'v', 'Dimensions': 3}}}]) + deadline = time.monotonic() + VECTOR_STORE_TIMEOUT + while True: + try: + result = table.query( + IndexName='vind', + VectorSearch={'QueryVector': [1, 0, 0]}, + Limit=1 + ) + items = result.get('Items', []) + if items and items[0]['p'] == p and items[0]['c'] == c: + break + except ClientError: + pass + if time.monotonic() > deadline: + pytest.fail('Timed out waiting for vector store to return the expected item') + time.sleep(0.1) + +# 
Utility function for waiting until the given vector index is ACTIVE, which
+# means that when this function returns, we are guaranteed that:
+# 1. Queries on this index will succeed.
+# 2. The prefill scan of the existing table data has completed, so all items
+#    that existed in the table before the index was created have been indexed.
+# This function uses DescribeTable and waits for the index's IndexStatus to
+# become "ACTIVE". This is more elegant than waiting for an actual Query to
+# succeed, and also doesn't require knowing the dimensions of this index to
+# attempt a real Query.
+def wait_for_vector_index_active(table, index_name):
+    deadline = time.monotonic() + VECTOR_STORE_TIMEOUT
+    while True:
+        desc = table.meta.client.describe_table(TableName=table.name)
+        for vi in desc.get('Table', {}).get('VectorIndexes', []):
+            if vi['IndexName'] == index_name and vi['IndexStatus'] == 'ACTIVE':
+                return
+        if time.monotonic() > deadline:
+            pytest.fail(f'Timed out waiting for vector index "{index_name}" to become ACTIVE')
+        time.sleep(0.1)
+
+# Test that wait_for_vector_index_active(), waiting for IndexStatus==ACTIVE,
+# indeed reliably waits for the index to be ready. A Query issued immediately
+# after wait_for_vector_index_active() returns should succeed without any
+# retry loop, and also returns the prefilled data.
+def test_wait_for_vector_index_active(vs, needs_vector_store):
+    with new_test_table(vs,
+        KeySchema=[{'AttributeName': 'p', 'KeyType': 'HASH'}],
+        AttributeDefinitions=[{'AttributeName': 'p', 'AttributeType': 'S'}]) as table:
+        p = random_string()
+        table.put_item(Item={'p': p, 'v': [1, 0, 0]})
+        table.update(VectorIndexUpdates=[{'Create':
+            {'IndexName': 'vind',
+             'VectorAttribute': {'AttributeName': 'v', 'Dimensions': 3}}}])
+        wait_for_vector_index_active(table, 'vind')
+        # The index is now ACTIVE: the prefill scan has completed and the
+        # item we inserted is guaranteed to be indexed. Query without catching
+        # exceptions or retrying.
+        result = table.query(
+            IndexName='vind',
+            VectorSearch={'QueryVector': [1, 0, 0]},
+            Limit=1
+        )
+        assert result.get('Items') and result['Items'][0]['p'] == p
+
+# The tests test_query_vector_prefill and test_query_vector_with_ck_prefill
+# used string keys in the indexed table. In theory, there shouldn't be any
+# difference in the vector store's behavior if the keys are of a different
+# type (in addition to string, they can be numeric or binary). But in
+# practice, the vector store does handle different key types differently,
+# and this test used to fail before this was fixed.
+# To save a bit of time, we don't test all combinations of hash and range
+# key types but test each type at least once as a hash key and a range key.
+@pytest.mark.skip(reason="Bug in vector store for non-string keys, fails very slowly so let's skip")
+@pytest.mark.parametrize('hash_type,range_type', [
+    ('N', None), ('B', None), ('S', 'N'), ('S', 'B'),
+], ids=[
+    'N', 'B', 'SN', 'SB'])
+def test_query_vector_prefill_key_types(vs, needs_vector_store, hash_type, range_type):
+    key_schema = [{'AttributeName': 'p', 'KeyType': 'HASH'}]
+    attr_defs = [{'AttributeName': 'p', 'AttributeType': hash_type}]
+    if range_type is not None:
+        key_schema.append({'AttributeName': 'c', 'KeyType': 'RANGE'})
+        attr_defs.append({'AttributeName': 'c', 'AttributeType': range_type})
+    key = {'S': 'hello', 'N': Decimal('42'), 'B': b'hello'}
+    with new_test_table(vs, KeySchema=key_schema,
+            AttributeDefinitions=attr_defs) as table:
+        p = key[hash_type]
+        item = {'p': p, 'v': [1, 0, 0]}
+        if range_type is not None:
+            c = key[range_type]
+            item['c'] = c
+        table.put_item(Item=item)
+        table.update(VectorIndexUpdates=[{'Create':
+            {'IndexName': 'vind',
+             'VectorAttribute': {'AttributeName': 'v', 'Dimensions': 3}}}])
+        wait_for_vector_index_active(table, 'vind')
+        result = table.query(IndexName='vind',
+            VectorSearch={'QueryVector': [1, 0, 0]}, Limit=1)
+        assert len(result['Items']) == 1 and result['Items'] == [item]
+
+# Same as test_query_vector_prefill but whereas in test_query_vector_prefill
+# the vector store reads the indexed data by scanning the table, here the
+# vector index is created first and only later the data is written, so the
+# vector store is expected to pick it up via CDC.
+def test_query_vector_cdc(vs, needs_vector_store):
+    with new_test_table(vs,
+        KeySchema=[{'AttributeName': 'p', 'KeyType': 'HASH'}],
+        AttributeDefinitions=[{'AttributeName': 'p', 'AttributeType': 'S'}],
+        VectorIndexes=[
+            {'IndexName': 'vind',
+             'VectorAttribute': {'AttributeName': 'v', 'Dimensions': 3}}
+        ]) as table:
+        # Wait until the vector store is ready (prefill of the empty table
+        # has completed), to ensure the subsequent write is picked up via CDC.
+        wait_for_vector_index_active(table, 'vind')
+        # Now write the item. It should reach the vector store via CDC.
+        p = random_string()
+        table.put_item(Item={'p': p, 'v': [1, 0, 0]})
+        # Retry the query until the newly written item appears in the results.
+        deadline = time.monotonic() + VECTOR_STORE_TIMEOUT
+        while True:
+            try:
+                result = table.query(
+                    IndexName='vind',
+                    VectorSearch={'QueryVector': [1, 0, 0]},
+                    Limit=1
+                )
+                if result.get('Items') and result['Items'][0]['p'] == p:
+                    break
+            except ClientError:
+                pass
+            if time.monotonic() > deadline:
+                pytest.fail('Timed out waiting for vector store to return the expected item via CDC')
+            time.sleep(0.1)
+
+# Similar test to test_query_vector_cdc, where an item is written after the
+# vector index is created, but here the item is written using LWT (using a
+# ConditionExpression that causes the request to be a read-modify-write
+# operation, so it needs to use LWT for most write isolation modes). This is
+# important to test because LWT has a different code path for recognizing we
+# need to write to the CDC log.
+def test_query_vector_cdc_lwt(vs, needs_vector_store):
+    with new_test_table(vs,
+        KeySchema=[{'AttributeName': 'p', 'KeyType': 'HASH'}],
+        AttributeDefinitions=[{'AttributeName': 'p', 'AttributeType': 'S'}],
+        VectorIndexes=[
+            {'IndexName': 'vind',
+             'VectorAttribute': {'AttributeName': 'v', 'Dimensions': 3}}
+        ]) as table:
+        wait_for_vector_index_active(table, 'vind')
+        # Write the item, with a ConditionExpression to guarantee LWT.
+        p = random_string()
+        table.put_item(Item={'p': p, 'v': [1, 0, 0]},
+            ConditionExpression='attribute_not_exists(p)')
+        deadline = time.monotonic() + VECTOR_STORE_TIMEOUT
+        while True:
+            result = table.query(IndexName='vind',
+                VectorSearch={'QueryVector': [1, 0, 0]}, Limit=1)
+            if len(result['Items']) > 0:
+                break
+            if time.monotonic() > deadline:
+                pytest.fail('Timed out waiting for vector store index an item via CDC')
+            time.sleep(0.1)
+        assert len(result['Items']) == 1 and result['Items'][0]['p'] == p
+
+
+# Similar to test_query_vector_cdc, this test also checks that a vector search
+# Query finds data inserted after the index was created. But this test adds a
+# twist: before creating the index, we insert a malformed value for the vector
+# attribute (a string). We check that this malformed value is ignored by the
+# initial prefill scan, and does not prevent a later write with a well-formed
+# vector from being indexed and returned by queries.
+@pytest.mark.parametrize('malformed', ['garbage', [1,2]], ids=['string','wrong_length'])
+def test_query_vector_cdc_malformed_prefill(vs, needs_vector_store, malformed):
+    with new_test_table(vs,
+        KeySchema=[{'AttributeName': 'p', 'KeyType': 'HASH'}],
+        AttributeDefinitions=[{'AttributeName': 'p', 'AttributeType': 'S'}]) as table:
+        # Vector index is not yet enabled, so we can insert a string as the
+        # value of v, without validation.
+        p1 = random_string()
+        table.put_item(Item={'p': p1, 'v': malformed})
+        # Insert another item with a proper vector
+        p2 = random_string()
+        table.put_item(Item={'p': p2, 'v': [1, 0, 0]})
+        # Now create the vector index. The prefill scan will encounter the
+        # malformed item and must silently ignore it.
+        table.update(VectorIndexUpdates=[{'Create':
+            {'IndexName': 'vind',
+             'VectorAttribute': {'AttributeName': 'v', 'Dimensions': 3}}}])
+        # Wait for the prefill scan to complete (index becomes ACTIVE).
+        wait_for_vector_index_active(table, 'vind')
+        # At this point only p2 should be indexed and returned by a query
+        result = table.query(IndexName='vind', VectorSearch={'QueryVector': [1, 0, 0]}, Limit=10)
+        assert len(result['Items']) == 1 and result['Items'][0]['p'] == p2
+        # Now replace the value of p1 by a properly formed vector. It should
+        # be eventually picked up by CDC and indexed by the vector index:
+        table.put_item(Item={'p': p1, 'v': [1, Decimal("0.1"), 0]})
+        deadline = time.monotonic() + VECTOR_STORE_TIMEOUT
+        while True:
+            result = table.query(IndexName='vind', VectorSearch={'QueryVector': [1, 0, 0]},
+                Limit=10)
+            if len(result['Items']) == 2 and {item['p'] for item in result['Items']} == {p2, p1}:
+                break
+            if time.monotonic() > deadline:
+                assert len(result['Items']) == 2 and {item['p'] for item in result['Items']} == {p2, p1}
+                break
+            time.sleep(0.1)
+
+# Test like test_query_vector_prefill, but with a query returning multiple
+# results. This helps us verify that:
+# 1. "Limit" determines the number of results.
+# 2. The query_vector() code correctly handles the need to read and
+#    return multiple items.
+# 3. The multiple results are correctly sorted by distance (nearest first).
+def test_query_vector_multiple_results(vs, needs_vector_store): + with new_test_table(vs, + KeySchema=[{'AttributeName': 'p', 'KeyType': 'HASH'}], + AttributeDefinitions=[{'AttributeName': 'p', 'AttributeType': 'S'}]) as table: + # Insert 4 items at known cosine distances from the query vector [1, 0, 0]: + # p1 at [1, 0, 0] - cosine distance 0 (closest, identical direction) + # p2 at [1, 0.1, 0] - cosine distance ~0.005 (2nd, slightly off-axis) + # p3 at [0, 1, 0] - cosine distance 1 (3rd, orthogonal) + # p4 at [-1, 0, 0] - cosine distance 2 (farthest, opposite direction) + # Data is inserted before the vector index is created so the vector + # store picks it up via scan rather than CDC, which finishes faster. + p1, p2, p3, p4 = random_string(), random_string(), random_string(), random_string() + table.put_item(Item={'p': p1, 'v': [Decimal("1"), Decimal("0"), Decimal("0")]}) + table.put_item(Item={'p': p2, 'v': [Decimal("1"), Decimal("0.1"), Decimal("0")]}) + table.put_item(Item={'p': p3, 'v': [Decimal("0"), Decimal("1"), Decimal("0")]}) + table.put_item(Item={'p': p4, 'v': [Decimal("-1"), Decimal("0"), Decimal("0")]}) + table.update(VectorIndexUpdates=[{'Create': + {'IndexName': 'vind', + 'VectorAttribute': {'AttributeName': 'v', 'Dimensions': 3}}}]) + expected_order = [p1, p2, p3] + deadline = time.monotonic() + VECTOR_STORE_TIMEOUT + while True: + try: + result = table.query( + IndexName='vind', + VectorSearch={'QueryVector': [Decimal("1"), Decimal("0"), Decimal("0")]}, + Limit=3 + ) + items = result.get('Items', []) + got = [item['p'] for item in items if item['p'] in {p1, p2, p3, p4}] + if got == expected_order: + break + except ClientError: + pass + if time.monotonic() > deadline: + pytest.fail(f'Timed out waiting for correct ordered results; last got: {got}, expected {expected_order}') + time.sleep(0.1) + +# Same as test_query_vector_multiple_results but for a table with a +# clustering key, to exercise the hash+range code path in query_vector(). 
+def test_query_vector_with_ck_multiple_results(vs, needs_vector_store): + with new_test_table(vs, + KeySchema=[ + {'AttributeName': 'p', 'KeyType': 'HASH'}, + {'AttributeName': 'c', 'KeyType': 'RANGE'}], + AttributeDefinitions=[ + {'AttributeName': 'p', 'AttributeType': 'S'}, + {'AttributeName': 'c', 'AttributeType': 'S'}]) as table: + p1, p2, p3, p4 = random_string(), random_string(), random_string(), random_string() + c1, c2, c3, c4 = random_string(), random_string(), random_string(), random_string() + table.put_item(Item={'p': p1, 'c': c1, 'v': [Decimal("1"), Decimal("0"), Decimal("0")]}) + table.put_item(Item={'p': p2, 'c': c2, 'v': [Decimal("1"), Decimal("0.1"), Decimal("0")]}) + table.put_item(Item={'p': p3, 'c': c3, 'v': [Decimal("0"), Decimal("1"), Decimal("0")]}) + table.put_item(Item={'p': p4, 'c': c4, 'v': [Decimal("-1"), Decimal("0"), Decimal("0")]}) + table.update(VectorIndexUpdates=[{'Create': + {'IndexName': 'vind', + 'VectorAttribute': {'AttributeName': 'v', 'Dimensions': 3}}}]) + expected_order = [(p1, c1), (p2, c2), (p3, c3)] + deadline = time.monotonic() + VECTOR_STORE_TIMEOUT + while True: + try: + result = table.query( + IndexName='vind', + VectorSearch={'QueryVector': [Decimal("1"), Decimal("0"), Decimal("0")]}, + Limit=3 + ) + items = result.get('Items', []) + pcs = {(p1, c1), (p2, c2), (p3, c3), (p4, c4)} + got = [(item['p'], item['c']) for item in items if (item['p'], item['c']) in pcs] + if got == expected_order: + break + except ClientError: + pass + if time.monotonic() > deadline: + pytest.fail(f'Timed out waiting for correct ordered results; last got: {got}, expected {expected_order}') + time.sleep(0.1) + +# Test that a vector search Query returns, with Select='ALL_ATTRIBUTES', the +# full item content correctly (all attributes, correct key values) in the +# expected order - for multiple results. 
Two variants are tested via +# parametrize, to exercise two separate code paths in query_vector(), for +# tables with and without clustering keys: +# - no_ck: table with just a hash key +# - with_ck: table with a hash key and a range key +@pytest.mark.parametrize('have_ck', [False, True], ids=['no_ck', 'with_ck']) +def test_query_vector_full_items(vs, needs_vector_store, have_ck): + key_schema = [{'AttributeName': 'p', 'KeyType': 'HASH'}] + attr_defs = [{'AttributeName': 'p', 'AttributeType': 'S'}] + if have_ck: + key_schema.append({'AttributeName': 'c', 'KeyType': 'RANGE'}) + attr_defs.append({'AttributeName': 'c', 'AttributeType': 'S'}) + with new_test_table(vs, + KeySchema=key_schema, + AttributeDefinitions=attr_defs) as table: + # Build 3 items, each with distinct key(s), a vector, and extra attributes. + # A 4th item is inserted but should not appear with Limit=3. + if have_ck: + # deliberately use just two different p values, so some of the + # returned items have the same p but different c, to exercise yet + # another potentially different code path: + p1 = random_string() + p2 = random_string() + ps = [p1, p1, p2, p2] + else: + ps = [random_string() for _ in range(4)] + vectors = [ + [Decimal("1"), Decimal("0"), Decimal("0")], # closest to query + [Decimal("1"), Decimal("0.1"), Decimal("0")], # 2nd + [Decimal("0"), Decimal("1"), Decimal("0")], # 3rd + [Decimal("-1"), Decimal("0"), Decimal("0")], # farthest, excluded + ] + items = [] + for i, (p, v) in enumerate(zip(ps, vectors)): + item = {'p': p, 'v': v, 'x': f'attr_{i}', 'y': Decimal(str(i * 10))} + if have_ck: + item['c'] = random_string() + items.append(item) + table.put_item(Item=item) + table.update(VectorIndexUpdates=[{'Create': + {'IndexName': 'vind', + 'VectorAttribute': {'AttributeName': 'v', 'Dimensions': 3}}}]) + # The 3 nearest items in expected distance order (closest first). 
+ expected_items = items[:3] + # Wait until the returned items match the expected list exactly, + # verifying both the full content of each item and their order. + deadline = time.monotonic() + VECTOR_STORE_TIMEOUT + while True: + try: + result = table.query( + IndexName='vind', + VectorSearch={'QueryVector': [Decimal("1"), Decimal("0"), Decimal("0")]}, + Limit=3, + Select='ALL_ATTRIBUTES' + ) + if result.get('Items') == expected_items: + break + except ClientError: + pass + if time.monotonic() > deadline: + pytest.fail('Timed out waiting for vector store to return the expected items') + time.sleep(0.1) + +# Test that PutItem rejects a vector attribute value that is invalid for +# the declared vector index on that attribute. The index on table_vs declares +# attribute 'v' as a 3-dimensional vector, so putting a non-list, a list of +# wrong length, a list with non-numeric elements, or a list containing a +# number that cannot be represented as a float must all be rejected. +# +# Note that this write rejection feature is nice to have (and mirrors what +# happens in GSI where writes with the wrong type for the indexed column +# are rejected), but was not really necessary: We could have allowed writes +# with the wrong type, and items with a wrong type would simply be ignored +# by the vector index and not returned in vector search results. 
+def test_putitem_vectorindex_bad_vector(table_vs): + p = random_string() + # Not a list - should be rejected: + with pytest.raises(ClientError, match='ValidationException'): + table_vs.put_item(Item={'p': p, 'v': 'not a list'}) + # A list of the wrong length - should be rejected: + with pytest.raises(ClientError, match='ValidationException'): + table_vs.put_item(Item={'p': p, 'v': [1, 2]}) + with pytest.raises(ClientError, match='ValidationException'): + table_vs.put_item(Item={'p': p, 'v': [1, 2, 3, 4]}) + # A list of the right length but with a non-numeric element - should be rejected: + with pytest.raises(ClientError, match='ValidationException'): + table_vs.put_item(Item={'p': p, 'v': [1, 'hello', 3]}) + # A list whose numeric elements can't be represented as a 32-bit float + # (value out of float range) - should be rejected: + with pytest.raises(ClientError, match='ValidationException'): + table_vs.put_item(Item={'p': p, 'v': [1, Decimal('1e100'), 3]}) + +# Same as test_putitem_vectorindex_bad_vector but using UpdateItem. 
+def test_updateitem_vectorindex_bad_vector(table_vs): + p = random_string() + # Not a list - should be rejected: + with pytest.raises(ClientError, match='ValidationException'): + table_vs.update_item(Key={'p': p}, + UpdateExpression='SET v = :val', + ExpressionAttributeValues={':val': 'not a list'}) + # A list of the wrong length - should be rejected: + with pytest.raises(ClientError, match='ValidationException'): + table_vs.update_item(Key={'p': p}, + UpdateExpression='SET v = :val', + ExpressionAttributeValues={':val': [1, 2]}) + with pytest.raises(ClientError, match='ValidationException'): + table_vs.update_item(Key={'p': p}, + UpdateExpression='SET v = :val', + ExpressionAttributeValues={':val': [1, 2, 3, 4]}) + # A list of the right length but with a non-numeric element - should be rejected: + with pytest.raises(ClientError, match='ValidationException'): + table_vs.update_item(Key={'p': p}, + UpdateExpression='SET v = :val', + ExpressionAttributeValues={':val': [1, 'hello', 3]}) + # A list whose numeric elements can't be represented as a 32-bit float + # (value out of float range) - should be rejected: + with pytest.raises(ClientError, match='ValidationException'): + table_vs.update_item(Key={'p': p}, + UpdateExpression='SET v = :val', + ExpressionAttributeValues={':val': [1, Decimal('1e100'), 3]}) + +# Same as test_putitem_vectorindex_bad_vector but using BatchWriteItem. 
+def test_batchwriteitem_vectorindex_bad_vector(table_vs): + p = random_string() + # Not a list - should be rejected: + with pytest.raises(ClientError, match='ValidationException'): + with table_vs.batch_writer() as batch: + batch.put_item(Item={'p': p, 'v': 'not a list'}) + # A list of the wrong length - should be rejected: + with pytest.raises(ClientError, match='ValidationException'): + with table_vs.batch_writer() as batch: + batch.put_item(Item={'p': p, 'v': [1, 2]}) + with pytest.raises(ClientError, match='ValidationException'): + with table_vs.batch_writer() as batch: + batch.put_item(Item={'p': p, 'v': [1, 2, 3, 4]}) + # A list of the right length but with a non-numeric element - should be rejected: + with pytest.raises(ClientError, match='ValidationException'): + with table_vs.batch_writer() as batch: + batch.put_item(Item={'p': p, 'v': [1, 'hello', 3]}) + # A list whose numeric elements can't be represented as a 32-bit float + # (value out of float range) - should be rejected: + with pytest.raises(ClientError, match='ValidationException'): + with table_vs.batch_writer() as batch: + batch.put_item(Item={'p': p, 'v': [1, Decimal('1e100'), 3]}) + +# Test that DeleteItem removes the item from the vector index. 
+# Two variants are tested via parametrize: +# - without clustering key (no_ck): deleting the only item in a partition +# generates a partition tombstone in CDC +# - with clustering key (with_ck): deleting a row generates a row tombstone +# in CDC, which is a different code path +@pytest.mark.parametrize('with_ck', [False, True], ids=['no_ck', 'with_ck']) +def test_deleteitem_vectorindex(vs, needs_vector_store, with_ck): + key_schema = [{'AttributeName': 'p', 'KeyType': 'HASH'}] + attr_defs = [{'AttributeName': 'p', 'AttributeType': 'S'}] + if with_ck: + key_schema.append({'AttributeName': 'c', 'KeyType': 'RANGE'}) + attr_defs.append({'AttributeName': 'c', 'AttributeType': 'S'}) + with new_test_table(vs, + KeySchema=key_schema, + AttributeDefinitions=attr_defs, + VectorIndexes=[ + {'IndexName': 'vind', + 'VectorAttribute': {'AttributeName': 'v', 'Dimensions': 3}} + ]) as table: + # Wait until the vector store is ready (empty table prefill done). + wait_for_vector_index_active(table, 'vind') + # Write the item and wait for it to appear in the vector index. + p = random_string() + item = {'p': p, 'v': [1, 0, 0]} + key = {'p': p} + if with_ck: + c = random_string() + item['c'] = c + key['c'] = c + table.put_item(Item=item) + deadline = time.monotonic() + VECTOR_STORE_TIMEOUT + while True: + result = table.query(IndexName='vind', + VectorSearch={'QueryVector': [1, 0, 0]}, + Select='ALL_ATTRIBUTES', + Limit=1) + if len(result['Items']) > 0: + assert result['Items'][0] == item + break + if time.monotonic() > deadline: + pytest.fail('Timed out waiting for item to appear in vector index') + time.sleep(0.1) + # Delete the item and wait for it to disappear from the vector index. 
+ table.delete_item(Key=key) + deadline = time.monotonic() + VECTOR_STORE_TIMEOUT + while True: + result = table.query(IndexName='vind', + VectorSearch={'QueryVector': [1, 0, 0]}, + Limit=1) + if len(result.get('Items', [])) == 0: + break + if time.monotonic() > deadline: + pytest.fail('Timed out waiting for deleted item to disappear from vector index') + time.sleep(0.1) + +# Test vector index with Alternator TTL together. A table is created without +# TTL enabled, data is inserted with expiration time set to the past (but +# expiration not yet enabled), and the item should still appear in vector +# search. Then TTL expiration is enabled and the item should disappear from +# the vector search once TTL deletes it and the deletion propagates via CDC. +# This test is skipped if alternator_ttl_period_in_seconds is not set to a +# low value because otherwise it would take too long to run. +# Two code paths are tested via parametrize: +# - without clustering key (no_ck): partition deletions in CDC. +# - with clustering key (with_ck): row deletions in CDC. 
+@pytest.mark.parametrize('have_ck', [False, True], ids=['no_ck', 'with_ck']) +def test_vector_with_ttl(vs, needs_vector_store, have_ck): + period = scylla_config_read(vs, 'alternator_ttl_period_in_seconds') + if period is None or float(period) > 1: + pytest.skip('need alternator_ttl_period_in_seconds <= 1 to run this test quickly') + key_schema = [{'AttributeName': 'p', 'KeyType': 'HASH'}] + attr_defs = [{'AttributeName': 'p', 'AttributeType': 'S'}] + if have_ck: + key_schema.append({'AttributeName': 'c', 'KeyType': 'RANGE'}) + attr_defs.append({'AttributeName': 'c', 'AttributeType': 'S'}) + with new_test_table(vs, + KeySchema=key_schema, + AttributeDefinitions=attr_defs, + VectorIndexes=[ + {'IndexName': 'vind', + 'VectorAttribute': {'AttributeName': 'v', 'Dimensions': 3}} + ]) as table: + # Wait until the vector store is ready (prefill of the empty table + # has completed), to ensure the rest of the test doesn't need to worry + # about the vector store not yet being up (we'll still need to wait for + # specific data to be indexed, but the index itself will be ready) + wait_for_vector_index_active(table, 'vind') + p = random_string() + item = {'p': p, 'expiration': int(time.time()) - 60, 'v': [1, 0, 0]} + if have_ck: + c = random_string() + item['c'] = c + # Insert an item with 'expiration' set to the past, before TTL is enabled. + # The item should still be visible (and indexed) because TTL is not yet + # configured on this table. + table.put_item(Item=item) + # Wait for the item to appear in vector search. Since TTL is not yet + # enabled, the item must be visible despite its past expiration time. 
+ deadline = time.monotonic() + VECTOR_STORE_TIMEOUT + while True: + result = table.query(IndexName='vind', + VectorSearch={'QueryVector': [1, 0, 0]}, + Limit=1) + if len(result.get('Items', [])) > 0: + assert result['Items'][0]['p'] == p + break + if time.monotonic() > deadline: + pytest.fail('Timed out waiting for item to appear in vector search before TTL was enabled') + time.sleep(0.1) + # Now enable TTL on the 'expiration' attribute. The item has its + # expiration in the past, so TTL should delete it quickly. + table.meta.client.update_time_to_live( + TableName=table.name, + TimeToLiveSpecification={'AttributeName': 'expiration', 'Enabled': True}) + # Wait for the item to disappear from vector search. TTL deletes the + # item from the database, and the deletion propagates to the vector + # store via CDC. + deadline = time.monotonic() + VECTOR_STORE_TIMEOUT + float(period) + while True: + result = table.query( + IndexName='vind', + VectorSearch={'QueryVector': [1, 0, 0]}, + Select='ALL_PROJECTED_ATTRIBUTES', + Limit=1 + ) + if len(result['Items']) == 0: + break + if time.monotonic() > deadline: + pytest.fail('Timed out waiting for TTL-expired item to disappear from vector search') + time.sleep(0.1) + # Since we used Select='ALL_PROJECTED_ATTRIBUTES', the loop above + # already confirms the vector store removed the item (the results + # come directly from the vector store, not the base table). + assert result['Items'] == [] + +# Test support for "Select" parameter in vector search Query. +# We test all valid Select values and their effects on the returned items, +# as well as validation errors for invalid combinations. +# The first part tests validation errors (no vector store needed), and +# the second part tests correct results (needs vector store). 
+def test_query_vectorsearch_select_bad(table_vs): + # Unknown Select value + with pytest.raises(ClientError, match='ValidationException.*Select'): + table_vs.query(IndexName='vind', + VectorSearch={'QueryVector': [1, 2, 3]}, Limit=1, + Select='GARBAGE') + # Select=SPECIFIC_ATTRIBUTES without ProjectionExpression or AttributesToGet + with pytest.raises(ClientError, match='ValidationException.*SPECIFIC_ATTRIBUTES'): + table_vs.query(IndexName='vind', + VectorSearch={'QueryVector': [1, 2, 3]}, Limit=1, + Select='SPECIFIC_ATTRIBUTES') + # ProjectionExpression with Select=ALL_ATTRIBUTES is not allowed + with pytest.raises(ClientError, match='ValidationException.*SPECIFIC_ATTRIBUTES'): + table_vs.query(IndexName='vind', + VectorSearch={'QueryVector': [1, 2, 3]}, Limit=1, + Select='ALL_ATTRIBUTES', ProjectionExpression='p') + # ProjectionExpression with Select=COUNT is not allowed + with pytest.raises(ClientError, match='ValidationException.*SPECIFIC_ATTRIBUTES'): + table_vs.query(IndexName='vind', + VectorSearch={'QueryVector': [1, 2, 3]}, Limit=1, + Select='COUNT', ProjectionExpression='p') + # ProjectionExpression with Select=ALL_PROJECTED_ATTRIBUTES is not allowed + with pytest.raises(ClientError, match='ValidationException.*SPECIFIC_ATTRIBUTES'): + table_vs.query(IndexName='vind', + VectorSearch={'QueryVector': [1, 2, 3]}, Limit=1, + Select='ALL_PROJECTED_ATTRIBUTES', ProjectionExpression='p') + +def test_query_vectorsearch_select(vs, needs_vector_store): + with new_test_table(vs, + KeySchema=[{'AttributeName': 'p', 'KeyType': 'HASH'}], + AttributeDefinitions=[{'AttributeName': 'p', 'AttributeType': 'S'}]) as table: + p = random_string() + # Insert data before creating the vector index so the vector store + # picks it up via prefill scan rather than CDC (faster). 
+ table.put_item(Item={'p': p, 'v': [1, 0, 0], 'x': 'hello', 'y': 'world'}) + table.update(VectorIndexUpdates=[{'Create': + {'IndexName': 'vind', + 'VectorAttribute': {'AttributeName': 'v', 'Dimensions': 3}}}]) + # Wait for the item to appear in vector search. + deadline = time.monotonic() + VECTOR_STORE_TIMEOUT + while True: + try: + result = table.query( + IndexName='vind', + VectorSearch={'QueryVector': [1, 0, 0]}, + Limit=1) + if result.get('Items') and result['Items'][0]['p'] == p: + break + except ClientError: + pass + if time.monotonic() > deadline: + pytest.fail('Timed out waiting for item to be indexed') + time.sleep(0.1) + # ALL_PROJECTED_ATTRIBUTES (default when no Select): returns only the + # primary key attributes. + result = table.query(IndexName='vind', + VectorSearch={'QueryVector': [1, 0, 0]}, Limit=1) + assert result['Items'] == [{'p': p}] + # Explicit Select=ALL_PROJECTED_ATTRIBUTES: + result = table.query(IndexName='vind', + VectorSearch={'QueryVector': [1, 0, 0]}, Limit=1, + Select='ALL_PROJECTED_ATTRIBUTES') + assert result['Items'] == [{'p': p}] + # Select=ALL_ATTRIBUTES: returns the full item. + result = table.query(IndexName='vind', + VectorSearch={'QueryVector': [1, 0, 0]}, Limit=1, + Select='ALL_ATTRIBUTES') + assert result['Items'] == [{'p': p, 'v': [1, 0, 0], 'x': 'hello', 'y': 'world'}] + # Select=SPECIFIC_ATTRIBUTES with ProjectionExpression: returns only + # the specified attributes. + result = table.query(IndexName='vind', + VectorSearch={'QueryVector': [1, 0, 0]}, Limit=1, + Select='SPECIFIC_ATTRIBUTES', ProjectionExpression='p, x') + assert result['Items'] == [{'p': p, 'x': 'hello'}] + # ProjectionExpression without Select: defaults to SPECIFIC_ATTRIBUTES. 
+ result = table.query(IndexName='vind', + VectorSearch={'QueryVector': [1, 0, 0]}, Limit=1, + ProjectionExpression='p, y') + assert result['Items'] == [{'p': p, 'y': 'world'}] + # Can also use ProjectionExpression with ExpressionAttributeNames: + result = table.query(IndexName='vind', + VectorSearch={'QueryVector': [1, 0, 0]}, Limit=1, + ProjectionExpression='#name1, #name2', + ExpressionAttributeNames={'#name1': 'p', '#name2': 'x'}) + assert result['Items'] == [{'p': p, 'x': 'hello'}] + # Select=SPECIFIC_ATTRIBUTES with AttributesToGet: returns only the + # specified attributes. + result = table.query(IndexName='vind', + VectorSearch={'QueryVector': [1, 0, 0]}, Limit=1, + Select='SPECIFIC_ATTRIBUTES', AttributesToGet=['p', 'x']) + assert result['Items'] == [{'p': p, 'x': 'hello'}] + # AttributesToGet without Select: defaults to SPECIFIC_ATTRIBUTES. + result = table.query(IndexName='vind', + VectorSearch={'QueryVector': [1, 0, 0]}, Limit=1, + AttributesToGet=['p', 'y']) + assert result['Items'] == [{'p': p, 'y': 'world'}] + # Select=COUNT: returns only the count, no items list. + result = table.query(IndexName='vind', + VectorSearch={'QueryVector': [1, 0, 0]}, Limit=1, + Select='COUNT') + assert 'Items' not in result + assert result['Count'] == 1 + +# Test that invalid Projection parameter values are rejected for both +# CreateTable and UpdateTable's vector index creation. 
+def test_vector_projection_bad(vs): + bad_projections = [ + # 'not_an_object', # We can't check this with boto3 + {'ProjectionType': 'GARBAGE'}, + {}, # missing ProjectionType + ] + for bad_projection in bad_projections: + with pytest.raises(ClientError, match='ValidationException.*Projection'): + with new_test_table(vs, + KeySchema=[{'AttributeName': 'p', 'KeyType': 'HASH'}], + AttributeDefinitions=[{'AttributeName': 'p', 'AttributeType': 'S'}], + VectorIndexes=[{ + 'IndexName': 'vind', + 'VectorAttribute': {'AttributeName': 'v', 'Dimensions': 3}, + 'Projection': bad_projection, + }]) as table: + pass + with new_test_table(vs, + KeySchema=[{'AttributeName': 'p', 'KeyType': 'HASH'}], + AttributeDefinitions=[{'AttributeName': 'p', 'AttributeType': 'S'}]) as table: + for bad_projection in bad_projections: + with pytest.raises(ClientError, match='ValidationException.*Projection'): + table.update(VectorIndexUpdates=[{'Create': { + 'IndexName': 'vind', + 'VectorAttribute': {'AttributeName': 'v', 'Dimensions': 3}, + 'Projection': bad_projection, + }}]) + +# Test that a vector index created with Projection={'ProjectionType': 'KEYS_ONLY'} +# (via CreateTable or UpdateTable) works correctly: +# - The ProjectionType=KEYS_ONLY is accepted +# - Select=ALL_PROJECTED_ATTRIBUTES returns only the primary key attributes +# - Select=ALL_ATTRIBUTES returns all attributes +# ProjectionType=KEYS_ONLY matches the default vector index behavior, so it +# doesn't change results but must be accepted as a valid parameter. 
+@pytest.mark.parametrize('via_update', [False, True], ids=['createtable', 'updatetable']) +def test_vector_projection_keys_only(vs, needs_vector_store, via_update): + if via_update: + ctx = new_test_table(vs, + KeySchema=[{'AttributeName': 'p', 'KeyType': 'HASH'}], + AttributeDefinitions=[{'AttributeName': 'p', 'AttributeType': 'S'}]) + else: + ctx = new_test_table(vs, + KeySchema=[{'AttributeName': 'p', 'KeyType': 'HASH'}], + AttributeDefinitions=[{'AttributeName': 'p', 'AttributeType': 'S'}], + VectorIndexes=[{ + 'IndexName': 'vind', + 'VectorAttribute': {'AttributeName': 'v', 'Dimensions': 3}, + 'Projection': {'ProjectionType': 'KEYS_ONLY'}, + }]) + with ctx as table: + if via_update: + table.update(VectorIndexUpdates=[{'Create': { + 'IndexName': 'vind', + 'VectorAttribute': {'AttributeName': 'v', 'Dimensions': 3}, + 'Projection': {'ProjectionType': 'KEYS_ONLY'}, + }}]) + p = random_string() + table.put_item(Item={'p': p, 'v': [1, 0, 0], 'x': 'hello'}) + wait_for_vector_index_active(table, 'vind') + # Select=ALL_PROJECTED_ATTRIBUTES returns only the primary key. + result = table.query( + IndexName='vind', + VectorSearch={'QueryVector': [1, 0, 0]}, + Limit=1, + Select='ALL_PROJECTED_ATTRIBUTES') + assert result['Items'] == [{'p': p}] + # Select=ALL_ATTRIBUTES returns the full item. + result = table.query( + IndexName='vind', + VectorSearch={'QueryVector': [1, 0, 0]}, + Limit=1, + Select='ALL_ATTRIBUTES') + assert result['Items'] == [{'p': p, 'v': [1, 0, 0], 'x': 'hello'}] + +# As we saw in test_item.py::test_attribute_allowed_chars in the DynamoDB API +# attribute names can contain any characters whatsoever, including quotes, +# spaces, and even null bytes. Test that such crazy attribute names can be +# used as vector attributes in vector indexes, and that a vector index with +# such an attribute can be created and used successfully. 
+def test_vector_attribute_allowed_chars(vs, needs_vector_store): + # To check both scan-based prefill and CDC-based indexing, we create the + # table without a vector index and then add the vector index. Data that + # we added before creating the index needs scan, and data added later + # needs CDC. We want to ensure that both work correctly with such + # attribute names. + attribute_name = 'v with spaces and .-+-&*!#@$%^()\\ \' "quotes" and \0 null byte' + with new_test_table(vs, + KeySchema=[{'AttributeName': 'p', 'KeyType': 'HASH'}], + AttributeDefinitions=[{'AttributeName': 'p', 'AttributeType': 'S'}]) as table: + p1 = random_string() + table.put_item(Item={'p': p1, attribute_name: [1, 0, 0]}) + table.update(VectorIndexUpdates=[{'Create': + {'IndexName': 'vind', + 'VectorAttribute': {'AttributeName': attribute_name, 'Dimensions': 3}}}]) + wait_for_vector_index_active(table, 'vind') + # The previous item was indexed by a scan. Now let's add another item + # which will get indexed by CDC. + p2 = random_string() + table.put_item(Item={'p': p2, attribute_name: [0, 0, 1]}) + # Wait until the CDC-indexed update (v=[0, 0, 1]) is reflected in the + # vector search results. + deadline = time.monotonic() + VECTOR_STORE_TIMEOUT + while True: + result = table.query(IndexName='vind', + VectorSearch={'QueryVector': [0, 0, 1]}, Limit=2) + if 'Items' in result and len(result['Items']) == 2 and result['Items'][0]['p'] == p2 and result['Items'][1]['p'] == p1: + break + if time.monotonic() > deadline: + pytest.fail('Timed out waiting for items to appear in vector search') + time.sleep(0.1) + +# Test FilterExpression for post-filtering vector search results: After Limit +# results are found by the vector index and the full items are retrieved +# from the base table, items which do not match the given FilterExpression are +# removed. This means that fewer than Limit results may be returned. 
This +# matches DynamoDB's general Query behavior where the filtering is applied after +# Limit. +# Two Select values are tested (via parametrize): +# ALL_ATTRIBUTES: the matching items are returned in the Items list. +# COUNT: no items are returned, but the implementation still needs to retrieve +# full items (or at least the attributes needed by the filter) and +# count how many among the Limit candidates matched the filter. +# ScannedCount (number of pre-filtering results) and Count (number of post- +# filtering results) are returned in both cases and checked. +@pytest.mark.parametrize('select', ['ALL_ATTRIBUTES', 'COUNT']) +def test_query_vectorsearch_filter_expression(vs, needs_vector_store, select): + with new_test_table(vs, + KeySchema=[{'AttributeName': 'p', 'KeyType': 'HASH'}], + AttributeDefinitions=[{'AttributeName': 'p', 'AttributeType': 'S'}]) as table: + # Insert all 5 items before the vector index is created so the vector + # store picks them up via prefill scan (faster than CDC). + # p_far is the furthest item and will not be among the 4 nearest + # neighbors returned with Limit=4. 
+ p_keep1, p_keep2 = random_string(), random_string() + p_drop1, p_drop2 = random_string(), random_string() + p_far = random_string() + table.put_item(Item={'p': p_keep1, 'v': [1, 0, 0], 'x': 'keep'}) + table.put_item(Item={'p': p_drop1, 'v': [1, Decimal("0.1"), 0], 'x': 'drop'}) + table.put_item(Item={'p': p_keep2, 'v': [1, Decimal("0.2"), 0], 'x': 'keep'}) + table.put_item(Item={'p': p_drop2, 'v': [1, Decimal("0.3"), 0], 'x': 'drop'}) + table.put_item(Item={'p': p_far, 'v': [1, Decimal("0.4"), 0], 'x': 'keep'}) + nearest_ps = {p_keep1, p_keep2, p_drop1, p_drop2} # 4 nearest neighbors + keep_ps = {p_keep1, p_keep2} # x='keep' items among 4 nearest neighbors + table.update(VectorIndexUpdates=[{'Create': + {'IndexName': 'vind', + 'VectorAttribute': {'AttributeName': 'v', 'Dimensions': 3}}}]) + # Wait until nearest 4 items (nearest_ps) are visible in a query + # without a filter. + deadline = time.monotonic() + VECTOR_STORE_TIMEOUT + while True: + try: + result = table.query( + IndexName='vind', + VectorSearch={'QueryVector': [1, 0, 0]}, + Limit=4, + ) + if {item['p'] for item in result.get('Items', [])} == nearest_ps: + break + except ClientError: + pass + if time.monotonic() > deadline: + pytest.fail('Timed out waiting for all items to be indexed') + time.sleep(0.1) + # Query with a FilterExpression that matches 2 of the 4 nearest + # candidates (Limit=4). We expect Count=2 and ScannedCount=4. Note + # that even though p_far also has x=keep, it was not among the 4 + # nearest neighbors - so it will not be included. 
+ result = table.query( + IndexName='vind', + VectorSearch={'QueryVector': [1, 0, 0]}, + Limit=4, + Select=select, + FilterExpression='x = :want', + ExpressionAttributeValues={':want': 'keep'}, + ) + assert result['Count'] == 2 + assert result['ScannedCount'] == 4 + if select == 'COUNT': + assert 'Items' not in result + else: + assert {item['p'] for item in result['Items']} == keep_ps + +# Test FilterExpression for post-filtering vector search results with +# Select=SPECIFIC_ATTRIBUTES. Here the full items are not returned, but still +# need to be retrieved from the base table - including attributes which are +# needed by the filter but not returned in the final results. +def test_query_vectorsearch_filter_expression_specific_attributes(vs, needs_vector_store): + with new_test_table(vs, + KeySchema=[{'AttributeName': 'p', 'KeyType': 'HASH'}], + AttributeDefinitions=[{'AttributeName': 'p', 'AttributeType': 'S'}]) as table: + # Same 5-item setup as test_query_vectorsearch_filter_expression. + # p_far is the furthest and won't be among the 4 nearest with Limit=4. + p_keep1, p_keep2 = random_string(), random_string() + p_drop1, p_drop2 = random_string(), random_string() + p_far = random_string() + table.put_item(Item={'p': p_keep1, 'v': [1, 0, 0], 'x': 'keep'}) + table.put_item(Item={'p': p_drop1, 'v': [1, Decimal("0.1"), 0], 'x': 'drop'}) + table.put_item(Item={'p': p_keep2, 'v': [1, Decimal("0.2"), 0], 'x': 'keep'}) + table.put_item(Item={'p': p_drop2, 'v': [1, Decimal("0.3"), 0], 'x': 'drop'}) + table.put_item(Item={'p': p_far, 'v': [1, Decimal("0.4"), 0], 'x': 'keep'}) + nearest_ps = {p_keep1, p_keep2, p_drop1, p_drop2} + table.update(VectorIndexUpdates=[{'Create': + {'IndexName': 'vind', + 'VectorAttribute': {'AttributeName': 'v', 'Dimensions': 3}}}]) + # Wait until the 4 nearest items are visible without a filter. 
+ deadline = time.monotonic() + VECTOR_STORE_TIMEOUT + while True: + try: + result = table.query( + IndexName='vind', + VectorSearch={'QueryVector': [1, 0, 0]}, + Limit=4, + ) + if {item['p'] for item in result.get('Items', [])} == nearest_ps: + break + except ClientError: + pass + if time.monotonic() > deadline: + pytest.fail('Timed out waiting for all items to be indexed') + time.sleep(0.1) + # Query with Select=SPECIFIC_ATTRIBUTES projecting only 'p', but + # FilterExpression uses 'x' which is NOT in the projection. The + # implementation must still retrieve 'x' from the base table to + # evaluate the filter, even though 'x' is not returned to the caller. + result = table.query( + IndexName='vind', + VectorSearch={'QueryVector': [1, 0, 0]}, + Limit=4, + Select='SPECIFIC_ATTRIBUTES', + ProjectionExpression='p', + FilterExpression='x = :want', + ExpressionAttributeValues={':want': 'keep'}, + ) + assert result['Count'] == 2 + assert result['ScannedCount'] == 4 + # Items should contain only 'p' (the projected attribute), not 'x' + # (the filter attribute that was not projected). + assert result['Items'] == [{'p': p_keep1}, {'p': p_keep2}] + +# Test FilterExpression with Select=SPECIFIC_ATTRIBUTES and a nested +# ProjectionExpression (e.g. 'x.a'). Only the requested sub-attribute +# should be returned, not the entire top-level attribute. +def test_query_vectorsearch_filter_expression_nested_projection(vs, needs_vector_store): + with new_test_table(vs, + KeySchema=[{'AttributeName': 'p', 'KeyType': 'HASH'}], + AttributeDefinitions=[{'AttributeName': 'p', 'AttributeType': 'S'}]) as table: + p = random_string() + # Item has a nested attribute 'x' with sub-attributes 'a' and 'b'. + # The FilterExpression uses 'y', which is not in the projection. 
+ table.put_item(Item={'p': p, 'v': [1, 0, 0], 'x': {'a': 'keep', 'b': 'drop'}, 'y': 'pass'}) + table.update(VectorIndexUpdates=[{'Create': + {'IndexName': 'vind', + 'VectorAttribute': {'AttributeName': 'v', 'Dimensions': 3}}}]) + wait_for_vector_index_active(table, 'vind') + # ProjectionExpression requests only the nested attribute 'x.a' (and 'p'). + # FilterExpression uses 'y', which is not in the projection at all. + # The result should contain only 'p' and 'x': {'a': 'keep'} - the + # 'b' sub-attribute of 'x' must not appear, and 'y' must not appear. + result = table.query( + IndexName='vind', + VectorSearch={'QueryVector': [1, 0, 0]}, + Limit=1, + Select='SPECIFIC_ATTRIBUTES', + ProjectionExpression='p, x.a', + FilterExpression='y = :want', + ExpressionAttributeValues={':want': 'pass'}, + ) + assert result['Count'] == 1 + assert result['ScannedCount'] == 1 + assert result['Items'] == [{'p': p, 'x': {'a': 'keep'}}] + +# Test that garbage values (like "dog" or "Inf") for the "N"-typed numbers +# are not allowed as vector attribute values given as a list of numbers. +# They should be rejected with a validation error both before the index is +# created (this test) and after (the next test), because such values are not +# allowed as "N" variables - regardless of vector search. +# This test (the "before") doesn't need vector search and can also run on +# DynamoDB. It reproduces issue #8070 - where Alternator validates number +# values, but forgot to validate numbers when they are inside a list. +@pytest.mark.xfail(reason='issue #8070 - Alternator did not validate "N" values inside lists') +def test_putitem_vector_bad_number_string_before(test_table_s): + p = random_string() + # boto3 normally validates number strings before sending them to the + # server, so we need client_no_transform to bypass that validation and + # let the server reject the bad values itself. 
+ with client_no_transform(test_table_s.meta.client) as client: + for bad_num in ['dog', 'Inf', 'NaN', 'Infinity', '-Infinity']: + with pytest.raises(ClientError, match='ValidationException'): + client.put_item( + TableName=test_table_s.name, + Item={ + 'p': {'S': p}, + 'v': {'L': [{'N': '1'}, {'N': bad_num}, {'N': '0'}]}, + }) + +def test_putitem_vector_bad_number_string_after(table_vs): + p = random_string() + # After the vector index is created, invalid "N" strings in a list + # must be rejected - they remain invalid DynamoDB numbers. + with client_no_transform(table_vs.meta.client) as client: + for bad_num in ['dog', 'Inf', 'NaN', 'Infinity', '-Infinity']: + with pytest.raises(ClientError, match='ValidationException'): + client.put_item( + TableName=table_vs.name, + Item={ + 'p': {'S': p}, + 'v': {'L': [{'N': '1'}, {'N': bad_num}, {'N': '0'}]}, + }) + +# Test that a Query with a vector with a non-numeric "N" element, like "dog" +# or "Inf", is rejected with a validation error. Note that the Query path +# does not convert the numbers to Alternator's internal type ("decimal") so +# the validation path is different, so we need to check it. +def test_query_vectorsearch_queryvector_bad_number_string(table_vs, needs_vector_store): + # boto3 validates number strings before sending them, so we use + # client_no_transform to bypass that and let the server reject them. + with client_no_transform(table_vs.meta.client) as client: + for bad_num in ['dog', 'Inf', 'NaN', 'Infinity', '-Infinity']: + with pytest.raises(ClientError, match='ValidationException.*not a valid number'): + client.query( + TableName=table_vs.name, + IndexName='vind', + VectorSearch={'QueryVector': {'L': [{'N': '1'}, {'N': bad_num}, {'N': '0'}]}}, + Limit=1, + ) + +############################################################################## +# CONTINUE HERE - MAKE A DECISION! 
PRE-FILTERING: +# Tests *pre-filtering* for filtering on projected attributes, which can be (if +# we continue the implementation) pushed to the vector store or right now - +# key columns. +# We need to decide: In DynamoDB pre-filtering is done with +# KeyConditionExpression NOT FilterExpression. +# KeyConditionExpression is the traditional approach in Query, but is a bit +# weird because these aren't really "keys" (even though in CQL CREATE TABLE +# syntax we pretend they are - and today, they really are keys). +# Do we want to force the user to put this pre-filtering in KeyConditionExpression +# instead of FilterExpression, and only allow FilterExpression for post-filtering +# that must be done in Scylla? +# Alternatively, we could put everything in FilterExpression. This is less +# consistent with DynamoDB's usual semantics but simpler for users. +############################################################################ + +# WRITE TEST: +# Test that if the FilterExpression happens to only use projected attributes +# (by default this means key attributes) - or if we decide (see above) that +# it's KeyConditionExpression, then it can be, and is, sent to the vector +# store and performed there. We can check that this happens by noticing that +# we get a full LIMIT of results, and not less. + +# WRITE TEST: +# Test FilterExpression for post-filtering vector search results with +# Select=ALL_PROJECTED_ATTRIBUTES where the filter only needs projected +# attributes (currently those are key attributes). Here it is important +# for efficiency that the vector index applies the filter and we do not need +# to retrieve the full items from the base table at all. We can verify that +# this code path was reached by checking that we got back LIMIT results, and +# not fewer. + +# TODO: Like test_vector_projection_keys_only, write additional tests for +# ProjectionType=INCLUDE with NonKeyAttributes and ProjectionType=ALL. 
+# We don't yet support this feature, so I didn't bother to write such a
+# test yet, but I can write an xfailing test because we know what we
+# expect ALL_PROJECTED_ATTRIBUTES to return in that case.
+
+# TODO: test enabling vector index and Alternator Streams together, and
+# checking that Alternator Streams works as expected. Also we may need to
+# do something to avoid vector search's favorite parameters like TTL and
+# post-changes to take control - or vice versa we may get CDC which isn't
+# good enough for vector search.
+# Note that today, Alternator Streams only works with vnodes while vector
+# search doesn't work with vnodes - so we can't actually check this
+# combination! But we must check it when Alternator Streams finally supports
+# tablets.
diff --git a/vector_search/vector_store_client.cc b/vector_search/vector_store_client.cc
index 04833feca9..4d7b708c26 100644
--- a/vector_search/vector_store_client.cc
+++ b/vector_search/vector_store_client.cc
@@ -297,6 +297,36 @@ struct vector_store_client::impl {
         return _primary_uris.empty() && _secondary_uris.empty();
     }
 
+    // Ask the vector store for the status of one index, via
+    // GET /api/v1/indexes/{keyspace}/{index}/status.
+    // Policy: every failure mode - disabled client, HTTP error, unparsable
+    // JSON, a missing/non-string "status" field, or an unrecognized status
+    // string - is conservatively reported as index_status::creating, so
+    // callers never need a separate error path.
+    // NOTE(review): template arguments (e.g. on `future`) appear stripped in
+    // this extract - presumably future<index_status>; confirm against the
+    // original file.
+    auto get_index_status(keyspace_name keyspace, index_name name, abort_source& as)
+        -> future {
+        using index_status = vector_store_client::index_status;
+        if (is_disabled()) {
+            co_return index_status::creating;
+        }
+        auto path = format("/api/v1/indexes/{}/{}/status", keyspace, name);
+        auto resp = co_await request(operation_type::GET, std::move(path), std::nullopt, as);
+        if (!resp || resp->status != status_type::ok) {
+            co_return index_status::creating;
+        }
+        try {
+            auto json = rjson::parse(response_content_to_sstring(resp->content));
+            const auto* status = rjson::find(json, "status");
+            if (!status || !status->IsString()) {
+                co_return index_status::creating;
+            }
+            auto sv = rjson::to_string_view(*status);
+            if (sv == "SERVING") {
+                co_return index_status::serving;
+            }
+            if (sv == "BOOTSTRAPPING") {
+                co_return index_status::backfilling;
+            }
+            // Unknown status string from the vector store - treat as not ready.
+            co_return index_status::creating;
+        } catch (...) {
+            // rjson::parse (and friends) may throw on malformed content.
+            co_return index_status::creating;
+        }
+    }
+
     auto ann(keyspace_name keyspace, index_name name, schema_ptr schema, vs_vector vs_vector, limit limit, const rjson::value& filter, abort_source& as)
         -> future> {
         if (is_disabled()) {
@@ -376,6 +406,10 @@ auto vector_store_client::is_disabled() const -> bool {
     return _impl->is_disabled();
 }
 
+// Public entry point: forwards to the pimpl's coroutine; the by-value
+// keyspace/index names are moved into the impl call.
+auto vector_store_client::get_index_status(keyspace_name keyspace, index_name name, abort_source& as) -> future {
+    return _impl->get_index_status(std::move(keyspace), std::move(name), as);
+}
+
 auto vector_store_client::ann(keyspace_name keyspace, index_name name, schema_ptr schema, vs_vector vs_vector, limit limit, const rjson::value& filter, abort_source& as)
     -> future> {
     return _impl->ann(keyspace, name, schema, vs_vector, limit, filter, as);
diff --git a/vector_search/vector_store_client.hh b/vector_search/vector_store_client.hh
index fdb9d21b75..27514f96dd 100644
--- a/vector_search/vector_store_client.hh
+++ b/vector_search/vector_store_client.hh
@@ -74,6 +74,21 @@ public:
     /// Check if the vector_store_client is disabled.
     auto is_disabled() const -> bool;
 
+    /// The operational status of a single vector index, as reported by the vector store.
+    enum class index_status {
+        /// The index is not yet ready: initializing, not yet discovered, or the
+        /// vector store is unreachable.
+        creating,
+        /// The index is performing the initial full scan of the base table
+        /// (backfilling). Queries may be served but results are incomplete.
+        backfilling,
+        /// The index has completed the initial scan and is fully operational.
+        serving,
+    };
+
+    /// Query the vector store for the current status of a specific vector index.
+    /// Never fails from the caller's point of view: any error (disabled
+    /// client, unreachable store, bad response) is reported as
+    /// index_status::creating.
+    auto get_index_status(keyspace_name keyspace, index_name name, abort_source& as) -> future;
+
     /// Request the vector store service for the primary keys of the nearest neighbors
     auto ann(keyspace_name keyspace, index_name name, schema_ptr schema, vs_vector vs_vector, limit limit, const rjson::value& filter, abort_source& as)
         -> future>;