Convert most use sites from `co_return coroutine::make_exception`
to `co_await coroutine::return_exception{,_ptr}` where possible.
In cases where this is done inside a catch clause, convert to
`co_return coroutine::exception` instead, generating an exception_ptr
if needed.
Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
Closes #10972
4415 lines
223 KiB
C++
4415 lines
223 KiB
C++
/*
|
|
* Copyright 2019-present ScyllaDB
|
|
*/
|
|
|
|
/*
|
|
* SPDX-License-Identifier: AGPL-3.0-or-later
|
|
*/
|
|
|
|
#include <regex>
|
|
|
|
#include "utils/base64.hh"
|
|
|
|
#include <seastar/core/sleep.hh>
|
|
#include "alternator/executor.hh"
|
|
#include "log.hh"
|
|
#include "schema_builder.hh"
|
|
#include "data_dictionary/keyspace_metadata.hh"
|
|
#include "exceptions/exceptions.hh"
|
|
#include "timestamp.hh"
|
|
#include "types/map.hh"
|
|
#include "schema.hh"
|
|
#include "query-request.hh"
|
|
#include "query-result-reader.hh"
|
|
#include "cql3/selection/selection.hh"
|
|
#include "cql3/result_set.hh"
|
|
#include "cql3/type_json.hh"
|
|
#include "bytes.hh"
|
|
#include "cql3/update_parameters.hh"
|
|
#include "server.hh"
|
|
#include "service/pager/query_pagers.hh"
|
|
#include <functional>
|
|
#include "error.hh"
|
|
#include "serialization.hh"
|
|
#include "expressions.hh"
|
|
#include "conditions.hh"
|
|
#include "cql3/constants.hh"
|
|
#include <optional>
|
|
#include "utils/overloaded_functor.hh"
|
|
#include <seastar/json/json_elements.hh>
|
|
#include <boost/algorithm/cxx11/any_of.hpp>
|
|
#include "collection_mutation.hh"
|
|
#include "db/query_context.hh"
|
|
#include "schema.hh"
|
|
#include "alternator/tags_extension.hh"
|
|
#include "alternator/rmw_operation.hh"
|
|
#include <seastar/core/coroutine.hh>
|
|
#include <boost/range/adaptors.hpp>
|
|
#include <boost/range/algorithm/find_end.hpp>
|
|
#include <unordered_set>
|
|
#include "service/storage_proxy.hh"
|
|
#include "gms/gossiper.hh"
|
|
#include "schema_registry.hh"
|
|
#include "utils/error_injection.hh"
|
|
#include "db/schema_tables.hh"
|
|
#include "utils/rjson.hh"
|
|
|
|
logging::logger elogger("alternator-executor");
|
|
|
|
namespace alternator {
|
|
|
|
static future<std::vector<mutation>> create_keyspace(std::string_view keyspace_name, service::storage_proxy& sp, service::migration_manager& mm, gms::gossiper& gossiper, api::timestamp_type);
|
|
|
|
// Returns the map type (utf8 keys -> bytes values) used for the attributes
// column. The instance is looked up once per thread and then reused.
static map_type attrs_type() {
    static thread_local auto cached_type = map_type_impl::get_instance(utf8_type, bytes_type, true);
    return cached_type;
}
|
|
|
|
// Returns the definition of the column named executor::ATTRS_COLUMN_NAME in
// the given schema. The column is asserted to exist.
static const column_definition& attrs_column(const schema& schema) {
    const auto* attrs_def = schema.get_column_definition(bytes(executor::ATTRS_COLUMN_NAME));
    assert(attrs_def);
    return *attrs_def;
}
|
|
|
|
// Takes ownership of the given JSON value; it is serialized later by to_json().
make_jsonable::make_jsonable(rjson::value&& value) : _value(std::move(value)) {
}
|
|
std::string make_jsonable::to_json() const {
|
|
return rjson::print(_value);
|
|
}
|
|
|
|
// Wraps a JSON value in a callable that streams its textual form into an
// output stream, then flushes and closes that stream.
json::json_return_type make_streamed(rjson::value&& value) {
    // json::json_return_type uses std::function, not noncopyable_function,
    // so the callable must be copyable. Keep the value behind a shared
    // pointer so that copying the lambda does not copy the value.
    auto shared_value = make_shared<rjson::value>(std::move(value));
    std::function<future<>(output_stream<char>&&)> writer = [shared_value](output_stream<char>&& os) mutable -> future<> {
        // Move the stream and the value into the coroutine frame before the
        // first suspension point.
        auto out = std::move(os);
        auto doc = std::move(shared_value);
        try {
            co_await rjson::print(*doc, out);
            co_await out.flush();
            co_await out.close();
        } catch (...) {
            // Nothing useful can be done here: the HTTP headers and return
            // code have already been written, and quite possibly part of the
            // content as well. Log and rethrow; it is probably better that
            // the HTTP server closes the connection abruptly.
            elogger.error("Unhandled exception in data streaming: {}", std::current_exception());
            throw;
        }
        co_return;
    };
    return writer;
}
|
|
|
|
// Takes ownership of a string that to_json() later returns unchanged.
json_string::json_string(std::string&& value) : _value(std::move(value)) {
}
|
|
std::string json_string::to_json() const {
|
|
return _value;
|
|
}
|
|
|
|
// Adds CreationDateTime (the current time, in seconds), TableStatus ("ACTIVE")
// and TableId to the given table description, and then lets
// supplement_table_stream_info() add its own fields.
void executor::supplement_table_info(rjson::value& descr, const schema& schema, service::storage_proxy& sp) {
    const auto since_epoch = gc_clock::now().time_since_epoch();
    rjson::add(descr, "CreationDateTime", rjson::value(std::chrono::duration_cast<std::chrono::seconds>(since_epoch).count()));
    rjson::add(descr, "TableStatus", "ACTIVE");

    auto table_id = schema.id().to_sstring();
    rjson::add(descr, "TableId", rjson::from_string(table_id));

    executor::supplement_table_stream_info(descr, schema, sp);
}
|
|
|
|
// We would have liked to support table names up to 255 bytes, like DynamoDB.
|
|
// But Scylla creates a directory whose name is the table's name plus 33
|
|
// bytes (dash and UUID), and since directory names are limited to 255 bytes,
|
|
// we need to limit table names to 222 bytes, instead of 255.
|
|
// See https://github.com/scylladb/scylla/issues/4480
|
|
static constexpr int max_table_name_length = 222;
|
|
|
|
// The DynamoDB developer guide, https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/HowItWorks.NamingRulesDataTypes.html#HowItWorks.NamingRules
|
|
// specifies that table names "names must be between 3 and 255 characters long
|
|
// and can contain only the following characters: a-z, A-Z, 0-9, _ (underscore), - (dash), . (dot)
|
|
// validate_table_name throws the appropriate api_error if this validation fails.
|
|
static void validate_table_name(const std::string& name) {
|
|
if (name.length() < 3 || name.length() > max_table_name_length) {
|
|
throw api_error::validation(
|
|
format("TableName must be at least 3 characters long and at most {} characters long", max_table_name_length));
|
|
}
|
|
static const std::regex valid_table_name_chars ("[a-zA-Z0-9_.-]*");
|
|
if (!std::regex_match(name.c_str(), valid_table_name_chars)) {
|
|
throw api_error::validation(
|
|
"TableName must satisfy regular expression pattern: [a-zA-Z0-9_.-]+");
|
|
}
|
|
}
|
|
|
|
// In DynamoDB index names are local to a table, while in Scylla, materialized
|
|
// view names are global (in a keyspace). So we need to compose a unique name
|
|
// for the view taking into account both the table's name and the index name.
|
|
// We concatenate the table and index name separated by a delim character
|
|
// (a character not allowed by DynamoDB in ordinary table names, default: ":").
|
|
// The downside of this approach is that it limits the sum of the lengths,
|
|
// instead of each component individually as DynamoDB does.
|
|
// The view_name() function assumes the table_name has already been validated
|
|
// but validates the legality of index_name and the combination of both.
|
|
static std::string view_name(const std::string& table_name, std::string_view index_name, const std::string& delim = ":") {
|
|
static const std::regex valid_index_name_chars ("[a-zA-Z0-9_.-]*");
|
|
if (index_name.length() < 3) {
|
|
throw api_error::validation("IndexName must be at least 3 characters long");
|
|
}
|
|
if (!std::regex_match(index_name.data(), valid_index_name_chars)) {
|
|
throw api_error::validation(
|
|
format("IndexName '{}' must satisfy regular expression pattern: [a-zA-Z0-9_.-]+", index_name));
|
|
}
|
|
std::string ret = table_name + delim + std::string(index_name);
|
|
if (ret.length() > max_table_name_length) {
|
|
throw api_error::validation(
|
|
format("The total length of TableName ('{}') and IndexName ('{}') cannot exceed {} characters",
|
|
table_name, index_name, max_table_name_length - delim.size()));
|
|
}
|
|
return ret;
|
|
}
|
|
|
|
static std::string lsi_name(const std::string& table_name, std::string_view index_name) {
|
|
return view_name(table_name, index_name, "!:");
|
|
}
|
|
|
|
/** Extract table name from a request.
|
|
* Most requests expect the table's name to be listed in a "TableName" field.
|
|
* This convenience function returns the name, with appropriate validation
|
|
* and api_error in case the table name is missing or not a string, or
|
|
* doesn't pass validate_table_name().
|
|
*/
|
|
static std::optional<std::string> find_table_name(const rjson::value& request) {
|
|
const rjson::value* table_name_value = rjson::find(request, "TableName");
|
|
if (!table_name_value) {
|
|
return std::nullopt;
|
|
}
|
|
if (!table_name_value->IsString()) {
|
|
throw api_error::validation("Non-string TableName field in request");
|
|
}
|
|
std::string table_name = table_name_value->GetString();
|
|
validate_table_name(table_name);
|
|
return table_name;
|
|
}
|
|
|
|
// Like find_table_name(), but a missing "TableName" field is an error:
// an api_error::validation is thrown instead of returning an empty optional.
static std::string get_table_name(const rjson::value& request) {
    if (auto maybe_name = find_table_name(request)) {
        return *maybe_name;
    }
    throw api_error::validation("Missing TableName field in request");
}
|
|
|
|
/** Extract table schema from a request.
|
|
* Many requests expect the table's name to be listed in a "TableName" field
|
|
* and need to look it up as an existing table. This convenience function
|
|
* does this, with the appropriate validation and api_error in case the table
|
|
* name is missing, invalid or the table doesn't exist. If everything is
|
|
* successful, it returns the table's schema.
|
|
*/
|
|
schema_ptr executor::find_table(service::storage_proxy& proxy, const rjson::value& request) {
|
|
auto table_name = find_table_name(request);
|
|
if (!table_name) {
|
|
return nullptr;
|
|
}
|
|
try {
|
|
return proxy.data_dictionary().find_schema(sstring(executor::KEYSPACE_NAME_PREFIX) + sstring(*table_name), *table_name);
|
|
} catch(data_dictionary::no_such_column_family&) {
|
|
throw api_error::resource_not_found(
|
|
format("Requested resource not found: Table: {} not found", *table_name));
|
|
}
|
|
}
|
|
|
|
// Like executor::find_table(), but a request without a "TableName" field
// results in an api_error instead of a null schema.
schema_ptr get_table(service::storage_proxy& proxy, const rjson::value& request) {
    schema_ptr found = executor::find_table(proxy, request);
    if (found) {
        return found;
    }
    // A null schema means the name was missing, because an invalid name or a
    // nonexistent table makes find_table() throw. This is the slow path: just
    // call get_table_name() so that it generates the exception.
    get_table_name(request);
    return found;
}
|
|
|
|
// Tries to interpret table_name as INTERNAL_TABLE_PREFIX followed by
// "<keyspace>.<table>". On success returns {true, keyspace, table}, where
// both views point into the table_name argument. Returns {false, "", ""} if
// the prefix or the '.' separator is missing, or if the keyspace doesn't
// exist or is not an internal one.
static std::tuple<bool, std::string_view, std::string_view> try_get_internal_table(data_dictionary::database db, std::string_view table_name) {
    using result_type = std::tuple<bool, std::string_view, std::string_view>;
    const result_type not_internal{false, "", ""};

    if (table_name.find(executor::INTERNAL_TABLE_PREFIX) != 0) {
        return not_internal;
    }
    table_name.remove_prefix(executor::INTERNAL_TABLE_PREFIX.size());

    const size_t dot_pos = table_name.find_first_of('.');
    if (dot_pos == std::string_view::npos) {
        return not_internal;
    }
    const std::string_view ks_name = table_name.substr(0, dot_pos);
    // Skip the keyspace name and the '.' following it.
    table_name.remove_prefix(dot_pos + 1);

    // Only internal keyspaces can be accessed to avoid leakage
    auto ks = db.try_find_keyspace(ks_name);
    if (ks && ks->is_internal()) {
        return {true, ks_name, table_name};
    }
    return not_internal;
}
|
|
|
|
// get_table_or_view() is similar to to get_table(), except it returns either
|
|
// a table or a materialized view from which to read, based on the TableName
|
|
// and optional IndexName in the request. Only requests like Query and Scan
|
|
// which allow IndexName should use this function.
|
|
enum class table_or_view_type { base, lsi, gsi };
|
|
static std::pair<schema_ptr, table_or_view_type>
|
|
get_table_or_view(service::storage_proxy& proxy, const rjson::value& request) {
|
|
table_or_view_type type = table_or_view_type::base;
|
|
std::string table_name = get_table_name(request);
|
|
|
|
auto [is_internal_table, internal_ks_name, internal_table_name] = try_get_internal_table(proxy.data_dictionary(), table_name);
|
|
if (is_internal_table) {
|
|
try {
|
|
return { proxy.data_dictionary().find_schema(sstring(internal_ks_name), sstring(internal_table_name)), type };
|
|
} catch (data_dictionary::no_such_column_family&) {
|
|
throw api_error::resource_not_found(
|
|
format("Requested resource not found: Internal table: {}.{} not found", internal_ks_name, internal_table_name));
|
|
}
|
|
}
|
|
|
|
std::string keyspace_name = executor::KEYSPACE_NAME_PREFIX + table_name;
|
|
const rjson::value* index_name = rjson::find(request, "IndexName");
|
|
std::string orig_table_name;
|
|
if (index_name) {
|
|
if (index_name->IsString()) {
|
|
orig_table_name = std::move(table_name);
|
|
table_name = view_name(orig_table_name, rjson::to_string_view(*index_name));
|
|
type = table_or_view_type::gsi;
|
|
} else {
|
|
throw api_error::validation(
|
|
format("Non-string IndexName '{}'", rjson::to_string_view(*index_name)));
|
|
}
|
|
// If no tables for global indexes were found, the index may be local
|
|
if (!proxy.data_dictionary().has_schema(keyspace_name, table_name)) {
|
|
type = table_or_view_type::lsi;
|
|
table_name = lsi_name(orig_table_name, rjson::to_string_view(*index_name));
|
|
}
|
|
}
|
|
|
|
try {
|
|
return { proxy.data_dictionary().find_schema(keyspace_name, table_name), type };
|
|
} catch(data_dictionary::no_such_column_family&) {
|
|
if (index_name) {
|
|
// DynamoDB returns a different error depending on whether the
|
|
// base table doesn't exist (ResourceNotFoundException) or it
|
|
// does exist but the index does not (ValidationException).
|
|
if (proxy.data_dictionary().has_schema(keyspace_name, orig_table_name)) {
|
|
throw api_error::validation(
|
|
format("Requested resource not found: Index '{}' for table '{}'", index_name->GetString(), orig_table_name));
|
|
} else {
|
|
throw api_error::resource_not_found(
|
|
format("Requested resource not found: Table: {} not found", orig_table_name));
|
|
}
|
|
} else {
|
|
throw api_error::resource_not_found(
|
|
format("Requested resource not found: Table: {} not found", table_name));
|
|
}
|
|
}
|
|
}
|
|
|
|
// Convenience function for getting the value of a string attribute, or a
|
|
// default value if it is missing. If the attribute exists, but is not a
|
|
// string, a descriptive api_error is thrown.
|
|
static std::string get_string_attribute(const rjson::value& value, std::string_view attribute_name, const char* default_return) {
|
|
const rjson::value* attribute_value = rjson::find(value, attribute_name);
|
|
if (!attribute_value)
|
|
return default_return;
|
|
if (!attribute_value->IsString()) {
|
|
throw api_error::validation(format("Expected string value for attribute {}, got: {}",
|
|
attribute_name, value));
|
|
}
|
|
return std::string(attribute_value->GetString(), attribute_value->GetStringLength());
|
|
}
|
|
|
|
// Convenience function for getting the value of a boolean attribute, or a
|
|
// default value if it is missing. If the attribute exists, but is not a
|
|
// bool, a descriptive api_error is thrown.
|
|
static bool get_bool_attribute(const rjson::value& value, std::string_view attribute_name, bool default_return) {
|
|
const rjson::value* attribute_value = rjson::find(value, attribute_name);
|
|
if (!attribute_value) {
|
|
return default_return;
|
|
}
|
|
if (!attribute_value->IsBool()) {
|
|
throw api_error::validation(format("Expected boolean value for attribute {}, got: {}",
|
|
attribute_name, value));
|
|
}
|
|
return attribute_value->GetBool();
|
|
}
|
|
|
|
// Convenience function for getting the value of an integer attribute, or
|
|
// an empty optional if it is missing. If the attribute exists, but is not
|
|
// an integer, a descriptive api_error is thrown.
|
|
static std::optional<int> get_int_attribute(const rjson::value& value, std::string_view attribute_name) {
|
|
const rjson::value* attribute_value = rjson::find(value, attribute_name);
|
|
if (!attribute_value)
|
|
return {};
|
|
if (!attribute_value->IsInt()) {
|
|
throw api_error::validation(format("Expected integer value for attribute {}, got: {}",
|
|
attribute_name, value));
|
|
}
|
|
return attribute_value->GetInt();
|
|
}
|
|
|
|
// Sets a KeySchema object inside the given JSON parent describing the key
|
|
// attributes of the the given schema as being either HASH or RANGE keys.
|
|
// Additionally, adds to a given map mappings between the key attribute
|
|
// names and their type (as a DynamoDB type string).
|
|
void executor::describe_key_schema(rjson::value& parent, const schema& schema, std::unordered_map<std::string,std::string>* attribute_types) {
|
|
rjson::value key_schema = rjson::empty_array();
|
|
for (const column_definition& cdef : schema.partition_key_columns()) {
|
|
rjson::value key = rjson::empty_object();
|
|
rjson::add(key, "AttributeName", rjson::from_string(cdef.name_as_text()));
|
|
rjson::add(key, "KeyType", "HASH");
|
|
rjson::push_back(key_schema, std::move(key));
|
|
if (attribute_types) {
|
|
(*attribute_types)[cdef.name_as_text()] = type_to_string(cdef.type);
|
|
}
|
|
}
|
|
for (const column_definition& cdef : schema.clustering_key_columns()) {
|
|
rjson::value key = rjson::empty_object();
|
|
rjson::add(key, "AttributeName", rjson::from_string(cdef.name_as_text()));
|
|
rjson::add(key, "KeyType", "RANGE");
|
|
rjson::push_back(key_schema, std::move(key));
|
|
if (attribute_types) {
|
|
(*attribute_types)[cdef.name_as_text()] = type_to_string(cdef.type);
|
|
}
|
|
// FIXME: this "break" can avoid listing some clustering key columns
|
|
// we added for GSIs just because they existed in the base table -
|
|
// but not in all cases. We still have issue #5320. See also
|
|
// reproducer in test_gsi_2_describe_table_schema.
|
|
break;
|
|
}
|
|
rjson::add(parent, "KeySchema", std::move(key_schema));
|
|
|
|
}
|
|
|
|
// Convenience overload for callers who always want the key attribute types
// collected: forwards to the pointer-taking overload with the map's address.
void executor::describe_key_schema(rjson::value& parent, const schema& schema, std::unordered_map<std::string,std::string>& attribute_types) {
    std::unordered_map<std::string,std::string>* types_out = &attribute_types;
    describe_key_schema(parent, schema, types_out);
}
|
|
|
|
// Builds the ARN of a table as a JSON string, in the form
// arn:scylla:alternator:<keyspace name>:scylla:table/<table name>
static rjson::value generate_arn_for_table(const schema& schema) {
    auto arn = format("arn:scylla:alternator:{}:scylla:table/{}", schema.ks_name(), schema.cf_name());
    return rjson::from_string(arn);
}
|
|
|
|
// Builds the ARN of an index as a JSON string, in the form
// arn:scylla:alternator:<keyspace name>:scylla:table/<table name>/index/<index name>
// where the keyspace and table names are taken from the given schema.
static rjson::value generate_arn_for_index(const schema& schema, std::string_view index_name) {
    auto arn = format("arn:scylla:alternator:{}:scylla:table/{}/index/{}",
            schema.ks_name(), schema.cf_name(), index_name);
    return rjson::from_string(arn);
}
|
|
|
|
bool is_alternator_keyspace(const sstring& ks_name) {
|
|
return ks_name.find(executor::KEYSPACE_NAME_PREFIX) == 0;
|
|
}
|
|
|
|
// Returns the table name for a schema, which is simply the schema's
// column family name.
sstring executor::table_name(const schema& s) {
    sstring name = s.cf_name();
    return name;
}
|
|
|
|
// Implements the DescribeTable operation: builds a "Table" description of
// the table named in the request, including its key schema, its secondary
// indexes (taken from the table's views), attribute definitions for all key
// columns, and stream information.
// The table lookup is done by get_table(), which throws an api_error if the
// table name is missing or invalid or the table does not exist.
future<executor::request_return_type> executor::describe_table(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request) {
    _stats.api_operations.describe_table++;
    elogger.trace("Describing table {}", request);

    schema_ptr table_schema = get_table(_proxy, request);

    tracing::add_table_name(trace_state, table_schema->ks_name(), table_schema->cf_name());

    rjson::value table_description = rjson::empty_object();
    rjson::add(table_description, "TableName", rjson::from_string(table_schema->cf_name()));
    // FIXME: take the tables creation time, not the current time!
    size_t creation_date_seconds = std::chrono::duration_cast<std::chrono::seconds>(gc_clock::now().time_since_epoch()).count();
    rjson::add(table_description, "CreationDateTime", rjson::value(creation_date_seconds));
    // FIXME: In DynamoDB the CreateTable implementation is asynchronous, and
    // the table may be in "Creating" state until creating is finished.
    // We don't currently do this in Alternator - instead CreateTable waits
    // until the table is really available. So DescribeTable returns either
    // ACTIVE or doesn't exist at all (and DescribeTable returns an error).
    // The other states (CREATING, UPDATING, DELETING) are not currently
    // returned.
    rjson::add(table_description, "TableStatus", "ACTIVE");
    rjson::add(table_description, "TableArn", generate_arn_for_table(*table_schema));
    rjson::add(table_description, "TableId", rjson::from_string(table_schema->id().to_sstring()));
    // FIXME: Instead of hardcoding, we should take into account which mode was chosen
    // when the table was created. But, Spark jobs expect something to be returned
    // and PAY_PER_REQUEST seems closer to reality than PROVISIONED.
    rjson::value billing_summary = rjson::empty_object();
    rjson::add(billing_summary, "BillingMode", "PAY_PER_REQUEST");
    rjson::add(billing_summary, "LastUpdateToPayPerRequestDateTime", rjson::value(creation_date_seconds));
    rjson::add(table_description, "BillingModeSummary", std::move(billing_summary));

    std::unordered_map<std::string,std::string> key_attribute_types;
    // Add base table's KeySchema and collect types for AttributeDefinitions:
    describe_key_schema(table_description, *table_schema, key_attribute_types);

    data_dictionary::table t = _proxy.data_dictionary().find_column_family(table_schema);
    if (!t.views().empty()) {
        rjson::value gsi_array = rjson::empty_array();
        rjson::value lsi_array = rjson::empty_array();
        for (const view_ptr& vptr : t.views()) {
            rjson::value view_entry = rjson::empty_object();
            // The index name is whatever follows the first ':' in the view's name.
            const sstring& view_cf_name = vptr->cf_name();
            const size_t delim_pos = view_cf_name.find(':');
            if (delim_pos == sstring::npos) {
                elogger.error("Invalid internal index table name: {}", view_cf_name);
                continue;
            }
            sstring index_name = view_cf_name.substr(delim_pos + 1);
            rjson::add(view_entry, "IndexName", rjson::from_string(index_name));
            rjson::add(view_entry, "IndexArn", generate_arn_for_index(*table_schema, index_name));
            // Add indexes's KeySchema and collect types for AttributeDefinitions:
            describe_key_schema(view_entry, *vptr, key_attribute_types);
            // Local secondary indexes are marked by an extra '!' sign occurring before the ':' delimiter
            const bool is_lsi = delim_pos > 1 && view_cf_name[delim_pos - 1] == '!';
            rjson::push_back(is_lsi ? lsi_array : gsi_array, std::move(view_entry));
        }
        if (!lsi_array.Empty()) {
            rjson::add(table_description, "LocalSecondaryIndexes", std::move(lsi_array));
        }
        if (!gsi_array.Empty()) {
            rjson::add(table_description, "GlobalSecondaryIndexes", std::move(gsi_array));
        }
    }
    // Use map built by describe_key_schema() for base and indexes to produce
    // AttributeDefinitions for all key columns:
    rjson::value attribute_definitions = rjson::empty_array();
    for (auto& name_and_type : key_attribute_types) {
        rjson::value definition = rjson::empty_object();
        rjson::add(definition, "AttributeName", rjson::from_string(name_and_type.first));
        rjson::add(definition, "AttributeType", rjson::from_string(name_and_type.second));
        rjson::push_back(attribute_definitions, std::move(definition));
    }
    rjson::add(table_description, "AttributeDefinitions", std::move(attribute_definitions));

    supplement_table_stream_info(table_description, *table_schema, _proxy);

    // FIXME: still missing some response fields (issue #5026)

    rjson::value response = rjson::empty_object();
    rjson::add(response, "Table", std::move(table_description));
    elogger.trace("returning {}", response);
    return make_ready_future<executor::request_return_type>(make_jsonable(std::move(response)));
}
|
|
|
|
// Implements the DeleteTable operation: drops the table named in the request
// (together with its views) and its keyspace. The schema change is prepared
// and announced through the migration manager instance with index 0.
// Fails with api_error::resource_not_found if the table does not exist.
// On success, responds with a TableDescription whose status is "DELETING".
future<executor::request_return_type> executor::delete_table(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request) {
    _stats.api_operations.delete_table++;
    elogger.trace("Deleting table {}", request);

    std::string table_name = get_table_name(request);
    std::string keyspace_name = executor::KEYSPACE_NAME_PREFIX + table_name;
    tracing::add_table_name(trace_state, keyspace_name, table_name);
    auto& p = _proxy.container();

    co_await _mm.container().invoke_on(0, [&p, &keyspace_name, &table_name] (service::migration_manager& mm) -> future<> {
        // FIXME: the following needs to be in a loop. If mm.announce() below
        // fails, we need to retry the whole thing.
        auto group0_guard = co_await mm.start_group0_operation();

        const bool table_exists = p.local().data_dictionary().has_schema(keyspace_name, table_name);
        if (!table_exists) {
            throw api_error::resource_not_found(format("Requested resource not found: Table: {} not found", table_name));
        }

        // Collect the mutations dropping the table and those dropping its
        // keyspace, and announce all of them together.
        auto m = co_await mm.prepare_column_family_drop_announcement(keyspace_name, table_name, group0_guard.write_timestamp(), service::migration_manager::drop_views::yes);
        auto m2 = mm.prepare_keyspace_drop_announcement(keyspace_name, group0_guard.write_timestamp());

        std::move(m2.begin(), m2.end(), std::back_inserter(m));

        co_await mm.announce(std::move(m), std::move(group0_guard));
    });

    // FIXME: need more attributes?
    rjson::value table_description = rjson::empty_object();
    rjson::add(table_description, "TableName", rjson::from_string(table_name));
    rjson::add(table_description, "TableStatus", "DELETING");

    rjson::value response = rjson::empty_object();
    rjson::add(response, "TableDescription", std::move(table_description));
    elogger.trace("returning {}", response);
    co_return make_jsonable(std::move(response));
}
|
|
|
|
// Translates a key attribute type string into a column type.
// Note that keys are only allowed to be string, blob or number (S/B/N).
// The other types: boolean and various lists or sets - are not allowed, and
// result in an api_error::validation.
static data_type parse_key_type(const std::string& type) {
    if (type == "S") {
        return utf8_type;
    }
    if (type == "B") {
        return bytes_type;
    }
    if (type == "N") {
        // FIXME: use a specialized Alternator type, not the general "decimal_type".
        return decimal_type;
    }
    throw api_error::validation(
            format("Invalid key type '{}', can only be S, B or N.", type));
}
|
|
|
|
|
|
// Adds a key column with the given name and kind to the schema builder.
// The column's type is taken from the entry of attribute_definitions whose
// "AttributeName" equals the given name. Throws api_error::validation if
// there is no such entry, or if the name is the reserved ATTRS_COLUMN_NAME.
static void add_column(schema_builder& builder, const std::string& name, const rjson::value& attribute_definitions, column_kind kind) {
    // FIXME: Currently, the column name ATTRS_COLUMN_NAME is not allowed
    // because we use it for our untyped attribute map, and we can't have a
    // second column with the same name. We should fix this, by renaming
    // some column names which we want to reserve.
    if (name == executor::ATTRS_COLUMN_NAME) {
        throw api_error::validation(format("Column name '{}' is currently reserved. FIXME.", name));
    }
    for (auto def = attribute_definitions.Begin(); def != attribute_definitions.End(); ++def) {
        const rjson::value& attribute_info = *def;
        if (attribute_info["AttributeName"].GetString() != name) {
            continue;
        }
        // Found the definition of this key attribute.
        auto key_type = attribute_info["AttributeType"].GetString();
        builder.with_column(to_bytes(name), parse_key_type(key_type), kind);
        return;
    }
    throw api_error::validation(
            format("KeySchema key '{}' missing in AttributeDefinitions", name));
}
|
|
|
|
// Parse the KeySchema request attribute, which specifies the column names
|
|
// for a key. A KeySchema must include up to two elements, the first must be
|
|
// the HASH key name, and the second one, if exists, must be a RANGE key name.
|
|
// The function returns the two column names - the first is the hash key
|
|
// and always present, the second is the range key and may be an empty string.
|
|
static std::pair<std::string, std::string> parse_key_schema(const rjson::value& obj) {
|
|
const rjson::value *key_schema;
|
|
if (!obj.IsObject() || !(key_schema = rjson::find(obj, "KeySchema"))) {
|
|
throw api_error::validation("Missing KeySchema member");
|
|
}
|
|
if (!key_schema->IsArray() || key_schema->Size() < 1 || key_schema->Size() > 2) {
|
|
throw api_error::validation("KeySchema must list exactly one or two key columns");
|
|
}
|
|
if (!(*key_schema)[0].IsObject()) {
|
|
throw api_error::validation("First element of KeySchema must be an object");
|
|
}
|
|
const rjson::value *v = rjson::find((*key_schema)[0], "KeyType");
|
|
if (!v || !v->IsString() || v->GetString() != std::string("HASH")) {
|
|
throw api_error::validation("First key in KeySchema must be a HASH key");
|
|
}
|
|
v = rjson::find((*key_schema)[0], "AttributeName");
|
|
if (!v || !v->IsString()) {
|
|
throw api_error::validation("First key in KeySchema must have string AttributeName");
|
|
}
|
|
std::string hash_key = v->GetString();
|
|
std::string range_key;
|
|
if (key_schema->Size() == 2) {
|
|
if (!(*key_schema)[1].IsObject()) {
|
|
throw api_error::validation("Second element of KeySchema must be an object");
|
|
}
|
|
v = rjson::find((*key_schema)[1], "KeyType");
|
|
if (!v || !v->IsString() || v->GetString() != std::string("RANGE")) {
|
|
throw api_error::validation("Second key in KeySchema must be a RANGE key");
|
|
}
|
|
v = rjson::find((*key_schema)[1], "AttributeName");
|
|
if (!v || !v->IsString()) {
|
|
throw api_error::validation("Second key in KeySchema must have string AttributeName");
|
|
}
|
|
range_key = v->GetString();
|
|
}
|
|
return {hash_key, range_key};
|
|
}
|
|
|
|
// Returns the schema of the table identified by the given ARN.
// Throws api_error::validation if the part of the ARN after its first '/'
// contains another '/', and api_error::access_denied if the ARN is too short
// to parse or names a table which doesn't exist.
static schema_ptr get_table_from_arn(service::storage_proxy& proxy, std::string_view arn) {
    // Expected format: arn:scylla:alternator:${KEYSPACE_NAME}:scylla:table/${TABLE_NAME};
    constexpr size_t prefix_size = sizeof("arn:scylla:alternator:") - 1;
    // NOTE: This code returns AccessDeniedException if it's problematic to parse or recognize an arn.
    // Technically, a properly formatted, but nonexistent arn *should* return AccessDeniedException,
    // while an incorrectly formatted one should return ValidationException.
    // Unfortunately, the rules are really uncertain, since DynamoDB
    // states that arns are of the form arn:partition:service:region:account-id:resource-type/resource-id
    // or similar - yet, for some arns that do not fit that pattern (e.g. "john"),
    // it still returns AccessDeniedException rather than ValidationException.
    // Consequently, this code simply falls back to AccessDeniedException,
    // concluding that an error is an error and code which uses tagging
    // must be ready for handling AccessDeniedException instances anyway.
    try {
        // substr() throws std::out_of_range if the ARN is shorter than the
        // expected prefix; that is handled below.
        size_t keyspace_end = arn.find_first_of(':', prefix_size);
        std::string_view keyspace_name = arn.substr(prefix_size, keyspace_end - prefix_size);
        size_t table_start = arn.find_first_of('/');
        std::string_view table_name = arn.substr(table_start + 1);
        if (table_name.find('/') != std::string_view::npos) {
            // A table name cannot contain a '/' - if it does, it's not a
            // table ARN, it may be an index. DynamoDB returns a
            // ValidationException in that case - see #10786.
            // Report the complete ARN the caller passed, not just the part
            // following the first '/'.
            throw api_error::validation(format("ResourceArn '{}' is not a valid table ARN", arn));
        }
        // FIXME: remove sstring creation once find_schema gains a view-based interface
        return proxy.data_dictionary().find_schema(sstring(keyspace_name), sstring(table_name));
    } catch (const data_dictionary::no_such_column_family&) {
        throw api_error::access_denied("Incorrect resource identifier");
    } catch (const std::out_of_range&) {
        throw api_error::access_denied("Incorrect resource identifier");
    }
}
|
|
|
|
// Returns the tags map stored in the schema's tags extension.
// Throws api_error::validation if the schema has no such extension.
const std::map<sstring, sstring>& get_tags_of_table(schema_ptr schema) {
    const auto& extensions = schema->extensions();
    auto tags_entry = extensions.find(tags_extension::NAME);
    if (tags_entry == extensions.end()) {
        throw api_error::validation(format("Table {} does not have valid tagging information", schema->ks_name()));
    }
    return static_pointer_cast<alternator::tags_extension>(tags_entry->second)->tags();
}
|
|
|
|
// find_tag() returns the value of a specific tag, or nothing if it doesn't
|
|
// exist. Unlike get_tags_of_table() above, if the table is missing the
|
|
// tags extension (e.g., is not an Alternator table) it's not an error -
|
|
// we return nothing, as in the case that tags exist but not this tag.
|
|
std::optional<std::string> find_tag(const schema& s, const sstring& tag) {
|
|
auto it1 = s.extensions().find(tags_extension::NAME);
|
|
if (it1 == s.extensions().end()) {
|
|
return std::nullopt;
|
|
}
|
|
const std::map<sstring, sstring>& tags_map =
|
|
static_pointer_cast<alternator::tags_extension>(it1->second)->tags();
|
|
auto it2 = tags_map.find(tag);
|
|
if (it2 == tags_map.end()) {
|
|
return std::nullopt;
|
|
} else {
|
|
return it2->second;
|
|
}
|
|
}
|
|
|
|
// Returns true if c may appear in a tag key or value: an alphanumeric or
// whitespace character (as classified by <cctype>), or one of + - = . _ : /
static bool is_legal_tag_char(char c) {
    // FIXME: According to docs, unicode strings should also be accepted.
    // Alternator currently uses a simplified ASCII approach
    //
    // The <cctype> classifiers have undefined behavior when given a negative
    // value other than EOF, and plain char may be signed - so bytes of
    // non-ASCII (e.g., UTF-8) input must be converted to unsigned char first.
    const auto uc = static_cast<unsigned char>(c);
    return std::isalnum(uc) || std::isspace(uc)
            || c == '+' || c == '-' || c == '=' || c == '.' || c == '_' || c == ':' || c == '/';
}
|
|
|
|
static bool validate_legal_tag_chars(std::string_view tag) {
|
|
return std::all_of(tag.begin(), tag.end(), &is_legal_tag_char);
|
|
}
|
|
|
|
// The closed set of spellings accepted for a write isolation policy, both
// in the write isolation tag (see validate_tags()) and in the default
// configured via rmw_operation::set_default_write_isolation(). Only the
// first letter is significant when the value is later parsed by
// parse_write_isolation().
static const std::unordered_set<std::string_view> allowed_write_isolation_values = {
    // 'f'
    "f",
    "forbid",
    "forbid_rmw",
    // 'a'
    "a",
    "always",
    "always_use_lwt",
    // 'o'
    "o",
    "only_rmw_uses_lwt",
    // 'u'
    "u",
    "unsafe",
    "unsafe_rmw",
};
|
|
|
|
static void validate_tags(const std::map<sstring, sstring>& tags) {
|
|
auto it = tags.find(rmw_operation::WRITE_ISOLATION_TAG_KEY);
|
|
if (it != tags.end()) {
|
|
std::string_view value = it->second;
|
|
if (!allowed_write_isolation_values.contains(value)) {
|
|
throw api_error::validation(
|
|
format("Incorrect write isolation tag {}. Allowed values: {}", value, allowed_write_isolation_values));
|
|
}
|
|
}
|
|
}
|
|
|
|
// Translates a write isolation policy name into the matching enum value.
// Only the first character of the name is inspected. An empty or
// unrecognized name yields rmw_operation::default_write_isolation.
static rmw_operation::write_isolation parse_write_isolation(std::string_view value) {
    using write_isolation = rmw_operation::write_isolation;
    const char selector = value.empty() ? '\0' : value.front();
    switch (selector) {
    case 'f':
        return write_isolation::FORBID_RMW;
    case 'a':
        return write_isolation::LWT_ALWAYS;
    case 'o':
        return write_isolation::LWT_RMW_ONLY;
    case 'u':
        return write_isolation::UNSAFE_RMW;
    default:
        // Shouldn't happen, as validate_tags() and
        // set_default_write_isolation() only allow a closed set of values.
        return rmw_operation::default_write_isolation;
    }
}
|
|
// This default_write_isolation is always overwritten in main.cc, which calls
|
|
// set_default_write_isolation().
|
|
rmw_operation::write_isolation rmw_operation::default_write_isolation =
|
|
rmw_operation::write_isolation::LWT_ALWAYS;
|
|
void rmw_operation::set_default_write_isolation(std::string_view value) {
|
|
if (value.empty()) {
|
|
throw std::runtime_error("When Alternator is enabled, write "
|
|
"isolation policy must be selected, using the "
|
|
"'--alternator-write-isolation' option. "
|
|
"See docs/alternator/alternator.md for instructions.");
|
|
}
|
|
if (!allowed_write_isolation_values.contains(value)) {
|
|
throw std::runtime_error(format("Invalid --alternator-write-isolation "
|
|
"setting '{}'. Allowed values: {}.",
|
|
value, allowed_write_isolation_values));
|
|
}
|
|
default_write_isolation = parse_write_isolation(value);
|
|
}
|
|
|
|
enum class update_tags_action { add_tags, delete_tags };
|
|
static void update_tags_map(const rjson::value& tags, std::map<sstring, sstring>& tags_map, update_tags_action action) {
|
|
if (action == update_tags_action::add_tags) {
|
|
for (auto it = tags.Begin(); it != tags.End(); ++it) {
|
|
const rjson::value& key = (*it)["Key"];
|
|
const rjson::value& value = (*it)["Value"];
|
|
auto tag_key = rjson::to_string_view(key);
|
|
if (tag_key.empty() || tag_key.size() > 128 || !validate_legal_tag_chars(tag_key)) {
|
|
throw api_error::validation("The Tag Key provided is invalid string");
|
|
}
|
|
auto tag_value = rjson::to_string_view(value);
|
|
if (tag_value.empty() || tag_value.size() > 256 || !validate_legal_tag_chars(tag_value)) {
|
|
throw api_error::validation("The Tag Value provided is invalid string");
|
|
}
|
|
tags_map[sstring(tag_key)] = sstring(tag_value);
|
|
}
|
|
} else if (action == update_tags_action::delete_tags) {
|
|
for (auto it = tags.Begin(); it != tags.End(); ++it) {
|
|
tags_map.erase(sstring(it->GetString(), it->GetStringLength()));
|
|
}
|
|
}
|
|
|
|
if (tags_map.size() > 50) {
|
|
throw api_error::validation("Number of Tags exceed the current limit for the provided ResourceArn");
|
|
}
|
|
validate_tags(tags_map);
|
|
}
|
|
|
|
// FIXME: Updating tags currently relies on updating schema, which may be subject
|
|
// to races during concurrent updates of the same table. Once Scylla schema updates
|
|
// are fixed, this issue will automatically get fixed as well.
|
|
future<> update_tags(service::migration_manager& mm, schema_ptr schema, std::map<sstring, sstring>&& tags_map) {
|
|
co_await mm.container().invoke_on(0, [s = global_schema_ptr(std::move(schema)), tags_map = std::move(tags_map)] (service::migration_manager& mm) -> future<> {
|
|
// FIXME: the following needs to be in a loop. If mm.announce() below
|
|
// fails, we need to retry the whole thing.
|
|
auto group0_guard = co_await mm.start_group0_operation();
|
|
|
|
schema_builder builder(s);
|
|
builder.add_extension(tags_extension::NAME, ::make_shared<tags_extension>(tags_map));
|
|
|
|
auto m = co_await mm.prepare_column_family_update_announcement(builder.build(), false, std::vector<view_ptr>(), group0_guard.write_timestamp());
|
|
|
|
co_await mm.announce(std::move(m), std::move(group0_guard));
|
|
});
|
|
}
|
|
|
|
// Implements the TagResource operation: adds the tags listed in the
// request's "Tags" array to the table named by "ResourceArn". Malformed
// requests are answered with an api_error value; errors detected by the
// helpers called here are thrown by them.
future<executor::request_return_type> executor::tag_resource(client_state& client_state, service_permit permit, rjson::value request) {
    _stats.api_operations.tag_resource++;

    const rjson::value* resource_arn = rjson::find(request, "ResourceArn");
    const bool arn_is_string = resource_arn != nullptr && resource_arn->IsString();
    if (!arn_is_string) {
        co_return api_error::access_denied("Incorrect resource identifier");
    }
    schema_ptr schema = get_table_from_arn(_proxy, rjson::to_string_view(*resource_arn));
    // Work on a copy of the table's current tags; it is written back below.
    std::map<sstring, sstring> tags_map = get_tags_of_table(schema);

    const rjson::value* new_tags = rjson::find(request, "Tags");
    if (new_tags == nullptr || !new_tags->IsArray()) {
        co_return api_error::validation("Cannot parse tags");
    }
    if (new_tags->Size() == 0) {
        co_return api_error::validation("The number of tags must be at least 1");
    }

    update_tags_map(*new_tags, tags_map, update_tags_action::add_tags);
    co_await update_tags(_mm, schema, std::move(tags_map));
    co_return json_string("");
}
|
|
|
|
// Implements the UntagResource operation: removes the tags whose keys are
// listed in the request's "TagKeys" array from the table named by
// "ResourceArn".
future<executor::request_return_type> executor::untag_resource(client_state& client_state, service_permit permit, rjson::value request) {
    _stats.api_operations.untag_resource++;

    const rjson::value* resource_arn = rjson::find(request, "ResourceArn");
    if (resource_arn == nullptr || !resource_arn->IsString()) {
        co_return api_error::access_denied("Incorrect resource identifier");
    }
    const rjson::value* keys_to_remove = rjson::find(request, "TagKeys");
    if (keys_to_remove == nullptr || !keys_to_remove->IsArray()) {
        co_return api_error::validation("Cannot parse tag keys");
    }

    schema_ptr schema = get_table_from_arn(_proxy, rjson::to_string_view(*resource_arn));

    // Work on a copy of the table's current tags; it is written back below.
    std::map<sstring, sstring> tags_map = get_tags_of_table(schema);
    update_tags_map(*keys_to_remove, tags_map, update_tags_action::delete_tags);
    co_await update_tags(_mm, schema, std::move(tags_map));
    co_return json_string("");
}
|
|
|
|
// Implements the ListTagsOfResource operation: returns a JSON object whose
// "Tags" member is an array of {"Key": ..., "Value": ...} objects, one for
// each tag of the table named by the request's "ResourceArn".
future<executor::request_return_type> executor::list_tags_of_resource(client_state& client_state, service_permit permit, rjson::value request) {
    _stats.api_operations.list_tags_of_resource++;
    const rjson::value* arn = rjson::find(request, "ResourceArn");
    if (!arn || !arn->IsString()) {
        return make_ready_future<request_return_type>(api_error::access_denied("Incorrect resource identifier"));
    }
    schema_ptr schema = get_table_from_arn(_proxy, rjson::to_string_view(*arn));

    // The map is only read here, so bind to the reference returned by
    // get_tags_of_table() instead of copying it. The local schema_ptr is
    // held until the end of this function.
    const std::map<sstring, sstring>& tags_map = get_tags_of_table(schema);
    rjson::value ret = rjson::empty_object();
    rjson::add(ret, "Tags", rjson::empty_array());

    rjson::value& tags = ret["Tags"];
    for (const auto& [tag_key, tag_value] : tags_map) {
        rjson::value new_entry = rjson::empty_object();
        rjson::add(new_entry, "Key", rjson::from_string(tag_key));
        rjson::add(new_entry, "Value", rjson::from_string(tag_value));
        rjson::push_back(tags, std::move(new_entry));
    }

    return make_ready_future<executor::request_return_type>(make_jsonable(std::move(ret)));
}
|
|
|
|
// Polls mm.have_schema_agreement() every 500ms until it returns true.
// If the deadline has passed when a check is about to be made,
// std::runtime_error is thrown instead.
static future<> wait_for_schema_agreement(service::migration_manager& mm, db::timeout_clock::time_point deadline) {
    auto agreed_or_timed_out = [&mm, deadline] {
        if (db::timeout_clock::now() > deadline) {
            throw std::runtime_error("Unable to reach schema agreement");
        }
        return mm.have_schema_agreement();
    };
    auto pause_between_checks = [] {
        return seastar::sleep(500ms);
    };
    return do_until(std::move(agreed_or_timed_out), std::move(pause_between_checks));
}
|
|
|
|
// Checks that the request's BillingMode (which defaults to "PROVISIONED"
// when absent) is a known value, and that ProvisionedThroughput is present
// exactly when the billing mode requires it. Throws api_error::validation
// otherwise.
static void verify_billing_mode(const rjson::value& request) {
    // Alternator does not yet support billing or throughput limitations, but
    // let's verify that BillingMode is at least legal.
    std::string billing_mode = get_string_attribute(request, "BillingMode", "PROVISIONED");
    if (billing_mode == "PAY_PER_REQUEST") {
        if (rjson::find(request, "ProvisionedThroughput")) {
            throw api_error::validation("When BillingMode=PAY_PER_REQUEST, ProvisionedThroughput cannot be specified.");
        }
    } else if (billing_mode == "PROVISIONED") {
        if (!rjson::find(request, "ProvisionedThroughput")) {
            throw api_error::validation("When BillingMode=PROVISIONED, ProvisionedThroughput must be specified.");
        }
    } else {
        // The message has a placeholder, so it must go through format() for
        // the offending value to actually appear in it.
        throw api_error::validation(format("Unknown BillingMode={}. Must be PAY_PER_REQUEST or PROVISIONED.", billing_mode));
    }
}
|
|
|
|
// Performs the CreateTable operation. Must run on shard 0 (asserted below).
//
// The request is first parsed and validated in full - key schema, billing
// mode, GlobalSecondaryIndexes, LocalSecondaryIndexes, SSESpecification,
// StreamSpecification and Tags - so that an invalid request fails before
// anything is created. Then, under a group0 guard, the schema mutations for
// the keyspace, the base table and one view per index are built and
// announced together. On success the reply is a JSON object whose
// "TableDescription" member is the request, supplemented by
// executor::supplement_table_info().
//
// Request errors detected directly in this function are reported by
// returning an api_error value; the helpers called here may throw.
static future<executor::request_return_type> create_table_on_shard0(tracing::trace_state_ptr trace_state, rjson::value request, service::storage_proxy& sp, service::migration_manager& mm, gms::gossiper& gossiper) {
    assert(this_shard_id() == 0);

    // We begin by parsing and validating the content of the CreateTable
    // command. We can't inspect the current database schema at this point
    // (e.g., verify that this table doesn't already exist) - we can only
    // do this further down - after taking group0_guard.
    std::string table_name = get_table_name(request);
    if (table_name.find(executor::INTERNAL_TABLE_PREFIX) == 0) {
        co_return api_error::validation(format("Prefix {} is reserved for accessing internal tables", executor::INTERNAL_TABLE_PREFIX));
    }
    std::string keyspace_name = executor::KEYSPACE_NAME_PREFIX + table_name;
    const rjson::value& attribute_definitions = request["AttributeDefinitions"];

    tracing::add_table_name(trace_state, keyspace_name, table_name);

    schema_builder builder(keyspace_name, table_name);
    auto [hash_key, range_key] = parse_key_schema(request);
    add_column(builder, hash_key, attribute_definitions, column_kind::partition_key);
    if (!range_key.empty()) {
        add_column(builder, range_key, attribute_definitions, column_kind::clustering_key);
    }
    builder.with_column(bytes(executor::ATTRS_COLUMN_NAME), attrs_type(), column_kind::regular_column);

    verify_billing_mode(request);

    // A snapshot of the base schema holding only the key columns and the
    // attrs column. It is consulted below to decide whether an index key
    // column still needs to be added to the base table.
    schema_ptr partial_schema = builder.build();

    // Parse GlobalSecondaryIndexes parameters before creating the base
    // table, so if we have a parse errors we can fail without creating
    // any table.
    const rjson::value* gsi = rjson::find(request, "GlobalSecondaryIndexes");
    // view_builders[i] and where_clauses[i] describe the same index.
    std::vector<schema_builder> view_builders;
    std::vector<sstring> where_clauses;
    // Index names seen so far (GSI and LSI together), to detect duplicates.
    std::unordered_set<std::string> index_names;
    if (gsi) {
        if (!gsi->IsArray()) {
            co_return api_error::validation("GlobalSecondaryIndexes must be an array.");
        }
        for (const rjson::value& g : gsi->GetArray()) {
            const rjson::value* index_name_v = rjson::find(g, "IndexName");
            if (!index_name_v || !index_name_v->IsString()) {
                co_return api_error::validation("GlobalSecondaryIndexes IndexName must be a string.");
            }
            std::string_view index_name = rjson::to_string_view(*index_name_v);
            auto [it, added] = index_names.emplace(index_name);
            if (!added) {
                co_return api_error::validation(format("Duplicate IndexName '{}', ", index_name));
            }
            std::string vname(view_name(table_name, index_name));
            elogger.trace("Adding GSI {}", index_name);
            // FIXME: read and handle "Projection" parameter. This will
            // require the MV code to copy just parts of the attrs map.
            schema_builder view_builder(keyspace_name, vname);
            auto [view_hash_key, view_range_key] = parse_key_schema(g);
            if (partial_schema->get_column_definition(to_bytes(view_hash_key)) == nullptr) {
                // A column that exists in a global secondary index is upgraded from being a map entry
                // to having a regular column definition in the base schema
                add_column(builder, view_hash_key, attribute_definitions, column_kind::regular_column);
            }
            add_column(view_builder, view_hash_key, attribute_definitions, column_kind::partition_key);
            if (!view_range_key.empty()) {
                if (partial_schema->get_column_definition(to_bytes(view_range_key)) == nullptr) {
                    // A column that exists in a global secondary index is upgraded from being a map entry
                    // to having a regular column definition in the base schema
                    if (partial_schema->get_column_definition(to_bytes(view_hash_key)) == nullptr) {
                        // FIXME: this is alternator limitation only, because Scylla's materialized views
                        // we use underneath do not allow more than 1 base regular column to be part of the MV key
                        elogger.warn("Only 1 regular column from the base table should be used in the GSI key in order to ensure correct liveness management without assumptions");
                    }
                    add_column(builder, view_range_key, attribute_definitions, column_kind::regular_column);
                }
                add_column(view_builder, view_range_key, attribute_definitions, column_kind::clustering_key);
            }
            // Base key columns which aren't part of the index's key need to
            // be added to the view nontheless, as (additional) clustering
            // key(s).
            if (hash_key != view_hash_key && hash_key != view_range_key) {
                add_column(view_builder, hash_key, attribute_definitions, column_kind::clustering_key);
            }
            if (!range_key.empty() && range_key != view_hash_key && range_key != view_range_key) {
                add_column(view_builder, range_key, attribute_definitions, column_kind::clustering_key);
            }
            sstring where_clause = "\"" + view_hash_key + "\" IS NOT NULL";
            if (!view_range_key.empty()) {
                // The second restriction must name the index's range key
                // (not its hash key again), as is done for LSI below.
                where_clause = where_clause + " AND \"" + view_range_key + "\" IS NOT NULL";
            }
            where_clauses.push_back(std::move(where_clause));
            view_builders.emplace_back(std::move(view_builder));
        }
    }

    const rjson::value* lsi = rjson::find(request, "LocalSecondaryIndexes");
    if (lsi) {
        if (!lsi->IsArray()) {
            co_return api_error::validation("LocalSecondaryIndexes must be an array.");
        }
        for (const rjson::value& l : lsi->GetArray()) {
            const rjson::value* index_name_v = rjson::find(l, "IndexName");
            if (!index_name_v || !index_name_v->IsString()) {
                co_return api_error::validation("LocalSecondaryIndexes IndexName must be a string.");
            }
            std::string_view index_name = rjson::to_string_view(*index_name_v);
            auto [it, added] = index_names.emplace(index_name);
            if (!added) {
                co_return api_error::validation(format("Duplicate IndexName '{}', ", index_name));
            }
            std::string vname(lsi_name(table_name, index_name));
            elogger.trace("Adding LSI {}", index_name);
            if (range_key.empty()) {
                co_return api_error::validation("LocalSecondaryIndex requires that the base table have a range key");
            }
            // FIXME: read and handle "Projection" parameter. This will
            // require the MV code to copy just parts of the attrs map.
            schema_builder view_builder(keyspace_name, vname);
            auto [view_hash_key, view_range_key] = parse_key_schema(l);
            if (view_hash_key != hash_key) {
                co_return api_error::validation("LocalSecondaryIndex hash key must match the base table hash key");
            }
            add_column(view_builder, view_hash_key, attribute_definitions, column_kind::partition_key);
            if (view_range_key.empty()) {
                co_return api_error::validation("LocalSecondaryIndex must specify a sort key");
            }
            if (view_range_key == hash_key) {
                co_return api_error::validation("LocalSecondaryIndex sort key cannot be the same as hash key");
            }
            if (view_range_key != range_key) {
                add_column(builder, view_range_key, attribute_definitions, column_kind::regular_column);
            }
            add_column(view_builder, view_range_key, attribute_definitions, column_kind::clustering_key);
            // Base key columns which aren't part of the index's key need to
            // be added to the view nontheless, as (additional) clustering
            // key(s).
            if (!range_key.empty() && view_range_key != range_key) {
                add_column(view_builder, range_key, attribute_definitions, column_kind::clustering_key);
            }
            view_builder.with_column(bytes(executor::ATTRS_COLUMN_NAME), attrs_type(), column_kind::regular_column);
            // Note above we don't need to add virtual columns, as all
            // base columns were copied to view. TODO: reconsider the need
            // for virtual columns when we support Projection.
            sstring where_clause = "\"" + view_hash_key + "\" IS NOT NULL";
            if (!view_range_key.empty()) {
                where_clause = where_clause + " AND \"" + view_range_key + "\" IS NOT NULL";
            }
            where_clauses.push_back(std::move(where_clause));
            view_builders.emplace_back(std::move(view_builder));
        }
    }

    // We don't yet support configuring server-side encryption (SSE) via the
    // SSESpecification attribute, but an SSESpecification with Enabled=false
    // is simply the default, and should be accepted:
    rjson::value* sse_specification = rjson::find(request, "SSESpecification");
    if (sse_specification && sse_specification->IsObject()) {
        rjson::value* enabled = rjson::find(*sse_specification, "Enabled");
        if (!enabled || !enabled->IsBool()) {
            co_return api_error("ValidationException", "SSESpecification needs boolean Enabled");
        }
        if (enabled->GetBool()) {
            // TODO: full support for SSESpecification
            co_return api_error("ValidationException", "SSESpecification: configuring encryption-at-rest is not yet supported.");
        }
    }

    rjson::value* stream_specification = rjson::find(request, "StreamSpecification");
    if (stream_specification && stream_specification->IsObject()) {
        executor::add_stream_options(*stream_specification, builder, sp);
    }

    // Parse the "Tags" parameter early, so we can avoid creating the table
    // at all if this parsing failed.
    const rjson::value* tags = rjson::find(request, "Tags");
    std::map<sstring, sstring> tags_map;
    if (tags && tags->IsArray()) {
        update_tags_map(*tags, tags_map, update_tags_action::add_tags);
    }
    builder.add_extension(tags_extension::NAME, ::make_shared<tags_extension>(tags_map));

    schema_ptr schema = builder.build();
    auto where_clause_it = where_clauses.begin();
    for (auto& view_builder : view_builders) {
        // Note below we don't need to add virtual columns, as all
        // base columns were copied to view. TODO: reconsider the need
        // for virtual columns when we support Projection.
        for (const column_definition& regular_cdef : schema->regular_columns()) {
            if (!view_builder.has_column(*cql3::to_identifier(regular_cdef))) {
                view_builder.with_column(regular_cdef.name(), regular_cdef.type, column_kind::regular_column);
            }
        }
        const bool include_all_columns = true;
        view_builder.with_view_info(*schema, include_all_columns, *where_clause_it);
        view_builder.add_extension(tags_extension::NAME, ::make_shared<tags_extension>());
        ++where_clause_it;
    }

    // FIXME: the following needs to be in a loop. If mm.announce() below
    // fails, we need to retry the whole thing.
    auto group0_guard = co_await mm.start_group0_operation();
    auto ts = group0_guard.write_timestamp();
    std::vector<mutation> schema_mutations;
    try {
        schema_mutations = co_await create_keyspace(keyspace_name, sp, mm, gossiper, ts);
    } catch (exceptions::already_exists_exception&) {
        // The keyspace existing is only an error if the table exists in it
        // as well; otherwise continue with no keyspace mutations.
        if (sp.data_dictionary().has_schema(keyspace_name, table_name)) {
            co_return api_error::resource_in_use(format("Table {} already exists", table_name));
        }
    }
    if (sp.data_dictionary().try_find_table(schema->id())) {
        // This should never happen, the ID is supposed to be unique
        co_return api_error::internal(format("Table with ID {} already exists", schema->id()));
    }
    db::schema_tables::add_table_or_view_to_schema_mutation(schema, ts, true, schema_mutations);
    // we must call before_create_column_family callbacks - which allow
    // listeners to modify our schema_mutations. For example, CDC may add
    // another table (the CDC log table) to the same keyspace.
    // Unfortunately the convention is that this callback must be run in
    // a Seastar thread.
    co_await seastar::async([&] {
        mm.get_notifier().before_create_column_family(*schema, schema_mutations, ts);
    });
    for (schema_builder& view_builder : view_builders) {
        db::schema_tables::add_table_or_view_to_schema_mutation(
                view_ptr(view_builder.build()), ts, true, schema_mutations);
    }
    co_await mm.announce(std::move(schema_mutations), std::move(group0_guard));

    co_await wait_for_schema_agreement(mm, db::timeout_clock::now() + 10s);
    rjson::value status = rjson::empty_object();
    executor::supplement_table_info(request, *schema, sp);
    rjson::add(status, "TableDescription", std::move(request));
    co_return make_jsonable(std::move(status));
}
|
|
|
|
// Implements the CreateTable operation by forwarding the request to
// create_table_on_shard0() on shard 0.
future<executor::request_return_type> executor::create_table(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request) {
    _stats.api_operations.create_table++;
    elogger.trace("Creating table {}", request);

    // Everything the lambda needs is captured explicitly - there is no
    // blanket by-reference default capture, since the lambda runs on
    // shard 0 and does not use anything else from this scope.
    co_return co_await _mm.container().invoke_on(0, [tr = tracing::global_trace_state_ptr(trace_state), request = std::move(request), &sp = _proxy.container(), &g = _gossiper.container()]
            (service::migration_manager& mm) mutable -> future<executor::request_return_type> {
        co_return co_await create_table_on_shard0(tr, std::move(request), sp.local(), mm, g.local());
    });
}
|
|
|
|
// Implements the UpdateTable operation. The only change actually applied is
// the one described by "StreamSpecification"; the parameters listed in
// "unsupported" below are rejected, and "BillingMode" is only validated.
// The schema update itself is performed on shard 0, and the reply is a JSON
// object whose "TableDescription" member is the request, supplemented by
// supplement_table_info().
future<executor::request_return_type> executor::update_table(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request) {
    _stats.api_operations.update_table++;
    elogger.trace("Updating table {}", request);

    static const std::vector<sstring> unsupported = {
        "AttributeDefinitions",
        "GlobalSecondaryIndexUpdates",
        "ProvisionedThroughput",
        "ReplicaUpdates",
        "SSESpecification",
    };

    for (const sstring& parameter : unsupported) {
        if (rjson::find(request, parameter) != nullptr) {
            co_await coroutine::return_exception(api_error::validation(parameter + " not supported"));
        }
    }

    if (rjson::find(request, "BillingMode") != nullptr) {
        verify_billing_mode(request);
    }

    co_return co_await _mm.container().invoke_on(0, [&proxy = _proxy.container(), request = std::move(request), trace = tracing::global_trace_state_ptr(std::move(trace_state))]
            (service::migration_manager& mm) mutable -> future<executor::request_return_type> {
        // FIXME: the following needs to be in a loop. If mm.announce() below
        // fails, we need to retry the whole thing.
        auto group0_guard = co_await mm.start_group0_operation();

        schema_ptr table = get_table(proxy.local(), request);

        tracing::add_table_name(trace, table->ks_name(), table->cf_name());

        // the ugly but harmless conversion to string_view here is because
        // Seastar's sstring is missing a find(std::string_view) :-()
        const bool is_internal_table = std::string_view(table->cf_name()).find(INTERNAL_TABLE_PREFIX) == 0;
        if (is_internal_table) {
            co_await coroutine::return_exception(api_error::validation(format("Prefix {} is reserved for accessing internal tables", INTERNAL_TABLE_PREFIX)));
        }

        schema_builder builder(table);

        rjson::value* stream_specification = rjson::find(request, "StreamSpecification");
        if (stream_specification != nullptr && stream_specification->IsObject()) {
            add_stream_options(*stream_specification, builder, proxy.local());
        }

        auto updated_schema = builder.build();

        auto mutations = co_await mm.prepare_column_family_update_announcement(
                updated_schema, false, std::vector<view_ptr>(), group0_guard.write_timestamp());

        co_await mm.announce(std::move(mutations), std::move(group0_guard));

        co_await wait_for_schema_agreement(mm, db::timeout_clock::now() + 10s);

        rjson::value status = rjson::empty_object();
        supplement_table_info(request, *updated_schema, proxy.local());
        rjson::add(status, "TableDescription", std::move(request));
        co_return make_jsonable(std::move(status));
    });
}
|
|
|
|
// attribute_collector is a helper class used to accept several attribute
|
|
// puts or deletes, and collect them as single collection mutation.
|
|
// The implementation is somewhat complicated by the need of cells in a
|
|
// collection to be sorted by key order.
|
|
class attribute_collector {
|
|
std::map<bytes, atomic_cell, serialized_compare> collected;
|
|
void add(bytes&& name, atomic_cell&& cell) {
|
|
collected.emplace(std::move(name), std::move(cell));
|
|
}
|
|
void add(const bytes& name, atomic_cell&& cell) {
|
|
collected.emplace(name, std::move(cell));
|
|
}
|
|
public:
|
|
attribute_collector() : collected(attrs_type()->get_keys_type()->as_less_comparator()) { }
|
|
void put(bytes&& name, const bytes& val, api::timestamp_type ts) {
|
|
add(std::move(name), atomic_cell::make_live(*bytes_type, ts, val, atomic_cell::collection_member::yes));
|
|
|
|
}
|
|
void put(const bytes& name, const bytes& val, api::timestamp_type ts) {
|
|
add(name, atomic_cell::make_live(*bytes_type, ts, val, atomic_cell::collection_member::yes));
|
|
}
|
|
void del(bytes&& name, api::timestamp_type ts) {
|
|
add(std::move(name), atomic_cell::make_dead(ts, gc_clock::now()));
|
|
}
|
|
void del(const bytes& name, api::timestamp_type ts) {
|
|
add(name, atomic_cell::make_dead(ts, gc_clock::now()));
|
|
}
|
|
collection_mutation_description to_mut() {
|
|
collection_mutation_description ret;
|
|
for (auto&& e : collected) {
|
|
ret.cells.emplace_back(e.first, std::move(e.second));
|
|
}
|
|
return ret;
|
|
}
|
|
bool empty() const {
|
|
return collected.empty();
|
|
}
|
|
};
|
|
|
|
// After calling pk_from_json() and ck_from_json() to extract the pk and ck
|
|
// components of a key, and if that succeeded, call check_key() to further
|
|
// check that the key doesn't have any spurious components.
|
|
static void check_key(const rjson::value& key, const schema_ptr& schema) {
|
|
if (key.MemberCount() != (schema->clustering_key_size() == 0 ? 1 : 2)) {
|
|
throw api_error::validation("Given key attribute not in schema");
|
|
}
|
|
}
|
|
|
|
// Verify that a value parsed from the user input is legal. In particular,
|
|
// we check that the value is not an empty set, string or bytes - which is
|
|
// (somewhat artificially) forbidden by DynamoDB.
|
|
void validate_value(const rjson::value& v, const char* caller) {
|
|
if (!v.IsObject() || v.MemberCount() != 1) {
|
|
throw api_error::validation(format("{}: improperly formatted value '{}'", caller, v));
|
|
}
|
|
auto it = v.MemberBegin();
|
|
const std::string_view type = rjson::to_string_view(it->name);
|
|
if (type == "SS" || type == "BS" || type == "NS") {
|
|
if (!it->value.IsArray()) {
|
|
throw api_error::validation(format("{}: improperly formatted set '{}'", caller, v));
|
|
}
|
|
if (it->value.Size() == 0) {
|
|
throw api_error::validation(format("{}: empty set not allowed", caller));
|
|
}
|
|
} else if (type == "S" || type == "B") {
|
|
if (!it->value.IsString()) {
|
|
throw api_error::validation(format("{}: improperly formatted value '{}'", caller, v));
|
|
}
|
|
} else if (type == "N") {
|
|
if (!it->value.IsString()) {
|
|
// DynamoDB uses a SerializationException in this case, not ValidationException.
|
|
throw api_error::serialization(format("{}: number value must be encoded as string '{}'", caller, v));
|
|
}
|
|
} else if (type != "L" && type != "M" && type != "BOOL" && type != "NULL") {
|
|
// TODO: can do more sanity checks on the content of the above types.
|
|
throw api_error::validation(format("{}: unknown type {} for value {}", caller, type, v));
|
|
}
|
|
}
|
|
|
|
// The put_or_delete_item class builds the mutations needed by the PutItem and
|
|
// DeleteItem operations - either as stand-alone commands or part of a list
|
|
// of commands in BatchWriteItems.
|
|
// put_or_delete_item splits each operation into two stages: Constructing the
|
|
// object parses and validates the user input (throwing exceptions if there
|
|
// are input errors). Later, build() generates the actual mutation, with a
|
|
// specified timestamp. This split is needed because of the peculiar needs of
|
|
// BatchWriteItems and LWT. BatchWriteItems needs all parsing to happen before
|
|
// any writing happens (if one of the commands has an error, none of the
|
|
// writes should be done). LWT makes it impossible for the parse step to
|
|
// generate "mutation" objects, because the timestamp still isn't known.
|
|
class put_or_delete_item {
|
|
private:
|
|
partition_key _pk;
|
|
clustering_key _ck;
|
|
struct cell {
|
|
bytes column_name;
|
|
bytes value;
|
|
};
|
|
// PutItem: engaged _cells, write these cells to item (_pk, _ck).
|
|
// DeleteItem: disengaged _cells, delete the entire item (_pk, _ck).
|
|
std::optional<std::vector<cell>> _cells;
|
|
public:
|
|
struct delete_item {};
|
|
struct put_item {};
|
|
put_or_delete_item(const rjson::value& key, schema_ptr schema, delete_item);
|
|
put_or_delete_item(const rjson::value& item, schema_ptr schema, put_item);
|
|
// put_or_delete_item doesn't keep a reference to schema (so it can be
|
|
// moved between shards for LWT) so it needs to be given again to build():
|
|
mutation build(schema_ptr schema, api::timestamp_type ts) const;
|
|
const partition_key& pk() const { return _pk; }
|
|
const clustering_key& ck() const { return _ck; }
|
|
};
|
|
|
|
// Prepares a DeleteItem: extracts the partition and clustering keys from
// "key" and verifies, with check_key(), that it has no other components.
// _cells is left disengaged, which is what marks this as a deletion.
put_or_delete_item::put_or_delete_item(const rjson::value& key, schema_ptr schema, delete_item)
        : _pk(pk_from_json(key, schema))
        , _ck(ck_from_json(key, schema))
{
    check_key(key, schema);
}
|
|
|
|
// Prepares a PutItem: extracts the partition and clustering keys from
// "item", validates each attribute value with validate_value(), and records
// the cells to be written:
//  - an attribute with no column definition in the schema is stored
//    serialized with serialize_item();
//  - an attribute with a column definition which is not part of the primary
//    key is stored using get_key_from_typed_value();
//  - primary key attributes are not recorded as cells.
put_or_delete_item::put_or_delete_item(const rjson::value& item, schema_ptr schema, put_item)
        : _pk(pk_from_json(item, schema)), _ck(ck_from_json(item, schema)) {
    _cells = std::vector<cell>();
    _cells->reserve(item.MemberCount());
    for (auto it = item.MemberBegin(); it != item.MemberEnd(); ++it) {
        bytes column_name = to_bytes(it->name.GetString());
        validate_value(it->value, "PutItem");
        const column_definition* cdef = schema->get_column_definition(column_name);
        if (!cdef) {
            // Serialize once, and move the result into the cell.
            bytes value = serialize_item(it->value);
            _cells->push_back({std::move(column_name), std::move(value)});
        } else if (!cdef->is_primary_key()) {
            // Fixed-type regular column can be used for GSI key
            _cells->push_back({std::move(column_name),
                    get_key_from_typed_value(it->value, *cdef)});
        }
    }
}
|
|
|
|
// Generates the mutation for the operation prepared by the constructor,
// using timestamp "ts".
//  - DeleteItem (disengaged _cells): a tombstone with timestamp ts.
//  - PutItem (engaged _cells): the recorded cells and a row marker with
//    timestamp ts, plus a tombstone with timestamp ts-1 to remove whatever
//    the item held before.
mutation put_or_delete_item::build(schema_ptr schema, api::timestamp_type ts) const {
    mutation m(schema, _pk);
    // If there's no clustering key, a tombstone should be created directly
    // on a partition, not on a clustering row - otherwise it will look like
    // an open-ended range tombstone, which will crash on KA/LA sstable format.
    // Ref: #6035
    const bool use_partition_tombstone = schema->clustering_key_size() == 0;
    if (!_cells) {
        // a DeleteItem operation:
        if (use_partition_tombstone) {
            m.partition().apply(tombstone(ts, gc_clock::now()));
        } else {
            m.partition().clustered_row(*schema, _ck).apply(tombstone(ts, gc_clock::now()));
        }
        return m;
    }
    // else, a PutItem operation:
    auto& row = m.partition().clustered_row(*schema, _ck);
    attribute_collector attrs_collector;
    for (const auto& c : *_cells) {
        const column_definition* cdef = schema->get_column_definition(c.column_name);
        if (!cdef) {
            attrs_collector.put(c.column_name, c.value, ts);
        } else {
            // build() is const, so the cells cannot be moved from - the
            // value is passed as the const reference that it is.
            row.cells().apply(*cdef, atomic_cell::make_live(*cdef->type, ts, c.value));
        }
    }
    if (!attrs_collector.empty()) {
        auto serialized_map = attrs_collector.to_mut().serialize(*attrs_type());
        row.cells().apply(attrs_column(*schema), std::move(serialized_map));
    }
    // To allow creation of an item with no attributes, we need a row marker.
    row.apply(row_marker(ts));
    // PutItem is supposed to completely replace the old item, so we need to
    // also have a tombstone removing old cells. We can't use the timestamp
    // ts, because when data and tombstone tie on timestamp, the tombstone
    // wins. So we need to use ts-1. Note that we use this trick also in
    // Scylla proper, to implement the operation to replace an entire
    // collection ("UPDATE .. SET x = ..") - see
    // cql3::update_parameters::make_tombstone_just_before().
    if (use_partition_tombstone) {
        m.partition().apply(tombstone(ts-1, gc_clock::now()));
    } else {
        row.apply(tombstone(ts-1, gc_clock::now()));
    }
    return m;
}
|
|
|
|
// The DynamoDB API doesn't let the client control the server's timeout, so
|
|
// we have a global default_timeout() for Alternator requests. The value of
|
|
// s_default_timeout is overwritten in alternator::controller::start_server()
|
|
// based on the "alternator_timeout_in_ms" configuration parameter.
|
|
db::timeout_clock::duration executor::s_default_timeout = 10s;
|
|
void executor::set_default_timeout(db::timeout_clock::duration timeout) {
|
|
s_default_timeout = timeout;
|
|
}
|
|
db::timeout_clock::time_point executor::default_timeout() {
|
|
return db::timeout_clock::now() + s_default_timeout;
|
|
}
|
|
|
|
// Forward declaration; the definition appears further down in this file.
static future<std::unique_ptr<rjson::value>> get_previous_item(
        service::storage_proxy& proxy, service::client_state& client_state,
        schema_ptr schema, const partition_key& pk, const clustering_key& ck,
        service_permit permit, alternator::stats& stats);
|
|
|
|
// Builds the read command that fetches the current content of a single item.
// The clustering restriction is the singular range `ck`, or an open-ended
// range when the table has no clustering key. All regular columns of the
// schema are requested.
static lw_shared_ptr<query::read_command> previous_item_read_command(service::storage_proxy& proxy,
        schema_ptr schema,
        const clustering_key& ck,
        shared_ptr<cql3::selection::selection> selection) {
    std::vector<query::clustering_range> row_ranges;
    if (schema->clustering_key_size() != 0) {
        row_ranges.push_back(query::clustering_range::make_singular(ck));
    } else {
        row_ranges.push_back(query::clustering_range::make_open_ended_both_sides());
    }
    // FIXME: The selection parameter is only consulted for its query options
    // (all callers currently pass a wildcard selection...); the entire item
    // is read regardless. The column list should come from the selection
    // instead of being built here.
    const auto column_id_of = [] (const column_definition& cdef) { return cdef.id; };
    auto wanted_columns = boost::copy_range<query::column_id_vector>(
            schema->regular_columns() | boost::adaptors::transformed(column_id_of));
    auto partition_slice = query::partition_slice(std::move(row_ranges), {}, std::move(wanted_columns), selection->get_query_options());
    return ::make_lw_shared<query::read_command>(schema->id(), schema->version(), partition_slice, proxy.get_max_result_size(partition_slice));
}
|
|
|
|
// Returns a one-element range vector holding the partition range built from
// the decorated form of `pk` under `schema`.
static dht::partition_range_vector to_partition_ranges(const schema& schema, const partition_key& pk) {
    dht::partition_range_vector ranges;
    ranges.push_back(dht::partition_range(dht::decorate_key(schema, pk)));
    return ranges;
}
|
|
// Overload for a key that is already decorated: returns a one-element range
// vector holding the partition range built from `pk`.
static dht::partition_range_vector to_partition_ranges(const dht::decorated_key& pk) {
    dht::partition_range_vector ranges;
    ranges.push_back(dht::partition_range(pk));
    return ranges;
}
|
|
|
|
// Parse the different options for the ReturnValues parameter. We parse all
|
|
// the known options, but only UpdateItem actually supports all of them. The
|
|
// other operations (DeleteItem and PutItem) will refuse some of them.
|
|
rmw_operation::returnvalues rmw_operation::parse_returnvalues(const rjson::value& request) {
|
|
const rjson::value* attribute_value = rjson::find(request, "ReturnValues");
|
|
if (!attribute_value) {
|
|
return rmw_operation::returnvalues::NONE;
|
|
}
|
|
if (!attribute_value->IsString()) {
|
|
throw api_error::validation(format("Expected string value for ReturnValues, got: {}", *attribute_value));
|
|
}
|
|
auto s = rjson::to_string_view(*attribute_value);
|
|
if (s == "NONE") {
|
|
return rmw_operation::returnvalues::NONE;
|
|
} else if (s == "ALL_OLD") {
|
|
return rmw_operation::returnvalues::ALL_OLD;
|
|
} else if (s == "UPDATED_OLD") {
|
|
return rmw_operation::returnvalues::UPDATED_OLD;
|
|
} else if (s == "ALL_NEW") {
|
|
return rmw_operation::returnvalues::ALL_NEW;
|
|
} else if (s == "UPDATED_NEW") {
|
|
return rmw_operation::returnvalues::UPDATED_NEW;
|
|
} else {
|
|
throw api_error::validation(format("Unrecognized value for ReturnValues: {}", s));
|
|
}
|
|
}
|
|
|
|
// Takes ownership of the request and derives from it the table schema, the
// table's write isolation policy and the requested ReturnValues mode.
// The initializers after _request read _request, so it is initialized first.
rmw_operation::rmw_operation(service::storage_proxy& proxy, rjson::value&& request)
    : _request(std::move(request))
    , _schema(get_table(proxy, _request))
    , _write_isolation(get_write_isolation_for_schema(_schema))
    , _returnvalues(parse_returnvalues(_request))
{
    // Nothing is assigned to _pk and _ck here. Each operation keeps the key
    // in a slightly different place in the request, so the subclass's
    // constructor assigns them later.
}
|
|
|
|
// Adapter from a raw query result to the JSON-based apply() overload: the
// previous item is decoded from `qr` (when there is one) and handed to
// apply(std::unique_ptr<rjson::value>, ts). A null pointer is passed when no
// previous item could be decoded.
std::optional<mutation> rmw_operation::apply(foreign_ptr<lw_shared_ptr<query::result>> qr, const query::partition_slice& slice, api::timestamp_type ts) {
    std::unique_ptr<rjson::value> old_item;
    if (qr->row_count()) {
        auto selection = cql3::selection::selection::wildcard(_schema);
        auto decoded = executor::describe_single_item(_schema, slice, *selection, *qr, {});
        if (decoded) {
            old_item = std::make_unique<rjson::value>(std::move(*decoded));
        }
    }
    return apply(std::move(old_item), ts);
}
|
|
|
|
// Returns the write isolation policy of a table: the parsed value of its
// WRITE_ISOLATION_TAG_KEY tag, or default_write_isolation when the tag is
// absent or empty.
rmw_operation::write_isolation rmw_operation::get_write_isolation_for_schema(schema_ptr schema) {
    const auto& tags = get_tags_of_table(schema);
    const auto isolation_tag = tags.find(WRITE_ISOLATION_TAG_KEY);
    const bool has_value = isolation_tag != tags.end() && !isolation_tag->second.empty();
    if (has_value) {
        return parse_write_isolation(isolation_tag->second);
    }
    return default_write_isolation;
}
|
|
|
|
// shard_for_execute() checks whether execute() must be called on a specific
|
|
// other shard. Running execute() on a specific shard is necessary only if it
|
|
// will use LWT (storage_proxy::cas()). This is because cas() can only be
|
|
// called on the specific shard owning (as per cas_shard()) _pk's token.
|
|
// Knowing if execute() will call cas() or not may depend on whether there is
|
|
// a read-before-write, but not just on it - depending on configuration,
|
|
// execute() may unconditionally use cas() for every write. Unfortunately,
|
|
// this requires duplicating here a bit of logic from execute().
|
|
std::optional<shard_id> rmw_operation::shard_for_execute(bool needs_read_before_write) {
|
|
if (_write_isolation == write_isolation::FORBID_RMW ||
|
|
(_write_isolation == write_isolation::LWT_RMW_ONLY && !needs_read_before_write) ||
|
|
_write_isolation == write_isolation::UNSAFE_RMW) {
|
|
return {};
|
|
}
|
|
// If we're still here, cas() *will* be called by execute(), so let's
|
|
// find the appropriate shard to run it on:
|
|
auto token = dht::get_token(*_schema, _pk);
|
|
auto desired_shard = service::storage_proxy::cas_shard(*_schema, token);
|
|
if (desired_shard == this_shard_id()) {
|
|
return {};
|
|
}
|
|
return desired_shard;
|
|
}
|
|
|
|
// Build the return value from the different RMW operations (UpdateItem,
|
|
// PutItem, DeleteItem). All these return nothing by default, but can
|
|
// optionally return Attributes if requested via the ReturnValues option.
|
|
static future<executor::request_return_type> rmw_operation_return(rjson::value&& attributes) {
|
|
rjson::value ret = rjson::empty_object();
|
|
if (!attributes.IsNull()) {
|
|
rjson::add(ret, "Attributes", std::move(attributes));
|
|
}
|
|
return make_ready_future<executor::request_return_type>(make_jsonable(std::move(ret)));
|
|
}
|
|
|
|
// Reads the current content of the item (pk, ck) with a LOCAL_QUORUM query
// and resolves to it as JSON, or to a null pointer when
// describe_single_item() finds no item in the result.
// Each call increments stats.reads_before_write.
static future<std::unique_ptr<rjson::value>> get_previous_item(
        service::storage_proxy& proxy,
        service::client_state& client_state,
        schema_ptr schema,
        const partition_key& pk,
        const clustering_key& ck,
        service_permit permit,
        alternator::stats& stats)
{
    stats.reads_before_write++;
    auto selection = cql3::selection::selection::wildcard(schema);
    auto command = previous_item_read_command(proxy, schema, ck, selection);
    command->allow_limit = db::allow_per_partition_rate_limit::yes;
    const auto read_cl = db::consistency_level::LOCAL_QUORUM;

    return proxy.query(schema, command, to_partition_ranges(*schema, pk), read_cl, service::storage_proxy::coordinator_query_options(executor::default_timeout(), std::move(permit), client_state)).then(
            [schema, command, selection = std::move(selection)] (service::storage_proxy::coordinator_query_result qr) {
        // Stays null unless an item is decoded from the result.
        std::unique_ptr<rjson::value> item;
        if (auto decoded = executor::describe_single_item(schema, command->slice, *selection, *qr.query_result, {}); decoded) {
            item = std::make_unique<rjson::value>(std::move(*decoded));
        }
        return make_ready_future<std::unique_ptr<rjson::value>>(std::move(item));
    });
}
|
|
|
|
// Performs the write described by this operation and resolves to the
// operation's response (see rmw_operation_return()). Depending on the table's
// write isolation policy and on needs_read_before_write, one of three paths
// is taken:
//  * UNSAFE_RMW with a read-before-write: a separate read followed by a
//    plain write.
//  * No read-before-write and a policy other than LWT_ALWAYS: a plain write.
//  * Otherwise: a single LWT operation via storage_proxy::cas().
// Throws api_error::validation if a read-before-write is needed but the
// policy is FORBID_RMW. A failed condition resolves to
// api_error::conditional_check_failed rather than throwing.
// Note: the continuations capture `this`, so the operation object must stay
// alive until the returned future resolves.
future<executor::request_return_type> rmw_operation::execute(service::storage_proxy& proxy,
        service::client_state& client_state,
        tracing::trace_state_ptr trace_state,
        service_permit permit,
        bool needs_read_before_write,
        stats& stats) {
    if (needs_read_before_write) {
        if (_write_isolation == write_isolation::FORBID_RMW) {
            throw api_error::validation("Read-modify-write operations are disabled by 'forbid_rmw' write isolation policy. Refer to https://github.com/scylladb/scylla/blob/master/docs/alternator/alternator.md#write-isolation-policies for more information.");
        }
        if (_write_isolation == write_isolation::UNSAFE_RMW) {
            // This is the old, unsafe, read before write which does first
            // a read, then a write. TODO: remove this mode entirely.
            // stats.reads_before_write is not incremented here because
            // get_previous_item() already increments it; doing both would
            // count this single read twice.
            return get_previous_item(proxy, client_state, schema(), _pk, _ck, permit, stats).then(
                    [this, &client_state, &proxy, trace_state, permit = std::move(permit)] (std::unique_ptr<rjson::value> previous_item) mutable {
                std::optional<mutation> m = apply(std::move(previous_item), api::new_timestamp());
                if (!m) {
                    return make_ready_future<executor::request_return_type>(api_error::conditional_check_failed("Failed condition."));
                }
                return proxy.mutate(std::vector<mutation>{std::move(*m)}, db::consistency_level::LOCAL_QUORUM, executor::default_timeout(), trace_state, std::move(permit), db::allow_per_partition_rate_limit::yes).then([this] () mutable {
                    return rmw_operation_return(std::move(_return_attributes));
                });
            });
        }
        // The read-before-write will happen as part of the cas() call below;
        // count it here.
        stats.reads_before_write++;
    } else if (_write_isolation != write_isolation::LWT_ALWAYS) {
        std::optional<mutation> m = apply(nullptr, api::new_timestamp());
        assert(m); // !needs_read_before_write, so apply() did not check a condition
        return proxy.mutate(std::vector<mutation>{std::move(*m)}, db::consistency_level::LOCAL_QUORUM, executor::default_timeout(), trace_state, std::move(permit), db::allow_per_partition_rate_limit::yes).then([this] () mutable {
            return rmw_operation_return(std::move(_return_attributes));
        });
    }
    // If we're still here, we need to do this write using LWT:
    stats.write_using_lwt++;
    auto timeout = executor::default_timeout();
    auto selection = cql3::selection::selection::wildcard(schema());
    // Without a read-before-write, cas() is given no read command.
    auto read_command = needs_read_before_write ?
            previous_item_read_command(proxy, schema(), _ck, selection) :
            nullptr;
    return proxy.cas(schema(), shared_from_this(), read_command, to_partition_ranges(*schema(), _pk),
            {timeout, std::move(permit), client_state, trace_state},
            db::consistency_level::LOCAL_SERIAL, db::consistency_level::LOCAL_QUORUM, timeout, timeout).then([this, read_command] (bool is_applied) mutable {
        if (!is_applied) {
            return make_ready_future<executor::request_return_type>(api_error::conditional_check_failed("Failed condition."));
        }
        return rmw_operation_return(std::move(_return_attributes));
    });
}
|
|
|
|
// Parses the request's optional "ConditionExpression" member. A missing
// member yields an empty() condition_expression, which means "no condition".
// Throws api_error::validation when the member is not a string, is an empty
// string, or fails to parse.
static parsed::condition_expression get_parsed_condition_expression(rjson::value& request) {
    rjson::value* expression_json = rjson::find(request, "ConditionExpression");
    if (!expression_json) {
        return parsed::condition_expression{};
    }
    if (!expression_json->IsString()) {
        throw api_error::validation("ConditionExpression must be a string");
    }
    if (expression_json->GetStringLength() == 0) {
        throw api_error::validation("ConditionExpression must not be empty");
    }
    const auto expression_text = rjson::to_string_view(*expression_json);
    try {
        return parse_condition_expression(expression_text);
    } catch (const expressions_syntax_error& syntax_error) {
        // Report parse failures to the client as validation errors.
        throw api_error::validation(syntax_error.what());
    }
}
|
|
|
|
// A non-empty condition expression is taken to require reading the item
// first. In theory a condition could exist without referring to the item at
// all, but that is not a useful case and is not worth optimizing for.
static bool check_needs_read_before_write(const parsed::condition_expression& condition_expression) {
    const bool has_condition = !condition_expression.empty();
    return has_condition;
}
|
|
|
|
// Fail the expression if it has unused attribute names or values. This is
|
|
// how DynamoDB behaves, so we do too.
|
|
static void verify_all_are_used(const rjson::value& req, const char* field,
|
|
const std::unordered_set<std::string>& used, const char* operation) {
|
|
const rjson::value* attribute_names = rjson::find(req, field);
|
|
if (!attribute_names) {
|
|
return;
|
|
}
|
|
for (auto it = attribute_names->MemberBegin(); it != attribute_names->MemberEnd(); ++it) {
|
|
if (!used.contains(it->name.GetString())) {
|
|
throw api_error::validation(
|
|
format("{} has spurious '{}', not used in {}",
|
|
field, it->name.GetString(), operation));
|
|
}
|
|
}
|
|
}
|
|
|
|
class put_item_operation : public rmw_operation {
|
|
private:
|
|
put_or_delete_item _mutation_builder;
|
|
public:
|
|
parsed::condition_expression _condition_expression;
|
|
put_item_operation(service::storage_proxy& proxy, rjson::value&& request)
|
|
: rmw_operation(proxy, std::move(request))
|
|
, _mutation_builder(rjson::get(_request, "Item"), schema(), put_or_delete_item::put_item{}) {
|
|
_pk = _mutation_builder.pk();
|
|
_ck = _mutation_builder.ck();
|
|
if (_returnvalues != returnvalues::NONE && _returnvalues != returnvalues::ALL_OLD) {
|
|
throw api_error::validation(format("PutItem supports only NONE or ALL_OLD for ReturnValues"));
|
|
}
|
|
_condition_expression = get_parsed_condition_expression(_request);
|
|
const rjson::value* expression_attribute_names = rjson::find(_request, "ExpressionAttributeNames");
|
|
const rjson::value* expression_attribute_values = rjson::find(_request, "ExpressionAttributeValues");
|
|
if (!_condition_expression.empty()) {
|
|
std::unordered_set<std::string> used_attribute_names;
|
|
std::unordered_set<std::string> used_attribute_values;
|
|
resolve_condition_expression(_condition_expression,
|
|
expression_attribute_names, expression_attribute_values,
|
|
used_attribute_names, used_attribute_values);
|
|
verify_all_are_used(_request, "ExpressionAttributeNames", used_attribute_names, "PutItem");
|
|
verify_all_are_used(_request, "ExpressionAttributeValues", used_attribute_values, "PutItem");
|
|
} else {
|
|
if (expression_attribute_names) {
|
|
throw api_error::validation("ExpressionAttributeNames cannot be used without ConditionExpression");
|
|
}
|
|
if (expression_attribute_values) {
|
|
throw api_error::validation("ExpressionAttributeValues cannot be used without ConditionExpression");
|
|
}
|
|
}
|
|
}
|
|
bool needs_read_before_write() const {
|
|
return _request.HasMember("Expected") ||
|
|
check_needs_read_before_write(_condition_expression) ||
|
|
_returnvalues == returnvalues::ALL_OLD;
|
|
}
|
|
virtual std::optional<mutation> apply(std::unique_ptr<rjson::value> previous_item, api::timestamp_type ts) const override {
|
|
if (!verify_expected(_request, previous_item.get()) ||
|
|
!verify_condition_expression(_condition_expression, previous_item.get())) {
|
|
// If the update is to be cancelled because of an unfulfilled Expected
|
|
// condition, return an empty optional mutation, which is more
|
|
// efficient than throwing an exception.
|
|
return {};
|
|
}
|
|
if (_returnvalues == returnvalues::ALL_OLD && previous_item) {
|
|
_return_attributes = std::move(*previous_item);
|
|
} else {
|
|
_return_attributes = {};
|
|
}
|
|
return _mutation_builder.build(_schema, ts);
|
|
}
|
|
virtual ~put_item_operation() = default;
|
|
};
|
|
|
|
// Handles a PutItem request. The operation is executed on this shard unless
// shard_for_execute() names another shard, in which case the whole request
// is re-submitted to executor::put_item() on that shard.
future<executor::request_return_type> executor::put_item(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request) {
    _stats.api_operations.put_item++;
    auto start_time = std::chrono::steady_clock::now();
    elogger.trace("put_item {}", request);

    auto op = make_shared<put_item_operation>(_proxy, std::move(request));
    tracing::add_table_name(trace_state, op->schema()->ks_name(), op->schema()->cf_name());
    const bool needs_read_before_write = op->needs_read_before_write();
    auto target_shard = op->shard_for_execute(needs_read_before_write);
    if (!target_shard) {
        // Run here. The continuation holds a copy of `op` and records the
        // latency once execute() is done.
        return op->execute(_proxy, client_state, trace_state, std::move(permit), needs_read_before_write, _stats).finally([op, start_time, this] {
            _stats.api_operations.put_item_latency.add(std::chrono::steady_clock::now() - start_time);
        });
    }

    _stats.api_operations.put_item--; // uncount on this shard, will be counted in other shard
    _stats.shard_bounce_for_lwt++;
    return container().invoke_on(*target_shard, _ssg,
            [request = std::move(*op).move_request(), cs = client_state.move_to_other_shard(), gt = tracing::global_trace_state_ptr(trace_state), permit = std::move(permit)]
            (executor& e) mutable {
        return do_with(cs.get(), [&e, request = std::move(request), trace_state = tracing::trace_state_ptr(gt)]
                                 (service::client_state& client_state) mutable {
            //FIXME: transport/server.cc has a corresponding FIXME for the case where a message must be
            // bounced to another shard - once that one is solved, the same solution can be used here.
            // Rather than handing empty_service_permit() to the background operation, the lifetime of the
            // current permit should be prolonged, so that it is destructed only after all background
            // operations have finished as well.
            return e.put_item(client_state, std::move(trace_state), empty_service_permit(), std::move(request));
        });
    });
}
|
|
|
|
class delete_item_operation : public rmw_operation {
|
|
private:
|
|
put_or_delete_item _mutation_builder;
|
|
public:
|
|
parsed::condition_expression _condition_expression;
|
|
delete_item_operation(service::storage_proxy& proxy, rjson::value&& request)
|
|
: rmw_operation(proxy, std::move(request))
|
|
, _mutation_builder(rjson::get(_request, "Key"), schema(), put_or_delete_item::delete_item{}) {
|
|
_pk = _mutation_builder.pk();
|
|
_ck = _mutation_builder.ck();
|
|
if (_returnvalues != returnvalues::NONE && _returnvalues != returnvalues::ALL_OLD) {
|
|
throw api_error::validation(format("DeleteItem supports only NONE or ALL_OLD for ReturnValues"));
|
|
}
|
|
_condition_expression = get_parsed_condition_expression(_request);
|
|
const rjson::value* expression_attribute_names = rjson::find(_request, "ExpressionAttributeNames");
|
|
const rjson::value* expression_attribute_values = rjson::find(_request, "ExpressionAttributeValues");
|
|
if (!_condition_expression.empty()) {
|
|
std::unordered_set<std::string> used_attribute_names;
|
|
std::unordered_set<std::string> used_attribute_values;
|
|
resolve_condition_expression(_condition_expression,
|
|
expression_attribute_names, expression_attribute_values,
|
|
used_attribute_names, used_attribute_values);
|
|
verify_all_are_used(_request, "ExpressionAttributeNames", used_attribute_names, "DeleteItem");
|
|
verify_all_are_used(_request, "ExpressionAttributeValues", used_attribute_values, "DeleteItem");
|
|
} else {
|
|
if (expression_attribute_names) {
|
|
throw api_error::validation("ExpressionAttributeNames cannot be used without ConditionExpression");
|
|
}
|
|
if (expression_attribute_values) {
|
|
throw api_error::validation("ExpressionAttributeValues cannot be used without ConditionExpression");
|
|
}
|
|
}
|
|
}
|
|
bool needs_read_before_write() const {
|
|
return _request.HasMember("Expected") ||
|
|
check_needs_read_before_write(_condition_expression) ||
|
|
_returnvalues == returnvalues::ALL_OLD;
|
|
}
|
|
virtual std::optional<mutation> apply(std::unique_ptr<rjson::value> previous_item, api::timestamp_type ts) const override {
|
|
if (!verify_expected(_request, previous_item.get()) ||
|
|
!verify_condition_expression(_condition_expression, previous_item.get())) {
|
|
// If the update is to be cancelled because of an unfulfilled Expected
|
|
// condition, return an empty optional mutation, which is more
|
|
// efficient than throwing an exception.
|
|
return {};
|
|
}
|
|
if (_returnvalues == returnvalues::ALL_OLD && previous_item) {
|
|
_return_attributes = std::move(*previous_item);
|
|
} else {
|
|
_return_attributes = {};
|
|
}
|
|
return _mutation_builder.build(_schema, ts);
|
|
}
|
|
virtual ~delete_item_operation() = default;
|
|
};
|
|
|
|
// Handles a DeleteItem request. The operation is executed on this shard
// unless shard_for_execute() names another shard, in which case the whole
// request is re-submitted to executor::delete_item() on that shard.
future<executor::request_return_type> executor::delete_item(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request) {
    _stats.api_operations.delete_item++;
    auto start_time = std::chrono::steady_clock::now();
    elogger.trace("delete_item {}", request);

    auto op = make_shared<delete_item_operation>(_proxy, std::move(request));
    tracing::add_table_name(trace_state, op->schema()->ks_name(), op->schema()->cf_name());
    const bool needs_read_before_write = op->needs_read_before_write();
    auto target_shard = op->shard_for_execute(needs_read_before_write);
    if (!target_shard) {
        // Run here. The continuation holds a copy of `op` and records the
        // latency once execute() is done.
        return op->execute(_proxy, client_state, trace_state, std::move(permit), needs_read_before_write, _stats).finally([op, start_time, this] {
            _stats.api_operations.delete_item_latency.add(std::chrono::steady_clock::now() - start_time);
        });
    }

    _stats.api_operations.delete_item--; // uncount on this shard, will be counted in other shard
    _stats.shard_bounce_for_lwt++;
    return container().invoke_on(*target_shard, _ssg,
            [request = std::move(*op).move_request(), cs = client_state.move_to_other_shard(), gt = tracing::global_trace_state_ptr(trace_state), permit = std::move(permit)]
            (executor& e) mutable {
        return do_with(cs.get(), [&e, request = std::move(request), trace_state = tracing::trace_state_ptr(gt)]
                                 (service::client_state& client_state) mutable {
            //FIXME: transport/server.cc has a corresponding FIXME for the case where a message must be
            // bounced to another shard - once that one is solved, the same solution can be used here.
            // Rather than handing empty_service_permit() to the background operation, the lifetime of the
            // current permit should be prolonged, so that it is destructed only after all background
            // operations have finished as well.
            return e.delete_item(client_state, std::move(trace_state), empty_service_permit(), std::move(request));
        });
    });
}
|
|
|
|
// Resolves the table named by the key of one member of a batch request's
// item map. The name is validated first; the schema is then looked up in the
// keyspace named executor::KEYSPACE_NAME_PREFIX + table name.
// Throws api_error::resource_not_found if no such table exists.
static schema_ptr get_table_from_batch_request(const service::storage_proxy& proxy, const rjson::value::ConstMemberIterator& batch_request) {
    sstring table_name = batch_request->name.GetString(); // JSON keys are always strings
    validate_table_name(table_name);
    try {
        const sstring keyspace_name = sstring(executor::KEYSPACE_NAME_PREFIX) + table_name;
        return proxy.data_dictionary().find_schema(keyspace_name, table_name);
    } catch (data_dictionary::no_such_column_family&) {
        throw api_error::resource_not_found(format("Requested resource not found: Table: {} not found", table_name));
    }
}
|
|
|
|
using primary_key = std::pair<partition_key, clustering_key>;
|
|
struct primary_key_hash {
|
|
schema_ptr _s;
|
|
size_t operator()(const primary_key& key) const {
|
|
return utils::hash_combine(partition_key::hashing(*_s)(key.first), clustering_key::hashing(*_s)(key.second));
|
|
}
|
|
};
|
|
struct primary_key_equal {
|
|
schema_ptr _s;
|
|
bool operator()(const primary_key& k1, const primary_key& k2) const {
|
|
return partition_key::equality(*_s)(k1.first, k2.first) && clustering_key::equality(*_s)(k1.second, k2.second);
|
|
}
|
|
};
|
|
|
|
// This is a cas_request subclass for applying given put_or_delete_items to
|
|
// one partition using LWT as part as BatchWriteItems. This is a write-only
|
|
// operation, not needing the previous value of the item (the mutation to be
|
|
// done is known prior to starting the operation). Nevertheless, we want to
|
|
// do this mutation via LWT to ensure that it is serialized with other LWT
|
|
// mutations to the same partition.
|
|
class put_or_delete_item_cas_request : public service::cas_request {
|
|
schema_ptr schema;
|
|
std::vector<put_or_delete_item> _mutation_builders;
|
|
public:
|
|
put_or_delete_item_cas_request(schema_ptr s, std::vector<put_or_delete_item>&& b) :
|
|
schema(std::move(s)), _mutation_builders(std::move(b)) { }
|
|
virtual ~put_or_delete_item_cas_request() = default;
|
|
virtual std::optional<mutation> apply(foreign_ptr<lw_shared_ptr<query::result>> qr, const query::partition_slice& slice, api::timestamp_type ts) override {
|
|
std::optional<mutation> ret;
|
|
for (const put_or_delete_item& mutation_builder : _mutation_builders) {
|
|
// We assume all these builders have the same partition.
|
|
if (ret) {
|
|
ret->apply(mutation_builder.build(schema, ts));
|
|
} else {
|
|
ret = mutation_builder.build(schema, ts);
|
|
}
|
|
}
|
|
return ret;
|
|
}
|
|
};
|
|
|
|
// Applies `mutation_builders` to the partition `dk` of `schema` through a
// single storage_proxy::cas() call with no read command.
static future<> cas_write(service::storage_proxy& proxy, schema_ptr schema, dht::decorated_key dk, std::vector<put_or_delete_item>&& mutation_builders,
        service::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit) {
    auto deadline = executor::default_timeout();
    auto cas_request = seastar::make_shared<put_or_delete_item_cas_request>(schema, std::move(mutation_builders));
    auto applied = proxy.cas(schema, cas_request, nullptr, to_partition_ranges(dk),
            {deadline, std::move(permit), client_state, trace_state},
            db::consistency_level::LOCAL_SERIAL, db::consistency_level::LOCAL_QUORUM,
            deadline, deadline);
    // The value of cas()'s future ("is_applied") is dropped: BatchWriteItems
    // does not need to support conditional updates.
    return applied.discard_result();
}
|
|
|
|
|
|
struct schema_decorated_key {
|
|
schema_ptr schema;
|
|
dht::decorated_key dk;
|
|
};
|
|
struct schema_decorated_key_hash {
|
|
size_t operator()(const schema_decorated_key& k) const {
|
|
return std::hash<dht::token>()(k.dk.token());
|
|
}
|
|
};
|
|
struct schema_decorated_key_equal {
|
|
bool operator()(const schema_decorated_key& k1, const schema_decorated_key& k2) const {
|
|
return k1.schema == k2.schema && k1.dk.equal(*k1.schema, k2.dk);
|
|
}
|
|
};
|
|
|
|
// FIXME: if we failed writing some of the mutations, need to return a list
|
|
// of these failed mutations rather than fail the whole write (issue #5650).
|
|
static future<> do_batch_write(service::storage_proxy& proxy,
|
|
smp_service_group ssg,
|
|
std::vector<std::pair<schema_ptr, put_or_delete_item>> mutation_builders,
|
|
service::client_state& client_state,
|
|
tracing::trace_state_ptr trace_state,
|
|
service_permit permit,
|
|
stats& stats) {
|
|
if (mutation_builders.empty()) {
|
|
return make_ready_future<>();
|
|
}
|
|
// NOTE: technically, do_batch_write could be reworked to use LWT only for part
|
|
// of the batched requests and not use it for others, but it's not considered
|
|
// likely that a batch will contain both tables which always demand LWT and ones
|
|
// that don't - it's fragile to split a batch into multiple storage proxy requests though.
|
|
// Hence, the decision is conservative - if any table enforces LWT,the whole batch will use it.
|
|
const bool needs_lwt = boost::algorithm::any_of(mutation_builders | boost::adaptors::map_keys, [] (const schema_ptr& schema) {
|
|
return rmw_operation::get_write_isolation_for_schema(schema) == rmw_operation::write_isolation::LWT_ALWAYS;
|
|
});
|
|
if (!needs_lwt) {
|
|
// Do a normal write, without LWT:
|
|
std::vector<mutation> mutations;
|
|
mutations.reserve(mutation_builders.size());
|
|
api::timestamp_type now = api::new_timestamp();
|
|
for (auto& b : mutation_builders) {
|
|
mutations.push_back(b.second.build(b.first, now));
|
|
}
|
|
return proxy.mutate(std::move(mutations),
|
|
db::consistency_level::LOCAL_QUORUM,
|
|
executor::default_timeout(),
|
|
trace_state,
|
|
std::move(permit),
|
|
db::allow_per_partition_rate_limit::yes);
|
|
} else {
|
|
// Do the write via LWT:
|
|
// Multiple mutations may be destined for the same partition, adding
|
|
// or deleting different items of one partition. Join them together
|
|
// because we can do them in one cas() call.
|
|
std::unordered_map<schema_decorated_key, std::vector<put_or_delete_item>, schema_decorated_key_hash, schema_decorated_key_equal>
|
|
key_builders(1, schema_decorated_key_hash{}, schema_decorated_key_equal{});
|
|
for (auto& b : mutation_builders) {
|
|
auto dk = dht::decorate_key(*b.first, b.second.pk());
|
|
auto [it, added] = key_builders.try_emplace(schema_decorated_key{b.first, dk});
|
|
it->second.push_back(std::move(b.second));
|
|
}
|
|
return parallel_for_each(std::move(key_builders), [&proxy, &client_state, &stats, trace_state, ssg, permit = std::move(permit)] (auto& e) {
|
|
stats.write_using_lwt++;
|
|
auto desired_shard = service::storage_proxy::cas_shard(*e.first.schema, e.first.dk.token());
|
|
if (desired_shard == this_shard_id()) {
|
|
return cas_write(proxy, e.first.schema, e.first.dk, std::move(e.second), client_state, trace_state, permit);
|
|
} else {
|
|
stats.shard_bounce_for_lwt++;
|
|
return proxy.container().invoke_on(desired_shard, ssg,
|
|
[cs = client_state.move_to_other_shard(),
|
|
mb = e.second,
|
|
dk = e.first.dk,
|
|
ks = e.first.schema->ks_name(),
|
|
cf = e.first.schema->cf_name(),
|
|
gt = tracing::global_trace_state_ptr(trace_state),
|
|
permit = std::move(permit)]
|
|
(service::storage_proxy& proxy) mutable {
|
|
return do_with(cs.get(), [&proxy, mb = std::move(mb), dk = std::move(dk), ks = std::move(ks), cf = std::move(cf),
|
|
trace_state = tracing::trace_state_ptr(gt)]
|
|
(service::client_state& client_state) mutable {
|
|
auto schema = proxy.data_dictionary().find_schema(ks, cf);
|
|
//FIXME: A corresponding FIXME can be found in transport/server.cc when a message must be bounced
|
|
// to another shard - once it is solved, this place can use a similar solution. Instead of passing
|
|
// empty_service_permit() to the background operation, the current permit's lifetime should be prolonged,
|
|
// so that it's destructed only after all background operations are finished as well.
|
|
return cas_write(proxy, schema, dk, std::move(mb), client_state, std::move(trace_state), empty_service_permit());
|
|
});
|
|
});
|
|
}
|
|
});
|
|
}
|
|
}
|
|
|
|
// Handles a BatchWriteItem request. For every table named in "RequestItems",
// each "PutRequest" or "DeleteRequest" entry becomes a put_or_delete_item;
// the collected builders are then written by do_batch_write(). Resolves to a
// validation error for a malformed entry, an unknown request type, or a
// primary key that appears twice for the same table.
future<executor::request_return_type> executor::batch_write_item(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request) {
    _stats.api_operations.batch_write_item++;
    rjson::value& request_items = request["RequestItems"];

    std::vector<std::pair<schema_ptr, put_or_delete_item>> mutation_builders;
    mutation_builders.reserve(request_items.MemberCount());

    for (auto it = request_items.MemberBegin(); it != request_items.MemberEnd(); ++it) {
        schema_ptr schema = get_table_from_batch_request(_proxy, it);
        tracing::add_table_name(trace_state, schema->ks_name(), schema->cf_name());
        // Primary keys already seen for this table, to detect duplicates.
        std::unordered_set<primary_key, primary_key_hash, primary_key_equal> used_keys(
                1, primary_key_hash{schema}, primary_key_equal{schema});
        for (auto& entry : it->value.GetArray()) {
            const bool well_formed = entry.IsObject() && entry.MemberCount() == 1;
            if (!well_formed) {
                return make_ready_future<request_return_type>(api_error::validation(format("Invalid BatchWriteItem request: {}", entry)));
            }
            auto r = entry.MemberBegin();
            const std::string r_name = r->name.GetString();
            const bool is_put = (r_name == "PutRequest");
            if (!is_put && r_name != "DeleteRequest") {
                return make_ready_future<request_return_type>(api_error::validation(format("Unknown BatchWriteItem request type: {}", r_name)));
            }
            if (is_put) {
                const rjson::value& put_request = r->value;
                const rjson::value& item = put_request["Item"];
                mutation_builders.emplace_back(schema, put_or_delete_item(
                        item, schema, put_or_delete_item::put_item{}));
            } else {
                const rjson::value& key = (r->value)["Key"];
                mutation_builders.emplace_back(schema, put_or_delete_item(
                        key, schema, put_or_delete_item::delete_item{}));
            }
            // Both request kinds share the duplicate check: the insertion
            // fails exactly when the key is already present.
            const put_or_delete_item& added = mutation_builders.back().second;
            const bool is_new_key = used_keys.insert(std::make_pair(added.pk(), added.ck())).second;
            if (!is_new_key) {
                return make_ready_future<request_return_type>(api_error::validation("Provided list of item keys contains duplicates"));
            }
        }
    }

    return do_batch_write(_proxy, _ssg, std::move(mutation_builders), client_state, trace_state, std::move(permit), _stats).then([] () {
        // FIXME: Issue #5650: when some of the updates fail to be written,
        // they should be listed in UnprocessedItems instead of failing the
        // whole write.
        rjson::value response = rjson::empty_object();
        rjson::add(response, "UnprocessedItems", rjson::empty_object());
        return make_ready_future<executor::request_return_type>(make_jsonable(std::move(response)));
    });
}
|
|
|
|
// Returns the name of the only member of v, which in the calls visible in
// this file is the type tag of an encoded value (e.g. "N", "SS").
// Throws a validation error if v is not an object with exactly one member.
static std::string get_item_type_string(const rjson::value& v) {
    const bool is_single_member_object = v.IsObject() && v.MemberCount() == 1;
    if (!is_single_member_object) {
        throw api_error::validation(format("Item has invalid format: {}", v));
    }
    return v.MemberBegin()->name.GetString();
}
|
|
|
|
// attrs_to_get saves for each top-level attribute an attrs_to_get_node,
|
|
// a hierarchy of subparts that need to be kept. The following function
|
|
// takes a given JSON value and drops its parts which weren't asked to be
|
|
// kept. It modifies the given JSON value, or returns false to signify that
|
|
// the entire object should be dropped.
|
|
// Note that The JSON value is assumed to be encoded using the DynamoDB
|
|
// conventions - i.e., it is really a map whose key has a type string,
|
|
// and the value is the real object.
|
|
template<typename T>
|
|
static bool hierarchy_filter(rjson::value& val, const attribute_path_map_node<T>& h) {
|
|
if (!val.IsObject() || val.MemberCount() != 1) {
|
|
// This shouldn't happen. We shouldn't have stored malformed objects.
|
|
// But today Alternator does not validate the structure of nested
|
|
// documents before storing them, so this can happen on read.
|
|
throw api_error::internal(format("Malformed value object read: {}", val));
|
|
}
|
|
const char* type = val.MemberBegin()->name.GetString();
|
|
rjson::value& v = val.MemberBegin()->value;
|
|
if (h.has_members()) {
|
|
const auto& members = h.get_members();
|
|
if (type[0] != 'M' || !v.IsObject()) {
|
|
// If v is not an object (dictionary, map), none of the members
|
|
// can match.
|
|
return false;
|
|
}
|
|
rjson::value newv = rjson::empty_object();
|
|
for (auto it = v.MemberBegin(); it != v.MemberEnd(); ++it) {
|
|
std::string attr = it->name.GetString();
|
|
auto x = members.find(attr);
|
|
if (x != members.end()) {
|
|
if (x->second) {
|
|
// Only a part of this attribute is to be filtered, do it.
|
|
if (hierarchy_filter(it->value, *x->second)) {
|
|
// because newv started empty and attr are unique
|
|
// (keys of v), we can use add() here
|
|
rjson::add_with_string_name(newv, attr, std::move(it->value));
|
|
}
|
|
} else {
|
|
// The entire attribute is to be kept
|
|
rjson::add_with_string_name(newv, attr, std::move(it->value));
|
|
}
|
|
}
|
|
}
|
|
if (newv.MemberCount() == 0) {
|
|
return false;
|
|
}
|
|
v = newv;
|
|
} else if (h.has_indexes()) {
|
|
const auto& indexes = h.get_indexes();
|
|
if (type[0] != 'L' || !v.IsArray()) {
|
|
return false;
|
|
}
|
|
rjson::value newv = rjson::empty_array();
|
|
const auto& a = v.GetArray();
|
|
for (unsigned i = 0; i < v.Size(); i++) {
|
|
auto x = indexes.find(i);
|
|
if (x != indexes.end()) {
|
|
if (x->second) {
|
|
if (hierarchy_filter(a[i], *x->second)) {
|
|
rjson::push_back(newv, std::move(a[i]));
|
|
}
|
|
} else {
|
|
// The entire attribute is to be kept
|
|
rjson::push_back(newv, std::move(a[i]));
|
|
}
|
|
}
|
|
}
|
|
if (newv.Size() == 0) {
|
|
return false;
|
|
}
|
|
v = newv;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
// Add a path to a attribute_path_map. Throws a validation error if the path
|
|
// "overlaps" with one already in the filter (one is a sub-path of the other)
|
|
// or "conflicts" with it (both a member and index is requested).
|
|
template<typename T>
|
|
void attribute_path_map_add(const char* source, attribute_path_map<T>& map, const parsed::path& p, T value = {}) {
|
|
using node = attribute_path_map_node<T>;
|
|
// The first step is to look for the top-level attribute (p.root()):
|
|
auto it = map.find(p.root());
|
|
if (it == map.end()) {
|
|
if (p.has_operators()) {
|
|
it = map.emplace(p.root(), node {std::nullopt}).first;
|
|
} else {
|
|
(void) map.emplace(p.root(), node {std::move(value)}).first;
|
|
// Value inserted for top-level node. We're done.
|
|
return;
|
|
}
|
|
} else if(!p.has_operators()) {
|
|
// If p is top-level and we already have it or a part of it
|
|
// in map, it's a forbidden overlapping path.
|
|
throw api_error::validation(format(
|
|
"Invalid {}: two document paths overlap at {}", source, p.root()));
|
|
} else if (it->second.has_value()) {
|
|
// If we're here, it != map.end() && p.has_operators && it->second.has_value().
|
|
// This means the top-level attribute already has a value, and we're
|
|
// trying to add a non-top-level value. It's an overlap.
|
|
throw api_error::validation(format("Invalid {}: two document paths overlap at {}", source, p.root()));
|
|
}
|
|
node* h = &it->second;
|
|
// The second step is to walk h from the top-level node to the inner node
|
|
// where we're supposed to insert the value:
|
|
for (const auto& op : p.operators()) {
|
|
std::visit(overloaded_functor {
|
|
[&] (const std::string& member) {
|
|
if (h->is_empty()) {
|
|
*h = node {typename node::members_t()};
|
|
} else if (h->has_indexes()) {
|
|
throw api_error::validation(format("Invalid {}: two document paths conflict at {}", source, p));
|
|
} else if (h->has_value()) {
|
|
throw api_error::validation(format("Invalid {}: two document paths overlap at {}", source, p));
|
|
}
|
|
typename node::members_t& members = h->get_members();
|
|
auto it = members.find(member);
|
|
if (it == members.end()) {
|
|
it = members.insert({member, std::make_unique<node>()}).first;
|
|
}
|
|
h = it->second.get();
|
|
},
|
|
[&] (unsigned index) {
|
|
if (h->is_empty()) {
|
|
*h = node {typename node::indexes_t()};
|
|
} else if (h->has_members()) {
|
|
throw api_error::validation(format("Invalid {}: two document paths conflict at {}", source, p));
|
|
} else if (h->has_value()) {
|
|
throw api_error::validation(format("Invalid {}: two document paths overlap at {}", source, p));
|
|
}
|
|
typename node::indexes_t& indexes = h->get_indexes();
|
|
auto it = indexes.find(index);
|
|
if (it == indexes.end()) {
|
|
it = indexes.insert({index, std::make_unique<node>()}).first;
|
|
}
|
|
h = it->second.get();
|
|
}
|
|
}, op);
|
|
}
|
|
// Finally, insert the value in the node h.
|
|
if (h->is_empty()) {
|
|
*h = node {std::move(value)};
|
|
} else {
|
|
throw api_error::validation(format("Invalid {}: two document paths overlap at {}", source, p));
|
|
}
|
|
}
|
|
|
|
// A very simplified version of the above function for the special case of
|
|
// adding only top-level attribute. It's not only simpler, we also use a
|
|
// different error message, referring to a "duplicate attribute"instead of
|
|
// "overlapping paths". DynamoDB also has this distinction (errors in
|
|
// AttributesToGet refer to duplicates, not overlaps, but errors in
|
|
// ProjectionExpression refer to overlap - even if it's an exact duplicate).
|
|
template<typename T>
|
|
void attribute_path_map_add(const char* source, attribute_path_map<T>& map, const std::string& attr, T value = {}) {
|
|
using node = attribute_path_map_node<T>;
|
|
auto it = map.find(attr);
|
|
if (it == map.end()) {
|
|
map.emplace(attr, node {std::move(value)});
|
|
} else {
|
|
throw api_error::validation(format(
|
|
"Invalid {}: Duplicate attribute: {}", source, attr));
|
|
}
|
|
}
|
|
|
|
// Parse the "Select" parameter of a Scan or Query operation, throwing a
|
|
// ValidationException in various forbidden combinations of options and
|
|
// finally returning one of three options:
|
|
// 1. regular - the default scan behavior of returning all or specific
|
|
// attributes ("ALL_ATTRIBUTES" or "SPECIFIC_ATTRIBUTES").
|
|
// 2. count - just count the items ("COUNT")
|
|
// 3. projection - return projected attributes ("ALL_PROJECTED_ATTRIBUTES")
|
|
// An ValidationException is thrown when recognizing an invalid combination
|
|
// of options - such as ALL_PROJECTED_ATTRIBUTES for a base table, or
|
|
// SPECIFIC_ATTRIBUTES without ProjectionExpression or AttributesToGet.
|
|
enum class select_type { regular, count, projection };
|
|
static select_type parse_select(const rjson::value& request, table_or_view_type table_type) {
|
|
const rjson::value* select_value = rjson::find(request, "Select");
|
|
if (!select_value) {
|
|
// If "Select" is not specificed, it defaults to ALL_ATTRIBUTES
|
|
// on a base table, or ALL_PROJECTED_ATTRIBUTES on an index
|
|
return table_type == table_or_view_type::base ?
|
|
select_type::regular : select_type::projection;
|
|
}
|
|
if (!select_value->IsString()) {
|
|
throw api_error::validation("Select parameter must be a string");
|
|
}
|
|
std::string_view select = rjson::to_string_view(*select_value);
|
|
const bool has_attributes_to_get = request.HasMember("AttributesToGet");
|
|
const bool has_projection_expression = request.HasMember("ProjectionExpression");
|
|
if (select == "SPECIFIC_ATTRIBUTES") {
|
|
if (has_projection_expression || has_attributes_to_get) {
|
|
return select_type::regular;
|
|
}
|
|
throw api_error::validation("Select=SPECIFIC_ATTRIBUTES requires AttributesToGet or ProjectionExpression");
|
|
}
|
|
if (has_projection_expression || has_attributes_to_get) {
|
|
throw api_error::validation("AttributesToGet or ProjectionExpression require Select to be either SPECIFIC_ATTRIBUTES or missing");
|
|
}
|
|
if (select == "COUNT") {
|
|
return select_type::count;
|
|
}
|
|
if (select == "ALL_ATTRIBUTES") {
|
|
// FIXME: when we support projections (#5036), if this is a GSI and
|
|
// not all attributes are projected to it, we should throw.
|
|
return select_type::regular;
|
|
}
|
|
if (select == "ALL_PROJECTED_ATTRIBUTES") {
|
|
if (table_type == table_or_view_type::base) {
|
|
throw api_error::validation("ALL_PROJECTED_ATTRIBUTES only allowed for indexes");
|
|
}
|
|
return select_type::projection;
|
|
}
|
|
throw api_error::validation(format("Unknown Select value '{}'. Allowed choices: ALL_ATTRIBUTES, SPECIFIC_ATTRIBUTES, ALL_PROJECTED_ATTRIBUTES, COUNT",
|
|
select));
|
|
}
|
|
|
|
// calculate_attrs_to_get() takes either AttributesToGet or
|
|
// ProjectionExpression parameters (having both is *not* allowed),
|
|
// and returns the list of cells we need to read, or a disengaged optional
|
|
// when *all* attributes are to be returned.
|
|
// However, in our current implementation, only top-level attributes are
|
|
// stored as separate cells - a nested document is stored serialized together
|
|
// (as JSON) in the same cell. So this function return a map - each key is the
|
|
// top-level attribute we will need need to read, and the value for each
|
|
// top-level attribute is the partial hierarchy (struct hierarchy_filter)
|
|
// that we will need to extract from that serialized JSON.
|
|
// For example, if ProjectionExpression lists a.b and a.c[2], we
|
|
// return one top-level attribute name, "a", with the value "{b, c[2]}".
|
|
|
|
static std::optional<attrs_to_get> calculate_attrs_to_get(const rjson::value& req, std::unordered_set<std::string>& used_attribute_names, select_type select = select_type::regular) {
|
|
if (select == select_type::count) {
|
|
// An empty map asks to retrieve no attributes. Note that this is
|
|
// different from a disengaged optional which means retrieve all.
|
|
return attrs_to_get();
|
|
}
|
|
// FIXME: also need to handle select_type::projection
|
|
const bool has_attributes_to_get = req.HasMember("AttributesToGet");
|
|
const bool has_projection_expression = req.HasMember("ProjectionExpression");
|
|
if (has_attributes_to_get && has_projection_expression) {
|
|
throw api_error::validation(
|
|
format("GetItem does not allow both ProjectionExpression and AttributesToGet to be given together"));
|
|
}
|
|
if (has_attributes_to_get) {
|
|
const rjson::value& attributes_to_get = req["AttributesToGet"];
|
|
attrs_to_get ret;
|
|
for (auto it = attributes_to_get.Begin(); it != attributes_to_get.End(); ++it) {
|
|
attribute_path_map_add("AttributesToGet", ret, it->GetString());
|
|
}
|
|
if (ret.empty()) {
|
|
throw api_error::validation("Empty AttributesToGet is not allowed. Consider using Select=COUNT instead.");
|
|
}
|
|
return ret;
|
|
} else if (has_projection_expression) {
|
|
const rjson::value& projection_expression = req["ProjectionExpression"];
|
|
const rjson::value* expression_attribute_names = rjson::find(req, "ExpressionAttributeNames");
|
|
std::vector<parsed::path> paths_to_get;
|
|
try {
|
|
paths_to_get = parse_projection_expression(rjson::to_string_view(projection_expression));
|
|
} catch(expressions_syntax_error& e) {
|
|
throw api_error::validation(e.what());
|
|
}
|
|
resolve_projection_expression(paths_to_get, expression_attribute_names, used_attribute_names);
|
|
attrs_to_get ret;
|
|
for (const parsed::path& p : paths_to_get) {
|
|
attribute_path_map_add("ProjectionExpression", ret, p);
|
|
}
|
|
return ret;
|
|
}
|
|
// An disengaged optional asks to read everything
|
|
return std::nullopt;
|
|
}
|
|
|
|
/**
|
|
* Helper routine to extract data when we already have
|
|
* row, etc etc.
|
|
*
|
|
* Note: include_all_embedded_attributes means we should
|
|
* include all values in the `ATTRS_COLUMN_NAME` map column.
|
|
*
|
|
* We could change the behaviour to simply include all values
|
|
* from this column if the `ATTRS_COLUMN_NAME` is explicit in
|
|
* `attrs_to_get`, but I am scared to do that now in case
|
|
* there is some corner case in existing code.
|
|
*
|
|
* Explicit bool means we can be sure all previous calls are
|
|
* as before.
|
|
*/
|
|
void executor::describe_single_item(const cql3::selection::selection& selection,
|
|
const std::vector<bytes_opt>& result_row,
|
|
const std::optional<attrs_to_get>& attrs_to_get,
|
|
rjson::value& item,
|
|
bool include_all_embedded_attributes)
|
|
{
|
|
const auto& columns = selection.get_columns();
|
|
auto column_it = columns.begin();
|
|
for (const bytes_opt& cell : result_row) {
|
|
std::string column_name = (*column_it)->name_as_text();
|
|
if (cell && column_name != executor::ATTRS_COLUMN_NAME) {
|
|
if (!attrs_to_get || attrs_to_get->contains(column_name)) {
|
|
// item is expected to start empty, and column_name are unique
|
|
// so add() makes sense
|
|
rjson::add_with_string_name(item, column_name, rjson::empty_object());
|
|
rjson::value& field = item[column_name.c_str()];
|
|
rjson::add_with_string_name(field, type_to_string((*column_it)->type), json_key_column_value(*cell, **column_it));
|
|
}
|
|
} else if (cell) {
|
|
auto deserialized = attrs_type()->deserialize(*cell, cql_serialization_format::latest());
|
|
auto keys_and_values = value_cast<map_type_impl::native_type>(deserialized);
|
|
for (auto entry : keys_and_values) {
|
|
std::string attr_name = value_cast<sstring>(entry.first);
|
|
if (include_all_embedded_attributes || !attrs_to_get || attrs_to_get->contains(attr_name)) {
|
|
bytes value = value_cast<bytes>(entry.second);
|
|
rjson::value v = deserialize_item(value);
|
|
if (attrs_to_get) {
|
|
auto it = attrs_to_get->find(attr_name);
|
|
if (it != attrs_to_get->end()) {
|
|
// attrs_to_get may have asked for only part of
|
|
// this attribute. hierarchy_filter() modifies v,
|
|
// and returns false when nothing is to be kept.
|
|
if (!hierarchy_filter(v, it->second)) {
|
|
continue;
|
|
}
|
|
}
|
|
}
|
|
// item is expected to start empty, and attribute
|
|
// names are unique so add() makes sense
|
|
rjson::add_with_string_name(item, attr_name, std::move(v));
|
|
}
|
|
}
|
|
}
|
|
++column_it;
|
|
}
|
|
}
|
|
|
|
// Builds a result set from query_result and converts its only row to a
// JSON item. Returns a disengaged optional when the result set is empty,
// and throws std::logic_error when it holds more than one row.
std::optional<rjson::value> executor::describe_single_item(schema_ptr schema,
        const query::partition_slice& slice,
        const cql3::selection::selection& selection,
        const query::result& query_result,
        const std::optional<attrs_to_get>& attrs_to_get) {
    cql3::selection::result_set_builder builder(selection, gc_clock::now(), cql_serialization_format::latest());
    query::result_view::consume(query_result, slice, cql3::selection::result_set_builder::visitor(builder, *schema, selection));

    auto result_set = builder.build();
    if (result_set->empty()) {
        // With no matching item we are supposed to return an empty object
        // lacking an Item member - not one with an empty Item member.
        return std::nullopt;
    }
    if (result_set->size() > 1) {
        // For a result set of several rows the caller should have used
        // describe_multi_item() and not this function.
        throw std::logic_error("describe_single_item() asked to describe multiple items");
    }
    rjson::value item = rjson::empty_object();
    const auto& only_row = *result_set->rows().begin();
    describe_single_item(selection, only_row, attrs_to_get, item);
    return item;
}
|
|
|
|
// Builds a result set from query_result and converts each of its rows to
// a JSON item, returning the items in the order of the rows.
std::vector<rjson::value> executor::describe_multi_item(schema_ptr schema,
        const query::partition_slice& slice,
        const cql3::selection::selection& selection,
        const query::result& query_result,
        const std::optional<attrs_to_get>& attrs_to_get) {
    cql3::selection::result_set_builder builder(selection, gc_clock::now(), cql_serialization_format::latest());
    query::result_view::consume(query_result, slice, cql3::selection::result_set_builder::visitor(builder, *schema, selection));
    auto result_set = builder.build();
    std::vector<rjson::value> items;
    for (auto& row : result_set->rows()) {
        // Append an empty object and let describe_single_item() fill it
        // in place.
        rjson::value& item = items.emplace_back(rjson::empty_object());
        describe_single_item(selection, row, attrs_to_get, item);
    }
    return items;
}
|
|
|
|
// Returns true if the parsed value contains a path anywhere in it - either
// directly or, recursively, in a parameter of a function call. A constant
// never does.
static bool check_needs_read_before_write(const parsed::value& v) {
    return std::visit(overloaded_functor {
        [] (const parsed::constant&) {
            return false;
        },
        [] (const parsed::value::function_call& call) {
            for (const parsed::value& param : call._parameters) {
                if (check_needs_read_before_write(param)) {
                    return true;
                }
            }
            return false;
        },
        [] (const parsed::path&) {
            return true;
        }
    }, v._value);
}
|
|
|
|
static bool check_needs_read_before_write(const attribute_path_map<parsed::update_expression::action>& update_expression) {
|
|
return boost::algorithm::any_of(update_expression, [](const auto& p) {
|
|
if (!p.second.has_value()) {
|
|
// If the action is not on the top-level attribute, we need to
|
|
// read the old item: we change only a part of the top-level
|
|
// attribute, and write the full top-level attribute back.
|
|
return true;
|
|
}
|
|
// Otherwise, the action p.second.get_value() is just on top-level
|
|
// attribute. Check if it needs read-before-write:
|
|
return std::visit(overloaded_functor {
|
|
[&] (const parsed::update_expression::action::set& a) -> bool {
|
|
return check_needs_read_before_write(a._rhs._v1) || (a._rhs._op != 'v' && check_needs_read_before_write(a._rhs._v2));
|
|
},
|
|
[&] (const parsed::update_expression::action::remove& a) -> bool {
|
|
return false;
|
|
},
|
|
[&] (const parsed::update_expression::action::add& a) -> bool {
|
|
return true;
|
|
},
|
|
[&] (const parsed::update_expression::action::del& a) -> bool {
|
|
return true;
|
|
}
|
|
}, p.second.get_value()._action);
|
|
});
|
|
}
|
|
|
|
class update_item_operation : public rmw_operation {
|
|
public:
|
|
// Some information parsed during the constructor to check for input
|
|
// errors, and cached to be used again during apply().
|
|
rjson::value* _attribute_updates;
|
|
// Instead of keeping a parsed::update_expression with an unsorted list
|
|
// list of actions, we keep them in an attribute_path_map which groups
|
|
// them by top-level attribute, and detects forbidden overlaps/conflicts.
|
|
attribute_path_map<parsed::update_expression::action> _update_expression;
|
|
|
|
parsed::condition_expression _condition_expression;
|
|
|
|
update_item_operation(service::storage_proxy& proxy, rjson::value&& request);
|
|
virtual ~update_item_operation() = default;
|
|
virtual std::optional<mutation> apply(std::unique_ptr<rjson::value> previous_item, api::timestamp_type ts) const override;
|
|
bool needs_read_before_write() const;
|
|
};
|
|
|
|
// Parses and validates an UpdateItem request: extracts the key, parses
// UpdateExpression (grouping its actions in _update_expression), locates
// AttributeUpdates, parses the condition expression, verifies that every
// entry of ExpressionAttributeNames/Values is used, and rejects forbidden
// mixes of old-style and new-style parameters. Throws a validation error
// on invalid input.
update_item_operation::update_item_operation(service::storage_proxy& proxy, rjson::value&& update_info)
    : rmw_operation(proxy, std::move(update_info))
{
    const rjson::value* key = rjson::find(_request, "Key");
    if (!key) {
        throw api_error::validation("UpdateItem requires a Key parameter");
    }
    _pk = pk_from_json(*key, _schema);
    _ck = ck_from_json(*key, _schema);
    check_key(*key, _schema);

    const rjson::value* attr_names = rjson::find(_request, "ExpressionAttributeNames");
    const rjson::value* attr_values = rjson::find(_request, "ExpressionAttributeValues");
    std::unordered_set<std::string> used_names;
    std::unordered_set<std::string> used_values;

    const rjson::value* update_expression = rjson::find(_request, "UpdateExpression");
    if (update_expression) {
        if (!update_expression->IsString()) {
            throw api_error::validation("UpdateExpression must be a string");
        }
        try {
            parsed::update_expression parsed_update = parse_update_expression(rjson::to_string_view(*update_expression));
            resolve_update_expression(parsed_update,
                    attr_names, attr_values,
                    used_names, used_values);
            if (parsed_update.empty()) {
                throw api_error::validation("Empty expression in UpdateExpression is not allowed");
            }
            for (auto& action : parsed_update.actions()) {
                // The path has to be copied out of the action first,
                // because the action object itself is moved away.
                auto path = action._path;
                attribute_path_map_add("UpdateExpression", _update_expression, path, std::move(action));
            }
        } catch(expressions_syntax_error& e) {
            // Report a syntax error in the expression as a validation error.
            throw api_error::validation(e.what());
        }
    }
    _attribute_updates = rjson::find(_request, "AttributeUpdates");
    if (_attribute_updates && !_attribute_updates->IsObject()) {
        throw api_error::validation("AttributeUpdates must be an object");
    }

    _condition_expression = get_parsed_condition_expression(_request);
    resolve_condition_expression(_condition_expression,
            attr_names, attr_values,
            used_names, used_values);

    verify_all_are_used(_request, "ExpressionAttributeNames", used_names, "UpdateItem");
    verify_all_are_used(_request, "ExpressionAttributeValues", used_values, "UpdateItem");

    // DynamoDB does not allow the old-style AttributeUpdates or Expected
    // to appear in the same request as the new-style UpdateExpression or
    // ConditionExpression.
    const bool has_expected = rjson::find(_request, "Expected") != nullptr;
    if (update_expression && _attribute_updates) {
        throw api_error::validation(
                "UpdateItem does not allow both AttributeUpdates and UpdateExpression to be given together");
    }
    if (update_expression && has_expected) {
        throw api_error::validation(
                "UpdateItem does not allow both old-style Expected and new-style UpdateExpression to be given together");
    }
    if (_attribute_updates && !_condition_expression.empty()) {
        throw api_error::validation(
                "UpdateItem does not allow both old-style AttributeUpdates and new-style ConditionExpression to be given together");
    }
}
|
|
|
|
// These are the cases where update_item_operation::apply() needs to use
|
|
// "previous_item" for certain AttributeUpdates operations (ADD or DELETE)
|
|
static bool check_needs_read_before_write_attribute_updates(rjson::value *attribute_updates) {
|
|
if (!attribute_updates) {
|
|
return false;
|
|
}
|
|
// We already confirmed in update_item_operation::update_item_operation()
|
|
// that _attribute_updates, when it exists, is a map
|
|
for (auto it = attribute_updates->MemberBegin(); it != attribute_updates->MemberEnd(); ++it) {
|
|
rjson::value* action = rjson::find(it->value, "Action");
|
|
if (action) {
|
|
std::string_view action_s = rjson::to_string_view(*action);
|
|
if (action_s == "ADD") {
|
|
return true;
|
|
}
|
|
// For DELETE operation, it only needs a read before write if the
|
|
// "Value" option is used. Without it, it's just a delete.
|
|
if (action_s == "DELETE" && it->value.HasMember("Value")) {
|
|
return true;
|
|
}
|
|
}
|
|
}
|
|
return false;
|
|
}
|
|
|
|
bool
|
|
update_item_operation::needs_read_before_write() const {
|
|
return check_needs_read_before_write(_update_expression) ||
|
|
check_needs_read_before_write(_condition_expression) ||
|
|
check_needs_read_before_write_attribute_updates(_attribute_updates) ||
|
|
_request.HasMember("Expected") ||
|
|
(_returnvalues != returnvalues::NONE && _returnvalues != returnvalues::UPDATED_NEW);
|
|
}
|
|
|
|
// action_result() returns the result of applying an UpdateItem action -
|
|
// this result is either a JSON object or an unset optional which indicates
|
|
// the action was a deletion. The caller (update_item_operation::apply()
|
|
// below) will either write this JSON as the content of a column, or
|
|
// use it as a piece in a bigger top-level attribute.
|
|
static std::optional<rjson::value> action_result(
|
|
const parsed::update_expression::action& action,
|
|
const rjson::value* previous_item) {
|
|
return std::visit(overloaded_functor {
|
|
[&] (const parsed::update_expression::action::set& a) -> std::optional<rjson::value> {
|
|
return calculate_value(a._rhs, previous_item);
|
|
},
|
|
[&] (const parsed::update_expression::action::remove& a) -> std::optional<rjson::value> {
|
|
return std::nullopt;
|
|
},
|
|
[&] (const parsed::update_expression::action::add& a) -> std::optional<rjson::value> {
|
|
parsed::value base;
|
|
parsed::value addition;
|
|
base.set_path(action._path);
|
|
addition.set_constant(a._valref);
|
|
rjson::value v1 = calculate_value(base, calculate_value_caller::UpdateExpression, previous_item);
|
|
rjson::value v2 = calculate_value(addition, calculate_value_caller::UpdateExpression, previous_item);
|
|
rjson::value result;
|
|
// An ADD can be used to create a new attribute (when
|
|
// v1.IsNull()) or to add to a pre-existing attribute:
|
|
if (v1.IsNull()) {
|
|
std::string v2_type = get_item_type_string(v2);
|
|
if (v2_type == "N" || v2_type == "SS" || v2_type == "NS" || v2_type == "BS") {
|
|
result = v2;
|
|
} else {
|
|
throw api_error::validation(format("An operand in the update expression has an incorrect data type: {}", v2));
|
|
}
|
|
} else {
|
|
std::string v1_type = get_item_type_string(v1);
|
|
if (v1_type == "N") {
|
|
if (get_item_type_string(v2) != "N") {
|
|
throw api_error::validation(format("Incorrect operand type for operator or function. Expected {}: {}", v1_type, rjson::print(v2)));
|
|
}
|
|
result = number_add(v1, v2);
|
|
} else if (v1_type == "SS" || v1_type == "NS" || v1_type == "BS") {
|
|
if (get_item_type_string(v2) != v1_type) {
|
|
throw api_error::validation(format("Incorrect operand type for operator or function. Expected {}: {}", v1_type, rjson::print(v2)));
|
|
}
|
|
result = set_sum(v1, v2);
|
|
} else {
|
|
throw api_error::validation(format("An operand in the update expression has an incorrect data type: {}", v1));
|
|
}
|
|
}
|
|
return result;
|
|
},
|
|
[&] (const parsed::update_expression::action::del& a) -> std::optional<rjson::value> {
|
|
parsed::value base;
|
|
parsed::value subset;
|
|
base.set_path(action._path);
|
|
subset.set_constant(a._valref);
|
|
rjson::value v1 = calculate_value(base, calculate_value_caller::UpdateExpression, previous_item);
|
|
rjson::value v2 = calculate_value(subset, calculate_value_caller::UpdateExpression, previous_item);
|
|
if (!v1.IsNull()) {
|
|
return set_diff(v1, v2);
|
|
}
|
|
// When we return nullopt here, we ask to *delete* this attribute,
|
|
// which is unnecessary because we know the attribute does not
|
|
// exist anyway. This is a waste, but a small one. Note that also
|
|
// for the "remove" action above we don't bother to check if the
|
|
// previous_item add anything to remove.
|
|
return std::nullopt;
|
|
}
|
|
}, action._action);
|
|
}
|
|
|
|
// Print an attribute_path_map_node<action> as the list of paths it contains:
|
|
static std::ostream& operator<<(std::ostream& out, const attribute_path_map_node<parsed::update_expression::action>& h) {
|
|
if (h.has_value()) {
|
|
out << " " << h.get_value()._path;
|
|
} else if (h.has_members()) {
|
|
for (auto& member : h.get_members()) {
|
|
out << *member.second;
|
|
}
|
|
} else if (h.has_indexes()) {
|
|
for (auto& index : h.get_indexes()) {
|
|
out << *index.second;
|
|
}
|
|
}
|
|
return out;
|
|
}
|
|
|
|
// Apply the hierarchy of actions in an attribute_path_map_node<action> to a
|
|
// JSON object which uses DynamoDB's serialization conventions. The complete,
|
|
// unmodified, previous_item is also necessary for the right-hand sides of the
|
|
// actions. Modifies obj in-place or returns false if it is to be removed.
|
|
static bool hierarchy_actions(
|
|
rjson::value& obj,
|
|
const attribute_path_map_node<parsed::update_expression::action>& h,
|
|
const rjson::value* previous_item)
|
|
{
|
|
if (!obj.IsObject() || obj.MemberCount() != 1) {
|
|
// This shouldn't happen. We shouldn't have stored malformed objects.
|
|
// But today Alternator does not validate the structure of nested
|
|
// documents before storing them, so this can happen on read.
|
|
throw api_error::validation(format("Malformed value object read: {}", obj));
|
|
}
|
|
const char* type = obj.MemberBegin()->name.GetString();
|
|
rjson::value& v = obj.MemberBegin()->value;
|
|
if (h.has_value()) {
|
|
// Action replacing everything in this position in the hierarchy
|
|
std::optional<rjson::value> newv = action_result(h.get_value(), previous_item);
|
|
if (newv) {
|
|
obj = std::move(*newv);
|
|
} else {
|
|
return false;
|
|
}
|
|
} else if (h.has_members()) {
|
|
if (type[0] != 'M' || !v.IsObject()) {
|
|
// A .something on a non-map doesn't work.
|
|
throw api_error::validation(format("UpdateExpression: document paths not valid for this item:{}", h));
|
|
}
|
|
for (const auto& member : h.get_members()) {
|
|
std::string attr = member.first;
|
|
const attribute_path_map_node<parsed::update_expression::action>& subh = *member.second;
|
|
rjson::value *subobj = rjson::find(v, attr);
|
|
if (subobj) {
|
|
if (!hierarchy_actions(*subobj, subh, previous_item)) {
|
|
rjson::remove_member(v, attr);
|
|
}
|
|
} else {
|
|
// When a.b does not exist, setting a.b itself (i.e.
|
|
// subh.has_value()) is fine, but setting a.b.c is not.
|
|
if (subh.has_value()) {
|
|
std::optional<rjson::value> newv = action_result(subh.get_value(), previous_item);
|
|
if (newv) {
|
|
// This is the !subobj case, so v doesn't have an
|
|
// attr member so we can use add()
|
|
rjson::add_with_string_name(v, attr, std::move(*newv));
|
|
} else {
|
|
// Removing a.b when a is a map but a.b doesn't exist
|
|
// is silently ignored. It's not considered an error.
|
|
}
|
|
} else {
|
|
throw api_error::validation(format("UpdateExpression: document paths not valid for this item:{}", h));
|
|
}
|
|
}
|
|
}
|
|
} else if (h.has_indexes()) {
|
|
if (type[0] != 'L' || !v.IsArray()) {
|
|
// A [i] on a non-list doesn't work.
|
|
throw api_error::validation(format("UpdateExpression: document paths not valid for this item:{}", h));
|
|
}
|
|
unsigned nremoved = 0;
|
|
for (const auto& index : h.get_indexes()) {
|
|
unsigned i = index.first - nremoved;
|
|
const attribute_path_map_node<parsed::update_expression::action>& subh = *index.second;
|
|
if (i < v.Size()) {
|
|
if (!hierarchy_actions(v[i], subh, previous_item)) {
|
|
v.Erase(v.Begin() + i);
|
|
// If we have the actions "REMOVE a[1] SET a[3] = :val",
|
|
// the index 3 refers to the original indexes, before any
|
|
// items were removed. So we offset the next indexes
|
|
// (which are guaranteed to be higher than i - indexes is
|
|
// a sorted map) by an increased "nremoved".
|
|
nremoved++;
|
|
}
|
|
} else {
|
|
// If a[7] does not exist, setting a[7] itself (i.e.
|
|
// subh.has_value()) is fine - and appends an item, though
|
|
// not necessarily with index 7. But setting a[7].b will
|
|
// not work.
|
|
if (subh.has_value()) {
|
|
std::optional<rjson::value> newv = action_result(subh.get_value(), previous_item);
|
|
if (newv) {
|
|
rjson::push_back(v, std::move(*newv));
|
|
} else {
|
|
// Removing a[7] when the list has fewer elements is
|
|
// silently ignored. It's not considered an error.
|
|
}
|
|
} else {
|
|
throw api_error::validation(format("UpdateExpression: document paths not valid for this item:{}", h));
|
|
}
|
|
}
|
|
}
|
|
}
|
|
return true;
|
|
}
|
|
|
|
std::optional<mutation>
|
|
update_item_operation::apply(std::unique_ptr<rjson::value> previous_item, api::timestamp_type ts) const {
|
|
if (!verify_expected(_request, previous_item.get()) ||
|
|
!verify_condition_expression(_condition_expression, previous_item.get())) {
|
|
// If the update is to be cancelled because of an unfulfilled
|
|
// condition, return an empty optional mutation, which is more
|
|
// efficient than throwing an exception.
|
|
return {};
|
|
}
|
|
|
|
mutation m(_schema, _pk);
|
|
auto& row = m.partition().clustered_row(*_schema, _ck);
|
|
attribute_collector attrs_collector;
|
|
bool any_updates = false;
|
|
auto do_update = [&] (bytes&& column_name, const rjson::value& json_value,
|
|
const attribute_path_map_node<parsed::update_expression::action>* h = nullptr) {
|
|
any_updates = true;
|
|
if (_returnvalues == returnvalues::ALL_NEW) {
|
|
rjson::replace_with_string_name(_return_attributes,
|
|
to_sstring_view(column_name), rjson::copy(json_value));
|
|
} else if (_returnvalues == returnvalues::UPDATED_NEW) {
|
|
rjson::value&& v = rjson::copy(json_value);
|
|
if (h) {
|
|
// If the operation was only on specific attribute paths,
|
|
// leave only them in _return_attributes.
|
|
if (hierarchy_filter(v, *h)) {
|
|
// In the UPDATED_NEW case, _return_attributes starts
|
|
// empty and the attribute names are unique, so we can
|
|
// use add().
|
|
rjson::add_with_string_name(_return_attributes,
|
|
to_sstring_view(column_name), std::move(v));
|
|
}
|
|
} else {
|
|
rjson::add_with_string_name(_return_attributes,
|
|
to_sstring_view(column_name), std::move(v));
|
|
}
|
|
} else if (_returnvalues == returnvalues::UPDATED_OLD && previous_item) {
|
|
std::string_view cn = to_sstring_view(column_name);
|
|
const rjson::value* col = rjson::find(*previous_item, cn);
|
|
if (col) {
|
|
rjson::value&& v = rjson::copy(*col);
|
|
if (h) {
|
|
if (hierarchy_filter(v, *h)) {
|
|
// In the UPDATED_OLD case, _return_attributes starts
|
|
// empty and the attribute names are unique, so we can
|
|
// use add().
|
|
rjson::add_with_string_name(_return_attributes, cn, std::move(v));
|
|
}
|
|
} else {
|
|
rjson::add_with_string_name(_return_attributes, cn, std::move(v));
|
|
}
|
|
}
|
|
}
|
|
const column_definition* cdef = _schema->get_column_definition(column_name);
|
|
if (cdef) {
|
|
bytes column_value = get_key_from_typed_value(json_value, *cdef);
|
|
row.cells().apply(*cdef, atomic_cell::make_live(*cdef->type, ts, column_value));
|
|
} else {
|
|
attrs_collector.put(std::move(column_name), serialize_item(json_value), ts);
|
|
}
|
|
};
|
|
bool any_deletes = false;
|
|
auto do_delete = [&] (bytes&& column_name) {
|
|
any_deletes = true;
|
|
if (_returnvalues == returnvalues::ALL_NEW) {
|
|
rjson::remove_member(_return_attributes, to_sstring_view(column_name));
|
|
} else if (_returnvalues == returnvalues::UPDATED_OLD && previous_item) {
|
|
std::string_view cn = to_sstring_view(column_name);
|
|
const rjson::value* col = rjson::find(*previous_item, cn);
|
|
if (col) {
|
|
// In the UPDATED_OLD case the item starts empty and column
|
|
// names are unique, so we can use add()
|
|
rjson::add_with_string_name(_return_attributes, cn, rjson::copy(*col));
|
|
}
|
|
}
|
|
const column_definition* cdef = _schema->get_column_definition(column_name);
|
|
if (cdef) {
|
|
row.cells().apply(*cdef, atomic_cell::make_dead(ts, gc_clock::now()));
|
|
} else {
|
|
attrs_collector.del(std::move(column_name), ts);
|
|
}
|
|
};
|
|
|
|
// In the ReturnValues=ALL_NEW case, we make a copy of previous_item into
|
|
// _return_attributes and parts of it will be overwritten by the new
|
|
// updates (in do_update() and do_delete()). We need to make a copy and
|
|
// cannot overwrite previous_item directly because we still need its
|
|
// original content for update expressions. For example, the expression
|
|
// "REMOVE a SET b=a" is valid, and needs the original value of a to
|
|
// stick around.
|
|
// Note that for ReturnValues=ALL_OLD, we don't need to copy here, and
|
|
// can just move previous_item later, when we don't need it any more.
|
|
if (_returnvalues == returnvalues::ALL_NEW) {
|
|
if (previous_item) {
|
|
_return_attributes = rjson::copy(*previous_item);
|
|
} else {
|
|
// If there is no previous item, usually a new item is created
|
|
// and contains they given key. This may be cancelled at the end
|
|
// of this function if the update is just deletes.
|
|
_return_attributes = rjson::copy(rjson::get(_request, "Key"));
|
|
}
|
|
} else if (_returnvalues == returnvalues::UPDATED_OLD ||
|
|
_returnvalues == returnvalues::UPDATED_NEW) {
|
|
_return_attributes = rjson::empty_object();
|
|
}
|
|
|
|
if (!_update_expression.empty()) {
|
|
for (auto& actions : _update_expression) {
|
|
// The actions of _update_expression are grouped by top-level
|
|
// attributes. Here, all actions in actions.second share the same
|
|
// top-level attribute actions.first.
|
|
std::string column_name = actions.first;
|
|
const column_definition* cdef = _schema->get_column_definition(to_bytes(column_name));
|
|
if (cdef && cdef->is_primary_key()) {
|
|
throw api_error::validation(format("UpdateItem cannot update key column {}", column_name));
|
|
}
|
|
if (actions.second.has_value()) {
|
|
// An action on a top-level attribute column_name. The single
|
|
// action is actions.second.get_value(). We can simply invoke
|
|
// the action and replace the attribute with its result:
|
|
std::optional<rjson::value> result = action_result(actions.second.get_value(), previous_item.get());
|
|
if (result) {
|
|
do_update(to_bytes(column_name), *result);
|
|
} else {
|
|
do_delete(to_bytes(column_name));
|
|
}
|
|
} else {
|
|
// We have actions on a path or more than one path in the same
|
|
// top-level attribute column_name - but not on the top-level
|
|
// attribute as a whole. We already read the full top-level
|
|
// attribute (see check_needs_read_before_write()), and now we
|
|
// need to modify pieces of it and write back the entire
|
|
// top-level attribute.
|
|
if (!previous_item) {
|
|
throw api_error::validation(format("UpdateItem cannot update nested document path on non-existent item"));
|
|
}
|
|
const rjson::value *toplevel = rjson::find(*previous_item, column_name);
|
|
if (!toplevel) {
|
|
throw api_error::validation(format("UpdateItem cannot update document path: missing attribute {}",
|
|
column_name));
|
|
}
|
|
rjson::value result = rjson::copy(*toplevel);
|
|
hierarchy_actions(result, actions.second, previous_item.get());
|
|
do_update(to_bytes(column_name), std::move(result), &actions.second);
|
|
}
|
|
}
|
|
}
|
|
if (_returnvalues == returnvalues::ALL_OLD && previous_item) {
|
|
_return_attributes = std::move(*previous_item);
|
|
}
|
|
if (_attribute_updates) {
|
|
for (auto it = _attribute_updates->MemberBegin(); it != _attribute_updates->MemberEnd(); ++it) {
|
|
// Note that it.key() is the name of the column, *it is the operation
|
|
bytes column_name = to_bytes(it->name.GetString());
|
|
const column_definition* cdef = _schema->get_column_definition(column_name);
|
|
if (cdef && cdef->is_primary_key()) {
|
|
throw api_error::validation(
|
|
format("UpdateItem cannot update key column {}", it->name.GetString()));
|
|
}
|
|
std::string action = (it->value)["Action"].GetString();
|
|
if (action == "DELETE") {
|
|
// The DELETE operation can do two unrelated tasks. Without a
|
|
// "Value" option, it is used to delete an attribute. With a
|
|
// "Value" option, it is used to delete a set of elements from
|
|
// a set attribute of the same type.
|
|
if (it->value.HasMember("Value")) {
|
|
// Subtracting sets needs a read of previous_item, so
|
|
// check_needs_read_before_write_attribute_updates()
|
|
// returns true in this case, and previous_item is
|
|
// available to us when the item exists.
|
|
const rjson::value* v1 = previous_item ? rjson::find(*previous_item, to_sstring_view(column_name)) : nullptr;
|
|
const rjson::value& v2 = (it->value)["Value"];
|
|
validate_value(v2, "AttributeUpdates");
|
|
std::string v2_type = get_item_type_string(v2);
|
|
if (v2_type != "SS" && v2_type != "NS" && v2_type != "BS") {
|
|
throw api_error::validation(format("AttributeUpdates DELETE operation with Value only valid for sets, got type {}", v2_type));
|
|
}
|
|
if (v1) {
|
|
std::optional<rjson::value> result = set_diff(*v1, v2);
|
|
if (result) {
|
|
do_update(std::move(column_name), *result);
|
|
} else {
|
|
// DynamoDB does not allow empty sets - if the
|
|
// result is empty, delete the attribute.
|
|
do_delete(std::move(column_name));
|
|
}
|
|
} else {
|
|
// if the attribute or item don't exist, the DELETE
|
|
// operation should silently do nothing - and not
|
|
// create an empty item. It's a waste to call
|
|
// do_delete() on an attribute we already know is
|
|
// deleted, so we can just mark any_deletes = true.
|
|
any_deletes = true;
|
|
}
|
|
} else {
|
|
do_delete(std::move(column_name));
|
|
}
|
|
} else if (action == "PUT") {
|
|
const rjson::value& value = (it->value)["Value"];
|
|
validate_value(value, "AttributeUpdates");
|
|
do_update(std::move(column_name), value);
|
|
} else if (action == "ADD") {
|
|
// Note that check_needs_read_before_write_attribute_updates()
|
|
// made sure we retrieved previous_item (if exists) when there
|
|
// is an ADD action.
|
|
const rjson::value* v1 = previous_item ? rjson::find(*previous_item, to_sstring_view(column_name)) : nullptr;
|
|
const rjson::value& v2 = (it->value)["Value"];
|
|
validate_value(v2, "AttributeUpdates");
|
|
// An ADD can be used to create a new attribute (when
|
|
// !v1) or to add to a pre-existing attribute:
|
|
if (!v1) {
|
|
std::string v2_type = get_item_type_string(v2);
|
|
if (v2_type == "N" || v2_type == "SS" || v2_type == "NS" || v2_type == "BS" || v2_type == "L") {
|
|
do_update(std::move(column_name), v2);
|
|
} else {
|
|
throw api_error::validation(format("An operand in the AttributeUpdates ADD has an incorrect data type: {}", v2));
|
|
}
|
|
} else {
|
|
std::string v1_type = get_item_type_string(*v1);
|
|
std::string v2_type = get_item_type_string(v2);
|
|
if (v2_type != v1_type) {
|
|
throw api_error::validation(format("Operand type mismatch in AttributeUpdates ADD. Expected {}, got {}", v1_type, v2_type));
|
|
}
|
|
if (v1_type == "N") {
|
|
do_update(std::move(column_name), number_add(*v1, v2));
|
|
} else if (v1_type == "SS" || v1_type == "NS" || v1_type == "BS") {
|
|
do_update(std::move(column_name), set_sum(*v1, v2));
|
|
} else if (v1_type == "L") {
|
|
// The DynamoDB documentation doesn't say it supports
|
|
// lists in ADD operations, but it turns out that it
|
|
// does. Interestingly, this is only true for
|
|
// AttributeUpdates (this code) - the similar ADD
|
|
// in UpdateExpression doesn't support lists.
|
|
do_update(std::move(column_name), list_concatenate(*v1, v2));
|
|
} else {
|
|
throw api_error::validation(format("An operand in the AttributeUpdates ADD has an incorrect data type: {}", *v1));
|
|
}
|
|
}
|
|
} else {
|
|
throw api_error::validation(
|
|
format("Unknown Action value '{}' in AttributeUpdates", action));
|
|
}
|
|
}
|
|
}
|
|
if (!attrs_collector.empty()) {
|
|
auto serialized_map = attrs_collector.to_mut().serialize(*attrs_type());
|
|
row.cells().apply(attrs_column(*_schema), std::move(serialized_map));
|
|
}
|
|
// To allow creation of an item with no attributes, we need a row marker.
|
|
// Note that unlike Scylla, even an "update" operation needs to add a row
|
|
// marker. An update with only DELETE operations must not add a row marker
|
|
// (this was issue #5862) but any other update, even an empty one, should.
|
|
if (any_updates || !any_deletes) {
|
|
row.apply(row_marker(ts));
|
|
} else if (_returnvalues == returnvalues::ALL_NEW && !previous_item) {
|
|
// There was no pre-existing item, and we're not creating one, so
|
|
// don't report the new item in the returned Attributes.
|
|
_return_attributes = rjson::null_value();
|
|
}
|
|
// ReturnValues=UPDATED_OLD/NEW never return an empty Attributes field,
|
|
// even if a new item was created. Instead it should be missing entirely.
|
|
if (_returnvalues == returnvalues::UPDATED_OLD || _returnvalues == returnvalues::UPDATED_NEW) {
|
|
if (_return_attributes.MemberCount() == 0) {
|
|
_return_attributes = rjson::null_value();
|
|
}
|
|
}
|
|
|
|
return m;
|
|
}
|
|
|
|
// Entry point of the UpdateItem operation. Builds an update_item_operation
// from the request and either executes it on this shard, or - when
// shard_for_execute() names another shard - re-invokes update_item() there.
future<executor::request_return_type> executor::update_item(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request) {
    _stats.api_operations.update_item++;
    auto started_at = std::chrono::steady_clock::now();
    elogger.trace("update_item {}", request);

    auto op = make_shared<update_item_operation>(_proxy, std::move(request));
    tracing::add_table_name(trace_state, op->schema()->ks_name(), op->schema()->cf_name());
    const bool needs_read_before_write = op->needs_read_before_write();

    auto target_shard = op->shard_for_execute(needs_read_before_write);
    if (!target_shard) {
        // Execute locally; op is captured so it stays alive until done.
        return op->execute(_proxy, client_state, trace_state, std::move(permit), needs_read_before_write, _stats).finally([op, started_at, this] {
            _stats.api_operations.update_item_latency.add(std::chrono::steady_clock::now() - started_at);
        });
    }

    // The request is bounced: undo the count on this shard, it will be
    // counted again on the shard that ends up executing it.
    _stats.api_operations.update_item--;
    _stats.shard_bounce_for_lwt++;
    auto bounce = [request = std::move(*op).move_request(), cs = client_state.move_to_other_shard(), gt = tracing::global_trace_state_ptr(trace_state), permit = std::move(permit)]
            (executor& e) mutable {
        return do_with(cs.get(), [&e, request = std::move(request), trace_state = tracing::trace_state_ptr(gt)]
                (service::client_state& client_state) mutable {
            //FIXME: A corresponding FIXME can be found in transport/server.cc when a message must be bounced
            // to another shard - once it is solved, this place can use a similar solution. Instead of passing
            // empty_service_permit() to the background operation, the current permit's lifetime should be prolonged,
            // so that it's destructed only after all background operations are finished as well.
            return e.update_item(client_state, std::move(trace_state), empty_service_permit(), std::move(request));
        });
    };
    return container().invoke_on(*target_shard, _ssg, std::move(bounce));
}
|
|
|
|
// Check according to the request's "ConsistentRead" field, which consistency
|
|
// level we need to use for the read. The field can be True for strongly
|
|
// consistent reads, or False for eventually consistent reads, or if this
|
|
// field is absense, we default to eventually consistent reads.
|
|
// In Scylla, eventually-consistent reads are implemented as consistency
|
|
// level LOCAL_ONE, and strongly-consistent reads as LOCAL_QUORUM.
|
|
static db::consistency_level get_read_consistency(const rjson::value& request) {
|
|
const rjson::value* consistent_read_value = rjson::find(request, "ConsistentRead");
|
|
bool consistent_read = false;
|
|
if (consistent_read_value && !consistent_read_value->IsNull()) {
|
|
if (consistent_read_value->IsBool()) {
|
|
consistent_read = consistent_read_value->GetBool();
|
|
} else {
|
|
throw api_error::validation("ConsistentRead flag must be a boolean");
|
|
}
|
|
}
|
|
return consistent_read ? db::consistency_level::LOCAL_QUORUM : db::consistency_level::LOCAL_ONE;
|
|
}
|
|
|
|
// describe_item() wraps the result of describe_single_item() by a map
|
|
// as needed by the GetItem request. It should not be used for other purposes,
|
|
// use describe_single_item() instead.
|
|
static rjson::value describe_item(schema_ptr schema,
|
|
const query::partition_slice& slice,
|
|
const cql3::selection::selection& selection,
|
|
const query::result& query_result,
|
|
const std::optional<attrs_to_get>& attrs_to_get) {
|
|
std::optional<rjson::value> opt_item = executor::describe_single_item(std::move(schema), slice, selection, std::move(query_result), attrs_to_get);
|
|
if (!opt_item) {
|
|
// If there is no matching item, we're supposed to return an empty
|
|
// object without an Item member - not one with an empty Item member
|
|
return rjson::empty_object();
|
|
}
|
|
rjson::value item_descr = rjson::empty_object();
|
|
rjson::add(item_descr, "Item", std::move(*opt_item));
|
|
return item_descr;
|
|
}
|
|
|
|
// Entry point of the GetItem operation: validates the request, reads the
// single row identified by "Key" with the consistency level chosen by
// get_read_consistency(), and returns the result formatted by describe_item().
future<executor::request_return_type> executor::get_item(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request) {
    _stats.api_operations.get_item++;
    auto start_time = std::chrono::steady_clock::now();
    elogger.trace("Getting item {}", request);

    schema_ptr schema = get_table(_proxy, request);
    tracing::add_table_name(trace_state, schema->ks_name(), schema->cf_name());

    rjson::value& key_json = request["Key"];
    db::consistency_level cl = get_read_consistency(request);

    partition_key pk = pk_from_json(key_json, schema);
    dht::partition_range_vector partition_ranges{dht::partition_range(dht::decorate_key(*schema, pk))};

    // A table without a clustering key is read with an open-ended range;
    // otherwise exactly the requested clustering key is read.
    std::vector<query::clustering_range> bounds;
    if (schema->clustering_key_size() != 0) {
        clustering_key ck = ck_from_json(key_json, schema);
        bounds.push_back(query::clustering_range::make_singular(std::move(ck)));
    } else {
        bounds.push_back(query::clustering_range::make_open_ended_both_sides());
    }
    check_key(key_json, schema);

    //TODO(sarna): It would be better to fetch only some attributes of the map, not all
    auto regular_columns = boost::copy_range<query::column_id_vector>(
            schema->regular_columns() | boost::adaptors::transformed([] (const column_definition& cdef) { return cdef.id; }));

    auto selection = cql3::selection::selection::wildcard(schema);

    auto partition_slice = query::partition_slice(std::move(bounds), {}, std::move(regular_columns), selection->get_query_options());
    auto command = ::make_lw_shared<query::read_command>(schema->id(), schema->version(), partition_slice, _proxy.get_max_result_size(partition_slice));

    std::unordered_set<std::string> used_attribute_names;
    auto attrs_to_get = calculate_attrs_to_get(request, used_attribute_names);
    verify_all_are_used(request, "ExpressionAttributeNames", used_attribute_names, "GetItem");

    auto query_done = _proxy.query(schema, std::move(command), std::move(partition_ranges), cl,
            service::storage_proxy::coordinator_query_options(executor::default_timeout(), std::move(permit), client_state, trace_state));
    return query_done.then(
            [this, schema, partition_slice = std::move(partition_slice), selection = std::move(selection), attrs_to_get = std::move(attrs_to_get), start_time = std::move(start_time)] (service::storage_proxy::coordinator_query_result qr) mutable {
        // The latency sample is taken here, i.e. once the query result is available.
        _stats.api_operations.get_item_latency.add(std::chrono::steady_clock::now() - start_time);
        return make_ready_future<executor::request_return_type>(make_jsonable(describe_item(schema, partition_slice, *selection, *qr.query_result, std::move(attrs_to_get))));
    });
}
|
|
|
|
// is_big() checks approximately if the given JSON value is "bigger" than
|
|
// the given big_size number of bytes. The goal is to *quickly* detect
|
|
// oversized JSON that, for example, is too large to be serialized to a
|
|
// contiguous string - we don't need an accurate size for that. Moreover,
|
|
// as soon as we detect that the JSON is indeed "big", we can return true
|
|
// and don't need to continue calculating its exact size.
|
|
// For simplicity, we use a recursive implementation. This is fine because
|
|
// Alternator limits the depth of JSONs it reads from inputs, and doesn't
|
|
// add more than a couple of levels in its own output construction.
|
|
|
|
static void check_big_object(const rjson::value& val, int& size_left);
|
|
static void check_big_array(const rjson::value& val, int& size_left);
|
|
|
|
static bool is_big(const rjson::value& val, int big_size = 100'000) {
|
|
if (val.IsString()) {
|
|
return ssize_t(val.GetStringLength()) > big_size;
|
|
} else if (val.IsObject()) {
|
|
check_big_object(val, big_size);
|
|
return big_size < 0;
|
|
} else if (val.IsArray()) {
|
|
check_big_array(val, big_size);
|
|
return big_size < 0;
|
|
}
|
|
return false;
|
|
}
|
|
|
|
// Subtracts an estimate of the size of the JSON array val from size_left,
// giving up as soon as size_left becomes negative.
static void check_big_array(const rjson::value& val, int& size_left) {
    // Charge a flat 10 bytes per element (number, boolean, etc., or the
    // start of a nested container). This need not be accurate.
    size_left -= 10 * val.Size();
    for (auto it = val.Begin(); it != val.End() && size_left >= 0; ++it) {
        const rjson::value& elem = *it;
        // Only arrays and objects recurse; leaves are handled inline
        // because they usually greatly outnumber the trunk.
        if (elem.IsString()) {
            size_left -= elem.GetStringLength();
        } else if (elem.IsObject()) {
            check_big_object(elem, size_left);
        } else if (elem.IsArray()) {
            check_big_array(elem, size_left);
        }
    }
}
|
|
|
|
// Subtracts an estimate of the size of the JSON object val from size_left,
// giving up as soon as size_left becomes negative.
static void check_big_object(const rjson::value& val, int& size_left) {
    // Flat charge of 10 bytes per member, on top of name and string lengths.
    size_left -= 10 * val.MemberCount();
    for (auto it = val.MemberBegin(); it != val.MemberEnd() && size_left >= 0; ++it) {
        size_left -= it->name.GetStringLength();
        const rjson::value& member_value = it->value;
        if (member_value.IsString()) {
            size_left -= member_value.GetStringLength();
        } else if (member_value.IsObject()) {
            check_big_object(member_value, size_left);
        } else if (member_value.IsArray()) {
            check_big_array(member_value, size_left);
        }
    }
}
|
|
|
|
// Entry point of the BatchGetItem operation. All per-table requests in
// "RequestItems" are validated first; then one read per partition is started,
// all in parallel. Successful reads are collected under "Responses", and the
// keys of failed reads are reported back under "UnprocessedKeys". If no read
// succeeded at all, the request fails with one of the read errors.
future<executor::request_return_type> executor::batch_get_item(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request) {
    // FIXME: In this implementation, an unbounded batch size can cause
    // unbounded response JSON object to be buffered in memory, unbounded
    // parallelism of the requests, and unbounded amount of non-preemptable
    // work in the following loops. So we should limit the batch size, and/or
    // the response size, as DynamoDB does.
    _stats.api_operations.batch_get_item++;
    rjson::value& request_items = request["RequestItems"];

    // We need to validate all the parameters before starting any asynchronous
    // query, and fail the entire request on any parse error. So we parse all
    // the input into our own vector "requests", each element a table_requests
    // listing all the request aimed at a single table. For efficiency, inside
    // each table_requests we further group together all reads going to the
    // same partition, so we can later send them together.
    struct table_requests {
        schema_ptr schema;
        db::consistency_level cl;
        ::shared_ptr<const std::optional<alternator::attrs_to_get>> attrs_to_get;
        // clustering_keys keeps a sorted set of clustering keys. It must
        // be sorted for the read below (see #10827). Additionally each
        // clustering key is mapped to the original rjson::value "Key".
        using clustering_keys = std::map<clustering_key, rjson::value*, clustering_key::less_compare>;
        std::unordered_map<partition_key, clustering_keys, partition_key::hashing, partition_key::equality> requests;
        table_requests(schema_ptr s)
            : schema(std::move(s))
            , requests(8, partition_key::hashing(*schema), partition_key::equality(*schema))
        {}
        // Registers one requested key, grouping it with the other keys of
        // the same partition. Throws a validation error on a duplicate key.
        void add(rjson::value& key) {
            auto pk = pk_from_json(key, schema);
            auto it = requests.find(pk);
            if (it == requests.end()) {
                it = requests.emplace(pk, clustering_key::less_compare(*schema)).first;
            }
            auto ck = ck_from_json(key, schema);
            if (auto [_, inserted] = it->second.emplace(ck, &key); !inserted) {
                throw api_error::validation("Provided list of item keys contains duplicates");
            }
        }
    };
    std::vector<table_requests> requests;

    for (auto it = request_items.MemberBegin(); it != request_items.MemberEnd(); ++it) {
        table_requests rs(get_table_from_batch_request(_proxy, it));
        tracing::add_table_name(trace_state, sstring(executor::KEYSPACE_NAME_PREFIX) + rs.schema->cf_name(), rs.schema->cf_name());
        rs.cl = get_read_consistency(it->value);
        std::unordered_set<std::string> used_attribute_names;
        rs.attrs_to_get = ::make_shared<const std::optional<attrs_to_get>>(calculate_attrs_to_get(it->value, used_attribute_names));
        // In BatchGetItem, ExpressionAttributeNames is specified per table,
        // inside this table's entry (it->value) - the same object that
        // calculate_attrs_to_get() above resolved the names from - and not
        // at the top level of the request. So that is the object in which
        // unused names must be looked for.
        verify_all_are_used(it->value, "ExpressionAttributeNames", used_attribute_names, "GetItem");
        auto& keys = (it->value)["Keys"];
        for (rjson::value& key : keys.GetArray()) {
            rs.add(key);
            check_key(key, rs.schema);
        }
        requests.emplace_back(std::move(rs));
    }

    // If we got here, all "requests" are valid, so let's start the
    // requests for the different partitions all in parallel.
    std::vector<future<std::vector<rjson::value>>> response_futures;
    for (const auto& rs : requests) {
        for (const auto &r : rs.requests) {
            auto& pk = r.first;
            auto& cks = r.second;
            dht::partition_range_vector partition_ranges{dht::partition_range(dht::decorate_key(*rs.schema, pk))};
            std::vector<query::clustering_range> bounds;
            if (rs.schema->clustering_key_size() == 0) {
                bounds.push_back(query::clustering_range::make_open_ended_both_sides());
            } else {
                for (auto& ck : cks) {
                    bounds.push_back(query::clustering_range::make_singular(ck.first));
                }
            }
            auto regular_columns = boost::copy_range<query::column_id_vector>(
                    rs.schema->regular_columns() | boost::adaptors::transformed([] (const column_definition& cdef) { return cdef.id; }));
            auto selection = cql3::selection::selection::wildcard(rs.schema);
            auto partition_slice = query::partition_slice(std::move(bounds), {}, std::move(regular_columns), selection->get_query_options());
            auto command = ::make_lw_shared<query::read_command>(rs.schema->id(), rs.schema->version(), partition_slice, _proxy.get_max_result_size(partition_slice));
            command->allow_limit = db::allow_per_partition_rate_limit::yes;
            future<std::vector<rjson::value>> f = _proxy.query(rs.schema, std::move(command), std::move(partition_ranges), rs.cl,
                    service::storage_proxy::coordinator_query_options(executor::default_timeout(), permit, client_state, trace_state)).then(
                    [schema = rs.schema, partition_slice = std::move(partition_slice), selection = std::move(selection), attrs_to_get = rs.attrs_to_get] (service::storage_proxy::coordinator_query_result qr) mutable {
                utils::get_local_injector().inject("alternator_batch_get_item", [] { throw std::runtime_error("batch_get_item injection"); });
                std::vector<rjson::value> jsons = describe_multi_item(schema, partition_slice, *selection, *qr.query_result, *attrs_to_get);
                return make_ready_future<std::vector<rjson::value>>(std::move(jsons));
            });
            response_futures.push_back(std::move(f));
        }
    }

    // Wait for all requests to complete, and then return the response.
    // In case of full failure (no reads succeeded), an arbitrary error
    // from one of the operations will be returned.
    bool some_succeeded = false;
    std::exception_ptr eptr;

    rjson::value response = rjson::empty_object();
    rjson::add(response, "Responses", rjson::empty_object());
    rjson::add(response, "UnprocessedKeys", rjson::empty_object());

    // The futures were pushed in exactly the iteration order used below, so
    // walking both in lockstep pairs each future with its partition.
    auto fut_it = response_futures.begin();
    for (const auto& rs : requests) {
        auto table = table_name(*rs.schema);
        for (const auto &r : rs.requests) {
            auto& pk = r.first;
            auto& cks = r.second;
            auto& fut = *fut_it;
            ++fut_it;
            try {
                std::vector<rjson::value> results = co_await std::move(fut);
                some_succeeded = true;
                if (!response["Responses"].HasMember(table)) {
                    rjson::add_with_string_name(response["Responses"], table, rjson::empty_array());
                }
                for (rjson::value& json : results) {
                    rjson::push_back(response["Responses"][table], std::move(json));
                }
            } catch(...) {
                eptr = std::current_exception();
                // This read of potentially several rows in one partition,
                // failed. We need to add the row key(s) to UnprocessedKeys.
                if (!response["UnprocessedKeys"].HasMember(table)) {
                    // Add the table's entry in UnprocessedKeys. Need to copy
                    // all the table's parameters from the request except the
                    // Keys field, which we start empty and then build below.
                    rjson::add_with_string_name(response["UnprocessedKeys"], table, rjson::empty_object());
                    rjson::value& unprocessed_item = response["UnprocessedKeys"][table];
                    rjson::value& request_item = request_items[table];
                    for (auto it = request_item.MemberBegin(); it != request_item.MemberEnd(); ++it) {
                        if (it->name != "Keys") {
                            rjson::add_with_string_name(unprocessed_item,
                                rjson::to_string_view(it->name), rjson::copy(it->value));
                        }
                    }
                    rjson::add_with_string_name(unprocessed_item, "Keys", rjson::empty_array());
                }
                // The original "Key" objects are moved out of the request
                // into the response; the request no longer needs them.
                for (auto& ck : cks) {
                    rjson::push_back(response["UnprocessedKeys"][table]["Keys"], std::move(*ck.second));
                }
            }
        }
    }
    elogger.trace("Unprocessed keys: {}", response["UnprocessedKeys"]);
    if (!some_succeeded && eptr) {
        co_await coroutine::return_exception_ptr(std::move(eptr));
    }
    if (is_big(response)) {
        co_return make_streamed(std::move(response));
    } else {
        co_return make_jsonable(std::move(response));
    }
}
|
|
|
|
// "filter" represents a condition that can be applied to individual items
|
|
// read by a Query or Scan operation, to decide whether to keep the item.
|
|
// A filter is constructed from a Query or Scan request. This uses the
|
|
// relevant fields in the query (FilterExpression or QueryFilter/ScanFilter +
|
|
// ConditionalOperator). These fields are pre-checked and pre-parsed as much
|
|
// as possible, to ensure that later checking of many items is efficient.
|
|
class filter {
|
|
private:
|
|
// Holding QueryFilter/ScanFilter + ConditionalOperator:
|
|
struct conditions_filter {
|
|
bool require_all;
|
|
rjson::value conditions;
|
|
};
|
|
// Holding a parsed FilterExpression:
|
|
struct expression_filter {
|
|
parsed::condition_expression expression;
|
|
};
|
|
std::optional<std::variant<conditions_filter, expression_filter>> _imp;
|
|
public:
|
|
// Filtering for Scan and Query are very similar, but there are some
|
|
// small differences, especially the names of the request attributes.
|
|
enum class request_type { SCAN, QUERY };
|
|
// Note that a filter does not store pointers to the query used to
|
|
// construct it.
|
|
filter(const rjson::value& request, request_type rt,
|
|
std::unordered_set<std::string>& used_attribute_names,
|
|
std::unordered_set<std::string>& used_attribute_values);
|
|
bool check(const rjson::value& item) const;
|
|
bool filters_on(std::string_view attribute) const;
|
|
// for_filters_on() runs the given function on the attributes that the
|
|
// filter works on. It may run for the same attribute more than once if
|
|
// used more than once in the filter.
|
|
void for_filters_on(const noncopyable_function<void(std::string_view)>& func) const;
|
|
operator bool() const { return bool(_imp); }
|
|
};
|
|
|
|
// Validates the filter-related attributes of a Scan or Query request and
// stores either the parsed FilterExpression or a copy of the old-style
// ScanFilter/QueryFilter conditions. Throws api_error::validation on
// illegal combinations of attributes or on expression syntax errors.
filter::filter(const rjson::value& request, request_type rt,
        std::unordered_set<std::string>& used_attribute_names,
        std::unordered_set<std::string>& used_attribute_values) {
    const rjson::value* filter_expression = rjson::find(request, "FilterExpression");
    const char* conditions_attribute = "QueryFilter";
    if (rt == request_type::SCAN) {
        conditions_attribute = "ScanFilter";
    }
    const rjson::value* old_style_conditions = rjson::find(request, conditions_attribute);
    const auto cond_operator = get_conditional_operator(request);

    // ConditionalOperator only makes sense together with non-empty old-style conditions.
    const bool conditions_missing_or_empty = !old_style_conditions ||
            (old_style_conditions->IsObject() && old_style_conditions->GetObject().ObjectEmpty());
    if (cond_operator != conditional_operator_type::MISSING && conditions_missing_or_empty) {
        throw api_error::validation(
                format("'ConditionalOperator' parameter cannot be specified for missing or empty {}",
                        conditions_attribute));
    }
    if (filter_expression && old_style_conditions) {
        throw api_error::validation(
                format("FilterExpression and {} are not allowed together", conditions_attribute));
    }

    // From here on at most one of the two filter styles is present.
    if (filter_expression) {
        if (!filter_expression->IsString()) {
            throw api_error::validation("FilterExpression must be a string");
        }
        if (filter_expression->GetStringLength() == 0) {
            throw api_error::validation("FilterExpression must not be empty");
        }
        if (rjson::find(request, "AttributesToGet")) {
            throw api_error::validation("Cannot use both old-style and new-style parameters in same request: FilterExpression and AttributesToGet");
        }
        try {
            auto parsed_expression = parse_condition_expression(rjson::to_string_view(*filter_expression));
            const rjson::value* names = rjson::find(request, "ExpressionAttributeNames");
            const rjson::value* values = rjson::find(request, "ExpressionAttributeValues");
            resolve_condition_expression(parsed_expression,
                    names, values,
                    used_attribute_names, used_attribute_values);
            _imp = expression_filter { std::move(parsed_expression) };
        } catch (const expressions_syntax_error& e) {
            // Syntax errors are reported to the client as validation errors.
            throw api_error::validation(e.what());
        }
        return;
    }

    if (!old_style_conditions) {
        return;
    }
    if (rjson::find(request, "ProjectionExpression")) {
        throw api_error::validation(format("Cannot use both old-style and new-style parameters in same request: {} and ProjectionExpression", conditions_attribute));
    }
    // All conditions are required unless the request explicitly asked for OR.
    const bool require_all = (cond_operator != conditional_operator_type::OR);
    _imp = conditions_filter { require_all, rjson::copy(*old_style_conditions) };
}
|
|
|
|
// Evaluates the filter against one item. When the request had no filter,
// every item is accepted.
bool filter::check(const rjson::value& item) const {
    if (_imp) {
        return std::visit(overloaded_functor {
            [&item] (const conditions_filter& cf) -> bool {
                return verify_condition(cf.conditions, cf.require_all, &item);
            },
            [&item] (const expression_filter& ef) -> bool {
                return verify_condition_expression(ef.expression, &item);
            }
        }, *_imp);
    }
    return true;
}
|
|
|
|
bool filter::filters_on(std::string_view attribute) const {
|
|
if (!_imp) {
|
|
return false;
|
|
}
|
|
return std::visit(overloaded_functor {
|
|
[&] (const conditions_filter& f) -> bool {
|
|
for (auto it = f.conditions.MemberBegin(); it != f.conditions.MemberEnd(); ++it) {
|
|
if (rjson::to_string_view(it->name) == attribute) {
|
|
return true;
|
|
}
|
|
}
|
|
return false;
|
|
},
|
|
[&] (const expression_filter& f) -> bool {
|
|
return condition_expression_on(f.expression, attribute);
|
|
}
|
|
}, *_imp);
|
|
}
|
|
|
|
void filter::for_filters_on(const noncopyable_function<void(std::string_view)>& func) const {
|
|
if (_imp) {
|
|
std::visit(overloaded_functor {
|
|
[&] (const conditions_filter& f) -> void {
|
|
for (auto it = f.conditions.MemberBegin(); it != f.conditions.MemberEnd(); ++it) {
|
|
func(rjson::to_string_view(it->name));
|
|
}
|
|
},
|
|
[&] (const expression_filter& f) -> void {
|
|
return for_condition_expression_on(f.expression, func);
|
|
}
|
|
}, *_imp);
|
|
}
|
|
}
|
|
|
|
class describe_items_visitor {
|
|
typedef std::vector<const column_definition*> columns_t;
|
|
const columns_t& _columns;
|
|
const std::optional<attrs_to_get>& _attrs_to_get;
|
|
std::unordered_set<std::string> _extra_filter_attrs;
|
|
const filter& _filter;
|
|
typename columns_t::const_iterator _column_it;
|
|
rjson::value _item;
|
|
rjson::value _items;
|
|
size_t _scanned_count;
|
|
|
|
public:
|
|
describe_items_visitor(const columns_t& columns, const std::optional<attrs_to_get>& attrs_to_get, filter& filter)
|
|
: _columns(columns)
|
|
, _attrs_to_get(attrs_to_get)
|
|
, _filter(filter)
|
|
, _column_it(columns.begin())
|
|
, _item(rjson::empty_object())
|
|
, _items(rjson::empty_array())
|
|
, _scanned_count(0)
|
|
{
|
|
// _filter.check() may need additional attributes not listed in
|
|
// _attrs_to_get (i.e., not requested as part of the output).
|
|
// We list those in _extra_filter_attrs. We will include them in
|
|
// the JSON but take them out before finally returning the JSON.
|
|
if (_attrs_to_get) {
|
|
_filter.for_filters_on([&] (std::string_view attr) {
|
|
std::string a(attr); // no heterogenous maps searches :-(
|
|
if (!_attrs_to_get->contains(a)) {
|
|
_extra_filter_attrs.emplace(std::move(a));
|
|
}
|
|
});
|
|
}
|
|
}
|
|
|
|
void start_row() {
|
|
_column_it = _columns.begin();
|
|
}
|
|
|
|
void accept_value(const std::optional<query::result_bytes_view>& result_bytes_view) {
|
|
if (!result_bytes_view) {
|
|
++_column_it;
|
|
return;
|
|
}
|
|
result_bytes_view->with_linearized([this] (bytes_view bv) {
|
|
std::string column_name = (*_column_it)->name_as_text();
|
|
if (column_name != executor::ATTRS_COLUMN_NAME) {
|
|
if (!_attrs_to_get || _attrs_to_get->contains(column_name) || _extra_filter_attrs.contains(column_name)) {
|
|
if (!_item.HasMember(column_name.c_str())) {
|
|
rjson::add_with_string_name(_item, column_name, rjson::empty_object());
|
|
}
|
|
rjson::value& field = _item[column_name.c_str()];
|
|
rjson::add_with_string_name(field, type_to_string((*_column_it)->type), json_key_column_value(bv, **_column_it));
|
|
}
|
|
} else {
|
|
auto deserialized = attrs_type()->deserialize(bv, cql_serialization_format::latest());
|
|
auto keys_and_values = value_cast<map_type_impl::native_type>(deserialized);
|
|
for (auto entry : keys_and_values) {
|
|
std::string attr_name = value_cast<sstring>(entry.first);
|
|
if (!_attrs_to_get || _attrs_to_get->contains(attr_name) || _extra_filter_attrs.contains(attr_name)) {
|
|
bytes value = value_cast<bytes>(entry.second);
|
|
// Even if _attrs_to_get asked to keep only a part of a
|
|
// top-level attribute, we keep the entire attribute
|
|
// at this stage, because the item filter might still
|
|
// need the other parts (it was easier for us to keep
|
|
// extra_filter_attrs at top-level granularity). We'll
|
|
// filter the unneeded parts after item filtering.
|
|
rjson::add_with_string_name(_item, attr_name, deserialize_item(value));
|
|
}
|
|
}
|
|
}
|
|
});
|
|
++_column_it;
|
|
}
|
|
|
|
void end_row() {
|
|
if (_filter.check(_item)) {
|
|
// As noted above, we kept entire top-level attributes listed in
|
|
// _attrs_to_get. We may need to only keep parts of them.
|
|
if (_attrs_to_get) {
|
|
for (const auto& attr: *_attrs_to_get) {
|
|
// If !attr.has_value() it means we were asked not to keep
|
|
// attr entirely, but just parts of it.
|
|
if (!attr.second.has_value()) {
|
|
rjson::value* toplevel= rjson::find(_item, attr.first);
|
|
if (toplevel && !hierarchy_filter(*toplevel, attr.second)) {
|
|
rjson::remove_member(_item, attr.first);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
// Remove the extra attributes _extra_filter_attrs which we had
|
|
// to add just for the filter, and not requested to be returned:
|
|
for (const auto& attr : _extra_filter_attrs) {
|
|
rjson::remove_member(_item, attr);
|
|
}
|
|
|
|
rjson::push_back(_items, std::move(_item));
|
|
}
|
|
_item = rjson::empty_object();
|
|
++_scanned_count;
|
|
}
|
|
|
|
rjson::value get_items() && {
|
|
return std::move(_items);
|
|
}
|
|
|
|
size_t get_scanned_count() {
|
|
return _scanned_count;
|
|
}
|
|
};
|
|
|
|
// Converts a result set into the JSON description of a Query/Scan response
// ("Count", "ScannedCount" and usually "Items"), applying the given filter.
// Returns that JSON together with the number of items which passed the filter.
static std::tuple<rjson::value, size_t> describe_items(schema_ptr schema, const query::partition_slice& slice, const cql3::selection::selection& selection, std::unique_ptr<cql3::result_set> result_set, std::optional<attrs_to_get>&& attrs_to_get, filter&& filter) {
    describe_items_visitor item_collector(selection.get_columns(), attrs_to_get, filter);
    result_set->visit(item_collector);
    auto scanned = item_collector.get_scanned_count();
    rjson::value matching_items = std::move(item_collector).get_items();
    auto matching_count = matching_items.Size();

    rjson::value description = rjson::empty_object();
    rjson::add(description, "Count", rjson::value(matching_count));
    rjson::add(description, "ScannedCount", rjson::value(scanned));
    // An engaged but empty attrs_to_get means the user asked for no
    // attributes at all (i.e., a Scan or Query with Select=COUNT), in which
    // case "Items" must not appear in the response.
    // TODO: consider optimizing the case of Select=COUNT without a filter.
    // In that case, we currently build a list of empty items and here drop
    // it. We could just count the items and not bother with the empty items.
    // (However, remember that when we do have a filter, we need the items).
    const bool count_only = attrs_to_get && attrs_to_get->empty();
    if (!count_only) {
        rjson::add(description, "Items", std::move(matching_items));
    }
    return {std::move(description), matching_count};
}
|
|
|
|
// Encodes a paging state as the JSON object returned to the user as
// "LastEvaluatedKey": one {name: {type: value}} entry per partition key
// column and, when the position has a key, per clustering key column.
static rjson::value encode_paging_state(const schema& schema, const service::pager::paging_state& paging_state) {
    rjson::value last_evaluated_key = rjson::empty_object();

    // Adds one entry per key column, taking the values in order from the
    // exploded key components.
    auto add_key_columns = [&last_evaluated_key] (const auto& columns, const auto& components) {
        auto component = components.begin();
        for (const column_definition& cdef : columns) {
            rjson::add_with_string_name(last_evaluated_key, std::string_view(cdef.name_as_text()), rjson::empty_object());
            rjson::value& key_entry = last_evaluated_key[cdef.name_as_text()];
            rjson::add_with_string_name(key_entry, type_to_string(cdef.type), json_key_column_value(*component, cdef));
            ++component;
        }
    };

    std::vector<bytes> exploded_pk = paging_state.get_partition_key().explode();
    add_key_columns(schema.partition_key_columns(), exploded_pk);

    auto pos = paging_state.get_position_in_partition();
    if (pos.has_key()) {
        auto exploded_ck = pos.key().explode();
        add_key_columns(schema.clustering_key_columns(), exploded_ck);
    }

    // To avoid possible conflicts (and thus having to reserve these names) we
    // avoid adding the weight and region fields of the position to the paging
    // state. Alternator will never need these as it doesn't have range
    // tombstones (the only thing that can generate a position other than at(row)).
    // We conditionally include these fields when reading CQL tables through alternator.
    const bool position_is_plain_row = pos.has_key() && pos.get_bound_weight() == bound_weight::equal;
    if (!is_alternator_keyspace(schema.ks_name()) && !position_is_plain_row) {
        rjson::add_with_string_name(last_evaluated_key, scylla_paging_region, rjson::empty_object());
        rjson::add(last_evaluated_key[scylla_paging_region.data()], "S", rjson::from_string(to_string(pos.region())));
        rjson::add_with_string_name(last_evaluated_key, scylla_paging_weight, rjson::empty_object());
        rjson::add(last_evaluated_key[scylla_paging_weight.data()], "N", static_cast<int>(pos.get_bound_weight()));
    }
    return last_evaluated_key;
}
|
|
|
|
// Reads one page of results for a Query or Scan request and converts it to
// the JSON response.
//
// exclusive_start_key, if not null, is the user's ExclusiveStartKey and is
// turned into the initial paging state. partition_ranges and ck_bounds
// select what to read, limit bounds the page size, and the filter and
// attrs_to_get are applied to the rows which were read. When the read is
// not exhausted, "LastEvaluatedKey" is added to the response.
//
// cql_stats is updated from the continuation, after this function has
// already returned, so it must stay alive until the returned future
// resolves.
static future<executor::request_return_type> do_query(service::storage_proxy& proxy,
        schema_ptr schema,
        const rjson::value* exclusive_start_key,
        dht::partition_range_vector&& partition_ranges,
        std::vector<query::clustering_range>&& ck_bounds,
        std::optional<attrs_to_get>&& attrs_to_get,
        uint32_t limit,
        db::consistency_level cl,
        filter&& filter,
        query::partition_slice::option_set custom_opts,
        service::client_state& client_state,
        cql3::cql_stats& cql_stats,
        tracing::trace_state_ptr trace_state,
        service_permit permit) {
    lw_shared_ptr<service::pager::paging_state> paging_state = nullptr;

    tracing::trace(trace_state, "Performing a database query");

    if (exclusive_start_key) {
        partition_key pk = pk_from_json(*exclusive_start_key, schema);
        auto pos = position_in_partition(position_in_partition::partition_start_tag_t());
        if (schema->clustering_key_size() > 0) {
            pos = pos_from_json(*exclusive_start_key, schema);
        }
        paging_state = make_lw_shared<service::pager::paging_state>(pk, pos, query::max_partitions, utils::UUID(), service::pager::paging_state::replicas_per_token_range{}, std::nullopt, 0);
    }

    auto regular_columns = boost::copy_range<query::column_id_vector>(
            schema->regular_columns() | boost::adaptors::transformed([] (const column_definition& cdef) { return cdef.id; }));
    auto static_columns = boost::copy_range<query::column_id_vector>(
            schema->static_columns() | boost::adaptors::transformed([] (const column_definition& cdef) { return cdef.id; }));
    auto selection = cql3::selection::selection::wildcard(schema);
    query::partition_slice::option_set opts = selection->get_query_options();
    opts.add(custom_opts);
    auto partition_slice = query::partition_slice(std::move(ck_bounds), std::move(static_columns), std::move(regular_columns), opts);
    auto command = ::make_lw_shared<query::read_command>(schema->id(), schema->version(), partition_slice, proxy.get_max_result_size(partition_slice));

    auto query_state_ptr = std::make_unique<service::query_state>(client_state, trace_state, std::move(permit));

    // FIXME: should be moved above, set on opts, so get_max_result_size knows it?
    command->slice.options.set<query::partition_slice::option::allow_short_read>();
    auto query_options = std::make_unique<cql3::query_options>(cl, std::vector<cql3::raw_value>{});
    query_options = std::make_unique<cql3::query_options>(std::move(query_options), std::move(paging_state));
    auto p = service::pager::query_pagers::pager(proxy, schema, selection, *query_state_ptr, *query_options, command, std::move(partition_ranges), nullptr);

    // The pager refers to *query_state_ptr and *query_options, so both are
    // moved into the continuation to keep them alive until the page arrives.
    // cql_stats must be captured by reference: capturing it by copy would
    // apply the counter updates below to a private copy and lose them.
    return p->fetch_page(limit, gc_clock::now(), executor::default_timeout()).then(
            [p = std::move(p), schema, &cql_stats, partition_slice = std::move(partition_slice),
             selection = std::move(selection), query_state_ptr = std::move(query_state_ptr),
             attrs_to_get = std::move(attrs_to_get),
             query_options = std::move(query_options),
             filter = std::move(filter)] (std::unique_ptr<cql3::result_set> rs) mutable {
        if (!p->is_exhausted()) {
            rs->get_metadata().set_paging_state(p->state());
        }
        auto paging_state = rs->get_metadata().paging_state();
        // Remember whether a filter exists before it is moved away below.
        bool has_filter = filter;
        auto [items, size] = describe_items(schema, partition_slice, *selection, std::move(rs), std::move(attrs_to_get), std::move(filter));
        if (paging_state) {
            rjson::add(items, "LastEvaluatedKey", encode_paging_state(*schema, *paging_state));
        }
        if (has_filter) {
            cql_stats.filtered_rows_read_total += p->stats().rows_read_total;
            // update our "filtered_row_matched_total" for all the rows matched, despite the filter
            cql_stats.filtered_rows_matched_total += size;
        }
        // TODO: better threshold
        if (size > 10) {
            return make_ready_future<executor::request_return_type>(make_streamed(std::move(items)));
        }
        return make_ready_future<executor::request_return_type>(make_jsonable(std::move(items)));
    });
}
|
|
|
|
// Returns the token at which the given segment of a parallel scan starts,
// when the token range is divided into total_segments equal parts.
static dht::token token_for_segment(int segment, int total_segments) {
    assert(total_segments > 1 && segment >= 0 && segment < total_segments);
    // Width of one segment, in tokens.
    const uint64_t segment_width = std::numeric_limits<uint64_t>::max() / total_segments;
    // The offset is computed in unsigned arithmetic and added to the
    // smallest int64 value.
    const uint64_t offset_from_minimum = segment_width * segment;
    return dht::token::from_int64(std::numeric_limits<int64_t>::min() + offset_from_minimum);
}
|
|
|
|
// Returns the partition range covered by one segment of a parallel scan.
// The first segment has no lower bound, the last one has no upper bound,
// and each segment's upper bound is exclusive.
static dht::partition_range get_range_for_segment(int segment, int total_segments) {
    using range = dht::partition_range;

    if (total_segments == 1) {
        // A single segment covers everything.
        return range::make_open_ended_both_sides();
    }
    if (segment == 0) {
        const dht::token next_segment_start = token_for_segment(1, total_segments);
        return range::make_ending_with(
                range::bound(dht::ring_position::ending_at(next_segment_start), false));
    }
    const dht::token segment_start = token_for_segment(segment, total_segments);
    if (segment == total_segments - 1) {
        return range::make_starting_with(
                range::bound(dht::ring_position::starting_at(segment_start)));
    }
    const dht::token next_segment_start = token_for_segment(segment + 1, total_segments);
    return range::make(
            range::bound(dht::ring_position::starting_at(segment_start)),
            range::bound(dht::ring_position::ending_at(next_segment_start), false));
}
|
|
|
|
// Implements the Scan operation: validates the request's Segment,
// TotalSegments, consistency, Limit, Select and filter attributes, works
// out which partition range to read, and delegates the read to do_query().
// Invalid requests are reported through a ready future holding an
// api_error::validation (helpers called here may also throw).
future<executor::request_return_type> executor::scan(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request) {
    _stats.api_operations.scan++;
    elogger.trace("Scanning {}", request);

    auto [schema, table_type] = get_table_or_view(_proxy, request);

    // Parallel scan: Segment and TotalSegments must come together.
    auto segment = get_int_attribute(request, "Segment");
    auto total_segments = get_int_attribute(request, "TotalSegments");
    if (segment || total_segments) {
        if (!segment || !total_segments) {
            return make_ready_future<request_return_type>(api_error::validation(
                    "Both Segment and TotalSegments attributes need to be present for a parallel scan"));
        }
        if (*segment < 0 || *segment >= *total_segments) {
            return make_ready_future<request_return_type>(api_error::validation(
                    "Segment must be non-negative and less than TotalSegments"));
        }
        if (*total_segments < 0 || *total_segments > 1000000) {
            return make_ready_future<request_return_type>(api_error::validation(
                    "TotalSegments must be non-negative and less or equal to 1000000"));
        }
    }

    rjson::value* exclusive_start_key = rjson::find(request, "ExclusiveStartKey");

    db::consistency_level cl = get_read_consistency(request);
    if (table_type == table_or_view_type::gsi && cl != db::consistency_level::LOCAL_ONE) {
        return make_ready_future<request_return_type>(api_error::validation(
                "Consistent reads are not allowed on global indexes (GSI)"));
    }

    // Without a Limit, the page size is effectively unbounded.
    rjson::value* limit_json = rjson::find(request, "Limit");
    uint32_t limit = std::numeric_limits<uint32_t>::max();
    if (limit_json) {
        // Clamp instead of narrowing: a value which doesn't fit in 32 bits
        // would otherwise silently wrap around (e.g., 2^32 would become 0
        // and be rejected below, and 2^32+5 would become 5).
        const uint64_t requested_limit = limit_json->GetUint64();
        if (requested_limit < limit) {
            limit = static_cast<uint32_t>(requested_limit);
        }
    }
    if (limit == 0) {
        return make_ready_future<request_return_type>(api_error::validation("Limit must be greater than 0"));
    }

    select_type select = parse_select(request, table_type);

    std::unordered_set<std::string> used_attribute_names;
    std::unordered_set<std::string> used_attribute_values;
    auto attrs_to_get = calculate_attrs_to_get(request, used_attribute_names, select);

    dht::partition_range_vector partition_ranges;
    if (segment) {
        auto range = get_range_for_segment(*segment, *total_segments);
        if (exclusive_start_key) {
            // The key to resume from must belong to the requested segment.
            auto ring_pos = dht::ring_position{dht::decorate_key(*schema, pk_from_json(*exclusive_start_key, schema))};
            if (!range.contains(ring_pos, dht::ring_position_comparator(*schema))) {
                return make_ready_future<request_return_type>(api_error::validation(
                    format("The provided starting key is invalid: Invalid ExclusiveStartKey. Please use ExclusiveStartKey "
                            "with correct Segment. TotalSegments: {} Segment: {}", *total_segments, *segment)));
            }
        }
        partition_ranges.push_back(range);
    } else {
        partition_ranges.push_back(dht::partition_range::make_open_ended_both_sides());
    }
    std::vector<query::clustering_range> ck_bounds{query::clustering_range::make_open_ended_both_sides()};

    filter filter(request, filter::request_type::SCAN, used_attribute_names, used_attribute_values);
    // Note: Unlike Query, Scan does allow a filter on the key attributes.
    // For some *specific* cases of key filtering, such an equality test on
    // partition key or comparison operator for the sort key, we could have
    // optimized the filtering by modifying partition_ranges and/or
    // ck_bounds. We haven't done this optimization yet.

    verify_all_are_used(request, "ExpressionAttributeNames", used_attribute_names, "Scan");
    verify_all_are_used(request, "ExpressionAttributeValues", used_attribute_values, "Scan");

    return do_query(_proxy, schema, exclusive_start_key, std::move(partition_ranges), std::move(ck_bounds), std::move(attrs_to_get), limit, cl,
            std::move(filter), query::partition_slice::option_set(), client_state, _stats.cql_stats, trace_state, std::move(permit));
}
|
|
|
|
// Translates a KeyConditions restriction on the hash key into a partition
// range holding a single key. Only the EQ operator with exactly one
// attribute value is accepted; anything else is a validation error.
static dht::partition_range calculate_pk_bound(schema_ptr schema, const column_definition& pk_cdef, const rjson::value& comp_definition, const rjson::value& attrs) {
    if (get_comparison_operator(comp_definition) != comparison_operator_type::EQ) {
        throw api_error::validation(format("Hash key can only be restricted with equality operator (EQ). {} not supported.", comp_definition));
    }
    if (attrs.Size() != 1) {
        throw api_error::validation(format("A single attribute is required for a hash key EQ restriction: {}", attrs));
    }
    bytes key_bytes = get_key_from_typed_value(attrs[0], pk_cdef);
    partition_key pk = partition_key::from_singular_bytes(*schema, std::move(key_bytes));
    auto dk = dht::decorate_key(*schema, pk);
    return dht::partition_range(dk);
}
|
|
|
|
// Builds the clustering range of all keys beginning with a given prefix:
// from ck (inclusive) up to, exclusively, the prefix with its last
// non-0xFF byte incremented and everything after that byte dropped. If the
// prefix has no such byte, the range has no upper bound.
static query::clustering_range get_clustering_range_for_begins_with(bytes&& target, const clustering_key& ck, schema_ptr schema, data_type t) {
    using range = query::clustering_range;

    // Locate the last byte of the prefix which differs from 0xFF.
    auto last_incrementable = boost::range::find_end(target, bytes("\xFF"), std::not_equal_to<bytes::value_type>());
    if (last_incrementable == target.end()) {
        return range::make_starting_with(range::bound(ck));
    }
    ++*last_incrementable;
    target.resize(std::distance(target.begin(), last_incrementable) + 1);
    clustering_key exclusive_end = clustering_key::from_single_value(*schema, target);
    return range::make(range::bound(ck), range::bound(exclusive_end, false));
}
|
|
|
|
// Translates a KeyConditions restriction on the sort key into a clustering
// range. BETWEEN takes two attribute values and every other operator takes
// one; an unsupported operator or a wrong number of values is a validation
// error.
static query::clustering_range calculate_ck_bound(schema_ptr schema, const column_definition& ck_cdef, const rjson::value& comp_definition, const rjson::value& attrs) {
    using range = query::clustering_range;

    auto op = get_comparison_operator(comp_definition);
    const size_t expected_attrs_size = (op == comparison_operator_type::BETWEEN) ? 2 : 1;
    if (attrs.Size() != expected_attrs_size) {
        throw api_error::validation(format("{} arguments expected for a sort key restriction: {}", expected_attrs_size, attrs));
    }
    // The first (and usually only) value of the restriction.
    bytes first_raw = get_key_from_typed_value(attrs[0], ck_cdef);
    clustering_key first_key = clustering_key::from_single_value(*schema, first_raw);
    switch (op) {
    case comparison_operator_type::EQ:
        return range(first_key);
    case comparison_operator_type::LE:
        return range::make_ending_with(range::bound(first_key));
    case comparison_operator_type::LT:
        return range::make_ending_with(range::bound(first_key, false));
    case comparison_operator_type::GE:
        return range::make_starting_with(range::bound(first_key));
    case comparison_operator_type::GT:
        return range::make_starting_with(range::bound(first_key, false));
    case comparison_operator_type::BETWEEN: {
        // Both ends of a BETWEEN range are inclusive.
        bytes second_raw = get_key_from_typed_value(attrs[1], ck_cdef);
        clustering_key second_key = clustering_key::from_single_value(*schema, second_raw);
        return range::make(range::bound(first_key), range::bound(second_key));
    }
    case comparison_operator_type::BEGINS_WITH: {
        // Every key begins with the empty prefix.
        if (raw_value_is_empty(first_raw)) {
            return range::make_open_ended_both_sides();
        }
        // NOTICE(sarna): A range starting with given prefix and ending (non-inclusively) with a string "incremented" by a single
        // character at the end. Throws for NUMBER instances.
        if (!ck_cdef.type->is_compatible_with(*utf8_type)) {
            throw api_error::validation(format("BEGINS_WITH operator cannot be applied to type {}", type_to_string(ck_cdef.type)));
        }
        return get_clustering_range_for_begins_with(std::move(first_raw), first_key, schema, ck_cdef.type);
    }
    default:
        throw api_error::validation(format("Operator {} not supported for sort key", comp_definition));
    }
}
|
|
|
|
// Calculates primary key bounds from KeyConditions
|
|
static std::pair<dht::partition_range_vector, std::vector<query::clustering_range>>
|
|
calculate_bounds_conditions(schema_ptr schema, const rjson::value& conditions) {
|
|
dht::partition_range_vector partition_ranges;
|
|
std::vector<query::clustering_range> ck_bounds;
|
|
|
|
for (auto it = conditions.MemberBegin(); it != conditions.MemberEnd(); ++it) {
|
|
std::string key = it->name.GetString();
|
|
const rjson::value& condition = it->value;
|
|
|
|
const rjson::value& comp_definition = rjson::get(condition, "ComparisonOperator");
|
|
const rjson::value& attr_list = rjson::get(condition, "AttributeValueList");
|
|
|
|
const column_definition& pk_cdef = schema->partition_key_columns().front();
|
|
const column_definition* ck_cdef = schema->clustering_key_size() > 0 ? &schema->clustering_key_columns().front() : nullptr;
|
|
if (sstring(key) == pk_cdef.name_as_text()) {
|
|
if (!partition_ranges.empty()) {
|
|
throw api_error::validation("Currently only a single restriction per key is allowed");
|
|
}
|
|
partition_ranges.push_back(calculate_pk_bound(schema, pk_cdef, comp_definition, attr_list));
|
|
}
|
|
if (ck_cdef && sstring(key) == ck_cdef->name_as_text()) {
|
|
if (!ck_bounds.empty()) {
|
|
throw api_error::validation("Currently only a single restriction per key is allowed");
|
|
}
|
|
ck_bounds.push_back(calculate_ck_bound(schema, *ck_cdef, comp_definition, attr_list));
|
|
}
|
|
}
|
|
|
|
// Validate that a query's conditions must be on the hash key, and
|
|
// optionally also on the sort key if it exists.
|
|
if (partition_ranges.empty()) {
|
|
throw api_error::validation(format("Query missing condition on hash key '{}'", schema->partition_key_columns().front().name_as_text()));
|
|
}
|
|
if (schema->clustering_key_size() == 0) {
|
|
if (conditions.MemberCount() != 1) {
|
|
throw api_error::validation("Only one condition allowed in table with only hash key");
|
|
}
|
|
} else {
|
|
if (conditions.MemberCount() == 2 && ck_bounds.empty()) {
|
|
throw api_error::validation(format("Query missing condition on sort key '{}'", schema->clustering_key_columns().front().name_as_text()));
|
|
} else if (conditions.MemberCount() > 2) {
|
|
throw api_error::validation("Only one or two conditions allowed in table with hash key and sort key");
|
|
}
|
|
}
|
|
|
|
if (ck_bounds.empty()) {
|
|
ck_bounds.push_back(query::clustering_range::make_open_ended_both_sides());
|
|
}
|
|
|
|
return {std::move(partition_ranges), std::move(ck_bounds)};
|
|
}
|
|
|
|
// Extract the top-level column name specified in a KeyConditionExpression.
|
|
// If a nested attribute path is given, a ValidationException is generated.
|
|
// If the column name is a #reference to ExpressionAttributeNames, the
|
|
// reference is resolved.
|
|
// Note this function returns a string_view, which may refer to data in the
|
|
// given parsed::value or expression_attribute_names.
|
|
static std::string_view get_toplevel(const parsed::value& v,
|
|
const rjson::value* expression_attribute_names,
|
|
std::unordered_set<std::string>& used_attribute_names)
|
|
{
|
|
const parsed::path& path = std::get<parsed::path>(v._value);
|
|
if (path.has_operators()) {
|
|
throw api_error::validation("KeyConditionExpression does not support nested attributes");
|
|
}
|
|
std::string_view column_name = path.root();
|
|
if (column_name.size() > 0 && column_name[0] == '#') {
|
|
used_attribute_names.emplace(column_name);
|
|
if (!expression_attribute_names) {
|
|
throw api_error::validation(
|
|
format("ExpressionAttributeNames missing, entry '{}' required by KeyConditionExpression",
|
|
column_name));
|
|
}
|
|
const rjson::value* value = rjson::find(*expression_attribute_names, column_name);
|
|
if (!value || !value->IsString()) {
|
|
throw api_error::validation(
|
|
format("ExpressionAttributeNames missing entry '{}' required by KeyConditionExpression",
|
|
column_name));
|
|
}
|
|
column_name = rjson::to_string_view(*value);
|
|
}
|
|
return column_name;
|
|
}
|
|
|
|
// Extract a constant value specified in a KeyConditionExpression.
|
|
// This constant was originally parsed as a reference (:name) to a member of
|
|
// ExpressionAttributeValues, but at this point, after resolve_value(), it
|
|
// was already converted into a JSON value.
|
|
// This function decodes the value (using its given expected type) into bytes
|
|
// which Scylla uses as the actual key value. If the value has the wrong type,
|
|
// or the input had other problems, a ValidationException is thrown.
|
|
static bytes get_constant_value(const parsed::value& v,
|
|
const column_definition& column)
|
|
{
|
|
const parsed::constant& constant = std::get<parsed::constant>(v._value);
|
|
const parsed::constant::literal& lit = std::get<parsed::constant::literal>(constant._value);
|
|
return get_key_from_typed_value(*lit, column);
|
|
}
|
|
|
|
// condition_expression_and_list extracts a list of ANDed primitive conditions
|
|
// from a condition_expression. This is useful for KeyConditionExpression,
|
|
// which may not use OR or NOT. If the given condition_expression does use
|
|
// OR or NOT, this function throws a ValidationException.
|
|
static void condition_expression_and_list(
|
|
const parsed::condition_expression& condition_expression,
|
|
std::vector<const parsed::primitive_condition*>& conditions)
|
|
{
|
|
if (condition_expression._negated) {
|
|
throw api_error::validation("KeyConditionExpression cannot use NOT");
|
|
}
|
|
std::visit(overloaded_functor {
|
|
[&] (const parsed::primitive_condition& cond) {
|
|
conditions.push_back(&cond);
|
|
},
|
|
[&] (const parsed::condition_expression::condition_list& list) {
|
|
if (list.op == '|' && list.conditions.size() > 1) {
|
|
throw api_error::validation("KeyConditionExpression cannot use OR");
|
|
}
|
|
for (const parsed::condition_expression& cond : list.conditions) {
|
|
condition_expression_and_list(cond, conditions);
|
|
}
|
|
}
|
|
}, condition_expression._expression);
|
|
}
|
|
|
|
// Calculates primary key bounds from KeyConditionExpression
|
|
static std::pair<dht::partition_range_vector, std::vector<query::clustering_range>>
|
|
calculate_bounds_condition_expression(schema_ptr schema,
|
|
const rjson::value& expression,
|
|
const rjson::value* expression_attribute_values,
|
|
std::unordered_set<std::string>& used_attribute_values,
|
|
const rjson::value* expression_attribute_names,
|
|
std::unordered_set<std::string>& used_attribute_names)
|
|
{
|
|
if (!expression.IsString()) {
|
|
throw api_error::validation("KeyConditionExpression must be a string");
|
|
}
|
|
if (expression.GetStringLength() == 0) {
|
|
throw api_error::validation("KeyConditionExpression must not be empty");
|
|
}
|
|
// We parse the KeyConditionExpression with the same parser we use for
|
|
// ConditionExpression. But KeyConditionExpression only supports a subset
|
|
// of the ConditionExpression features, so we have many additional
|
|
// verifications below that the key condition is legal. Briefly, a valid
|
|
// key condition must contain a single partition key and a single
|
|
// sort-key range.
|
|
parsed::condition_expression p;
|
|
try {
|
|
p = parse_condition_expression(rjson::to_string_view(expression));
|
|
} catch(expressions_syntax_error& e) {
|
|
throw api_error::validation(e.what());
|
|
}
|
|
resolve_condition_expression(p,
|
|
expression_attribute_names, expression_attribute_values,
|
|
used_attribute_names, used_attribute_values);
|
|
std::vector<const parsed::primitive_condition*> conditions;
|
|
condition_expression_and_list(p, conditions);
|
|
|
|
if (conditions.size() < 1 || conditions.size() > 2) {
|
|
throw api_error::validation(
|
|
"KeyConditionExpression syntax error: must have 1 or 2 conditions");
|
|
}
|
|
// Scylla allows us to have an (equality) constraint on the partition key
|
|
// pk_cdef, and a range constraint on the *first* clustering key ck_cdef.
|
|
// Note that this is also good enough for our GSI implementation - the
|
|
// GSI's user-specified sort key will be the first clustering key.
|
|
// FIXME: In the case described in issue #5320 (base and GSI both have
|
|
// just hash key - but different ones), this may allow the user to Query
|
|
// using the base key which isn't officially part of the GSI.
|
|
const column_definition& pk_cdef = schema->partition_key_columns().front();
|
|
const column_definition* ck_cdef = schema->clustering_key_size() > 0 ?
|
|
&schema->clustering_key_columns().front() : nullptr;
|
|
|
|
dht::partition_range_vector partition_ranges;
|
|
std::vector<query::clustering_range> ck_bounds;
|
|
for (const parsed::primitive_condition* condp : conditions) {
|
|
const parsed::primitive_condition& cond = *condp;
|
|
// In all comparison operators, one operand must be a column name,
|
|
// the other is a constant (value reference). We remember which is
|
|
// which in toplevel_ind, and also the column name in key (not just
|
|
// for comparison operators).
|
|
std::string_view key;
|
|
int toplevel_ind;
|
|
switch (cond._values.size()) {
|
|
case 1: {
|
|
// The only legal single-value condition is a begin_with() function,
|
|
// and it must have two parameters - a top-level attribute and a
|
|
// value reference..
|
|
const parsed::value::function_call *f = std::get_if<parsed::value::function_call>(&cond._values[0]._value);
|
|
if (!f) {
|
|
throw api_error::validation("KeyConditionExpression cannot be just a value");
|
|
}
|
|
if (f->_function_name != "begins_with") {
|
|
throw api_error::validation(
|
|
format("KeyConditionExpression function '{}' not supported",f->_function_name));
|
|
}
|
|
if (f->_parameters.size() != 2 || !f->_parameters[0].is_path() ||
|
|
!f->_parameters[1].is_constant()) {
|
|
throw api_error::validation(
|
|
"KeyConditionExpression begins_with() takes attribute and value");
|
|
}
|
|
key = get_toplevel(f->_parameters[0], expression_attribute_names, used_attribute_names);
|
|
toplevel_ind = -1;
|
|
break;
|
|
}
|
|
case 2:
|
|
if (cond._values[0].is_path() && cond._values[1].is_constant()) {
|
|
toplevel_ind = 0;
|
|
} else if (cond._values[1].is_path() && cond._values[0].is_constant()) {
|
|
toplevel_ind = 1;
|
|
} else {
|
|
throw api_error::validation("KeyConditionExpression must compare attribute with constant");
|
|
}
|
|
key = get_toplevel(cond._values[toplevel_ind], expression_attribute_names, used_attribute_names);
|
|
break;
|
|
case 3:
|
|
// Only BETWEEN has three operands. First must be a column name,
|
|
// two other must be value references (constants):
|
|
if (cond._op != parsed::primitive_condition::type::BETWEEN) {
|
|
// Shouldn't happen unless we have a bug in the parser
|
|
throw std::logic_error(format("Wrong number of values {} in primitive_condition", cond._values.size()));
|
|
}
|
|
if (cond._values[0].is_path() && cond._values[1].is_constant() && cond._values[2].is_constant()) {
|
|
toplevel_ind = 0;
|
|
key = get_toplevel(cond._values[0], expression_attribute_names, used_attribute_names);
|
|
} else {
|
|
throw api_error::validation("KeyConditionExpression must compare attribute with constants");
|
|
}
|
|
break;
|
|
default:
|
|
// Shouldn't happen unless we have a bug in the parser
|
|
throw std::logic_error(format("Wrong number of values {} in primitive_condition", cond._values.size()));
|
|
}
|
|
if (cond._op == parsed::primitive_condition::type::IN) {
|
|
throw api_error::validation("KeyConditionExpression does not support IN operator");
|
|
} else if (cond._op == parsed::primitive_condition::type::NE) {
|
|
throw api_error::validation("KeyConditionExpression does not support NE operator");
|
|
} else if (cond._op == parsed::primitive_condition::type::EQ) {
|
|
// the EQ operator (=) is the only one which can be used for both
|
|
// the partition key and sort key:
|
|
if (sstring(key) == pk_cdef.name_as_text()) {
|
|
if (!partition_ranges.empty()) {
|
|
throw api_error::validation(
|
|
"KeyConditionExpression allows only one condition for each key");
|
|
}
|
|
bytes raw_value = get_constant_value(cond._values[!toplevel_ind], pk_cdef);
|
|
partition_key pk = partition_key::from_singular_bytes(*schema, std::move(raw_value));
|
|
auto decorated_key = dht::decorate_key(*schema, pk);
|
|
partition_ranges.push_back(dht::partition_range(decorated_key));
|
|
} else if (ck_cdef && sstring(key) == ck_cdef->name_as_text()) {
|
|
if (!ck_bounds.empty()) {
|
|
throw api_error::validation(
|
|
"KeyConditionExpression allows only one condition for each key");
|
|
}
|
|
bytes raw_value = get_constant_value(cond._values[!toplevel_ind], *ck_cdef);
|
|
clustering_key ck = clustering_key::from_single_value(*schema, raw_value);
|
|
ck_bounds.push_back(query::clustering_range(ck));
|
|
} else {
|
|
throw api_error::validation(
|
|
format("KeyConditionExpression condition on non-key attribute {}", key));
|
|
}
|
|
continue;
|
|
}
|
|
// If we're still here, it's any other operator besides EQ, and these
|
|
// are allowed *only* on the clustering key:
|
|
if (sstring(key) == pk_cdef.name_as_text()) {
|
|
throw api_error::validation(
|
|
format("KeyConditionExpression only '=' condition is supported on partition key {}", key));
|
|
} else if (!ck_cdef || sstring(key) != ck_cdef->name_as_text()) {
|
|
throw api_error::validation(
|
|
format("KeyConditionExpression condition on non-key attribute {}", key));
|
|
}
|
|
if (!ck_bounds.empty()) {
|
|
throw api_error::validation(
|
|
"KeyConditionExpression allows only one condition for each key");
|
|
}
|
|
if (cond._op == parsed::primitive_condition::type::BETWEEN) {
|
|
clustering_key ck1 = clustering_key::from_single_value(*schema,
|
|
get_constant_value(cond._values[1], *ck_cdef));
|
|
clustering_key ck2 = clustering_key::from_single_value(*schema,
|
|
get_constant_value(cond._values[2], *ck_cdef));
|
|
ck_bounds.push_back(query::clustering_range::make(
|
|
query::clustering_range::bound(ck1), query::clustering_range::bound(ck2)));
|
|
continue;
|
|
} else if (cond._values.size() == 1) {
|
|
// We already verified above, that this case this can only be a
|
|
// function call to begins_with(), with the first parameter the
|
|
// key, the second the value reference.
|
|
bytes raw_value = get_constant_value(
|
|
std::get<parsed::value::function_call>(cond._values[0]._value)._parameters[1], *ck_cdef);
|
|
if (!ck_cdef->type->is_compatible_with(*utf8_type)) {
|
|
// begins_with() supported on bytes and strings (both stored
|
|
// in the database as strings) but not on numbers.
|
|
throw api_error::validation(
|
|
format("KeyConditionExpression begins_with() not supported on type {}",
|
|
type_to_string(ck_cdef->type)));
|
|
} else if (raw_value.empty()) {
|
|
ck_bounds.push_back(query::clustering_range::make_open_ended_both_sides());
|
|
} else {
|
|
clustering_key ck = clustering_key::from_single_value(*schema, raw_value);
|
|
ck_bounds.push_back(get_clustering_range_for_begins_with(std::move(raw_value), ck, schema, ck_cdef->type));
|
|
}
|
|
continue;
|
|
}
|
|
|
|
// All remaining operator have one value reference parameter in index
|
|
// !toplevel_ind. Note how toplevel_ind==1 reverses the direction of
|
|
// an inequality.
|
|
bytes raw_value = get_constant_value(cond._values[!toplevel_ind], *ck_cdef);
|
|
clustering_key ck = clustering_key::from_single_value(*schema, raw_value);
|
|
if ((cond._op == parsed::primitive_condition::type::LT && toplevel_ind == 0) ||
|
|
(cond._op == parsed::primitive_condition::type::GT && toplevel_ind == 1)) {
|
|
ck_bounds.push_back(query::clustering_range::make_ending_with(query::clustering_range::bound(ck, false)));
|
|
} else if ((cond._op == parsed::primitive_condition::type::GT && toplevel_ind == 0) ||
|
|
(cond._op == parsed::primitive_condition::type::LT && toplevel_ind == 1)) {
|
|
ck_bounds.push_back(query::clustering_range::make_starting_with(query::clustering_range::bound(ck, false)));
|
|
} else if ((cond._op == parsed::primitive_condition::type::LE && toplevel_ind == 0) ||
|
|
(cond._op == parsed::primitive_condition::type::GE && toplevel_ind == 1)) {
|
|
ck_bounds.push_back(query::clustering_range::make_ending_with(query::clustering_range::bound(ck)));
|
|
} else if ((cond._op == parsed::primitive_condition::type::GE && toplevel_ind == 0) ||
|
|
(cond._op == parsed::primitive_condition::type::LE && toplevel_ind == 1)) {
|
|
ck_bounds.push_back(query::clustering_range::make_starting_with(query::clustering_range::bound(ck)));
|
|
}
|
|
}
|
|
|
|
if (partition_ranges.empty()) {
|
|
throw api_error::validation(
|
|
format("KeyConditionExpression requires a condition on partition key {}", pk_cdef.name_as_text()));
|
|
}
|
|
if (ck_bounds.empty()) {
|
|
ck_bounds.push_back(query::clustering_range::make_open_ended_both_sides());
|
|
}
|
|
return {std::move(partition_ranges), std::move(ck_bounds)};
|
|
}
|
|
|
|
// Implements the Query operation: reads items of a single partition of a
// table or view, restricted by a key condition and an optional filter.
//
// The request must contain exactly one of "KeyConditions" or
// "KeyConditionExpression", from which the partition and clustering ranges
// are calculated. Optional request attributes handled here are
// "ExclusiveStartKey", "Limit" (must be greater than 0), and
// "ScanIndexForward" (defaults to true; false requests a reversed read).
//
// Validation failures are reported either as a ready future holding an
// api_error, or as a thrown api_error.
future<executor::request_return_type> executor::query(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request) {
    _stats.api_operations.query++;
    elogger.trace("Querying {}", request);

    auto [schema, table_type] = get_table_or_view(_proxy, request);

    tracing::add_table_name(trace_state, schema->ks_name(), schema->cf_name());

    rjson::value* exclusive_start_key = rjson::find(request, "ExclusiveStartKey");
    db::consistency_level cl = get_read_consistency(request);
    if (table_type == table_or_view_type::gsi && cl != db::consistency_level::LOCAL_ONE) {
        return make_ready_future<request_return_type>(api_error::validation(
                "Consistent reads are not allowed on global indexes (GSI)"));
    }
    // The limit handed to do_query() is 32 bits wide, but the value given in
    // the request is read as a 64-bit integer. Assigning it directly to a
    // uint32_t would silently truncate it modulo 2^32 (e.g., 2^32 would be
    // rejected as "0" and 2^32+1 would become a limit of 1), so validate the
    // 64-bit value and clamp it to the largest representable limit instead.
    rjson::value* limit_json = rjson::find(request, "Limit");
    uint32_t limit = std::numeric_limits<uint32_t>::max();
    if (limit_json) {
        const uint64_t requested_limit = limit_json->GetUint64();
        if (requested_limit == 0) {
            return make_ready_future<request_return_type>(api_error::validation("Limit must be greater than 0"));
        }
        if (requested_limit < limit) {
            limit = static_cast<uint32_t>(requested_limit);
        }
    }

    const bool forward = get_bool_attribute(request, "ScanIndexForward", true);

    rjson::value* key_conditions = rjson::find(request, "KeyConditions");
    rjson::value* key_condition_expression = rjson::find(request, "KeyConditionExpression");
    // These sets collect the ExpressionAttributeValues/Names references
    // used by the key condition, the filter and the projection, so that
    // verify_all_are_used() below can detect unused ones.
    std::unordered_set<std::string> used_attribute_values;
    std::unordered_set<std::string> used_attribute_names;
    if (key_conditions && key_condition_expression) {
        throw api_error::validation("Query does not allow both "
                "KeyConditions and KeyConditionExpression to be given together");
    } else if (!key_conditions && !key_condition_expression) {
        throw api_error::validation("Query must have one of "
                "KeyConditions or KeyConditionExpression");
    }
    // exactly one of key_conditions or key_condition_expression
    auto [partition_ranges, ck_bounds] = key_conditions
                ? calculate_bounds_conditions(schema, *key_conditions)
                : calculate_bounds_condition_expression(schema, *key_condition_expression,
                        rjson::find(request, "ExpressionAttributeValues"),
                        used_attribute_values,
                        rjson::find(request, "ExpressionAttributeNames"),
                        used_attribute_names);

    filter filter(request, filter::request_type::QUERY,
            used_attribute_names, used_attribute_values);

    // A query is not allowed to filter on the partition key or the sort key.
    for (const column_definition& cdef : schema->partition_key_columns()) { // just one
        if (filter.filters_on(cdef.name_as_text())) {
            return make_ready_future<request_return_type>(api_error::validation(
                    format("QueryFilter can only contain non-primary key attributes: Partition key attribute: {}", cdef.name_as_text())));
        }
    }
    for (const column_definition& cdef : schema->clustering_key_columns()) {
        if (filter.filters_on(cdef.name_as_text())) {
            return make_ready_future<request_return_type>(api_error::validation(
                    format("QueryFilter can only contain non-primary key attributes: Sort key attribute: {}", cdef.name_as_text())));
        }
        // FIXME: this "break" can avoid listing some clustering key columns
        // we added for GSIs just because they existed in the base table -
        // but not in all cases. We still have issue #5320.
        break;
    }

    select_type select = parse_select(request, table_type);

    auto attrs_to_get = calculate_attrs_to_get(request, used_attribute_names, select);
    verify_all_are_used(request, "ExpressionAttributeValues", used_attribute_values, "Query");
    verify_all_are_used(request, "ExpressionAttributeNames", used_attribute_names, "Query");
    query::partition_slice::option_set opts;
    opts.set_if<query::partition_slice::option::reversed>(!forward);
    return do_query(_proxy, schema, exclusive_start_key, std::move(partition_ranges), std::move(ck_bounds), std::move(attrs_to_get), limit, cl,
            std::move(filter), opts, client_state, _stats.cql_stats, std::move(trace_state), std::move(permit));
}
|
|
|
|
// Implements the ListTables operation.
//
// Lists the names of the tables (not views) whose keyspace name starts with
// KEYSPACE_NAME_PREFIX. The optional "Limit" request attribute (default 100,
// must be in the range 1..100) bounds the number of returned names, and the
// optional "ExclusiveStartTableName" makes the listing start right after the
// table of that name. When more tables remain after the returned ones, the
// last returned name is also reported as "LastEvaluatedTableName".
future<executor::request_return_type> executor::list_tables(client_state& client_state, service_permit permit, rjson::value request) {
    _stats.api_operations.list_tables++;
    elogger.trace("Listing tables {}", request);

    rjson::value* start_json = rjson::find(request, "ExclusiveStartTableName");
    rjson::value* limit_json = rjson::find(request, "Limit");
    std::string exclusive_start = start_json ? start_json->GetString() : "";
    int limit = limit_json ? limit_json->GetInt() : 100;
    if (limit < 1 || limit > 100) {
        return make_ready_future<request_return_type>(api_error::validation("Limit must be greater than 0 and no greater than 100"));
    }

    // table_names below is a lazy view rather than a container, so the
    // tables it iterates over must be kept alive in a local variable.
    auto tables = _proxy.data_dictionary().get_tables();
    auto table_names = tables
        | boost::adaptors::filtered([] (data_dictionary::table t) {
            auto s = t.schema();
            return s->ks_name().find(KEYSPACE_NAME_PREFIX) == 0 && !s->is_view();
        })
        | boost::adaptors::transformed([] (data_dictionary::table t) {
            return t.schema()->cf_name();
        });

    rjson::value response = rjson::empty_object();
    rjson::add(response, "TableNames", rjson::empty_array());
    rjson::value& names_out = response["TableNames"];

    //TODO(sarna): Dynamo doesn't declare any ordering when listing tables,
    // but our implementation is vulnerable to changes, because the tables
    // are stored in an unordered map. We may consider (partially) sorting
    // the results before returning them to the client, especially if there
    // is an implicit order of elements that Dynamo imposes.
    auto name_it = [&table_names, &exclusive_start] {
        if (exclusive_start.empty()) {
            return table_names.begin();
        }
        auto found = boost::find_if(table_names, [&exclusive_start] (const sstring& candidate) { return candidate == exclusive_start; });
        // Resume right after the start table; if it was not found, the
        // iterator stays at the end and nothing is listed.
        if (found != table_names.end()) {
            ++found;
        }
        return found;
    }();
    for (; limit > 0 && name_it != table_names.end(); --limit, ++name_it) {
        rjson::push_back(names_out, rjson::from_string(*name_it));
    }

    if (name_it != table_names.end()) {
        // The limit was reached with names still remaining, so at least one
        // name was returned (limit >= 1); report the last one so the client
        // can continue from it.
        auto& last_returned = *std::prev(names_out.End());
        rjson::add(response, "LastEvaluatedTableName", rjson::copy(last_returned));
    }

    return make_ready_future<executor::request_return_type>(make_jsonable(std::move(response)));
}
|
|
|
|
// Implements the DescribeEndpoints operation.
//
// Absent any configuration parameter saying otherwise, the client is told to
// come back to the very endpoint it used to reach us. The only way to know
// that endpoint is the request's "Host:" header, which typically exists (it
// is mandatory in HTTP 1.1) and holds both host name and port - exactly what
// needs to be returned. A request without this header is rejected with a
// ValidationException.
future<executor::request_return_type> executor::describe_endpoints(client_state& client_state, service_permit permit, rjson::value request, std::string host_header) {
    _stats.api_operations.describe_endpoints++;
    if (host_header.empty()) {
        return make_ready_future<request_return_type>(api_error::validation("DescribeEndpoints needs a 'Host:' header in request"));
    }
    // The single endpoint entry: the address the client used, together with
    // the number of minutes the client may cache it.
    rjson::value endpoint = rjson::empty_object();
    rjson::add(endpoint, "Address", rjson::from_string(host_header));
    rjson::add(endpoint, "CachePeriodInMinutes", rjson::value(1440));

    rjson::value endpoints = rjson::empty_array();
    rjson::push_back(endpoints, std::move(endpoint));

    rjson::value response = rjson::empty_object();
    rjson::add(response, "Endpoints", std::move(endpoints));
    return make_ready_future<executor::request_return_type>(make_jsonable(std::move(response)));
}
|
|
|
|
// Builds the replication options map for a keyspace: every datacenter that
// has at least one member which the gossiper reports as live is mapped to
// the replication factor "rf" (in its string form).
static std::map<sstring, sstring> get_network_topology_options(service::storage_proxy& sp, gms::gossiper& gossiper, int rf) {
    const sstring rf_str = std::to_string(rf);
    // Keep the token metadata pointer in a local for as long as the
    // topology reference obtained from it is in use.
    auto token_metadata = sp.get_token_metadata_ptr();
    auto& topology = token_metadata->get_topology();

    std::map<sstring, sstring> dc_to_rf;
    for (const gms::inet_address& live_node : gossiper.get_live_members()) {
        // emplace() leaves an existing entry alone, so a datacenter with
        // several live nodes is listed once.
        dc_to_rf.emplace(topology.get_datacenter(live_node), rf_str);
    }
    return dc_to_rf;
}
|
|
|
|
// Implements the DescribeContinuousBackups operation. The response always
// reports both continuous backups and point-in-time recovery as "DISABLED".
future<executor::request_return_type> executor::describe_continuous_backups(client_state& client_state, service_permit permit, rjson::value request) {
    _stats.api_operations.describe_continuous_backups++;
    // Most operations return ResourceNotFound when the given table doesn't
    // exist, but this operation returns a TableNotFoundException instead.
    // So the usual get_table() wrapper can't be used here, and the table is
    // looked up explicitly (only to verify that it exists):
    std::string table_name = get_table_name(request);
    schema_ptr schema;
    try {
        const sstring keyspace_name = sstring(executor::KEYSPACE_NAME_PREFIX) + table_name;
        schema = _proxy.data_dictionary().find_schema(keyspace_name, table_name);
    } catch (data_dictionary::no_such_column_family&) {
        throw api_error::table_not_found(
                format("Table {} not found", table_name));
    }

    rjson::value recovery = rjson::empty_object();
    rjson::add(recovery, "PointInTimeRecoveryStatus", "DISABLED");

    rjson::value description = rjson::empty_object();
    rjson::add(description, "ContinuousBackupsStatus", "DISABLED");
    rjson::add(description, "PointInTimeRecoveryDescription", std::move(recovery));

    rjson::value response = rjson::empty_object();
    rjson::add(response, "ContinuousBackupsDescription", std::move(description));
    co_return make_jsonable(std::move(response));
}
|
|
|
|
// Create the keyspace in which we put the alternator table, if it doesn't
|
|
// already exist.
|
|
// Currently, we automatically configure the keyspace based on the number
|
|
// of nodes in the cluster: A cluster with 3 or more live nodes, gets RF=3.
|
|
// A smaller cluster (presumably, a test only), gets RF=1. The user may
|
|
// manually create the keyspace to override this predefined behavior.
|
|
static future<std::vector<mutation>> create_keyspace(std::string_view keyspace_name, service::storage_proxy& sp, service::migration_manager& mm, gms::gossiper& gossiper, api::timestamp_type ts) {
|
|
sstring keyspace_name_str(keyspace_name);
|
|
int endpoint_count = gossiper.get_endpoint_states().size();
|
|
int rf = 3;
|
|
if (endpoint_count < rf) {
|
|
rf = 1;
|
|
elogger.warn("Creating keyspace '{}' for Alternator with unsafe RF={} because cluster only has {} nodes.",
|
|
keyspace_name_str, rf, endpoint_count);
|
|
}
|
|
auto opts = get_network_topology_options(sp, gossiper, rf);
|
|
auto ksm = keyspace_metadata::new_keyspace(keyspace_name_str, "org.apache.cassandra.locator.NetworkTopologyStrategy", std::move(opts), true);
|
|
|
|
co_return mm.prepare_new_keyspace_announcement(ksm, ts);
|
|
}
|
|
|
|
// Starts the executor. There is currently nothing to initialize, so an
// already-resolved future is returned: creating the keyspace
// (create_keyspace()) is delayed until a table is actually created.
future<> executor::start() {
    return seastar::make_ready_future<>();
}
|
|
|
|
}
|