When creating an alternator table with tablets, if it has an index, LSI or GSI, require the config option rf_rack_valid_keyspaces to be enabled. The option is required for materialized views in tablets keyspaces to function properly and avoid consistency issues that could happen due to cross-rack migrations and pairing switches when RF-rack validity is not enforced. Currently the option is validated when creating a materialized view via the CQL interface, but it's missing from the alternator interface. Since alternator indexes are based on materialized views, the same check should be added there as well. Fixes scylladb/scylladb#27612 Closes scylladb/scylladb#27622
6096 lines · 318 KiB · C++
/*
|
|
* Copyright 2019-present ScyllaDB
|
|
*/
|
|
|
|
/*
|
|
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
|
|
*/
|
|
|
|
#include <fmt/ranges.h>
|
|
#include <seastar/core/on_internal_error.hh>
|
|
#include "alternator/executor.hh"
|
|
#include "alternator/consumed_capacity.hh"
|
|
#include "auth/permission.hh"
|
|
#include "auth/resource.hh"
|
|
#include "cdc/log.hh"
|
|
#include "cdc/cdc_options.hh"
|
|
#include "auth/service.hh"
|
|
#include "db/config.hh"
|
|
#include "db/view/view_build_status.hh"
|
|
#include "mutation/tombstone.hh"
|
|
#include "locator/abstract_replication_strategy.hh"
|
|
#include "utils/log.hh"
|
|
#include "schema/schema_builder.hh"
|
|
#include "exceptions/exceptions.hh"
|
|
#include "service/client_state.hh"
|
|
#include "mutation/timestamp.hh"
|
|
#include "types/map.hh"
|
|
#include "schema/schema.hh"
|
|
#include "query/query-request.hh"
|
|
#include "query/query-result-reader.hh"
|
|
#include "cql3/selection/selection.hh"
|
|
#include "cql3/result_set.hh"
|
|
#include "bytes.hh"
|
|
#include "service/pager/query_pagers.hh"
|
|
#include <functional>
|
|
#include "error.hh"
|
|
#include "serialization.hh"
|
|
#include "expressions.hh"
|
|
#include "conditions.hh"
|
|
#include <optional>
|
|
#include "utils/assert.hh"
|
|
#include "utils/overloaded_functor.hh"
|
|
#include "mutation/collection_mutation.hh"
|
|
#include "schema/schema.hh"
|
|
#include "db/tags/extension.hh"
|
|
#include "db/tags/utils.hh"
|
|
#include "replica/database.hh"
|
|
#include "alternator/rmw_operation.hh"
|
|
#include <seastar/core/coroutine.hh>
|
|
#include <seastar/core/sleep.hh>
|
|
#include <seastar/core/loop.hh>
|
|
#include <seastar/coroutine/maybe_yield.hh>
|
|
#include <boost/range/algorithm/find_end.hpp>
|
|
#include <unordered_set>
|
|
#include "service/storage_proxy.hh"
|
|
#include "gms/feature_service.hh"
|
|
#include "gms/gossiper.hh"
|
|
#include "utils/error_injection.hh"
|
|
#include "db/schema_tables.hh"
|
|
#include "utils/rjson.hh"
|
|
#include "alternator/extract_from_attrs.hh"
|
|
#include "types/types.hh"
|
|
#include "db/system_keyspace.hh"
|
|
#include "cql3/statements/ks_prop_defs.hh"
|
|
|
|
using namespace std::chrono_literals;
|
|
|
|
logging::logger elogger("alternator-executor");
|
|
|
|
namespace alternator {
|
|
|
|
// Alternator-specific table properties stored as hidden table tags:
|
|
//
|
|
// Alternator doesn't keep its own records of which Alternator tables exist
|
|
// or how each one was configured. Instead, an Alternator table is created
|
|
// as a CQL table - and that CQL table stores its own name, schema, views,
|
|
// and so on. However, there are some Alternator-specific properties of a
|
|
// table which are not part of a CQL schema but which we need to remember.
|
|
// We store those extra properties as hidden "tags" on the CQL table, using
|
|
// the schema's "tags extension". This extension provides a map<string,string>
|
|
// for the table that is stored persistently on disk, but also readable
|
|
// quickly from memory.
|
|
// Alternator also uses the tags extension to store user-defined tags on
|
|
// tables (the TagResource, UntagResource and ListTagsOfResource requests).
|
|
// So the internal tags are kept hidden from the user by using the prefix
|
|
// "system:" in their name (see tag_key_is_internal()).
|
|
// The following is the list of these Alternator-specific hidden tags that
|
|
// Alternator adds to tables:
|
|
//
|
|
// Tags storing the "ReadCapacityUnits" and "WriteCapacityUnits"
|
|
// configured for a table with BillingMode=PROVISIONED.
|
|
const sstring RCU_TAG_KEY("system:provisioned_rcu");
|
|
const sstring WCU_TAG_KEY("system:provisioned_wcu");
|
|
// Tag storing the table's original creation time, in milliseconds since the
|
|
// Unix epoch. All tables get this tag when they are created, but it may be
|
|
// missing in old tables created before this tag was introduced.
|
|
const sstring TABLE_CREATION_TIME_TAG_KEY("system:table_creation_time");
|
|
// If this tag is present, it stores the name of the attribute that was
|
|
// configured by UpdateTimeToLive to be the expiration-time attribute for
|
|
// this table.
|
|
extern const sstring TTL_TAG_KEY("system:ttl_attribute");
|
|
// This will be set to 1 in a case, where user DID NOT specify a range key.
|
|
// The way GSI / LSI is implemented by Alternator assumes user specified keys will come first
|
|
// in materialized view's key list. Then, if needed missing keys are added (current implementation
|
|
// of materialized views requires that all base hash / range keys were added to the view as well).
|
|
// Alternator allows only a single range key attribute to be specified by the user. So if
|
|
// the SPURIOUS_RANGE_KEY_ADDED_TO_GSI_AND_USER_DIDNT_SPECIFY_RANGE_KEY_TAG_KEY is set the user didn't specify any key and
|
|
// base table's keys were added as range keys. In all other cases either the first key is the user specified key,
|
|
// following ones are base table's keys added as needed or range key list will be empty.
|
|
static const sstring SPURIOUS_RANGE_KEY_ADDED_TO_GSI_AND_USER_DIDNT_SPECIFY_RANGE_KEY_TAG_KEY("system:spurious_range_key_added_to_gsi_and_user_didnt_specify_range_key");
|
|
|
|
// The following tags also have the "system:" prefix but are NOT used
|
|
// by Alternator to store table properties - only the user ever writes to
|
|
// them, as a way to configure the table. As such, these tags are writable
|
|
// (and readable) by the user, and not hidden by tag_key_is_internal().
|
|
// The reason why both hidden (internal) and user-configurable tags share the
|
|
// same "system:" prefix is historic.
|
|
|
|
// Setting the tag with a numeric value will enable a specific initial number
|
|
// of tablets (setting the value to 0 means enabling tablets with
|
|
// an automatic selection of the best number of tablets).
|
|
// Setting this tag to any non-numeric value (e.g., an empty string or the
|
|
// word "none") will ask to disable tablets.
|
|
static constexpr auto INITIAL_TABLETS_TAG_KEY = "system:initial_tablets";
|
|
|
|
|
|
// Status of a table as reported in DynamoDB responses (the "TableStatus"
// field of DescribeTable and friends). Converted to DynamoDB's string
// representation by table_status_to_sstring() below.
enum class table_status {
    active = 0,
    creating,
    updating,
    deleting
};
|
|
|
|
// Convert a table_status value to the string DynamoDB uses for it.
// An unrecognized value yields "UNKNOWN".
static std::string_view table_status_to_sstring(table_status tbl_status) {
    if (tbl_status == table_status::active) {
        return "ACTIVE";
    }
    if (tbl_status == table_status::creating) {
        return "CREATING";
    }
    if (tbl_status == table_status::updating) {
        return "UPDATING";
    }
    if (tbl_status == table_status::deleting) {
        return "DELETING";
    }
    return "UNKNOWN";
}
|
|
|
|
static lw_shared_ptr<keyspace_metadata> create_keyspace_metadata(std::string_view keyspace_name, service::storage_proxy& sp, gms::gossiper& gossiper, api::timestamp_type,
|
|
const std::map<sstring, sstring>& tags_map, const gms::feature_service& feat, const db::tablets_mode_t::mode tablets_mode);
|
|
|
|
// The CQL type of the ":attrs" column holding an item's non-key attributes:
// a map<text, blob>. The instance is created once per shard and cached.
static map_type attrs_type() {
    static thread_local map_type cached = map_type_impl::get_instance(utf8_type, bytes_type, true);
    return cached;
}
|
|
|
|
// Return the definition of the ":attrs" column in the given schema.
// The column is expected to exist in every Alternator table, so its
// absence is an internal error (SCYLLA_ASSERT).
static const column_definition& attrs_column(const schema& schema) {
    const auto* attrs = schema.get_column_definition(bytes(executor::ATTRS_COLUMN_NAME));
    SCYLLA_ASSERT(attrs);
    return *attrs;
}
|
|
|
|
|
|
// Fetch - creating on first use - the per-table Alternator statistics
// object for the given schema. If the table was dropped while the current
// operation was in flight, a fresh throw-away stats object is returned
// instead; its metrics simply disappear together with the table.
static lw_shared_ptr<stats> get_stats_from_schema(service::storage_proxy& sp, const schema& schema) {
    try {
        auto& cf = sp.local_db().find_column_family(schema.id());
        auto& alternator_stats = cf.get_stats().alternator_stats;
        if (!alternator_stats) {
            alternator_stats = seastar::make_shared<table_stats>(schema.ks_name(), schema.cf_name());
        }
        return alternator_stats->_stats;
    } catch (std::runtime_error&) {
        // The table was deleted before the operation completed; returning a
        // temporary object is fine - nobody will ever look at these metrics.
        return make_lw_shared<stats>();
    }
}
|
|
|
|
// Build a streaming HTTP response body writer from a JSON value: the
// returned function prints the JSON directly to the connection's output
// stream, avoiding materializing the whole response as one string.
// Note the careful ordering: the stream is closed and the JSON value is
// destroyed (destroy_gently() - presumably frees large values incrementally
// to avoid reactor stalls) even if printing failed, and only then is the
// original exception propagated.
executor::body_writer make_streamed(rjson::value&& value) {
    return [value = std::move(value)](output_stream<char>&& _out) mutable -> future<> {
        auto out = std::move(_out);
        std::exception_ptr ex;
        try {
            co_await rjson::print(value, out);
        } catch (...) {
            // Remember the failure; cleanup below must still run.
            ex = std::current_exception();
        }
        co_await out.close();
        co_await rjson::destroy_gently(std::move(value));
        if (ex) {
            co_await coroutine::return_exception_ptr(std::move(ex));
        }
    };
}
|
|
|
|
// make_streamed_with_extra_array() is variant of make_streamed() above, which
|
|
// builds a streaming response (a function writing to an output stream) from a
|
|
// JSON object (rjson::value) but adds to it at the end an additional array.
|
|
// The extra array is given a separate chunked_vector to avoid putting it
|
|
// inside the rjson::value - because RapidJSON does contiguous allocations for
|
|
// arrays which we want to avoid for potentially long arrays in Query/Scan
|
|
// responses (see #23535).
|
|
// If we ever fix RapidJSON to avoid contiguous allocations for arrays, or
|
|
// replace it entirely (#24458), we can remove this function and the function
|
|
// rjson::print_with_extra_array() which it calls.
|
|
executor::body_writer make_streamed_with_extra_array(rjson::value&& value,
        std::string array_name, utils::chunked_vector<rjson::value>&& array) {
    return [value = std::move(value), array_name = std::move(array_name), array = std::move(array)](output_stream<char>&& _out) mutable -> future<> {
        auto out = std::move(_out);
        std::exception_ptr ex;
        try {
            // Print "value" with an extra member named array_name whose
            // elements come from the separate chunked_vector "array".
            co_await rjson::print_with_extra_array(value, array_name, array, out);
        } catch (...) {
            // Remember the failure; the stream still has to be closed and
            // the JSON value freed before rethrowing.
            ex = std::current_exception();
        }
        co_await out.close();
        co_await rjson::destroy_gently(std::move(value));
        // TODO: can/should we also destroy the array gently?
        if (ex) {
            co_await coroutine::return_exception_ptr(std::move(ex));
        }
    };
}
|
|
|
|
// This function throws api_error::validation if input value is not an object.
|
|
static void validate_is_object(const rjson::value& value, const char* caller) {
|
|
if (!value.IsObject()) {
|
|
throw api_error::validation(fmt::format("{} must be an object", caller));
|
|
}
|
|
}
|
|
|
|
// This function assumes the given value is an object and returns requested member value.
|
|
// If it is not possible an api_error::validation is thrown.
|
|
static const rjson::value& get_member(const rjson::value& obj, const char* member_name, const char* caller) {
|
|
validate_is_object(obj, caller);
|
|
const rjson::value* ret = rjson::find(obj, member_name);
|
|
if (!ret) {
|
|
throw api_error::validation(fmt::format("{} is missing a mandatory member {}", caller, member_name));
|
|
}
|
|
return *ret;
|
|
}
|
|
|
|
|
|
// This function assumes the given value is an object with a single member, and returns this member.
|
|
// In case the requirements are not met an api_error::validation is thrown.
|
|
static const rjson::value::Member& get_single_member(const rjson::value& v, const char* caller) {
|
|
if (!v.IsObject() || v.MemberCount() != 1) {
|
|
throw api_error::validation(format("{}: expected an object with a single member.", caller));
|
|
}
|
|
return *(v.MemberBegin());
|
|
}
|
|
|
|
// Construct the Alternator request executor. The executor keeps references
// to the services it drives (gossiper, storage proxy, migration manager,
// distributed keyspace, CDC metadata), reads its authorization knobs from
// the config, and sets up the per-shard parsed-expression cache and metrics.
executor::executor(gms::gossiper& gossiper,
        service::storage_proxy& proxy,
        service::migration_manager& mm,
        db::system_distributed_keyspace& sdks,
        cdc::metadata& cdc_metadata,
        smp_service_group ssg,
        utils::updateable_value<uint32_t> default_timeout_in_ms)
    : _gossiper(gossiper),
      _proxy(proxy),
      _mm(mm),
      _sdks(sdks),
      _cdc_metadata(cdc_metadata),
      _enforce_authorization(_proxy.data_dictionary().get_config().alternator_enforce_authorization),
      _warn_authorization(_proxy.data_dictionary().get_config().alternator_warn_authorization),
      _ssg(ssg),
      _parsed_expression_cache(std::make_unique<parsed::expression_cache>(
        parsed::expression_cache::config{_proxy.data_dictionary().get_config().alternator_max_expression_cache_entries_per_shard},
        _stats))
{
    // Save the live-updateable default-timeout config value (the s_ prefix
    // suggests a static member shared by all executors - confirm in header).
    s_default_timeout_in_ms = std::move(default_timeout_in_ms);
    register_metrics(_metrics, _stats);
}
|
|
|
|
// Defaulted out of line - presumably so that members whose types are
// incomplete in the header (e.g. the parsed::expression_cache unique_ptr)
// can be destroyed here; confirm against executor.hh.
executor::~executor() = default;
|
|
|
|
static void set_table_creation_time(std::map<sstring, sstring>& tags_map, db_clock::time_point creation_time) {
|
|
auto tm = std::chrono::duration_cast<std::chrono::milliseconds>(creation_time.time_since_epoch()).count();
|
|
tags_map[TABLE_CREATION_TIME_TAG_KEY] = std::to_string(tm);
|
|
}
|
|
|
|
static double get_table_creation_time(const schema &schema) {
|
|
auto time = db::find_tag(schema, TABLE_CREATION_TIME_TAG_KEY);
|
|
if (time) {
|
|
try {
|
|
auto val = std::stoll(*time);
|
|
if (val > 0) {
|
|
return val / 1000.0;
|
|
}
|
|
}
|
|
catch(...) {}
|
|
}
|
|
return 0.0;
|
|
}
|
|
|
|
// Add to a table description (a JSON object being built for DescribeTable
// and similar requests) the fields common to all Alternator tables:
// creation time, status (hard-coded ACTIVE for base tables), table id, and
// stream (CDC) information.
void executor::supplement_table_info(rjson::value& descr, const schema& schema, service::storage_proxy& sp) {
    auto creation_time = get_table_creation_time(schema);

    rjson::add(descr, "CreationDateTime", rjson::value(creation_time));
    rjson::add(descr, "TableStatus", "ACTIVE");
    rjson::add(descr, "TableId", rjson::from_string(schema.id().to_sstring()));

    executor::supplement_table_stream_info(descr, schema, sp);
}
|
|
|
|
// We would have liked to support table names up to 255 bytes, like DynamoDB.
|
|
// But Scylla creates a directory whose name is the table's name plus 33
|
|
// bytes (dash and UUID), and since directory names are limited to 255 bytes,
|
|
// we need to limit table names to 222 bytes, instead of 255.
|
|
// See https://github.com/scylladb/scylla/issues/4480
|
|
// We actually have two limits here,
|
|
// * max_table_name_length is the limit that Alternator will impose on names
|
|
// of new Alternator tables.
|
|
// * max_auxiliary_table_name_length is the potentially higher absolute limit
|
|
// that Scylla imposes on the names of auxiliary tables that Alternator
|
|
// wants to create internally - i.e. materialized views or CDC log tables.
|
|
// The second limit might mean that it is not possible to add a GSI to an
|
|
// existing table, because the name of the new auxiliary table may go over
|
|
// the limit. The second limit is also one of the reasons why the first limit
|
|
// is set lower than 222 - to have room to enable streams which add the extra
|
|
// suffix "_scylla_cdc_log" to the table name.
|
|
static constexpr int max_table_name_length = 192;
|
|
static constexpr int max_auxiliary_table_name_length = 222;
|
|
|
|
// Check that every character of "name" is legal in a DynamoDB table name:
// a-z, A-Z, 0-9, underscore, dash or dot. An empty name trivially passes;
// length limits are enforced separately by validate_table_name().
static bool valid_table_name_chars(std::string_view name) {
    auto allowed = [](char c) {
        return (c >= 'a' && c <= 'z')
            || (c >= 'A' && c <= 'Z')
            || (c >= '0' && c <= '9')
            || c == '_' || c == '-' || c == '.';
    };
    for (char c : name) {
        if (!allowed(c)) {
            return false;
        }
    }
    return true;
}
|
|
|
|
// validate_table_name() validates the TableName parameter in a request - it
|
|
// should only be called in CreateTable or when a request looking for an
|
|
// existing table failed to find it. validate_table_name() throws the
|
|
// appropriate api_error if this validation fails.
|
|
// The DynamoDB developer guide, https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/HowItWorks.NamingRulesDataTypes.html#HowItWorks.NamingRules
|
|
// specifies that table "names must be between 3 and 255 characters long and
|
|
// can contain only the following characters: a-z, A-Z, 0-9, _ (underscore),
|
|
// - (dash), . (dot)". However, Alternator only allows max_table_name_length
|
|
// characters (see above) - not 255.
|
|
static void validate_table_name(std::string_view name) {
|
|
if (name.length() < 3 || name.length() > max_table_name_length) {
|
|
throw api_error::validation(
|
|
format("TableName must be at least 3 characters long and at most {} characters long", max_table_name_length));
|
|
}
|
|
if (!valid_table_name_chars(name)) {
|
|
throw api_error::validation(
|
|
"TableName must satisfy regular expression pattern: [a-zA-Z0-9_.-]+");
|
|
}
|
|
}
|
|
|
|
// Validate that a CDC log table could be created for the base table with a
|
|
// given table_name, and if not, throw a user-visible api_error::validation.
|
|
// It is not possible to create a CDC log table if the table name is so long
|
|
// that adding the 15-character suffix "_scylla_cdc_log" (cdc_log_suffix)
|
|
// makes it go over max_auxiliary_table_name_length.
|
|
// Note that if max_table_name_length is set to less than 207 (which is
|
|
// max_auxiliary_table_name_length-15), then this function will never
|
|
// fail. However, it's still important to call it in UpdateTable, in case
|
|
// we have pre-existing tables with names longer than this to avoid #24598.
|
|
static void validate_cdc_log_name_length(std::string_view table_name) {
|
|
if (cdc::log_name(table_name).length() > max_auxiliary_table_name_length) {
|
|
// CDC will add cdc_log_suffix ("_scylla_cdc_log") to the table name
|
|
// to create its log table, and this will exceed the maximum allowed
|
|
// length. To provide a more helpful error message, we assume that
|
|
// cdc::log_name() always adds a suffix of the same length.
|
|
int suffix_len = cdc::log_name(table_name).length() - table_name.length();
|
|
throw api_error::validation(fmt::format("Streams cannot be enabled to a table whose name is longer than {} characters: {}",
|
|
max_auxiliary_table_name_length - suffix_len, table_name));
|
|
}
|
|
}
|
|
|
|
// In DynamoDB index names are local to a table, while in Scylla, materialized
|
|
// view names are global (in a keyspace). So we need to compose a unique name
|
|
// for the view taking into account both the table's name and the index name.
|
|
// We concatenate the table and index name separated by a delim character
|
|
// (a character not allowed by DynamoDB in ordinary table names, default: ":").
|
|
// The downside of this approach is that it limits the sum of the lengths,
|
|
// instead of each component individually as DynamoDB does.
|
|
// The view_name() function assumes the table_name has already been validated
|
|
// but validates the legality of index_name and the combination of both.
|
|
static std::string view_name(std::string_view table_name, std::string_view index_name, const std::string& delim = ":", bool validate_len = true) {
|
|
if (index_name.length() < 3) {
|
|
throw api_error::validation("IndexName must be at least 3 characters long");
|
|
}
|
|
if (!valid_table_name_chars(index_name)) {
|
|
throw api_error::validation(
|
|
fmt::format("IndexName '{}' must satisfy regular expression pattern: [a-zA-Z0-9_.-]+", index_name));
|
|
}
|
|
std::string ret = std::string(table_name) + delim + std::string(index_name);
|
|
if (ret.length() > max_auxiliary_table_name_length && validate_len) {
|
|
throw api_error::validation(
|
|
fmt::format("The total length of TableName ('{}') and IndexName ('{}') cannot exceed {} characters",
|
|
table_name, index_name, max_auxiliary_table_name_length - delim.size()));
|
|
}
|
|
return ret;
|
|
}
|
|
|
|
// Compose the materialized-view name backing an LSI. Same scheme as
// view_name() for a GSI, but with the "!:" delimiter so an LSI and a GSI
// with the same index name map to different view names.
static std::string lsi_name(std::string_view table_name, std::string_view index_name, bool validate_len = true) {
    return view_name(table_name, index_name, "!:", validate_len);
}
|
|
|
|
/** Extract table name from a request.
|
|
* Most requests expect the table's name to be listed in a "TableName" field.
|
|
* This convenience function returns the name or api_error in case the
|
|
* table name is missing or not a string.
|
|
*/
|
|
static std::optional<std::string> find_table_name(const rjson::value& request) {
|
|
const rjson::value* table_name_value = rjson::find(request, "TableName");
|
|
if (!table_name_value) {
|
|
return std::nullopt;
|
|
}
|
|
if (!table_name_value->IsString()) {
|
|
throw api_error::validation("Non-string TableName field in request");
|
|
}
|
|
std::string table_name = rjson::to_string(*table_name_value);
|
|
return table_name;
|
|
}
|
|
|
|
// Like find_table_name(), but a missing "TableName" field is an error:
// throws api_error::validation instead of returning nullopt.
static std::string get_table_name(const rjson::value& request) {
    if (auto name = find_table_name(request)) {
        return *name;
    }
    throw api_error::validation("Missing TableName field in request");
}
|
|
|
|
/** Extract table schema from a request.
|
|
* Many requests expect the table's name to be listed in a "TableName" field
|
|
* and need to look it up as an existing table. This convenience function
|
|
* does this, with the appropriate validation and api_error in case the table
|
|
* name is missing, invalid or the table doesn't exist. If everything is
|
|
* successful, it returns the table's schema.
|
|
*/
|
|
schema_ptr executor::find_table(service::storage_proxy& proxy, const rjson::value& request) {
|
|
auto table_name = find_table_name(request);
|
|
if (!table_name) {
|
|
return nullptr;
|
|
}
|
|
return find_table(proxy, *table_name);
|
|
}
|
|
|
|
// Look up an Alternator table by name. Each Alternator table lives in its
// own keyspace named KEYSPACE_NAME_PREFIX + the table name. If the table
// does not exist, throw: a validation error when the name itself is
// invalid (matching DynamoDB behavior), resource_not_found otherwise.
schema_ptr executor::find_table(service::storage_proxy& proxy, std::string_view table_name) {
    try {
        return proxy.data_dictionary().find_schema(sstring(executor::KEYSPACE_NAME_PREFIX) + sstring(table_name), table_name);
    } catch(data_dictionary::no_such_column_family&) {
        // DynamoDB returns validation error even when table does not exist
        // and the table name is invalid.
        validate_table_name(table_name);

        throw api_error::resource_not_found(
            fmt::format("Requested resource not found: Table: {} not found", table_name));
    }
}
|
|
|
|
// Like executor::find_table() taking a request, but never returns null:
// when the "TableName" field is missing altogether, throw the same
// validation error that get_table_name() produces.
schema_ptr get_table(service::storage_proxy& proxy, const rjson::value& request) {
    auto schema = executor::find_table(proxy, request);
    if (schema) {
        return schema;
    }
    // find_table() only returns null when the name is missing - syntax
    // errors and missing tables throw. Slow path: let get_table_name()
    // generate the proper exception.
    get_table_name(request);
    return schema;
}
|
|
|
|
// try_get_internal_table() handles the special case that the given table_name
|
|
// begins with INTERNAL_TABLE_PREFIX (".scylla.alternator."). In that case,
|
|
// this function assumes that the rest of the name refers to an internal
|
|
// Scylla table (e.g., system table) and returns the schema of that table -
|
|
// or an exception if it doesn't exist. Otherwise, if table_name does not
|
|
// start with INTERNAL_TABLE_PREFIX, this function returns an empty schema_ptr
|
|
// and the caller should look for a normal Alternator table with that name.
|
|
static schema_ptr try_get_internal_table(data_dictionary::database db, std::string_view table_name) {
    // The prefix must be at position 0; anywhere else (or absent) means
    // this is an ordinary Alternator table name - return null so the
    // caller looks it up normally.
    size_t it = table_name.find(executor::INTERNAL_TABLE_PREFIX);
    if (it != 0) {
        return schema_ptr{};
    }
    table_name.remove_prefix(executor::INTERNAL_TABLE_PREFIX.size());
    // After the prefix the name must look like "<keyspace>.<table>".
    size_t delim = table_name.find_first_of('.');
    if (delim == std::string_view::npos) {
        return schema_ptr{};
    }
    std::string_view ks_name = table_name.substr(0, delim);
    // From here on, table_name holds only the part after the dot.
    table_name.remove_prefix(ks_name.size() + 1);
    // Only internal keyspaces can be accessed to avoid leakage
    auto ks = db.try_find_keyspace(ks_name);
    if (!ks || !ks->is_internal()) {
        return schema_ptr{};
    }
    try {
        return db.find_schema(ks_name, table_name);
    } catch (data_dictionary::no_such_column_family&) {
        // DynamoDB returns validation error even when table does not exist
        // and the table name is invalid.
        validate_table_name(table_name);
        throw api_error::resource_not_found(
            fmt::format("Requested resource not found: Internal table: {}.{} not found", ks_name, table_name));
    }
}
|
|
|
|
// get_table_or_view() is similar to to get_table(), except it returns either
|
|
// a table or a materialized view from which to read, based on the TableName
|
|
// and optional IndexName in the request. Only requests like Query and Scan
|
|
// which allow IndexName should use this function.
|
|
// Whether a Query/Scan target resolved to the base table itself, an LSI
// view, or a GSI view.
enum class table_or_view_type { base, lsi, gsi };
|
|
static std::pair<schema_ptr, table_or_view_type>
get_table_or_view(service::storage_proxy& proxy, const rjson::value& request) {
    table_or_view_type type = table_or_view_type::base;
    std::string table_name = get_table_name(request);

    // Names starting with INTERNAL_TABLE_PREFIX refer to internal Scylla
    // tables, which can only be read - not combined with an IndexName.
    if (schema_ptr s = try_get_internal_table(proxy.data_dictionary(), table_name)) {
        return {s, type};
    }

    std::string keyspace_name = executor::KEYSPACE_NAME_PREFIX + table_name;
    const rjson::value* index_name = rjson::find(request, "IndexName");
    std::string orig_table_name;
    if (index_name) {
        if (index_name->IsString()) {
            // An index is read through its backing materialized view; first
            // try the GSI naming scheme (see view_name()). Keep the original
            // table name around for error reporting and the LSI fallback.
            orig_table_name = std::move(table_name);
            table_name = view_name(orig_table_name, rjson::to_string_view(*index_name));
            type = table_or_view_type::gsi;
        } else {
            throw api_error::validation(
                    fmt::format("Non-string IndexName '{}'", rjson::to_string_view(*index_name)));
        }
        // If no tables for global indexes were found, the index may be local
        if (!proxy.data_dictionary().has_schema(keyspace_name, table_name)) {
            type = table_or_view_type::lsi;
            table_name = lsi_name(orig_table_name, rjson::to_string_view(*index_name));
        }
    }

    try {
        return { proxy.data_dictionary().find_schema(keyspace_name, table_name), type };
    } catch(data_dictionary::no_such_column_family&) {
        if (index_name) {
            // DynamoDB returns a different error depending on whether the
            // base table doesn't exist (ResourceNotFoundException) or it
            // does exist but the index does not (ValidationException).
            if (proxy.data_dictionary().has_schema(keyspace_name, orig_table_name)) {
                throw api_error::validation(
                        fmt::format("Requested resource not found: Index '{}' for table '{}'", rjson::to_string_view(*index_name), orig_table_name));
            } else {
                throw api_error::resource_not_found(
                        fmt::format("Requested resource not found: Table: {} not found", orig_table_name));
            }
        } else {
            throw api_error::resource_not_found(
                    fmt::format("Requested resource not found: Table: {} not found", table_name));
        }
    }
}
|
|
|
|
// get_table_for_write() is similar to get_table(), but additionally, if the
|
|
// configuration allows this, may also allow writing to system table with
|
|
// prefix INTERNAL_TABLE_PREFIX. This is analogous to the function
|
|
// get_table_or_view() above which allows *reading* internal tables.
|
|
static schema_ptr get_table_for_write(service::storage_proxy& proxy, const rjson::value& request) {
|
|
std::string table_name = get_table_name(request);
|
|
if (schema_ptr s = try_get_internal_table(proxy.data_dictionary(), table_name)) {
|
|
if (!proxy.data_dictionary().get_config().alternator_allow_system_table_write()) {
|
|
throw api_error::resource_not_found(fmt::format(
|
|
"Table {} is an internal table, and writing to it is forbidden"
|
|
" by the alternator_allow_system_table_write configuration",
|
|
table_name));
|
|
}
|
|
return s;
|
|
}
|
|
return executor::find_table(proxy, table_name);
|
|
}
|
|
|
|
// Convenience function for getting the value of a string attribute, or a
|
|
// default value if it is missing. If the attribute exists, but is not a
|
|
// string, a descriptive api_error is thrown.
|
|
static std::string get_string_attribute(const rjson::value& value, std::string_view attribute_name, const char* default_return) {
|
|
const rjson::value* attribute_value = rjson::find(value, attribute_name);
|
|
if (!attribute_value)
|
|
return default_return;
|
|
if (!attribute_value->IsString()) {
|
|
throw api_error::validation(fmt::format("Expected string value for attribute {}, got: {}",
|
|
attribute_name, value));
|
|
}
|
|
return rjson::to_string(*attribute_value);
|
|
}
|
|
|
|
// Convenience function for getting the value of a boolean attribute, or a
|
|
// default value if it is missing. If the attribute exists, but is not a
|
|
// bool, a descriptive api_error is thrown.
|
|
static bool get_bool_attribute(const rjson::value& value, std::string_view attribute_name, bool default_return) {
|
|
const rjson::value* attribute_value = rjson::find(value, attribute_name);
|
|
if (!attribute_value) {
|
|
return default_return;
|
|
}
|
|
if (!attribute_value->IsBool()) {
|
|
throw api_error::validation(fmt::format("Expected boolean value for attribute {}, got: {}",
|
|
attribute_name, value));
|
|
}
|
|
return attribute_value->GetBool();
|
|
}
|
|
|
|
// Convenience function for getting the value of an integer attribute, or
|
|
// an empty optional if it is missing. If the attribute exists, but is not
|
|
// an integer, a descriptive api_error is thrown.
|
|
static std::optional<int> get_int_attribute(const rjson::value& value, std::string_view attribute_name) {
|
|
const rjson::value* attribute_value = rjson::find(value, attribute_name);
|
|
if (!attribute_value)
|
|
return {};
|
|
if (!attribute_value->IsInt()) {
|
|
throw api_error::validation(fmt::format("Expected integer value for attribute {}, got: {}",
|
|
attribute_name, value));
|
|
}
|
|
return attribute_value->GetInt();
|
|
}
|
|
|
|
// Sets a KeySchema object inside the given JSON parent describing the key
|
|
// attributes of the the given schema as being either HASH or RANGE keys.
|
|
// Additionally, adds to a given map mappings between the key attribute
|
|
// names and their type (as a DynamoDB type string).
|
|
void executor::describe_key_schema(rjson::value& parent, const schema& schema, std::unordered_map<std::string,std::string>* attribute_types, const std::map<sstring, sstring> *tags) {
    rjson::value key_schema = rjson::empty_array();
    // If this tag is set, the view's range keys were only added to satisfy
    // the materialized-view implementation (all base keys must appear in
    // the view) - the user asked for no range key, so none is reported.
    const bool ignore_range_keys_as_spurious = tags != nullptr && tags->contains(SPURIOUS_RANGE_KEY_ADDED_TO_GSI_AND_USER_DIDNT_SPECIFY_RANGE_KEY_TAG_KEY);

    // Every partition-key column is reported as a HASH key.
    for (const column_definition& cdef : schema.partition_key_columns()) {
        rjson::value key = rjson::empty_object();
        rjson::add(key, "AttributeName", rjson::from_string(cdef.name_as_text()));
        rjson::add(key, "KeyType", "HASH");
        rjson::push_back(key_schema, std::move(key));
        if (attribute_types) {
            (*attribute_types)[cdef.name_as_text()] = type_to_string(cdef.type);
        }
    }
    if (!ignore_range_keys_as_spurious) {
        // NOTE: user requested key (there can be at most one) will always come first.
        // There might be more keys following it, which were added, but those were
        // not requested by the user, so we ignore them.
        for (const column_definition& cdef : schema.clustering_key_columns()) {
            rjson::value key = rjson::empty_object();
            rjson::add(key, "AttributeName", rjson::from_string(cdef.name_as_text()));
            rjson::add(key, "KeyType", "RANGE");
            rjson::push_back(key_schema, std::move(key));
            if (attribute_types) {
                (*attribute_types)[cdef.name_as_text()] = type_to_string(cdef.type);
            }
            // Only the first clustering column is reported (see NOTE above).
            break;
        }
    }
    rjson::add(parent, "KeySchema", std::move(key_schema));

}
|
|
|
|
// Convenience overload: same as above, but takes the attribute-type map by
// reference for callers that always want the types filled in.
void executor::describe_key_schema(rjson::value& parent, const schema& schema, std::unordered_map<std::string,std::string>& attribute_types, const std::map<sstring, sstring> *tags) {
    describe_key_schema(parent, schema, &attribute_types, tags);
}
|
|
|
|
// Build the synthetic ARN Alternator reports for a table. There is no real
// AWS region or account here, so "scylla" fills both fields.
static rjson::value generate_arn_for_table(const schema& schema) {
    // fmt::format for consistency with generate_arn_for_index() below
    // (this was the only ARN helper using unqualified format()).
    return rjson::from_string(fmt::format("arn:scylla:alternator:{}:scylla:table/{}", schema.ks_name(), schema.cf_name()));
}
|
|
|
|
// Build the ARN string for a secondary index of the given base table - the
// same format as generate_arn_for_table() plus an "/index/<name>" suffix.
static rjson::value generate_arn_for_index(const schema& schema, std::string_view index_name) {
    auto arn = fmt::format(
        "arn:scylla:alternator:{}:scylla:table/{}/index/{}",
        schema.ks_name(), schema.cf_name(), index_name);
    return rjson::from_string(arn);
}
|
|
|
|
// The following function checks if a given view has finished building.
|
|
// We need this for describe_table() to know if a view is still backfilling,
|
|
// or active.
|
|
//
|
|
// Currently we don't have in view_ptr the knowledge whether a view finished
|
|
// building long ago - so checking this involves a somewhat inefficient, but
|
|
// still node-local, process:
|
|
// We need a table that can accurately tell that all nodes have finished
|
|
// building this view. system.built_views is not good enough because it only
|
|
// knows the view building status in the current node. In recent versions,
|
|
// after PR #19745, we have a local table system.view_build_status_v2 with
|
|
// global information, replacing the old system_distributed.view_build_status.
|
|
// In theory, there can be a period during upgrading an old cluster when this
|
|
// table is not yet available. However, since the IndexStatus is a new feature
|
|
// too, it is acceptable that it doesn't yet work in the middle of the update.
|
|
// Check whether the given materialized view (backing a GSI) has finished
// building on every node currently in the topology. Reads the per-host
// build-status rows from system.view_build_status_v2 with LOCAL_ONE
// consistency, then cross-checks them against the current topology.
static future<bool> is_view_built(
        view_ptr view,
        service::storage_proxy& proxy,
        service::client_state& client_state,
        tracing::trace_state_ptr trace_state,
        service_permit permit) {
    auto schema = proxy.data_dictionary().find_table(
        "system", db::system_keyspace::VIEW_BUILD_STATUS_V2).schema();
    // The table system.view_build_status_v2 has "keyspace_name" and
    // "view_name" as the partition key, and each clustering row has
    // "host_id" as clustering key and a string "status". We need to
    // read a single partition:
    partition_key pk = partition_key::from_exploded(*schema,
        {utf8_type->decompose(view->ks_name()),
         utf8_type->decompose(view->cf_name())});
    dht::partition_range_vector partition_ranges{
        dht::partition_range(dht::decorate_key(*schema, pk))};
    auto selection = cql3::selection::selection::wildcard(schema); // only for get_query_options()!
    // Read the full clustering range of the partition, fetching only the
    // "status" regular column (host_id comes back as the clustering key).
    auto partition_slice = query::partition_slice(
        {query::clustering_range::make_open_ended_both_sides()},
        {}, // static columns
        {schema->get_column_definition("status")->id}, // regular columns
        selection->get_query_options());
    auto command = ::make_lw_shared<query::read_command>(
        schema->id(), schema->version(), partition_slice,
        proxy.get_max_result_size(partition_slice),
        query::tombstone_limit(proxy.get_tombstone_limit()));
    service::storage_proxy::coordinator_query_result qr =
        co_await proxy.query(
            schema, std::move(command), std::move(partition_ranges),
            db::consistency_level::LOCAL_ONE,
            service::storage_proxy::coordinator_query_options(
                executor::default_timeout(), std::move(permit), client_state, trace_state));
    query::result_set rs = query::result_set::from_raw_result(
        schema, partition_slice, *qr.query_result);
    // Collect the reported status per host; rows with a missing host_id or
    // status are silently skipped.
    std::unordered_map<locator::host_id, sstring> statuses;
    for (auto&& r : rs.rows()) {
        auto host_id = r.get<utils::UUID>("host_id");
        auto status = r.get<sstring>("status");
        if (host_id && status) {
            statuses.emplace(locator::host_id(*host_id), *status);
        }
    }
    // A view is considered "built" if all nodes reported SUCCESS in having
    // built this view. Note that we need this "SUCCESS" for all nodes in the
    // cluster - even those that are temporarily down (their success is known
    // by this node, even if they are down). Conversely, we don't care what is
    // the recorded status for any node which is no longer in the cluster - it
    // is possible we forgot to erase the status of nodes that left the
    // cluster, but here we just ignore them and look at the nodes actually
    // in the topology.
    bool all_built = true;
    auto token_metadata = proxy.get_token_metadata_ptr();
    token_metadata->get_topology().for_each_node(
        [&] (const locator::node& node) {
            // Note: we could skip nodes in DCs which have no replication of
            // this view. However, in practice even those nodes would run
            // the view building (and just see empty content) so we don't
            // need to bother with this skipping.
            auto it = statuses.find(node.host_id());
            if (it == statuses.end() || it->second != "SUCCESS") {
                all_built = false;
            }
        });
    co_return all_built;
}
|
|
|
|
// Build the DynamoDB-style "TableDescription" JSON object for the given
// table: name, status, ARN, id, billing/provisioning info, key schema,
// secondary indexes (LSIs and GSIs, derived from the table's materialized
// views), attribute definitions and stream info. tbl_status is reported
// verbatim as "TableStatus"; for a table being deleted most fields are
// omitted. Used by both DescribeTable and DeleteTable responses.
static future<rjson::value> fill_table_description(schema_ptr schema, table_status tbl_status, service::storage_proxy& proxy, service::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit)
{
    rjson::value table_description = rjson::empty_object();
    auto tags_ptr = db::get_tags_of_table(schema);

    rjson::add(table_description, "TableName", rjson::from_string(schema->cf_name()));

    auto creation_timestamp = get_table_creation_time(*schema);

    // FIXME: In DynamoDB the CreateTable implementation is asynchronous, and
    // the table may be in "Creating" state until creating is finished.
    // We don't currently do this in Alternator - instead CreateTable waits
    // until the table is really available. So DescribeTable returns either
    // ACTIVE or doesn't exist at all (and DescribeTable returns an error).
    // The states CREATING and UPDATING are not currently returned.
    rjson::add(table_description, "TableStatus", rjson::from_string(table_status_to_sstring(tbl_status)));
    rjson::add(table_description, "TableArn", generate_arn_for_table(*schema));
    rjson::add(table_description, "TableId", rjson::from_string(schema->id().to_sstring()));
    rjson::add(table_description, "BillingModeSummary", rjson::empty_object());
    rjson::add(table_description["BillingModeSummary"], "LastUpdateToPayPerRequestDateTime", rjson::value(creation_timestamp));
    // In PAY_PER_REQUEST billing mode, provisioned capacity should return 0
    int rcu = 0;
    int wcu = 0;
    bool is_pay_per_request = true;

    // A table is considered PROVISIONED only if both the RCU and WCU tags
    // are present and parse as integers; any parse failure falls back to
    // PAY_PER_REQUEST with zero capacity.
    if (tags_ptr) {
        auto rcu_tag = tags_ptr->find(RCU_TAG_KEY);
        auto wcu_tag = tags_ptr->find(WCU_TAG_KEY);
        if (rcu_tag != tags_ptr->end() && wcu_tag != tags_ptr->end()) {
            try {
                rcu = std::stoi(rcu_tag->second);
                wcu = std::stoi(wcu_tag->second);
                is_pay_per_request = false;
            } catch (...) {
                rcu = 0;
                wcu = 0;
            }
        }
    }
    if (is_pay_per_request) {
        rjson::add(table_description["BillingModeSummary"], "BillingMode", "PAY_PER_REQUEST");
    } else {
        rjson::add(table_description["BillingModeSummary"], "BillingMode", "PROVISIONED");
    }
    rjson::add(table_description, "ProvisionedThroughput", rjson::empty_object());
    rjson::add(table_description["ProvisionedThroughput"], "ReadCapacityUnits", rcu);
    rjson::add(table_description["ProvisionedThroughput"], "WriteCapacityUnits", wcu);
    rjson::add(table_description["ProvisionedThroughput"], "NumberOfDecreasesToday", 0);

    data_dictionary::table t = proxy.data_dictionary().find_column_family(schema);

    // A table in "deleting" state omits creation time, key schema, indexes
    // and attribute definitions - only the summary fields above are kept.
    if (tbl_status != table_status::deleting) {
        rjson::add(table_description, "CreationDateTime", rjson::value(creation_timestamp));
        std::unordered_map<std::string,std::string> key_attribute_types;
        // Add base table's KeySchema and collect types for AttributeDefinitions:
        executor::describe_key_schema(table_description, *schema, key_attribute_types, tags_ptr);
        if (!t.views().empty()) {
            rjson::value gsi_array = rjson::empty_array();
            rjson::value lsi_array = rjson::empty_array();
            for (const view_ptr& vptr : t.views()) {
                rjson::value view_entry = rjson::empty_object();
                const sstring& cf_name = vptr->cf_name();
                // Internal index-backing view names embed the user-visible
                // index name after a ':' delimiter.
                size_t delim_it = cf_name.find(':');
                if (delim_it == sstring::npos) {
                    elogger.error("Invalid internal index table name: {}", cf_name);
                    continue;
                }
                sstring index_name = cf_name.substr(delim_it + 1);
                rjson::add(view_entry, "IndexName", rjson::from_string(index_name));
                rjson::add(view_entry, "IndexArn", generate_arn_for_index(*schema, index_name));
                // Add indexes's KeySchema and collect types for AttributeDefinitions:
                executor::describe_key_schema(view_entry, *vptr, key_attribute_types, db::get_tags_of_table(vptr));
                // Add projection type
                rjson::value projection = rjson::empty_object();
                rjson::add(projection, "ProjectionType", "ALL");
                // FIXME: we have to get ProjectionType from the schema when it is added
                rjson::add(view_entry, "Projection", std::move(projection));
                // Local secondary indexes are marked by an extra '!' sign occurring before the ':' delimiter
                bool is_lsi = (delim_it > 1 && cf_name[delim_it-1] == '!');
                // Add IndexStatus and Backfilling flags, but only for GSIs -
                // LSIs can only be created with the table itself and do not
                // have a status. Alternator schema operations are synchronous
                // so only two combinations of these flags are possible: ACTIVE
                // (for a built view) or CREATING+Backfilling (if view building
                // is in progress).
                if (!is_lsi) {
                    if (co_await is_view_built(vptr, proxy, client_state, trace_state, permit)) {
                        rjson::add(view_entry, "IndexStatus", "ACTIVE");
                    } else {
                        rjson::add(view_entry, "IndexStatus", "CREATING");
                        rjson::add(view_entry, "Backfilling", rjson::value(true));
                    }
                }
                rjson::value& index_array = is_lsi ? lsi_array : gsi_array;
                rjson::push_back(index_array, std::move(view_entry));
            }
            // Only emit the index arrays when non-empty, matching DynamoDB
            // which omits these members entirely when there are no indexes.
            if (!lsi_array.Empty()) {
                rjson::add(table_description, "LocalSecondaryIndexes", std::move(lsi_array));
            }
            if (!gsi_array.Empty()) {
                rjson::add(table_description, "GlobalSecondaryIndexes", std::move(gsi_array));
            }
        }
        // Use map built by describe_key_schema() for base and indexes to produce
        // AttributeDefinitions for all key columns:
        rjson::value attribute_definitions = rjson::empty_array();
        for (auto& type : key_attribute_types) {
            rjson::value key = rjson::empty_object();
            rjson::add(key, "AttributeName", rjson::from_string(type.first));
            rjson::add(key, "AttributeType", rjson::from_string(type.second));
            rjson::push_back(attribute_definitions, std::move(key));
        }
        rjson::add(table_description, "AttributeDefinitions", std::move(attribute_definitions));
    }
    executor::supplement_table_stream_info(table_description, *schema, proxy);

    // FIXME: still missing some response fields (issue #5026)
    co_return table_description;
}
|
|
|
|
// Returns true if the given keyspace belongs to Alternator, i.e., its name
// begins with executor::KEYSPACE_NAME_PREFIX (each Alternator table lives
// in its own prefixed keyspace).
bool is_alternator_keyspace(const sstring& ks_name) {
    // find() == 0 <=> the name starts with the prefix.
    return ks_name.find(executor::KEYSPACE_NAME_PREFIX) == 0;
}
|
|
|
|
// The Alternator (DynamoDB-visible) table name for a schema is simply its
// column-family name.
sstring executor::table_name(const schema& s) {
    return s.cf_name();
}
|
|
|
|
// Implement the DescribeTable API operation: look up the requested table
// and return a response whose "Table" member is filled in by
// fill_table_description() with status ACTIVE (Alternator's CreateTable is
// synchronous, so an existing table is always ACTIVE).
future<executor::request_return_type> executor::describe_table(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request) {
    _stats.api_operations.describe_table++;
    elogger.trace("Describing table {}", request);

    schema_ptr schema = get_table(_proxy, request);
    // Also count the operation in the per-table statistics.
    get_stats_from_schema(_proxy, *schema)->api_operations.describe_table++;
    tracing::add_alternator_table_name(trace_state, schema->cf_name());

    rjson::value table_description = co_await fill_table_description(schema, table_status::active, _proxy, client_state, trace_state, permit);
    rjson::value response = rjson::empty_object();
    rjson::add(response, "Table", std::move(table_description));
    elogger.trace("returning {}", response);
    co_return rjson::print(std::move(response));
}
|
|
|
|
// This function increments the authorization_failures counter, and may also
|
|
// log a warn-level message and/or throw an access_denied exception, depending
|
|
// on what enforce_authorization and warn_authorization are set to.
|
|
// Note that if enforce_authorization is false, this function will return
|
|
// without throwing. So a caller that doesn't want to continue after an
|
|
// authorization_error must explicitly return after calling this function.
|
|
static void authorization_error(alternator::stats& stats, bool enforce_authorization, bool warn_authorization, std::string msg) {
|
|
stats.authorization_failures++;
|
|
if (enforce_authorization) {
|
|
if (warn_authorization) {
|
|
elogger.warn("alternator_warn_authorization=true: {}", msg);
|
|
}
|
|
throw api_error::access_denied(std::move(msg));
|
|
} else {
|
|
if (warn_authorization) {
|
|
elogger.warn("If you set alternator_enforce_authorization=true the following will be enforced: {}", msg);
|
|
}
|
|
}
|
|
}
|
|
|
|
// Check CQL's Role-Based Access Control (RBAC) permission_to_check (MODIFY,
// SELECT, DROP, etc.) on the given table. When permission is denied an
// appropriate user-readable api_error::access_denied is thrown.
future<> verify_permission(
        bool enforce_authorization,
        bool warn_authorization,
        const service::client_state& client_state,
        const schema_ptr& schema,
        auth::permission permission_to_check,
        alternator::stats& stats) {
    // Fast path: authorization is neither enforced nor warned about.
    if (!enforce_authorization && !warn_authorization) {
        co_return;
    }
    // Unfortunately, the fix for issue #23218 did not modify the function
    // that we use here - check_has_permissions(). So if we want to allow
    // writes to internal tables (from try_get_internal_table()) only to a
    // superuser, we need to explicitly check it here.
    if (permission_to_check == auth::permission::MODIFY && is_internal_keyspace(schema->ks_name())) {
        if (!client_state.user() ||
            !client_state.user()->name ||
            !co_await client_state.get_auth_service()->underlying_role_manager().is_superuser(*client_state.user()->name)) {
            sstring username = "<anonymous>";
            if (client_state.user() && client_state.user()->name) {
                username = client_state.user()->name.value();
            }
            authorization_error(stats, enforce_authorization, warn_authorization, fmt::format(
                "Write access denied on internal table {}.{} to role {} because it is not a superuser",
                schema->ks_name(), schema->cf_name(), username));
            // authorization_error() does not throw when enforcement is off,
            // so return explicitly to skip the regular permission check.
            co_return;
        }
    }
    auto resource = auth::make_data_resource(schema->ks_name(), schema->cf_name());
    // An anonymous client (no user, or a user with no name) is treated as
    // having no permissions.
    if (!client_state.user() || !client_state.user()->name ||
        !co_await client_state.check_has_permission(auth::command_desc(permission_to_check, resource))) {
        sstring username = "<anonymous>";
        if (client_state.user() && client_state.user()->name) {
            username = client_state.user()->name.value();
        }
        // Using exceptions for errors makes this function faster in the
        // success path (when the operation is allowed).
        authorization_error(stats, enforce_authorization, warn_authorization, fmt::format(
            "{} access on table {}.{} is denied to role {}, client address {}",
            auth::permissions::to_string(permission_to_check),
            schema->ks_name(), schema->cf_name(), username, client_state.get_client_address()));
    }
}
|
|
|
|
// Similar to verify_permission() above, but just for CREATE operations.
|
|
// Those do not operate on any specific table, so require permissions on
|
|
// ALL KEYSPACES instead of any specific table.
|
|
static future<> verify_create_permission(bool enforce_authorization, bool warn_authorization, const service::client_state& client_state, alternator::stats& stats) {
|
|
if (!enforce_authorization && !warn_authorization) {
|
|
co_return;
|
|
}
|
|
auto resource = auth::resource(auth::resource_kind::data);
|
|
if (!co_await client_state.check_has_permission(auth::command_desc(auth::permission::CREATE, resource))) {
|
|
sstring username = "<anonymous>";
|
|
if (client_state.user() && client_state.user()->name) {
|
|
username = client_state.user()->name.value();
|
|
}
|
|
authorization_error(stats, enforce_authorization, warn_authorization, fmt::format(
|
|
"CREATE access on ALL KEYSPACES is denied to role {}", username));
|
|
}
|
|
}
|
|
|
|
// Implement the DeleteTable API operation: drop the table's keyspace (each
// Alternator table lives in its own keyspace), the table itself and all its
// views, and revoke all permissions granted on them. Requires DROP
// permission on the table. The whole schema change is performed on shard 0
// under a group0 guard, retried on concurrent schema modifications. The
// response echoes a "TableDescription" captured (with status DELETING)
// before the drop.
future<executor::request_return_type> executor::delete_table(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request) {
    _stats.api_operations.delete_table++;
    elogger.trace("Deleting table {}", request);

    std::string table_name = get_table_name(request);

    std::string keyspace_name = executor::KEYSPACE_NAME_PREFIX + table_name;
    tracing::add_alternator_table_name(trace_state, table_name);
    auto& p = _proxy.container();

    schema_ptr schema = get_table(_proxy, request);
    // Build the response before dropping anything - the schema is still
    // available here, and the description is reported with DELETING status.
    rjson::value table_description = co_await fill_table_description(schema, table_status::deleting, _proxy, client_state, trace_state, permit);
    co_await verify_permission(_enforce_authorization, _warn_authorization, client_state, schema, auth::permission::DROP, _stats);
    // Schema changes must run on shard 0, where the migration manager
    // coordinates group0 operations.
    co_await _mm.container().invoke_on(0, [&, cs = client_state.move_to_other_shard()] (service::migration_manager& mm) -> future<> {
        size_t retries = mm.get_concurrent_ddl_retries();
        for (;;) {
            auto group0_guard = co_await mm.start_group0_operation();

            // Re-check existence under the guard - the table may have been
            // dropped concurrently since the permission check above.
            std::optional<data_dictionary::table> tbl = p.local().data_dictionary().try_find_table(keyspace_name, table_name);
            if (!tbl) {
                // DynamoDB returns validation error even when table does not exist
                // and the table name is invalid.
                validate_table_name(table_name);
                throw api_error::resource_not_found(fmt::format("Requested resource not found: Table: {} not found", table_name));
            }

            auto m = co_await service::prepare_column_family_drop_announcement(p.local(), keyspace_name, table_name, group0_guard.write_timestamp(), service::drop_views::yes);
            auto m2 = co_await service::prepare_keyspace_drop_announcement(p.local(), keyspace_name, group0_guard.write_timestamp());

            std::move(m2.begin(), m2.end(), std::back_inserter(m));

            // When deleting a table and its views, we need to remove this role's
            // special permissions in those tables (undoing the "auto-grant" done
            // by CreateTable). If we didn't do this, if a second role later
            // recreates a table with the same name, the first role would still
            // have permissions over the new table.
            // To make things more robust we just remove *all* permissions for
            // the deleted table (CQL's drop_table_statement also does this).
            // Unfortunately, there is an API mismatch between this code (which
            // uses separate group0_guard and vector<mutation>) and the function
            // revoke_all() which uses a combined "group0_batch" structure - so
            // we need to do some ugly back-and-forth conversions between the pair
            // to the group0_batch and back to the pair :-(
            service::group0_batch mc(std::move(group0_guard));
            mc.add_mutations(std::move(m));
            auto resource = auth::make_data_resource(schema->ks_name(), schema->cf_name());
            co_await auth::revoke_all(*cs.get().get_auth_service(), resource, mc);
            for (const view_ptr& v : tbl->views()) {
                resource = auth::make_data_resource(v->ks_name(), v->cf_name());
                co_await auth::revoke_all(*cs.get().get_auth_service(), resource, mc);
            }
            std::tie(m, group0_guard) = co_await std::move(mc).extract();

            try {
                co_await mm.announce(std::move(m), std::move(group0_guard), fmt::format("alternator-executor: delete {} table", table_name));
                break;
            } catch (const service::group0_concurrent_modification& ex) {
                elogger.info("Failed to execute DeleteTable {} due to concurrent schema modifications. {}.",
                        table_name, retries ? "Retrying" : "Number of retries exceeded, giving up");
                if (retries--) {
                    continue;
                }
                throw;
            }
        }
    });

    rjson::value response = rjson::empty_object();
    rjson::add(response, "TableDescription", std::move(table_description));
    elogger.trace("returning {}", response);
    co_return rjson::print(std::move(response));
}
|
|
|
|
// Map a DynamoDB key attribute type string to the CQL type used to store
// it. Note that keys are only allowed to be string, blob or number (S/B/N);
// the other types - boolean and the various lists or sets - are rejected
// with a ValidationException.
static data_type parse_key_type(std::string_view type) {
    if (type == "S") {
        return utf8_type;
    }
    if (type == "B") {
        return bytes_type;
    }
    if (type == "N") {
        // FIXME: use a specialized Alternator type, not the general "decimal_type".
        return decimal_type;
    }
    throw api_error::validation(
        fmt::format("Invalid key type '{}', can only be S, B or N.", type));
}
|
|
|
|
|
|
// Add to the schema being built a column of the given name and kind
// (partition key, clustering key, ...), taking its type from the matching
// entry in the request's AttributeDefinitions array. With
// computed_column=true (used for GSI keys) the column is created as a
// computed column that extracts its value from the ":attrs" map instead of
// being a real stored column. Throws a ValidationException if the name is
// reserved or missing from AttributeDefinitions.
static void add_column(schema_builder& builder, const std::string& name, const rjson::value& attribute_definitions, column_kind kind, bool computed_column=false) {
    // FIXME: Currently, the column name ATTRS_COLUMN_NAME is not allowed
    // because we use it for our untyped attribute map, and we can't have a
    // second column with the same name. We should fix this, by renaming
    // some column names which we want to reserve.
    if (name == executor::ATTRS_COLUMN_NAME) {
        throw api_error::validation(fmt::format("Column name '{}' is currently reserved. FIXME.", name));
    }
    // Linear scan of AttributeDefinitions for the entry naming this column.
    for (auto it = attribute_definitions.Begin(); it != attribute_definitions.End(); ++it) {
        const rjson::value& attribute_info = *it;
        if (rjson::to_string_view(attribute_info["AttributeName"]) == name) {
            std::string_view type = rjson::to_string_view(attribute_info["AttributeType"]);
            data_type dt = parse_key_type(type);
            if (computed_column) {
                // Computed column for GSI (doesn't choose a real column as-is
                // but rather extracts a single value from the ":attrs" map)
                alternator_type at = type_info_from_string(type).atype;
                builder.with_computed_column(to_bytes(name), dt, kind,
                    std::make_unique<extract_from_attrs_column_computation>(to_bytes(name), at));
            } else {
                builder.with_column(to_bytes(name), dt, kind);
            }
            return;
        }
    }
    throw api_error::validation(
        fmt::format("KeySchema key '{}' missing in AttributeDefinitions", name));
}
|
|
|
|
// Parse the KeySchema request attribute, which specifies the column names
// for a key. A KeySchema must include up to two elements, the first must be
// the HASH key name, and the second one, if exists, must be a RANGE key name.
// The function returns the two column names - the first is the hash key
// and always present, the second is the range key and may be an empty string.
// Throws a ValidationException on any structural problem; the
// supplementary_context string is used in attribute-name-length error
// messages to indicate which request member was being parsed.
static std::pair<std::string, std::string> parse_key_schema(const rjson::value& obj, std::string_view supplementary_context) {
    const rjson::value *key_schema;
    if (!obj.IsObject() || !(key_schema = rjson::find(obj, "KeySchema"))) {
        throw api_error::validation("Missing KeySchema member");
    }
    if (!key_schema->IsArray() || key_schema->Size() < 1 || key_schema->Size() > 2) {
        throw api_error::validation("KeySchema must list exactly one or two key columns");
    }
    // First element: must be a {"KeyType": "HASH", "AttributeName": <string>}
    // object.
    if (!(*key_schema)[0].IsObject()) {
        throw api_error::validation("First element of KeySchema must be an object");
    }
    const rjson::value *v = rjson::find((*key_schema)[0], "KeyType");
    if (!v || !v->IsString() || rjson::to_string_view(*v) != "HASH") {
        throw api_error::validation("First key in KeySchema must be a HASH key");
    }
    v = rjson::find((*key_schema)[0], "AttributeName");
    if (!v || !v->IsString()) {
        throw api_error::validation("First key in KeySchema must have string AttributeName");
    }
    validate_attr_name_length(supplementary_context, v->GetStringLength(), true, "HASH key in KeySchema - ");
    std::string hash_key = rjson::to_string(*v);
    std::string range_key;
    // Optional second element: must be a RANGE key object of the same shape.
    if (key_schema->Size() == 2) {
        if (!(*key_schema)[1].IsObject()) {
            throw api_error::validation("Second element of KeySchema must be an object");
        }
        v = rjson::find((*key_schema)[1], "KeyType");
        if (!v || !v->IsString() || rjson::to_string_view(*v) != "RANGE") {
            throw api_error::validation("Second key in KeySchema must be a RANGE key");
        }
        v = rjson::find((*key_schema)[1], "AttributeName");
        if (!v || !v->IsString()) {
            throw api_error::validation("Second key in KeySchema must have string AttributeName");
        }
        validate_attr_name_length(supplementary_context, v->GetStringLength(), true, "RANGE key in KeySchema - ");
        range_key = v->GetString();
    }
    return {hash_key, range_key};
}
|
|
|
|
// Resolve a table schema from a resource ARN, as used by the tagging
// operations (TagResource, UntagResource, ListTagsOfResource).
static schema_ptr get_table_from_arn(service::storage_proxy& proxy, std::string_view arn) {
    // Expected format: arn:scylla:alternator:${KEYSPACE_NAME}:scylla:table/${TABLE_NAME};
    constexpr size_t prefix_size = sizeof("arn:scylla:alternator:") - 1;
    // NOTE: This code returns AccessDeniedException if it's problematic to parse or recognize an arn.
    // Technically, a properly formatted, but nonexistent arn *should* return AccessDeniedException,
    // while an incorrectly formatted one should return ValidationException.
    // Unfortunately, the rules are really uncertain, since DynamoDB
    // states that arns are of the form arn:partition:service:region:account-id:resource-type/resource-id
    // or similar - yet, for some arns that do not fit that pattern (e.g. "john"),
    // it still returns AccessDeniedException rather than ValidationException.
    // Consequently, this code simply falls back to AccessDeniedException,
    // concluding that an error is an error and code which uses tagging
    // must be ready for handling AccessDeniedException instances anyway.
    try {
        // The keyspace name occupies the "region" slot of the ARN; the table
        // name follows the first '/'.
        size_t keyspace_end = arn.find_first_of(':', prefix_size);
        std::string_view keyspace_name = arn.substr(prefix_size, keyspace_end - prefix_size);
        size_t table_start = arn.find_first_of('/');
        std::string_view table_name = arn.substr(table_start + 1);
        if (table_name.find('/') != std::string_view::npos) {
            // A table name cannot contain a '/' - if it does, it's not a
            // table ARN, it may be an index. DynamoDB returns a
            // ValidationException in that case - see #10786.
            throw api_error::validation(fmt::format("ResourceArn '{}' is not a valid table ARN", table_name));
        }
        // FIXME: remove sstring creation once find_schema gains a view-based interface
        return proxy.data_dictionary().find_schema(sstring(keyspace_name), sstring(table_name));
    } catch (const data_dictionary::no_such_column_family& e) {
        throw api_error::resource_not_found(fmt::format("ResourceArn '{}' not found", arn));
    } catch (const std::out_of_range& e) {
        // Thrown by substr() when the ARN is too short / malformed.
        throw api_error::access_denied("Incorrect resource identifier");
    }
}
|
|
|
|
// Check whether a single character may appear in a tag key or value.
// FIXME: According to docs, unicode strings should also be accepted.
// Alternator currently uses a simplified ASCII approach
static bool is_legal_tag_char(char c) {
    // The <cctype> classification functions have undefined behavior when
    // given a negative value other than EOF, so a plain (possibly signed)
    // char holding a byte >= 0x80 must be converted through unsigned char
    // first. Such bytes are then simply classified as illegal.
    const auto uc = static_cast<unsigned char>(c);
    return std::isalnum(uc) || std::isspace(uc)
        || c == '+' || c == '-' || c == '=' || c == '.' || c == '_' || c == ':' || c == '/' ;
}

// Check that every character of the given tag key or value is legal.
// An empty string is trivially legal (length limits are checked elsewhere).
static bool validate_legal_tag_chars(std::string_view tag) {
    return std::all_of(tag.begin(), tag.end(), &is_legal_tag_char);
}
|
|
|
|
// The closed set of values accepted for the write-isolation setting - both
// in the rmw_operation::WRITE_ISOLATION_TAG_KEY table tag and in the
// --alternator-write-isolation command-line option. Each policy accepts a
// one-letter abbreviation plus longer spellings; parse_write_isolation()
// below dispatches on the first letter only.
static const std::unordered_set<std::string_view> allowed_write_isolation_values = {
    "f", "forbid", "forbid_rmw",
    "a", "always", "always_use_lwt",
    "o", "only_rmw_uses_lwt",
    "u", "unsafe", "unsafe_rmw",
};
|
|
|
|
static void validate_tags(const std::map<sstring, sstring>& tags) {
|
|
auto it = tags.find(rmw_operation::WRITE_ISOLATION_TAG_KEY);
|
|
if (it != tags.end()) {
|
|
std::string_view value = it->second;
|
|
if (!allowed_write_isolation_values.contains(value)) {
|
|
throw api_error::validation(
|
|
fmt::format("Incorrect write isolation tag {}. Allowed values: {}", value, allowed_write_isolation_values));
|
|
}
|
|
}
|
|
}
|
|
|
|
// Translate an already-validated write-isolation setting into the enum
// value. Only the first character is inspected - all allowed spellings of
// the same policy start with the same letter (see
// allowed_write_isolation_values above).
static rmw_operation::write_isolation parse_write_isolation(std::string_view value) {
    if (value.empty()) {
        // Shouldn't happen as validate_tags() / set_default_write_isolation()
        // verify allow only a closed set of values.
        return rmw_operation::default_write_isolation;
    }
    switch (value[0]) {
    case 'f':
        return rmw_operation::write_isolation::FORBID_RMW;
    case 'a':
        return rmw_operation::write_isolation::LWT_ALWAYS;
    case 'o':
        return rmw_operation::write_isolation::LWT_RMW_ONLY;
    case 'u':
        return rmw_operation::write_isolation::UNSAFE_RMW;
    default:
        // Shouldn't happen either (see above) - fall back to the default.
        return rmw_operation::default_write_isolation;
    }
}
|
|
// The process-wide default write-isolation policy, used when a table has no
// explicit write-isolation tag.
// This default_write_isolation is always overwritten in main.cc, which calls
// set_default_write_isolation().
rmw_operation::write_isolation rmw_operation::default_write_isolation =
    rmw_operation::write_isolation::LWT_ALWAYS;
|
|
// Set the process-wide default write-isolation policy from the
// --alternator-write-isolation command-line option. An empty or
// unrecognized value is a fatal configuration error (std::runtime_error).
void rmw_operation::set_default_write_isolation(std::string_view value) {
    if (value.empty()) {
        throw std::runtime_error("When Alternator is enabled, write "
            "isolation policy must be selected, using the "
            "'--alternator-write-isolation' option. "
            "See docs/alternator/alternator.md for instructions.");
    }
    if (!allowed_write_isolation_values.contains(value)) {
        throw std::runtime_error(fmt::format("Invalid --alternator-write-isolation "
            "setting '{}'. Allowed values: {}.",
            value, allowed_write_isolation_values));
    }
    default_write_isolation = parse_write_isolation(value);
}
|
|
|
|
// Alternator uses tags whose keys start with the "system:" prefix for
|
|
// internal purposes. Those should not be readable by ListTagsOfResource,
|
|
// nor writable with TagResource or UntagResource (see #24098).
|
|
// Only a few specific system tags, currently only "system:write_isolation"
|
|
// and "system:initial_tablets", are deliberately intended to be set and read
|
|
// by the user, so are not considered "internal".
|
|
static bool tag_key_is_internal(std::string_view tag_key) {
|
|
return tag_key.starts_with("system:")
|
|
&& tag_key != rmw_operation::WRITE_ISOLATION_TAG_KEY
|
|
&& tag_key != INITIAL_TABLETS_TAG_KEY;
|
|
}
|
|
|
|
// Whether update_tags_map() should add the given tags to the map (TagResource)
// or remove them from it (UntagResource).
enum class update_tags_action { add_tags, delete_tags };
// Apply a TagResource/UntagResource request's tag list to an existing tag
// map in place. For add_tags, "tags" is an array of {Key, Value} objects;
// for delete_tags it is an array of key strings. Key/value syntax, length
// limits, the internal "system:" namespace, the 50-tag cap and the
// write-isolation tag value are all validated, with a ValidationException
// thrown on any violation.
static void update_tags_map(const rjson::value& tags, std::map<sstring, sstring>& tags_map, update_tags_action action) {
    if (action == update_tags_action::add_tags) {
        for (auto it = tags.Begin(); it != tags.End(); ++it) {
            if (!it->IsObject()) {
                throw api_error::validation("invalid tag object");
            }
            const rjson::value* key = rjson::find(*it, "Key");
            const rjson::value* value = rjson::find(*it, "Value");
            if (!key || !key->IsString() || !value || !value->IsString()) {
                throw api_error::validation("string Key and Value required");
            }
            auto tag_key = rjson::to_string_view(*key);
            auto tag_value = rjson::to_string_view(*value);

            if (tag_key.empty()) {
                throw api_error::validation("A tag Key cannot be empty");
            }
            if (tag_key.size() > 128) {
                throw api_error::validation("A tag Key is limited to 128 characters");
            }
            if (!validate_legal_tag_chars(tag_key)) {
                throw api_error::validation("A tag Key can only contain letters, spaces, and [+-=._:/]");
            }
            if (tag_key_is_internal(tag_key)) {
                throw api_error::validation(fmt::format("Tag key '{}' is reserved for internal use", tag_key));
            }
            // Note tag values are limited similarly to tag keys, but have a
            // longer length limit, and *can* be empty.
            if (tag_value.size() > 256) {
                throw api_error::validation("A tag Value is limited to 256 characters");
            }
            if (!validate_legal_tag_chars(tag_value)) {
                throw api_error::validation("A tag Value can only contain letters, spaces, and [+-=._:/]");
            }
            // Adding an existing key overwrites its previous value.
            tags_map[sstring(tag_key)] = sstring(tag_value);
        }
    } else if (action == update_tags_action::delete_tags) {
        for (auto it = tags.Begin(); it != tags.End(); ++it) {
            auto tag_key = rjson::to_string_view(*it);
            if (tag_key_is_internal(tag_key)) {
                throw api_error::validation(fmt::format("Tag key '{}' is reserved for internal use", tag_key));
            }
            // Deleting a nonexistent key is a silent no-op.
            tags_map.erase(sstring(tag_key));
        }
    }

    if (tags_map.size() > 50) {
        throw api_error::validation("Number of Tags exceed the current limit for the provided ResourceArn");
    }
    validate_tags(tags_map);
}
|
|
|
|
// Return the tags map attached to the given table's schema, or throw a
// user-visible ValidationException when the schema carries no tags
// extension (i.e., the table wasn't created through Alternator).
const std::map<sstring, sstring>& get_tags_of_table_or_throw(schema_ptr schema) {
    auto tags = db::get_tags_of_table(schema);
    if (!tags) {
        throw api_error::validation(format("Table {} does not have valid tagging information", schema->ks_name()));
    }
    return *tags;
}
|
|
|
|
// TagResource operation: attach (or overwrite) tags on the table named by
// "ResourceArn". Tags are validated and merged into the table's existing
// tag map inside db::modify_tags(). Returns an empty JSON response.
future<executor::request_return_type> executor::tag_resource(client_state& client_state, service_permit permit, rjson::value request) {
    _stats.api_operations.tag_resource++;

    // The ARN identifies which table the tags should be attached to.
    const rjson::value* arn_v = rjson::find(request, "ResourceArn");
    if (!arn_v || !arn_v->IsString()) {
        co_return api_error::access_denied("Incorrect resource identifier");
    }
    schema_ptr schema = get_table_from_arn(_proxy, rjson::to_string_view(*arn_v));
    get_stats_from_schema(_proxy, *schema)->api_operations.tag_resource++;

    // "Tags" must be a non-empty array of {Key, Value} objects.
    const rjson::value* new_tags = rjson::find(request, "Tags");
    if (!new_tags || !new_tags->IsArray()) {
        co_return api_error::validation("Cannot parse tags");
    }
    if (new_tags->Size() < 1) {
        co_return api_error::validation("The number of tags must be at least 1");
    }
    co_await verify_permission(_enforce_authorization, _warn_authorization, client_state, schema, auth::permission::ALTER, _stats);
    co_await db::modify_tags(_mm, schema->ks_name(), schema->cf_name(), [new_tags](std::map<sstring, sstring>& tags_map) {
        update_tags_map(*new_tags, tags_map, update_tags_action::add_tags);
    });
    co_return ""; // empty response
}
|
|
|
|
// UntagResource operation: remove the tag keys listed in "TagKeys" from the
// table named by "ResourceArn". Removal of internal (reserved) keys is
// rejected inside update_tags_map(). Returns an empty JSON response.
future<executor::request_return_type> executor::untag_resource(client_state& client_state, service_permit permit, rjson::value request) {
    _stats.api_operations.untag_resource++;

    const rjson::value* arn_v = rjson::find(request, "ResourceArn");
    if (!arn_v || !arn_v->IsString()) {
        co_return api_error::access_denied("Incorrect resource identifier");
    }
    // "TagKeys" must be an array of key strings.
    const rjson::value* keys = rjson::find(request, "TagKeys");
    if (!keys || !keys->IsArray()) {
        co_return api_error::validation(format("Cannot parse tag keys"));
    }

    schema_ptr schema = get_table_from_arn(_proxy, rjson::to_string_view(*arn_v));
    get_stats_from_schema(_proxy, *schema)->api_operations.untag_resource++;
    co_await verify_permission(_enforce_authorization, _warn_authorization, client_state, schema, auth::permission::ALTER, _stats);
    co_await db::modify_tags(_mm, schema->ks_name(), schema->cf_name(), [keys](std::map<sstring, sstring>& tags_map) {
        update_tags_map(*keys, tags_map, update_tags_action::delete_tags);
    });
    co_return ""; // empty response
}
|
|
|
|
// ListTagsOfResource operation: return every user-visible tag of the table
// named by "ResourceArn" as a JSON array of {Key, Value} objects. Tags
// whose key is reserved for internal use are filtered out of the reply.
future<executor::request_return_type> executor::list_tags_of_resource(client_state& client_state, service_permit permit, rjson::value request) {
    _stats.api_operations.list_tags_of_resource++;
    const rjson::value* arn_v = rjson::find(request, "ResourceArn");
    if (!arn_v || !arn_v->IsString()) {
        return make_ready_future<request_return_type>(api_error::access_denied("Incorrect resource identifier"));
    }
    schema_ptr schema = get_table_from_arn(_proxy, rjson::to_string_view(*arn_v));
    get_stats_from_schema(_proxy, *schema)->api_operations.list_tags_of_resource++;
    auto tags_map = get_tags_of_table_or_throw(schema);

    rjson::value ret = rjson::empty_object();
    rjson::add(ret, "Tags", rjson::empty_array());
    rjson::value& tag_array = ret["Tags"];
    for (const auto& [tag_key, tag_value] : tags_map) {
        if (tag_key_is_internal(tag_key)) {
            // Internal Scylla tags are hidden from the user's view.
            continue;
        }
        rjson::value entry = rjson::empty_object();
        rjson::add(entry, "Key", rjson::from_string(tag_key));
        rjson::add(entry, "Value", rjson::from_string(tag_value));
        rjson::push_back(tag_array, std::move(entry));
    }

    return make_ready_future<executor::request_return_type>(rjson::print(std::move(ret)));
}
|
|
|
|
// Result of parsing the "BillingMode" / "ProvisionedThroughput" request
// parameters (see verify_billing_mode() below).
struct billing_mode_type {
    // True when BillingMode=PROVISIONED; false for PAY_PER_REQUEST.
    bool provisioned = false;
    // Requested read/write capacity units. Only set (and only meaningful)
    // when provisioned == true - left uninitialized otherwise.
    int rcu;
    int wcu;
};
|
|
|
|
// Parse and validate the "BillingMode" and "ProvisionedThroughput"
// parameters of a CreateTable/UpdateTable request.
// Returns: for PROVISIONED, {provisioned=true, rcu, wcu}; for
// PAY_PER_REQUEST, a default-constructed billing_mode_type.
// Throws api_error::validation on any illegal combination.
static billing_mode_type verify_billing_mode(const rjson::value& request) {
    // Alternator does not yet support billing or throughput limitations, but
    // let's verify that BillingMode is at least legal.
    std::string billing_mode = get_string_attribute(request, "BillingMode", "PROVISIONED");
    if (billing_mode == "PAY_PER_REQUEST") {
        if (rjson::find(request, "ProvisionedThroughput")) {
            throw api_error::validation("When BillingMode=PAY_PER_REQUEST, ProvisionedThroughput cannot be specified.");
        }
    } else if (billing_mode == "PROVISIONED") {
        const rjson::value *provisioned_throughput = rjson::find(request, "ProvisionedThroughput");
        if (!provisioned_throughput) {
            throw api_error::validation("When BillingMode=PROVISIONED, ProvisionedThroughput must be specified.");
        }
        const rjson::value& throughput = *provisioned_throughput;
        auto rcu = get_int_attribute(throughput, "ReadCapacityUnits");
        auto wcu = get_int_attribute(throughput, "WriteCapacityUnits");
        if (!rcu.has_value()) {
            throw api_error::validation("provisionedThroughput.readCapacityUnits is missing, when BillingMode=PROVISIONED, ProvisionedThroughput must be specified.");
        }
        if (!wcu.has_value()) {
            throw api_error::validation("provisionedThroughput.writeCapacityUnits is missing, when BillingMode=PROVISIONED, ProvisionedThroughput must be specified.");
        }
        return billing_mode_type{true, *rcu, *wcu};
    } else {
        // Include the offending value in the message: previously the "{}"
        // placeholder was emitted verbatim because the string was never
        // passed through fmt::format.
        throw api_error::validation(fmt::format("Unknown BillingMode={}. Must be PAY_PER_REQUEST or PROVISIONED.", billing_mode));
    }
    // PAY_PER_REQUEST: provisioned=false, capacity units unused.
    return billing_mode_type();
}
|
|
|
|
// Validate that a AttributeDefinitions parameter in CreateTable is valid, and
|
|
// throws user-facing api_error::validation if it's not.
|
|
// In particular, verify that the same AttributeName doesn't appear more than
|
|
// once (Issue #13870).
|
|
// Return the set of attribute names defined in AttributeDefinitions - this
|
|
// set is useful for later verifying that all of them are used by some
|
|
// KeySchema (issue #19784)
|
|
// Validate a CreateTable/UpdateTable AttributeDefinitions array, throwing a
// user-facing api_error::validation on any problem - in particular when the
// same AttributeName appears twice (issue #13870). Returns the set of
// attribute names defined, so the caller can later verify that every one of
// them is actually used by some KeySchema (issue #19784).
static std::unordered_set<std::string> validate_attribute_definitions(std::string_view supplementary_context, const rjson::value& attribute_definitions) {
    if (!attribute_definitions.IsArray()) {
        throw api_error::validation("AttributeDefinitions must be an array");
    }
    std::unordered_set<std::string> defined_names;
    for (const rjson::value& def : attribute_definitions.GetArray()) {
        // Each entry needs a string "AttributeName"...
        const rjson::value* name = rjson::find(def, "AttributeName");
        if (!name) {
            throw api_error::validation("AttributeName missing in AttributeDefinitions");
        }
        if (!name->IsString()) {
            throw api_error::validation("AttributeName in AttributeDefinitions must be a string");
        }
        validate_attr_name_length(supplementary_context, name->GetStringLength(), true, "in AttributeDefinitions - ");
        // ...that hasn't been defined before...
        if (!defined_names.emplace(rjson::to_string_view(*name)).second) {
            throw api_error::validation(fmt::format("Duplicate AttributeName={} in AttributeDefinitions",
                    rjson::to_string_view(*name)));
        }
        // ...and a string "AttributeType".
        const rjson::value* type = rjson::find(def, "AttributeType");
        if (!type) {
            throw api_error::validation("AttributeType missing in AttributeDefinitions");
        }
        if (!type->IsString()) {
            throw api_error::validation("AttributeType in AttributeDefinitions must be a string");
        }
    }
    return defined_names;
}
|
|
|
|
// The following "extract_from_attrs_column_computation" implementation is
|
|
// what allows Alternator GSIs and LSIs to use in a materialized view's key a
|
|
// member from the ":attrs" map instead of a real column in the schema:
|
|
|
|
// Name of the map column (":attrs") from which this computed column
// extracts the attribute's value.
const bytes extract_from_attrs_column_computation::MAP_NAME = executor::ATTRS_COLUMN_NAME;
|
|
|
|
// Return a copy of this column computation, as required by the
// column_computation interface.
column_computation_ptr extract_from_attrs_column_computation::clone() const {
    return std::make_unique<extract_from_attrs_column_computation>(*this);
}
|
|
|
|
// Serialize the *definition* of this column computation into a JSON
|
|
// string with a unique "type" string - TYPE_NAME - which then causes
|
|
// column_computation::deserialize() to create an object from this class.
|
|
bytes extract_from_attrs_column_computation::serialize() const {
    rjson::value descr = rjson::empty_object();
    // "type" is the tag that column_computation::deserialize() dispatches
    // on; "attr_name" and "desired_type" are the state needed to rebuild
    // this object (see the rjson constructor below).
    rjson::add(descr, "type", TYPE_NAME);
    rjson::add(descr, "attr_name", rjson::from_string(to_string_view(_attr_name)));
    rjson::add(descr, "desired_type", represent_type(_desired_type).ident);
    return to_bytes(rjson::print(descr));
}
|
|
|
|
// Construct an extract_from_attrs_column_computation object based on the
|
|
// saved output of serialize(). Calls on_internal_error() if the string
|
|
// doesn't match the expected output format of serialize(). "type" is not
|
|
// checked - we assume the caller (column_computation::deserialize()) won't
|
|
// call this constructor if "type" doesn't match.
|
|
// Reconstruct an extract_from_attrs_column_computation from the JSON saved
// by serialize(). Calls on_internal_error() if the JSON doesn't match the
// expected output format of serialize(). "type" is not checked - we assume
// the caller (column_computation::deserialize()) won't call this
// constructor if "type" doesn't match.
extract_from_attrs_column_computation::extract_from_attrs_column_computation(const rjson::value &v) {
    // rjson::find() returns nullptr when a member is missing, so the
    // pointers must be null-checked before dereferencing. Previously a
    // definition with a missing "attr_name" or "desired_type" member would
    // crash on a null dereference instead of reaching the orderly
    // on_internal_error() report below.
    const rjson::value* attr_name = rjson::find(v, "attr_name");
    if (attr_name && attr_name->IsString()) {
        _attr_name = bytes(to_bytes_view(rjson::to_string_view(*attr_name)));
        const rjson::value* desired_type = rjson::find(v, "desired_type");
        if (desired_type && desired_type->IsString()) {
            _desired_type = type_info_from_string(rjson::to_string_view(*desired_type)).atype;
            switch (_desired_type) {
            case alternator_type::S:
            case alternator_type::B:
            case alternator_type::N:
                // Only the scalar key-eligible types are valid here.
                // We're done.
                return;
            default:
                // Fall through to on_internal_error below.
                break;
            }
        }
    }
    on_internal_error(elogger, format("Improperly formatted alternator::extract_from_attrs_column_computation computed column definition: {}", v));
}
|
|
|
|
// Compute the value of the computed view-key column for a base-table row:
// find the attribute named _attr_name inside the row's ":attrs" map cell.
// Returns an empty result when the row has no attrs cell or the attribute
// isn't present in the map; a found cell is filtered through
// serialized_value_if_type with _desired_type (presumably rejecting values
// of the wrong alternator type - see serialized_value_if_type).
regular_column_transformation::result extract_from_attrs_column_computation::compute_value(
    const schema& schema,
    const partition_key& key,
    const db::view::clustering_or_static_row& row) const
{
    const column_definition* attrs_col = schema.get_column_definition(MAP_NAME);
    if (!attrs_col || !attrs_col->is_regular() || !attrs_col->is_multi_cell()) {
        // A missing or wrongly-shaped ":attrs" column means the schema is
        // not an Alternator table schema - an internal bug, not user error.
        on_internal_error(elogger, "extract_from_attrs_column_computation::compute_value() on a table without an attrs map");
    }
    // Look for the desired attribute _attr_name in the attrs_col map in row:
    const atomic_cell_or_collection* attrs = row.cells().find_cell(attrs_col->id);
    if (!attrs) {
        // The row carries no attrs cell at all - no value for the view key.
        return regular_column_transformation::result();
    }
    collection_mutation_view cmv = attrs->as_collection_mutation();
    return cmv.with_deserialized(*attrs_col->type, [this] (const collection_mutation_view_description& cmvd) {
        // Linear scan over the map's cells; the map keys are attribute names.
        for (auto&& [key, cell] : cmvd.cells) {
            if (key == _attr_name) {
                return regular_column_transformation::result(cell,
                    std::bind(serialized_value_if_type, std::placeholders::_1, _desired_type));
            }
        }
        // Attribute not present in this row's attrs map.
        return regular_column_transformation::result();
    });
}
|
|
|
|
// extract_from_attrs_column_computation needs the whole row to compute
|
|
// value, it can't use just the partition key.
|
|
bytes extract_from_attrs_column_computation::compute_value(const schema&, const partition_key&) const {
    // This overload must never be called: the attribute lives in the
    // ":attrs" map (a regular column), so only the row-based overload
    // above can produce a value.
    on_internal_error(elogger, "extract_from_attrs_column_computation::compute_value called without row");
}
|
|
|
|
// Because `CreateTable` request creates GSI/LSI together with the base table (so the base table is empty),
|
|
// we can skip view building process and immediately mark the view as built on all nodes.
|
|
//
|
|
// However, we can do this only for tablet-based views because `view_building_worker` will automatically propagate
|
|
// this information to `system.built_views` table (see `view_building_worker::update_built_views()`).
|
|
// For vnode-based views, `view_builder` will process the view and mark it as built.
|
|
static future<> mark_view_schemas_as_built(utils::chunked_vector<mutation>& out, std::vector<schema_ptr> schemas, api::timestamp_type ts, service::storage_proxy& sp) {
    auto tmptr = sp.get_token_metadata_ptr();
    for (const schema_ptr& s : schemas) {
        // Base-table schemas need no build status - only the views do.
        if (!s->is_view()) {
            continue;
        }
        // Record build SUCCESS for this view on every known host, all with
        // the same timestamp as the schema change itself.
        for (const auto& host : tmptr->get_topology().get_all_host_ids()) {
            auto m = co_await sp.system_keyspace().make_view_build_status_mutation(ts, {s->ks_name(), s->cf_name()}, host, db::view::build_status::SUCCESS);
            out.push_back(std::move(m));
        }
    }
}
|
|
|
|
// Implementation of the CreateTable operation. Runs on shard 0 (asserted
// below) because schema changes are announced through group0 from there.
// The whole request - base-table key schema, LSIs, GSIs, billing mode,
// SSE, streams, tags - is parsed and validated before any schema change,
// so a malformed request fails without creating anything. The keyspace,
// the base table and one materialized view per index are then created in
// a single group0 announcement, retried on concurrent schema modification.
static future<executor::request_return_type> create_table_on_shard0(service::client_state&& client_state, tracing::trace_state_ptr trace_state, rjson::value request,
        service::storage_proxy& sp, service::migration_manager& mm, gms::gossiper& gossiper, bool enforce_authorization, bool warn_authorization, stats& stats, const db::tablets_mode_t::mode tablets_mode) {
    SCYLLA_ASSERT(this_shard_id() == 0);

    // We begin by parsing and validating the content of the CreateTable
    // command. We can't inspect the current database schema at this point
    // (e.g., verify that this table doesn't already exist) - we can only
    // do this further down - after taking group0_guard.
    std::string table_name = get_table_name(request);
    validate_table_name(table_name);

    if (table_name.find(executor::INTERNAL_TABLE_PREFIX) == 0) {
        co_return api_error::validation(fmt::format("Prefix {} is reserved for accessing internal tables", executor::INTERNAL_TABLE_PREFIX));
    }
    // Each Alternator table lives in its own keyspace, named after the table.
    std::string keyspace_name = executor::KEYSPACE_NAME_PREFIX + table_name;
    const rjson::value* attribute_definitions = rjson::find(request, "AttributeDefinitions");
    if (attribute_definitions == nullptr) {
        co_return api_error::validation("Missing AttributeDefinitions in CreateTable request");
    }
    // Save the list of AttributeDefinitions in unused_attribute_definitions,
    // and below remove each one as we see it in a KeySchema of the table or
    // any of its GSIs or LSIs. If anything remains in this set at the end of
    // this function, it's an error.
    std::unordered_set<std::string> unused_attribute_definitions =
        validate_attribute_definitions("", *attribute_definitions);

    tracing::add_alternator_table_name(trace_state, table_name);

    schema_builder builder(keyspace_name, table_name);
    auto [hash_key, range_key] = parse_key_schema(request, "");
    add_column(builder, hash_key, *attribute_definitions, column_kind::partition_key);
    unused_attribute_definitions.erase(hash_key);
    if (!range_key.empty()) {
        add_column(builder, range_key, *attribute_definitions, column_kind::clustering_key);
        unused_attribute_definitions.erase(range_key);
    }
    // All non-key attributes of an item are stored in a single ":attrs" map.
    builder.with_column(bytes(executor::ATTRS_COLUMN_NAME), attrs_type(), column_kind::regular_column);

    billing_mode_type bm = verify_billing_mode(request);

    // A partial schema (base key columns + attrs map only), used below to
    // decide whether a GSI key attribute is a real base column or must be
    // a computed column extracted from the attrs map.
    schema_ptr partial_schema = builder.build();

    // Parse Local/GlobalSecondaryIndexes parameters before creating the
    // base table, so if we have a parse errors we can fail without creating
    // any table.
    std::vector<schema_builder> view_builders;
    std::unordered_set<std::string> index_names;

    const rjson::value* lsi = rjson::find(request, "LocalSecondaryIndexes");
    if (lsi) {
        if (!lsi->IsArray()) {
            throw api_error::validation("LocalSecondaryIndexes must be an array.");
        }
        for (const rjson::value& l : lsi->GetArray()) {
            const rjson::value* index_name_v = rjson::find(l, "IndexName");
            if (!index_name_v || !index_name_v->IsString()) {
                throw api_error::validation("LocalSecondaryIndexes IndexName must be a string.");
            }
            std::string_view index_name = rjson::to_string_view(*index_name_v);
            // Index names must be unique across both LSIs and GSIs.
            auto [it, added] = index_names.emplace(index_name);
            if (!added) {
                co_return api_error::validation(fmt::format("Duplicate IndexName '{}', ", index_name));
            }
            std::string vname(lsi_name(table_name, index_name));
            elogger.trace("Adding LSI {}", index_name);
            if (range_key.empty()) {
                co_return api_error::validation("LocalSecondaryIndex requires that the base table have a range key");
            }
            // FIXME: read and handle "Projection" parameter. This will
            // require the MV code to copy just parts of the attrs map.
            schema_builder view_builder(keyspace_name, vname);
            auto [view_hash_key, view_range_key] = parse_key_schema(l, "Local Secondary Index");
            if (view_hash_key != hash_key) {
                co_return api_error::validation("LocalSecondaryIndex hash key must match the base table hash key");
            }
            add_column(view_builder, view_hash_key, *attribute_definitions, column_kind::partition_key);
            unused_attribute_definitions.erase(view_hash_key);
            if (view_range_key.empty()) {
                co_return api_error::validation("LocalSecondaryIndex must specify a sort key");
            }
            unused_attribute_definitions.erase(view_range_key);
            if (view_range_key == hash_key) {
                co_return api_error::validation("LocalSecondaryIndex sort key cannot be the same as hash key");
            }
            add_column(view_builder, view_range_key, *attribute_definitions, column_kind::clustering_key, view_range_key != range_key);
            // Base key columns which aren't part of the index's key need to
            // be added to the view nonetheless, as (additional) clustering
            // key(s).
            if (!range_key.empty() && view_range_key != range_key) {
                add_column(view_builder, range_key, *attribute_definitions, column_kind::clustering_key);
            }
            view_builder.with_column(bytes(executor::ATTRS_COLUMN_NAME), attrs_type(), column_kind::regular_column);
            // Note above we don't need to add virtual columns, as all
            // base columns were copied to view. TODO: reconsider the need
            // for virtual columns when we support Projection.
            // LSIs have no tags, but Scylla's "synchronous_updates" feature
            // (which an LSIs need), is actually implemented as a tag so we
            // need to add it here:
            std::map<sstring, sstring> tags_map = {{db::SYNCHRONOUS_VIEW_UPDATES_TAG_KEY, "true"}};
            view_builder.add_extension(db::tags_extension::NAME, ::make_shared<db::tags_extension>(tags_map));
            view_builders.emplace_back(std::move(view_builder));
        }
    }

    const rjson::value* gsi = rjson::find(request, "GlobalSecondaryIndexes");
    if (gsi) {
        if (!gsi->IsArray()) {
            co_return api_error::validation("GlobalSecondaryIndexes must be an array.");
        }
        for (const rjson::value& g : gsi->GetArray()) {
            const rjson::value* index_name_v = rjson::find(g, "IndexName");
            if (!index_name_v || !index_name_v->IsString()) {
                co_return api_error::validation("GlobalSecondaryIndexes IndexName must be a string.");
            }
            std::string_view index_name = rjson::to_string_view(*index_name_v);
            auto [it, added] = index_names.emplace(index_name);
            if (!added) {
                co_return api_error::validation(fmt::format("Duplicate IndexName '{}', ", index_name));
            }
            std::string vname(view_name(table_name, index_name));
            elogger.trace("Adding GSI {}", index_name);
            // FIXME: read and handle "Projection" parameter. This will
            // require the MV code to copy just parts of the attrs map.
            schema_builder view_builder(keyspace_name, vname);
            auto [view_hash_key, view_range_key] = parse_key_schema(g, "GlobalSecondaryIndexes");

            // If an attribute is already a real column in the base table
            // (i.e., a key attribute), we can use it directly as a view key.
            // Otherwise, we need to add it as a "computed column", which
            // extracts and deserializes the attribute from the ":attrs" map.
            bool view_hash_key_real_column = partial_schema->get_column_definition(to_bytes(view_hash_key));
            add_column(view_builder, view_hash_key, *attribute_definitions, column_kind::partition_key, !view_hash_key_real_column);
            unused_attribute_definitions.erase(view_hash_key);
            if (!view_range_key.empty()) {
                bool view_range_key_real_column = partial_schema->get_column_definition(to_bytes(view_range_key));
                add_column(view_builder, view_range_key, *attribute_definitions, column_kind::clustering_key, !view_range_key_real_column);
                if (!partial_schema->get_column_definition(to_bytes(view_range_key)) &&
                    !partial_schema->get_column_definition(to_bytes(view_hash_key))) {
                    // FIXME: This warning should go away. See issue #6714
                    elogger.warn("Only 1 regular column from the base table should be used in the GSI key in order to ensure correct liveness management without assumptions");
                }
                unused_attribute_definitions.erase(view_range_key);
            }

            // Base key columns which aren't part of the index's key need to
            // be added to the view nonetheless, as (additional) clustering
            // key(s).
            // NOTE: DescribeTable's implementation depends on those keys being added AFTER user specified keys.
            bool spurious_base_key_added_as_range_key = false;
            if (hash_key != view_hash_key && hash_key != view_range_key) {
                add_column(view_builder, hash_key, *attribute_definitions, column_kind::clustering_key);
                spurious_base_key_added_as_range_key = true;
            }
            if (!range_key.empty() && range_key != view_hash_key && range_key != view_range_key) {
                add_column(view_builder, range_key, *attribute_definitions, column_kind::clustering_key);
                spurious_base_key_added_as_range_key = true;
            }
            // Remember (as a tag) when the GSI got a range key only because
            // of the base-key completion above, so DescribeTable can avoid
            // reporting it as a user-specified range key.
            std::map<sstring, sstring> tags;
            if (view_range_key.empty() && spurious_base_key_added_as_range_key) {
                tags[SPURIOUS_RANGE_KEY_ADDED_TO_GSI_AND_USER_DIDNT_SPECIFY_RANGE_KEY_TAG_KEY] = "true";
            }
            view_builder.add_extension(db::tags_extension::NAME, ::make_shared<db::tags_extension>(std::move(tags)));
            view_builders.emplace_back(std::move(view_builder));
        }
    }
    if (!unused_attribute_definitions.empty()) {
        co_return api_error::validation(fmt::format(
            "AttributeDefinitions defines spurious attributes not used by any KeySchema: {}",
            unused_attribute_definitions));
    }

    // We don't yet support configuring server-side encryption (SSE) via the
    // SSESpecifiction attribute, but an SSESpecification with Enabled=false
    // is simply the default, and should be accepted:
    rjson::value* sse_specification = rjson::find(request, "SSESpecification");
    if (sse_specification && sse_specification->IsObject()) {
        rjson::value* enabled = rjson::find(*sse_specification, "Enabled");
        if (!enabled || !enabled->IsBool()) {
            co_return api_error("ValidationException", "SSESpecification needs boolean Enabled");
        }
        if (enabled->GetBool()) {
            // TODO: full support for SSESpecification
            co_return api_error("ValidationException", "SSESpecification: configuring encryption-at-rest is not yet supported.");
        }
    }

    rjson::value* stream_specification = rjson::find(request, "StreamSpecification");
    if (stream_specification && stream_specification->IsObject()) {
        if (executor::add_stream_options(*stream_specification, builder, sp)) {
            validate_cdc_log_name_length(builder.cf_name());
        }
    }

    // Parse the "Tags" parameter early, so we can avoid creating the table
    // at all if this parsing failed.
    const rjson::value* tags = rjson::find(request, "Tags");
    std::map<sstring, sstring> tags_map;
    if (tags && tags->IsArray()) {
        update_tags_map(*tags, tags_map, update_tags_action::add_tags);
    }
    // Provisioned capacity units are persisted as internal tags.
    if (bm.provisioned) {
        tags_map[RCU_TAG_KEY] = std::to_string(bm.rcu);
        tags_map[WCU_TAG_KEY] = std::to_string(bm.wcu);
    }
    set_table_creation_time(tags_map, db_clock::now());
    builder.add_extension(db::tags_extension::NAME, ::make_shared<db::tags_extension>(tags_map));

    co_await verify_create_permission(enforce_authorization, warn_authorization, client_state, stats);

    schema_ptr schema = builder.build();
    for (auto& view_builder : view_builders) {
        // Note below we don't need to add virtual columns, as all
        // base columns were copied to view. TODO: reconsider the need
        // for virtual columns when we support Projection.
        for (const column_definition& regular_cdef : schema->regular_columns()) {
            if (!view_builder.has_column(*cql3::to_identifier(regular_cdef))) {
                view_builder.with_column(regular_cdef.name(), regular_cdef.type, column_kind::regular_column);
            }
        }
        const bool include_all_columns = true;
        view_builder.with_view_info(schema, include_all_columns, ""/*where clause*/);
    }

    // Group0 DDL loop: take the guard, build all mutations against the
    // guard's timestamp, announce, and retry the whole thing from scratch
    // if another schema change raced with us.
    size_t retries = mm.get_concurrent_ddl_retries();
    for (;;) {
        auto group0_guard = co_await mm.start_group0_operation();
        auto ts = group0_guard.write_timestamp();
        utils::chunked_vector<mutation> schema_mutations;
        auto ksm = create_keyspace_metadata(keyspace_name, sp, gossiper, ts, tags_map, sp.features(), tablets_mode);
        // Alternator Streams doesn't yet work when the table uses tablets (#23838)
        if (stream_specification && stream_specification->IsObject()) {
            auto stream_enabled = rjson::find(*stream_specification, "StreamEnabled");
            if (stream_enabled && stream_enabled->IsBool() && stream_enabled->GetBool()) {
                locator::replication_strategy_params params(ksm->strategy_options(), ksm->initial_tablets(), ksm->consistency_option());
                const auto& topo = sp.local_db().get_token_metadata().get_topology();
                auto rs = locator::abstract_replication_strategy::create_replication_strategy(ksm->strategy_name(), params, topo);
                if (rs->uses_tablets()) {
                    co_return api_error::validation("Streams not yet supported on a table using tablets (issue #23838). "
                            "If you want to use streams, create a table with vnodes by setting the tag 'system:initial_tablets' set to 'none'.");
                }
            }
        }
        // Creating an index in tablets mode requires the rf_rack_valid_keyspaces option to be enabled.
        // GSI and LSI indexes are based on materialized views which require this option to avoid consistency issues.
        if (!view_builders.empty() && ksm->uses_tablets() && !sp.data_dictionary().get_config().rf_rack_valid_keyspaces()) {
            co_return api_error::validation("GlobalSecondaryIndexes and LocalSecondaryIndexes with tablets require the rf_rack_valid_keyspaces option to be enabled.");
        }
        try {
            schema_mutations = service::prepare_new_keyspace_announcement(sp.local_db(), ksm, ts);
        } catch (exceptions::already_exists_exception&) {
            // The keyspace already existing is only an error if the table
            // inside it exists too; otherwise we reuse the keyspace.
            if (sp.data_dictionary().has_schema(keyspace_name, table_name)) {
                co_return api_error::resource_in_use(fmt::format("Table {} already exists", table_name));
            }
        }
        if (sp.data_dictionary().try_find_table(schema->id())) {
            // This should never happen, the ID is supposed to be unique
            co_return api_error::internal(format("Table with ID {} already exists", schema->id()));
        }
        std::vector<schema_ptr> schemas;
        schemas.push_back(schema);
        for (schema_builder& view_builder : view_builders) {
            schemas.push_back(view_builder.build());
        }
        co_await service::prepare_new_column_families_announcement(schema_mutations, sp, *ksm, schemas, ts);
        // The base table is empty at creation, so tablet-based views can be
        // marked as built immediately (see mark_view_schemas_as_built()).
        if (ksm->uses_tablets()) {
            co_await mark_view_schemas_as_built(schema_mutations, schemas, ts, sp);
        }

        // If a role is allowed to create a table, we must give it permissions to
        // use (and eventually delete) the specific table it just created (and
        // also the view tables). This is known as "auto-grant".
        // Unfortunately, there is an API mismatch between this code (which uses
        // separate group0_guard and vector<mutation>) and the function
        // grant_applicable_permissions() which uses a combined "group0_batch"
        // structure - so we need to do some ugly back-and-forth conversions
        // between the pair to the group0_batch and back to the pair :-(
        service::group0_batch mc(std::move(group0_guard));
        mc.add_mutations(std::move(schema_mutations));
        if (client_state.user()) {
            auto resource = auth::make_data_resource(schema->ks_name(), schema->cf_name());
            co_await auth::grant_applicable_permissions(
                    *client_state.get_auth_service(), *client_state.user(), resource, mc);
            for (const schema_builder& view_builder : view_builders) {
                resource = auth::make_data_resource(view_builder.ks_name(), view_builder.cf_name());
                co_await auth::grant_applicable_permissions(
                        *client_state.get_auth_service(), *client_state.user(), resource, mc);
            }
        }
        std::tie(schema_mutations, group0_guard) = co_await std::move(mc).extract();
        try {
            co_await mm.announce(std::move(schema_mutations), std::move(group0_guard), fmt::format("alternator-executor: create {} table", table_name));
            break;
        } catch (const service::group0_concurrent_modification& ex) {
            elogger.info("Failed to execute CreateTable {} due to concurrent schema modifications. {}.",
                    table_name, retries ? "Retrying" : "Number of retries exceeded, giving up");
            if (retries--) {
                continue;
            }
            throw;
        }
    }

    co_await mm.wait_for_schema_agreement(sp.local_db(), db::timeout_clock::now() + 10s, nullptr);
    // The response's TableDescription echoes the request, supplemented with
    // server-side fields (ARN, status, etc.) by supplement_table_info().
    rjson::value status = rjson::empty_object();
    executor::supplement_table_info(request, *schema, sp);
    rjson::add(status, "TableDescription", std::move(request));
    co_return rjson::print(std::move(status));
}
|
|
|
|
// CreateTable entry point: forwards the request to shard 0, where the
// actual work (create_table_on_shard0) announces the schema change through
// group0.
future<executor::request_return_type> executor::create_table(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request) {
    _stats.api_operations.create_table++;
    elogger.trace("Creating table {}", request);

    // Everything the cross-shard lambda needs is captured as a shard-safe
    // handle: global trace state, sharded<> containers, and client_state
    // moved via move_to_other_shard().
    co_return co_await _mm.container().invoke_on(0, [&, tr = tracing::global_trace_state_ptr(trace_state), request = std::move(request), &sp = _proxy.container(), &g = _gossiper.container(), &e = this->container(), client_state_other_shard = client_state.move_to_other_shard(), enforce_authorization = bool(_enforce_authorization), warn_authorization = bool(_warn_authorization)]
            (service::migration_manager& mm) mutable -> future<executor::request_return_type> {
        // NOTE(review): _proxy is reached here through the default
        // by-reference capture of `this` while running on shard 0 -
        // presumably safe because only the (node-global) config is read;
        // confirm if this ever needs more than the config.
        const db::tablets_mode_t::mode tablets_mode = _proxy.data_dictionary().get_config().tablets_mode_for_new_keyspaces(); // type cast
        co_return co_await create_table_on_shard0(client_state_other_shard.get(), tr, std::move(request), sp.local(), mm, g.local(), enforce_authorization, warn_authorization, e.local()._stats, std::move(tablets_mode));
    });
}
|
|
|
|
// When UpdateTable adds a GSI, the type of its key columns must be specified
|
|
// in a AttributeDefinitions. If one of these key columns are *already* key
|
|
// columns of the base table or any of its prior GSIs or LSIs, the type
|
|
// given in AttributeDefinitions must match the type of the existing key -
|
|
// otherwise Alternator will not know which type to enforce in new writes.
|
|
// This function checks for such conflicts. It assumes that the structure of
|
|
// the given attribute_definitions was already validated (with
|
|
// validate_attribute_definitions()).
|
|
// This function should be called multiple times - once for the base schema
|
|
// and once for each of its views (existing GSIs and LSIs on this table).
|
|
static void check_attribute_definitions_conflicts(const rjson::value& attribute_definitions, const schema& schema) {
    for (const column_definition& key_col : schema.primary_key_columns()) {
        const std::string existing_type = type_to_string(key_col.type);
        // Find this key column (if present) in the new AttributeDefinitions
        // and make sure the requested type matches the existing one.
        for (const rjson::value& attribute_info : attribute_definitions.GetArray()) {
            if (rjson::to_string_view(attribute_info["AttributeName"]) != key_col.name_as_text()) {
                continue;
            }
            std::string_view requested_type = rjson::to_string_view(attribute_info["AttributeType"]);
            if (requested_type != existing_type) {
                throw api_error::validation(fmt::format("AttributeDefinitions redefined {} to {} already a key attribute of type {} in this table", key_col.name_as_text(), requested_type, existing_type));
            }
            break;
        }
    }
}
|
|
|
|
// Implements the DynamoDB UpdateTable operation: a single request may change
// the table's BillingMode, its StreamSpecification, and/or apply exactly one
// GlobalSecondaryIndexUpdates operation (Create or Delete of one GSI).
// All schema changes are performed on shard 0 under a group0 guard, and
// retried on concurrent schema modification.
future<executor::request_return_type> executor::update_table(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request) {
    _stats.api_operations.update_table++;
    elogger.trace("Updating table {}", request);

    static const std::vector<sstring> unsupported = {
        "ProvisionedThroughput",
        "ReplicaUpdates",
        "SSESpecification",
    };

    // Reject requests using parameters we don't implement, rather than
    // silently ignoring them.
    for (auto& s : unsupported) {
        if (rjson::find(request, s)) {
            co_await coroutine::return_exception(api_error::validation(s + " not supported"));
        }
    }

    // DynamoDB requires UpdateTable to actually change *something*; track
    // whether any recognized update parameter was seen.
    bool empty_request = true;

    if (rjson::find(request, "BillingMode")) {
        empty_request = false;
        verify_billing_mode(request);
    }

    // All DDL work happens on shard 0, where the migration manager performs
    // group0 (schema agreement) operations.
    co_return co_await _mm.container().invoke_on(0, [&p = _proxy.container(), request = std::move(request), gt = tracing::global_trace_state_ptr(std::move(trace_state)), enforce_authorization = bool(_enforce_authorization), warn_authorization = bool(_warn_authorization), client_state_other_shard = client_state.move_to_other_shard(), empty_request, &e = this->container()]
            (service::migration_manager& mm) mutable -> future<executor::request_return_type> {
        schema_ptr schema;
        size_t retries = mm.get_concurrent_ddl_retries();
        // Retry loop: each iteration takes a fresh group0 guard, re-reads the
        // current schema, and re-builds the announcement. We break out after
        // a successful announce(), or rethrow when retries are exhausted.
        for (;;) {
            auto group0_guard = co_await mm.start_group0_operation();

            schema_ptr tab = get_table(p.local(), request);

            tracing::add_alternator_table_name(gt, tab->cf_name());

            // the ugly but harmless conversion to string_view here is because
            // Seastar's sstring is missing a find(std::string_view) :-()
            if (std::string_view(tab->cf_name()).find(INTERNAL_TABLE_PREFIX) == 0) {
                co_await coroutine::return_exception(api_error::validation(fmt::format("Prefix {} is reserved for accessing internal tables", INTERNAL_TABLE_PREFIX)));
            }

            schema_builder builder(tab);

            rjson::value* stream_specification = rjson::find(request, "StreamSpecification");
            if (stream_specification && stream_specification->IsObject()) {
                empty_request = false;
                if (add_stream_options(*stream_specification, builder, p.local())) {
                    validate_cdc_log_name_length(builder.cf_name());
                }
                // Alternator Streams doesn't yet work when the table uses tablets (#23838)
                auto stream_enabled = rjson::find(*stream_specification, "StreamEnabled");
                if (stream_enabled && stream_enabled->IsBool()) {
                    if (stream_enabled->GetBool()) {
                        if (p.local().local_db().find_keyspace(tab->ks_name()).get_replication_strategy().uses_tablets()) {
                            co_return api_error::validation("Streams not yet supported on a table using tablets (issue #23838). "
                                    "If you want to enable streams, re-create this table with vnodes (with the tag 'system:initial_tablets' set to 'none').");
                        }
                        if (tab->cdc_options().enabled()) {
                            co_return api_error::validation("Table already has an enabled stream: TableName: " + tab->cf_name());
                        }
                    }
                    else if (!tab->cdc_options().enabled()) {
                        co_return api_error::validation("Table has no stream to disable: TableName: " + tab->cf_name());
                    }
                }
            }

            schema = builder.build();
            std::vector<view_ptr> new_views;
            std::vector<std::string> dropped_views;

            rjson::value* gsi_updates = rjson::find(request, "GlobalSecondaryIndexUpdates");
            if (gsi_updates) {
                if (!gsi_updates->IsArray()) {
                    co_return api_error::validation("GlobalSecondaryIndexUpdates must be an array");
                }
                if (gsi_updates->Size() > 1) {
                    // Although UpdateTable takes an array of operations and could
                    // support multiple Create and/or Delete operations in one
                    // command, DynamoDB doesn't actually allows this, and throws
                    // a LimitExceededException if this is attempted.
                    co_return api_error::limit_exceeded("GlobalSecondaryIndexUpdates only allows one index creation or deletion");
                }
                if (gsi_updates->Size() == 1) {
                    empty_request = false;
                    if (!(*gsi_updates)[0].IsObject() || (*gsi_updates)[0].MemberCount() != 1) {
                        co_return api_error::validation("GlobalSecondaryIndexUpdates array must contain one object with a Create, Delete or Update operation");
                    }
                    auto it = (*gsi_updates)[0].MemberBegin();
                    const std::string_view op = rjson::to_string_view(it->name);
                    if (!it->value.IsObject()) {
                        co_return api_error::validation("GlobalSecondaryIndexUpdates entries must be objects");
                    }
                    const rjson::value* index_name_v = rjson::find(it->value, "IndexName");
                    if (!index_name_v || !index_name_v->IsString()) {
                        co_return api_error::validation("GlobalSecondaryIndexUpdates operation must have IndexName");
                    }
                    std::string_view index_name = rjson::to_string_view(*index_name_v);
                    std::string_view table_name = schema->cf_name();
                    std::string_view keyspace_name = schema->ks_name();
                    std::string vname(view_name(table_name, index_name));
                    if (op == "Create") {
                        const rjson::value* attribute_definitions = rjson::find(request, "AttributeDefinitions");
                        if (!attribute_definitions) {
                            co_return api_error::validation("GlobalSecondaryIndexUpdates Create needs AttributeDefinitions");
                        }
                        std::unordered_set<std::string> unused_attribute_definitions =
                                validate_attribute_definitions("GlobalSecondaryIndexUpdates", *attribute_definitions);
                        // The new definitions must not redefine (with a
                        // different type) any existing key of the base table
                        // or of any existing GSI/LSI:
                        check_attribute_definitions_conflicts(*attribute_definitions, *schema);
                        for (auto& view : p.local().data_dictionary().find_column_family(tab).views()) {
                            check_attribute_definitions_conflicts(*attribute_definitions, *view);
                        }

                        if (p.local().data_dictionary().has_schema(keyspace_name, vname)) {
                            // Surprisingly, DynamoDB uses validation error here, not resource_in_use
                            co_return api_error::validation(fmt::format(
                                    "GSI {} already exists in table {}", index_name, table_name));
                        }
                        if (p.local().data_dictionary().has_schema(keyspace_name, lsi_name(table_name, index_name, false))) {
                            co_return api_error::validation(fmt::format(
                                    "LSI {} already exists in table {}, can't use same name for GSI", index_name, table_name));
                        }
                        // Materialized views (which implement GSIs) on tablets
                        // keyspaces require RF-rack-valid keyspaces to avoid
                        // consistency issues from cross-rack migrations, so
                        // refuse to create the GSI if the option is off:
                        if (p.local().local_db().find_keyspace(keyspace_name).get_replication_strategy().uses_tablets() &&
                                !p.local().data_dictionary().get_config().rf_rack_valid_keyspaces()) {
                            co_return api_error::validation("GlobalSecondaryIndexes with tablets require the rf_rack_valid_keyspaces option to be enabled.");
                        }

                        elogger.trace("Adding GSI {}", index_name);
                        // FIXME: read and handle "Projection" parameter. This will
                        // require the MV code to copy just parts of the attrs map.
                        schema_builder view_builder(keyspace_name, vname);
                        auto [view_hash_key, view_range_key] = parse_key_schema(it->value, "GlobalSecondaryIndexUpdates");
                        // If an attribute is already a real column in the base
                        // table (i.e., a key attribute in the base table),
                        // we can use it directly as a view key. Otherwise, we
                        // need to add it as a "computed column", which extracts
                        // and deserializes the attribute from the ":attrs" map.
                        bool view_hash_key_real_column =
                                schema->get_column_definition(to_bytes(view_hash_key));
                        add_column(view_builder, view_hash_key, *attribute_definitions, column_kind::partition_key, !view_hash_key_real_column);
                        unused_attribute_definitions.erase(view_hash_key);
                        if (!view_range_key.empty()) {
                            bool view_range_key_real_column =
                                    schema->get_column_definition(to_bytes(view_range_key));
                            add_column(view_builder, view_range_key, *attribute_definitions, column_kind::clustering_key, !view_range_key_real_column);
                            if (!schema->get_column_definition(to_bytes(view_range_key)) &&
                                    !schema->get_column_definition(to_bytes(view_hash_key))) {
                                // FIXME: This warning should go away. See issue #6714
                                elogger.warn("Only 1 regular column from the base table should be used in the GSI key in order to ensure correct liveness management without assumptions");
                            }
                            unused_attribute_definitions.erase(view_range_key);
                        }
                        // Surprisingly, although DynamoDB checks for unused
                        // AttributeDefinitions in CreateTable, it does not
                        // check it in UpdateTable. We decided to check anyway.
                        if (!unused_attribute_definitions.empty()) {
                            co_return api_error::validation(fmt::format(
                                    "AttributeDefinitions defines spurious attributes not used by any KeySchema: {}",
                                    unused_attribute_definitions));
                        }
                        // Base key columns which aren't part of the index's key need to
                        // be added to the view nonetheless, as (additional) clustering
                        // key(s).
                        for (auto& def : schema->primary_key_columns()) {
                            if (def.name_as_text() != view_hash_key && def.name_as_text() != view_range_key) {
                                view_builder.with_column(def.name(), def.type, column_kind::clustering_key);
                            }
                        }
                        // GSIs have no tags:
                        view_builder.add_extension(db::tags_extension::NAME, ::make_shared<db::tags_extension>());
                        // Note below we don't need to add virtual columns, as all
                        // base columns were copied to view. TODO: reconsider the need
                        // for virtual columns when we support Projection.
                        for (const column_definition& regular_cdef : schema->regular_columns()) {
                            if (!view_builder.has_column(*cql3::to_identifier(regular_cdef))) {
                                view_builder.with_column(regular_cdef.name(), regular_cdef.type, column_kind::regular_column);
                            }
                        }
                        const bool include_all_columns = true;
                        view_builder.with_view_info(schema, include_all_columns, ""/*where clause*/);
                        new_views.emplace_back(view_builder.build());
                    } else if (op == "Delete") {
                        elogger.trace("Deleting GSI {}", index_name);
                        if (!p.local().data_dictionary().has_schema(keyspace_name, vname)) {
                            co_return api_error::resource_not_found(fmt::format("No GSI {} in table {}", index_name, table_name));
                        }
                        dropped_views.emplace_back(vname);
                    } else if (op == "Update") {
                        co_return api_error::validation("GlobalSecondaryIndexUpdates Update not yet supported");
                    } else {
                        co_return api_error::validation(fmt::format("GlobalSecondaryIndexUpdates supports a Create, Delete or Update operation, saw '{}'", op));
                    }
                }
            }

            if (empty_request) {
                co_return api_error::validation("UpdateTable requires one of GlobalSecondaryIndexUpdates, StreamSpecification or BillingMode to be specified");
            }

            co_await verify_permission(enforce_authorization, warn_authorization, client_state_other_shard.get(), schema, auth::permission::ALTER, e.local()._stats);
            // Collect all schema mutations (table alteration, new views,
            // dropped views) into one announcement made below.
            auto m = co_await service::prepare_column_family_update_announcement(p.local(), schema, std::vector<view_ptr>(), group0_guard.write_timestamp());
            for (view_ptr view : new_views) {
                auto m2 = co_await service::prepare_new_view_announcement(p.local(), view, group0_guard.write_timestamp());
                std::move(m2.begin(), m2.end(), std::back_inserter(m));
            }
            for (const std::string& view_name : dropped_views) {
                auto m2 = co_await service::prepare_view_drop_announcement(p.local(), schema->ks_name(), view_name, group0_guard.write_timestamp());
                std::move(m2.begin(), m2.end(), std::back_inserter(m));
            }
            // If a role is allowed to create a GSI, we should give it permissions
            // to read the GSI it just created. This is known as "auto-grant".
            // Also, when we delete a GSI we should revoke any permissions set on
            // it - so if it's ever created again the old permissions wouldn't be
            // remembered for the new GSI. This is known as "auto-revoke"
            if (client_state_other_shard.get().user() && (!new_views.empty() || !dropped_views.empty())) {
                service::group0_batch mc(std::move(group0_guard));
                mc.add_mutations(std::move(m));
                for (view_ptr view : new_views) {
                    auto resource = auth::make_data_resource(view->ks_name(), view->cf_name());
                    co_await auth::grant_applicable_permissions(
                            *client_state_other_shard.get().get_auth_service(), *client_state_other_shard.get().user(), resource, mc);
                }
                for (const auto& view_name : dropped_views) {
                    auto resource = auth::make_data_resource(schema->ks_name(), view_name);
                    co_await auth::revoke_all(*client_state_other_shard.get().get_auth_service(), resource, mc);
                }
                // Recover the combined mutations and guard so the announce()
                // below covers both schema and permission changes:
                std::tie(m, group0_guard) = co_await std::move(mc).extract();
            }
            try {
                co_await mm.announce(std::move(m), std::move(group0_guard), format("alternator-executor: update {} table", tab->cf_name()));
                break;
            } catch (const service::group0_concurrent_modification& ex) {
                elogger.info("Failed to execute UpdateTable {} due to concurrent schema modifications. {}.",
                        tab->cf_name(), retries ? "Retrying" : "Number of retries exceeded, giving up");
                if (retries--) {
                    continue;
                }
                throw;
            }
        }
        co_await mm.wait_for_schema_agreement(p.local().local_db(), db::timeout_clock::now() + 10s, nullptr);

        // Echo the (supplemented) request back as the TableDescription,
        // matching DynamoDB's UpdateTable response format.
        rjson::value status = rjson::empty_object();
        supplement_table_info(request, *schema, p.local());
        rjson::add(status, "TableDescription", std::move(request));
        co_return rjson::print(std::move(status));
    });
}
|
|
|
|
// attribute_collector is a helper class used to accept several attribute
|
|
// puts or deletes, and collect them as single collection mutation.
|
|
// The implementation is somewhat complicated by the need of cells in a
|
|
// collection to be sorted by key order.
|
|
class attribute_collector {
|
|
std::map<bytes, atomic_cell, serialized_compare> collected;
|
|
void add(bytes&& name, atomic_cell&& cell) {
|
|
collected.emplace(std::move(name), std::move(cell));
|
|
}
|
|
void add(const bytes& name, atomic_cell&& cell) {
|
|
collected.emplace(name, std::move(cell));
|
|
}
|
|
public:
|
|
attribute_collector() : collected(attrs_type()->get_keys_type()->as_less_comparator()) { }
|
|
void put(bytes&& name, const bytes& val, api::timestamp_type ts) {
|
|
add(std::move(name), atomic_cell::make_live(*bytes_type, ts, val, atomic_cell::collection_member::yes));
|
|
|
|
}
|
|
void put(const bytes& name, const bytes& val, api::timestamp_type ts) {
|
|
add(name, atomic_cell::make_live(*bytes_type, ts, val, atomic_cell::collection_member::yes));
|
|
}
|
|
void del(bytes&& name, api::timestamp_type ts) {
|
|
add(std::move(name), atomic_cell::make_dead(ts, gc_clock::now()));
|
|
}
|
|
void del(const bytes& name, api::timestamp_type ts) {
|
|
add(name, atomic_cell::make_dead(ts, gc_clock::now()));
|
|
}
|
|
collection_mutation_description to_mut() {
|
|
collection_mutation_description ret;
|
|
for (auto&& e : collected) {
|
|
ret.cells.emplace_back(e.first, std::move(e.second));
|
|
}
|
|
return ret;
|
|
}
|
|
bool empty() const {
|
|
return collected.empty();
|
|
}
|
|
};
|
|
|
|
// After calling pk_from_json() and ck_from_json() to extract the pk and ck
|
|
// components of a key, and if that succeeded, call check_key() to further
|
|
// check that the key doesn't have any spurious components.
|
|
static void check_key(const rjson::value& key, const schema_ptr& schema) {
|
|
if (key.MemberCount() != (schema->clustering_key_size() == 0 ? 1 : 2)) {
|
|
throw api_error::validation("Given key attribute not in schema");
|
|
}
|
|
}
|
|
|
|
// Verify that a value parsed from the user input is legal. In particular,
|
|
// we check that the value is not an empty set, string or bytes - which is
|
|
// (somewhat artificially) forbidden by DynamoDB.
|
|
void validate_value(const rjson::value& v, const char* caller) {
|
|
if (!v.IsObject() || v.MemberCount() != 1) {
|
|
throw api_error::validation(format("{}: improperly formatted value '{}'", caller, v));
|
|
}
|
|
auto it = v.MemberBegin();
|
|
const std::string_view type = rjson::to_string_view(it->name);
|
|
if (type == "SS" || type == "BS" || type == "NS") {
|
|
if (!it->value.IsArray()) {
|
|
throw api_error::validation(format("{}: improperly formatted set '{}'", caller, v));
|
|
}
|
|
if (it->value.Size() == 0) {
|
|
throw api_error::validation(format("{}: empty set not allowed", caller));
|
|
}
|
|
} else if (type == "S" || type == "B") {
|
|
if (!it->value.IsString()) {
|
|
throw api_error::validation(format("{}: improperly formatted value '{}'", caller, v));
|
|
}
|
|
} else if (type == "N") {
|
|
if (!it->value.IsString()) {
|
|
// DynamoDB uses a SerializationException in this case, not ValidationException.
|
|
throw api_error::serialization(format("{}: number value must be encoded as string '{}'", caller, v));
|
|
}
|
|
} else if (type != "L" && type != "M" && type != "BOOL" && type != "NULL") {
|
|
// TODO: can do more sanity checks on the content of the above types.
|
|
throw api_error::validation(fmt::format("{}: unknown type {} for value {}", caller, type, v));
|
|
}
|
|
}
|
|
|
|
// The put_or_delete_item class builds the mutations needed by the PutItem and
// DeleteItem operations - either as stand-alone commands or part of a list
// of commands in BatchWriteItem.
// put_or_delete_item splits each operation into two stages: Constructing the
// object parses and validates the user input (throwing exceptions if there
// are input errors). Later, build() generates the actual mutation, with a
// specified timestamp. This split is needed because of the peculiar needs of
// BatchWriteItem and LWT. BatchWriteItem needs all parsing to happen before
// any writing happens (if one of the commands has an error, none of the
// writes should be done). LWT makes it impossible for the parse step to
// generate "mutation" objects, because the timestamp still isn't known.
class put_or_delete_item {
private:
    // The item's key, extracted from the request during construction:
    partition_key _pk;
    clustering_key _ck;
    // One attribute of the item: its (serialized) name and serialized value.
    struct cell {
        bytes column_name;
        bytes value;
    };
    // PutItem: engaged _cells, write these cells to item (_pk, _ck).
    // DeleteItem: disengaged _cells, delete the entire item (_pk, _ck).
    std::optional<std::vector<cell>> _cells;
    // WCU calculation takes into account some length in bytes,
    // that length can have different meaning depends on the operation but the
    // the calculation of length in bytes to WCU is the same.
    uint64_t _length_in_bytes = 0;
public:
    // Tag types selecting which constructor (and operation) is used:
    struct delete_item {};
    struct put_item {};
    put_or_delete_item(const rjson::value& key, schema_ptr schema, delete_item);
    put_or_delete_item(const rjson::value& item, schema_ptr schema, put_item, std::unordered_map<bytes, std::string> key_attributes);
    // put_or_delete_item doesn't keep a reference to schema (so it can be
    // moved between shards for LWT) so it needs to be given again to build():
    mutation build(schema_ptr schema, api::timestamp_type ts) const;
    const partition_key& pk() const { return _pk; }
    const clustering_key& ck() const { return _ck; }
    // The byte length used for WCU accounting (see _length_in_bytes above):
    uint64_t length_in_bytes() const noexcept {
        return _length_in_bytes;
    }
    void set_length_in_bytes(uint64_t length) noexcept {
        _length_in_bytes = length;
    }
    // True for a PutItem operation, false for DeleteItem:
    bool is_put_item() noexcept {
        return _cells.has_value();
    }
};
|
|
|
|
// DeleteItem constructor: only the key is parsed; _cells stays disengaged,
// which build() later interprets as a whole-item deletion.
put_or_delete_item::put_or_delete_item(const rjson::value& key, schema_ptr schema, delete_item)
        : _pk(pk_from_json(key, schema)), _ck(ck_from_json(key, schema)) {
    // Reject keys with extra attributes beyond the schema's key columns:
    check_key(key, schema);
}
|
|
|
|
// find_attribute() checks whether the named attribute is stored in the
|
|
// schema as a real column (we do this for key attribute, and for a GSI key)
|
|
// and if so, returns that column. If not, the function returns nullptr,
|
|
// telling the caller that the attribute is stored serialized in the
|
|
// ATTRS_COLUMN_NAME map - not in a stand-alone column in the schema.
|
|
static inline const column_definition* find_attribute(const schema& schema, const bytes& attribute_name) {
|
|
const column_definition* cdef = schema.get_column_definition(attribute_name);
|
|
// Although ATTRS_COLUMN_NAME exists as an actual column, when used as an
|
|
// attribute name it should refer to an attribute inside ATTRS_COLUMN_NAME
|
|
// not to ATTRS_COLUMN_NAME itself. This if() is needed for #5009.
|
|
if (cdef && cdef->name() == executor::ATTRS_COLUMN_NAME) {
|
|
return nullptr;
|
|
}
|
|
return cdef;
|
|
}
|
|
|
|
|
|
// Get a list of all attributes that serve as a key attributes for any of the
|
|
// GSIs or LSIs of this table, and the declared type for each (can be only
|
|
// "S", "B", or "N"). The implementation below will also list the base table's
|
|
// key columns (they are the views' clustering keys).
|
|
std::unordered_map<bytes, std::string> si_key_attributes(data_dictionary::table t) {
|
|
std::unordered_map<bytes, std::string> ret;
|
|
for (const view_ptr& v : t.views()) {
|
|
for (const column_definition& cdef : v->partition_key_columns()) {
|
|
ret[cdef.name()] = type_to_string(cdef.type);
|
|
}
|
|
for (const column_definition& cdef : v->clustering_key_columns()) {
|
|
ret[cdef.name()] = type_to_string(cdef.type);
|
|
}
|
|
}
|
|
return ret;
|
|
}
|
|
|
|
// When an attribute is a key (hash or sort) of one of the GSIs or LSIs on a
|
|
// table, DynamoDB refuses an update to that attribute with an unsuitable
|
|
// value. Unsuitable values are:
|
|
// 1. An empty string (those are normally allowed as values, but not allowed
|
|
// as keys, including GSI keys).
|
|
// 2. A value with a type different than that declared for the GSI key.
|
|
// Normally non-key attributes can take values of any type (DynamoDB is
|
|
// schema-less), but as soon as an attribute is used as a GSI key, it
|
|
// must be set only to the specific type declared for that key.
|
|
// (Note that a missing value for an GSI key attribute is fine - the update
|
|
// will happen on the base table, but won't reach the view table. In this
|
|
// case, this function simply won't be called for this attribute.)
|
|
//
|
|
// This function checks if the given attribute update is an update to some
|
|
// GSI's key, and if the value is unsuitable, a api_error::validation is
|
|
// thrown. The checking here is similar to the checking done in
|
|
// get_key_from_typed_value() for the base table's key columns.
|
|
//
|
|
// validate_value_if_index_key() should only be called after validate_value()
|
|
// already validated that the value itself has a valid form.
|
|
static inline void validate_value_if_index_key(
|
|
std::unordered_map<bytes, std::string> key_attributes,
|
|
const bytes& attribute,
|
|
const rjson::value& value) {
|
|
if (key_attributes.empty()) {
|
|
return;
|
|
}
|
|
auto it = key_attributes.find(attribute);
|
|
if (it == key_attributes.end()) {
|
|
// Given attribute is not a key column with a fixed type, so no
|
|
// more validation to do.
|
|
return;
|
|
}
|
|
const std::string& expected_type = it->second;
|
|
// We assume that validate_value() was previously called on this value,
|
|
// so value is known to be of the proper format (an object with one
|
|
// member, whose key and value are strings)
|
|
std::string_view value_type = rjson::to_string_view(value.MemberBegin()->name);
|
|
if (expected_type != value_type) {
|
|
throw api_error::validation(fmt::format(
|
|
"Type mismatch: expected type {} for GSI or LSI key attribute {}, got type {}",
|
|
expected_type, to_string_view(attribute), value_type));
|
|
}
|
|
std::string_view value_content = rjson::to_string_view(value.MemberBegin()->value);
|
|
if (value_content.empty()) {
|
|
throw api_error::validation(fmt::format(
|
|
"GSI or LSI key attribute {} cannot be set to an empty string", to_string_view(attribute)));
|
|
}
|
|
}
|
|
|
|
// PutItem constructor: parse and validate all of the item's attributes,
// remembering (in _cells) each attribute's serialized value to be written
// later by build(). Also accumulates _length_in_bytes for WCU accounting.
// key_attributes maps GSI/LSI key attribute names to their declared type,
// used to validate values written to those attributes.
put_or_delete_item::put_or_delete_item(const rjson::value& item, schema_ptr schema, put_item, std::unordered_map<bytes, std::string> key_attributes)
        : _pk(pk_from_json(item, schema)), _ck(ck_from_json(item, schema)) {
    _cells = std::vector<cell>();
    _cells->reserve(item.MemberCount());
    for (auto it = item.MemberBegin(); it != item.MemberEnd(); ++it) {
        bytes column_name = to_bytes(rjson::to_string_view(it->name));
        validate_value(it->value, "PutItem");
        const column_definition* cdef = find_attribute(*schema, column_name);
        validate_attr_name_length("", column_name.size(), cdef && cdef->is_primary_key());
        _length_in_bytes += column_name.size();
        if (!cdef) {
            // This attribute may be a key column of one of the GSI or LSI,
            // in which case there are some limitations on the value.
            validate_value_if_index_key(key_attributes, column_name, it->value);
            bytes value = serialize_item(it->value);
            if (value.size()) {
                // ScyllaDB uses one extra byte compared to DynamoDB for the bytes length
                _length_in_bytes += value.size() - 1;
            }
            // Reuse (move) the value serialized above instead of calling
            // serialize_item() a second time on the same attribute.
            _cells->push_back({std::move(column_name), std::move(value)});
        } else if (!cdef->is_primary_key()) {
            // Fixed-type regular columns were used for LSIs and also (in the
            // slightly more distant past) GSIs, before they were moved to
            // :attrs. We keep this branch for backward compatibility.
            bytes value = get_key_from_typed_value(it->value, *cdef);
            if (value.size()) {
                // ScyllaDB uses one extra byte compared to DynamoDB for the bytes length
                _length_in_bytes += value.size() - 1;
            }
            // Account for the length above, then move (not copy) the value
            // into the cell.
            _cells->push_back({std::move(column_name), std::move(value)});
        }
    }
    if (_pk.representation().size() > 2) {
        // ScyllaDB uses two extra bytes compared to DynamoDB for the key bytes length
        _length_in_bytes += _pk.representation().size() - 2;
    }
    if (_ck.representation().size() > 2) {
        // ScyllaDB uses two extra bytes compared to DynamoDB for the key bytes length
        _length_in_bytes += _ck.representation().size() - 2;
    }
}
|
|
|
|
// Build the actual mutation for this PutItem or DeleteItem at the given
// write timestamp ts. The schema must be passed again because the object
// deliberately doesn't keep a schema reference (see class comment).
mutation put_or_delete_item::build(schema_ptr schema, api::timestamp_type ts) const {
    mutation m(schema, _pk);
    // If there's no clustering key, a tombstone should be created directly
    // on a partition, not on a clustering row - otherwise it will look like
    // an open-ended range tombstone, which will crash on KA/LA sstable format.
    // Ref: #6035
    const bool use_partition_tombstone = schema->clustering_key_size() == 0;
    if (!_cells) {
        if (use_partition_tombstone) {
            m.partition().apply(tombstone(ts, gc_clock::now()));
        } else {
            // a DeleteItem operation:
            m.partition().clustered_row(*schema, _ck).apply(tombstone(ts, gc_clock::now()));
        }
        return m;
    }
    // else, a PutItem operation:
    auto& row = m.partition().clustered_row(*schema, _ck);
    attribute_collector attrs_collector;
    for (auto& c : *_cells) {
        const column_definition* cdef = find_attribute(*schema, c.column_name);
        if (!cdef) {
            // Non-schema attribute: goes into the ":attrs" map.
            attrs_collector.put(c.column_name, c.value, ts);
        } else {
            // Real column (legacy LSI/GSI key column). Note this is a const
            // member function, so c.value is const and std::move() here
            // degenerates to a copy.
            row.cells().apply(*cdef, atomic_cell::make_live(*cdef->type, ts, std::move(c.value)));
        }
    }
    auto attrs = attrs_column(*schema);
    if (!attrs_collector.empty()) {
        auto serialized_map = attrs_collector.to_mut().serialize(*attrs_type());
        row.cells().apply(attrs, std::move(serialized_map));
    }
    // To allow creation of an item with no attributes, we need a row marker.
    row.apply(row_marker(ts));
    // PutItem is supposed to completely replace the old item, so we need to
    // also have a tombstone removing old cells. Important points:
    // 1) Alternator's schema is dynamic, therefore we store data in a map
    //    in column :attrs. Since we're replacing a row, invalidating only
    //    :attrs is enough. Alternator base tables also had columns for LSI
    //    keys and GSI keys. New tables no longer have such columns, but old
    //    tables created in the past may still have them.
    // 2) We use a collection tombstone for the :attrs column instead of a row
    //    tombstone. While a row tombstone would also replace the data, it has
    //    an undesirable side effect for CDC, which would report it as a
    //    separate deletion event. To model PutItem's "replace" semantic, we
    //    leverage a corner case: a collection tombstone at ts-1 paired with an
    //    upsert at ts is not reported by CDC as a separate REMOVE event. We
    //    can't use the timestamp ts, because when data and tombstone tie on
    //    timestamp, the tombstone wins. These tricks were introduced in
    //    Scylla to handle collection replacements in CQL (see #6084, PR #6491,
    //    e.g. cql3::maps::setter::execute()) and we utilize it to avoid
    //    emitting the REMOVE event (resolving #6930).
    row.cells().apply(attrs, collection_mutation_description{tombstone{ts - 1, gc_clock::now()}}.serialize(*attrs.type));
    // Note that for old tables created with regular LSI and GSI key columns,
    // we must also delete the regular columns that are not part of the new
    // schema consisting of pk, ck, and :attrs.
    for (const auto& cdef : schema->regular_columns()) {
        if (cdef.name_as_text() != executor::ATTRS_COLUMN_NAME) {
            row.cells().apply(cdef, atomic_cell::make_dead(ts - 1, gc_clock::now()));
        }
    }
    return m;
}
|
|
|
|
// The DynamoDB API doesn't let the client control the server's timeout, so
|
|
// we have a global default_timeout() for Alternator requests. The value of
|
|
// s_default_timeout_ms is overwritten in alternator::controller::start_server()
|
|
// based on the "alternator_timeout_in_ms" configuration parameter.
|
|
thread_local utils::updateable_value<uint32_t> executor::s_default_timeout_in_ms{10'000};
|
|
db::timeout_clock::time_point executor::default_timeout() {
|
|
return db::timeout_clock::now() + std::chrono::milliseconds(s_default_timeout_in_ms);
|
|
}
|
|
|
|
// Build the read_command used to fetch an item's previous value (one row,
// all regular columns) before a read-modify-write operation.
static lw_shared_ptr<query::read_command> previous_item_read_command(service::storage_proxy& proxy,
        schema_ptr schema,
        const clustering_key& ck,
        shared_ptr<cql3::selection::selection> selection) {
    // With no clustering key, read the whole (single-row) partition;
    // otherwise read just the one clustering row.
    std::vector<query::clustering_range> bounds;
    if (schema->clustering_key_size() == 0) {
        bounds.push_back(query::clustering_range::make_open_ended_both_sides());
    } else {
        bounds.push_back(query::clustering_range::make_singular(ck));
    }
    // FIXME: We pretend to take a selection (all callers currently give us a
    // wildcard selection...) but here we read the entire item anyway. We
    // should take the column list from selection instead of building it here.
    query::column_id_vector regular_columns;
    for (const column_definition& cdef : schema->regular_columns()) {
        regular_columns.push_back(cdef.id);
    }
    auto partition_slice = query::partition_slice(std::move(bounds), {}, std::move(regular_columns), selection->get_query_options());
    return ::make_lw_shared<query::read_command>(schema->id(), schema->version(), partition_slice, proxy.get_max_result_size(partition_slice),
            query::tombstone_limit(proxy.get_tombstone_limit()));
}
|
|
|
|
// Build a single-element partition range vector covering exactly the given
// partition key (decorated with the schema's partitioner token).
static dht::partition_range_vector to_partition_ranges(const schema& schema, const partition_key& pk) {
    dht::partition_range_vector ranges;
    ranges.emplace_back(dht::decorate_key(schema, pk));
    return ranges;
}
|
|
// Build a single-element partition range vector covering exactly the given
// already-decorated partition key.
static dht::partition_range_vector to_partition_ranges(const dht::decorated_key& pk) {
    dht::partition_range_vector ranges;
    ranges.emplace_back(pk);
    return ranges;
}
|
|
|
|
// Parse the different options for the ReturnValues parameter. We parse all
|
|
// the known options, but only UpdateItem actually supports all of them. The
|
|
// other operations (DeleteItem and PutItem) will refuse some of them.
|
|
rmw_operation::returnvalues rmw_operation::parse_returnvalues(const rjson::value& request) {
|
|
const rjson::value* attribute_value = rjson::find(request, "ReturnValues");
|
|
if (!attribute_value) {
|
|
return rmw_operation::returnvalues::NONE;
|
|
}
|
|
if (!attribute_value->IsString()) {
|
|
throw api_error::validation(format("Expected string value for ReturnValues, got: {}", *attribute_value));
|
|
}
|
|
auto s = rjson::to_string_view(*attribute_value);
|
|
if (s == "NONE") {
|
|
return rmw_operation::returnvalues::NONE;
|
|
} else if (s == "ALL_OLD") {
|
|
return rmw_operation::returnvalues::ALL_OLD;
|
|
} else if (s == "UPDATED_OLD") {
|
|
return rmw_operation::returnvalues::UPDATED_OLD;
|
|
} else if (s == "ALL_NEW") {
|
|
return rmw_operation::returnvalues::ALL_NEW;
|
|
} else if (s == "UPDATED_NEW") {
|
|
return rmw_operation::returnvalues::UPDATED_NEW;
|
|
} else {
|
|
throw api_error::validation(fmt::format("Unrecognized value for ReturnValues: {}", s));
|
|
}
|
|
}
|
|
|
|
// Parse the ReturnValuesOnConditionCheckFailure parameter; only NONE and
// ALL_OLD are recognized (matching DynamoDB), NONE being the default.
rmw_operation::returnvalues_on_condition_check_failure
rmw_operation::parse_returnvalues_on_condition_check_failure(const rjson::value& request) {
    const rjson::value* rv = rjson::find(request, "ReturnValuesOnConditionCheckFailure");
    if (!rv) {
        return rmw_operation::returnvalues_on_condition_check_failure::NONE;
    }
    if (!rv->IsString()) {
        throw api_error::validation(format("Expected string value for ReturnValuesOnConditionCheckFailure, got: {}", *rv));
    }
    const auto mode = rjson::to_string_view(*rv);
    if (mode == "NONE") {
        return rmw_operation::returnvalues_on_condition_check_failure::NONE;
    }
    if (mode == "ALL_OLD") {
        return rmw_operation::returnvalues_on_condition_check_failure::ALL_OLD;
    }
    throw api_error::validation(fmt::format("Unrecognized value for ReturnValuesOnConditionCheckFailure: {}", mode));
}
|
|
|
|
// Base-class constructor for the read-modify-write operations (PutItem,
// DeleteItem, UpdateItem). Takes ownership of the request JSON and resolves
// everything common to all three operations: the target table's schema, the
// table's write-isolation policy, consumed-capacity accounting, and the
// ReturnValues / ReturnValuesOnConditionCheckFailure options.
// NOTE: the initializers below depend on _request being initialized first
// (member declaration order) - every other member parses _request.
rmw_operation::rmw_operation(service::storage_proxy& proxy, rjson::value&& request)
    : _request(std::move(request))
    , _schema(get_table_for_write(proxy, _request))
    , _write_isolation(get_write_isolation_for_schema(_schema))
    , _consumed_capacity(_request)
    , _returnvalues(parse_returnvalues(_request))
    , _returnvalues_on_condition_check_failure(parse_returnvalues_on_condition_check_failure(_request))
{
    // _pk and _ck will be assigned later, by the subclass's constructor
    // (each operation puts the key in a slightly different location in
    // the request).
}
|
|
|
|
// cas_request implementation: called by storage_proxy::cas() with the result
// of the read phase (qr). Converts the read row (if any) to its DynamoDB
// JSON representation, accounts its length for consumed-capacity reporting,
// optionally saves the row as a CDC preimage, and delegates to the
// operation-specific apply() overload which decides on the mutation (or
// returns none if the condition failed).
std::optional<mutation> rmw_operation::apply(foreign_ptr<lw_shared_ptr<query::result>> qr, const query::partition_slice& slice, api::timestamp_type ts, cdc::per_request_options& cdc_opts) {
    if (qr->row_count()) {
        auto selection = cql3::selection::selection::wildcard(_schema);
        uint64_t item_length = 0;
        auto previous_item = executor::describe_single_item(_schema, slice, *selection, *qr, {}, &item_length);
        // Consumed-capacity accounting uses the larger of the previous and
        // new item lengths.
        if (_consumed_capacity._total_bytes < item_length) {
            _consumed_capacity._total_bytes = item_length;
        }
        if (previous_item) {
            if (should_fill_preimage()) {
                // Save the pre-modification row for the CDC (Streams) log.
                // Note: qr is moved here, so it must not be used below.
                cdc_opts.preimage = make_lw_shared<cql3::untyped_result_set>(*_schema, std::move(qr), *selection, slice);
            }
            return apply(std::make_unique<rjson::value>(std::move(*previous_item)), ts, cdc_opts);
        }
    }
    // No previous item - apply with a null previous-item pointer.
    return apply(std::unique_ptr<rjson::value>(), ts, cdc_opts);
}
|
|
|
|
// Determine the write-isolation policy for a table, from the table's tags.
// Falls back to the global default when no explicit tag is set.
rmw_operation::write_isolation rmw_operation::get_write_isolation_for_schema(schema_ptr schema) {
    const auto tags = db::get_tags_of_table(schema);
    if (!tags) {
        // Tags missing entirely from this table. This can't happen for a
        // normal Alternator table, but can happen if get_table_for_write()
        // allowed writing to a non-Alternator table (e.g., an internal table).
        // If it is a system table, LWT will not work (and is also pointless
        // for non-distributed tables), so use UNSAFE_RMW.
        return is_internal_keyspace(schema->ks_name())
                ? write_isolation::UNSAFE_RMW
                : default_write_isolation;
    }
    const auto it = tags->find(WRITE_ISOLATION_TAG_KEY);
    if (it != tags->end() && !it->second.empty()) {
        return parse_write_isolation(it->second);
    }
    // No (or empty) write-isolation tag - use the configured default.
    return default_write_isolation;
}
|
|
|
|
// shard_for_execute() checks whether execute() must be called on a specific
|
|
// other shard. Running execute() on a specific shard is necessary only if it
|
|
// will use LWT (storage_proxy::cas()). This is because cas() can only be
|
|
// called on the specific shard owning (as per get_cas_shard()) _pk's token.
|
|
// Knowing if execute() will call cas() or not may depend on whether there is
|
|
// a read-before-write, but not just on it - depending on configuration,
|
|
// execute() may unconditionally use cas() for every write. Unfortunately,
|
|
// this requires duplicating here a bit of logic from execute().
|
|
// The returned cas_shard must be passed to execute() to ensure
|
|
// the tablet shard won't change. The caller must hold the returned object for
|
|
// the duration of execution, even if we were already on the right shard - so it doesn't move.
|
|
std::optional<service::cas_shard> rmw_operation::shard_for_execute(bool needs_read_before_write) {
|
|
if (_write_isolation == write_isolation::FORBID_RMW ||
|
|
(_write_isolation == write_isolation::LWT_RMW_ONLY && !needs_read_before_write) ||
|
|
_write_isolation == write_isolation::UNSAFE_RMW) {
|
|
return {};
|
|
}
|
|
// If we're still here, cas() *will* be called by execute(), so let's
|
|
// find the appropriate shard to run it on:
|
|
const auto token = dht::get_token(*_schema, _pk);
|
|
return service::cas_shard(*_schema, token);
|
|
}
|
|
|
|
// Build the return value from the different RMW operations (UpdateItem,
|
|
// PutItem, DeleteItem). All these return nothing by default, but can
|
|
// optionally return Attributes if requested via the ReturnValues option.
|
|
static executor::request_return_type rmw_operation_return(rjson::value&& attributes, const consumed_capacity_counter& consumed_capacity, uint64_t& metric) {
|
|
rjson::value ret = rjson::empty_object();
|
|
consumed_capacity.add_consumed_capacity_to_response_if_needed(ret);
|
|
metric += consumed_capacity.get_consumed_capacity_units();
|
|
if (!attributes.IsNull()) {
|
|
rjson::add(ret, "Attributes", std::move(attributes));
|
|
}
|
|
return rjson::print(std::move(ret));
|
|
}
|
|
|
|
// Read the current ("previous") item with the given primary key, at the
// given consistency level, and return it converted to its DynamoDB JSON
// representation - or a null unique_ptr if the item doesn't exist.
// item_length is set to the item's length in bytes, used for DynamoDB-
// compatible consumed-capacity accounting. It is captured by reference in
// the continuation, so the caller must keep it alive until the returned
// future resolves.
static future<std::unique_ptr<rjson::value>> get_previous_item(
        service::storage_proxy& proxy,
        service::client_state& client_state,
        schema_ptr schema,
        const partition_key& pk,
        const clustering_key& ck,
        service_permit permit,
        db::consistency_level cl,
        uint64_t& item_length)
{
    // Read all the columns of the single row (wildcard selection).
    auto selection = cql3::selection::selection::wildcard(schema);
    auto command = previous_item_read_command(proxy, schema, ck, selection);
    // Allow per-partition rate limiting to reject this read if configured.
    command->allow_limit = db::allow_per_partition_rate_limit::yes;
    return proxy.query(schema, command, to_partition_ranges(*schema, pk), cl, service::storage_proxy::coordinator_query_options(executor::default_timeout(), std::move(permit), client_state)).then(
            [schema, command, selection = std::move(selection), &item_length] (service::storage_proxy::coordinator_query_result qr) {
        auto previous_item = executor::describe_single_item(schema, command->slice, *selection, *qr.query_result, {}, &item_length);
        if (previous_item) {
            return make_ready_future<std::unique_ptr<rjson::value>>(std::make_unique<rjson::value>(std::move(*previous_item)));
        } else {
            // No such item - return a null pointer.
            return make_ready_future<std::unique_ptr<rjson::value>>();
        }
    });
}
|
|
|
|
// Convenience overload of get_previous_item() for the normal (UNSAFE_RMW)
// read-before-write path: bumps the global and per-table read-before-write
// counters and reads at LOCAL_QUORUM.
static future<std::unique_ptr<rjson::value>> get_previous_item(
        service::storage_proxy& proxy,
        service::client_state& client_state,
        schema_ptr schema,
        const partition_key& pk,
        const clustering_key& ck,
        service_permit permit,
        alternator::stats& global_stats,
        alternator::stats& per_table_stats,
        uint64_t& item_length)
{
    ++global_stats.reads_before_write;
    ++per_table_stats.reads_before_write;
    return get_previous_item(proxy, client_state, schema, pk, ck, permit,
            db::consistency_level::LOCAL_QUORUM, item_length);
}
|
|
|
|
// Return only the length in bytes of the existing item with the given key
// (0 if it doesn't exist), for DynamoDB-compatible consumed-capacity
// calculation.
static future<uint64_t> get_previous_item_size(
        service::storage_proxy& proxy,
        service::client_state& client_state,
        schema_ptr schema,
        const partition_key& pk,
        const clustering_key& ck,
        service_permit permit) {
    uint64_t size_in_bytes = 0;
    // The use of get_previous_item here is for DynamoDB calculation
    // compatibility mode only - the item's content is ignored, so for
    // performance reasons we read at CL_LOCAL_ONE.
    co_await get_previous_item(proxy, client_state, schema, pk, ck, permit,
            db::consistency_level::LOCAL_ONE, size_in_bytes);
    co_return size_in_bytes;
}
|
|
|
|
// Perform this read-modify-write operation. Three execution paths exist,
// chosen by the table's write-isolation policy and by whether the operation
// needs to read the previous item:
// 1. UNSAFE_RMW with a read-before-write: a plain read followed by a plain
//    write - not atomic, kept for compatibility only.
// 2. No read needed and policy != LWT_ALWAYS: a single plain (non-LWT) write.
// 3. Otherwise: an LWT round via storage_proxy::cas(), which must run on the
//    shard identified by cas_shard (see shard_for_execute()).
future<executor::request_return_type> rmw_operation::execute(service::storage_proxy& proxy,
        std::optional<service::cas_shard> cas_shard,
        service::client_state& client_state,
        tracing::trace_state_ptr trace_state,
        service_permit permit,
        bool needs_read_before_write,
        stats& global_stats,
        stats& per_table_stats,
        uint64_t& wcu_total) {
    // CDC (Alternator Streams) options applying to this single request:
    auto cdc_opts = cdc::per_request_options{
        .alternator = true,
        .alternator_streams_increased_compatibility = schema()->cdc_options().enabled() && proxy.data_dictionary().get_config().alternator_streams_increased_compatibility(),
    };
    if (needs_read_before_write) {
        if (_write_isolation == write_isolation::FORBID_RMW) {
            throw api_error::validation("Read-modify-write operations are disabled by 'forbid_rmw' write isolation policy. Refer to https://github.com/scylladb/scylla/blob/master/docs/alternator/alternator.md#write-isolation-policies for more information.");
        }
        global_stats.reads_before_write++;
        per_table_stats.reads_before_write++;
        if (_write_isolation == write_isolation::UNSAFE_RMW) {
            // This is the old, unsafe, read before write which does first
            // a read, then a write. TODO: remove this mode entirely.
            return get_previous_item(proxy, client_state, schema(), _pk, _ck, permit, global_stats, per_table_stats, _consumed_capacity._total_bytes).then(
                    [this, &proxy, &wcu_total, trace_state, permit = std::move(permit), cdc_opts = std::move(cdc_opts)] (std::unique_ptr<rjson::value> previous_item) mutable {
                std::optional<mutation> m = apply(std::move(previous_item), api::new_timestamp(), cdc_opts);
                if (!m) {
                    // apply() returned nothing - the condition failed.
                    return make_ready_future<executor::request_return_type>(api_error::conditional_check_failed("The conditional request failed", std::move(_return_attributes)));
                }
                return proxy.mutate(utils::chunked_vector<mutation>{std::move(*m)}, db::consistency_level::LOCAL_QUORUM, executor::default_timeout(), trace_state, std::move(permit), db::allow_per_partition_rate_limit::yes, false, std::move(cdc_opts)).then([this,&wcu_total] () mutable {
                    return rmw_operation_return(std::move(_return_attributes), _consumed_capacity, wcu_total);
                });
            });
        }
    } else if (_write_isolation != write_isolation::LWT_ALWAYS) {
        // No read needed and the policy does not force LWT - plain write.
        std::optional<mutation> m = apply(nullptr, api::new_timestamp(), cdc_opts);
        SCYLLA_ASSERT(m); // !needs_read_before_write, so apply() did not check a condition
        return proxy.mutate(utils::chunked_vector<mutation>{std::move(*m)}, db::consistency_level::LOCAL_QUORUM, executor::default_timeout(), trace_state, std::move(permit), db::allow_per_partition_rate_limit::yes, false, std::move(cdc_opts)).then([this, &wcu_total] () mutable {
            return rmw_operation_return(std::move(_return_attributes), _consumed_capacity, wcu_total);
        });
    }
    // cas() requires the caller to have picked the owning shard up front:
    if (!cas_shard) {
        on_internal_error(elogger, "cas_shard is not set");
    }
    // If we're still here, we need to do this write using LWT:
    global_stats.write_using_lwt++;
    per_table_stats.write_using_lwt++;
    auto timeout = executor::default_timeout();
    auto selection = cql3::selection::selection::wildcard(schema());
    // The LWT round's read phase reads the previous item only when needed:
    auto read_command = needs_read_before_write ?
            previous_item_read_command(proxy, schema(), _ck, selection) :
            nullptr;
    return proxy.cas(schema(), std::move(*cas_shard), *this, read_command, to_partition_ranges(*schema(), _pk),
            {timeout, std::move(permit), client_state, trace_state},
            db::consistency_level::LOCAL_SERIAL, db::consistency_level::LOCAL_QUORUM, timeout, timeout, true, std::move(cdc_opts)).then([this, read_command, &wcu_total] (bool is_applied) mutable {
        if (!is_applied) {
            // The LWT condition (from this cas_request's apply()) failed.
            return make_ready_future<executor::request_return_type>(api_error::conditional_check_failed("The conditional request failed", std::move(_return_attributes)));
        }
        return make_ready_future<executor::request_return_type>(rmw_operation_return(std::move(_return_attributes), _consumed_capacity, wcu_total));
    });
}
|
|
|
|
// Extract and parse the request's ConditionExpression, if present.
// Returns an empty() condition_expression when the request carries none.
// Throws api_error::validation on a malformed or empty expression.
static parsed::condition_expression get_parsed_condition_expression(parsed::expression_cache& parsed_expression_cache, rjson::value& request) {
    rjson::value* expr = rjson::find(request, "ConditionExpression");
    if (expr == nullptr) {
        // Returning an empty() condition_expression means no condition.
        return parsed::condition_expression{};
    }
    if (!expr->IsString()) {
        throw api_error::validation("ConditionExpression must be a string");
    }
    if (expr->GetStringLength() == 0) {
        throw api_error::validation("ConditionExpression must not be empty");
    }
    try {
        return parsed_expression_cache.parse_condition_expression(rjson::to_string_view(*expr), "ConditionExpression");
    } catch (expressions_syntax_error& e) {
        // Surface parser errors as DynamoDB validation errors.
        throw api_error::validation(e.what());
    }
}
|
|
|
|
// A non-empty ConditionExpression forces a read before the write.
// Theoretically, a condition expression may exist but not refer to the
// item at all, but that is not a useful case and there is no point in
// optimizing for it.
static bool check_needs_read_before_write(const parsed::condition_expression& condition_expression) {
    if (condition_expression.empty()) {
        return false;
    }
    return true;
}
|
|
|
|
// Fail the expression if it has unused attribute names or values. This is
|
|
// how DynamoDB behaves, so we do too.
|
|
static void verify_all_are_used(const rjson::value* field,
|
|
const std::unordered_set<std::string>& used, const char* field_name, const char* operation) {
|
|
if (!field) {
|
|
return;
|
|
}
|
|
for (auto it = field->MemberBegin(); it != field->MemberEnd(); ++it) {
|
|
if (!used.contains(rjson::to_string(it->name))) {
|
|
throw api_error::validation(
|
|
format("{} has spurious '{}', not used in {}",
|
|
field_name, rjson::to_string_view(it->name), operation));
|
|
}
|
|
}
|
|
}
|
|
|
|
// rmw_operation implementation for the PutItem request: replaces an entire
// item, optionally guarded by an "Expected" map or a ConditionExpression.
class put_item_operation : public rmw_operation {
private:
    // Builds the write mutation from the request's "Item" attribute map.
    put_or_delete_item _mutation_builder;
public:
    // Parsed ConditionExpression (empty() if the request had none).
    parsed::condition_expression _condition_expression;
    put_item_operation(parsed::expression_cache& parsed_expression_cache, service::storage_proxy& proxy, rjson::value&& request)
        : rmw_operation(proxy, std::move(request))
        , _mutation_builder(rjson::get(_request, "Item"), schema(), put_or_delete_item::put_item{},
                si_key_attributes(proxy.data_dictionary().find_table(schema()->ks_name(), schema()->cf_name()))) {
        _pk = _mutation_builder.pk();
        _ck = _mutation_builder.ck();
        // DynamoDB's PutItem doesn't support the UPDATED_* ReturnValues:
        if (_returnvalues != returnvalues::NONE && _returnvalues != returnvalues::ALL_OLD) {
            throw api_error::validation(format("PutItem supports only NONE or ALL_OLD for ReturnValues"));
        }
        _condition_expression = get_parsed_condition_expression(parsed_expression_cache, _request);
        const rjson::value* expression_attribute_names = rjson::find(_request, "ExpressionAttributeNames");
        const rjson::value* expression_attribute_values = rjson::find(_request, "ExpressionAttributeValues");
        if (!_condition_expression.empty()) {
            // Resolve #name and :value references in the expression, and
            // reject (like DynamoDB) any supplied names/values it didn't use.
            std::unordered_set<std::string> used_attribute_names;
            std::unordered_set<std::string> used_attribute_values;
            resolve_condition_expression(_condition_expression,
                    expression_attribute_names, expression_attribute_values,
                    used_attribute_names, used_attribute_values);
            verify_all_are_used(expression_attribute_names, used_attribute_names,"ExpressionAttributeNames", "PutItem");
            verify_all_are_used(expression_attribute_values, used_attribute_values,"ExpressionAttributeValues", "PutItem");
        } else {
            if (expression_attribute_names) {
                throw api_error::validation("ExpressionAttributeNames cannot be used without ConditionExpression");
            }
            if (expression_attribute_values) {
                throw api_error::validation("ExpressionAttributeValues cannot be used without ConditionExpression");
            }
        }
        // Account the new item's size for consumed-capacity reporting.
        _consumed_capacity += _mutation_builder.length_in_bytes();
    }
    // A read-before-write is needed when a condition must be checked or the
    // old item must be returned.
    bool needs_read_before_write() const {
        return _request.HasMember("Expected") ||
                check_needs_read_before_write(_condition_expression) ||
                _returnvalues == returnvalues::ALL_OLD;
    }
    // Decide on the mutation given the previous item (may be null). Returns
    // an empty optional when the condition check failed.
    virtual std::optional<mutation> apply(std::unique_ptr<rjson::value> previous_item, api::timestamp_type ts, cdc::per_request_options& cdc_opts) const override {
        if (!verify_expected(_request, previous_item.get()) ||
            !verify_condition_expression(_condition_expression, previous_item.get())) {
            if (previous_item && _returnvalues_on_condition_check_failure ==
                    returnvalues_on_condition_check_failure::ALL_OLD) {
                _return_attributes = std::move(*previous_item);
            }
            // If the update is to be cancelled because of an unfulfilled Expected
            // condition, return an empty optional mutation, which is more
            // efficient than throwing an exception.
            return {};
        }
        if (_returnvalues == returnvalues::ALL_OLD && previous_item) {
            _return_attributes = std::move(*previous_item);
        } else {
            _return_attributes = {};
        }
        return _mutation_builder.build(_schema, ts);
    }
    virtual ~put_item_operation() = default;
};
|
|
|
|
// Handle the PutItem API request: parse it into a put_item_operation,
// verify authorization, pick the correct shard when LWT is needed (bouncing
// the whole request to that shard if necessary), execute, and account
// latency/WCU metrics.
future<executor::request_return_type> executor::put_item(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request) {
    _stats.api_operations.put_item++;
    auto start_time = std::chrono::steady_clock::now();
    elogger.trace("put_item {}", request);

    auto op = make_shared<put_item_operation>(*_parsed_expression_cache, _proxy, std::move(request));
    lw_shared_ptr<stats> per_table_stats = get_stats_from_schema(_proxy, *(op->schema()));
    tracing::add_alternator_table_name(trace_state, op->schema()->cf_name());
    // Honor the alternator_force_read_before_write configuration option,
    // exactly as delete_item() and update_item() do.
    const bool needs_read_before_write = _proxy.data_dictionary().get_config().alternator_force_read_before_write() || op->needs_read_before_write();

    co_await verify_permission(_enforce_authorization, _warn_authorization, client_state, op->schema(), auth::permission::MODIFY, _stats);

    auto cas_shard = op->shard_for_execute(needs_read_before_write);

    if (cas_shard && !cas_shard->this_shard()) {
        _stats.api_operations.put_item--; // uncount on this shard, will be counted in other shard
        _stats.shard_bounce_for_lwt++;
        per_table_stats->shard_bounce_for_lwt++;
        // Re-run the whole request on the shard owning the partition:
        co_return co_await container().invoke_on(cas_shard->shard(), _ssg,
                [request = std::move(*op).move_request(), cs = client_state.move_to_other_shard(), gt = tracing::global_trace_state_ptr(trace_state), permit = std::move(permit)]
                (executor& e) mutable {
            return do_with(cs.get(), [&e, request = std::move(request), trace_state = tracing::trace_state_ptr(gt)]
                    (service::client_state& client_state) mutable {
                //FIXME: Instead of passing empty_service_permit() to the background operation,
                // the current permit's lifetime should be prolonged, so that it's destructed
                // only after all background operations are finished as well.
                return e.put_item(client_state, std::move(trace_state), empty_service_permit(), std::move(request));
            });
        });
    }
    per_table_stats->api_operations.put_item++;
    uint64_t wcu_total = 0;
    auto res = co_await op->execute(_proxy, std::move(cas_shard), client_state, trace_state, std::move(permit), needs_read_before_write, _stats, *per_table_stats, wcu_total);
    per_table_stats->operation_sizes.put_item_op_size_kb.add(bytes_to_kb_ceil(op->consumed_capacity()._total_bytes));
    per_table_stats->wcu_total[stats::wcu_types::PUT_ITEM] += wcu_total;
    _stats.wcu_total[stats::wcu_types::PUT_ITEM] += wcu_total;
    per_table_stats->api_operations.put_item_latency.mark(std::chrono::steady_clock::now() - start_time);
    _stats.api_operations.put_item_latency.mark(std::chrono::steady_clock::now() - start_time);
    co_return res;
}
|
|
|
|
// rmw_operation implementation for the DeleteItem request: deletes a single
// item by key, optionally guarded by an "Expected" map or a
// ConditionExpression.
class delete_item_operation : public rmw_operation {
private:
    // Builds the deletion mutation from the request's "Key" attribute map.
    put_or_delete_item _mutation_builder;
public:
    // Parsed ConditionExpression (empty() if the request had none).
    parsed::condition_expression _condition_expression;
    delete_item_operation(parsed::expression_cache& parsed_expression_cache, service::storage_proxy& proxy, rjson::value&& request)
        : rmw_operation(proxy, std::move(request))
        , _mutation_builder(rjson::get(_request, "Key"), schema(), put_or_delete_item::delete_item{}) {
        _pk = _mutation_builder.pk();
        _ck = _mutation_builder.ck();
        // DynamoDB's DeleteItem doesn't support the UPDATED_*/ALL_NEW values:
        if (_returnvalues != returnvalues::NONE && _returnvalues != returnvalues::ALL_OLD) {
            throw api_error::validation(format("DeleteItem supports only NONE or ALL_OLD for ReturnValues"));
        }
        _condition_expression = get_parsed_condition_expression(parsed_expression_cache, _request);
        const rjson::value* expression_attribute_names = rjson::find(_request, "ExpressionAttributeNames");
        const rjson::value* expression_attribute_values = rjson::find(_request, "ExpressionAttributeValues");
        if (!_condition_expression.empty()) {
            // Resolve #name and :value references in the expression, and
            // reject (like DynamoDB) any supplied names/values it didn't use.
            std::unordered_set<std::string> used_attribute_names;
            std::unordered_set<std::string> used_attribute_values;
            resolve_condition_expression(_condition_expression,
                    expression_attribute_names, expression_attribute_values,
                    used_attribute_names, used_attribute_values);
            verify_all_are_used(expression_attribute_names, used_attribute_names,"ExpressionAttributeNames", "DeleteItem");
            verify_all_are_used(expression_attribute_values, used_attribute_values, "ExpressionAttributeValues", "DeleteItem");
        } else {
            if (expression_attribute_names) {
                throw api_error::validation("ExpressionAttributeNames cannot be used without ConditionExpression");
            }
            if (expression_attribute_values) {
                throw api_error::validation("ExpressionAttributeValues cannot be used without ConditionExpression");
            }
        }
    }
    // A read-before-write is needed when a condition must be checked or the
    // old item must be returned.
    bool needs_read_before_write() const {
        return _request.HasMember("Expected") ||
                check_needs_read_before_write(_condition_expression) ||
                _returnvalues == returnvalues::ALL_OLD;
    }
    // Decide on the mutation given the previous item (may be null). Returns
    // an empty optional when the condition check failed.
    virtual std::optional<mutation> apply(std::unique_ptr<rjson::value> previous_item, api::timestamp_type ts, cdc::per_request_options& cdc_opts) const override {
        if (!verify_expected(_request, previous_item.get()) ||
            !verify_condition_expression(_condition_expression, previous_item.get())) {
            if (previous_item && _returnvalues_on_condition_check_failure ==
                    returnvalues_on_condition_check_failure::ALL_OLD) {
                _return_attributes = std::move(*previous_item);
            }
            // If the update is to be cancelled because of an unfulfilled Expected
            // condition, return an empty optional mutation, which is more
            // efficient than throwing an exception.
            return {};
        }
        if (_returnvalues == returnvalues::ALL_OLD && previous_item) {
            _return_attributes = std::move(*previous_item);
        } else {
            _return_attributes = {};
        }
        // Every delete consumes at least one capacity unit:
        if (_consumed_capacity._total_bytes == 0) {
            _consumed_capacity._total_bytes = 1;
        }
        return _mutation_builder.build(_schema, ts);
    }
    virtual ~delete_item_operation() = default;
};
|
|
|
|
// Handle the DeleteItem API request: parse it into a delete_item_operation,
// verify authorization, pick the correct shard when LWT is needed (bouncing
// the whole request to that shard if necessary), execute, and account
// latency/WCU metrics.
future<executor::request_return_type> executor::delete_item(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request) {
    _stats.api_operations.delete_item++;
    auto start_time = std::chrono::steady_clock::now();
    elogger.trace("delete_item {}", request);

    auto op = make_shared<delete_item_operation>(*_parsed_expression_cache, _proxy, std::move(request));
    lw_shared_ptr<stats> per_table_stats = get_stats_from_schema(_proxy, *(op->schema()));
    tracing::add_alternator_table_name(trace_state, op->schema()->cf_name());
    // The alternator_force_read_before_write option forces the slower
    // read-before-write path even when the request doesn't need it.
    const bool needs_read_before_write = _proxy.data_dictionary().get_config().alternator_force_read_before_write() || op->needs_read_before_write();

    co_await verify_permission(_enforce_authorization, _warn_authorization, client_state, op->schema(), auth::permission::MODIFY, _stats);

    auto cas_shard = op->shard_for_execute(needs_read_before_write);

    if (cas_shard && !cas_shard->this_shard()) {
        _stats.api_operations.delete_item--; // uncount on this shard, will be counted in other shard
        _stats.shard_bounce_for_lwt++;
        per_table_stats->shard_bounce_for_lwt++;
        // Re-run the whole request on the shard owning the partition:
        co_return co_await container().invoke_on(cas_shard->shard(), _ssg,
                [request = std::move(*op).move_request(), cs = client_state.move_to_other_shard(), gt = tracing::global_trace_state_ptr(trace_state), permit = std::move(permit)]
                (executor& e) mutable {
            return do_with(cs.get(), [&e, request = std::move(request), trace_state = tracing::trace_state_ptr(gt)]
                    (service::client_state& client_state) mutable {
                //FIXME: Instead of passing empty_service_permit() to the background operation,
                // the current permit's lifetime should be prolonged, so that it's destructed
                // only after all background operations are finished as well.
                return e.delete_item(client_state, std::move(trace_state), empty_service_permit(), std::move(request));
            });
        });
    }
    per_table_stats->api_operations.delete_item++;
    uint64_t wcu_total = 0;
    auto res = co_await op->execute(_proxy, std::move(cas_shard), client_state, trace_state, std::move(permit), needs_read_before_write, _stats, *per_table_stats, wcu_total);
    // _total_bytes stays at its floor of 1 when no item existed - only
    // record an op size when a real item was read/deleted.
    if (op->consumed_capacity()._total_bytes > 1) {
        per_table_stats->operation_sizes.delete_item_op_size_kb.add(bytes_to_kb_ceil(op->consumed_capacity()._total_bytes));
    }
    per_table_stats->wcu_total[stats::wcu_types::DELETE_ITEM] += wcu_total;
    _stats.wcu_total[stats::wcu_types::DELETE_ITEM] += wcu_total;
    per_table_stats->api_operations.delete_item_latency.mark(std::chrono::steady_clock::now() - start_time);
    _stats.api_operations.delete_item_latency.mark(std::chrono::steady_clock::now() - start_time);
    co_return res;
}
|
|
|
|
// Resolve the table named by a BatchGetItem/BatchWriteItem request entry
// (the JSON member's key is the table name) to its schema. Throws the
// appropriate api_error if the table is missing or the name is invalid.
static schema_ptr get_table_from_batch_request(const service::storage_proxy& proxy, const rjson::value::ConstMemberIterator& batch_request) {
    const sstring table_name = rjson::to_sstring(batch_request->name); // JSON keys are always strings
    const sstring keyspace_name = sstring(executor::KEYSPACE_NAME_PREFIX) + table_name;
    try {
        return proxy.data_dictionary().find_schema(keyspace_name, table_name);
    } catch (data_dictionary::no_such_column_family&) {
        // DynamoDB returns validation error even when table does not exist
        // and the table name is invalid.
        validate_table_name(table_name);
        throw api_error::resource_not_found(format("Requested resource not found: Table: {} not found", table_name));
    }
}
|
|
|
|
// A full (partition key, clustering key) item key, plus the hash and
// equality functors needed to use it in unordered containers. Both functors
// carry the schema because key comparison/hashing is schema-dependent.
using primary_key = std::pair<partition_key, clustering_key>;
struct primary_key_hash {
    schema_ptr _s;
    size_t operator()(const primary_key& key) const {
        // Combine the hashes of the partition and clustering components.
        return utils::hash_combine(partition_key::hashing(*_s)(key.first), clustering_key::hashing(*_s)(key.second));
    }
};
struct primary_key_equal {
    schema_ptr _s;
    bool operator()(const primary_key& k1, const primary_key& k2) const {
        return partition_key::equality(*_s)(k1.first, k2.first) && clustering_key::equality(*_s)(k1.second, k2.second);
    }
};
|
|
|
|
// This is a cas_request subclass for applying given put_or_delete_items to
|
|
// one partition using LWT as part as BatchWriteItem. This is a write-only
|
|
// operation, not needing the previous value of the item (the mutation to be
|
|
// done is known prior to starting the operation). Nevertheless, we want to
|
|
// do this mutation via LWT to ensure that it is serialized with other LWT
|
|
// mutations to the same partition.
|
|
//
|
|
// The std::vector<put_or_delete_item> must remain alive until the
|
|
// storage_proxy::cas() future is resolved.
|
|
class put_or_delete_item_cas_request : public service::cas_request {
|
|
schema_ptr schema;
|
|
const std::vector<put_or_delete_item>& _mutation_builders;
|
|
public:
|
|
put_or_delete_item_cas_request(schema_ptr s, const std::vector<put_or_delete_item>& b) :
|
|
schema(std::move(s)), _mutation_builders(b) { }
|
|
virtual ~put_or_delete_item_cas_request() = default;
|
|
virtual std::optional<mutation> apply(foreign_ptr<lw_shared_ptr<query::result>> qr, const query::partition_slice& slice, api::timestamp_type ts, cdc::per_request_options& cdc_opts) override {
|
|
std::optional<mutation> ret;
|
|
for (const put_or_delete_item& mutation_builder : _mutation_builders) {
|
|
// We assume all these builders have the same partition.
|
|
if (ret) {
|
|
ret->apply(mutation_builder.build(schema, ts));
|
|
} else {
|
|
ret = mutation_builder.build(schema, ts);
|
|
}
|
|
}
|
|
return ret;
|
|
}
|
|
};
|
|
|
|
// Apply a group of put/delete mutations to a single partition via LWT, on
// behalf of BatchWriteItem. If the current shard doesn't own the partition
// (per cas_shard), the whole call is bounced to the owning shard.
// mutation_builders and dk are captured by reference across shards, so the
// caller must keep them alive until the returned future resolves.
future<> executor::cas_write(schema_ptr schema, service::cas_shard cas_shard, const dht::decorated_key& dk,
        const std::vector<put_or_delete_item>& mutation_builders, service::client_state& client_state,
        tracing::trace_state_ptr trace_state, service_permit permit)
{
    if (!cas_shard.this_shard()) {
        _stats.shard_bounce_for_lwt++;
        return container().invoke_on(cas_shard.shard(), _ssg,
                [cs = client_state.move_to_other_shard(),
                 &mb = mutation_builders,
                 &dk,
                 ks = schema->ks_name(),
                 cf = schema->cf_name(),
                 gt = tracing::global_trace_state_ptr(trace_state),
                 permit = std::move(permit)]
                (executor& self) mutable {
            return do_with(cs.get(), [&mb, &dk, ks = std::move(ks), cf = std::move(cf),
                    trace_state = tracing::trace_state_ptr(gt), &self]
                    (service::client_state& client_state) mutable {
                // Re-resolve the schema on the target shard (schema_ptr
                // itself cannot be passed between shards).
                auto schema = self._proxy.data_dictionary().find_schema(ks, cf);
                service::cas_shard cas_shard(*schema, dk.token());

                //FIXME: Instead of passing empty_service_permit() to the background operation,
                // the current permit's lifetime should be prolonged, so that it's destructed
                // only after all background operations are finished as well.
                return self.cas_write(schema, std::move(cas_shard), dk, mb, client_state, std::move(trace_state), empty_service_permit());
            });
        });
    }

    auto timeout = executor::default_timeout();
    // Keep the cas_request alive until cas() completes (see finally below).
    auto op = std::make_unique<put_or_delete_item_cas_request>(schema, mutation_builders);
    auto* op_ptr = op.get();
    auto cdc_opts = cdc::per_request_options{
        .alternator = true,
        .alternator_streams_increased_compatibility =
                schema->cdc_options().enabled() && _proxy.data_dictionary().get_config().alternator_streams_increased_compatibility(),
    };
    return _proxy.cas(schema, std::move(cas_shard), *op_ptr, nullptr, to_partition_ranges(dk),
            {timeout, std::move(permit), client_state, trace_state},
            db::consistency_level::LOCAL_SERIAL, db::consistency_level::LOCAL_QUORUM,
            timeout, timeout, true, std::move(cdc_opts)).finally([op = std::move(op)]{}).discard_result();
    // We discarded cas()'s future value ("is_applied") because BatchWriteItem
    // does not need to support conditional updates.
}
|
|
|
|
|
|
// A (table, partition) pair used to group BatchWriteItem mutations that can
// be applied in a single cas() call, plus the hash and equality functors
// needed to use it as an unordered_map key.
struct schema_decorated_key {
    schema_ptr schema;
    dht::decorated_key dk;
};
struct schema_decorated_key_hash {
    size_t operator()(const schema_decorated_key& k) const {
        // Hashing only the token is sufficient: equal keys hash equally, and
        // collisions across tables are resolved by the equality functor.
        return std::hash<dht::token>()(k.dk.token());
    }
};
struct schema_decorated_key_equal {
    bool operator()(const schema_decorated_key& k1, const schema_decorated_key& k2) const {
        return k1.schema == k2.schema && k1.dk.equal(*k1.schema, k2.dk);
    }
};
|
|
|
|
// FIXME: if we failed writing some of the mutations, need to return a list
|
|
// of these failed mutations rather than fail the whole write (issue #5650).
|
|
// Apply a batch of already-parsed put/delete operations.
// If no involved table demands LWT write isolation, all mutations are built
// with a single timestamp and sent in one storage_proxy::mutate() call at
// LOCAL_QUORUM. Otherwise, the operations are grouped by (schema, partition
// key) and each group is applied through cas_write(), one cas() call per
// partition, run in parallel across partitions.
future<> executor::do_batch_write(
        std::vector<std::pair<schema_ptr, put_or_delete_item>> mutation_builders,
        service::client_state& client_state,
        tracing::trace_state_ptr trace_state,
        service_permit permit) {
    if (mutation_builders.empty()) {
        // Nothing to write - avoid issuing an empty mutate() call.
        return make_ready_future<>();
    }
    // NOTE: technically, do_batch_write could be reworked to use LWT only for part
    // of the batched requests and not use it for others, but it's not considered
    // likely that a batch will contain both tables which always demand LWT and ones
    // that don't - it's fragile to split a batch into multiple storage proxy requests though.
    // Hence, the decision is conservative - if any table enforces LWT,the whole batch will use it.
    const bool needs_lwt = std::ranges::any_of(mutation_builders | std::views::keys, [] (const schema_ptr& schema) {
        return rmw_operation::get_write_isolation_for_schema(schema) == rmw_operation::write_isolation::LWT_ALWAYS;
    });
    if (!needs_lwt) {
        // Do a normal write, without LWT:
        utils::chunked_vector<mutation> mutations;
        mutations.reserve(mutation_builders.size());
        // All mutations in the batch share one write timestamp.
        api::timestamp_type now = api::new_timestamp();
        bool any_cdc_enabled = false;
        for (auto& b : mutation_builders) {
            mutations.push_back(b.second.build(b.first, now));
            // Track whether any written table has CDC (Alternator Streams)
            // enabled - used below to decide the CDC compatibility option.
            any_cdc_enabled |= b.first->cdc_options().enabled();
        }
        return _proxy.mutate(std::move(mutations),
                db::consistency_level::LOCAL_QUORUM,
                executor::default_timeout(),
                trace_state,
                std::move(permit),
                db::allow_per_partition_rate_limit::yes,
                false,
                cdc::per_request_options{
                    .alternator = true,
                    .alternator_streams_increased_compatibility = any_cdc_enabled && _proxy.data_dictionary().get_config().alternator_streams_increased_compatibility(),
                });
    } else {
        // Do the write via LWT:
        // Multiple mutations may be destined for the same partition, adding
        // or deleting different items of one partition. Join them together
        // because we can do them in one cas() call.
        using map_type = std::unordered_map<schema_decorated_key,
                                            std::vector<put_or_delete_item>,
                                            schema_decorated_key_hash,
                                            schema_decorated_key_equal>;
        auto key_builders = std::make_unique<map_type>(1, schema_decorated_key_hash{}, schema_decorated_key_equal{});
        for (auto&& b : std::move(mutation_builders)) {
            auto [it, added] = key_builders->try_emplace(schema_decorated_key {
                .schema = b.first,
                .dk = dht::decorate_key(*b.first, b.second.pk())
            });
            it->second.push_back(std::move(b.second));
        }
        // Keep a raw pointer for iteration; the unique_ptr itself is moved
        // into the finally() continuation below to keep the map alive until
        // all the parallel cas_write() calls complete.
        auto* key_builders_ptr = key_builders.get();
        return parallel_for_each(*key_builders_ptr, [this, &client_state, trace_state, permit = std::move(permit)] (const auto& e) {
            _stats.write_using_lwt++;
            auto desired_shard = service::cas_shard(*e.first.schema, e.first.dk.token());
            auto s = e.first.schema;

            // Test-only error injection: optionally pause a write for a
            // specific keyspace/table/shard until a message is received.
            static const auto* injection_name = "alternator_executor_batch_write_wait";
            return utils::get_local_injector().inject(injection_name, [s = std::move(s)] (auto& handler) -> future<> {
                const auto ks = handler.get("keyspace");
                const auto cf = handler.get("table");
                const auto shard = std::atoll(handler.get("shard")->data());
                if (ks == s->ks_name() && cf == s->cf_name() && shard == this_shard_id()) {
                    elogger.info("{}: hit", injection_name);
                    co_await handler.wait_for_message(std::chrono::steady_clock::now() + std::chrono::minutes{5});
                    elogger.info("{}: continue", injection_name);
                }
            }).then([&e, desired_shard = std::move(desired_shard),
                    &client_state, trace_state = std::move(trace_state), permit = std::move(permit), this]() mutable
            {
                return cas_write(e.first.schema, std::move(desired_shard), e.first.dk,
                        std::move(e.second), client_state, std::move(trace_state), std::move(permit));
            });
        }).finally([key_builders = std::move(key_builders)]{});
    }
}
|
|
|
|
// Implements the DynamoDB BatchWriteItem API: parse and validate the
// RequestItems (a map of table name -> array of PutRequest/DeleteRequest),
// check permissions, account WCU (optionally reading the previous item's
// size first), and finally apply all writes via do_batch_write().
// Returns the JSON response with an (always empty - see FIXME below)
// UnprocessedItems member and, if requested, ConsumedCapacity.
future<executor::request_return_type> executor::batch_write_item(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request) {
    _stats.api_operations.batch_write_item++;
    auto start_time = std::chrono::steady_clock::now();
    const rjson::value& request_items = get_member(request, "RequestItems", "BatchWriteItem content");
    validate_is_object(request_items, "RequestItems");
    if (request_items.ObjectEmpty()) {
        co_return api_error::validation("RequestItems can't be empty");
    }

    const auto maximum_batch_write_size = _proxy.data_dictionary().get_config().alternator_max_items_in_batch_write();

    // First pass: validate the shape of each table's request array and count
    // the total number of items, enforcing the configured batch-size limit.
    size_t total_items = 0;
    for (auto it = request_items.MemberBegin(); it != request_items.MemberEnd(); ++it) {
        if (!it->value.IsArray() || it->value.Empty()) {
            co_return api_error::validation("Member of RequestItems must be a non-empty array of WriteRequest objects");
        }
        total_items += it->value.Size();
    }
    if (total_items > maximum_batch_write_size) {
        co_return api_error::validation(fmt::format("Invalid length of BatchWriteItem command, got {} items, "
                "maximum is {} (from configuration variable alternator_max_items_in_batch_write)", total_items, maximum_batch_write_size));
    }
    bool should_add_wcu = wcu_consumed_capacity_counter::should_add_capacity(request);
    rjson::value consumed_capacity = rjson::empty_array();
    std::vector<std::pair<schema_ptr, put_or_delete_item>> mutation_builders;
    // WCU calculation is performed at the end of execution.
    // We need to keep track of changes per table, both for internal metrics
    // and to be able to return the values if should_add_wcu is true.
    // For each table, we need its stats and schema.
    std::vector<std::pair<lw_shared_ptr<stats>, schema_ptr>> per_table_wcu;

    mutation_builders.reserve(request_items.MemberCount());
    per_table_wcu.reserve(request_items.MemberCount());
    // Second pass: parse each table's PutRequest/DeleteRequest entries into
    // put_or_delete_item builders, rejecting duplicate keys within a table.
    for (auto it = request_items.MemberBegin(); it != request_items.MemberEnd(); ++it) {
        schema_ptr schema = get_table_from_batch_request(_proxy, it);
        lw_shared_ptr<stats> per_table_stats = get_stats_from_schema(_proxy, *(schema));
        per_table_stats->api_operations.batch_write_item++;
        per_table_stats->api_operations.batch_write_item_batch_total += it->value.Size();
        per_table_stats->api_operations.batch_write_item_histogram.add(it->value.Size());
        tracing::add_alternator_table_name(trace_state, schema->cf_name());

        // Keys already seen for this table - duplicates within one
        // BatchWriteItem are forbidden (as in DynamoDB).
        std::unordered_set<primary_key, primary_key_hash, primary_key_equal> used_keys(
                1, primary_key_hash{schema}, primary_key_equal{schema});
        for (auto& request : it->value.GetArray()) {
            auto& r = get_single_member(request, "RequestItems element");
            const auto r_name = rjson::to_string_view(r.name);
            if (r_name == "PutRequest") {
                const rjson::value& item = get_member(r.value, "Item", "PutRequest");
                validate_is_object(item, "Item in PutRequest");
                auto&& put_item = put_or_delete_item(
                        item, schema, put_or_delete_item::put_item{},
                        si_key_attributes(_proxy.data_dictionary().find_table(schema->ks_name(), schema->cf_name())));
                mutation_builders.emplace_back(schema, std::move(put_item));
                auto mut_key = std::make_pair(mutation_builders.back().second.pk(), mutation_builders.back().second.ck());
                if (used_keys.contains(mut_key)) {
                    co_return api_error::validation("Provided list of item keys contains duplicates");
                }
                used_keys.insert(std::move(mut_key));
            } else if (r_name == "DeleteRequest") {
                const rjson::value& key = get_member(r.value, "Key", "DeleteRequest");
                validate_is_object(key, "Key in DeleteRequest");
                mutation_builders.emplace_back(schema, put_or_delete_item(
                        key, schema, put_or_delete_item::delete_item{}));
                auto mut_key = std::make_pair(mutation_builders.back().second.pk(),
                        mutation_builders.back().second.ck());
                if (used_keys.contains(mut_key)) {
                    co_return api_error::validation("Provided list of item keys contains duplicates");
                }
                used_keys.insert(std::move(mut_key));
            } else {
                co_return api_error::validation(fmt::format("Unknown BatchWriteItem request type: {}", r_name));
            }
        }
        per_table_wcu.emplace_back(std::make_pair(per_table_stats, schema));
    }
    // Check MODIFY permission on every written table before writing anything.
    for (const auto& b : mutation_builders) {
        co_await verify_permission(_enforce_authorization, _warn_authorization, client_state, b.first, auth::permission::MODIFY, _stats);
    }
    // If alternator_force_read_before_write is true we will first get the previous item size
    // and only then do send the mutation.
    if (_proxy.data_dictionary().get_config().alternator_force_read_before_write()) {
        std::vector<future<uint64_t>> previous_items_sizes;
        previous_items_sizes.reserve(mutation_builders.size());

        // Parallel get all previous item sizes
        for (const auto& b : mutation_builders) {
            previous_items_sizes.emplace_back(get_previous_item_size(
                _proxy,
                client_state,
                b.first,
                b.second.pk(),
                b.second.ck(),
                permit));
        }
        size_t pos = 0;
        // We are going to wait for all the requests
        for (auto&& pi : previous_items_sizes) {
            auto res = co_await std::move(pi);
            // WCU is charged for the larger of the old and new item sizes.
            if (mutation_builders[pos].second.length_in_bytes() < res) {
                mutation_builders[pos].second.set_length_in_bytes(res);
            }
            pos++;
        }
    }


    size_t wcu_put_units = 0;
    size_t wcu_delete_units = 0;

    size_t pos = 0;
    size_t total_wcu;
    // Here we calculate the per-table WCU.
    // The size in the mutation is based either on the operation size,
    // or, if we performed a read-before-write, on the larger of the operation size
    // and the previous item's size.
    // Note: mutation_builders is ordered by table (same order as
    // per_table_wcu), so `pos` advances monotonically across both loops.
    for (const auto& w : per_table_wcu) {
        total_wcu = 0;
        // The following loop goes over all items from the same table
        while(pos < mutation_builders.size() && w.second->id() == mutation_builders[pos].first->id()) {
            uint64_t item_size = mutation_builders[pos].second.length_in_bytes();
            // A zero-length item is still charged for a minimal (1 byte) write.
            size_t wcu = wcu_consumed_capacity_counter::get_units(item_size ? item_size : 1);
            total_wcu += wcu;
            if (mutation_builders[pos].second.is_put_item()) {
                w.first->wcu_total[stats::PUT_ITEM] += wcu;
                wcu_put_units += wcu;
            } else {
                w.first->wcu_total[stats::DELETE_ITEM] += wcu;
                wcu_delete_units += wcu;
            }
            w.first->operation_sizes.batch_write_item_op_size_kb.add(bytes_to_kb_ceil(item_size));
            pos++;
        }
        if (should_add_wcu) {
            rjson::value entry = rjson::empty_object();
            rjson::add(entry, "TableName", rjson::from_string(w.second->cf_name()));
            rjson::add(entry, "CapacityUnits", total_wcu);
            rjson::push_back(consumed_capacity, std::move(entry));
        }
    }
    _stats.wcu_total[stats::PUT_ITEM] += wcu_put_units;
    _stats.wcu_total[stats::DELETE_ITEM] += wcu_delete_units;
    _stats.api_operations.batch_write_item_batch_total += total_items;
    _stats.api_operations.batch_write_item_histogram.add(total_items);
    co_await do_batch_write(std::move(mutation_builders), client_state, trace_state, std::move(permit));
    // FIXME: Issue #5650: If we failed writing some of the updates,
    // need to return a list of these failed updates in UnprocessedItems
    // rather than fail the whole write (issue #5650).
    rjson::value ret = rjson::empty_object();
    rjson::add(ret, "UnprocessedItems", rjson::empty_object());
    if (should_add_wcu) {
        rjson::add(ret, "ConsumedCapacity", std::move(consumed_capacity));
    }
    _stats.api_operations.batch_write_item_latency.mark(std::chrono::steady_clock::now() - start_time);
    co_return rjson::print(std::move(ret));
}
|
|
|
|
// Return the DynamoDB type tag of an encoded value. An encoded value is an
// object with exactly one member whose name is the type string (enforced by
// get_single_member(), which throws otherwise).
static const std::string_view get_item_type_string(const rjson::value& v) {
    const rjson::value::Member& type_member = get_single_member(v, "Item");
    return rjson::to_string_view(type_member.name);
}
|
|
|
|
// attrs_to_get saves for each top-level attribute an attrs_to_get_node,
|
|
// a hierarchy of subparts that need to be kept. The following function
|
|
// takes a given JSON value and drops its parts which weren't asked to be
|
|
// kept. It modifies the given JSON value, or returns false to signify that
|
|
// the entire object should be dropped.
|
|
// Note that The JSON value is assumed to be encoded using the DynamoDB
|
|
// conventions - i.e., it is really a map whose key has a type string,
|
|
// and the value is the real object.
|
|
template<typename T>
static bool hierarchy_filter(rjson::value& val, const attribute_path_map_node<T>& h) {
    // val must be a DynamoDB-encoded value: an object with a single member
    // whose name is the type string ("M", "L", "S", ...).
    if (!val.IsObject() || val.MemberCount() != 1) {
        // This shouldn't happen. We shouldn't have stored malformed objects.
        // But today Alternator does not validate the structure of nested
        // documents before storing them, so this can happen on read.
        throw api_error::internal(format("Malformed value object read: {}", val));
    }
    const char* type = val.MemberBegin()->name.GetString();
    rjson::value& v = val.MemberBegin()->value;
    if (h.has_members()) {
        // The filter asks for specific members (e.g., "a.b") - only
        // meaningful if the value is a map ("M").
        const auto& members = h.get_members();
        if (type[0] != 'M' || !v.IsObject()) {
            // If v is not an object (dictionary, map), none of the members
            // can match.
            return false;
        }
        // Build a new map containing only the requested members (each
        // possibly filtered recursively), then replace v with it.
        rjson::value newv = rjson::empty_object();
        for (auto it = v.MemberBegin(); it != v.MemberEnd(); ++it) {
            std::string attr = rjson::to_string(it->name);
            auto x = members.find(attr);
            if (x != members.end()) {
                if (x->second) {
                    // Only a part of this attribute is to be filtered, do it.
                    if (hierarchy_filter(it->value, *x->second)) {
                        // because newv started empty and attr are unique
                        // (keys of v), we can use add() here
                        rjson::add_with_string_name(newv, attr, std::move(it->value));
                    }
                } else {
                    // The entire attribute is to be kept
                    rjson::add_with_string_name(newv, attr, std::move(it->value));
                }
            }
        }
        if (newv.MemberCount() == 0) {
            // Nothing survived the filter - drop the whole value.
            return false;
        }
        v = newv;
    } else if (h.has_indexes()) {
        // The filter asks for specific list indexes (e.g., "a[2]") - only
        // meaningful if the value is a list ("L").
        const auto& indexes = h.get_indexes();
        if (type[0] != 'L' || !v.IsArray()) {
            return false;
        }
        rjson::value newv = rjson::empty_array();
        const auto& a = v.GetArray();
        for (unsigned i = 0; i < v.Size(); i++) {
            auto x = indexes.find(i);
            if (x != indexes.end()) {
                if (x->second) {
                    if (hierarchy_filter(a[i], *x->second)) {
                        rjson::push_back(newv, std::move(a[i]));
                    }
                } else {
                    // The entire attribute is to be kept
                    rjson::push_back(newv, std::move(a[i]));
                }
            }
        }
        if (newv.Size() == 0) {
            return false;
        }
        v = newv;
    }
    // h has neither members nor indexes: the entire value is kept as-is.
    return true;
}
|
|
|
|
// Add a path to a attribute_path_map. Throws a validation error if the path
|
|
// "overlaps" with one already in the filter (one is a sub-path of the other)
|
|
// or "conflicts" with it (both a member and index is requested).
|
|
template<typename T>
void attribute_path_map_add(const char* source, attribute_path_map<T>& map, const parsed::path& p, T value = {}) {
    using node = attribute_path_map_node<T>;
    // The first step is to look for the top-level attribute (p.root()):
    auto it = map.find(p.root());
    if (it == map.end()) {
        if (p.has_operators()) {
            // Path descends into the attribute (e.g., a.b or a[1]) - start
            // an empty intermediate node and walk down below.
            it = map.emplace(p.root(), node {std::nullopt}).first;
        } else {
            (void) map.emplace(p.root(), node {std::move(value)}).first;
            // Value inserted for top-level node. We're done.
            return;
        }
    } else if(!p.has_operators()) {
        // If p is top-level and we already have it or a part of it
        // in map, it's a forbidden overlapping path.
        throw api_error::validation(fmt::format(
                "Invalid {}: two document paths overlap at {}", source, p.root()));
    } else if (it->second.has_value()) {
        // If we're here, it != map.end() && p.has_operators && it->second.has_value().
        // This means the top-level attribute already has a value, and we're
        // trying to add a non-top-level value. It's an overlap.
        throw api_error::validation(fmt::format("Invalid {}: two document paths overlap at {}", source, p.root()));
    }
    node* h = &it->second;
    // The second step is to walk h from the top-level node to the inner node
    // where we're supposed to insert the value:
    for (const auto& op : p.operators()) {
        std::visit(overloaded_functor {
            // A ".member" step: descend into a members node, creating one
            // if this node is still empty. A node that already holds
            // indexes (conflict) or a value (overlap) is an error.
            [&] (const std::string& member) {
                if (h->is_empty()) {
                    *h = node {typename node::members_t()};
                } else if (h->has_indexes()) {
                    throw api_error::validation(format("Invalid {}: two document paths conflict at {}", source, p));
                } else if (h->has_value()) {
                    throw api_error::validation(format("Invalid {}: two document paths overlap at {}", source, p));
                }
                typename node::members_t& members = h->get_members();
                auto it = members.find(member);
                if (it == members.end()) {
                    it = members.insert({member, std::make_unique<node>()}).first;
                }
                h = it->second.get();
            },
            // A "[index]" step: descend into an indexes node, symmetric to
            // the member case above.
            [&] (unsigned index) {
                if (h->is_empty()) {
                    *h = node {typename node::indexes_t()};
                } else if (h->has_members()) {
                    throw api_error::validation(format("Invalid {}: two document paths conflict at {}", source, p));
                } else if (h->has_value()) {
                    throw api_error::validation(format("Invalid {}: two document paths overlap at {}", source, p));
                }
                typename node::indexes_t& indexes = h->get_indexes();
                auto it = indexes.find(index);
                if (it == indexes.end()) {
                    it = indexes.insert({index, std::make_unique<node>()}).first;
                }
                h = it->second.get();
            }
        }, op);
    }
    // Finally, insert the value in the node h.
    if (h->is_empty()) {
        *h = node {std::move(value)};
    } else {
        // The destination node already has children or a value - the new
        // path overlaps an existing one.
        throw api_error::validation(format("Invalid {}: two document paths overlap at {}", source, p));
    }
}
|
|
|
|
// A very simplified version of the above function for the special case of
|
|
// adding only top-level attribute. It's not only simpler, we also use a
|
|
// different error message, referring to a "duplicate attribute"instead of
|
|
// "overlapping paths". DynamoDB also has this distinction (errors in
|
|
// AttributesToGet refer to duplicates, not overlaps, but errors in
|
|
// ProjectionExpression refer to overlap - even if it's an exact duplicate).
|
|
template<typename T>
|
|
void attribute_path_map_add(const char* source, attribute_path_map<T>& map, const std::string& attr, T value = {}) {
|
|
using node = attribute_path_map_node<T>;
|
|
auto it = map.find(attr);
|
|
if (it == map.end()) {
|
|
map.emplace(attr, node {std::move(value)});
|
|
} else {
|
|
throw api_error::validation(fmt::format(
|
|
"Invalid {}: Duplicate attribute: {}", source, attr));
|
|
}
|
|
}
|
|
|
|
// Parse the "Select" parameter of a Scan or Query operation, throwing a
|
|
// ValidationException in various forbidden combinations of options and
|
|
// finally returning one of three options:
|
|
// 1. regular - the default scan behavior of returning all or specific
|
|
// attributes ("ALL_ATTRIBUTES" or "SPECIFIC_ATTRIBUTES").
|
|
// 2. count - just count the items ("COUNT")
|
|
// 3. projection - return projected attributes ("ALL_PROJECTED_ATTRIBUTES")
|
|
// An ValidationException is thrown when recognizing an invalid combination
|
|
// of options - such as ALL_PROJECTED_ATTRIBUTES for a base table, or
|
|
// SPECIFIC_ATTRIBUTES without ProjectionExpression or AttributesToGet.
|
|
enum class select_type { regular, count, projection };
|
|
static select_type parse_select(const rjson::value& request, table_or_view_type table_type) {
|
|
const rjson::value* select_value = rjson::find(request, "Select");
|
|
if (!select_value) {
|
|
// If "Select" is not specified, it defaults to ALL_ATTRIBUTES
|
|
// on a base table, or ALL_PROJECTED_ATTRIBUTES on an index
|
|
return table_type == table_or_view_type::base ?
|
|
select_type::regular : select_type::projection;
|
|
}
|
|
if (!select_value->IsString()) {
|
|
throw api_error::validation("Select parameter must be a string");
|
|
}
|
|
std::string_view select = rjson::to_string_view(*select_value);
|
|
const bool has_attributes_to_get = request.HasMember("AttributesToGet");
|
|
const bool has_projection_expression = request.HasMember("ProjectionExpression");
|
|
if (select == "SPECIFIC_ATTRIBUTES") {
|
|
if (has_projection_expression || has_attributes_to_get) {
|
|
return select_type::regular;
|
|
}
|
|
throw api_error::validation("Select=SPECIFIC_ATTRIBUTES requires AttributesToGet or ProjectionExpression");
|
|
}
|
|
if (has_projection_expression || has_attributes_to_get) {
|
|
throw api_error::validation("AttributesToGet or ProjectionExpression require Select to be either SPECIFIC_ATTRIBUTES or missing");
|
|
}
|
|
if (select == "COUNT") {
|
|
return select_type::count;
|
|
}
|
|
if (select == "ALL_ATTRIBUTES") {
|
|
// FIXME: when we support projections (#5036), if this is a GSI and
|
|
// not all attributes are projected to it, we should throw.
|
|
return select_type::regular;
|
|
}
|
|
if (select == "ALL_PROJECTED_ATTRIBUTES") {
|
|
if (table_type == table_or_view_type::base) {
|
|
throw api_error::validation("ALL_PROJECTED_ATTRIBUTES only allowed for indexes");
|
|
}
|
|
return select_type::projection;
|
|
}
|
|
throw api_error::validation(fmt::format("Unknown Select value '{}'. Allowed choices: ALL_ATTRIBUTES, SPECIFIC_ATTRIBUTES, ALL_PROJECTED_ATTRIBUTES, COUNT",
|
|
select));
|
|
}
|
|
|
|
// calculate_attrs_to_get() takes either AttributesToGet or
|
|
// ProjectionExpression parameters (having both is *not* allowed),
|
|
// and returns the list of cells we need to read, or a disengaged optional
|
|
// when *all* attributes are to be returned.
|
|
// However, in our current implementation, only top-level attributes are
|
|
// stored as separate cells - a nested document is stored serialized together
|
|
// (as JSON) in the same cell. So this function return a map - each key is the
|
|
// top-level attribute we will need need to read, and the value for each
|
|
// top-level attribute is the partial hierarchy (struct hierarchy_filter)
|
|
// that we will need to extract from that serialized JSON.
|
|
// For example, if ProjectionExpression lists a.b and a.c[2], we
|
|
// return one top-level attribute name, "a", with the value "{b, c[2]}".
|
|
|
|
// Compute which attributes (and which sub-paths of them) a read operation
// must return, from AttributesToGet or ProjectionExpression (mutually
// exclusive). Returns an empty map for Select=COUNT (read nothing), a
// populated map when specific attributes were requested, or a disengaged
// optional meaning "read everything". Attribute-name placeholders resolved
// from ExpressionAttributeNames are recorded in used_attribute_names.
static std::optional<attrs_to_get> calculate_attrs_to_get(const rjson::value& req, parsed::expression_cache& parsed_expression_cache, std::unordered_set<std::string>& used_attribute_names, select_type select = select_type::regular) {
    if (select == select_type::count) {
        // An empty map asks to retrieve no attributes. Note that this is
        // different from a disengaged optional which means retrieve all.
        return attrs_to_get();
    }
    // FIXME: also need to handle select_type::projection
    const bool has_attributes_to_get = req.HasMember("AttributesToGet");
    const bool has_projection_expression = req.HasMember("ProjectionExpression");
    if (has_attributes_to_get && has_projection_expression) {
        // NOTE(review): this message mentions GetItem although this helper
        // appears to be shared by other read operations - confirm whether
        // the wording is intentional.
        throw api_error::validation(
                format("GetItem does not allow both ProjectionExpression and AttributesToGet to be given together"));
    }
    if (has_attributes_to_get) {
        // Legacy API: a flat list of top-level attribute names.
        const rjson::value& attributes_to_get = req["AttributesToGet"];
        attrs_to_get ret;
        for (auto it = attributes_to_get.Begin(); it != attributes_to_get.End(); ++it) {
            // Note: the duplicate check (inside attribute_path_map_add)
            // runs before the name-length validation, preserving the order
            // in which errors are reported.
            attribute_path_map_add("AttributesToGet", ret, rjson::to_string(*it));
            validate_attr_name_length("AttributesToGet", it->GetStringLength(), false);
        }
        if (ret.empty()) {
            throw api_error::validation("Empty AttributesToGet is not allowed. Consider using Select=COUNT instead.");
        }
        return ret;
    } else if (has_projection_expression) {
        // Newer API: a parsed expression of document paths, possibly using
        // #name placeholders resolved via ExpressionAttributeNames.
        const rjson::value& projection_expression = req["ProjectionExpression"];
        const rjson::value* expression_attribute_names = rjson::find(req, "ExpressionAttributeNames");
        std::vector<parsed::path> paths_to_get;
        try {
            paths_to_get = parsed_expression_cache.parse_projection_expression(rjson::to_string_view(projection_expression));
        } catch(expressions_syntax_error& e) {
            // Surface parser errors as a ValidationException, as DynamoDB does.
            throw api_error::validation(e.what());
        }
        resolve_projection_expression(paths_to_get, expression_attribute_names, used_attribute_names);
        attrs_to_get ret;
        for (const parsed::path& p : paths_to_get) {
            attribute_path_map_add("ProjectionExpression", ret, p);
        }
        return ret;
    }
    // An disengaged optional asks to read everything
    return std::nullopt;
}
|
|
|
|
/**
|
|
* Helper routine to extract data when we already have
|
|
* row, etc etc.
|
|
*
|
|
* Note: include_all_embedded_attributes means we should
|
|
* include all values in the `ATTRS_COLUMN_NAME` map column.
|
|
*
|
|
* We could change the behaviour to simply include all values
|
|
* from this column if the `ATTRS_COLUMN_NAME` is explicit in
|
|
* `attrs_to_get`, but I am scared to do that now in case
|
|
* there is some corner case in existing code.
|
|
*
|
|
* Explicit bool means we can be sure all previous calls are
|
|
* as before.
|
|
*/
|
|
// Convert one query result row into a DynamoDB-style JSON item, keeping
// only the attributes (or sub-paths) listed in attrs_to_get (a disengaged
// optional keeps everything). Key columns are stored as real schema columns
// while all other attributes live serialized inside the ATTRS_COLUMN_NAME
// map column. If item_length_in_bytes is non-null, the item's DynamoDB-style
// length is accumulated into it - counting even attributes that are
// filtered out of the returned item.
void executor::describe_single_item(const cql3::selection::selection& selection,
        const std::vector<managed_bytes_opt>& result_row,
        const std::optional<attrs_to_get>& attrs_to_get,
        rjson::value& item,
        uint64_t* item_length_in_bytes,
        bool include_all_embedded_attributes)
{
    const auto& columns = selection.get_columns();
    auto column_it = columns.begin();
    // result_row cells are in the same order as selection.get_columns(),
    // so advance column_it in lockstep with the cells.
    for (const managed_bytes_opt& cell : result_row) {
        if (!cell) {
            // Null cell - the attribute is absent from this item.
            ++column_it;
            continue;
        }
        std::string column_name = (*column_it)->name_as_text();
        if (column_name != executor::ATTRS_COLUMN_NAME) {
            // A real schema column (a key attribute).
            if (item_length_in_bytes) {
                (*item_length_in_bytes) += column_name.length() + cell->size();
            }
            if (!attrs_to_get || attrs_to_get->contains(column_name)) {
                // item is expected to start empty, and column_name are unique
                // so add() makes sense
                rjson::add_with_string_name(item, column_name, rjson::empty_object());
                rjson::value& field = item[column_name.c_str()];
                cell->with_linearized([&] (bytes_view linearized_cell) {
                    rjson::add_with_string_name(field, type_to_string((*column_it)->type), json_key_column_value(linearized_cell, **column_it));
                });
            }
        } else {
            // The catch-all attributes map: deserialize it and walk every
            // stored (attribute name, serialized value) entry.
            auto deserialized = attrs_type()->deserialize(*cell);
            auto keys_and_values = value_cast<map_type_impl::native_type>(deserialized);
            for (auto entry : keys_and_values) {
                std::string attr_name = value_cast<sstring>(entry.first);
                if (item_length_in_bytes) {
                    (*item_length_in_bytes) += attr_name.length();
                }
                if (include_all_embedded_attributes || !attrs_to_get || attrs_to_get->contains(attr_name)) {
                    bytes value = value_cast<bytes>(entry.second);
                    if (item_length_in_bytes && value.length()) {
                        // ScyllaDB uses one extra byte compared to DynamoDB for the bytes length
                        (*item_length_in_bytes) += value.length() - 1;
                    }
                    rjson::value v = deserialize_item(value);
                    if (attrs_to_get) {
                        auto it = attrs_to_get->find(attr_name);
                        if (it != attrs_to_get->end()) {
                            // attrs_to_get may have asked for only part of
                            // this attribute. hierarchy_filter() modifies v,
                            // and returns false when nothing is to be kept.
                            if (!hierarchy_filter(v, it->second)) {
                                continue;
                            }
                        }
                    }
                    // item is expected to start empty, and attribute
                    // names are unique so add() makes sense
                    rjson::add_with_string_name(item, attr_name, std::move(v));
                } else if (item_length_in_bytes) {
                    // Attribute not requested - still counts toward the
                    // item's length (same -1 serialization adjustment).
                    (*item_length_in_bytes) += value_cast<bytes>(entry.second).length() - 1;
                }
            }
        }
        ++column_it;
    }
}
|
|
|
|
// Build the JSON representation of the single item contained in a query
// result. Returns a disengaged optional when the result is empty; throws
// std::logic_error if the result unexpectedly holds more than one row
// (such results must go through describe_multi_item() instead).
std::optional<rjson::value> executor::describe_single_item(schema_ptr schema,
        const query::partition_slice& slice,
        const cql3::selection::selection& selection,
        const query::result& query_result,
        const std::optional<attrs_to_get>& attrs_to_get,
        uint64_t* item_length_in_bytes) {
    cql3::selection::result_set_builder rs_builder(selection, gc_clock::now());
    query::result_view::consume(query_result, slice,
            cql3::selection::result_set_builder::visitor(rs_builder, *schema, selection));

    auto rows = rs_builder.build();
    if (rows->empty()) {
        if (item_length_in_bytes) {
            // An empty result is counted as having a minimal length (1 byte).
            (*item_length_in_bytes) += 1;
        }
        // With no matching item we must return a response without an "Item"
        // member at all - not one with an empty Item member.
        return std::nullopt;
    }
    if (rows->size() > 1) {
        // The caller should have used describe_multi_item() for this result.
        throw std::logic_error("describe_single_item() asked to describe multiple items");
    }
    rjson::value item = rjson::empty_object();
    describe_single_item(selection, *rows->rows().begin(), attrs_to_get, item, item_length_in_bytes);
    return item;
}
|
|
|
|
// Build the JSON representation of every row in a query result, yielding
// between rows. item_callback is invoked with each item's computed length
// in bytes (used by callers for RCU accounting).
future<std::vector<rjson::value>> executor::describe_multi_item(schema_ptr schema,
        const query::partition_slice&& slice,
        shared_ptr<cql3::selection::selection> selection,
        foreign_ptr<lw_shared_ptr<query::result>> query_result,
        shared_ptr<const std::optional<attrs_to_get>> attrs_to_get,
        noncopyable_function<void(uint64_t)> item_callback) {
    cql3::selection::result_set_builder rs_builder(*selection, gc_clock::now());
    query::result_view::consume(*query_result, slice,
            cql3::selection::result_set_builder::visitor(rs_builder, *schema, *selection));
    auto rows = rs_builder.build();
    std::vector<rjson::value> items;
    for (auto& row : rows->rows()) {
        rjson::value item = rjson::empty_object();
        uint64_t length = 0;
        describe_single_item(*selection, row, *attrs_to_get, item, &length);
        item_callback(length);
        items.push_back(std::move(item));
        // Large result sets: let other tasks run between rows.
        co_await coroutine::maybe_yield();
    }
    co_return items;
}
|
|
|
|
// Does evaluating this expression value require reading the existing item?
// A constant never does; a document path always does; a function call does
// if any of its arguments does.
static bool check_needs_read_before_write(const parsed::value& v) {
    return std::visit(overloaded_functor {
        [] (const parsed::constant&) -> bool {
            return false;
        },
        [] (const parsed::value::function_call& f) -> bool {
            for (const parsed::value& argument : f._parameters) {
                if (check_needs_read_before_write(argument)) {
                    return true;
                }
            }
            return false;
        },
        [] (const parsed::path&) -> bool {
            return true;
        }
    }, v._value);
}
|
|
|
|
static bool check_needs_read_before_write(const attribute_path_map<parsed::update_expression::action>& update_expression) {
|
|
return std::ranges::any_of(update_expression, [](const auto& p) {
|
|
if (!p.second.has_value()) {
|
|
// If the action is not on the top-level attribute, we need to
|
|
// read the old item: we change only a part of the top-level
|
|
// attribute, and write the full top-level attribute back.
|
|
return true;
|
|
}
|
|
// Otherwise, the action p.second.get_value() is just on top-level
|
|
// attribute. Check if it needs read-before-write:
|
|
return std::visit(overloaded_functor {
|
|
[&] (const parsed::update_expression::action::set& a) -> bool {
|
|
return check_needs_read_before_write(a._rhs._v1) || (a._rhs._op != 'v' && check_needs_read_before_write(a._rhs._v2));
|
|
},
|
|
[&] (const parsed::update_expression::action::remove& a) -> bool {
|
|
return false;
|
|
},
|
|
[&] (const parsed::update_expression::action::add& a) -> bool {
|
|
return true;
|
|
},
|
|
[&] (const parsed::update_expression::action::del& a) -> bool {
|
|
return true;
|
|
}
|
|
}, p.second.get_value()._action);
|
|
});
|
|
}
|
|
|
|
/*!
|
|
* \brief estimate_value_size provides a rough size estimation
|
|
* for an rjson value object.
|
|
*
|
|
* When calculating RCU and WCU, we need to determine the length of the JSON representation
|
|
* (specifically, the length of each key and each value).
|
|
*
|
|
* When possible, this is calculated as a side effect of other operations.
|
|
* estimate_value_size is used when this calculation cannot be performed directly,
|
|
* but we still need an estimated value.
|
|
*
|
|
* It achieves this without streaming any values and uses a fixed size for numbers.
|
|
* The aim is not to provide a perfect 1-to-1 size calculation, as WCU is calculated
|
|
* in 1KB units. A ballpark estimate is sufficient.
|
|
*/
|
|
static size_t estimate_value_size(const rjson::value& value) {
    // Strings: the payload length itself.
    if (value.IsString()) {
        return value.GetStringLength();
    }
    // Numbers: a fixed 8-byte estimate (exact width is irrelevant for
    // 1KB-granularity WCU accounting).
    if (value.IsNumber()) {
        return 8;
    }
    // Booleans: a fixed 5-byte estimate.
    if (value.IsBool()) {
        return 5;
    }
    // Arrays: the sum of the recursively-estimated element sizes.
    if (value.IsArray()) {
        size_t total = 0;
        for (const auto& element : value.GetArray()) {
            total += estimate_value_size(element);
        }
        return total;
    }
    // Objects: each key's length plus its recursively-estimated value size.
    if (value.IsObject()) {
        size_t total = 0;
        for (auto it = value.MemberBegin(); it != value.MemberEnd(); ++it) {
            total += it->name.GetStringLength();
            total += estimate_value_size(it->value);
        }
        return total;
    }
    // Anything else (e.g., null) contributes nothing to the estimate.
    return 0;
}
|
|
|
|
// An UpdateItem read-modify-write operation. The constructor parses and
// validates the request; apply() later builds the mutation (possibly using
// the previously-read item).
class update_item_operation : public rmw_operation {
public:
    // Some information parsed during the constructor to check for input
    // errors, and cached to be used again during apply().
    // Points into _request (set via rjson::find in the constructor), or
    // nullptr if the request has no AttributeUpdates.
    rjson::value* _attribute_updates;
    // Instead of keeping a parsed::update_expression with an unsorted
    // list of actions, we keep them in an attribute_path_map which groups
    // them by top-level attribute, and detects forbidden overlaps/conflicts.
    attribute_path_map<parsed::update_expression::action> _update_expression;

    // Saved list of GSI keys in the table being updated, used for
    // validate_value_if_index_key()
    std::unordered_map<bytes, std::string> _key_attributes;

    // Parsed ConditionExpression (may be empty if the request had none).
    parsed::condition_expression _condition_expression;

    update_item_operation(parsed::expression_cache& parsed_expression_cache, service::storage_proxy& proxy, rjson::value&& request);
    virtual ~update_item_operation() = default;
    // Build the mutation implementing this update, or an empty optional if
    // the update's condition was not fulfilled.
    virtual std::optional<mutation> apply(std::unique_ptr<rjson::value> previous_item, api::timestamp_type ts, cdc::per_request_options& cdc_opts) const override;
    // Whether apply() will need the pre-existing item to be read first.
    bool needs_read_before_write() const;

private:
    // Delete one top-level attribute, also updating _return_attributes
    // according to the requested ReturnValues mode.
    void delete_attribute(bytes&& column_name, const std::unique_ptr<rjson::value>& previous_item, const api::timestamp_type ts, deletable_row& row,
        attribute_collector& modified_attrs) const;
    // Write one top-level attribute's new value, also updating
    // _return_attributes according to the requested ReturnValues mode.
    void update_attribute(bytes&& column_name, const rjson::value& json_value, const std::unique_ptr<rjson::value>& previous_item, const api::timestamp_type ts,
        deletable_row& row, attribute_collector& modified_attrs, const attribute_path_map_node<parsed::update_expression::action>* h = nullptr) const;
    // Apply the old-style AttributeUpdates parameter.
    void apply_attribute_updates(const std::unique_ptr<rjson::value>& previous_item, const api::timestamp_type ts, deletable_row& row,
        attribute_collector& modified_attrs, bool& any_updates, bool& any_deletes) const;
    // Apply the new-style UpdateExpression parameter.
    void apply_update_expression(const std::unique_ptr<rjson::value>& previous_item, const api::timestamp_type ts, deletable_row& row,
        attribute_collector& modified_attrs, bool& any_updates, bool& any_deletes) const;
};
|
|
|
|
// Parse and validate an UpdateItem request. Throws api_error::validation on
// any malformed or inconsistent input. Also pre-computes information reused
// by apply(): the parsed expressions, the key, a first estimate of consumed
// capacity, and the set of GSI/LSI key attributes of this table.
update_item_operation::update_item_operation(parsed::expression_cache& parsed_expression_cache, service::storage_proxy& proxy, rjson::value&& update_info)
    : rmw_operation(proxy, std::move(update_info))
{
    const rjson::value* key = rjson::find(_request, "Key");
    if (!key) {
        throw api_error::validation("UpdateItem requires a Key parameter");
    }
    _pk = pk_from_json(*key, _schema);
    _ck = ck_from_json(*key, _schema);
    check_key(*key, _schema);

    const rjson::value* expression_attribute_names = rjson::find(_request, "ExpressionAttributeNames");
    const rjson::value* expression_attribute_values = rjson::find(_request, "ExpressionAttributeValues");
    // Track which #name/:value references were actually used, so we can
    // reject requests with unused entries (as DynamoDB does).
    std::unordered_set<std::string> used_attribute_names;
    std::unordered_set<std::string> used_attribute_values;

    const rjson::value* update_expression = rjson::find(_request, "UpdateExpression");
    if (update_expression) {
        if (!update_expression->IsString()) {
            throw api_error::validation("UpdateExpression must be a string");
        }
        try {
            parsed::update_expression expr = parsed_expression_cache.parse_update_expression(rjson::to_string_view(*update_expression));
            resolve_update_expression(expr,
                expression_attribute_names, expression_attribute_values,
                used_attribute_names, used_attribute_values);
            for (auto& action : expr.actions()) {
                // Unfortunately we need to copy the action's path, because
                // we std::move the action object.
                auto p = action._path;
                // attribute_path_map_add() also detects forbidden overlaps
                // between paths used in different actions.
                attribute_path_map_add("UpdateExpression", _update_expression, p, std::move(action));
            }
        } catch(expressions_syntax_error& e) {
            throw api_error::validation(e.what());
        }
    }
    _attribute_updates = rjson::find(_request, "AttributeUpdates");
    if (_attribute_updates) {
        if (!_attribute_updates->IsObject()) {
            throw api_error::validation("AttributeUpdates must be an object");
        }
        for (auto it = std::as_const(*_attribute_updates).MemberBegin(); it != std::as_const(*_attribute_updates).MemberEnd(); ++it) {
            validate_attr_name_length("AttributeUpdates", it->name.GetStringLength(), false);
        }
    }

    _condition_expression = get_parsed_condition_expression(parsed_expression_cache, _request);
    resolve_condition_expression(_condition_expression,
        expression_attribute_names, expression_attribute_values,
        used_attribute_names, used_attribute_values);

    verify_all_are_used(expression_attribute_names, used_attribute_names, "ExpressionAttributeNames", "UpdateItem");
    verify_all_are_used(expression_attribute_values, used_attribute_values, "ExpressionAttributeValues", "UpdateItem");

    // DynamoDB forbids having both old-style AttributeUpdates or Expected
    // and new-style UpdateExpression or ConditionExpression in the same request.
    // Note: these messages have no placeholders, so they are passed as plain
    // literals - no format() call is needed.
    const rjson::value* expected = rjson::find(_request, "Expected");
    if (update_expression && _attribute_updates) {
        throw api_error::validation(
            "UpdateItem does not allow both AttributeUpdates and UpdateExpression to be given together");
    }
    if (update_expression && expected) {
        throw api_error::validation(
            "UpdateItem does not allow both old-style Expected and new-style UpdateExpression to be given together");
    }
    if (_attribute_updates && !_condition_expression.empty()) {
        throw api_error::validation(
            "UpdateItem does not allow both old-style AttributeUpdates and new-style ConditionExpression to be given together");
    }
    if (_pk.representation().size() > 2) {
        // ScyllaDB uses two extra bytes compared to DynamoDB for the key bytes length
        _consumed_capacity._total_bytes += _pk.representation().size() - 2;
    }
    if (_ck.representation().size() > 2) {
        // ScyllaDB uses two extra bytes compared to DynamoDB for the key bytes length
        _consumed_capacity._total_bytes += _ck.representation().size() - 2;
    }
    if (expression_attribute_names) {
        _consumed_capacity._total_bytes += estimate_value_size(*expression_attribute_names);
    }
    if (expression_attribute_values) {
        _consumed_capacity._total_bytes += estimate_value_size(*expression_attribute_values);
    }

    // Cache the GSI/LSI key attributes for validate_value_if_index_key().
    _key_attributes = si_key_attributes(proxy.data_dictionary().find_table(
        _schema->ks_name(), _schema->cf_name()));
}
|
|
|
|
// These are the cases where update_item_operation::apply() needs to use
|
|
// "previous_item" for certain AttributeUpdates operations (ADD or DELETE)
|
|
// These are the cases where update_item_operation::apply() needs to use
// "previous_item" for certain AttributeUpdates operations (ADD or DELETE)
static bool check_needs_read_before_write_attribute_updates(rjson::value *attribute_updates) {
    if (!attribute_updates) {
        return false;
    }
    // We already confirmed in update_item_operation::update_item_operation()
    // that _attribute_updates, when it exists, is a map
    for (auto member = attribute_updates->MemberBegin(); member != attribute_updates->MemberEnd(); ++member) {
        rjson::value* action = rjson::find(member->value, "Action");
        if (!action) {
            continue;
        }
        std::string_view action_name = rjson::to_string_view(*action);
        // ADD always reads the old value. DELETE only needs a read before
        // write if the "Value" option is used (set subtraction); without it,
        // it's just a delete.
        if (action_name == "ADD" ||
            (action_name == "DELETE" && member->value.HasMember("Value"))) {
            return true;
        }
    }
    return false;
}
|
|
|
|
bool
|
|
update_item_operation::needs_read_before_write() const {
|
|
return check_needs_read_before_write(_update_expression) ||
|
|
check_needs_read_before_write(_condition_expression) ||
|
|
check_needs_read_before_write_attribute_updates(_attribute_updates) ||
|
|
_request.HasMember("Expected") ||
|
|
(_returnvalues != returnvalues::NONE && _returnvalues != returnvalues::UPDATED_NEW);
|
|
}
|
|
|
|
// action_result() returns the result of applying an UpdateItem action -
|
|
// this result is either a JSON object or an unset optional which indicates
|
|
// the action was a deletion. The caller (update_item_operation::apply()
|
|
// below) will either write this JSON as the content of a column, or
|
|
// use it as a piece in a bigger top-level attribute.
|
|
static std::optional<rjson::value> action_result(
        const parsed::update_expression::action& action,
        const rjson::value* previous_item) {
    return std::visit(overloaded_functor {
        // SET: the new value is simply the evaluated right-hand side.
        [&] (const parsed::update_expression::action::set& a) -> std::optional<rjson::value> {
            return calculate_value(a._rhs, previous_item);
        },
        // REMOVE: an unset optional tells the caller to delete.
        [&] (const parsed::update_expression::action::remove& a) -> std::optional<rjson::value> {
            return std::nullopt;
        },
        // ADD: numeric addition or set union with the existing value.
        [&] (const parsed::update_expression::action::add& a) -> std::optional<rjson::value> {
            parsed::value base;
            parsed::value addition;
            base.set_path(action._path);
            addition.set_constant(a._valref);
            // v1 is the current value at the path (Null if absent), v2 the
            // operand to add.
            rjson::value v1 = calculate_value(base, calculate_value_caller::UpdateExpression, previous_item);
            rjson::value v2 = calculate_value(addition, calculate_value_caller::UpdateExpression, previous_item);
            rjson::value result;
            // An ADD can be used to create a new attribute (when
            // v1.IsNull()) or to add to a pre-existing attribute:
            if (v1.IsNull()) {
                const auto v2_type = get_item_type_string(v2);
                if (v2_type == "N" || v2_type == "SS" || v2_type == "NS" || v2_type == "BS") {
                    result = v2;
                } else {
                    throw api_error::validation(format("An operand in the update expression has an incorrect data type: {}", v2));
                }
            } else {
                // The operand's type must match the existing value's type.
                const auto v1_type = get_item_type_string(v1);
                if (v1_type == "N") {
                    if (get_item_type_string(v2) != "N") {
                        throw api_error::validation(fmt::format("Incorrect operand type for operator or function. Expected {}: {}", v1_type, rjson::print(v2)));
                    }
                    result = number_add(v1, v2);
                } else if (v1_type == "SS" || v1_type == "NS" || v1_type == "BS") {
                    if (get_item_type_string(v2) != v1_type) {
                        throw api_error::validation(fmt::format("Incorrect operand type for operator or function. Expected {}: {}", v1_type, rjson::print(v2)));
                    }
                    result = set_sum(v1, v2);
                } else {
                    throw api_error::validation(format("An operand in the update expression has an incorrect data type: {}", v1));
                }
            }
            return result;
        },
        // DELETE: remove elements of a set (set difference). set_diff()
        // returns nullopt when the result is an empty set, which deletes
        // the attribute (DynamoDB forbids empty sets).
        [&] (const parsed::update_expression::action::del& a) -> std::optional<rjson::value> {
            parsed::value base;
            parsed::value subset;
            base.set_path(action._path);
            subset.set_constant(a._valref);
            rjson::value v1 = calculate_value(base, calculate_value_caller::UpdateExpression, previous_item);
            rjson::value v2 = calculate_value(subset, calculate_value_caller::UpdateExpression, previous_item);
            if (!v1.IsNull()) {
                return set_diff(v1, v2);
            }
            // When we return nullopt here, we ask to *delete* this attribute,
            // which is unnecessary because we know the attribute does not
            // exist anyway. This is a waste, but a small one. Note that also
            // for the "remove" action above we don't bother to check if the
            // previous_item has anything to remove.
            return std::nullopt;
        }
    }, action._action);
}
|
|
|
|
}
|
|
|
|
// Print an attribute_path_map_node<action> as the list of paths it contains:
|
|
template <> struct fmt::formatter<alternator::attribute_path_map_node<alternator::parsed::update_expression::action>> {
    // No format-spec options supported - accept only the empty "{}".
    constexpr auto parse(format_parse_context& ctx) { return ctx.begin(); }
    // this function recursively calls into itself, so we have to forward declare it.
    auto format(const alternator::attribute_path_map_node<alternator::parsed::update_expression::action>& h, fmt::format_context& ctx) const
        -> decltype(ctx.out());
};
|
|
|
|
auto fmt::formatter<alternator::attribute_path_map_node<alternator::parsed::update_expression::action>>::format(const alternator::attribute_path_map_node<alternator::parsed::update_expression::action>& h, fmt::format_context& ctx) const
    -> decltype(ctx.out()) {
    auto sink = ctx.out();
    // A leaf holds a single action: print its path. Otherwise, recurse
    // into whichever kind of children (map members or list indexes) exist.
    if (h.has_value()) {
        return fmt::format_to(sink, " {}", h.get_value()._path);
    }
    if (h.has_members()) {
        for (const auto& child : h.get_members()) {
            sink = fmt::format_to(sink, "{}", *child.second);
        }
    } else if (h.has_indexes()) {
        for (const auto& child : h.get_indexes()) {
            sink = fmt::format_to(sink, "{}", *child.second);
        }
    }
    return sink;
}
|
|
|
|
namespace alternator {
|
|
|
|
// Apply the hierarchy of actions in an attribute_path_map_node<action> to a
|
|
// JSON object which uses DynamoDB's serialization conventions. The complete,
|
|
// unmodified, previous_item is also necessary for the right-hand sides of the
|
|
// actions. Modifies obj in-place or returns false if it is to be removed.
|
|
static bool hierarchy_actions(
        rjson::value& obj,
        const attribute_path_map_node<parsed::update_expression::action>& h,
        const rjson::value* previous_item)
{
    // A serialized value is an object with exactly one member, whose name
    // is the type tag (e.g., "M" for map, "L" for list).
    if (!obj.IsObject() || obj.MemberCount() != 1) {
        // This shouldn't happen. We shouldn't have stored malformed objects.
        // But today Alternator does not validate the structure of nested
        // documents before storing them, so this can happen on read.
        throw api_error::validation(format("Malformed value object read: {}", obj));
    }
    const char* type = obj.MemberBegin()->name.GetString();
    rjson::value& v = obj.MemberBegin()->value;
    if (h.has_value()) {
        // Action replacing everything in this position in the hierarchy
        std::optional<rjson::value> newv = action_result(h.get_value(), previous_item);
        if (newv) {
            obj = std::move(*newv);
        } else {
            // The action was a removal - tell the caller to drop obj.
            return false;
        }
    } else if (h.has_members()) {
        if (type[0] != 'M' || !v.IsObject()) {
            // A .something on a non-map doesn't work.
            throw api_error::validation(fmt::format("UpdateExpression: document paths not valid for this item:{}", h));
        }
        for (const auto& member : h.get_members()) {
            std::string attr = member.first;
            const attribute_path_map_node<parsed::update_expression::action>& subh = *member.second;
            rjson::value *subobj = rjson::find(v, attr);
            if (subobj) {
                // Recurse into the existing sub-attribute; a false return
                // means the recursion asked to remove it.
                if (!hierarchy_actions(*subobj, subh, previous_item)) {
                    rjson::remove_member(v, attr);
                }
            } else {
                // When a.b does not exist, setting a.b itself (i.e.
                // subh.has_value()) is fine, but setting a.b.c is not.
                if (subh.has_value()) {
                    std::optional<rjson::value> newv = action_result(subh.get_value(), previous_item);
                    if (newv) {
                        // This is the !subobj case, so v doesn't have an
                        // attr member so we can use add()
                        rjson::add_with_string_name(v, attr, std::move(*newv));
                    } else {
                        // Removing a.b when a is a map but a.b doesn't exist
                        // is silently ignored. It's not considered an error.
                    }
                } else {
                    throw api_error::validation(format("UpdateExpression: document paths not valid for this item:{}", h));
                }
            }
        }
    } else if (h.has_indexes()) {
        if (type[0] != 'L' || !v.IsArray()) {
            // A [i] on a non-list doesn't work.
            throw api_error::validation(format("UpdateExpression: document paths not valid for this item:{}", h));
        }
        unsigned nremoved = 0;
        for (const auto& index : h.get_indexes()) {
            unsigned i = index.first - nremoved;
            const attribute_path_map_node<parsed::update_expression::action>& subh = *index.second;
            if (i < v.Size()) {
                if (!hierarchy_actions(v[i], subh, previous_item)) {
                    v.Erase(v.Begin() + i);
                    // If we have the actions "REMOVE a[1] SET a[3] = :val",
                    // the index 3 refers to the original indexes, before any
                    // items were removed. So we offset the next indexes
                    // (which are guaranteed to be higher than i - indexes is
                    // a sorted map) by an increased "nremoved".
                    nremoved++;
                }
            } else {
                // If a[7] does not exist, setting a[7] itself (i.e.
                // subh.has_value()) is fine - and appends an item, though
                // not necessarily with index 7. But setting a[7].b will
                // not work.
                if (subh.has_value()) {
                    std::optional<rjson::value> newv = action_result(subh.get_value(), previous_item);
                    if (newv) {
                        rjson::push_back(v, std::move(*newv));
                    } else {
                        // Removing a[7] when the list has fewer elements is
                        // silently ignored. It's not considered an error.
                    }
                } else {
                    throw api_error::validation(format("UpdateExpression: document paths not valid for this item:{}", h));
                }
            }
        }
    }
    return true;
}
|
|
|
|
void update_item_operation::delete_attribute(bytes&& column_name, const std::unique_ptr<rjson::value>& previous_item, const api::timestamp_type ts,
        deletable_row& row, attribute_collector& modified_attrs) const {
    // First, reflect the deletion in _return_attributes, depending on the
    // requested ReturnValues mode:
    switch (_returnvalues) {
    case returnvalues::ALL_NEW:
        // The attribute no longer exists in the new item.
        rjson::remove_member(_return_attributes, to_string_view(column_name));
        break;
    case returnvalues::UPDATED_OLD:
        if (previous_item) {
            std::string_view attr_name = to_string_view(column_name);
            const rjson::value* old_value = rjson::find(*previous_item, attr_name);
            if (old_value) {
                // In the UPDATED_OLD case the item starts empty and column
                // names are unique, so we can use add()
                rjson::add_with_string_name(_return_attributes, attr_name, rjson::copy(*old_value));
            }
        }
        break;
    default:
        break;
    }
    // Then write the actual deletion - either as a dead cell for a real
    // (schema) column, or as a deletion inside the attrs map.
    const column_definition* col_def = find_attribute(*_schema, column_name);
    if (col_def == nullptr) {
        modified_attrs.del(std::move(column_name), ts);
    } else {
        row.cells().apply(*col_def, atomic_cell::make_dead(ts, gc_clock::now()));
    }
}
|
|
|
|
// Write one top-level attribute's new value into the mutation (row or
// modified_attrs), and record the appropriate entry in _return_attributes
// according to the requested ReturnValues mode. When h is non-null, the
// operation only touched specific paths inside the attribute, and only
// those paths should appear in the UPDATED_OLD/UPDATED_NEW response.
void update_item_operation::update_attribute(bytes&& column_name, const rjson::value& json_value, const std::unique_ptr<rjson::value>& previous_item,
        const api::timestamp_type ts, deletable_row& row, attribute_collector& modified_attrs,
        const attribute_path_map_node<parsed::update_expression::action>* h) const {
    // Common helper for the UPDATED_NEW and UPDATED_OLD cases: add v to
    // _return_attributes under name cn. If the operation was restricted
    // to specific paths (h != nullptr), keep only those paths, and skip
    // the add entirely when nothing survives the filter. In both cases
    // _return_attributes starts empty and attribute names are unique, so
    // we can use add().
    auto add_if_relevant = [this, h] (std::string_view cn, rjson::value&& v) {
        if (!h || hierarchy_filter(v, *h)) {
            rjson::add_with_string_name(_return_attributes, cn, std::move(v));
        }
    };
    if (_returnvalues == returnvalues::ALL_NEW) {
        rjson::replace_with_string_name(_return_attributes, to_string_view(column_name), rjson::copy(json_value));
    } else if (_returnvalues == returnvalues::UPDATED_NEW) {
        add_if_relevant(to_string_view(column_name), rjson::copy(json_value));
    } else if (_returnvalues == returnvalues::UPDATED_OLD && previous_item) {
        std::string_view cn = to_string_view(column_name);
        const rjson::value* col = rjson::find(*previous_item, cn);
        if (col) {
            add_if_relevant(cn, rjson::copy(*col));
        }
    }
    const column_definition* cdef = find_attribute(*_schema, column_name);
    if (cdef) {
        // A real (schema) column - serialize with the column's type.
        bytes column_value = get_key_from_typed_value(json_value, *cdef);
        row.cells().apply(*cdef, atomic_cell::make_live(*cdef->type, ts, column_value));
    } else {
        // This attribute may be a key column of one of the GSIs or LSIs,
        // in which case there are some limitations on the value.
        validate_value_if_index_key(_key_attributes, column_name, json_value);
        modified_attrs.put(std::move(column_name), serialize_item(json_value), ts);
    }
}
|
|
|
|
// Apply the old-style AttributeUpdates parameter: each entry names a
// top-level attribute and an Action (PUT, DELETE or ADD). any_updates /
// any_deletes are set so the caller can decide whether a row marker is
// needed.
inline void update_item_operation::apply_attribute_updates(const std::unique_ptr<rjson::value>& previous_item, const api::timestamp_type ts, deletable_row& row,
        attribute_collector& modified_attrs, bool& any_updates, bool& any_deletes) const {
    for (auto it = _attribute_updates->MemberBegin(); it != _attribute_updates->MemberEnd(); ++it) {
        // Note that it.key() is the name of the column, *it is the operation
        bytes column_name = to_bytes(rjson::to_string_view(it->name));
        const column_definition* cdef = _schema->get_column_definition(column_name);
        if (cdef && cdef->is_primary_key()) {
            throw api_error::validation(format("UpdateItem cannot update key column {}", rjson::to_string_view(it->name)));
        }
        std::string action = rjson::to_string((it->value)["Action"]);
        if (action == "DELETE") {
            // The DELETE operation can do two unrelated tasks. Without a
            // "Value" option, it is used to delete an attribute. With a
            // "Value" option, it is used to delete a set of elements from
            // a set attribute of the same type.
            if (it->value.HasMember("Value")) {
                // Subtracting sets needs a read of previous_item, so
                // check_needs_read_before_write_attribute_updates()
                // returns true in this case, and previous_item is
                // available to us when the item exists.
                const rjson::value* v1 = previous_item ? rjson::find(*previous_item, to_string_view(column_name)) : nullptr;
                const rjson::value& v2 = (it->value)["Value"];
                validate_value(v2, "AttributeUpdates");
                const auto v2_type = get_item_type_string(v2);
                if (v2_type != "SS" && v2_type != "NS" && v2_type != "BS") {
                    throw api_error::validation(fmt::format("AttributeUpdates DELETE operation with Value only valid for sets, got type {}", v2_type));
                }
                if (v1) {
                    std::optional<rjson::value> result = set_diff(*v1, v2);
                    if (result) {
                        any_updates = true;
                        update_attribute(std::move(column_name), *result, previous_item, ts, row, modified_attrs);
                    } else {
                        // DynamoDB does not allow empty sets - if the
                        // result is empty, delete the attribute.
                        any_deletes = true;
                        delete_attribute(std::move(column_name), previous_item, ts, row, modified_attrs);
                    }
                } else {
                    // if the attribute or item don't exist, the DELETE
                    // operation should silently do nothing - and not
                    // create an empty item. It's a waste to call
                    // do_delete() on an attribute we already know is
                    // deleted, so we can just mark any_deletes = true.
                    any_deletes = true;
                }
            } else {
                // Plain deletion of the attribute.
                any_deletes = true;
                delete_attribute(std::move(column_name), previous_item, ts, row, modified_attrs);
            }
        } else if (action == "PUT") {
            // PUT simply replaces the attribute with the given value.
            const rjson::value& value = (it->value)["Value"];
            validate_value(value, "AttributeUpdates");
            any_updates = true;
            update_attribute(std::move(column_name), value, previous_item, ts, row, modified_attrs);
        } else if (action == "ADD") {
            // Note that check_needs_read_before_write_attribute_updates()
            // made sure we retrieved previous_item (if exists) when there
            // is an ADD action.
            const rjson::value* v1 = previous_item ? rjson::find(*previous_item, to_string_view(column_name)) : nullptr;
            const rjson::value& v2 = (it->value)["Value"];
            validate_value(v2, "AttributeUpdates");
            // An ADD can be used to create a new attribute (when
            // !v1) or to add to a pre-existing attribute:
            if (!v1) {
                const auto v2_type = get_item_type_string(v2);
                if (v2_type == "N" || v2_type == "SS" || v2_type == "NS" || v2_type == "BS" || v2_type == "L") {
                    any_updates = true;
                    update_attribute(std::move(column_name), v2, previous_item, ts, row, modified_attrs);
                } else {
                    throw api_error::validation(format("An operand in the AttributeUpdates ADD has an incorrect data type: {}", v2));
                }
            } else {
                const auto v1_type = get_item_type_string(*v1);
                const auto v2_type = get_item_type_string(v2);
                if (v2_type != v1_type) {
                    throw api_error::validation(fmt::format("Operand type mismatch in AttributeUpdates ADD. Expected {}, got {}", v1_type, v2_type));
                }
                if (v1_type == "N") {
                    any_updates = true;
                    update_attribute(std::move(column_name), number_add(*v1, v2), previous_item, ts, row, modified_attrs);
                } else if (v1_type == "SS" || v1_type == "NS" || v1_type == "BS") {
                    any_updates = true;
                    update_attribute(std::move(column_name), set_sum(*v1, v2), previous_item, ts, row, modified_attrs);
                } else if (v1_type == "L") {
                    // The DynamoDB documentation doesn't say it supports
                    // lists in ADD operations, but it turns out that it
                    // does. Interestingly, this is only true for
                    // AttributeUpdates (this code) - the similar ADD
                    // in UpdateExpression doesn't support lists.
                    any_updates = true;
                    update_attribute(std::move(column_name), list_concatenate(*v1, v2), previous_item, ts, row, modified_attrs);
                } else {
                    throw api_error::validation(format("An operand in the AttributeUpdates ADD has an incorrect data type: {}", *v1));
                }
            }
        } else {
            throw api_error::validation(fmt::format("Unknown Action value '{}' in AttributeUpdates", action));
        }
    }
}
|
|
|
|
// Apply the new-style UpdateExpression parameter, whose actions were
// grouped by top-level attribute into _update_expression during parsing.
// any_updates / any_deletes are set so the caller can decide whether a
// row marker is needed.
inline void update_item_operation::apply_update_expression(const std::unique_ptr<rjson::value>& previous_item, const api::timestamp_type ts, deletable_row& row,
        attribute_collector& modified_attrs, bool& any_updates, bool& any_deletes) const {
    for (auto& actions : _update_expression) {
        // The actions of _update_expression are grouped by top-level
        // attributes. Here, all actions in actions.second share the same
        // top-level attribute actions.first.
        std::string column_name = actions.first;
        const column_definition* cdef = _schema->get_column_definition(to_bytes(column_name));
        if (cdef && cdef->is_primary_key()) {
            throw api_error::validation(fmt::format("UpdateItem cannot update key column {}", column_name));
        }
        if (actions.second.has_value()) {
            // An action on a top-level attribute column_name. The single
            // action is actions.second.get_value(). We can simply invoke
            // the action and replace the attribute with its result:
            std::optional<rjson::value> result = action_result(actions.second.get_value(), previous_item.get());
            if (result) {
                any_updates = true;
                update_attribute(to_bytes(column_name), *result, previous_item, ts, row, modified_attrs, &actions.second);
            } else {
                // A nullopt result means the action was a removal.
                any_deletes = true;
                delete_attribute(to_bytes(column_name), previous_item, ts, row, modified_attrs);
            }
        } else {
            // We have actions on a path or more than one path in the same
            // top-level attribute column_name - but not on the top-level
            // attribute as a whole. We already read the full top-level
            // attribute (see check_needs_read_before_write()), and now we
            // need to modify pieces of it and write back the entire
            // top-level attribute.
            if (!previous_item) {
                throw api_error::validation(format("UpdateItem cannot update nested document path on non-existent item"));
            }
            const rjson::value* toplevel = rjson::find(*previous_item, column_name);
            if (!toplevel) {
                throw api_error::validation(fmt::format("UpdateItem cannot update document path: missing attribute {}", column_name));
            }
            rjson::value result = rjson::copy(*toplevel);
            any_updates = true;
            hierarchy_actions(result, actions.second, previous_item.get());
            update_attribute(to_bytes(column_name), std::move(result), previous_item, ts, row, modified_attrs, &actions.second);
        }
    }
}
|
|
|
|
// Build the mutation implementing this UpdateItem, or return an empty
// optional if the Expected/ConditionExpression condition is not fulfilled.
// previous_item is the pre-read item (may be null if none was needed or
// the item doesn't exist). cdc_opts is not referenced by this operation.
std::optional<mutation> update_item_operation::apply(std::unique_ptr<rjson::value> previous_item, api::timestamp_type ts, cdc::per_request_options& cdc_opts) const {
    // Consumed capacity is at least 1, even for an empty update.
    if (_consumed_capacity._total_bytes == 0) {
        _consumed_capacity._total_bytes = 1;
    }
    if (!verify_expected(_request, previous_item.get()) || !verify_condition_expression(_condition_expression, previous_item.get())) {
        if (previous_item && _returnvalues_on_condition_check_failure == returnvalues_on_condition_check_failure::ALL_OLD) {
            _return_attributes = std::move(*previous_item);
        }
        // If the update is to be cancelled because of an unfulfilled
        // condition, return an empty optional mutation, which is more
        // efficient than throwing an exception.
        return {};
    }

    // In the ReturnValues=ALL_NEW case, we make a copy of previous_item into
    // _return_attributes and parts of it will be overwritten by the new
    // updates (in do_update() and do_delete()). We need to make a copy and
    // cannot overwrite previous_item directly because we still need its
    // original content for update expressions. For example, the expression
    // "REMOVE a SET b=a" is valid, and needs the original value of a to
    // stick around.
    // Note that for ReturnValues=ALL_OLD, we don't need to copy here, and
    // can just move previous_item later, when we don't need it any more.
    if (_returnvalues == returnvalues::ALL_NEW) {
        if (previous_item) {
            _return_attributes = rjson::copy(*previous_item);
        } else {
            // If there is no previous item, usually a new item is created
            // and contains the given key. This may be cancelled at the end
            // of this function if the update is just deletes.
            _return_attributes = rjson::copy(rjson::get(_request, "Key"));
        }
    } else if (_returnvalues == returnvalues::UPDATED_OLD || _returnvalues == returnvalues::UPDATED_NEW) {
        _return_attributes = rjson::empty_object();
    }

    // Collect all cell writes/deletes into one clustered row of a mutation.
    mutation m(_schema, _pk);
    bool any_updates = false;
    bool any_deletes = false;
    auto& row = m.partition().clustered_row(*_schema, _ck);
    auto modified_attrs = attribute_collector();
    if (!_update_expression.empty()) {
        apply_update_expression(previous_item, ts, row, modified_attrs, any_updates, any_deletes);
    }
    if (_attribute_updates) {
        apply_attribute_updates(previous_item, ts, row, modified_attrs, any_updates, any_deletes);
    }
    // Non-schema attributes are accumulated into the single attrs map column.
    if (!modified_attrs.empty()) {
        auto serialized_map = modified_attrs.to_mut().serialize(*attrs_type());
        row.cells().apply(attrs_column(*_schema), std::move(serialized_map));
    }
    // To allow creation of an item with no attributes, we need a row marker.
    // Note that unlike Scylla, even an "update" operation needs to add a row
    // marker. An update with only DELETE operations must not add a row marker
    // (this was issue #5862) but any other update, even an empty one, should.
    if (any_updates || !any_deletes) {
        row.apply(row_marker(ts));
    } else if (_returnvalues == returnvalues::ALL_NEW && !previous_item) {
        // There was no pre-existing item, and we're not creating one, so
        // don't report the new item in the returned Attributes.
        _return_attributes = rjson::null_value();
    }
    if (_returnvalues == returnvalues::ALL_OLD && previous_item) {
        _return_attributes = std::move(*previous_item);
    }
    // ReturnValues=UPDATED_OLD/NEW never return an empty Attributes field,
    // even if a new item was created. Instead it should be missing entirely.
    if (_returnvalues == returnvalues::UPDATED_OLD || _returnvalues == returnvalues::UPDATED_NEW) {
        if (_return_attributes.MemberCount() == 0) {
            _return_attributes = rjson::null_value();
        }
    }

    return m;
}
|
|
|
|
// Entry point for the UpdateItem API operation: parses the request, checks
// authorization, bounces to the partition's owning shard when LWT requires
// it, executes the operation and records the per-table and global metrics.
future<executor::request_return_type> executor::update_item(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request) {
    _stats.api_operations.update_item++;
    auto start_time = std::chrono::steady_clock::now();
    elogger.trace("update_item {}", request);

    auto op = make_shared<update_item_operation>(*_parsed_expression_cache, _proxy, std::move(request));
    tracing::add_alternator_table_name(trace_state, op->schema()->cf_name());
    // A read-before-write may also be forced globally by configuration.
    const bool needs_read_before_write = _proxy.data_dictionary().get_config().alternator_force_read_before_write() || op->needs_read_before_write();

    co_await verify_permission(_enforce_authorization, _warn_authorization, client_state, op->schema(), auth::permission::MODIFY, _stats);

    auto cas_shard = op->shard_for_execute(needs_read_before_write);

    // If the LWT coordination belongs to another shard, re-run there.
    if (cas_shard && !cas_shard->this_shard()) {
        _stats.api_operations.update_item--; // uncount on this shard, will be counted in other shard
        _stats.shard_bounce_for_lwt++;
        co_return co_await container().invoke_on(cas_shard->shard(), _ssg,
                [request = std::move(*op).move_request(), cs = client_state.move_to_other_shard(), gt = tracing::global_trace_state_ptr(trace_state), permit = std::move(permit)]
                (executor& e) mutable {
            return do_with(cs.get(), [&e, request = std::move(request), trace_state = tracing::trace_state_ptr(gt)]
                    (service::client_state& client_state) mutable {
                //FIXME: Instead of passing empty_service_permit() to the background operation,
                // the current permit's lifetime should be prolonged, so that it's destructed
                // only after all background operations are finished as well.
                return e.update_item(client_state, std::move(trace_state), empty_service_permit(), std::move(request));
            });
        });
    }
    lw_shared_ptr<stats> per_table_stats = get_stats_from_schema(_proxy, *(op->schema()));
    per_table_stats->api_operations.update_item++;
    uint64_t wcu_total = 0;
    auto res = co_await op->execute(_proxy, std::move(cas_shard), client_state, trace_state, std::move(permit), needs_read_before_write, _stats, *per_table_stats, wcu_total);
    // Record operation size, WCU and latency metrics, per-table and globally.
    per_table_stats->operation_sizes.update_item_op_size_kb.add(bytes_to_kb_ceil(op->consumed_capacity()._total_bytes));
    per_table_stats->wcu_total[stats::wcu_types::UPDATE_ITEM] += wcu_total;
    _stats.wcu_total[stats::wcu_types::UPDATE_ITEM] += wcu_total;
    per_table_stats->api_operations.update_item_latency.mark(std::chrono::steady_clock::now() - start_time);
    _stats.api_operations.update_item_latency.mark(std::chrono::steady_clock::now() - start_time);
    co_return res;
}
|
|
|
|
// Check according to the request's "ConsistentRead" field, which consistency
|
|
// level we need to use for the read. The field can be True for strongly
|
|
// consistent reads, or False for eventually consistent reads, or if this
|
|
// field is absence, we default to eventually consistent reads.
|
|
// In Scylla, eventually-consistent reads are implemented as consistency
|
|
// level LOCAL_ONE, and strongly-consistent reads as LOCAL_QUORUM.
|
|
static db::consistency_level get_read_consistency(const rjson::value& request) {
    // A missing or null "ConsistentRead" field defaults to an eventually
    // consistent read (LOCAL_ONE).
    const rjson::value* flag = rjson::find(request, "ConsistentRead");
    if (!flag || flag->IsNull()) {
        return db::consistency_level::LOCAL_ONE;
    }
    // Any present, non-null value must be a JSON boolean.
    if (!flag->IsBool()) {
        throw api_error::validation("ConsistentRead flag must be a boolean");
    }
    return flag->GetBool() ? db::consistency_level::LOCAL_QUORUM : db::consistency_level::LOCAL_ONE;
}
|
|
|
|
// describe_item() wraps the result of describe_single_item() by a map
|
|
// as needed by the GetItem request. It should not be used for other purposes,
|
|
// use describe_single_item() instead.
|
|
// describe_item() wraps the result of describe_single_item() by a map
// as needed by the GetItem request. It should not be used for other
// purposes - use describe_single_item() instead. The consumed capacity
// is accounted in "consumed_capacity" and its half-unit count is added
// to "metric".
static rjson::value describe_item(schema_ptr schema,
        const query::partition_slice& slice,
        const cql3::selection::selection& selection,
        const query::result& query_result,
        const std::optional<attrs_to_get>& attrs_to_get,
        consumed_capacity_counter& consumed_capacity,
        uint64_t& metric) {
    rjson::value response = rjson::empty_object();
    std::optional<rjson::value> item = executor::describe_single_item(std::move(schema), slice, selection, std::move(query_result), attrs_to_get, &consumed_capacity._total_bytes);
    // A missing item results in a response with no "Item" member at all.
    if (item) {
        rjson::add(response, "Item", std::move(*item));
    }
    consumed_capacity.add_consumed_capacity_to_response_if_needed(response);
    metric += consumed_capacity.get_half_units();
    return response;
}
|
|
|
|
// Implements the DynamoDB GetItem API operation.
// Reads a single item identified by the request's "Key" (full partition
// key plus, if the table has one, the sort key), verifies SELECT
// permission, performs the read at the consistency level chosen by
// "ConsistentRead", and returns the item's attributes (possibly limited
// by a projection) together with ReturnConsumedCapacity accounting.
future<executor::request_return_type> executor::get_item(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request) {
    _stats.api_operations.get_item++;
    auto start_time = std::chrono::steady_clock::now();
    elogger.trace("Getting item {}", request);

    schema_ptr schema = get_table(_proxy, request);
    lw_shared_ptr<stats> per_table_stats = get_stats_from_schema(_proxy, *schema);
    per_table_stats->api_operations.get_item++;
    tracing::add_alternator_table_name(trace_state, schema->cf_name());

    rjson::value& query_key = request["Key"];
    db::consistency_level cl = get_read_consistency(request);

    // The read targets exactly one partition...
    partition_key pk = pk_from_json(query_key, schema);
    dht::partition_range_vector partition_ranges{dht::partition_range(dht::decorate_key(*schema, pk))};

    // ...and within it, either the whole (single-row) partition when the
    // table has no sort key, or the single row matching the sort key.
    std::vector<query::clustering_range> bounds;
    if (schema->clustering_key_size() == 0) {
        bounds.push_back(query::clustering_range::make_open_ended_both_sides());
    } else {
        clustering_key ck = ck_from_json(query_key, schema);
        bounds.push_back(query::clustering_range::make_singular(std::move(ck)));
    }
    // Reject keys with spurious or ill-typed attributes.
    check_key(query_key, schema);

    //TODO(sarna): It would be better to fetch only some attributes of the map, not all
    auto regular_columns =
            schema->regular_columns() | std::views::transform(&column_definition::id)
            | std::ranges::to<query::column_id_vector>();

    auto selection = cql3::selection::selection::wildcard(schema);

    auto partition_slice = query::partition_slice(std::move(bounds), {}, std::move(regular_columns), selection->get_query_options());
    auto command = ::make_lw_shared<query::read_command>(schema->id(), schema->version(), partition_slice, _proxy.get_max_result_size(partition_slice),
            query::tombstone_limit(_proxy.get_tombstone_limit()));

    // Validate the projection (AttributesToGet / ProjectionExpression) and
    // make sure every ExpressionAttributeNames entry was actually used.
    std::unordered_set<std::string> used_attribute_names;
    auto attrs_to_get = calculate_attrs_to_get(request, *_parsed_expression_cache, used_attribute_names);
    const rjson::value* expression_attribute_names = rjson::find(request, "ExpressionAttributeNames");
    verify_all_are_used(expression_attribute_names, used_attribute_names, "ExpressionAttributeNames", "GetItem");
    // RCU accounting depends on whether this is a strongly-consistent read.
    rcu_consumed_capacity_counter add_capacity(request, cl == db::consistency_level::LOCAL_QUORUM);
    co_await verify_permission(_enforce_authorization, _warn_authorization, client_state, schema, auth::permission::SELECT, _stats);
    service::storage_proxy::coordinator_query_result qr =
        co_await _proxy.query(
            schema, std::move(command), std::move(partition_ranges), cl,
            service::storage_proxy::coordinator_query_options(executor::default_timeout(), std::move(permit), client_state, trace_state));
    per_table_stats->api_operations.get_item_latency.mark(std::chrono::steady_clock::now() - start_time);
    _stats.api_operations.get_item_latency.mark(std::chrono::steady_clock::now() - start_time);
    uint64_t rcu_half_units = 0;
    rjson::value res = describe_item(schema, partition_slice, *selection, *qr.query_result, std::move(attrs_to_get), add_capacity, rcu_half_units);
    per_table_stats->rcu_half_units_total += rcu_half_units;
    _stats.rcu_half_units_total += rcu_half_units;
    // Update item size metrics only if we found an item.
    if (qr.query_result->row_count().value_or(0) > 0) {
        per_table_stats->operation_sizes.get_item_op_size_kb.add(bytes_to_kb_ceil(add_capacity._total_bytes));
    }
    co_return rjson::print(std::move(res));
}
|
|
|
|
static void check_big_object(const rjson::value& val, int& size_left);
|
|
static void check_big_array(const rjson::value& val, int& size_left);
|
|
|
|
// Decide whether a JSON value is "big" - i.e., its estimated serialized
// size exceeds big_size bytes. Strings are measured exactly; objects and
// arrays are estimated via check_big_object()/check_big_array(), which
// subtract from a size budget and may stop early once it goes negative.
// Scalars (numbers, booleans, null) are never considered big.
bool is_big(const rjson::value& val, int big_size) {
    if (val.IsString()) {
        return ssize_t(val.GetStringLength()) > big_size;
    }
    int budget = big_size;
    if (val.IsObject()) {
        check_big_object(val, budget);
    } else if (val.IsArray()) {
        check_big_array(val, budget);
    } else {
        return false;
    }
    // The budget going negative means the estimated size exceeded big_size.
    return budget < 0;
}
|
|
|
|
// Subtract an estimate of the array's serialized size from size_left,
// recursing into nested objects and arrays. Stops descending as soon as
// the budget is exhausted (size_left < 0), so very large documents are
// not fully traversed.
static void check_big_array(const rjson::value& val, int& size_left) {
    // Assume a fixed size of 10 bytes for each number, boolean, etc., or
    // beginning of a sub-object. This doesn't have to be accurate.
    size_left -= 10 * val.Size();
    for (const rjson::value& elem : val.GetArray()) {
        if (size_left < 0) {
            return;
        }
        // Note that we avoid recursive calls for the leaves (anything except
        // array or object) because usually those greatly outnumber the trunk.
        if (elem.IsString()) {
            size_left -= elem.GetStringLength();
        } else if (elem.IsArray()) {
            check_big_array(elem, size_left);
        } else if (elem.IsObject()) {
            check_big_object(elem, size_left);
        }
    }
}
|
|
|
|
// Subtract an estimate of the object's serialized size from size_left,
// counting member names and string values exactly and charging a flat
// 10 bytes per member for everything else. Recurses into nested objects
// and arrays, bailing out once the budget goes negative.
static void check_big_object(const rjson::value& val, int& size_left) {
    size_left -= 10 * val.MemberCount();
    for (const auto& member : val.GetObject()) {
        if (size_left < 0) {
            return;
        }
        size_left -= member.name.GetStringLength();
        const rjson::value& v = member.value;
        if (v.IsString()) {
            size_left -= v.GetStringLength();
        } else if (v.IsArray()) {
            check_big_array(v, size_left);
        } else if (v.IsObject()) {
            check_big_object(v, size_left);
        }
    }
}
|
|
|
|
// Implements the DynamoDB BatchGetItem API operation.
// The request lists, per table, a set of item keys to read. All keys are
// parsed and validated up front (so a malformed request fails as a whole),
// then grouped by table and by partition so all rows of one partition are
// fetched with a single read. The per-partition reads run in parallel;
// reads that fail end up in the response's "UnprocessedKeys" instead of
// failing the whole request - unless *all* reads failed, in which case
// one of the errors is propagated.
future<executor::request_return_type> executor::batch_get_item(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request) {
    // FIXME: In this implementation, an unbounded batch size can cause
    // unbounded response JSON object to be buffered in memory, unbounded
    // parallelism of the requests, and unbounded amount of non-preemptable
    // work in the following loops. So we should limit the batch size, and/or
    // the response size, as DynamoDB does.
    _stats.api_operations.batch_get_item++;
    rjson::value& request_items = request["RequestItems"];
    auto start_time = std::chrono::steady_clock::now();
    // We need to validate all the parameters before starting any asynchronous
    // query, and fail the entire request on any parse error. So we parse all
    // the input into our own vector "requests", each element a table_requests
    // listing all the request aimed at a single table. For efficiency, inside
    // each table_requests we further group together all reads going to the
    // same partition, so we can later send them together.
    bool should_add_rcu = rcu_consumed_capacity_counter::should_add_capacity(request);
    // All reads requested from one table: its schema, the requested
    // consistency level and projection, and the keys grouped by partition.
    struct table_requests {
        schema_ptr schema;
        db::consistency_level cl;
        ::shared_ptr<const std::optional<alternator::attrs_to_get>> attrs_to_get;
        // clustering_keys keeps a sorted set of clustering keys. It must
        // be sorted for the read below (see #10827). Additionally each
        // clustering key is mapped to the original rjson::value "Key".
        using clustering_keys = std::map<clustering_key, rjson::value*, clustering_key::less_compare>;
        std::unordered_map<partition_key, clustering_keys, partition_key::hashing, partition_key::equality> requests;
        table_requests(schema_ptr s)
            : schema(std::move(s))
            , requests(8, partition_key::hashing(*schema), partition_key::equality(*schema))
        {}
        // Record one "Key" from the request, grouping it under its
        // partition key. Duplicate keys are rejected, as in DynamoDB.
        void add(rjson::value& key) {
            auto pk = pk_from_json(key, schema);
            auto it = requests.find(pk);
            if (it == requests.end()) {
                it = requests.emplace(pk, clustering_key::less_compare(*schema)).first;
            }
            auto ck = ck_from_json(key, schema);
            if (auto [_, inserted] = it->second.emplace(ck, &key); !inserted) {
                throw api_error::validation("Provided list of item keys contains duplicates");
            }
        }
    };
    std::vector<table_requests> requests;
    // batch_size counts the number of distinct partitions read, across
    // all tables in the batch.
    uint batch_size = 0;
    for (auto it = request_items.MemberBegin(); it != request_items.MemberEnd(); ++it) {
        table_requests rs(get_table_from_batch_request(_proxy, it));
        tracing::add_alternator_table_name(trace_state, rs.schema->cf_name());
        rs.cl = get_read_consistency(it->value);
        std::unordered_set<std::string> used_attribute_names;
        rs.attrs_to_get = ::make_shared<const std::optional<attrs_to_get>>(calculate_attrs_to_get(it->value, *_parsed_expression_cache, used_attribute_names));
        const rjson::value* expression_attribute_names = rjson::find(it->value, "ExpressionAttributeNames");
        verify_all_are_used(expression_attribute_names, used_attribute_names,"ExpressionAttributeNames", "GetItem");
        auto& keys = (it->value)["Keys"];
        for (rjson::value& key : keys.GetArray()) {
            rs.add(key);
            check_key(key, rs.schema);
        }
        batch_size += rs.requests.size();
        requests.emplace_back(std::move(rs));
    }

    // Check SELECT permission on every table before issuing any read.
    for (const table_requests& tr : requests) {
        co_await verify_permission(_enforce_authorization, _warn_authorization, client_state, tr.schema, auth::permission::SELECT, _stats);
    }

    _stats.api_operations.batch_get_item_batch_total += batch_size;
    _stats.api_operations.batch_get_item_histogram.add(batch_size);
    // If we got here, all "requests" are valid, so let's start the
    // requests for the different partitions all in parallel.
    std::vector<future<std::vector<rjson::value>>> response_futures;
    std::vector<uint64_t> consumed_rcu_half_units_per_table(requests.size());
    for (size_t i = 0; i < requests.size(); i++) {
        const table_requests& rs = requests[i];
        bool is_quorum = rs.cl == db::consistency_level::LOCAL_QUORUM;
        lw_shared_ptr<stats> per_table_stats = get_stats_from_schema(_proxy, *rs.schema);
        per_table_stats->api_operations.batch_get_item_histogram.add(rs.requests.size());
        for (const auto& [pk, cks] : rs.requests) {
            dht::partition_range_vector partition_ranges{dht::partition_range(dht::decorate_key(*rs.schema, pk))};
            std::vector<query::clustering_range> bounds;
            if (rs.schema->clustering_key_size() == 0) {
                bounds.push_back(query::clustering_range::make_open_ended_both_sides());
            } else {
                // cks is a std::map sorted by clustering key, so the
                // resulting bounds are in clustering order (see #10827).
                for (auto& ck : cks) {
                    bounds.push_back(query::clustering_range::make_singular(ck.first));
                }
            }
            auto regular_columns =
                    rs.schema->regular_columns() | std::views::transform(&column_definition::id)
                    | std::ranges::to<query::column_id_vector>();
            auto selection = cql3::selection::selection::wildcard(rs.schema);
            auto partition_slice = query::partition_slice(std::move(bounds), {}, std::move(regular_columns), selection->get_query_options());
            auto command = ::make_lw_shared<query::read_command>(rs.schema->id(), rs.schema->version(), partition_slice, _proxy.get_max_result_size(partition_slice),
                    query::tombstone_limit(_proxy.get_tombstone_limit()));
            command->allow_limit = db::allow_per_partition_rate_limit::yes;
            // Called once per returned item with its size in bytes, to
            // account RCUs and item-size metrics. Note the reference into
            // consumed_rcu_half_units_per_table, which stays valid because
            // that vector is not resized after this point.
            const auto item_callback = [is_quorum, per_table_stats, &rcus_per_table = consumed_rcu_half_units_per_table[i]](uint64_t size) {
                rcus_per_table += rcu_consumed_capacity_counter::get_half_units(size, is_quorum);
                // Update item size only if the item exists.
                if (size > 0) {
                    per_table_stats->operation_sizes.batch_get_item_op_size_kb.add(bytes_to_kb_ceil(size));
                }
            };
            future<std::vector<rjson::value>> f = _proxy.query(rs.schema, std::move(command), std::move(partition_ranges), rs.cl,
                    service::storage_proxy::coordinator_query_options(executor::default_timeout(), permit, client_state, trace_state)).then(
                    [schema = rs.schema, partition_slice = std::move(partition_slice), selection = std::move(selection), attrs_to_get = rs.attrs_to_get, item_callback = std::move(item_callback)] (service::storage_proxy::coordinator_query_result qr) mutable {
                utils::get_local_injector().inject("alternator_batch_get_item", [] { throw std::runtime_error("batch_get_item injection"); });
                return describe_multi_item(std::move(schema), std::move(partition_slice), std::move(selection), std::move(qr.query_result), std::move(attrs_to_get), std::move(item_callback));
            });
            response_futures.push_back(std::move(f));
        }
    }

    // Wait for all requests to complete, and then return the response.
    // In case of full failure (no reads succeeded), an arbitrary error
    // from one of the operations will be returned.
    bool some_succeeded = false;
    std::exception_ptr eptr;

    rjson::value response = rjson::empty_object();
    rjson::add(response, "Responses", rjson::empty_object());
    rjson::add(response, "UnprocessedKeys", rjson::empty_object());
    // response_futures was filled in the same (table, partition) iteration
    // order used below, so fut_it tracks the current partition's future.
    auto fut_it = response_futures.begin();
    rjson::value consumed_capacity = rjson::empty_array();
    for (size_t i = 0; i < requests.size(); i++) {
        const table_requests& rs = requests[i];
        std::string table = table_name(*rs.schema);
        for (const auto& [_, cks] : rs.requests) {
            auto& fut = *fut_it;
            ++fut_it;
            try {
                std::vector<rjson::value> results = co_await std::move(fut);
                some_succeeded = true;
                if (!response["Responses"].HasMember(table)) {
                    rjson::add_with_string_name(response["Responses"], table, rjson::empty_array());
                }
                for (rjson::value& json : results) {
                    rjson::push_back(response["Responses"][table], std::move(json));
                }
            } catch(...) {
                eptr = std::current_exception();
                // This read of potentially several rows in one partition,
                // failed. We need to add the row key(s) to UnprocessedKeys.
                if (!response["UnprocessedKeys"].HasMember(table)) {
                    // Add the table's entry in UnprocessedKeys. Need to copy
                    // all the table's parameters from the request except the
                    // Keys field, which we start empty and then build below.
                    rjson::add_with_string_name(response["UnprocessedKeys"], table, rjson::empty_object());
                    rjson::value& unprocessed_item = response["UnprocessedKeys"][table];
                    rjson::value& request_item = request_items[table];
                    for (auto it = request_item.MemberBegin(); it != request_item.MemberEnd(); ++it) {
                        if (it->name != "Keys") {
                            rjson::add_with_string_name(unprocessed_item,
                                    rjson::to_string_view(it->name), rjson::copy(it->value));
                        }
                    }
                    rjson::add_with_string_name(unprocessed_item, "Keys", rjson::empty_array());
                }
                // Move the original "Key" objects (still owned by
                // request_items, pointed to by cks) into UnprocessedKeys.
                for (auto& ck : cks) {
                    rjson::push_back(response["UnprocessedKeys"][table]["Keys"], std::move(*ck.second));
                }
            }
        }
        uint64_t rcu_half_units = consumed_rcu_half_units_per_table[i];
        _stats.rcu_half_units_total += rcu_half_units;
        lw_shared_ptr<stats> per_table_stats = get_stats_from_schema(_proxy, *rs.schema);
        per_table_stats->rcu_half_units_total += rcu_half_units;
        if (should_add_rcu) {
            rjson::value entry = rjson::empty_object();
            rjson::add(entry, "TableName", table);
            rjson::add(entry, "CapacityUnits", rcu_half_units*0.5);
            rjson::push_back(consumed_capacity, std::move(entry));
        }
    }

    if (should_add_rcu) {
        rjson::add(response, "ConsumedCapacity", std::move(consumed_capacity));
    }
    elogger.trace("Unprocessed keys: {}", response["UnprocessedKeys"]);
    if (!some_succeeded && eptr) {
        co_await coroutine::return_exception_ptr(std::move(eptr));
    }
    _stats.api_operations.batch_get_item_latency.mark(std::chrono::steady_clock::now() - start_time);
    // Large responses are streamed instead of built as one string.
    if (is_big(response)) {
        co_return make_streamed(std::move(response));
    } else {
        co_return rjson::print(std::move(response));
    }
}
|
|
|
|
// "filter" represents a condition that can be applied to individual items
|
|
// read by a Query or Scan operation, to decide whether to keep the item.
|
|
// A filter is constructed from a Query or Scan request. This uses the
|
|
// relevant fields in the query (FilterExpression or QueryFilter/ScanFilter +
|
|
// ConditionalOperator). These fields are pre-checked and pre-parsed as much
|
|
// as possible, to ensure that later checking of many items is efficient.
|
|
// "filter" represents a condition that can be applied to individual items
// read by a Query or Scan operation, to decide whether to keep the item.
// A filter is constructed from a Query or Scan request. This uses the
// relevant fields in the query (FilterExpression or QueryFilter/ScanFilter +
// ConditionalOperator). These fields are pre-checked and pre-parsed as much
// as possible, to ensure that later checking of many items is efficient.
class filter {
private:
    // Holding QueryFilter/ScanFilter + ConditionalOperator:
    struct conditions_filter {
        // true when all conditions must hold (ConditionalOperator AND or
        // missing), false when any one suffices (OR).
        bool require_all;
        // A copy of the request's QueryFilter/ScanFilter JSON object.
        rjson::value conditions;
    };
    // Holding a parsed FilterExpression:
    struct expression_filter {
        parsed::condition_expression expression;
    };
    // Disengaged when the request had no filter at all; in that case
    // check() accepts every item.
    std::optional<std::variant<conditions_filter, expression_filter>> _imp;
public:
    // Filtering for Scan and Query are very similar, but there are some
    // small differences, especially the names of the request attributes.
    enum class request_type { SCAN, QUERY };
    // Note that a filter does not store pointers to the query used to
    // construct it.
    filter(parsed::expression_cache& parsed_expression_cache, const rjson::value& request, request_type rt,
            std::unordered_set<std::string>& used_attribute_names,
            std::unordered_set<std::string>& used_attribute_values);
    // Return true if the given item passes the filter (or if there is no
    // filter at all).
    bool check(const rjson::value& item) const;
    // Return true if the filter refers to the given top-level attribute.
    bool filters_on(std::string_view attribute) const;
    // for_filters_on() runs the given function on the attributes that the
    // filter works on. It may run for the same attribute more than once if
    // used more than once in the filter.
    void for_filters_on(const noncopyable_function<void(std::string_view)>& func) const;
    // True when the request actually specified some filter.
    operator bool() const { return bool(_imp); }
};
|
|
|
|
// Build a filter from a Query or Scan request. Exactly one of the
// new-style "FilterExpression" or the old-style "QueryFilter"/"ScanFilter"
// (plus optional "ConditionalOperator") may be present; mixing styles, or
// a ConditionalOperator without conditions, is a validation error. The
// expression-attribute names/values used by a FilterExpression are
// recorded in used_attribute_names/used_attribute_values so the caller
// can later verify all of them were referenced.
filter::filter(parsed::expression_cache& parsed_expression_cache, const rjson::value& request, request_type rt,
        std::unordered_set<std::string>& used_attribute_names,
        std::unordered_set<std::string>& used_attribute_values) {
    const rjson::value* filter_expression = rjson::find(request, "FilterExpression");
    const char* conditions_attribute = (rt == request_type::SCAN) ? "ScanFilter" : "QueryFilter";
    const rjson::value* old_conditions = rjson::find(request, conditions_attribute);
    auto op = get_conditional_operator(request);
    // ConditionalOperator only makes sense alongside a non-empty
    // QueryFilter/ScanFilter.
    bool no_usable_conditions = !old_conditions ||
            (old_conditions->IsObject() && old_conditions->GetObject().ObjectEmpty());
    if (op != conditional_operator_type::MISSING && no_usable_conditions) {
        throw api_error::validation(
                format("'ConditionalOperator' parameter cannot be specified for missing or empty {}",
                        conditions_attribute));
    }
    if (filter_expression && old_conditions) {
        throw api_error::validation(
                format("FilterExpression and {} are not allowed together", conditions_attribute));
    }
    if (filter_expression) {
        if (!filter_expression->IsString()) {
            throw api_error::validation("FilterExpression must be a string");
        }
        if (filter_expression->GetStringLength() == 0) {
            throw api_error::validation("FilterExpression must not be empty");
        }
        // New-style FilterExpression cannot be combined with the old-style
        // AttributesToGet projection.
        if (rjson::find(request, "AttributesToGet")) {
            throw api_error::validation("Cannot use both old-style and new-style parameters in same request: FilterExpression and AttributesToGet");
        }
        try {
            // Parse (with caching) and resolve the expression's #name and
            // :value references now, so per-item checks are cheap.
            auto parsed = parsed_expression_cache.parse_condition_expression(rjson::to_string_view(*filter_expression), "FilterExpression");
            const rjson::value* attr_names = rjson::find(request, "ExpressionAttributeNames");
            const rjson::value* attr_values = rjson::find(request, "ExpressionAttributeValues");
            resolve_condition_expression(parsed,
                    attr_names, attr_values,
                    used_attribute_names, used_attribute_values);
            _imp = expression_filter { std::move(parsed) };
        } catch (expressions_syntax_error& e) {
            throw api_error::validation(e.what());
        }
    }
    if (old_conditions) {
        // Old-style conditions cannot be combined with the new-style
        // ProjectionExpression.
        if (rjson::find(request, "ProjectionExpression")) {
            throw api_error::validation(format("Cannot use both old-style and new-style parameters in same request: {} and ProjectionExpression", conditions_attribute));
        }
        _imp = conditions_filter { op != conditional_operator_type::OR, rjson::copy(*old_conditions) };
    }
}
|
|
|
|
// Return true if the given item passes the filter. A request with no
// filter at all (disengaged _imp) accepts every item.
bool filter::check(const rjson::value& item) const {
    if (!_imp) {
        return true;
    }
    if (const auto* cf = std::get_if<conditions_filter>(&*_imp)) {
        return verify_condition(cf->conditions, cf->require_all, &item);
    }
    return verify_condition_expression(std::get<expression_filter>(*_imp).expression, &item);
}
|
|
|
|
bool filter::filters_on(std::string_view attribute) const {
|
|
if (!_imp) {
|
|
return false;
|
|
}
|
|
return std::visit(overloaded_functor {
|
|
[&] (const conditions_filter& f) -> bool {
|
|
for (auto it = f.conditions.MemberBegin(); it != f.conditions.MemberEnd(); ++it) {
|
|
if (rjson::to_string_view(it->name) == attribute) {
|
|
return true;
|
|
}
|
|
}
|
|
return false;
|
|
},
|
|
[&] (const expression_filter& f) -> bool {
|
|
return condition_expression_on(f.expression, attribute);
|
|
}
|
|
}, *_imp);
|
|
}
|
|
|
|
void filter::for_filters_on(const noncopyable_function<void(std::string_view)>& func) const {
|
|
if (_imp) {
|
|
std::visit(overloaded_functor {
|
|
[&] (const conditions_filter& f) -> void {
|
|
for (auto it = f.conditions.MemberBegin(); it != f.conditions.MemberEnd(); ++it) {
|
|
func(rjson::to_string_view(it->name));
|
|
}
|
|
},
|
|
[&] (const expression_filter& f) -> void {
|
|
return for_condition_expression_on(f.expression, func);
|
|
}
|
|
}, *_imp);
|
|
}
|
|
}
|
|
|
|
// A result-set visitor that converts rows read by a Query or Scan into
// DynamoDB-style JSON items, applying the item filter and the requested
// projection (attrs_to_get) along the way. The visitor holds references
// to the columns, projection and filter it was constructed with, so they
// must outlive it.
class describe_items_visitor {
    typedef std::vector<const column_definition*> columns_t;
    // Columns of the selection, in the order values are delivered to
    // accept_value() - _column_it tracks the current one.
    const columns_t& _columns;
    // The requested projection; disengaged means "return everything".
    const std::optional<attrs_to_get>& _attrs_to_get;
    // Attributes needed only by the filter, not requested in the output
    // (see constructor comment below).
    std::unordered_set<std::string> _extra_filter_attrs;
    const filter& _filter;
    typename columns_t::const_iterator _column_it;
    // The item currently being assembled, one row at a time.
    rjson::value _item;
    // _items is a chunked_vector<rjson::value> instead of a RapidJson array
    // (rjson::value) because unfortunately RapidJson arrays are stored
    // contiguously in memory, and cause large allocations when a Query/Scan
    // returns a long list of short items (issue #23535).
    utils::chunked_vector<rjson::value> _items;
    // Number of rows visited, before filtering (DynamoDB's "ScannedCount").
    size_t _scanned_count;

public:
    describe_items_visitor(const columns_t& columns, const std::optional<attrs_to_get>& attrs_to_get, filter& filter)
            : _columns(columns)
            , _attrs_to_get(attrs_to_get)
            , _filter(filter)
            , _column_it(columns.begin())
            , _item(rjson::empty_object())
            , _scanned_count(0)
    {
        // _filter.check() may need additional attributes not listed in
        // _attrs_to_get (i.e., not requested as part of the output).
        // We list those in _extra_filter_attrs. We will include them in
        // the JSON but take them out before finally returning the JSON.
        if (_attrs_to_get) {
            _filter.for_filters_on([&] (std::string_view attr) {
                std::string a(attr); // no heterogeneous maps searches :-(
                if (!_attrs_to_get->contains(a)) {
                    _extra_filter_attrs.emplace(std::move(a));
                }
            });
        }
    }

    // Called at the beginning of each row: rewind to the first column.
    void start_row() {
        _column_it = _columns.begin();
    }

    // Called once per column of the current row, in _columns order.
    // Key columns become single-type JSON entries; the ATTRS map column
    // is deserialized and each requested attribute added to the item.
    void accept_value(managed_bytes_view_opt result_bytes_view) {
        if (!result_bytes_view) {
            // Null column value - nothing to add for this column.
            ++_column_it;
            return;
        }
        result_bytes_view->with_linearized([this] (bytes_view bv) {
            std::string column_name = (*_column_it)->name_as_text();
            if (column_name != executor::ATTRS_COLUMN_NAME) {
                // A real schema column (key column):
                if (!_attrs_to_get || _attrs_to_get->contains(column_name) || _extra_filter_attrs.contains(column_name)) {
                    if (!_item.HasMember(column_name.c_str())) {
                        rjson::add_with_string_name(_item, column_name, rjson::empty_object());
                    }
                    rjson::value& field = _item[column_name.c_str()];
                    rjson::add_with_string_name(field, type_to_string((*_column_it)->type), json_key_column_value(bv, **_column_it));
                }
            } else {
                // The ":attrs" map column, holding all non-key attributes:
                auto deserialized = attrs_type()->deserialize(bv);
                auto keys_and_values = value_cast<map_type_impl::native_type>(deserialized);
                for (auto entry : keys_and_values) {
                    std::string attr_name = value_cast<sstring>(entry.first);
                    if (!_attrs_to_get || _attrs_to_get->contains(attr_name) || _extra_filter_attrs.contains(attr_name)) {
                        bytes value = value_cast<bytes>(entry.second);
                        // Even if _attrs_to_get asked to keep only a part of a
                        // top-level attribute, we keep the entire attribute
                        // at this stage, because the item filter might still
                        // need the other parts (it was easier for us to keep
                        // extra_filter_attrs at top-level granularity). We'll
                        // filter the unneeded parts after item filtering.
                        rjson::add_with_string_name(_item, attr_name, deserialize_item(value));
                    }
                }
            }
        });
        ++_column_it;
    }

    // Called at the end of each row: apply the filter, trim the item to
    // the requested projection, and either keep or drop it.
    void end_row() {
        if (_filter.check(_item)) {
            // As noted above, we kept entire top-level attributes listed in
            // _attrs_to_get. We may need to only keep parts of them.
            if (_attrs_to_get) {
                for (const auto& attr: *_attrs_to_get) {
                    // If !attr.has_value() it means we were asked not to keep
                    // attr entirely, but just parts of it.
                    if (!attr.second.has_value()) {
                        rjson::value* toplevel= rjson::find(_item, attr.first);
                        if (toplevel && !hierarchy_filter(*toplevel, attr.second)) {
                            // Nothing of this attribute remains after the
                            // hierarchy filter - drop it entirely.
                            rjson::remove_member(_item, attr.first);
                        }
                    }
                }
            }
            // Remove the extra attributes _extra_filter_attrs which we had
            // to add just for the filter, and not requested to be returned:
            for (const auto& attr : _extra_filter_attrs) {
                rjson::remove_member(_item, attr);
            }

            _items.push_back(std::move(_item));
        }
        _item = rjson::empty_object();
        ++_scanned_count;
    }

    // Take the accumulated (post-filter) items. May only be called once,
    // on an rvalue visitor.
    utils::chunked_vector<rjson::value> get_items() && {
        return std::move(_items);
    }

    // Number of rows visited, including ones rejected by the filter.
    size_t get_scanned_count() {
        return _scanned_count;
    }
};
|
|
|
|
// describe_items() returns a JSON object that includes members "Count"
|
|
// and "ScannedCount", but *not* "Items" - that is returned separately
|
|
// as a chunked_vector to avoid large contiguous allocations which
|
|
// RapidJSON does of its array. The caller should add "Items" to the
|
|
// returned JSON object if needed, or print it separately.
|
|
// The returned chunked_vector (the items) is std::optional<>, because
|
|
// the user may have requested only to count items, and not return any
|
|
// items - which is different from returning an empty list of items.
|
|
// describe_items() returns a JSON object that includes members "Count"
// and "ScannedCount", but *not* "Items" - that is returned separately
// as a chunked_vector to avoid large contiguous allocations which
// RapidJSON does of its array. The caller should add "Items" to the
// returned JSON object if needed, or print it separately.
// The returned chunked_vector (the items) is std::optional<>, because
// the user may have requested only to count items, and not return any
// items - which is different from returning an empty list of items.
static future<std::tuple<rjson::value, std::optional<utils::chunked_vector<rjson::value>>, size_t>> describe_items(
        const cql3::selection::selection& selection,
        std::unique_ptr<cql3::result_set> result_set,
        std::optional<attrs_to_get>&& attrs_to_get,
        filter&& filter) {
    // Walk the result set row by row (yielding as needed), converting each
    // row into a JSON item and applying the filter and projection.
    describe_items_visitor v(selection.get_columns(), attrs_to_get, filter);
    co_await result_set->visit_gently(v);
    size_t scanned = v.get_scanned_count();
    utils::chunked_vector<rjson::value> collected = std::move(v).get_items();
    size_t count = collected.size();
    rjson::value descr = rjson::empty_object();
    rjson::add(descr, "Count", rjson::value(count));
    rjson::add(descr, "ScannedCount", rjson::value(scanned));
    // If attrs_to_get && attrs_to_get->empty(), this means the user asked not
    // to get any attributes (i.e., a Scan or Query with Select=COUNT) and we
    // shouldn't return "Items" at all.
    // TODO: consider optimizing the case of Select=COUNT without a filter.
    // In that case, we currently build a list of empty items and here drop
    // it. We could just count the items and not bother with the empty items.
    // (However, remember that when we do have a filter, we need the items).
    std::optional<utils::chunked_vector<rjson::value>> maybe_items;
    if (!attrs_to_get || !attrs_to_get->empty()) {
        maybe_items = std::move(collected);
    }
    co_return std::tuple(std::move(descr), std::move(maybe_items), count);
}
|
|
|
|
// Encode an internal paging_state as a DynamoDB-style "LastEvaluatedKey"
// JSON object: one member per partition-key column and (when the paging
// position includes a row) per clustering-key column, each holding a
// {type: value} object. For non-alternator (CQL) tables, extra
// scylla-specific members may be added to preserve the exact position.
static rjson::value encode_paging_state(const schema& schema, const service::pager::paging_state& paging_state) {
    rjson::value last_evaluated_key = rjson::empty_object();
    std::vector<bytes> exploded_pk = paging_state.get_partition_key().explode();
    auto exploded_pk_it = exploded_pk.begin();
    // One entry per partition-key column, in schema order; exploded_pk
    // holds the serialized component values in the same order.
    for (const column_definition& cdef : schema.partition_key_columns()) {
        rjson::add_with_string_name(last_evaluated_key, std::string_view(cdef.name_as_text()), rjson::empty_object());
        rjson::value& key_entry = last_evaluated_key[cdef.name_as_text()];
        rjson::add_with_string_name(key_entry, type_to_string(cdef.type), json_key_column_value(*exploded_pk_it, cdef));
        ++exploded_pk_it;
    }
    auto pos = paging_state.get_position_in_partition();
    if (pos.has_key()) {
        // Alternator itself allows at most one column in clustering key, but
        // user can use Alternator api to access system tables which might have
        // multiple clustering key columns. So we need to handle that case here.
        auto cdef_it = schema.clustering_key_columns().begin();
        for(const auto &exploded_ck : pos.key().explode()) {
            rjson::add_with_string_name(last_evaluated_key, std::string_view(cdef_it->name_as_text()), rjson::empty_object());
            rjson::value& key_entry = last_evaluated_key[cdef_it->name_as_text()];
            rjson::add_with_string_name(key_entry, type_to_string(cdef_it->type), json_key_column_value(exploded_ck, *cdef_it));
            ++cdef_it;
        }
    }
    // To avoid possible conflicts (and thus having to reserve these names) we
    // avoid adding the weight and region fields of the position to the paging
    // state. Alternator will never need these as it doesn't have range
    // tombstones (the only thing that can generate a position other than at(row)).
    // We conditionally include these fields when reading CQL tables through alternator.
    if (!is_alternator_keyspace(schema.ks_name()) && (!pos.has_key() || pos.get_bound_weight() != bound_weight::equal)) {
        rjson::add_with_string_name(last_evaluated_key, scylla_paging_region, rjson::empty_object());
        rjson::add(last_evaluated_key[scylla_paging_region.data()], "S", rjson::from_string(fmt::to_string(pos.region())));
        rjson::add_with_string_name(last_evaluated_key, scylla_paging_weight, rjson::empty_object());
        rjson::add(last_evaluated_key[scylla_paging_weight.data()], "N", static_cast<int>(pos.get_bound_weight()));
    }
    return last_evaluated_key;
}
|
|
|
|
// RapidJSON allocates arrays contiguously in memory, so we want to avoid
// returning a large number of items as a single rapidjson array, and use
// a chunked_vector instead. The following constant is an arbitrary cutoff
// point for when to switch from a rapidjson array to a chunked_vector:
// responses with fewer items than this are built as one in-memory JSON
// array, longer ones are streamed item-by-item (see do_query()).
static constexpr int max_items_for_rapidjson_array = 256;
|
|
|
|
// Perform one page of a Query or Scan request over the given partition and
// clustering ranges, and format the response in DynamoDB's JSON format
// (Items/Count/ScannedCount, plus LastEvaluatedKey when the page is not the
// last one). Shared by executor::scan() and executor::query().
//
// Parameters of note:
//  - exclusive_start_key: the client's ExclusiveStartKey (or nullptr), used
//    to build the paging state from which this page resumes.
//  - attrs_to_get: which attributes to return; an empty set means return no
//    attributes at all (Select=COUNT).
//  - limit: maximum number of items to *evaluate* (before filtering), as in
//    DynamoDB's Limit semantics.
//  - filter: a post-read filter (FilterExpression/ScanFilter/QueryFilter).
//  - custom_opts: extra partition-slice options, e.g. reversed for a Query
//    with ScanIndexForward=false.
static future<executor::request_return_type> do_query(service::storage_proxy& proxy,
        schema_ptr table_schema,
        const rjson::value* exclusive_start_key,
        dht::partition_range_vector partition_ranges,
        std::vector<query::clustering_range> ck_bounds,
        std::optional<attrs_to_get> attrs_to_get,
        uint32_t limit,
        db::consistency_level cl,
        filter filter,
        query::partition_slice::option_set custom_opts,
        service::client_state& client_state,
        alternator::stats& stats,
        tracing::trace_state_ptr trace_state,
        service_permit permit,
        bool enforce_authorization,
        bool warn_authorization) {
    lw_shared_ptr<service::pager::paging_state> old_paging_state = nullptr;

    tracing::trace(trace_state, "Performing a database query");

    // Reverse the schema and the clustering bounds as the underlying code expects
    // reversed queries in the native reversed format.
    auto query_schema = table_schema;
    const bool reversed = custom_opts.contains<query::partition_slice::option::reversed>();
    if (reversed) {
        query_schema = table_schema->get_reversed();

        std::reverse(ck_bounds.begin(), ck_bounds.end());
        for (auto& bound : ck_bounds) {
            bound = query::reverse(bound);
        }
    }

    // Translate the client's ExclusiveStartKey into the internal paging
    // state used to resume the read where the previous page stopped.
    if (exclusive_start_key) {
        partition_key pk = pk_from_json(*exclusive_start_key, table_schema);
        auto pos = position_in_partition::for_partition_start();
        if (table_schema->clustering_key_size() > 0) {
            pos = pos_from_json(*exclusive_start_key, table_schema);
        }
        old_paging_state = make_lw_shared<service::pager::paging_state>(pk, pos, query::max_partitions, query_id::create_null_id(), service::pager::paging_state::replicas_per_token_range{}, std::nullopt, 0);
    }

    co_await verify_permission(enforce_authorization, warn_authorization, client_state, table_schema, auth::permission::SELECT, stats);

    // Read all static and regular columns (a wildcard selection) - attribute
    // filtering (attrs_to_get) is applied later, when describing the items.
    auto regular_columns =
            table_schema->regular_columns() | std::views::transform(&column_definition::id)
            | std::ranges::to<query::column_id_vector>();
    auto static_columns =
            table_schema->static_columns() | std::views::transform(&column_definition::id)
            | std::ranges::to<query::column_id_vector>();
    auto selection = cql3::selection::selection::wildcard(table_schema);
    query::partition_slice::option_set opts = selection->get_query_options();
    opts.add(custom_opts);
    auto partition_slice = query::partition_slice(std::move(ck_bounds), std::move(static_columns), std::move(regular_columns), opts);
    auto command = ::make_lw_shared<query::read_command>(query_schema->id(), query_schema->version(), partition_slice, proxy.get_max_result_size(partition_slice),
            query::tombstone_limit(proxy.get_tombstone_limit()));

    elogger.trace("Executing read query (reversed {}): table schema {}, query schema {}", partition_slice.is_reversed(), table_schema->version(), query_schema->version());

    auto query_state_ptr = std::make_unique<service::query_state>(client_state, trace_state, std::move(permit));

    // FIXME: should be moved above, set on opts, so get_max_result_size knows it?
    command->slice.options.set<query::partition_slice::option::allow_short_read>();
    auto query_options = std::make_unique<cql3::query_options>(cl, std::vector<cql3::raw_value>{});
    query_options = std::make_unique<cql3::query_options>(std::move(query_options), std::move(old_paging_state));
    auto p = service::pager::query_pagers::pager(proxy, query_schema, selection, *query_state_ptr, *query_options, command, std::move(partition_ranges), nullptr);

    // Fetch a single page of at most "limit" rows; if the pager is not yet
    // exhausted we must report a LastEvaluatedKey so the client can continue.
    std::unique_ptr<cql3::result_set> rs = co_await p->fetch_page(limit, gc_clock::now(), executor::default_timeout());
    if (!p->is_exhausted()) {
        rs->get_metadata().set_paging_state(p->state());
    }
    auto paging_state = rs->get_metadata().paging_state();
    // Remember whether a filter exists before it is moved into describe_items().
    bool has_filter = filter;
    auto [items_descr, opt_items, size] = co_await describe_items(*selection, std::move(rs), std::move(attrs_to_get), std::move(filter));
    if (paging_state) {
        rjson::add(items_descr, "LastEvaluatedKey", encode_paging_state(*table_schema, *paging_state));
    }
    if (has_filter) {
        stats.cql_stats.filtered_rows_read_total += p->stats().rows_read_total;
        // update our "filtered_row_matched_total" for all the rows matched, despite the filter
        stats.cql_stats.filtered_rows_matched_total += size;
    }
    if (opt_items) {
        if (opt_items->size() >= max_items_for_rapidjson_array) {
            // There are many items, better print the JSON and the array of
            // items (opt_items) separately to avoid RapidJSON's contiguous
            // allocation of arrays.
            co_return make_streamed_with_extra_array(std::move(items_descr), "Items", std::move(*opt_items));
        }
        // There aren't many items in the chunked vector opt_items,
        // let's just insert them into the JSON object and print the
        // full JSON normally.
        rjson::value items_json = rjson::empty_array();
        for (auto& item : *opt_items) {
            rjson::push_back(items_json, std::move(item));
        }
        rjson::add(items_descr, "Items", std::move(items_json));
    }
    if (is_big(items_descr)) {
        co_return make_streamed(std::move(items_descr));
    }
    co_return rjson::print(std::move(items_descr));
}
|
|
|
|
// Return the token at which the given segment of a parallel scan begins.
// The whole token ring (the full int64 range) is split into total_segments
// equal-width slices, so segment i starts at
// int64::min() + i * (uint64::max() / total_segments).
static dht::token token_for_segment(int segment, int total_segments) {
    SCYLLA_ASSERT(total_segments > 1 && segment >= 0 && segment < total_segments);
    const uint64_t step = std::numeric_limits<uint64_t>::max() / total_segments;
    return dht::token::from_int64(std::numeric_limits<int64_t>::min() + step * segment);
}
|
|
|
|
// Map a (Segment, TotalSegments) pair of a parallel Scan to the slice of
// the token ring which that segment covers. Adjacent segments share a
// boundary token: each segment's range excludes its end token, which is
// where the following segment's range begins.
static dht::partition_range get_range_for_segment(int segment, int total_segments) {
    if (total_segments == 1) {
        // A single segment covers the entire ring.
        return dht::partition_range::make_open_ended_both_sides();
    }
    if (segment == 0) {
        // First segment: everything up to (but excluding) segment 1's start.
        auto end = dht::partition_range::bound(
                dht::ring_position::ending_at(token_for_segment(1, total_segments)), false);
        return dht::partition_range::make_ending_with(std::move(end));
    }
    auto start = dht::partition_range::bound(
            dht::ring_position::starting_at(token_for_segment(segment, total_segments)));
    if (segment == total_segments - 1) {
        // Last segment: from its start token to the end of the ring.
        return dht::partition_range::make_starting_with(std::move(start));
    }
    // Middle segment: [this segment's start, next segment's start).
    auto end = dht::partition_range::bound(
            dht::ring_position::ending_at(token_for_segment(segment + 1, total_segments)), false);
    return dht::partition_range::make(std::move(start), std::move(end));
}
|
|
|
|
// Implement the DynamoDB Scan operation: read an entire table (or one
// segment of it, for a parallel scan) page by page, optionally applying a
// post-read filter and attribute projection. Validation failures are
// returned to the client as ValidationException.
future<executor::request_return_type> executor::scan(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request) {
    _stats.api_operations.scan++;
    elogger.trace("Scanning {}", request);

    // Scan may target either a base table or an index (GSI/LSI view).
    auto [schema, table_type] = get_table_or_view(_proxy, request);
    tracing::add_alternator_table_name(trace_state, schema->cf_name());
    get_stats_from_schema(_proxy, *schema)->api_operations.scan++;
    // Parallel-scan parameters: both must be given together, and
    // 0 <= Segment < TotalSegments <= 1000000 (DynamoDB's limit).
    auto segment = get_int_attribute(request, "Segment");
    auto total_segments = get_int_attribute(request, "TotalSegments");
    if (segment || total_segments) {
        if (!segment || !total_segments) {
            return make_ready_future<request_return_type>(api_error::validation(
                "Both Segment and TotalSegments attributes need to be present for a parallel scan"));
        }
        if (*segment < 0 || *segment >= *total_segments) {
            return make_ready_future<request_return_type>(api_error::validation(
                "Segment must be non-negative and less than TotalSegments"));
        }
        if (*total_segments < 0 || *total_segments > 1000000) {
            return make_ready_future<request_return_type>(api_error::validation(
                "TotalSegments must be non-negative and less or equal to 1000000"));
        }
    }

    rjson::value* exclusive_start_key = rjson::find(request, "ExclusiveStartKey");

    db::consistency_level cl = get_read_consistency(request);
    // As in DynamoDB, strongly-consistent reads are not supported on GSIs.
    if (table_type == table_or_view_type::gsi && cl != db::consistency_level::LOCAL_ONE) {
        return make_ready_future<request_return_type>(api_error::validation(
                "Consistent reads are not allowed on global indexes (GSI)"));
    }
    rjson::value* limit_json = rjson::find(request, "Limit");
    uint32_t limit = limit_json ? limit_json->GetUint64() : std::numeric_limits<uint32_t>::max();
    if (limit <= 0) {
        return make_ready_future<request_return_type>(api_error::validation("Limit must be greater than 0"));
    }

    select_type select = parse_select(request, table_type);

    // Track which entries of ExpressionAttributeNames/Values are actually
    // referenced, so unused ones can be rejected below.
    std::unordered_set<std::string> used_attribute_names;
    std::unordered_set<std::string> used_attribute_values;
    auto attrs_to_get = calculate_attrs_to_get(request, *_parsed_expression_cache, used_attribute_names, select);

    dht::partition_range_vector partition_ranges;
    if (segment) {
        auto range = get_range_for_segment(*segment, *total_segments);
        if (exclusive_start_key) {
            // A resumed parallel scan must restart within its own segment.
            auto ring_pos = dht::ring_position{dht::decorate_key(*schema, pk_from_json(*exclusive_start_key, schema))};
            if (!range.contains(ring_pos, dht::ring_position_comparator(*schema))) {
                return make_ready_future<request_return_type>(api_error::validation(
                    format("The provided starting key is invalid: Invalid ExclusiveStartKey. Please use ExclusiveStartKey "
                           "with correct Segment. TotalSegments: {} Segment: {}", *total_segments, *segment)));
            }
        }
        partition_ranges.push_back(range);
    } else {
        partition_ranges.push_back(dht::partition_range::make_open_ended_both_sides());
    }
    std::vector<query::clustering_range> ck_bounds{query::clustering_range::make_open_ended_both_sides()};

    filter filter(*_parsed_expression_cache, request, filter::request_type::SCAN, used_attribute_names, used_attribute_values);
    // Note: Unlike Query, Scan does allow a filter on the key attributes.
    // For some *specific* cases of key filtering, such an equality test on
    // partition key or comparison operator for the sort key, we could have
    // optimized the filtering by modifying partition_ranges and/or
    // ck_bounds. We haven't done this optimization yet.

    // DynamoDB considers unused ExpressionAttributeNames/Values entries an
    // error, so verify every given entry was referenced by some expression.
    const rjson::value* expression_attribute_names = rjson::find(request, "ExpressionAttributeNames");
    const rjson::value* expression_attribute_values = rjson::find(request, "ExpressionAttributeValues");
    verify_all_are_used(expression_attribute_names, used_attribute_names, "ExpressionAttributeNames", "Scan");
    verify_all_are_used(expression_attribute_values, used_attribute_values, "ExpressionAttributeValues", "Scan");

    return do_query(_proxy, schema, exclusive_start_key, std::move(partition_ranges), std::move(ck_bounds), std::move(attrs_to_get), limit, cl,
            std::move(filter), query::partition_slice::option_set(), client_state, _stats, trace_state, std::move(permit), _enforce_authorization, _warn_authorization);
}
|
|
|
|
// Build the single-partition range implied by a KeyConditions restriction
// on the hash key. DynamoDB only permits an EQ test on the hash key, with
// exactly one attribute value; anything else is a ValidationException.
static dht::partition_range calculate_pk_bound(schema_ptr schema, const column_definition& pk_cdef, const rjson::value& comp_definition, const rjson::value& attrs) {
    if (get_comparison_operator(comp_definition) != comparison_operator_type::EQ) {
        throw api_error::validation(format("Hash key can only be restricted with equality operator (EQ). {} not supported.", comp_definition));
    }
    if (attrs.Size() != 1) {
        throw api_error::validation(format("A single attribute is required for a hash key EQ restriction: {}", attrs));
    }
    auto pk = partition_key::from_singular_bytes(*schema, get_key_from_typed_value(attrs[0], pk_cdef));
    return dht::partition_range(dht::decorate_key(*schema, pk));
}
|
|
|
|
// Build the clustering range matching all sort-key values that begin with
// the given prefix ("target"). The range starts (inclusively) at the prefix
// itself ("ck") and ends just before the smallest value greater than every
// string with that prefix, which is the prefix with its last non-0xFF byte
// incremented and everything after it dropped.
// The "t" parameter (the sort key's type) is currently unused here; callers
// are expected to have verified the type is string-compatible beforehand.
static query::clustering_range get_clustering_range_for_begins_with(bytes&& target, const clustering_key& ck, schema_ptr schema, data_type t) {
    // find_end with a not_equal_to predicate locates the *last* byte of
    // "target" that differs from 0xFF - the byte we can safely increment
    // without overflow to form the exclusive upper bound.
    auto it = boost::range::find_end(target, bytes("\xFF"), std::not_equal_to<bytes::value_type>());
    if (it != target.end()) {
        ++*it;
        // Drop any trailing 0xFF bytes after the incremented byte.
        target.resize(std::distance(target.begin(), it) + 1);
        clustering_key upper_limit = clustering_key::from_single_value(*schema, target);
        return query::clustering_range::make(query::clustering_range::bound(ck), query::clustering_range::bound(upper_limit, false));
    }
    // The prefix is all-0xFF bytes: no finite upper bound exists, so the
    // range extends to the end of the partition.
    return query::clustering_range::make_starting_with(query::clustering_range::bound(ck));
}
|
|
|
|
// Translate a KeyConditions restriction on the sort key into a clustering
// range. Supported comparison operators are EQ, LE, LT, GE, GT, BETWEEN
// (inclusive on both ends, takes two attribute values) and BEGINS_WITH
// (strings/bytes only); any other operator throws a ValidationException.
static query::clustering_range calculate_ck_bound(schema_ptr schema, const column_definition& ck_cdef, const rjson::value& comp_definition, const rjson::value& attrs) {
    auto op = get_comparison_operator(comp_definition);
    // All supported operators take exactly one attribute value, except
    // BETWEEN which takes two (the lower and upper bound).
    const size_t expected_attrs_size = (op == comparison_operator_type::BETWEEN) ? 2 : 1;
    if (attrs.Size() != expected_attrs_size) {
        throw api_error::validation(format("{} arguments expected for a sort key restriction: {}", expected_attrs_size, attrs));
    }
    bytes raw_value = get_key_from_typed_value(attrs[0], ck_cdef);
    clustering_key ck = clustering_key::from_single_value(*schema, raw_value);
    switch (op) {
    case comparison_operator_type::EQ:
        return query::clustering_range(ck);
    case comparison_operator_type::LE:
        return query::clustering_range::make_ending_with(query::clustering_range::bound(ck));
    case comparison_operator_type::LT:
        return query::clustering_range::make_ending_with(query::clustering_range::bound(ck, false));
    case comparison_operator_type::GE:
        return query::clustering_range::make_starting_with(query::clustering_range::bound(ck));
    case comparison_operator_type::GT:
        return query::clustering_range::make_starting_with(query::clustering_range::bound(ck, false));
    case comparison_operator_type::BETWEEN: {
        // Both BETWEEN bounds are inclusive, as in DynamoDB.
        bytes raw_upper_limit = get_key_from_typed_value(attrs[1], ck_cdef);
        clustering_key upper_limit = clustering_key::from_single_value(*schema, raw_upper_limit);
        return query::clustering_range::make(query::clustering_range::bound(ck), query::clustering_range::bound(upper_limit));
    }
    case comparison_operator_type::BEGINS_WITH: {
        // An empty prefix matches everything.
        if (raw_value.empty()) {
            return query::clustering_range::make_open_ended_both_sides();
        }
        // NOTICE(sarna): A range starting with given prefix and ending (non-inclusively) with a string "incremented" by a single
        // character at the end. Throws for NUMBER instances.
        if (!ck_cdef.type->is_compatible_with(*utf8_type)) {
            throw api_error::validation(fmt::format("BEGINS_WITH operator cannot be applied to type {}", type_to_string(ck_cdef.type)));
        }
        return get_clustering_range_for_begins_with(std::move(raw_value), ck, schema, ck_cdef.type);
    }
    default:
        throw api_error::validation(format("Operator {} not supported for sort key", comp_definition));
    }
}
|
|
|
|
// Calculates primary key bounds from KeyConditions (the older, pre-
// KeyConditionExpression request syntax). Exactly one condition on the
// hash key is required; a second condition on the sort key is optional
// (and only allowed when the table has a sort key). Any condition on a
// non-key attribute, or a duplicate condition, is a ValidationException.
static std::pair<dht::partition_range_vector, std::vector<query::clustering_range>>
calculate_bounds_conditions(schema_ptr schema, const rjson::value& conditions) {
    dht::partition_range_vector partition_ranges;
    std::vector<query::clustering_range> ck_bounds;

    for (auto it = conditions.MemberBegin(); it != conditions.MemberEnd(); ++it) {
        sstring key = rjson::to_sstring(it->name);
        const rjson::value& condition = it->value;

        const rjson::value& comp_definition = rjson::get(condition, "ComparisonOperator");
        const rjson::value& attr_list = rjson::get(condition, "AttributeValueList");

        // In Alternator the hash key is the (single) partition-key column,
        // and the optional sort key is the first clustering-key column.
        const column_definition& pk_cdef = schema->partition_key_columns().front();
        const column_definition* ck_cdef = schema->clustering_key_size() > 0 ? &schema->clustering_key_columns().front() : nullptr;
        if (key == pk_cdef.name_as_text()) {
            if (!partition_ranges.empty()) {
                throw api_error::validation("Currently only a single restriction per key is allowed");
            }
            partition_ranges.push_back(calculate_pk_bound(schema, pk_cdef, comp_definition, attr_list));
        }
        if (ck_cdef && key == ck_cdef->name_as_text()) {
            if (!ck_bounds.empty()) {
                throw api_error::validation("Currently only a single restriction per key is allowed");
            }
            ck_bounds.push_back(calculate_ck_bound(schema, *ck_cdef, comp_definition, attr_list));
        }
    }

    // Validate that a query's conditions must be on the hash key, and
    // optionally also on the sort key if it exists.
    if (partition_ranges.empty()) {
        throw api_error::validation(format("Query missing condition on hash key '{}'", schema->partition_key_columns().front().name_as_text()));
    }
    if (schema->clustering_key_size() == 0) {
        if (conditions.MemberCount() != 1) {
            throw api_error::validation("Only one condition allowed in table with only hash key");
        }
    } else {
        // Two conditions were given but no sort-key bound was produced, so
        // the second condition must have been on a non-key attribute.
        if (conditions.MemberCount() == 2 && ck_bounds.empty()) {
            throw api_error::validation(format("Query missing condition on sort key '{}'", schema->clustering_key_columns().front().name_as_text()));
        } else if (conditions.MemberCount() > 2) {
            throw api_error::validation("Only one or two conditions allowed in table with hash key and sort key");
        }
    }

    // No sort-key condition means the whole partition is in range.
    if (ck_bounds.empty()) {
        ck_bounds.push_back(query::clustering_range::make_open_ended_both_sides());
    }

    return {std::move(partition_ranges), std::move(ck_bounds)};
}
|
|
|
|
// Extract the top-level column name specified in a KeyConditionExpression.
// If a nested attribute path is given, a ValidationException is generated.
// If the column name is a #reference to ExpressionAttributeNames, the
// reference is resolved.
// Note this function returns a string_view, which may refer to data in the
// given parsed::value or expression_attribute_names.
static std::string_view get_toplevel(const parsed::value& v,
        const rjson::value* expression_attribute_names,
        std::unordered_set<std::string>& used_attribute_names)
{
    const parsed::path& path = std::get<parsed::path>(v._value);
    // Key conditions may only name a top-level attribute - no "a.b" or
    // "a[0]" path operators.
    if (path.has_operators()) {
        throw api_error::validation("KeyConditionExpression does not support nested attributes");
    }
    std::string_view column_name = path.root();
    // A name starting with '#' is a reference into ExpressionAttributeNames;
    // record its use (for the unused-entries check) and resolve it.
    if (column_name.size() > 0 && column_name[0] == '#') {
        used_attribute_names.emplace(column_name);
        if (!expression_attribute_names) {
            throw api_error::validation(
                    fmt::format("ExpressionAttributeNames missing, entry '{}' required by KeyConditionExpression",
                            column_name));
        }
        const rjson::value* value = rjson::find(*expression_attribute_names, column_name);
        if (!value || !value->IsString()) {
            throw api_error::validation(
                    fmt::format("ExpressionAttributeNames missing entry '{}' required by KeyConditionExpression",
                            column_name));
        }
        column_name = rjson::to_string_view(*value);
    }
    return column_name;
}
|
|
|
|
// Extract a constant value specified in a KeyConditionExpression.
|
|
// This constant was originally parsed as a reference (:name) to a member of
|
|
// ExpressionAttributeValues, but at this point, after resolve_value(), it
|
|
// was already converted into a JSON value.
|
|
// This function decodes the value (using its given expected type) into bytes
|
|
// which Scylla uses as the actual key value. If the value has the wrong type,
|
|
// or the input had other problems, a ValidationException is thrown.
|
|
static bytes get_constant_value(const parsed::value& v,
|
|
const column_definition& column)
|
|
{
|
|
const parsed::constant& constant = std::get<parsed::constant>(v._value);
|
|
const parsed::constant::literal& lit = std::get<parsed::constant::literal>(constant._value);
|
|
return get_key_from_typed_value(*lit, column);
|
|
}
|
|
|
|
// condition_expression_and_list extracts a list of ANDed primitive conditions
// from a condition_expression. This is useful for KeyConditionExpression,
// which may not use OR or NOT. If the given condition_expression does use
// OR or NOT, this function throws a ValidationException.
// The extracted conditions are appended (as non-owning pointers into the
// given expression) to the "conditions" output parameter.
static void condition_expression_and_list(
        const parsed::condition_expression& condition_expression,
        std::vector<const parsed::primitive_condition*>& conditions)
{
    if (condition_expression._negated) {
        throw api_error::validation("KeyConditionExpression cannot use NOT");
    }
    std::visit(overloaded_functor {
        [&] (const parsed::primitive_condition& cond) {
            // A leaf condition - collect it.
            conditions.push_back(&cond);
        },
        [&] (const parsed::condition_expression::condition_list& list) {
            // An OR list is only acceptable when degenerate (one element);
            // a real OR of two or more conditions is forbidden.
            if (list.op == '|' && list.conditions.size() > 1) {
                throw api_error::validation("KeyConditionExpression cannot use OR");
            }
            // Recurse into each ANDed (or single) sub-expression.
            for (const parsed::condition_expression& cond : list.conditions) {
                condition_expression_and_list(cond, conditions);
            }
        }
    }, condition_expression._expression);
}
|
|
|
|
// Calculates primary key bounds from KeyConditionExpression.
// A valid KeyConditionExpression is one or two conditions ANDed together:
// an equality ("=") condition on the partition key, and optionally a range
// condition (=, <, <=, >, >=, BETWEEN or begins_with) on the sort key.
// Returns the resulting single-partition range and clustering range;
// throws a ValidationException for any expression outside this subset.
static std::pair<dht::partition_range_vector, std::vector<query::clustering_range>>
calculate_bounds_condition_expression(schema_ptr schema,
        const rjson::value& expression,
        const rjson::value* expression_attribute_values,
        std::unordered_set<std::string>& used_attribute_values,
        const rjson::value* expression_attribute_names,
        std::unordered_set<std::string>& used_attribute_names,
        parsed::expression_cache& parsed_expression_cache)
{
    if (!expression.IsString()) {
        throw api_error::validation("KeyConditionExpression must be a string");
    }
    if (expression.GetStringLength() == 0) {
        throw api_error::validation("KeyConditionExpression must not be empty");
    }
    // We parse the KeyConditionExpression with the same parser we use for
    // ConditionExpression. But KeyConditionExpression only supports a subset
    // of the ConditionExpression features, so we have many additional
    // verifications below that the key condition is legal. Briefly, a valid
    // key condition must contain a single partition key and a single
    // sort-key range.
    parsed::condition_expression p;
    try {
        p = parsed_expression_cache.parse_condition_expression(rjson::to_string_view(expression), "KeyConditionExpression");
    } catch(expressions_syntax_error& e) {
        throw api_error::validation(e.what());
    }
    // Replace #name and :value references by their actual values, recording
    // which references were used for the unused-entries check done later.
    resolve_condition_expression(p,
            expression_attribute_names, expression_attribute_values,
            used_attribute_names, used_attribute_values);
    std::vector<const parsed::primitive_condition*> conditions;
    condition_expression_and_list(p, conditions);

    if (conditions.size() < 1 || conditions.size() > 2) {
        throw api_error::validation(
                "KeyConditionExpression syntax error: must have 1 or 2 conditions");
    }
    // Scylla allows us to have an (equality) constraint on the partition key
    // pk_cdef, and a range constraint on the *first* clustering key ck_cdef.
    // Note that this is also good enough for our GSI implementation - the
    // GSI's user-specified sort key will be the first clustering key.
    // FIXME: In the case described in issue #5320 (base and GSI both have
    // just hash key - but different ones), this may allow the user to Query
    // using the base key which isn't officially part of the GSI.
    const column_definition& pk_cdef = schema->partition_key_columns().front();
    const column_definition* ck_cdef = schema->clustering_key_size() > 0 ?
            &schema->clustering_key_columns().front() : nullptr;

    dht::partition_range_vector partition_ranges;
    std::vector<query::clustering_range> ck_bounds;
    for (const parsed::primitive_condition* condp : conditions) {
        const parsed::primitive_condition& cond = *condp;
        // In all comparison operators, one operand must be a column name,
        // the other is a constant (value reference). We remember which is
        // which in toplevel_ind, and also the column name in key (not just
        // for comparison operators).
        std::string_view key;
        int toplevel_ind;
        switch (cond._values.size()) {
        case 1: {
            // The only legal single-value condition is a begins_with()
            // function, and it must have two parameters - a top-level
            // attribute and a value reference.
            const parsed::value::function_call *f = std::get_if<parsed::value::function_call>(&cond._values[0]._value);
            if (!f) {
                throw api_error::validation("KeyConditionExpression cannot be just a value");
            }
            if (f->_function_name != "begins_with") {
                throw api_error::validation(
                        fmt::format("KeyConditionExpression function '{}' not supported",f->_function_name));
            }
            if (f->_parameters.size() != 2 || !f->_parameters[0].is_path() ||
                !f->_parameters[1].is_constant()) {
                throw api_error::validation(
                        "KeyConditionExpression begins_with() takes attribute and value");
            }
            key = get_toplevel(f->_parameters[0], expression_attribute_names, used_attribute_names);
            // -1 marks "not a two-operand comparison"; the value is inside
            // the function call's parameters, not in cond._values.
            toplevel_ind = -1;
            break;
        }
        case 2:
            // A binary comparison: the attribute may appear on either side.
            if (cond._values[0].is_path() && cond._values[1].is_constant()) {
                toplevel_ind = 0;
            } else if (cond._values[1].is_path() && cond._values[0].is_constant()) {
                toplevel_ind = 1;
            } else {
                throw api_error::validation("KeyConditionExpression must compare attribute with constant");
            }
            key = get_toplevel(cond._values[toplevel_ind], expression_attribute_names, used_attribute_names);
            break;
        case 3:
            // Only BETWEEN has three operands. First must be a column name,
            // two other must be value references (constants):
            if (cond._op != parsed::primitive_condition::type::BETWEEN) {
                // Shouldn't happen unless we have a bug in the parser
                throw std::logic_error(format("Wrong number of values {} in primitive_condition", cond._values.size()));
            }
            if (cond._values[0].is_path() && cond._values[1].is_constant() && cond._values[2].is_constant()) {
                toplevel_ind = 0;
                key = get_toplevel(cond._values[0], expression_attribute_names, used_attribute_names);
            } else {
                throw api_error::validation("KeyConditionExpression must compare attribute with constants");
            }
            break;
        default:
            // Shouldn't happen unless we have a bug in the parser
            throw std::logic_error(format("Wrong number of values {} in primitive_condition", cond._values.size()));
        }
        if (cond._op == parsed::primitive_condition::type::IN) {
            throw api_error::validation("KeyConditionExpression does not support IN operator");
        } else if (cond._op == parsed::primitive_condition::type::NE) {
            throw api_error::validation("KeyConditionExpression does not support NE operator");
        } else if (cond._op == parsed::primitive_condition::type::EQ) {
            // the EQ operator (=) is the only one which can be used for both
            // the partition key and sort key:
            if (sstring(key) == pk_cdef.name_as_text()) {
                if (!partition_ranges.empty()) {
                    throw api_error::validation(
                            "KeyConditionExpression allows only one condition for each key");
                }
                bytes raw_value = get_constant_value(cond._values[!toplevel_ind], pk_cdef);
                partition_key pk = partition_key::from_singular_bytes(*schema, std::move(raw_value));
                auto decorated_key = dht::decorate_key(*schema, pk);
                partition_ranges.push_back(dht::partition_range(decorated_key));
            } else if (ck_cdef && sstring(key) == ck_cdef->name_as_text()) {
                if (!ck_bounds.empty()) {
                    throw api_error::validation(
                            "KeyConditionExpression allows only one condition for each key");
                }
                bytes raw_value = get_constant_value(cond._values[!toplevel_ind], *ck_cdef);
                clustering_key ck = clustering_key::from_single_value(*schema, raw_value);
                ck_bounds.push_back(query::clustering_range(ck));
            } else {
                throw api_error::validation(
                        fmt::format("KeyConditionExpression condition on non-key attribute {}", key));
            }
            continue;
        }
        // If we're still here, it's any other operator besides EQ, and these
        // are allowed *only* on the clustering key:
        if (sstring(key) == pk_cdef.name_as_text()) {
            throw api_error::validation(
                    fmt::format("KeyConditionExpression only '=' condition is supported on partition key {}", key));
        } else if (!ck_cdef || sstring(key) != ck_cdef->name_as_text()) {
            throw api_error::validation(
                    fmt::format("KeyConditionExpression condition on non-key attribute {}", key));
        }
        if (!ck_bounds.empty()) {
            throw api_error::validation(
                    "KeyConditionExpression allows only one condition for each key");
        }
        if (cond._op == parsed::primitive_condition::type::BETWEEN) {
            // BETWEEN is inclusive on both ends, as in DynamoDB.
            clustering_key ck1 = clustering_key::from_single_value(*schema,
                    get_constant_value(cond._values[1], *ck_cdef));
            clustering_key ck2 = clustering_key::from_single_value(*schema,
                    get_constant_value(cond._values[2], *ck_cdef));
            ck_bounds.push_back(query::clustering_range::make(
                    query::clustering_range::bound(ck1), query::clustering_range::bound(ck2)));
            continue;
        } else if (cond._values.size() == 1) {
            // We already verified above, that this case can only be a
            // function call to begins_with(), with the first parameter the
            // key, the second the value reference.
            bytes raw_value = get_constant_value(
                    std::get<parsed::value::function_call>(cond._values[0]._value)._parameters[1], *ck_cdef);
            if (!ck_cdef->type->is_compatible_with(*utf8_type)) {
                // begins_with() supported on bytes and strings (both stored
                // in the database as strings) but not on numbers.
                throw api_error::validation(
                        fmt::format("KeyConditionExpression begins_with() not supported on type {}",
                                type_to_string(ck_cdef->type)));
            } else if (raw_value.empty()) {
                // An empty prefix matches every sort-key value.
                ck_bounds.push_back(query::clustering_range::make_open_ended_both_sides());
            } else {
                clustering_key ck = clustering_key::from_single_value(*schema, raw_value);
                ck_bounds.push_back(get_clustering_range_for_begins_with(std::move(raw_value), ck, schema, ck_cdef->type));
            }
            continue;
        }

        // All remaining operators have one value reference parameter in index
        // !toplevel_ind. Note how toplevel_ind==1 reverses the direction of
        // an inequality (e.g. ":v < key" means the same as "key > :v").
        bytes raw_value = get_constant_value(cond._values[!toplevel_ind], *ck_cdef);
        clustering_key ck = clustering_key::from_single_value(*schema, raw_value);
        if ((cond._op == parsed::primitive_condition::type::LT && toplevel_ind == 0) ||
            (cond._op == parsed::primitive_condition::type::GT && toplevel_ind == 1)) {
            ck_bounds.push_back(query::clustering_range::make_ending_with(query::clustering_range::bound(ck, false)));
        } else if ((cond._op == parsed::primitive_condition::type::GT && toplevel_ind == 0) ||
                   (cond._op == parsed::primitive_condition::type::LT && toplevel_ind == 1)) {
            ck_bounds.push_back(query::clustering_range::make_starting_with(query::clustering_range::bound(ck, false)));
        } else if ((cond._op == parsed::primitive_condition::type::LE && toplevel_ind == 0) ||
                   (cond._op == parsed::primitive_condition::type::GE && toplevel_ind == 1)) {
            ck_bounds.push_back(query::clustering_range::make_ending_with(query::clustering_range::bound(ck)));
        } else if ((cond._op == parsed::primitive_condition::type::GE && toplevel_ind == 0) ||
                   (cond._op == parsed::primitive_condition::type::LE && toplevel_ind == 1)) {
            ck_bounds.push_back(query::clustering_range::make_starting_with(query::clustering_range::bound(ck)));
        }
    }

    if (partition_ranges.empty()) {
        throw api_error::validation(
                format("KeyConditionExpression requires a condition on partition key {}", pk_cdef.name_as_text()));
    }
    // No sort-key condition: the whole partition is in range.
    if (ck_bounds.empty()) {
        ck_bounds.push_back(query::clustering_range::make_open_ended_both_sides());
    }
    return {std::move(partition_ranges), std::move(ck_bounds)};
}
|
|
|
|
// Implements the DynamoDB Query operation: read the items of one partition,
// optionally narrowed by sort-key conditions (KeyConditions or
// KeyConditionExpression - exactly one must be present), a post-read filter,
// and a page limit. Returns the serialized JSON response, or an api_error.
future<executor::request_return_type> executor::query(client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit, rjson::value request) {
    _stats.api_operations.query++;
    elogger.trace("Querying {}", request);

    auto [schema, table_type] = get_table_or_view(_proxy, request);
    get_stats_from_schema(_proxy, *schema)->api_operations.query++;
    tracing::add_alternator_table_name(trace_state, schema->cf_name());

    rjson::value* exclusive_start_key = rjson::find(request, "ExclusiveStartKey");
    db::consistency_level cl = get_read_consistency(request);
    // DynamoDB does not allow strongly-consistent reads on a GSI:
    if (table_type == table_or_view_type::gsi && cl != db::consistency_level::LOCAL_ONE) {
        return make_ready_future<request_return_type>(api_error::validation(
            "Consistent reads are not allowed on global indexes (GSI)"));
    }
    rjson::value* limit_json = rjson::find(request, "Limit");
    // Read the limit as a full 64-bit value and clamp it to uint32_t's range
    // instead of silently truncating: previously a Limit of 2^32 truncated
    // to 0 and was rejected with a misleading "greater than 0" error, while
    // 2^32+1 silently became 1. Anything above uint32_t's maximum is
    // effectively "no limit" anyway.
    uint64_t limit64 = limit_json ? limit_json->GetUint64() : std::numeric_limits<uint32_t>::max();
    if (limit64 == 0) {
        return make_ready_future<request_return_type>(api_error::validation("Limit must be greater than 0"));
    }
    uint32_t limit = limit64 > std::numeric_limits<uint32_t>::max()
            ? std::numeric_limits<uint32_t>::max()
            : static_cast<uint32_t>(limit64);

    // ScanIndexForward=false asks for reversed (descending sort-key) order.
    const bool forward = get_bool_attribute(request, "ScanIndexForward", true);

    rjson::value* key_conditions = rjson::find(request, "KeyConditions");
    rjson::value* key_condition_expression = rjson::find(request, "KeyConditionExpression");
    // Track which ExpressionAttributeNames/Values entries were actually
    // referenced, so unused ones can be reported as errors below.
    std::unordered_set<std::string> used_attribute_values;
    std::unordered_set<std::string> used_attribute_names;
    if (key_conditions && key_condition_expression) {
        throw api_error::validation("Query does not allow both "
                "KeyConditions and KeyConditionExpression to be given together");
    } else if (!key_conditions && !key_condition_expression) {
        throw api_error::validation("Query must have one of "
                "KeyConditions or KeyConditionExpression");
    }

    const rjson::value* expression_attribute_names = rjson::find(request, "ExpressionAttributeNames");
    const rjson::value* expression_attribute_values = rjson::find(request, "ExpressionAttributeValues");

    // exactly one of key_conditions or key_condition_expression
    auto [partition_ranges, ck_bounds] = key_conditions
        ? calculate_bounds_conditions(schema, *key_conditions)
        : calculate_bounds_condition_expression(schema, *key_condition_expression,
                expression_attribute_values,
                used_attribute_values,
                expression_attribute_names,
                used_attribute_names, *_parsed_expression_cache);

    filter filter(*_parsed_expression_cache, request, filter::request_type::QUERY,
            used_attribute_names, used_attribute_values);

    // A query is not allowed to filter on the partition key or the sort key.
    for (const column_definition& cdef : schema->partition_key_columns()) { // just one
        if (filter.filters_on(cdef.name_as_text())) {
            return make_ready_future<request_return_type>(api_error::validation(
                    format("QueryFilter can only contain non-primary key attributes: Partition key attribute: {}", cdef.name_as_text())));
        }
    }
    for (const column_definition& cdef : schema->clustering_key_columns()) {
        if (filter.filters_on(cdef.name_as_text())) {
            return make_ready_future<request_return_type>(api_error::validation(
                    format("QueryFilter can only contain non-primary key attributes: Sort key attribute: {}", cdef.name_as_text())));
        }
        // FIXME: this "break" can avoid listing some clustering key columns
        // we added for GSIs just because they existed in the base table -
        // but not in all cases. We still have issue #5320.
        break;
    }

    select_type select = parse_select(request, table_type);

    auto attrs_to_get = calculate_attrs_to_get(request, *_parsed_expression_cache, used_attribute_names, select);
    // Reject requests that defined attribute names/values which no
    // expression ended up using:
    verify_all_are_used(expression_attribute_names, used_attribute_names, "ExpressionAttributeNames", "Query");
    verify_all_are_used(expression_attribute_values, used_attribute_values, "ExpressionAttributeValues", "Query");
    query::partition_slice::option_set opts;
    opts.set_if<query::partition_slice::option::reversed>(!forward);
    return do_query(_proxy, schema, exclusive_start_key, std::move(partition_ranges), std::move(ck_bounds), std::move(attrs_to_get), limit, cl,
            std::move(filter), opts, client_state, _stats, std::move(trace_state), std::move(permit), _enforce_authorization, _warn_authorization);
}
|
|
|
|
// Implements the DynamoDB ListTables operation: return the names of up to
// "Limit" (default and maximum 100) Alternator tables, optionally resuming
// after "ExclusiveStartTableName". If more tables remain after this page,
// the response carries "LastEvaluatedTableName" for the client to resume.
future<executor::request_return_type> executor::list_tables(client_state& client_state, service_permit permit, rjson::value request) {
    _stats.api_operations.list_tables++;
    elogger.trace("Listing tables {}", request);

    rjson::value* exclusive_start_json = rjson::find(request, "ExclusiveStartTableName");
    rjson::value* limit_json = rjson::find(request, "Limit");
    std::string exclusive_start = exclusive_start_json ? rjson::to_string(*exclusive_start_json) : "";
    int limit = limit_json ? limit_json->GetInt() : 100;
    if (limit < 1 || limit > 100) {
        co_return api_error::validation("Limit must be greater than 0 and no greater than 100");
    }

    auto tables = _proxy.data_dictionary().get_tables(); // hold on to temporary, table_names isn't a container, it's a view
    // Only list real Alternator tables: keyspaces named with
    // KEYSPACE_NAME_PREFIX, excluding views (which back Alternator indexes)
    // and CDC log tables.
    auto table_names = tables
        | std::views::filter([this] (data_dictionary::table t) {
                    return t.schema()->ks_name().find(KEYSPACE_NAME_PREFIX) == 0 &&
                            !t.schema()->is_view() &&
                            !cdc::is_log_for_some_table(_proxy.local_db(), t.schema()->ks_name(), t.schema()->cf_name());
                })
        | std::views::transform([] (data_dictionary::table t) {
                    return t.schema()->cf_name();
                });

    rjson::value response = rjson::empty_object();
    rjson::add(response, "TableNames", rjson::empty_array());
    rjson::value& all_tables = response["TableNames"];

    //TODO(sarna): Dynamo doesn't declare any ordering when listing tables,
    // but our implementation is vulnerable to changes, because the tables
    // are stored in an unordered map. We may consider (partially) sorting
    // the results before returning them to the client, especially if there
    // is an implicit order of elements that Dynamo imposes.
    auto table_names_it = [&table_names, &exclusive_start] {
        if (!exclusive_start.empty()) {
            // Resume right after the table named by ExclusiveStartTableName.
            // If that name is not found, std::next(it, 0) leaves the
            // iterator at end() and an empty page is returned.
            auto it = std::ranges::find_if(table_names, [&exclusive_start] (const sstring& table_name) { return table_name == exclusive_start; });
            return std::next(it, it != table_names.end());
        } else {
            return table_names.begin();
        }
    }();
    // Emit up to "limit" names into the response:
    while (limit > 0 && table_names_it != table_names.end()) {
        rjson::push_back(all_tables, rjson::from_string(*table_names_it));
        --limit;
        ++table_names_it;
    }

    // If we stopped because of the limit while tables remain, tell the
    // client where to resume: the last name we already added to the page.
    if (table_names_it != table_names.end()) {
        auto& last_table_name = *std::prev(all_tables.End());
        rjson::add(response, "LastEvaluatedTableName", rjson::copy(last_table_name));
    }

    co_return rjson::print(std::move(response));
}
|
|
|
|
// Implements the DynamoDB DescribeEndpoints operation: tell the client which
// endpoint to use. By default we echo back the request's "Host:" header; the
// alternator_describe_endpoints configuration option may replace that with a
// fixed string, or disable the operation entirely.
future<executor::request_return_type> executor::describe_endpoints(client_state& client_state, service_permit permit, rjson::value request, std::string host_header) {
    _stats.api_operations.describe_endpoints++;
    // The alternator_describe_endpoints configuration can be used to disable
    // the DescribeEndpoints operation, or set it to return a fixed string
    const std::string configured = _proxy.data_dictionary().get_config().alternator_describe_endpoints();
    if (configured == "disabled") {
        _stats.unsupported_operations++;
        co_return api_error::unknown_operation(
            "DescribeEndpoints disabled by configuration (alternator_describe_endpoints=disabled)");
    }
    if (!configured.empty()) {
        host_header = configured;
    }
    // Without having any configuration parameter to say otherwise, we tell
    // the user to return to the same endpoint they used to reach us. The only
    // way we can know this is through the "Host:" header in the request,
    // which typically exists (and in fact is mandatory in HTTP 1.1).
    // A "Host:" header includes both host name and port, exactly what we need
    // to return.
    if (host_header.empty()) {
        co_return api_error::validation("DescribeEndpoints needs a 'Host:' header in request");
    }
    rjson::value response = rjson::empty_object();
    rjson::add(response, "Endpoints", rjson::empty_array());
    rjson::push_back(response["Endpoints"], rjson::empty_object());
    rjson::value& endpoint = response["Endpoints"][0];
    rjson::add(endpoint, "Address", rjson::from_string(host_header));
    rjson::add(endpoint, "CachePeriodInMinutes", rjson::value(1440));
    co_return rjson::print(std::move(response));
}
|
|
|
|
// Build NetworkTopologyStrategy replication options that assign the same
// replication factor to every datacenter currently known to the cluster.
static locator::replication_strategy_config_options get_network_topology_options(service::storage_proxy& sp, gms::gossiper& gossiper, int rf) {
    locator::replication_strategy_config_options options;
    const auto rf_as_string = std::to_string(rf);
    // Keep the token-metadata pointer alive for the whole loop so the
    // datacenter map we iterate over cannot go away under us.
    auto token_metadata = sp.get_token_metadata_ptr();
    for (const auto& dc_name : token_metadata->get_datacenter_racks_token_owners() | std::views::keys) {
        options.emplace(dc_name, rf_as_string);
    }
    return options;
}
|
|
|
|
// Implements the DynamoDB DescribeContinuousBackups operation. Continuous
// backups and point-in-time recovery are not supported, so for an existing
// table both features are reported as DISABLED; for a missing table we
// return TableNotFound (not the usual ResourceNotFound).
future<executor::request_return_type> executor::describe_continuous_backups(client_state& client_state, service_permit permit, rjson::value request) {
    _stats.api_operations.describe_continuous_backups++;
    // Unlike most operations which return ResourceNotFound when the given
    // table doesn't exist, this operation returns a TableNotFoundException.
    // So we can't use the usual get_table() wrapper and need a bit more code:
    std::string table_name = get_table_name(request);
    schema_ptr schema;
    try {
        schema = _proxy.data_dictionary().find_schema(sstring(executor::KEYSPACE_NAME_PREFIX) + table_name, table_name);
    } catch(data_dictionary::no_such_column_family&) {
        // DynamoDB returns validation error even when table does not exist
        // and the table name is invalid.
        validate_table_name(table_name);

        throw api_error::table_not_found(
            fmt::format("Table {} not found", table_name));
    }
    // Fixed response: both features permanently disabled.
    rjson::value desc = rjson::empty_object();
    rjson::add(desc, "ContinuousBackupsStatus", "DISABLED");
    rjson::value pitr = rjson::empty_object();
    rjson::add(pitr, "PointInTimeRecoveryStatus", "DISABLED");
    rjson::add(desc, "PointInTimeRecoveryDescription", std::move(pitr));
    rjson::value response = rjson::empty_object();
    rjson::add(response, "ContinuousBackupsDescription", std::move(desc));
    co_return rjson::print(std::move(response));
}
|
|
|
|
// Create the metadata for the keyspace in which we put the alternator
|
|
// table if it doesn't already exist.
|
|
// Currently, we automatically configure the keyspace based on the number
|
|
// of nodes in the cluster: A cluster with 3 or more live nodes, gets RF=3.
|
|
// A smaller cluster (presumably, a test only), gets RF=1. The user may
|
|
// manually create the keyspace to override this predefined behavior.
|
|
static lw_shared_ptr<keyspace_metadata> create_keyspace_metadata(std::string_view keyspace_name, service::storage_proxy& sp, gms::gossiper& gossiper, api::timestamp_type ts,
|
|
const std::map<sstring, sstring>& tags_map, const gms::feature_service& feat, const db::tablets_mode_t::mode tablets_mode) {
|
|
// Whether to use tablets for the table (actually for the keyspace of the
|
|
// table) is determined by tablets_mode (taken from the configuration
|
|
// option "tablets_mode_for_new_keyspaces"), as well as the presence and
|
|
// the value of a per-table tag system:initial_tablets
|
|
// (INITIAL_TABLETS_TAG_KEY).
|
|
// Setting the tag with a numeric value will enable a specific initial number
|
|
// of tablets (setting the value to 0 means enabling tablets with
|
|
// an automatic selection of the best number of tablets).
|
|
// Setting this tag to any non-numeric value (e.g., an empty string or the
|
|
// word "none") will ask to disable tablets.
|
|
// When vnodes are asked for by the tag value, but tablets are enforced by config,
|
|
// throw an exception to the client.
|
|
std::optional<unsigned> initial_tablets;
|
|
if (feat.tablets) {
|
|
auto it = tags_map.find(INITIAL_TABLETS_TAG_KEY);
|
|
if (it != tags_map.end()) {
|
|
// Tag set. If it's a valid number, use it. If not - e.g., it's
|
|
// empty or a word like "none", disable tablets by setting
|
|
// initial_tablets to a disengaged optional.
|
|
try {
|
|
initial_tablets = std::stol(tags_map.at(INITIAL_TABLETS_TAG_KEY));
|
|
} catch (...) {
|
|
if (tablets_mode == db::tablets_mode_t::mode::enforced) {
|
|
throw api_error::validation(format("Tag {} containing non-numerical value requests vnodes, but vnodes are forbidden by configuration option `tablets_mode_for_new_keyspaces: enforced`", INITIAL_TABLETS_TAG_KEY));
|
|
}
|
|
initial_tablets = std::nullopt;
|
|
elogger.trace("Following {} tag containing non-numerical value, Alternator will attempt to create a keyspace {} with vnodes.", INITIAL_TABLETS_TAG_KEY, keyspace_name);
|
|
}
|
|
} else {
|
|
// No per-table tag present, use the value from config
|
|
if (tablets_mode == db::tablets_mode_t::mode::enabled || tablets_mode == db::tablets_mode_t::mode::enforced) {
|
|
initial_tablets = 0;
|
|
elogger.trace("Following the `tablets_mode_for_new_keyspaces` flag from the settings, Alternator will attempt to create a keyspace {} with tablets.", keyspace_name);
|
|
} else {
|
|
initial_tablets = std::nullopt;
|
|
elogger.trace("Following the `tablets_mode_for_new_keyspaces` flag from the settings, Alternator will attempt to create a keyspace {} with vnodes.", keyspace_name);
|
|
}
|
|
}
|
|
}
|
|
|
|
int endpoint_count = gossiper.num_endpoints();
|
|
int rf = 3;
|
|
if (endpoint_count < rf) {
|
|
rf = 1;
|
|
elogger.warn("Creating keyspace '{}' for Alternator with unsafe RF={} because cluster only has {} nodes.",
|
|
keyspace_name, rf, endpoint_count);
|
|
}
|
|
auto opts = get_network_topology_options(sp, gossiper, rf);
|
|
cql3::statements::ks_prop_defs props;
|
|
opts["class"] = sstring("NetworkTopologyStrategy");
|
|
props.add_property(cql3::statements::ks_prop_defs::KW_REPLICATION, opts);
|
|
std::map<sstring, sstring> tablet_opts;
|
|
if (initial_tablets) {
|
|
tablet_opts["initial"] = std::to_string(*initial_tablets);
|
|
}
|
|
tablet_opts["enabled"] = initial_tablets ? "true" : "false";
|
|
props.add_property(cql3::statements::ks_prop_defs::KW_TABLETS, std::move(tablet_opts));
|
|
props.validate();
|
|
return props.as_ks_metadata(sstring(keyspace_name), *sp.get_token_metadata_ptr(), feat, sp.local_db().get_config());
|
|
}
|
|
|
|
future<> executor::start() {
    // Nothing to initialize eagerly: keyspace creation (create_keyspace())
    // is deferred until a table is actually created.
    co_return;
}
|
|
|
|
future<> executor::stop() {
    // Detach the default-timeout value from its configuration source by
    // replacing it with a plain snapshot holding the current value: the
    // observed number stays readable, but updates stop flowing in.
    const uint32_t current_timeout = s_default_timeout_in_ms();
    s_default_timeout_in_ms = utils::updateable_value<uint32_t>{current_timeout};
    return _parsed_expression_cache->stop();
}
|
|
|
|
} // namespace alternator
|